From 18950f19356a9bf4a5906c4b3f5452d2add4709d Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Fri, 30 Aug 2024 12:16:11 +0200
Subject: [PATCH 01/80] add support for sql functional tests

---
 tests/unit/source_code/test_functional.py | 73 +++++++++++++++--------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py
index 3b6bbf9ffb..338ab99e8f 100644
--- a/tests/unit/source_code/test_functional.py
+++ b/tests/unit/source_code/test_functional.py
@@ -16,6 +16,7 @@
 from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver
 from databricks.labs.ucx.source_code.linters.context import LinterContext
 from databricks.labs.ucx.source_code.linters.files import FileLoader
+from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
 from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
 from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
 from databricks.labs.ucx.source_code.path_lookup import PathLookup
@@ -62,11 +63,19 @@ def from_advice(cls, advice: Advice) -> Expectation:
         )


+_UCX_REGEX_SUFFIX = r" ucx\[(?P<code>[\w-]+):(?P<start_line>[\d+]+):(?P<start_col>[\d]+):(?P<end_line>[\d+]+):(?P<end_col>[\d]+)] (?P<message>.*)"
+_STATE_REGEX_SUFFIX = r' ucx\[session-state] (?P<session_state_json>\{.*})'
+
 class Functional:
-    _re = re.compile(
-        r"# ucx\[(?P<code>[\w-]+):(?P<start_line>[\d+]+):(?P<start_col>[\d]+):(?P<end_line>[\d+]+):(?P<end_col>[\d]+)] (?P<message>.*)"
-    )
-    _re_session_state = re.compile(r'# ucx\[session-state] (?P<session_state_json>\{.*})')
+
+    _ucx_regex = {
+        CellLanguage.PYTHON: re.compile(CellLanguage.PYTHON.comment_prefix + _UCX_REGEX_SUFFIX),
+        CellLanguage.SQL: re.compile(CellLanguage.SQL.comment_prefix + _UCX_REGEX_SUFFIX),
+    }
+    _session_states = {
+        CellLanguage.PYTHON: re.compile(CellLanguage.PYTHON.comment_prefix + _STATE_REGEX_SUFFIX),
+        CellLanguage.SQL: re.compile(CellLanguage.SQL.comment_prefix + _STATE_REGEX_SUFFIX),
+    }

     _location = Path(__file__).parent / 'samples/functional'

@@ -95,10 +104,11 @@ def test_id(cls, sample: Functional) -> str:
     def __init__(self, path: Path, parent: Path | None = None) -> None:
         self.path = path
         self.parent = parent
+        self.language = CellLanguage.PYTHON if path.suffix.endswith("py") else CellLanguage.SQL

-    def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver) -> None:
+    def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> None:
         expected_problems = list(self._expected_problems())
-        actual_advices = list(self._lint(path_lookup, dependency_resolver))
+        actual_advices = list(self._lint(path_lookup, dependency_resolver, migration_index))
         # Convert the actual problems to the same type as our expected problems for easier comparison.
actual_problems = [Expectation.from_advice(advice) for advice in actual_advices] @@ -118,13 +128,7 @@ def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolve assert no_errors, "\n".join(errors) # TODO: output annotated file with comments for quick fixing - def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver) -> Iterable[Advice]: - migration_index = MigrationIndex( - [ - MigrationStatus('old', 'things', dst_catalog='brand', dst_schema='new', dst_table='stuff'), - MigrationStatus('other', 'matters', dst_catalog='some', dst_schema='certain', dst_table='issues'), - ] - ) + def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> Iterable[Advice]: session_state = self._test_session_state() print(str(session_state)) session_state.named_parameters = {"my-widget": "my-path.py"} @@ -145,9 +149,10 @@ def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver return linter.lint() def _regex_match(self, regex: re.Pattern[str]) -> Generator[tuple[Comment, dict[str, Any]], None, None]: + ucx_comment_prefix = self.language.comment_prefix + ' ucx[' with self.path.open('rb') as f: for comment in self._comments(f): - if not comment.text.startswith('# ucx['): + if not comment.text.startswith(ucx_comment_prefix): continue match = regex.match(comment.text) if not match: @@ -156,7 +161,8 @@ def _regex_match(self, regex: re.Pattern[str]) -> Generator[tuple[Comment, dict[ yield comment, groups def _expected_problems(self) -> Generator[Expectation, None, None]: - for comment, groups in self._regex_match(self._re): + regex = self._ucx_regex[self.language] + for comment, groups in self._regex_match(regex): reported_start_line = groups['start_line'] if '+' in reported_start_line: start_line = int(reported_start_line[1:]) + comment.start_line @@ -177,7 +183,8 @@ def _expected_problems(self) -> Generator[Expectation, None, None]: ) def _test_session_state(self) -> CurrentSessionState: - matches = list(self._regex_match(self._re_session_state)) + regex = self._session_states[self.language] + matches = list(self._regex_match(regex)) if len(matches) > 1: raise ValueError("A test should have no more than one session state definition") if len(matches) == 0: @@ -186,18 +193,34 @@ def _test_session_state(self) -> CurrentSessionState: json_str = groups['session_state_json'] return CurrentSessionState.from_json(json.loads(json_str)) + def _comments(self, f) -> Generator[Comment, None, None]: + if self.language is CellLanguage.PYTHON: + yield from self._python_comments(f) + return + if self.language is CellLanguage.SQL: + yield from self._sql_comments(f) + @staticmethod - def _comments(f) -> Generator[Comment, None, None]: + def _python_comments(f) -> Generator[Comment, None, None]: for token in tokenize.tokenize(f.readline): if token.type != tokenize.COMMENT: continue yield Comment.from_token(token) + @staticmethod + def _sql_comments(f) -> Generator[Comment, None, None]: + # SQLGlot does not propagate tokens. 
See https://github.com/tobymao/sqlglot/issues/3159 + # Hence SQL statement advice offsets can be wrong because of multi-line comments and statements + for idx, line in enumerate(f.readlines()): + if not line.startswith(b"--"): + continue + yield Comment(text=line.decode("utf-8"), start_line=idx, end_line=idx) + @pytest.mark.parametrize("sample", Functional.all(), ids=Functional.test_id) -def test_functional(sample: Functional, mock_path_lookup, simple_dependency_resolver) -> None: +def test_functional(sample: Functional, mock_path_lookup, simple_dependency_resolver, extended_test_index) -> None: path_lookup = mock_path_lookup.change_directory(sample.path.parent) - sample.verify(path_lookup, simple_dependency_resolver) + sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) @pytest.mark.parametrize( @@ -211,15 +234,15 @@ def test_functional(sample: Functional, mock_path_lookup, simple_dependency_reso ("_child_that_uses_value_from_parent.py", "grand_parent_that_imports_parent_that_magic_runs_child.py"), ], ) -def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simple_dependency_resolver) -> None: +def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simple_dependency_resolver, extended_test_index) -> None: sample = Functional.for_child(child, parent) path_lookup = mock_path_lookup.change_directory(sample.path.parent) - sample.verify(path_lookup, simple_dependency_resolver) + sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) -@pytest.mark.skip(reason="Used for troubleshooting failing tests") -def test_one_functional(mock_path_lookup, simple_dependency_resolver): - path = mock_path_lookup.resolve(Path("functional/widgets.py")) +# @pytest.mark.skip(reason="Used for troubleshooting failing tests") +def test_one_functional(mock_path_lookup, simple_dependency_resolver, extended_test_index): + path = mock_path_lookup.resolve(Path("functional/table-migration/table-migration-notebook.sql")) path_lookup = mock_path_lookup.change_directory(path.parent) sample = Functional(path) - sample.verify(path_lookup, simple_dependency_resolver) + sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) From 224796dbda6824f32049bb4b2f98c439cba33c3b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:16:34 +0200 Subject: [PATCH 02/80] disable --- tests/unit/source_code/test_functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index 338ab99e8f..8b21891364 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -240,7 +240,7 @@ def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simpl sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) -# @pytest.mark.skip(reason="Used for troubleshooting failing tests") +@pytest.mark.skip(reason="Used for troubleshooting failing tests") def test_one_functional(mock_path_lookup, simple_dependency_resolver, extended_test_index): path = mock_path_lookup.resolve(Path("functional/table-migration/table-migration-notebook.sql")) path_lookup = mock_path_lookup.change_directory(path.parent) From b63cb19178b5142bdb6e3626029cba3a9a49c2e0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:16:55 +0200 Subject: [PATCH 03/80] more functional tests --- .../file-access/complex-sql-notebook.sql | 60 +++++++++++++++++ 
.../python-notebook-with-embedded-sql.py | 31 +++++++++ .../sql-notebook-with-embedded-python.sql | 18 +++++ .../table-migration-notebook.py | 35 ++++++++++ .../table-migration-notebook.sql | 67 +++++++++++++++++++ 5 files changed, 211 insertions(+) create mode 100644 tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py create mode 100644 tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql create mode 100644 tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py create mode 100644 tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql diff --git a/tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql b/tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql new file mode 100644 index 0000000000..d2ecb3cafd --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql @@ -0,0 +1,60 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC #Test notebook for DBFS discovery in Notebooks + +-- COMMAND ---------- +-- DBTITLE 1,A Python cell that references DBFS +-- MAGIC %python +-- ucx[dbfs-usage:+1:7:+1:18] Deprecated file system path: dbfs:/... +-- MAGIC DBFS = "dbfs:/..." +-- ucx[dbfs-usage:+1:7:+1:18] Deprecated file system path: /dbfs/mnt +-- MAGIC DBFS = "/dbfs/mnt" +-- ucx[dbfs-usage:+1:7:+1:14] Deprecated file system path: /mnt/ +-- MAGIC DBFS = "/mnt/" +-- ucx[dbfs-usage:+1:7:+1:18] Deprecated file system path: dbfs:/... +-- MAGIC DBFS = "dbfs:/..." +-- ucx[dbfs-usage:+1:10:+1:26] Deprecated file system path: /dbfs/mnt/data +-- MAGIC load_data('/dbfs/mnt/data') +-- MAGIC load_data('/data') +-- ucx[dbfs-usage:+1:10:+1:26] Deprecated file system path: /dbfs/mnt/data +-- MAGIC load_data('/dbfs/mnt/data', '/data') +-- MAGIC # load_data('/dbfs/mnt/data', '/data') +-- ucx[implicit-dbfs-usage:+2:0:+2:34] The use of default dbfs: references is deprecated: /mnt/foo/bar +-- ucx[dbfs-usage:+1:19:+1:33] Deprecated file system path: /mnt/foo/bar +-- MAGIC spark.read.parquet("/mnt/foo/bar") +-- ucx[direct-filesystem-access:+2:0:+2:39] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar +-- ucx[dbfs-usage:+1:19:+1:38] Deprecated file system path: dbfs:/mnt/foo/bar +-- MAGIC spark.read.parquet("dbfs:/mnt/foo/bar") +-- ucx[direct-filesystem-access:+2:0:+2:40] The use of direct filesystem references is deprecated: dbfs://mnt/foo/bar +-- ucx[dbfs-usage:+1:19:+1:39] Deprecated file system path: dbfs://mnt/foo/bar +-- MAGIC spark.read.parquet("dbfs://mnt/foo/bar") +-- MAGIC # Would need a stateful linter to detect this next one +-- MAGIC spark.read.parquet(DBFS) + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/... +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM parquet.`dbfs:/...` LIMIT 10 + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /mnt/... +-- DBTITLE 1,A SQL cell that references DBFS +SELECT * FROM delta.`/mnt/...` WHERE foo > 6 + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /a/b/c +-- DBTITLE 1,A SQL cell that references DBFS + SELECT * FROM json.`/a/b/c` WHERE foo > 6 + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /... 
+-- DBTITLE 1,A SQL cell that references DBFS + DELETE FROM json.`/...` WHERE foo = 'bar' + +-- COMMAND ---------- +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: /dbfs/... +-- DBTITLE 1,A SQL cell that references DBFS + +MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE + diff --git a/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py new file mode 100644 index 0000000000..e0b7504e09 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py @@ -0,0 +1,31 @@ +# Databricks notebook source +# MAGIC %md # This is a Python notebook, that has SQL cell embedded + +# COMMAND ---------- + +# ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g +# ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +display(spark.read.csv('/mnt/things/e/f/g')) + +# COMMAND ---------- + +# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/foo +# MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` + +# COMMAND ---------- + +# MAGIC %md mess around with formatting + + + + +# COMMAND ---------- + + +# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g +# MAGIC %sql +# MAGIC SELECT * FROM +# MAGIC csv.`dbfs:/mnt/bar/e/f/g` +# MAGIC WHERE _c1 > 5 + + diff --git a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql new file mode 100644 index 0000000000..d6bf93b291 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql @@ -0,0 +1,18 @@ +-- Databricks notebook source +-- MAGIC %md # This is a SQL notebook, that has Python cell embedded + +-- COMMAND ---------- + +-- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/whatever +SELECT * FROM csv.`dbfs:/mnt/whatever` + + + + + +-- COMMAND ---------- + +-- MAGIC %python +-- ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g +-- ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +-- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) diff --git a/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py new file mode 100644 index 0000000000..22c8325035 --- /dev/null +++ b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.py @@ -0,0 +1,35 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC #Test notebook for Use tracking in Notebooks + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:8:+2:29] Table people is migrated to cata4.nondefault.newpeople in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:8:+1:29] The default format changed in Databricks Runtime 8.0, from Parquet to Delta +display(spark.table('people')) # we are looking at default.people table + +# COMMAND ---------- + +# MAGIC %sql USE something + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:8:+2:30] Table persons is migrated to cata4.newsomething.persons in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:8:+1:30] The default format changed in 
Databricks Runtime 8.0, from Parquet to Delta +display(spark.table('persons')) # we are looking at something.persons table + +# COMMAND ---------- + +spark.sql('USE whatever') + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:8:+2:30] Table kittens is migrated to cata4.felines.toms in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:8:+1:30] The default format changed in Databricks Runtime 8.0, from Parquet to Delta +display(spark.table('kittens')) # we are looking at whatever.kittens table + +# COMMAND ---------- + +# ucx[table-migrated-to-uc:+2:0:+2:38] Table numbers is migrated to cata4.counting.numbers in Unity Catalog +# ucx[default-format-changed-in-dbr8:+1:0:+1:38] The default format changed in Databricks Runtime 8.0, from Parquet to Delta +spark.range(10).saveAsTable('numbers') # we are saving to whatever.numbers table. diff --git a/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql new file mode 100644 index 0000000000..d2c6acb4e8 --- /dev/null +++ b/tests/unit/source_code/samples/functional/table-migration/table-migration-notebook.sql @@ -0,0 +1,67 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC #Test notebook for Use tracking in Notebooks + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that changes the DB + +USE different_db + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table different_db.testtable is migrated to cata2.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references tables + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that changes the DB to one we migrate from + +USE old + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table old.testtable is migrated to cata3.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references tables + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table old.stuff is migrated to brand.new.things in Unity Catalog +-- DBTITLE 1,A SQL cell that references tables + +SELECT * FROM stuff LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A Python cell that uses calls to change the USE +-- MAGIC %python +-- MAGIC # This is a Python cell that uses calls to change the USE... 
+ +spark.sql("use different_db") + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table different_db.testtable is migrated to cata2.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table old.testtable is migrated to cata3.newspace.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM old.testtable LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that changes the DB to the default + +USE default + +-- COMMAND ---------- +-- ucx[table-migrated-to-uc:+0:0:+0:1024] Table default.testtable is migrated to cata.nondefault.table in Unity Catalog +-- DBTITLE 1,A SQL cell that references DBFS + +SELECT * FROM testtable LIMIT 10 + +-- COMMAND ---------- +-- DBTITLE 1,A SQL cell that references tables + +MERGE INTO catalog.schema.testtable t USING source ON t.key = source.key WHEN MATCHED THEN DELETE + From 6d911ca4c1cca79fe1653e97e32e50fbdaa41aae Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:17:29 +0200 Subject: [PATCH 04/80] move test to functional --- .../unit/source_code/test_notebook_linter.py | 534 ------------------ 1 file changed, 534 deletions(-) diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index a371a56ea5..17c00feee6 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -8,542 +8,8 @@ index = MigrationIndex([]) -@pytest.mark.parametrize( - "lang, source, expected", - [ - # 2 alerts - ( - Language.SQL, - """-- Databricks notebook source --- MAGIC %md # This is a SQL notebook, that has Python cell embedded - --- COMMAND ---------- - -SELECT * FROM csv.`dbfs:/mnt/whatever` - - - - - --- COMMAND ---------- - --- MAGIC %python --- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) -""", - [ - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/mnt/whatever', - start_line=5, - start_col=0, - end_line=5, - end_col=1024, - ), - Deprecation( - code='implicit-dbfs-usage', - message='The use of default dbfs: references is deprecated: /mnt/things/e/f/g', - start_line=14, - start_col=8, - end_line=14, - end_col=43, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/things/e/f/g', - start_line=14, - start_col=23, - end_line=14, - end_col=42, - ), - ], - ), - ( - Language.PYTHON, - # 3 alerts - """# Databricks notebook source -# MAGIC %md # This is a Python notebook, that has SQL cell embedded - -# COMMAND ---------- - -display(spark.read.csv('/mnt/things/e/f/g')) - -# COMMAND ---------- - -# MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` - -# COMMAND ---------- - -# MAGIC %md mess around with formatting - - - - -# COMMAND ---------- - - -# MAGIC %sql -# MAGIC SELECT * FROM -# MAGIC csv.`dbfs:/mnt/bar/e/f/g` -# MAGIC WHERE _c1 > 5 - - - -""", - [ - Deprecation( - code='implicit-dbfs-usage', - message='The use of default dbfs: references is deprecated: ' '/mnt/things/e/f/g', - start_line=5, - start_col=8, - end_line=5, - end_col=43, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/things/e/f/g', - start_line=5, - start_col=23, - end_line=5, - end_col=42, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/mnt/foo', - start_line=9, - start_col=0, - end_line=9, - end_col=1024, - ), - Deprecation( - 
code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g', - start_line=21, - start_col=0, - end_line=21, - end_col=1024, - ), - ], - ), - ( - Language.SQL, - """-- Databricks notebook source --- MAGIC %md --- MAGIC #Test notebook for DBFS discovery in Notebooks - --- COMMAND ---------- --- DBTITLE 1,A Python cell that references DBFS --- MAGIC %python --- MAGIC DBFS = "dbfs:/..." --- MAGIC DBFS = "/dbfs/mnt" --- MAGIC DBFS = "/mnt/" --- MAGIC DBFS = "dbfs:/..." --- MAGIC load_data('/dbfs/mnt/data') --- MAGIC load_data('/data') --- MAGIC load_data('/dbfs/mnt/data', '/data') --- MAGIC # load_data('/dbfs/mnt/data', '/data') --- MAGIC spark.read.parquet("/mnt/foo/bar") --- MAGIC spark.read.parquet("dbfs:/mnt/foo/bar") --- MAGIC spark.read.parquet("dbfs://mnt/foo/bar") --- MAGIC # Would need a stateful linter to detect this next one --- MAGIC spark.read.parquet(DBFS) - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM parquet.`dbfs:/...` LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS -SELECT * FROM delta.`/mnt/...` WHERE foo > 6 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - SELECT * FROM json.`/a/b/c` WHERE foo > 6 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - DELETE FROM json.`/...` WHERE foo = 'bar' - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE - """, - [ - Deprecation( - code='implicit-dbfs-usage', - message='The use of default dbfs: references is deprecated: /mnt/foo/bar', - start_line=15, - start_col=0, - end_line=15, - end_col=34, - ), - Deprecation( - code='direct-filesystem-access', - message='The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar', - start_line=16, - start_col=0, - end_line=16, - end_col=39, - ), - Deprecation( - code='direct-filesystem-access', - message='The use of direct filesystem references is deprecated: dbfs://mnt/foo/bar', - start_line=17, - start_col=0, - end_line=17, - end_col=40, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs:/...', - start_line=7, - start_col=7, - end_line=7, - end_col=18, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /dbfs/mnt', - start_line=8, - start_col=7, - end_line=8, - end_col=18, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/', - start_line=9, - start_col=7, - end_line=9, - end_col=14, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs:/...', - start_line=10, - start_col=7, - end_line=10, - end_col=18, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /dbfs/mnt/data', - start_line=11, - start_col=10, - end_line=11, - end_col=26, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /dbfs/mnt/data', - start_line=13, - start_col=10, - end_line=13, - end_col=26, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: /mnt/foo/bar', - start_line=15, - start_col=19, - end_line=15, - end_col=33, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs:/mnt/foo/bar', - start_line=16, - start_col=19, - end_line=16, - end_col=38, - ), - Deprecation( - code='dbfs-usage', - message='Deprecated file system path: dbfs://mnt/foo/bar', - start_line=17, - start_col=19, - end_line=17, - end_col=39, - 
), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: dbfs:/...', - start_line=22, - start_col=0, - end_line=22, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /mnt/...', - start_line=27, - start_col=0, - end_line=27, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /a/b/c', - start_line=31, - start_col=0, - end_line=31, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /...', - start_line=35, - start_col=0, - end_line=35, - end_col=1024, - ), - Deprecation( - code='dbfs-read-from-sql-query', - message='The use of DBFS is deprecated: /dbfs/...', - start_line=39, - start_col=0, - end_line=39, - end_col=1024, - ), - ], - ), - # Add more test cases here - ], -) -def test_notebook_linter(lang, source, expected, mock_path_lookup): - # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159 - # Hence SQL statement advice offsets can be wrong because of comments and statements - # over multiple lines. - linter = NotebookLinter.from_source(index, mock_path_lookup, CurrentSessionState(), source, lang) - assert linter is not None - gathered = list(linter.lint()) - assert gathered == expected - - def test_notebook_linter_name(mock_path_lookup): source = """-- Databricks notebook source""" linter = NotebookLinter.from_source(index, mock_path_lookup, CurrentSessionState(), source, Language.SQL) assert linter.name() == "notebook-linter" - -@pytest.mark.parametrize( - "lang, source, expected", - [ - ( - Language.SQL, - """-- Databricks notebook source --- MAGIC %md --- MAGIC #Test notebook for Use tracking in Notebooks - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that changes the DB - -USE different_db - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that changes the DB to one we migrate from - -USE old - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -SELECT * FROM stuff LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A Python cell that uses calls to change the USE --- MAGIC %python --- MAGIC # This is a Python cell that uses calls to change the USE... 
- -spark.sql("use different_db") - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM old.testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that changes the DB to the default - -USE default - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references DBFS - -SELECT * FROM testtable LIMIT 10 - --- COMMAND ---------- --- DBTITLE 1,A SQL cell that references tables - -MERGE INTO catalog.schema.testtable t USING source ON t.key = source.key WHEN MATCHED THEN DELETE - """, - [ - Deprecation( - code='table-migrated-to-uc', - message='Table different_db.testtable is migrated to cata2.newspace.table in Unity Catalog', - start_line=10, - start_col=0, - end_line=10, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table old.testtable is migrated to cata3.newspace.table in Unity Catalog', - start_line=20, - start_col=0, - end_line=20, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table old.stuff is migrated to brand.new.things in Unity Catalog', - start_line=25, - start_col=0, - end_line=25, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table different_db.testtable is migrated to ' 'cata2.newspace.table in Unity Catalog', - start_line=37, - start_col=0, - end_line=37, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table old.testtable is migrated to cata3.newspace.table in Unity Catalog', - start_line=42, - start_col=0, - end_line=42, - end_col=1024, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table default.testtable is migrated to cata.nondefault.table in Unity Catalog', - start_line=52, - start_col=0, - end_line=52, - end_col=1024, - ), - ], - ), - ( - Language.PYTHON, - """# Databricks notebook source ---- MAGIC %md --- MAGIC #Test notebook for Use tracking in Notebooks - -# COMMAND ---------- - -display(spark.table('people')) # we are looking at default.people table - -# COMMAND ---------- - -# MAGIC %sql USE something - -# COMMAND ---------- - -display(spark.table('persons')) # we are looking at something.persons table - -# COMMAND ---------- - -spark.sql('USE whatever') - -# COMMAND ---------- - -display(spark.table('kittens')) # we are looking at whatever.kittens table - -# COMMAND ---------- - -spark.range(10).saveAsTable('numbers') # we are saving to whatever.numbers table.""", - [ - Deprecation( - code='table-migrated-to-uc', - message='Table people is migrated to cata4.nondefault.newpeople in Unity Catalog', - start_line=6, - start_col=8, - end_line=6, - end_col=29, - ), - Advice( - code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=6, - start_col=8, - end_line=6, - end_col=29, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table persons is migrated to cata4.newsomething.persons in Unity Catalog', - start_line=14, - start_col=8, - end_line=14, - end_col=30, - ), - Advice( - code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=14, - start_col=8, - end_line=14, - end_col=30, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table kittens is migrated to cata4.felines.toms in Unity Catalog', - start_line=22, - start_col=8, - end_line=22, - end_col=30, - ), - Advice( - 
code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=22, - start_col=8, - end_line=22, - end_col=30, - ), - Deprecation( - code='table-migrated-to-uc', - message='Table numbers is migrated to cata4.counting.numbers in Unity Catalog', - start_line=26, - start_col=0, - end_line=26, - end_col=38, - ), - Advice( - code='default-format-changed-in-dbr8', - message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', - start_line=26, - start_col=0, - end_line=26, - end_col=38, - ), - ], - ), - ], -) -def test_notebook_linter_tracks_use(extended_test_index, lang, source, expected, mock_path_lookup): - linter = NotebookLinter.from_source(extended_test_index, mock_path_lookup, CurrentSessionState(), source, lang) - assert linter is not None - advices = list(linter.lint()) - assert advices == expected From 5f07dc83665fffa64fbae40f773958071cacf97f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:23:21 +0200 Subject: [PATCH 05/80] formatting --- tests/unit/source_code/test_functional.py | 15 +++++++++++---- tests/unit/source_code/test_notebook_linter.py | 4 +--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index 8b21891364..ed2f6fe6dc 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -11,7 +11,7 @@ import pytest -from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, MigrationStatus +from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState, is_a_notebook from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver from databricks.labs.ucx.source_code.linters.context import LinterContext @@ -66,6 +66,7 @@ def from_advice(cls, advice: Advice) -> Expectation: _UCX_REGEX_SUFFIX = r" ucx\[(?P[\w-]+):(?P[\d+]+):(?P[\d]+):(?P[\d+]+):(?P[\d]+)] (?P.*)" _STATE_REGEX_SUFFIX = r' ucx\[session-state] (?P\{.*})' + class Functional: _ucx_regex = { @@ -106,7 +107,9 @@ def __init__(self, path: Path, parent: Path | None = None) -> None: self.parent = parent self.language = CellLanguage.PYTHON if path.suffix.endswith("py") else CellLanguage.SQL - def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> None: + def verify( + self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex + ) -> None: expected_problems = list(self._expected_problems()) actual_advices = list(self._lint(path_lookup, dependency_resolver, migration_index)) # Convert the actual problems to the same type as our expected problems for easier comparison. 
@@ -128,7 +131,9 @@ def verify(self, path_lookup: PathLookup, dependency_resolver: DependencyResolve assert no_errors, "\n".join(errors) # TODO: output annotated file with comments for quick fixing - def _lint(self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex) -> Iterable[Advice]: + def _lint( + self, path_lookup: PathLookup, dependency_resolver: DependencyResolver, migration_index: MigrationIndex + ) -> Iterable[Advice]: session_state = self._test_session_state() print(str(session_state)) session_state.named_parameters = {"my-widget": "my-path.py"} @@ -234,7 +239,9 @@ def test_functional(sample: Functional, mock_path_lookup, simple_dependency_reso ("_child_that_uses_value_from_parent.py", "grand_parent_that_imports_parent_that_magic_runs_child.py"), ], ) -def test_functional_with_parent(child: str, parent: str, mock_path_lookup, simple_dependency_resolver, extended_test_index) -> None: +def test_functional_with_parent( + child: str, parent: str, mock_path_lookup, simple_dependency_resolver, extended_test_index +) -> None: sample = Functional.for_child(child, parent) path_lookup = mock_path_lookup.change_directory(sample.path.parent) sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index 17c00feee6..f4a7785317 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -1,8 +1,7 @@ -import pytest from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState +from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.notebooks.sources import NotebookLinter index = MigrationIndex([]) @@ -12,4 +11,3 @@ def test_notebook_linter_name(mock_path_lookup): source = """-- Databricks notebook source""" linter = NotebookLinter.from_source(index, mock_path_lookup, CurrentSessionState(), source, Language.SQL) assert linter.name() == "notebook-linter" - From a8f3ecdb13d0e18d2d8333e31ef2533e5497f7c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 30 Aug 2024 12:52:33 +0200 Subject: [PATCH 06/80] formatting --- .../functional/file-access/sql-notebook-with-embedded-python.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql index d6bf93b291..2a9361fad5 100644 --- a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql +++ b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql @@ -16,3 +16,4 @@ SELECT * FROM csv.`dbfs:/mnt/whatever` -- ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g -- ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g -- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) + From 505402fc9326ac594723ca34cac18b21269d90f6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:49:14 +0200 Subject: [PATCH 07/80] enhance dbfs linting to all direct file system access --- .../labs/ucx/source_code/linters/context.py | 6 +- .../labs/ucx/source_code/linters/dbfs.py | 124 ------------ 
.../labs/ucx/source_code/linters/dfsa.py | 176 ++++++++++++++++++ tests/unit/source_code/linters/test_dbfs.py | 130 ------------- tests/unit/source_code/linters/test_dfsa.py | 147 +++++++++++++++ 5 files changed, 326 insertions(+), 257 deletions(-) delete mode 100644 src/databricks/labs/ucx/source_code/linters/dbfs.py create mode 100644 src/databricks/labs/ucx/source_code/linters/dfsa.py delete mode 100644 tests/unit/source_code/linters/test_dbfs.py create mode 100644 tests/unit/source_code/linters/test_dfsa.py diff --git a/src/databricks/labs/ucx/source_code/linters/context.py b/src/databricks/labs/ucx/source_code/linters/context.py index 7b87b3f2c2..1106b85612 100644 --- a/src/databricks/labs/ucx/source_code/linters/context.py +++ b/src/databricks/labs/ucx/source_code/linters/context.py @@ -12,7 +12,7 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.dbfs import DbfsUsageSqlLinter, DBFSUsagePyLinter +from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter from databricks.labs.ucx.source_code.linters.pyspark import SparkSqlPyLinter @@ -40,12 +40,12 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe python_fixers.append(SparkSqlPyLinter(from_table, index, session_state)) python_linters += [ - DBFSUsagePyLinter(session_state), + DfsaPyLinter(session_state), DBRv8d0PyLinter(dbr_version=session_state.dbr_version), SparkConnectPyLinter(session_state), DbutilsPyLinter(session_state), ] - sql_linters.append(DbfsUsageSqlLinter()) + sql_linters.append(DfsaSqlLinter()) self._linters: dict[Language, list[SqlLinter] | list[PythonLinter]] = { Language.PYTHON: python_linters, diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py deleted file mode 100644 index 46a617fafc..0000000000 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ /dev/null @@ -1,124 +0,0 @@ -import logging -from collections.abc import Iterable - -from astroid import Call, Const, InferenceError, NodeNG # type: ignore -from sqlglot import Expression -from sqlglot.expressions import Table - -from databricks.labs.ucx.source_code.base import ( - Advice, - Deprecation, - CurrentSessionState, - PythonLinter, - SqlLinter, -) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue - -logger = logging.getLogger(__name__) - - -class DetectDbfsVisitor(TreeVisitor): - """ - Visitor that detects file system paths in Python code and checks them - against a list of known deprecated paths. - """ - - def __init__(self, session_state: CurrentSessionState) -> None: - self._session_state = session_state - self._advices: list[Advice] = [] - self._fs_prefixes = ["/dbfs/mnt", "dbfs:/", "/mnt/"] - self._reported_locations: set[tuple[int, int]] = set() # Set to store reported locations; astroid coordinates! 
- - def visit_call(self, node: Call): - for arg in node.args: - self._visit_arg(arg) - - def _visit_arg(self, arg: NodeNG): - try: - for inferred in InferredValue.infer_from_node(arg, self._session_state): - if not inferred.is_inferred(): - logger.debug(f"Could not infer value of {arg.as_string()}") - continue - self._check_str_constant(arg, inferred) - except InferenceError as e: - logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e) - - def visit_const(self, node: Const): - # Constant strings yield Advisories - if isinstance(node.value, str): - self._check_str_constant(node, InferredValue([node])) - - def _check_str_constant(self, source_node, inferred: InferredValue): - if self._already_reported(source_node, inferred): - return - value = inferred.as_string() - if any(value.startswith(prefix) for prefix in self._fs_prefixes): - advisory = Deprecation.from_node( - code='dbfs-usage', - message=f"Deprecated file system path: {value}", - node=source_node, - ) - self._advices.append(advisory) - - def _already_reported(self, source_node: NodeNG, inferred: InferredValue): - all_nodes = [source_node] - all_nodes.extend(inferred.nodes) - reported = any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes) - for node in all_nodes: - self._reported_locations.add((node.lineno, node.col_offset)) - return reported - - def get_advices(self) -> Iterable[Advice]: - yield from self._advices - - -class DBFSUsagePyLinter(PythonLinter): - - def __init__(self, session_state: CurrentSessionState): - self._session_state = session_state - - @staticmethod - def name() -> str: - """ - Returns the name of the linter, for reporting etc - """ - return 'dbfs-usage' - - def lint_tree(self, tree: Tree) -> Iterable[Advice]: - """ - Lints the code looking for file system paths that are deprecated - """ - visitor = DetectDbfsVisitor(self._session_state) - visitor.visit(tree.node) - yield from visitor.get_advices() - - -class DbfsUsageSqlLinter(SqlLinter): - def __init__(self): - self._dbfs_prefixes = ["/dbfs/mnt", "dbfs:/", "/mnt/", "/dbfs/", "/"] - - @staticmethod - def name() -> str: - return 'dbfs-query' - - def lint_expression(self, expression: Expression): - for table in expression.find_all(Table): - # Check table names for deprecated DBFS table names - yield from self._check_dbfs_folder(table) - - def _check_dbfs_folder(self, table: Table) -> Iterable[Advice]: - """ - Check if the table is a DBFS table or reference in some way - and yield a deprecation message if it is - """ - if any(table.name.startswith(prefix) for prefix in self._dbfs_prefixes): - yield Deprecation( - code='dbfs-read-from-sql-query', - message=f"The use of DBFS is deprecated: {table.name}", - # SQLGlot does not propagate tokens yet. 
See https://github.com/tobymao/sqlglot/issues/3159 - start_line=0, - start_col=0, - end_line=0, - end_col=1024, - ) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py new file mode 100644 index 0000000000..a1c131c7f8 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -0,0 +1,176 @@ +from dataclasses import dataclass +import logging +from abc import ABC +from collections.abc import Iterable + +from astroid import Call, Const, InferenceError, NodeNG # type: ignore +from sqlglot import Expression +from sqlglot.expressions import Table + +from databricks.labs.ucx.source_code.base import ( + Advice, + Deprecation, + CurrentSessionState, + PythonLinter, + SqlLinter, +) +from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor +from databricks.labs.ucx.source_code.linters.python_infer import InferredValue + +logger = logging.getLogger(__name__) + + +class DFSAPattern(ABC): + + def __init__(self, prefix: str, allowed_roots: list[str]): + self._prefix = prefix + self._allowed_roots = allowed_roots + + def matches(self, value: str) -> bool: + return value.startswith(self._prefix) and not self._matches_allowed_root(value) + + def _matches_allowed_root(self, value: str): + return any(value.startswith(f"{self._prefix}/{root}") for root in self._allowed_roots) + + +class RootPattern(DFSAPattern): + + def _matches_allowed_root(self, value: str): + return any(value.startswith(f"/{root}") for root in self._allowed_roots) + + +# the below aims to implement https://docs.databricks.com/en/files/index.html +DFSA_PATTERNS = [ + DFSAPattern("dbfs:/", []), + DFSAPattern("file:/", ["Workspace/", "tmp/"]), + DFSAPattern("s3:/", []), + DFSAPattern("s3n:/", []), + DFSAPattern("s3a:/", []), + DFSAPattern("wasb:/", []), + DFSAPattern("wasbs:/", []), + DFSAPattern("abfs:/", []), + DFSAPattern("abfss:/", []), + DFSAPattern("hdfs:/", []), + DFSAPattern("/mnt/", []), + RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), +] + + +@dataclass +class DFSA: + """A DFSA is a record describing a Direct File System Access""" + + path: str + + +@dataclass +class DFSANode: + dfsa: DFSA + node: NodeNG + + +class _DetectDfsaVisitor(TreeVisitor): + """ + Visitor that detects file system paths in Python code and checks them + against a list of known deprecated paths. 
+ """ + + def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: + self._session_state = session_state + self._dfsa_nodes: list[DFSANode] = [] + self._reported_locations: set[tuple[int, int]] = set() + self._allow_spark_duplicates = allow_spark_duplicates + + def visit_call(self, node: Call): + for arg in node.args: + self._visit_arg(arg) + + def _visit_arg(self, arg: NodeNG): + try: + for inferred in InferredValue.infer_from_node(arg, self._session_state): + if not inferred.is_inferred(): + logger.debug(f"Could not infer value of {arg.as_string()}") + continue + self._check_str_constant(arg, inferred) + except InferenceError as e: + logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e) + + def visit_const(self, node: Const): + # Constant strings yield Advisories + if isinstance(node.value, str): + self._check_str_constant(node, InferredValue([node])) + + def _check_str_constant(self, source_node, inferred: InferredValue): + if self._already_reported(source_node, inferred): + return + # avoid duplicate advices that are reported by SparkSqlPyLinter + if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: + return + value = inferred.as_string() + if any(pattern.matches(value) for pattern in DFSA_PATTERNS): + self._dfsa_nodes.append(DFSANode(DFSA(value), source_node)) + self._reported_locations.add((source_node.lineno, source_node.col_offset)) + + def _already_reported(self, source_node: NodeNG, inferred: InferredValue): + all_nodes = [source_node] + inferred.nodes + return any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes) + + @property + def dfsa_nodes(self): + return self._dfsa_nodes + + +class DfsaPyLinter(PythonLinter): + + def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): + self._session_state = session_state + self._allow_spark_duplicates = allow_spark_duplicates + + @staticmethod + def name() -> str: + """ + Returns the name of the linter, for reporting etc + """ + return 'dfsa-usage' + + def lint_tree(self, tree: Tree) -> Iterable[Advice]: + """ + Lints the code looking for file system paths that are deprecated + """ + visitor = _DetectDfsaVisitor(self._session_state, self._allow_spark_duplicates) + visitor.visit(tree.node) + for dfsa_node in visitor.dfsa_nodes: + advisory = Deprecation.from_node( + code='direct-filesystem-access', + message=f"The use of direct filesystem references is deprecated: {dfsa_node.dfsa.path}", + node=dfsa_node.node, + ) + yield advisory + + +class DfsaSqlLinter(SqlLinter): + + @staticmethod + def name() -> str: + return 'dfsa-query' + + def lint_expression(self, expression: Expression): + for table in expression.find_all(Table): + # Check table names for direct file system access + yield from self._check_dfsa(table) + + def _check_dfsa(self, table: Table) -> Iterable[Advice]: + """ + Check if the table is a DBFS table or reference in some way + and yield a deprecation message if it is + """ + if any(pattern.matches(table.name) for pattern in DFSA_PATTERNS): + yield Deprecation( + code='direct-filesystem-access-in-sql-query', + message=f"The use of direct filesystem references is deprecated: {table.name}", + # SQLGlot does not propagate tokens yet. 
See https://github.com/tobymao/sqlglot/issues/3159 + start_line=0, + start_col=0, + end_line=0, + end_col=1024, + ) diff --git a/tests/unit/source_code/linters/test_dbfs.py b/tests/unit/source_code/linters/test_dbfs.py deleted file mode 100644 index b71113b0a8..0000000000 --- a/tests/unit/source_code/linters/test_dbfs.py +++ /dev/null @@ -1,130 +0,0 @@ -import pytest - -from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure -from databricks.labs.ucx.source_code.linters.dbfs import DBFSUsagePyLinter, DbfsUsageSqlLinter - - -class TestDetectDBFS: - @pytest.mark.parametrize( - "code, expected", - [ - ('SOME_CONSTANT = "not a file system path"', 0), - ('SOME_CONSTANT = ("/dbfs/mnt", "dbfs:/", "/mnt/")', 3), - ('# "/dbfs/mnt"', 0), - ('SOME_CONSTANT = "/dbfs/mnt"', 1), - ('SOME_CONSTANT = "/dbfs/mnt"; load_data(SOME_CONSTANT)', 1), - ('SOME_CONSTANT = 42; load_data(SOME_CONSTANT)', 0), - ], - ) - def test_detects_dbfs_paths(self, code, expected): - linter = DBFSUsagePyLinter(CurrentSessionState()) - advices = list(linter.lint(code)) - for advice in advices: - assert isinstance(advice, Advice) - assert len(advices) == expected - - @pytest.mark.parametrize( - "code, expected", - [ - ("load_data('/dbfs/mnt/data')", 1), - ("load_data('/data')", 0), - ("load_data('/dbfs/mnt/data', '/data')", 1), - ("# load_data('/dbfs/mnt/data', '/data')", 0), - ('spark.read.parquet("/mnt/foo/bar")', 1), - ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), - ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), - ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), - ( - """ -DBFS1="dbfs:/mnt/foo/bar1" -systems=[DBFS1, "dbfs:/mnt/foo/bar2"] -for system in systems: - spark.read.parquet(system) -""", - 2, - ), - ], - ) - def test_dbfs_usage_linter(self, code, expected): - linter = DBFSUsagePyLinter(CurrentSessionState()) - advices = linter.lint(code) - count = 0 - for advice in advices: - if isinstance(advice, Deprecation): - count += 1 - assert count == expected - - def test_dbfs_name(self): - linter = DBFSUsagePyLinter(CurrentSessionState()) - assert linter.name() == "dbfs-usage" - - -@pytest.mark.parametrize( - "query", - [ - "SELECT * FROM old.things LEFT JOIN hive_metastore.other.matters USING (x) WHERE state > 1 LIMIT 10", - "SELECT * FROM json.`s3a://abc/d/e/f`", - "SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", - "SELECT * FROM delta.`s3a://foo/bar`", - # Make sure non-sql doesn't just fail - "print('hello')", - "", - ], -) -def test_non_dbfs_trigger_nothing(query): - ftf = DbfsUsageSqlLinter() - assert not list(ftf.lint(query)) - - -@pytest.mark.parametrize( - "query, table", - [ - ('SELECT * FROM parquet.`dbfs:/...` LIMIT 10', "dbfs:/..."), - ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), - ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), - ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), - ( - "MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", - "/dbfs/...", - ), - ], -) -def test_dbfs_tables_trigger_messages_param(query: str, table: str): - ftf = DbfsUsageSqlLinter() - actual = list(ftf.lint(query)) - assert actual == [ - Deprecation( - code='dbfs-read-from-sql-query', - message=f'The use of DBFS is deprecated: {table}', - start_line=0, - start_col=0, - end_line=0, - end_col=1024, - ), - ] - - -@pytest.mark.parametrize( - "query", - [ - 'SELECT * FROM {{some_db.some_table}}', - ], -) -def test_dbfs_queries_failure(query: str): - ftf = DbfsUsageSqlLinter() - actual = list(ftf.lint(query)) 
- assert actual == [ - Failure( - code='sql-parse-error', - message=f'SQL expression is not supported yet: {query}', - start_line=0, - start_col=0, - end_line=0, - end_col=1024, - ), - ] - - -def test_dbfs_queries_name(): - ftf = DbfsUsageSqlLinter() - assert ftf.name() == 'dbfs-query' diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_dfsa.py new file mode 100644 index 0000000000..e94f1b1192 --- /dev/null +++ b/tests/unit/source_code/linters/test_dfsa.py @@ -0,0 +1,147 @@ +import pytest + +from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure +from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter, DFSA_PATTERNS + + +@pytest.mark.parametrize( + "path, matches", + [ + ("/mnt/foo/bar", True), + ("dbfs:/mnt/foo/bar", True), + ("s3a://bucket1/folder1", True), + ("/dbfs/mnt/foo/bar", True), + ("/tmp/foo", False), + ("table.we.know.nothing.about", False), + ], +) +def test_matches_dfsa_pattern(path, matches): + """see https://github.com/databrickslabs/ucx/issues/2350""" + matched = any(pattern.matches(path) for pattern in DFSA_PATTERNS) + assert matches == matched + + +@pytest.mark.parametrize( + "code, expected", + [ + ('SOME_CONSTANT = "not a file system path"', 0), + ('SOME_CONSTANT = ("/dbfs/mnt", "dbfs:/", "/mnt/")', 3), + ('# "/dbfs/mnt"', 0), + ('SOME_CONSTANT = "/dbfs/mnt"', 1), + ('SOME_CONSTANT = "/dbfs/mnt"; load_data(SOME_CONSTANT)', 1), + ('SOME_CONSTANT = 42; load_data(SOME_CONSTANT)', 0), + ], +) +def test_detects_dfsa_paths(code, expected): + linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + advices = list(linter.lint(code)) + for advice in advices: + assert isinstance(advice, Advice) + assert len(advices) == expected + +@pytest.mark.parametrize( + "code, expected", + [ + ("load_data('/dbfs/mnt/data')", 1), + ("load_data('/data')", 1), + ("load_data('/dbfs/mnt/data', '/data')", 2), + ("# load_data('/dbfs/mnt/data', '/data')", 0), + ('spark.read.parquet("/mnt/foo/bar")', 1), + ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), + ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), + ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), + (""" +DBFS1="dbfs:/mnt/foo/bar1" +systems=[DBFS1, "dbfs:/mnt/foo/bar2"] +for system in systems: + spark.read.parquet(system) +""", + 2, + ), + ], + ) +def test_dfsa_usage_linter(code, expected): + linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + advices = linter.lint(code) + count = 0 + for advice in advices: + if isinstance(advice, Deprecation): + count += 1 + assert count == expected + + +def test_dfsa_name(): + linter = DfsaPyLinter(CurrentSessionState()) + assert linter.name() == "dfsa-usage" + + +@pytest.mark.parametrize( + "query", + [ + "SELECT * FROM old.things LEFT JOIN hive_metastore.other.matters USING (x) WHERE state > 1 LIMIT 10", + # Make sure non-sql doesn't just fail + "print('hello')", + "", + ], +) +def test_non_dfsa_triggers_nothing(query): + ftf = DfsaSqlLinter() + assert not list(ftf.lint(query)) + + +@pytest.mark.parametrize( + "query, table", + [ + ('SELECT * FROM parquet.`dbfs:/...` LIMIT 10', "dbfs:/..."), + ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), + ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), + ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), + ( + "MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", + "/dbfs/...", + ), + ("SELECT * FROM json.`s3a://abc/d/e/f`", 
"s3a://abc/d/e/f"), + ("SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", "s3a://abc/d/e/f"), + ("SELECT * FROM delta.`s3a://foo/bar`", "s3a://foo/bar"), + ("SELECT * FROM csv.`dbfs:/mnt/foo`", "dbfs:/mnt/foo"), + ], +) +def test_dfsa_tables_trigger_messages_param(query: str, table: str): + ftf = DfsaSqlLinter() + actual = list(ftf.lint(query)) + assert actual == [ + Deprecation( + code='direct-filesystem-access-in-sql-query', + message=f'The use of direct filesystem references is deprecated: {table}', + start_line=0, + start_col=0, + end_line=0, + end_col=1024, + ), + ] + + +@pytest.mark.parametrize( + "query", + [ + 'SELECT * FROM {{some_db.some_table}}', + ], +) +def test_dfsa_queries_failure(query: str): + ftf = DfsaSqlLinter() + actual = list(ftf.lint(query)) + assert actual == [ + Failure( + code='sql-parse-error', + message=f'SQL expression is not supported yet: {query}', + start_line=0, + start_col=0, + end_line=0, + end_col=1024, + ), + ] + + +def test_dfsa_queries_name(): + ftf = DfsaSqlLinter() + assert ftf.name() == 'dfsa-query' From 992ffe7e81eedad8c03f82bb3781e23a6bc8fb20 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:49:57 +0200 Subject: [PATCH 08/80] use dfsa for pyspark --- .../labs/ucx/source_code/linters/pyspark.py | 47 +++++++------------ .../unit/source_code/linters/test_pyspark.py | 8 ++-- 2 files changed, 20 insertions(+), 35 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 1b3e51d67b..1f1eae7fce 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -1,3 +1,4 @@ +import logging from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator from dataclasses import dataclass @@ -12,11 +13,15 @@ CurrentSessionState, PythonLinter, ) +from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS from databricks.labs.ucx.source_code.linters.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +logger = logging.getLogger(__name__) + + @dataclass class Matcher(ABC): method_name: str @@ -178,18 +183,6 @@ def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Cal @dataclass class DirectFilesystemAccessMatcher(Matcher): - _DIRECT_FS_REFS = { - "s3a://", - "s3n://", - "s3://", - "wasb://", - "wasbs://", - "abfs://", - "abfss://", - "dbfs:/", - "hdfs://", - "file:/", - } def matches(self, node: NodeNG): return ( @@ -203,25 +196,17 @@ def lint( self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: NodeNG ) -> Iterator[Advice]: table_arg = self._get_table_arg(node) - if not isinstance(table_arg, Const): - return - if not table_arg.value: - return - if not isinstance(table_arg.value, str): - return - if any(table_arg.value.startswith(prefix) for prefix in self._DIRECT_FS_REFS): - yield Deprecation.from_node( - code='direct-filesystem-access', - message=f"The use of direct filesystem references is deprecated: {table_arg.value}", - node=node, - ) - return - if table_arg.value.startswith("/") and self._check_call_context(node): - yield Deprecation.from_node( - code='implicit-dbfs-usage', - message=f"The use of default dbfs: references is deprecated: {table_arg.value}", - node=node, - ) + for inferred in InferredValue.infer_from_node(table_arg): + if not 
inferred.is_inferred(): + logger.debug(f"Could not infer value of {table_arg.as_string()}") + continue + value = inferred.as_string() + if any(pattern.matches(value) for pattern in DFSA_PATTERNS): + yield Deprecation.from_node( + code='direct-filesystem-access', + message=f"The use of direct filesystem references is deprecated: {value}", + node=node, + ) def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None: # No transformations to apply diff --git a/tests/unit/source_code/linters/test_pyspark.py b/tests/unit/source_code/linters/test_pyspark.py index cda1997e06..cbb2dd5b15 100644 --- a/tests/unit/source_code/linters/test_pyspark.py +++ b/tests/unit/source_code/linters/test_pyspark.py @@ -346,8 +346,8 @@ def test_spark_sql_fix(migration_index): """spark.read.load("/bucket/path")""", [ Deprecation( - code='implicit-dbfs-usage', - message="The use of default dbfs: references is deprecated: /bucket/path", + code='direct-filesystem-access', + message="The use of direct filesystem references is deprecated: /bucket/path", start_line=0, start_col=0, end_line=0, @@ -559,12 +559,12 @@ def test_spark_cloud_direct_access(empty_index, code, expected): @pytest.mark.parametrize("fs_function", FS_FUNCTIONS) -def test_direct_cloud_access_reports_nothing(empty_index, fs_function): +def test_direct_cloud_access_to_tmp_reports_nothing(empty_index, fs_function): session_state = CurrentSessionState() ftf = FromTableSqlLinter(empty_index, session_state) sqf = SparkSqlPyLinter(ftf, empty_index, session_state) # ls function calls have to be from dbutils.fs, or we ignore them - code = f"""spark.{fs_function}("/bucket/path")""" + code = f"""spark.{fs_function}("/tmp/bucket/path")""" advisories = list(sqf.lint(code)) assert not advisories From 830b0269da399921ddaf6078b202c0b1bf56bf9a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:50:28 +0200 Subject: [PATCH 09/80] fix duplicate advice --- src/databricks/labs/ucx/source_code/linters/python_ast.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index 00baad629f..7ce7228dd9 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -179,6 +179,8 @@ def is_from_module(self, module_name: str) -> bool: return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) if isinstance(self._node, Attribute): return Tree(self._node.expr).is_from_module(module_name) + if isinstance(self._node, Const): + return Tree(self._node.parent).is_from_module(module_name) return False def has_global(self, name: str) -> bool: From a1e15e71a651cfcda593bf29d034ed38ee79303c Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:50:44 +0200 Subject: [PATCH 10/80] fix functional tests --- .../samples/functional/file-access/direct-fs.py | 7 +++---- .../file-access/python-notebook-with-embedded-sql.py | 7 +++---- .../file-access/sql-notebook-with-embedded-python.sql | 5 ++--- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/unit/source_code/samples/functional/file-access/direct-fs.py b/tests/unit/source_code/samples/functional/file-access/direct-fs.py index 6815bb9627..251c6f6072 100644 --- a/tests/unit/source_code/samples/functional/file-access/direct-fs.py +++ b/tests/unit/source_code/samples/functional/file-access/direct-fs.py @@ -3,13 +3,12 @@ # COMMAND ---------- 
-# ucx[dbfs-usage:+2:23:+2:42] Deprecated file system path: /mnt/things/e/f/g -# ucx[implicit-dbfs-usage:+1:8:+1:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g +# ucx[direct-filesystem-access:+1:8:+1:43] The use of direct filesystem references is deprecated: /mnt/things/e/f/g display(spark.read.csv('/mnt/things/e/f/g')) # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/foo +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/foo # MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` # COMMAND ---------- @@ -18,7 +17,7 @@ # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/bar/e/f/g # MAGIC %sql # MAGIC SELECT * FROM # MAGIC csv.`dbfs:/mnt/bar/e/f/g` diff --git a/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py index e0b7504e09..be071b69d7 100644 --- a/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py +++ b/tests/unit/source_code/samples/functional/file-access/python-notebook-with-embedded-sql.py @@ -3,13 +3,12 @@ # COMMAND ---------- -# ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g -# ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +# ucx[direct-filesystem-access:+1:8:+1:43] The use of direct filesystem references is deprecated: /mnt/things/e/f/g display(spark.read.csv('/mnt/things/e/f/g')) # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/foo +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/foo # MAGIC %sql SELECT * FROM csv.`dbfs:/mnt/foo` # COMMAND ---------- @@ -22,7 +21,7 @@ # COMMAND ---------- -# ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/bar/e/f/g +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs:/mnt/bar/e/f/g # MAGIC %sql # MAGIC SELECT * FROM # MAGIC csv.`dbfs:/mnt/bar/e/f/g` diff --git a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql index 2a9361fad5..4de8f4adb5 100644 --- a/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql +++ b/tests/unit/source_code/samples/functional/file-access/sql-notebook-with-embedded-python.sql @@ -3,7 +3,7 @@ -- COMMAND ---------- --- ucx[dbfs-read-from-sql-query:+0:0:+0:1024] The use of DBFS is deprecated: dbfs:/mnt/whatever +-- ucx[direct-file-system-access-in-sql-query:+0:0:+0:1024] The use of direct file system access is deprecated: dbfs:/mnt/whatever SELECT * FROM csv.`dbfs:/mnt/whatever` @@ -13,7 +13,6 @@ SELECT * FROM csv.`dbfs:/mnt/whatever` -- COMMAND ---------- -- MAGIC %python --- ucx[implicit-dbfs-usage:+2:8:+2:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g --- ucx[dbfs-usage:+1:23:+1:42] Deprecated file system path: /mnt/things/e/f/g +-- ucx[direct-file-system-access:+0:0:+0:1024] The use of direct file system 
access is deprecated: /mnt/things/e/f/g -- MAGIC display(spark.read.csv('/mnt/things/e/f/g')) From eb566364874dd103ee529c680a8e0faf831f4342 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 16:56:23 +0200 Subject: [PATCH 11/80] formatting --- tests/unit/source_code/linters/test_dfsa.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_dfsa.py index e94f1b1192..11cb5a1d08 100644 --- a/tests/unit/source_code/linters/test_dfsa.py +++ b/tests/unit/source_code/linters/test_dfsa.py @@ -39,27 +39,29 @@ def test_detects_dfsa_paths(code, expected): assert isinstance(advice, Advice) assert len(advices) == expected + @pytest.mark.parametrize( "code, expected", [ ("load_data('/dbfs/mnt/data')", 1), - ("load_data('/data')", 1), - ("load_data('/dbfs/mnt/data', '/data')", 2), + ("load_data('/data')", 1), + ("load_data('/dbfs/mnt/data', '/data')", 2), ("# load_data('/dbfs/mnt/data', '/data')", 0), ('spark.read.parquet("/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), - (""" + ( + """ DBFS1="dbfs:/mnt/foo/bar1" systems=[DBFS1, "dbfs:/mnt/foo/bar2"] for system in systems: spark.read.parquet(system) """, - 2, - ), - ], - ) + 2, + ), + ], +) def test_dfsa_usage_linter(code, expected): linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) From b8e6b826f7e43ddd2ce4dfce0aafa0daa05fd7a9 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:46:19 +0200 Subject: [PATCH 12/80] move python parsing code under dedicated package --- src/databricks/labs/ucx/source_code/base.py | 2 +- src/databricks/labs/ucx/source_code/graph.py | 2 +- src/databricks/labs/ucx/source_code/jobs.py | 2 +- src/databricks/labs/ucx/source_code/linters/dbfs.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/files.py | 2 +- src/databricks/labs/ucx/source_code/linters/imports.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/pyspark.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/spark_connect.py | 2 +- src/databricks/labs/ucx/source_code/linters/table_creation.py | 2 +- src/databricks/labs/ucx/source_code/notebooks/cells.py | 2 +- src/databricks/labs/ucx/source_code/notebooks/sources.py | 2 +- src/databricks/labs/ucx/source_code/python/__init__.py | 0 .../labs/ucx/source_code/{linters => python}/python_ast.py | 0 .../labs/ucx/source_code/{linters => python}/python_infer.py | 2 +- tests/integration/source_code/message_codes.py | 2 +- tests/unit/source_code/linters/test_pyspark.py | 2 +- tests/unit/source_code/linters/test_python_imports.py | 2 +- tests/unit/source_code/linters/test_spark_connect.py | 2 +- tests/unit/source_code/notebooks/test_cells.py | 2 +- tests/unit/source_code/python/__init__.py | 0 tests/unit/source_code/{linters => python}/test_python_ast.py | 4 ++-- .../unit/source_code/{linters => python}/test_python_infer.py | 4 ++-- tests/unit/source_code/test_notebook.py | 2 +- 23 files changed, 25 insertions(+), 25 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/python/__init__.py rename src/databricks/labs/ucx/source_code/{linters => python}/python_ast.py (100%) rename src/databricks/labs/ucx/source_code/{linters => python}/python_infer.py (99%) create mode 100644 tests/unit/source_code/python/__init__.py rename tests/unit/source_code/{linters => python}/test_python_ast.py (97%) 
rename tests/unit/source_code/{linters => python}/test_python_infer.py (97%) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 57acaf56c4..e00bc86847 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -15,7 +15,7 @@ from databricks.sdk.service.workspace import Language from databricks.labs.blueprint.paths import WorkspacePath -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree # Code mapping between LSP, PyLint, and our own diagnostics: # | LSP | PyLint | Our | diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 2d47649839..4841fe904c 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -11,7 +11,7 @@ NodeNG, ) from databricks.labs.ucx.source_code.base import Advisory, CurrentSessionState, is_a_notebook -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.Logger(__name__) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 4a6274f31c..feab7bc1d0 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -31,7 +31,7 @@ DependencyGraphWalker, ) from databricks.labs.ucx.source_code.linters.context import LinterContext -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.sources import FileLinter from databricks.labs.ucx.source_code.path_lookup import PathLookup diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py index 46a617fafc..06941ef97b 100644 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ b/src/databricks/labs/ucx/source_code/linters/dbfs.py @@ -12,8 +12,8 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor +from databricks.labs.ucx.source_code.python.python_infer import InferredValue logger = logging.getLogger(__name__) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 31d4a7faa6..d1eb2c8a27 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -7,7 +7,7 @@ from typing import TextIO from databricks.labs.ucx.source_code.base import LocatedAdvice, CurrentSessionState, file_language, is_a_notebook -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader from databricks.labs.ucx.source_code.notebooks.sources import FileLinter from databricks.labs.ucx.source_code.path_lookup import PathLookup diff --git a/src/databricks/labs/ucx/source_code/linters/imports.py b/src/databricks/labs/ucx/source_code/linters/imports.py index 26a1258dff..0cbb79e5a1 100644 --- 
a/src/databricks/labs/ucx/source_code/linters/imports.py +++ b/src/databricks/labs/ucx/source_code/linters/imports.py @@ -18,8 +18,8 @@ ) from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState, PythonLinter -from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, TreeVisitor +from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger(__name__) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 1b3e51d67b..f30e50cb91 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -12,9 +12,9 @@ CurrentSessionState, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @dataclass diff --git a/src/databricks/labs/ucx/source_code/linters/spark_connect.py b/src/databricks/labs/ucx/source_code/linters/spark_connect.py index f02e254406..1d79e0cb29 100644 --- a/src/databricks/labs/ucx/source_code/linters/spark_connect.py +++ b/src/databricks/labs/ucx/source_code/linters/spark_connect.py @@ -11,7 +11,7 @@ ) from databricks.sdk.service.compute import DataSecurityMode -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @dataclass diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 4720944a99..4c27865016 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -9,7 +9,7 @@ Advice, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @dataclass diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 5d8aa5ac56..dc0d90870a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -30,7 +30,7 @@ NotebookRunCall, UnresolvedPath, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 2c17476931..c2cffde9c4 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -35,7 +35,7 @@ SysPathChange, UnresolvedPath, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, 
NodeBase +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, Cell, diff --git a/src/databricks/labs/ucx/source_code/python/__init__.py b/src/databricks/labs/ucx/source_code/python/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py similarity index 100% rename from src/databricks/labs/ucx/source_code/linters/python_ast.py rename to src/databricks/labs/ucx/source_code/python/python_ast.py diff --git a/src/databricks/labs/ucx/source_code/linters/python_infer.py b/src/databricks/labs/ucx/source_code/python/python_infer.py similarity index 99% rename from src/databricks/labs/ucx/source_code/linters/python_infer.py rename to src/databricks/labs/ucx/source_code/python/python_infer.py index 073ab362a6..2ed7929260 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_infer.py +++ b/src/databricks/labs/ucx/source_code/python/python_infer.py @@ -21,7 +21,7 @@ from astroid.exceptions import InferenceError # type: ignore from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree logger = logging.getLogger(__name__) diff --git a/tests/integration/source_code/message_codes.py b/tests/integration/source_code/message_codes.py index f07a049cb2..f118b4e871 100644 --- a/tests/integration/source_code/message_codes.py +++ b/tests/integration/source_code/message_codes.py @@ -2,7 +2,7 @@ from databricks.labs.blueprint.wheels import ProductInfo from databricks.labs.ucx.source_code.base import Advice -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree def main(): diff --git a/tests/unit/source_code/linters/test_pyspark.py b/tests/unit/source_code/linters/test_pyspark.py index cda1997e06..e176a4ae9e 100644 --- a/tests/unit/source_code/linters/test_pyspark.py +++ b/tests/unit/source_code/linters/test_pyspark.py @@ -3,7 +3,7 @@ from astroid import Call, Const, Expr # type: ignore from databricks.labs.ucx.source_code.base import Deprecation, CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper from databricks.labs.ucx.source_code.linters.pyspark import TableNameMatcher, SparkSqlPyLinter from databricks.labs.ucx.source_code.queries import FromTableSqlLinter diff --git a/tests/unit/source_code/linters/test_python_imports.py b/tests/unit/source_code/linters/test_python_imports.py index 1c51edadb4..27d44a3482 100644 --- a/tests/unit/source_code/linters/test_python_imports.py +++ b/tests/unit/source_code/linters/test_python_imports.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.source_code.linters.files import FileLoader from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter, ImportSource, SysPathChange -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import PythonCodeAnalyzer diff --git a/tests/unit/source_code/linters/test_spark_connect.py b/tests/unit/source_code/linters/test_spark_connect.py index 6e4b0e75e1..68c9048c06 100644 --- 
a/tests/unit/source_code/linters/test_spark_connect.py +++ b/tests/unit/source_code/linters/test_spark_connect.py @@ -4,7 +4,7 @@ from databricks.labs.ucx.source_code.base import Failure, CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.linters.spark_connect import LoggingMatcher, SparkConnectPyLinter from databricks.sdk.service.compute import DataSecurityMode diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 8d24bbb924..779099576c 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -8,7 +8,7 @@ from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, DependencyProblem from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, PipCell, diff --git a/tests/unit/source_code/python/__init__.py b/tests/unit/source_code/python/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py similarity index 97% rename from tests/unit/source_code/linters/test_python_ast.py rename to tests/unit/source_code/python/test_python_ast.py index 4b8c8e7152..c80abb5ceb 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -1,8 +1,8 @@ import pytest from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr, Name # type: ignore -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.python.python_infer import InferredValue def test_extracts_root(): diff --git a/tests/unit/source_code/linters/test_python_infer.py b/tests/unit/source_code/python/test_python_infer.py similarity index 97% rename from tests/unit/source_code/linters/test_python_infer.py rename to tests/unit/source_code/python/test_python_infer.py index f838cb7154..38f3b4cb6f 100644 --- a/tests/unit/source_code/linters/test_python_infer.py +++ b/tests/unit/source_code/python/test_python_infer.py @@ -1,8 +1,8 @@ from astroid import Assign # type: ignore from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_infer import InferredValue def test_infers_empty_list(): diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index 5b9fffa34a..9e2945a10d 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.source_code.known import KnownList from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader 
from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter -from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.sources import Notebook from databricks.labs.ucx.source_code.notebooks.loaders import ( NotebookResolver, From 472c417d13b658fd0a052656c35088aed5f707f3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 20:19:50 +0200 Subject: [PATCH 13/80] move PythnCodeAnalyzer to dedicated file --- .../labs/ucx/source_code/notebooks/cells.py | 265 ++--------------- .../labs/ucx/source_code/notebooks/sources.py | 2 +- .../ucx/source_code/python/python_analyzer.py | 275 ++++++++++++++++++ .../linters/test_python_imports.py | 2 +- .../unit/source_code/notebooks/test_cells.py | 2 +- 5 files changed, 300 insertions(+), 246 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/python/python_analyzer.py diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index dc0d90870a..58e5bd763e 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -5,13 +5,10 @@ import shlex from abc import ABC, abstractmethod from ast import parse as parse_python -from collections.abc import Callable, Iterable from enum import Enum from pathlib import Path -from typing import TypeVar, cast -from astroid import Call, Const, ImportFrom, Name, NodeNG, Try # type: ignore -from astroid.exceptions import AstroidSyntaxError # type: ignore +from astroid import NodeNG # type: ignore from sqlglot import parse as parse_sql, ParseError as SQLParseError from databricks.sdk.service.workspace import Language @@ -23,14 +20,12 @@ DependencyGraphContext, InheritedContext, ) -from databricks.labs.ucx.source_code.linters.imports import ( - SysPathChange, - DbutilsPyLinter, - ImportSource, - NotebookRunCall, - UnresolvedPath, +from databricks.labs.ucx.source_code.python.python_analyzer import ( + PythonCodeAnalyzer, + MagicCommand, + MagicNode, + register_magic_command_factory, ) -from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") @@ -403,242 +398,14 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str: return "\n".join(lines) -class PythonCodeAnalyzer: - - def __init__(self, context: DependencyGraphContext, python_code: str): - self._context = context - self._python_code = python_code - - def build_graph(self) -> list[DependencyProblem]: - """Check python code for dependency-related problems. - - Returns: - A list of dependency problems; position information is relative to the python code itself. - """ - problems: list[DependencyProblem] = [] - try: - _, nodes, parse_problems = self._parse_and_extract_nodes() - problems.extend(parse_problems) - except AstroidSyntaxError as e: - logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) - problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}")) - return problems - for base_node in nodes: - for problem in self._build_graph_from_node(base_node): - # Astroid line numbers are 1-based. 
- problem = problem.replace( - start_line=base_node.node.lineno - 1, - start_col=base_node.node.col_offset, - end_line=(base_node.node.end_lineno or 1) - 1, - end_col=base_node.node.end_col_offset or 0, - ) - problems.append(problem) - return problems - - def build_inherited_context(self, child_path: Path) -> InheritedContext: - try: - tree, nodes, _ = self._parse_and_extract_nodes() - except AstroidSyntaxError: - logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) - return InheritedContext(None, False) - if len(nodes) == 0: - return InheritedContext(tree, False) - context = InheritedContext(Tree.new_module(), False) - last_line = -1 - for base_node in nodes: - # append nodes - node_line = base_node.node.lineno - nodes = tree.nodes_between(last_line + 1, node_line - 1) - context.tree.append_nodes(nodes) - globs = tree.globals_between(last_line + 1, node_line - 1) - context.tree.append_globals(globs) - last_line = node_line - # process node - child_context = self._build_inherited_context_from_node(base_node, child_path) - context = context.append(child_context, True) - if context.found: - return context - line_count = tree.line_count() - if last_line < line_count: - nodes = tree.nodes_between(last_line + 1, line_count) - context.tree.append_nodes(nodes) - globs = tree.globals_between(last_line + 1, line_count) - context.tree.append_globals(globs) - return context - - def _parse_and_extract_nodes(self) -> tuple[Tree, list[NodeBase], Iterable[DependencyProblem]]: - problems: list[DependencyProblem] = [] - tree = Tree.normalize_and_parse(self._python_code) - syspath_changes = SysPathChange.extract_from_tree(self._context.session_state, tree) - run_calls = DbutilsPyLinter.list_dbutils_notebook_run_calls(tree) - import_sources: list[ImportSource] - import_problems: list[DependencyProblem] - import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node) - problems.extend(import_problems) - magic_lines, command_problems = MagicLine.extract_from_tree(tree, DependencyProblem.from_node) - problems.extend(command_problems) - # need to evaluate things in intertwined sequence so concat and sort them - nodes: list[NodeBase] = cast(list[NodeBase], syspath_changes + run_calls + import_sources + magic_lines) - nodes = sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) - return tree, nodes, problems - - def _build_graph_from_node(self, base_node: NodeBase) -> Iterable[DependencyProblem]: - if isinstance(base_node, SysPathChange): - yield from self._mutate_path_lookup(base_node) - elif isinstance(base_node, NotebookRunCall): - yield from self._register_notebook(base_node) - elif isinstance(base_node, ImportSource): - yield from self._register_import(base_node) - elif isinstance(base_node, MagicLine): - yield from base_node.build_dependency_graph(self._context.parent) - else: - logger.warning(f"Can't build graph for node {NodeBase.__name__} of type {type(base_node).__name__}") - - def _build_inherited_context_from_node(self, base_node: NodeBase, child_path: Path) -> InheritedContext: - if isinstance(base_node, SysPathChange): - self._mutate_path_lookup(base_node) - return InheritedContext(None, False) - if isinstance(base_node, ImportSource): - # nothing to do, Astroid takes care of imports - return InheritedContext(None, False) - if isinstance(base_node, NotebookRunCall): - # nothing to do, dbutils.notebook.run uses a dedicated context - return InheritedContext(None, False) - if isinstance(base_node, MagicLine): - return 
base_node.build_inherited_context(self._context, child_path) - logger.warning(f"Can't build inherited context for node {NodeBase.__name__} of type {type(base_node).__name__}") - return InheritedContext(None, False) - - def _register_import(self, base_node: ImportSource) -> Iterable[DependencyProblem]: - prefix = "" - if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: - prefix = "." * base_node.node.level - name = base_node.name or "" - problems = self._context.parent.register_import(prefix + name) - for problem in problems: - prob = self._filter_import_problem_in_try_except(problem, base_node) - if prob is not None: - yield prob - - @classmethod - def _filter_import_problem_in_try_except( - cls, problem: DependencyProblem, base_node: ImportSource - ) -> DependencyProblem | None: - if problem.code != 'import-not-found': - return problem - # is base_node in a try-except clause ? - node = base_node.node.parent - while node and not isinstance(node, Try): - node = node.parent - if cls._is_try_except_import_error(node): - return None - return problem - - @classmethod - def _is_try_except_import_error(cls, node: Try | None) -> bool: - if not isinstance(node, Try): - return False - for handler in node.handlers: - if isinstance(handler.type, Name): - if handler.type.name == "ImportError": - return True - return False - - def _register_notebook(self, base_node: NotebookRunCall) -> Iterable[DependencyProblem]: - has_unresolved, paths = base_node.get_notebook_paths(self._context.session_state) - if has_unresolved: - yield DependencyProblem( - 'dependency-cannot-compute-value', - f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed", - ) - for path in paths: - # notebooks ran via dbutils.notebook.run do not inherit or propagate context - yield from self._context.parent.register_notebook(Path(path), False) - - def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[DependencyProblem]: - if isinstance(change, UnresolvedPath): - yield DependencyProblem( - 'sys-path-cannot-compute-value', - f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", - ) - return - change.apply_to(self._context.path_lookup) - - -T = TypeVar("T") - - -class MagicLine(NodeBase): - - @classmethod - def extract_from_tree( - cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] - ) -> tuple[list[MagicLine], list[T]]: - problems: list[T] = [] - commands: list[MagicLine] = [] - try: - nodes = tree.locate(Call, [("magic_command", Name)]) - for command in cls._make_commands_for_magic_command_call_nodes(nodes): - commands.append(command) - except Exception as e: # pylint: disable=broad-except - logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) - problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) - problems.append(problem) - return commands, problems +class RunCommand(MagicCommand): @classmethod - def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): - for node in nodes: - arg = node.args[0] - if isinstance(arg, Const): - yield MagicLine(node, arg.value) - - def __init__(self, node: NodeNG, command: bytes): - super().__init__(node) - self._command = command.decode() - - def as_magic(self) -> MagicCommand | None: - if self._command.startswith("%pip") or self._command.startswith("!pip"): - return PipCommand(self.node, self._command) - if self._command.startswith("%run"): - return 
RunCommand(self.node, self._command) + def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + if command.startswith("%run"): + return RunCommand(node, command) return None - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - magic = self.as_magic() - if magic is not None: - return magic.build_dependency_graph(parent) - problem = DependencyProblem.from_node( - code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node - ) - return [problem] - - def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: - magic = self.as_magic() - if magic is not None: - return magic.build_inherited_context(context, child_path) - return InheritedContext(None, False) - - -class MagicNode(NodeNG): - pass - - -class MagicCommand(ABC): - - def __init__(self, node: NodeNG, code: str): - self._node = node - self._code = code - - @abstractmethod - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... - - def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: - return InheritedContext(None, False) - - -class RunCommand(MagicCommand): - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: path = self.notebook_path if path is not None: @@ -676,8 +443,17 @@ def build_inherited_context(self, context: DependencyGraphContext, child_path: P return container.build_inherited_context(context.parent, child_path) +register_magic_command_factory(RunCommand.factory) + + class PipCommand(MagicCommand): + @classmethod + def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + if command.startswith("%pip") or command.startswith("!pip"): + return PipCommand(node, command) + return None + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: argv = self._split(self._code) if len(argv) == 1: @@ -716,3 +492,6 @@ def _split(cls, code: str) -> list[str]: code = code.replace("\\\n", " ") lexer = shlex.split(code, posix=True) return list(lexer) + + +register_magic_command_factory(PipCommand.factory) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index c2cffde9c4..ab7a51cf3c 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -35,6 +35,7 @@ SysPathChange, UnresolvedPath, ) +from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, @@ -43,7 +44,6 @@ NOTEBOOK_HEADER, RunCell, PythonCell, - MagicLine, RunCommand, ) from databricks.labs.ucx.source_code.path_lookup import PathLookup diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py new file mode 100644 index 0000000000..05d27709f9 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from collections.abc import Iterable, Callable +from pathlib import Path +from typing import cast, TypeVar + +from astroid import AstroidSyntaxError, Call, Const, ImportFrom, NodeNG, Try, Name # type: ignore + +from 
databricks.labs.ucx.source_code.graph import ( + DependencyGraphContext, + DependencyProblem, + InheritedContext, + DependencyGraph, +) +from databricks.labs.ucx.source_code.linters.imports import ( + SysPathChange, + DbutilsPyLinter, + ImportSource, + NotebookRunCall, + UnresolvedPath, +) +from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase + +logger = logging.getLogger(__name__) + + +class PythonCodeAnalyzer: + + def __init__(self, context: DependencyGraphContext, python_code: str): + self._context = context + self._python_code = python_code + + def build_graph(self) -> list[DependencyProblem]: + """Check python code for dependency-related problems. + + Returns: + A list of dependency problems; position information is relative to the python code itself. + """ + problems: list[DependencyProblem] = [] + try: + _, nodes, parse_problems = self._parse_and_extract_nodes() + problems.extend(parse_problems) + except AstroidSyntaxError as e: + logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) + problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}")) + return problems + for base_node in nodes: + for problem in self._build_graph_from_node(base_node): + # Astroid line numbers are 1-based. + problem = problem.replace( + start_line=base_node.node.lineno - 1, + start_col=base_node.node.col_offset, + end_line=(base_node.node.end_lineno or 1) - 1, + end_col=base_node.node.end_col_offset or 0, + ) + problems.append(problem) + return problems + + def build_inherited_context(self, child_path: Path) -> InheritedContext: + try: + tree, nodes, _ = self._parse_and_extract_nodes() + except AstroidSyntaxError: + logger.debug(f"Could not parse Python code: {self._python_code}", exc_info=True) + return InheritedContext(None, False) + if len(nodes) == 0: + return InheritedContext(tree, False) + context = InheritedContext(Tree.new_module(), False) + last_line = -1 + for base_node in nodes: + # append nodes + node_line = base_node.node.lineno + nodes = tree.nodes_between(last_line + 1, node_line - 1) + context.tree.append_nodes(nodes) + globs = tree.globals_between(last_line + 1, node_line - 1) + context.tree.append_globals(globs) + last_line = node_line + # process node + child_context = self._build_inherited_context_from_node(base_node, child_path) + context = context.append(child_context, True) + if context.found: + return context + line_count = tree.line_count() + if last_line < line_count: + nodes = tree.nodes_between(last_line + 1, line_count) + context.tree.append_nodes(nodes) + globs = tree.globals_between(last_line + 1, line_count) + context.tree.append_globals(globs) + return context + + def _build_full_tree(self, inherited_context: Tree | None) -> Tree: + full_tree = Tree.new_module() + if inherited_context is not None: + full_tree = full_tree.append_tree(inherited_context) + full_tree = full_tree.renumber(-1) + tree = Tree.normalize_and_parse(self._python_code) + return full_tree.append_tree(tree) + + def _parse_and_extract_nodes(self) -> tuple[Tree, list[NodeBase], Iterable[DependencyProblem]]: + problems: list[DependencyProblem] = [] + tree = Tree.normalize_and_parse(self._python_code) + syspath_changes = SysPathChange.extract_from_tree(self._context.session_state, tree) + run_calls = DbutilsPyLinter.list_dbutils_notebook_run_calls(tree) + import_sources: list[ImportSource] + import_problems: list[DependencyProblem] + import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node) + 
problems.extend(import_problems) + magic_lines, command_problems = MagicLine.extract_from_tree(tree, DependencyProblem.from_node) + problems.extend(command_problems) + # need to evaluate things in intertwined sequence so concat and sort them + nodes: list[NodeBase] = cast(list[NodeBase], syspath_changes + run_calls + import_sources + magic_lines) + nodes = sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) + return tree, nodes, problems + + def _build_graph_from_node(self, base_node: NodeBase) -> Iterable[DependencyProblem]: + if isinstance(base_node, SysPathChange): + yield from self._mutate_path_lookup(base_node) + elif isinstance(base_node, NotebookRunCall): + yield from self._register_notebook(base_node) + elif isinstance(base_node, ImportSource): + yield from self._register_import(base_node) + elif isinstance(base_node, MagicLine): + yield from base_node.build_dependency_graph(self._context.parent) + else: + logger.warning(f"Can't build graph for node {NodeBase.__name__} of type {type(base_node).__name__}") + + def _build_inherited_context_from_node(self, base_node: NodeBase, child_path: Path) -> InheritedContext: + if isinstance(base_node, SysPathChange): + self._mutate_path_lookup(base_node) + return InheritedContext(None, False) + if isinstance(base_node, ImportSource): + # nothing to do, Astroid takes care of imports + return InheritedContext(None, False) + if isinstance(base_node, NotebookRunCall): + # nothing to do, dbutils.notebook.run uses a dedicated context + return InheritedContext(None, False) + if isinstance(base_node, MagicLine): + return base_node.build_inherited_context(self._context, child_path) + logger.warning(f"Can't build inherited context for node {NodeBase.__name__} of type {type(base_node).__name__}") + return InheritedContext(None, False) + + def _register_import(self, base_node: ImportSource) -> Iterable[DependencyProblem]: + prefix = "" + if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: + prefix = "." * base_node.node.level + name = base_node.name or "" + problems = self._context.parent.register_import(prefix + name) + for problem in problems: + prob = self._filter_import_problem_in_try_except(problem, base_node) + if prob is not None: + yield prob + + @classmethod + def _filter_import_problem_in_try_except( + cls, problem: DependencyProblem, base_node: ImportSource + ) -> DependencyProblem | None: + if problem.code != 'import-not-found': + return problem + # is base_node in a try-except clause ? 
+ node = base_node.node.parent + while node and not isinstance(node, Try): + node = node.parent + if cls._is_try_except_import_error(node): + return None + return problem + + @classmethod + def _is_try_except_import_error(cls, node: Try | None) -> bool: + if not isinstance(node, Try): + return False + for handler in node.handlers: + if isinstance(handler.type, Name): + if handler.type.name == "ImportError": + return True + return False + + def _register_notebook(self, base_node: NotebookRunCall) -> Iterable[DependencyProblem]: + has_unresolved, paths = base_node.get_notebook_paths(self._context.session_state) + if has_unresolved: + yield DependencyProblem( + 'dependency-cannot-compute-value', + f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed", + ) + for path in paths: + # notebooks ran via dbutils.notebook.run do not inherit or propagate context + yield from self._context.parent.register_notebook(Path(path), False) + + def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[DependencyProblem]: + if isinstance(change, UnresolvedPath): + yield DependencyProblem( + 'sys-path-cannot-compute-value', + f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", + ) + return + change.apply_to(self._context.path_lookup) + + +T = TypeVar("T") + + +class MagicLine(NodeBase): + + @classmethod + def extract_from_tree( + cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] + ) -> tuple[list[MagicLine], list[T]]: + problems: list[T] = [] + commands: list[MagicLine] = [] + try: + nodes = tree.locate(Call, [("magic_command", Name)]) + for command in cls._make_commands_for_magic_command_call_nodes(nodes): + commands.append(command) + except Exception as e: # pylint: disable=broad-except + logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) + problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) + problems.append(problem) + return commands, problems + + @classmethod + def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): + for node in nodes: + arg = node.args[0] + if isinstance(arg, Const): + yield MagicLine(node, arg.value) + + def __init__(self, node: NodeNG, command: bytes): + super().__init__(node) + self._command = command.decode() + + def as_magic(self) -> MagicCommand | None: + for factory in _FACTORIES: + command = factory(self._command, self.node) + if command is not None: + return command + return None + + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + magic = self.as_magic() + if magic is not None: + return magic.build_dependency_graph(parent) + problem = DependencyProblem.from_node( + code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node + ) + return [problem] + + def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: + magic = self.as_magic() + if magic is not None: + return magic.build_inherited_context(context, child_path) + return InheritedContext(None, False) + + +class MagicNode(NodeNG): + pass + + +class MagicCommand(ABC): + + def __init__(self, node: NodeNG, code: str): + self._node = node + self._code = code + + @abstractmethod + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... 
+ + def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: + return InheritedContext(None, False) + + +_FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] + + +def register_magic_command_factory(factory: Callable[[str, NodeNG], MagicCommand | None]): + _FACTORIES.append(factory) diff --git a/tests/unit/source_code/linters/test_python_imports.py b/tests/unit/source_code/linters/test_python_imports.py index 27d44a3482..c40eedff84 100644 --- a/tests/unit/source_code/linters/test_python_imports.py +++ b/tests/unit/source_code/linters/test_python_imports.py @@ -9,8 +9,8 @@ from databricks.labs.ucx.source_code.linters.files import FileLoader from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter, ImportSource, SysPathChange +from databricks.labs.ucx.source_code.python.python_analyzer import PythonCodeAnalyzer from databricks.labs.ucx.source_code.python.python_ast import Tree -from databricks.labs.ucx.source_code.notebooks.cells import PythonCodeAnalyzer def test_linter_returns_empty_list_of_dbutils_notebook_run_calls(): diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 779099576c..25f794f2eb 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -8,6 +8,7 @@ from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, DependencyProblem from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver +from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, @@ -16,7 +17,6 @@ PipCommand, PythonCodeAnalyzer, ) -from databricks.labs.ucx.source_code.notebooks.cells import MagicLine from databricks.labs.ucx.source_code.notebooks.loaders import ( NotebookResolver, NotebookLoader, From 3e256be0a48c78f6aaba80e8df5f84df2f359a6f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 11:07:42 +0200 Subject: [PATCH 14/80] fix merge issues --- src/databricks/labs/ucx/source_code/linters/dfsa.py | 4 ++-- src/databricks/labs/ucx/source_code/linters/pyspark.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index a1c131c7f8..a95ffeae97 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -14,8 +14,8 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor -from databricks.labs.ucx.source_code.linters.python_infer import InferredValue +from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor +from databricks.labs.ucx.source_code.python.python_infer import InferredValue logger = logging.getLogger(__name__) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index feb8debe1e..a537757add 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,6 +13,7 @@ CurrentSessionState, PythonLinter, ) +from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS from 
databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper From d1d251f7be6410ea8b647fa71bc4af3c2c29949f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:11:47 +0200 Subject: [PATCH 15/80] merge from stale branch --- src/databricks/labs/ucx/source_code/base.py | 27 +++++++++++++++++++ .../labs/ucx/source_code/linters/dfsa.py | 9 +------ tests/unit/source_code/test_dfsa_crawler.py | 14 ++++++++++ 3 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 tests/unit/source_code/test_dfsa_crawler.py diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index e00bc86847..2c93ea8d13 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -334,3 +334,30 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool: logger.warning(f"Could not read file {path}") return False return file_header == magic_header + + +@dataclass +class DFSA: + """A DFSA is a record describing a Direct File System Access""" + + UNKNOWN = "unknown" + + source_type: str + source_id: str + path: str + is_read: bool + is_write: bool + + @property + def key(self) -> str: + return f"{self.source_type}.{self.source_id}.{self.path}".lower() # TODO for now + + @property + def safe_sql_key(self) -> str: + return escape_sql_identifier(self.key) + + def __hash__(self) -> int: + return hash(self.key) + + def __eq__(self, other) -> bool: + return isinstance(other, DFSA) and self.key == other.key diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index a95ffeae97..4fe867b37a 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -12,7 +12,7 @@ Deprecation, CurrentSessionState, PythonLinter, - SqlLinter, + SqlLinter, DFSA, ) from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor from databricks.labs.ucx.source_code.python.python_infer import InferredValue @@ -56,13 +56,6 @@ def _matches_allowed_root(self, value: str): ] -@dataclass -class DFSA: - """A DFSA is a record describing a Direct File System Access""" - - path: str - - @dataclass class DFSANode: dfsa: DFSA diff --git a/tests/unit/source_code/test_dfsa_crawler.py b/tests/unit/source_code/test_dfsa_crawler.py new file mode 100644 index 0000000000..e7095eb0e1 --- /dev/null +++ b/tests/unit/source_code/test_dfsa_crawler.py @@ -0,0 +1,14 @@ +from databricks.labs.lsql.backends import MockBackend + +from databricks.labs.ucx.source_code.base import DFSA + + +def test_crawler_appends_dfsas(): + backend = MockBackend() + crawler = DfsaCrawler(backend, "schema") + for path in ("a", "b", "c"): + dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) + crawler.append(dfsa) + rows = backend.rows_written_for(crawler.full_name, "append") + assert len(rows) == 3 + From 13ea1e6d084bdf8dd4891885236563c46c58d46f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:00:21 +0200 Subject: [PATCH 16/80] more tests --- .../functional/file-access/create_cloud_files.sql | 13 +++++++++++++ .../functional/file-access/create_location.py | 13 +++++++++++++ .../functional/file-access/create_location.sql | 2 ++ .../functional/file-access/select_format.sql | 2 ++ 
.../functional/file-access/select_read_files.sql | 2 ++ .../file-access/spark_read_format_load.py | 3 +++ tests/unit/source_code/test_dfsa.py | 14 ++++++++++++++ 7 files changed, 49 insertions(+) create mode 100644 tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/create_location.py create mode 100644 tests/unit/source_code/samples/functional/file-access/create_location.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/select_format.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/select_read_files.sql create mode 100644 tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py create mode 100644 tests/unit/source_code/test_dfsa.py diff --git a/tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql b/tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql new file mode 100644 index 0000000000..0f19edf368 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/create_cloud_files.sql @@ -0,0 +1,13 @@ +-- Databricks notebook source +CREATE OR REFRESH STREAMING LIVE TABLE pcmd_stream_bronze +COMMENT "PCMD Stream - Bronze" +AS SELECT * + FROM cloud_files( + "s3a://db-gtm-industry-solutions/data/CME/telco/PCMD", + "json", + map( + "header", "false", + "mergeSchema", "true", + "cloudFiles.inferColumnTypes", "true" + ) + ) diff --git a/tests/unit/source_code/samples/functional/file-access/create_location.py b/tests/unit/source_code/samples/functional/file-access/create_location.py new file mode 100644 index 0000000000..940640f7ae --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/create_location.py @@ -0,0 +1,13 @@ +# Databricks notebook source + +a = 12 + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_550 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_550/' diff --git a/tests/unit/source_code/samples/functional/file-access/create_location.sql b/tests/unit/source_code/samples/functional/file-access/create_location.sql new file mode 100644 index 0000000000..4f90fd669d --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/create_location.sql @@ -0,0 +1,2 @@ +-- Databricks notebook source +CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' diff --git a/tests/unit/source_code/samples/functional/file-access/select_format.sql b/tests/unit/source_code/samples/functional/file-access/select_format.sql new file mode 100644 index 0000000000..76d91894f2 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/select_format.sql @@ -0,0 +1,2 @@ +-- Databricks notebook source +SELECT * FROM parquet.`hdfs://examples/src/main/resources/users.parquet` diff --git a/tests/unit/source_code/samples/functional/file-access/select_read_files.sql b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql new file mode 100644 index 0000000000..e326eec5f5 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql @@ -0,0 +1,2 @@ +-- Databricks notebook source +SELECT * FROM read_files("s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/file.csv") LIMIT 10 diff --git 
a/tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py b/tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py new file mode 100644 index 0000000000..16e0c9b5a7 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/spark_read_format_load.py @@ -0,0 +1,3 @@ +# Databricks notebook source +# ucx[direct-filesystem-access:+1:0:+1:61] The use of direct filesystem references is deprecated: s3a://prefix/some_file.csv +spark.read.format("delta").load("s3a://prefix/some_file.csv") diff --git a/tests/unit/source_code/test_dfsa.py b/tests/unit/source_code/test_dfsa.py new file mode 100644 index 0000000000..18caa46ff1 --- /dev/null +++ b/tests/unit/source_code/test_dfsa.py @@ -0,0 +1,14 @@ +from databricks.labs.lsql.backends import MockBackend + +from databricks.labs.ucx.source_code.linters.dfsa import DFSA + + +def test_crawler_appends_dfsas(): + backend = MockBackend() + crawler = DfsaCrawler(backend, "schema") + for path in ("a", "b", "c"): + dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) + crawler.append(dfsa) + rows = backend.rows_written_for(crawler.full_name, "append") + assert len(rows) == 3 + From 93d194f0aa3a040f3a6cd877a7d38d017e9dff6d Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 2 Sep 2024 18:23:38 +0200 Subject: [PATCH 17/80] merge from stale branch --- src/databricks/labs/ucx/source_code/base.py | 2 ++ .../labs/ucx/source_code/dfsa_crawler.py | 19 +++++++++++++++++++ .../labs/ucx/source_code/linters/dfsa.py | 3 ++- tests/unit/source_code/test_dfsa.py | 14 -------------- tests/unit/source_code/test_dfsa_crawler.py | 2 +- 5 files changed, 24 insertions(+), 16 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/dfsa_crawler.py delete mode 100644 tests/unit/source_code/test_dfsa.py diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 2c93ea8d13..775a72a1c6 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -15,6 +15,8 @@ from databricks.sdk.service.workspace import Language from databricks.labs.blueprint.paths import WorkspacePath + +from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.source_code.python.python_ast import Tree # Code mapping between LSP, PyLint, and our own diagnostics: diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py new file mode 100644 index 0000000000..8a9a1d879c --- /dev/null +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -0,0 +1,19 @@ +from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.source_code.base import DFSA +from databricks.labs.lsql.backends import SqlBackend + + +class DfsaCrawler(CrawlerBase): + + def __init__(self, backend: SqlBackend, schema: str): + """ + Initializes a DFSACrawler instance. + + Args: + backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) + schema: The schema name for the inventory persistence. 
+ """ + super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DFSA) + + def append(self, dfsa: DFSA): + self._append_records([dfsa]) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 4fe867b37a..19e3f5fd70 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -12,7 +12,8 @@ Deprecation, CurrentSessionState, PythonLinter, - SqlLinter, DFSA, + SqlLinter, + DFSA, ) from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor from databricks.labs.ucx.source_code.python.python_infer import InferredValue diff --git a/tests/unit/source_code/test_dfsa.py b/tests/unit/source_code/test_dfsa.py deleted file mode 100644 index 18caa46ff1..0000000000 --- a/tests/unit/source_code/test_dfsa.py +++ /dev/null @@ -1,14 +0,0 @@ -from databricks.labs.lsql.backends import MockBackend - -from databricks.labs.ucx.source_code.linters.dfsa import DFSA - - -def test_crawler_appends_dfsas(): - backend = MockBackend() - crawler = DfsaCrawler(backend, "schema") - for path in ("a", "b", "c"): - dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) - crawler.append(dfsa) - rows = backend.rows_written_for(crawler.full_name, "append") - assert len(rows) == 3 - diff --git a/tests/unit/source_code/test_dfsa_crawler.py b/tests/unit/source_code/test_dfsa_crawler.py index e7095eb0e1..93652c4aee 100644 --- a/tests/unit/source_code/test_dfsa_crawler.py +++ b/tests/unit/source_code/test_dfsa_crawler.py @@ -1,6 +1,7 @@ from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.source_code.base import DFSA +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler def test_crawler_appends_dfsas(): @@ -11,4 +12,3 @@ def test_crawler_appends_dfsas(): crawler.append(dfsa) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 - From f4bc0b888ec50a6f5bfb80bd087d8740c5fbd1da Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 10:55:55 +0200 Subject: [PATCH 18/80] merge from stale branch --- .../labs/ucx/source_code/linters/dfsa.py | 21 ++- .../labs/ucx/source_code/linters/pyspark.py | 158 ++++++++++++------ 2 files changed, 116 insertions(+), 63 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 19e3f5fd70..afc0fbc5bf 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -69,11 +69,11 @@ class _DetectDfsaVisitor(TreeVisitor): against a list of known deprecated paths. 
""" - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates: bool) -> None: self._session_state = session_state self._dfsa_nodes: list[DFSANode] = [] self._reported_locations: set[tuple[int, int]] = set() - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates def visit_call(self, node: Call): for arg in node.args: @@ -98,11 +98,16 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if self._already_reported(source_node, inferred): return # avoid duplicate advices that are reported by SparkSqlPyLinter - if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: + if self._prevent_spark_duplicates and Tree(source_node).is_from_module("spark"): return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): - self._dfsa_nodes.append(DFSANode(DFSA(value), source_node)) + for pattern in DFSA_PATTERNS: + if not pattern.matches(value): + continue + # since we're normally filtering out spark calls, we're dealing with dfsas we know little about + # notable we don't know is_read or is_write + dfsa = DFSA(source_type=DFSA.UNKNOWN, source_id=DFSA.UNKNOWN, path=value, is_read=True, is_write=False) + self._dfsa_nodes.append(DFSANode(dfsa, source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -116,9 +121,9 @@ def dfsa_nodes(self): class DfsaPyLinter(PythonLinter): - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates=True): self._session_state = session_state - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates @staticmethod def name() -> str: @@ -131,7 +136,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDfsaVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDfsaVisitor(self._session_state, self._prevent_spark_duplicates) visitor.visit(tree.node) for dfsa_node in visitor.dfsa_nodes: advisory = Deprecation.from_node( diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index a537757add..394b72164f 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator from dataclasses import dataclass +from typing import TypeVar from astroid import Attribute, Call, Const, InferenceError, NodeNG # type: ignore from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex @@ -12,9 +13,10 @@ Fixer, CurrentSessionState, PythonLinter, + DFSA, ) -from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue +from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS, DFSANode from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @@ -31,6 +33,8 @@ class Matcher(ABC): table_arg_name: str | None = None 
call_context: dict[str, set[str]] | None = None session_state: CurrentSessionState | None = None + is_read: bool | None = None + is_write: bool | None = None def matches(self, node: NodeNG): return ( @@ -123,6 +127,8 @@ def lint( self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: Call ) -> Iterator[Advice]: table_arg = self._get_table_arg(node) + if table_arg is None: + return table_name = table_arg.as_string().strip("'").strip('"') for inferred in InferredValue.infer_from_node(table_arg, session_state): if not inferred.is_inferred(): @@ -181,6 +187,9 @@ def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Cal return +T = TypeVar("T") + + @dataclass class DirectFilesystemAccessMatcher(Matcher): @@ -195,18 +204,34 @@ def matches(self, node: NodeNG): def lint( self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state: CurrentSessionState, node: NodeNG ) -> Iterator[Advice]: - table_arg = self._get_table_arg(node) - for inferred in InferredValue.infer_from_node(table_arg): + + for dfsa_node in self._for_table_arg(node): + yield Deprecation.from_node( + code='direct-filesystem-access', + message=f"The use of direct filesystem references is deprecated: {dfsa_node.dfsa.path}", + node=dfsa_node.node, + ) + + def _for_table_arg(self, node: NodeNG) -> Iterable[DFSANode]: + if not isinstance(node, Call): + return + table_arg_node = self._get_table_arg(node) + for inferred in InferredValue.infer_from_node(table_arg_node): if not inferred.is_inferred(): - logger.debug(f"Could not infer value of {table_arg.as_string()}") continue - value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): - yield Deprecation.from_node( - code='direct-filesystem-access', - message=f"The use of direct filesystem references is deprecated: {value}", - node=node, + table_arg = inferred.as_string() + if not table_arg: + continue + if any(pattern.matches(table_arg) for pattern in DFSA_PATTERNS): + dfsa = DFSA( + source_type=DFSA.UNKNOWN, + source_id=DFSA.UNKNOWN, + path=table_arg, + is_read=self.is_read or False, + is_write=self.is_write or False, ) + yield DFSANode(dfsa, node) + continue def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Call) -> None: # No transformations to apply @@ -215,12 +240,64 @@ def apply(self, from_table: FromTableSqlLinter, index: MigrationIndex, node: Cal class SparkMatchers: - def __init__(self): + def __init__(self, dfsa_matchers_only: bool): + + spark_dfsa_matchers: list[Matcher] = [ + DirectFilesystemAccessMatcher( + "ls", 1, 1, 0, call_context={"ls": {"dbutils.fs.ls"}}, is_read=True, is_write=False + ), + DirectFilesystemAccessMatcher( + "cp", 1, 2, 0, call_context={"cp": {"dbutils.fs.cp"}}, is_read=True, is_write=True + ), + DirectFilesystemAccessMatcher("rm", 1, 1, 0, call_context={"rm": {"dbutils.fs.rm"}}, is_write=True), + DirectFilesystemAccessMatcher( + "head", 1, 1, 0, call_context={"head": {"dbutils.fs.head"}}, is_read=True, is_write=False + ), + DirectFilesystemAccessMatcher( + "put", 1, 2, 0, call_context={"put": {"dbutils.fs.put"}}, is_read=False, is_write=True + ), + DirectFilesystemAccessMatcher( + "mkdirs", 1, 1, 0, call_context={"mkdirs": {"dbutils.fs.mkdirs"}}, is_read=False, is_write=True + ), + DirectFilesystemAccessMatcher( + "mv", 1, 2, 0, call_context={"mv": {"dbutils.fs.mv"}}, is_read=False, is_write=True + ), + DirectFilesystemAccessMatcher("text", 1, 3, 0, is_read=True, is_write=False), + 
DirectFilesystemAccessMatcher("csv", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("json", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("orc", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("parquet", 1, 1000, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("save", 0, 1000, 0, "path", is_read=False, is_write=True), + DirectFilesystemAccessMatcher("load", 0, 1000, 0, "path", is_read=True, is_write=False), + DirectFilesystemAccessMatcher( + "option", 1, 1000, 1, is_read=True, is_write=False + ), # Only .option("path", "xxx://bucket/path") will hit + DirectFilesystemAccessMatcher("addFile", 1, 3, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("binaryFiles", 1, 2, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("binaryRecords", 1, 2, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("dump_profiles", 1, 1, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("hadoopFile", 1, 8, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("newAPIHadoopFile", 1, 8, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("pickleFile", 1, 3, 0, is_read=True, is_write=False), + DirectFilesystemAccessMatcher("saveAsHadoopFile", 1, 8, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsNewAPIHadoopFile", 1, 7, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsPickleFile", 1, 2, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsSequenceFile", 1, 2, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("saveAsTextFile", 1, 2, 0, is_read=False, is_write=True), + DirectFilesystemAccessMatcher("load_from_path", 1, 1, 0, is_read=True, is_write=False), + ] + if dfsa_matchers_only: + self._make_matchers(spark_dfsa_matchers) + return + # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.SparkSession.html - spark_session_matchers = [QueryMatcher("sql", 1, 1000, 0, "sqlQuery"), TableNameMatcher("table", 1, 1, 0)] + spark_session_matchers: list[Matcher] = [ + QueryMatcher("sql", 1, 1000, 0, "sqlQuery"), + TableNameMatcher("table", 1, 1, 0), + ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Catalog.html - spark_catalog_matchers = [ + spark_catalog_matchers: list[Matcher] = [ TableNameMatcher("cacheTable", 1, 2, 0, "tableName"), TableNameMatcher("createTable", 1, 1000, 0, "tableName"), TableNameMatcher("createExternalTable", 1, 1000, 0, "tableName"), @@ -235,7 +312,7 @@ def __init__(self): ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html - spark_dataframe_matchers = [ + spark_dataframe_matchers: list[Matcher] = [ TableNameMatcher("writeTo", 1, 1, 0), ] @@ -249,12 +326,12 @@ def __init__(self): # nothing to migrate in Window, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Window.html # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html - spark_dataframereader_matchers = [ + spark_dataframereader_matchers: list[Matcher] = [ TableNameMatcher("table", 1, 1, 0), # TODO good example of collision, see spark_session_calls ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html - spark_dataframewriter_matchers = [ + spark_dataframewriter_matchers: list[Matcher] 
= [ TableNameMatcher("insertInto", 1, 2, 0, "tableName"), # TODO jdbc: could the url be a databricks url, raise warning ? TableNameMatcher("saveAsTable", 1, 4, 0, "name"), @@ -263,48 +340,20 @@ def __init__(self): # nothing to migrate in DataFrameWriterV2, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriterV2.html # nothing to migrate in UDFRegistration, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.UDFRegistration.html - direct_fs_access_matchers = [ - DirectFilesystemAccessMatcher("ls", 1, 1, 0, call_context={"ls": {"dbutils.fs.ls"}}), - DirectFilesystemAccessMatcher("cp", 1, 2, 0, call_context={"cp": {"dbutils.fs.cp"}}), - DirectFilesystemAccessMatcher("rm", 1, 1, 0, call_context={"rm": {"dbutils.fs.rm"}}), - DirectFilesystemAccessMatcher("head", 1, 1, 0, call_context={"head": {"dbutils.fs.head"}}), - DirectFilesystemAccessMatcher("put", 1, 2, 0, call_context={"put": {"dbutils.fs.put"}}), - DirectFilesystemAccessMatcher("mkdirs", 1, 1, 0, call_context={"mkdirs": {"dbutils.fs.mkdirs"}}), - DirectFilesystemAccessMatcher("mv", 1, 2, 0, call_context={"mv": {"dbutils.fs.mv"}}), - DirectFilesystemAccessMatcher("text", 1, 3, 0), - DirectFilesystemAccessMatcher("csv", 1, 1000, 0), - DirectFilesystemAccessMatcher("json", 1, 1000, 0), - DirectFilesystemAccessMatcher("orc", 1, 1000, 0), - DirectFilesystemAccessMatcher("parquet", 1, 1000, 0), - DirectFilesystemAccessMatcher("save", 0, 1000, 0, "path"), - DirectFilesystemAccessMatcher("load", 0, 1000, 0, "path"), - DirectFilesystemAccessMatcher("option", 1, 1000, 1), # Only .option("path", "xxx://bucket/path") will hit - DirectFilesystemAccessMatcher("addFile", 1, 3, 0), - DirectFilesystemAccessMatcher("binaryFiles", 1, 2, 0), - DirectFilesystemAccessMatcher("binaryRecords", 1, 2, 0), - DirectFilesystemAccessMatcher("dump_profiles", 1, 1, 0), - DirectFilesystemAccessMatcher("hadoopFile", 1, 8, 0), - DirectFilesystemAccessMatcher("newAPIHadoopFile", 1, 8, 0), - DirectFilesystemAccessMatcher("pickleFile", 1, 3, 0), - DirectFilesystemAccessMatcher("saveAsHadoopFile", 1, 8, 0), - DirectFilesystemAccessMatcher("saveAsNewAPIHadoopFile", 1, 7, 0), - DirectFilesystemAccessMatcher("saveAsPickleFile", 1, 2, 0), - DirectFilesystemAccessMatcher("saveAsSequenceFile", 1, 2, 0), - DirectFilesystemAccessMatcher("saveAsTextFile", 1, 2, 0), - DirectFilesystemAccessMatcher("load_from_path", 1, 1, 0), - ] - # nothing to migrate in UserDefinedFunction, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.UserDefinedFunction.html # nothing to migrate in UserDefinedTableFunction, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.UserDefinedTableFunction.html - self._matchers = {} - for matcher in ( - spark_session_matchers + self._make_matchers( + spark_dfsa_matchers + + spark_session_matchers + spark_catalog_matchers + spark_dataframe_matchers + spark_dataframereader_matchers + spark_dataframewriter_matchers - + direct_fs_access_matchers - ): + ) + + def _make_matchers(self, matchers: list[Matcher]): + self._matchers = {} + for matcher in matchers: self._matchers[matcher.method_name] = matcher @property @@ -314,12 +363,11 @@ def matchers(self): class SparkSqlPyLinter(PythonLinter, Fixer): - _spark_matchers = SparkMatchers() - def __init__(self, from_table: FromTableSqlLinter, index: MigrationIndex, session_state): self._from_table = from_table self._index = index self._session_state = session_state 
+ self._spark_matchers = SparkMatchers(False).matchers def name(self) -> str: # this is the same fixer, just in a different language context @@ -349,7 +397,7 @@ def _find_matcher(self, node: NodeNG): return None if not isinstance(node.func, Attribute): return None - matcher = self._spark_matchers.matchers.get(node.func.attrname, None) + matcher = self._spark_matchers.get(node.func.attrname, None) if matcher is None: return None return matcher if matcher.matches(node) else None From 7ae9adc76745b000a2c01bc9efcd9a9c781c6b84 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 11:40:47 +0200 Subject: [PATCH 19/80] fix failing tests --- src/databricks/labs/ucx/source_code/linters/dfsa.py | 2 +- tests/unit/source_code/linters/test_dfsa.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index afc0fbc5bf..4a54818456 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -52,7 +52,7 @@ def _matches_allowed_root(self, value: str): DFSAPattern("abfs:/", []), DFSAPattern("abfss:/", []), DFSAPattern("hdfs:/", []), - DFSAPattern("/mnt/", []), + # "/mnt/" is detected by the below pattern, RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), ] diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_dfsa.py index 11cb5a1d08..db8a4ac096 100644 --- a/tests/unit/source_code/linters/test_dfsa.py +++ b/tests/unit/source_code/linters/test_dfsa.py @@ -33,7 +33,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DfsaPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -63,7 +63,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_dfsa_usage_linter(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DfsaPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = linter.lint(code) count = 0 for advice in advices: From fdf7a39991a2f92ae3a67814a8041e5b81fc06fd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 11:44:10 +0200 Subject: [PATCH 20/80] rename ctor arg --- src/databricks/labs/ucx/source_code/jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index feab7bc1d0..dc09d2e12c 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -419,13 +419,13 @@ class LintingWalker(DependencyGraphWalker[LocatedAdvice]): def __init__( self, graph: DependencyGraph, - linted_paths: set[Path], + walked_paths: set[Path], path_lookup: PathLookup, key: str, session_state: CurrentSessionState, migration_index: MigrationIndex, ): - super().__init__(graph, linted_paths, path_lookup) + super().__init__(graph, walked_paths, path_lookup) self._key = key self._session_state = session_state self._migration_index = migration_index From 054f847f5c9e44e3d37106bd51e69f8c6b28e572 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:36:32 +0200 Subject: [PATCH 21/80] fix infinite recursion with unknown ASTs --- .../labs/ucx/source_code/python/python_ast.py | 70 ++++++++++++++----- 1 
file changed, 52 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 7ce7228dd9..57dc87d21e 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -141,7 +141,7 @@ def append_tree(self, tree: Tree) -> Tree: # because each node points to the correct parent (practically, the tree is now only a list of statements) return tree - def append_globals(self, globs: dict[str, list[NodeNG]]) -> None: + def append_globals(self, globs: dict[str, list[Expr]]) -> None: if not isinstance(self.node, Module): raise NotImplementedError(f"Can't append globals to {type(self.node).__name__}") self_module: Module = cast(Module, self.node) @@ -161,28 +161,56 @@ def append_nodes(self, nodes: list[NodeNG]) -> None: self_module.body.append(node) def is_from_module(self, module_name: str) -> bool: - # if this is the call's root node, check it against the required module - if isinstance(self._node, Name): - if self._node.name == module_name: - return True - root = self.root - if not isinstance(root, Module): - return False - for value in root.globals.get(self._node.name, []): - if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): - continue - if Tree(value.parent.value).is_from_module(module_name): - return True + return self._is_from_module(module_name, set()) + + def _is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if self._node in visited: + logger.debug(f"Recursion encountered while traversing node {self._node.as_string()}") return False - # walk up intermediate calls such as spark.range(...) + visited.add(self._node) + return self._node_is_from_module(module_name, visited) + + def _node_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if isinstance(self._node, Name): + return self._name_is_from_module(module_name, visited) if isinstance(self._node, Call): - return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) + return self._call_is_from_module(module_name, visited) if isinstance(self._node, Attribute): - return Tree(self._node.expr).is_from_module(module_name) + return self._attribute_is_from_module(module_name, visited) if isinstance(self._node, Const): - return Tree(self._node.parent).is_from_module(module_name) + return self._const_is_from_module(module_name, visited) + return False + + def _name_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Name) + # if this is the call's root node, check it against the required module + if self._node.name == module_name: + return True + root = self.root + if not isinstance(root, Module): + return False + for value in root.globals.get(self._node.name, []): + if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): + continue + if _LocalTree(value.parent.value).is_from_module_visited(module_name, visited): + return True return False + def _call_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Call) + # walk up intermediate calls such as spark.range(...) 
+ return isinstance(self._node.func, Attribute) and _LocalTree(self._node.func.expr).is_from_module_visited( + module_name, visited + ) + + def _attribute_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Attribute) + return _LocalTree(self._node.expr).is_from_module_visited(module_name, visited) + + def _const_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Const) + return _LocalTree(self._node.parent).is_from_module_visited(module_name, visited) + def has_global(self, name: str) -> bool: if not isinstance(self.node, Module): return False @@ -230,7 +258,7 @@ def renumber(self, start: int) -> Tree: assert start != 0 if not isinstance(self.node, Module): raise NotImplementedError(f"Can't renumber {type(self.node).__name__}") - root: Module = self.node + root: Module = cast(Module, self.node) # for now renumber in place to avoid the complexity of rebuilding the tree with clones def renumber_node(node: NodeNG, offset: int) -> None: @@ -249,6 +277,12 @@ def renumber_node(node: NodeNG, offset: int) -> None: return self +class _LocalTree(Tree): + + def is_from_module_visited(self, name: str, visited_nodes: set[NodeNG]) -> bool: + return self._is_from_module(name, visited_nodes) + + class TreeHelper(ABC): @classmethod From 9c4a5bfd8c9156eb7f61c04ba77de6d29d29d3f7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:36:32 +0200 Subject: [PATCH 22/80] fix infinite recursion with unknown ASTs --- .../labs/ucx/source_code/python/python_ast.py | 70 ++++++++++++++----- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 7ce7228dd9..57dc87d21e 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -141,7 +141,7 @@ def append_tree(self, tree: Tree) -> Tree: # because each node points to the correct parent (practically, the tree is now only a list of statements) return tree - def append_globals(self, globs: dict[str, list[NodeNG]]) -> None: + def append_globals(self, globs: dict[str, list[Expr]]) -> None: if not isinstance(self.node, Module): raise NotImplementedError(f"Can't append globals to {type(self.node).__name__}") self_module: Module = cast(Module, self.node) @@ -161,28 +161,56 @@ def append_nodes(self, nodes: list[NodeNG]) -> None: self_module.body.append(node) def is_from_module(self, module_name: str) -> bool: - # if this is the call's root node, check it against the required module - if isinstance(self._node, Name): - if self._node.name == module_name: - return True - root = self.root - if not isinstance(root, Module): - return False - for value in root.globals.get(self._node.name, []): - if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): - continue - if Tree(value.parent.value).is_from_module(module_name): - return True + return self._is_from_module(module_name, set()) + + def _is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if self._node in visited: + logger.debug(f"Recursion encountered while traversing node {self._node.as_string()}") return False - # walk up intermediate calls such as spark.range(...) 
+ visited.add(self._node) + return self._node_is_from_module(module_name, visited) + + def _node_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + if isinstance(self._node, Name): + return self._name_is_from_module(module_name, visited) if isinstance(self._node, Call): - return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) + return self._call_is_from_module(module_name, visited) if isinstance(self._node, Attribute): - return Tree(self._node.expr).is_from_module(module_name) + return self._attribute_is_from_module(module_name, visited) if isinstance(self._node, Const): - return Tree(self._node.parent).is_from_module(module_name) + return self._const_is_from_module(module_name, visited) + return False + + def _name_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Name) + # if this is the call's root node, check it against the required module + if self._node.name == module_name: + return True + root = self.root + if not isinstance(root, Module): + return False + for value in root.globals.get(self._node.name, []): + if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): + continue + if _LocalTree(value.parent.value).is_from_module_visited(module_name, visited): + return True return False + def _call_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Call) + # walk up intermediate calls such as spark.range(...) + return isinstance(self._node.func, Attribute) and _LocalTree(self._node.func.expr).is_from_module_visited( + module_name, visited + ) + + def _attribute_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Attribute) + return _LocalTree(self._node.expr).is_from_module_visited(module_name, visited) + + def _const_is_from_module(self, module_name: str, visited: set[NodeNG]) -> bool: + assert isinstance(self._node, Const) + return _LocalTree(self._node.parent).is_from_module_visited(module_name, visited) + def has_global(self, name: str) -> bool: if not isinstance(self.node, Module): return False @@ -230,7 +258,7 @@ def renumber(self, start: int) -> Tree: assert start != 0 if not isinstance(self.node, Module): raise NotImplementedError(f"Can't renumber {type(self.node).__name__}") - root: Module = self.node + root: Module = cast(Module, self.node) # for now renumber in place to avoid the complexity of rebuilding the tree with clones def renumber_node(node: NodeNG, offset: int) -> None: @@ -249,6 +277,12 @@ def renumber_node(node: NodeNG, offset: int) -> None: return self +class _LocalTree(Tree): + + def is_from_module_visited(self, name: str, visited_nodes: set[NodeNG]) -> bool: + return self._is_from_module(name, visited_nodes) + + class TreeHelper(ABC): @classmethod From 688985270374487279acdfdf61b38188fe02feef Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:57:22 +0200 Subject: [PATCH 23/80] make register_magic_command a decorator --- .../labs/ucx/source_code/notebooks/cells.py | 17 +++++++---------- .../ucx/source_code/python/python_analyzer.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 58e5bd763e..bd38b35333 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -24,7 +24,7 @@ 
PythonCodeAnalyzer, MagicCommand, MagicNode, - register_magic_command_factory, + magic_command_factory, ) # use a specific logger for sqlglot warnings so we can disable them selectively @@ -400,8 +400,9 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str: class RunCommand(MagicCommand): - @classmethod - def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + @staticmethod + @magic_command_factory + def factory(command: str, node: NodeNG) -> MagicCommand | None: if command.startswith("%run"): return RunCommand(node, command) return None @@ -443,13 +444,11 @@ def build_inherited_context(self, context: DependencyGraphContext, child_path: P return container.build_inherited_context(context.parent, child_path) -register_magic_command_factory(RunCommand.factory) - - class PipCommand(MagicCommand): - @classmethod - def factory(cls, command: str, node: NodeNG) -> MagicCommand | None: + @staticmethod + @magic_command_factory + def factory(command: str, node: NodeNG) -> MagicCommand | None: if command.startswith("%pip") or command.startswith("!pip"): return PipCommand(node, command) return None @@ -493,5 +492,3 @@ def _split(cls, code: str) -> list[str]: lexer = shlex.split(code, posix=True) return list(lexer) - -register_magic_command_factory(PipCommand.factory) diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index 05d27709f9..78e6de34d3 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -271,5 +271,11 @@ def build_inherited_context(self, _context: DependencyGraphContext, _child_path: _FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] -def register_magic_command_factory(factory: Callable[[str, NodeNG], MagicCommand | None]): - _FACTORIES.append(factory) +def magic_command_factory(func: Callable[[str, NodeNG], MagicCommand | None]): + _FACTORIES.append(func) + + def inner(command: str, node: NodeNG) -> MagicCommand | None: + return func(command, node) + + return inner + From 779219997cf52268ebf2f772f8bfd458bed949e3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 12:57:55 +0200 Subject: [PATCH 24/80] formatting --- src/databricks/labs/ucx/source_code/notebooks/cells.py | 1 - src/databricks/labs/ucx/source_code/python/python_analyzer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index bd38b35333..a2b994972a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -491,4 +491,3 @@ def _split(cls, code: str) -> list[str]: code = code.replace("\\\n", " ") lexer = shlex.split(code, posix=True) return list(lexer) - diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index 78e6de34d3..d2a2454b2f 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -278,4 +278,3 @@ def inner(command: str, node: NodeNG) -> MagicCommand | None: return func(command, node) return inner - From b4ba5ae3e425bf1606853f22a756bf98c441c62a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 17:04:12 +0200 Subject: [PATCH 25/80] integrate with WorkflowLinter --- 
.../labs/ucx/source_code/dfsa_crawler.py | 6 +- src/databricks/labs/ucx/source_code/jobs.py | 88 ++++++++++++++++++- .../labs/ucx/source_code/linters/dfsa.py | 82 ++++++++++++++--- tests/unit/source_code/conftest.py | 8 ++ .../functional/file-access/create_location.py | 2 + tests/unit/source_code/test_dfsa_crawler.py | 7 +- tests/unit/source_code/test_functional.py | 2 +- tests/unit/source_code/test_jobs.py | 6 +- 8 files changed, 177 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py index 8a9a1d879c..fa3ce36f8d 100644 --- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -1,3 +1,5 @@ +from collections.abc import Sequence + from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.source_code.base import DFSA from databricks.labs.lsql.backends import SqlBackend @@ -15,5 +17,5 @@ def __init__(self, backend: SqlBackend, schema: str): """ super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DFSA) - def append(self, dfsa: DFSA): - self._append_records([dfsa]) + def append(self, dfsas: Sequence[DFSA]): + self._append_records(dfsas) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index dc09d2e12c..b8c534adb6 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -16,11 +16,20 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import compute, jobs +from databricks.sdk.service.workspace import Language from databricks.labs.ucx.assessment.crawlers import runtime_version_tuple from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.cached_workspace_path import WorkspaceCache -from databricks.labs.ucx.source_code.base import CurrentSessionState, LocatedAdvice +from databricks.labs.ucx.source_code.base import ( + CurrentSessionState, + LocatedAdvice, + DFSA, + is_a_notebook, + file_language, + guess_encoding, +) +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler from databricks.labs.ucx.source_code.graph import ( Dependency, DependencyGraph, @@ -31,8 +40,9 @@ DependencyGraphWalker, ) from databricks.labs.ucx.source_code.linters.context import LinterContext +from databricks.labs.ucx.source_code.linters.dfsa import DfsaSqlLinter, DfsaPyLinter from databricks.labs.ucx.source_code.python.python_ast import Tree -from databricks.labs.ucx.source_code.notebooks.sources import FileLinter +from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger(__name__) @@ -320,12 +330,14 @@ def __init__( resolver: DependencyResolver, path_lookup: PathLookup, migration_index: MigrationIndex, + dfsa_crawler: DfsaCrawler, include_job_ids: list[int] | None = None, ): self._ws = ws self._resolver = resolver self._path_lookup = path_lookup self._migration_index = migration_index + self._dfsa_crawler = dfsa_crawler self._include_job_ids = include_job_ids def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): @@ -412,6 +424,9 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker + collector 
= DfsaCollector(graph, set(), self._path_lookup, session_state) + dfsas = list(dfsa for dfsa in collector) + self._dfsa_crawler.append(dfsas) class LintingWalker(DependencyGraphWalker[LocatedAdvice]): @@ -441,3 +456,72 @@ def _process_dependency( linter = FileLinter(ctx, path_lookup, self._session_state, dependency.path, inherited_tree) for advice in linter.lint(): yield LocatedAdvice(advice, dependency.path) + + +class DfsaCollector(DependencyGraphWalker[DFSA]): + + def __init__( + self, + graph: DependencyGraph, + walked_paths: set[Path], + path_lookup: PathLookup, + session_state: CurrentSessionState, + ): + super().__init__(graph, walked_paths, path_lookup) + self._session_state = session_state + + def _process_dependency( + self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None + ) -> Iterable[DFSA]: + language = file_language(dependency.path) + if not language: + logger.warning(f"Unknown language for {dependency.path}") + return + source = dependency.path.read_text(guess_encoding(dependency.path)) + if is_a_notebook(dependency.path): + yield from self._collect_from_notebook(source, language, dependency.path, inherited_tree) + elif dependency.path.is_file(): + yield from self._collect_from_source(source, language, dependency.path, inherited_tree) + + def _collect_from_notebook( + self, source: str, language: Language, path: Path, inherited_tree: Tree | None + ) -> Iterable[DFSA]: + notebook = Notebook.parse(path, source, language) + for cell in notebook.cells: + for dfsa in self._collect_from_source(cell.original_code, cell.language.language, path, inherited_tree): + yield DFSA( + source_type="NOTEBOOK", + source_id=str(path), + path=dfsa.path, + is_read=dfsa.is_read, + is_write=dfsa.is_write, + ) + if cell.language.language is Language.PYTHON: + if inherited_tree is None: + inherited_tree = Tree.new_module() + tree = Tree.normalize_and_parse(cell.original_code) + inherited_tree.append_tree(tree) + + def _collect_from_source( + self, source: str, language: Language, path: Path, inherited_tree: Tree | None + ) -> Iterable[DFSA]: + iterable: Iterable[DFSA] | None = None + if language is Language.SQL: + iterable = self._collect_from_sql(source) + if language is Language.PYTHON: + iterable = self._collect_from_python(source, inherited_tree) + if iterable is None: + logger.warning(f"Language {language.name} not supported yet!") + return + for dfsa in iterable: + yield DFSA( + source_type="FILE", source_id=str(path), path=dfsa.path, is_read=dfsa.is_read, is_write=dfsa.is_write + ) + + def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DFSA]: + linter = DfsaPyLinter(self._session_state, prevent_spark_duplicates=False) + yield from linter.collect_dfsas(source, inherited_tree) + + def _collect_from_sql(self, source: str) -> Iterable[DFSA]: + linter = DfsaSqlLinter() + yield from linter.collect_dfsas(source) diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 4a54818456..6fd87e41c1 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -4,8 +4,8 @@ from collections.abc import Iterable from astroid import Call, Const, InferenceError, NodeNG # type: ignore -from sqlglot import Expression -from sqlglot.expressions import Table +from sqlglot import Expression as SqlExpression, parse as parse_sql, ParseError as SqlParseError +from sqlglot.expressions import Alter, Create, Delete, Drop, Identifier, 
Insert, Literal, Select from databricks.labs.ucx.source_code.base import ( Advice, @@ -146,6 +146,15 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: ) yield advisory + def collect_dfsas(self, python_code: str, inherited_tree: Tree | None) -> Iterable[DFSA]: + tree = Tree.new_module() + if inherited_tree: + tree.append_tree(inherited_tree) + tree.append_tree(Tree.normalize_and_parse(python_code)) + visitor = _DetectDfsaVisitor(self._session_state, self._prevent_spark_duplicates) + visitor.visit(tree.node) + yield from visitor.dfsa_nodes + class DfsaSqlLinter(SqlLinter): @@ -153,23 +162,68 @@ class DfsaSqlLinter(SqlLinter): def name() -> str: return 'dfsa-query' - def lint_expression(self, expression: Expression): - for table in expression.find_all(Table): - # Check table names for direct file system access - yield from self._check_dfsa(table) - - def _check_dfsa(self, table: Table) -> Iterable[Advice]: - """ - Check if the table is a DBFS table or reference in some way - and yield a deprecation message if it is - """ - if any(pattern.matches(table.name) for pattern in DFSA_PATTERNS): + def lint_expression(self, expression: SqlExpression): + for dfsa in self._collect_dfsas(expression): yield Deprecation( code='direct-filesystem-access-in-sql-query', - message=f"The use of direct filesystem references is deprecated: {table.name}", + message=f"The use of direct filesystem references is deprecated: {dfsa.path}", # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159 start_line=0, start_col=0, end_line=0, end_col=1024, ) + + def collect_dfsas(self, sql_code: str): + try: + expressions = parse_sql(sql_code, read='databricks') + for expression in expressions: + if not expression: + continue + yield from self._collect_dfsas(expression) + except SqlParseError as e: + logger.debug(f"Failed to parse SQL: {sql_code}", exc_info=e) + + @classmethod + def _collect_dfsas(cls, expression: SqlExpression) -> Iterable[DFSA]: + yield from cls._collect_dfsas_from_literals(expression) + yield from cls._collect_dfsas_from_identifiers(expression) + + @classmethod + def _collect_dfsas_from_literals(cls, expression: SqlExpression) -> Iterable[DFSA]: + for literal in expression.find_all(Literal): + if not isinstance(literal.this, str): + logger.warning(f"Can't interpret {type(literal.this).__name__}") + yield from cls._collect_dfsa_from_node(literal, literal.this) + + @classmethod + def _collect_dfsas_from_identifiers(cls, expression: SqlExpression) -> Iterable[DFSA]: + for identifier in expression.find_all(Identifier): + if not isinstance(identifier.this, str): + logger.warning(f"Can't interpret {type(identifier.this).__name__}") + yield from cls._collect_dfsa_from_node(identifier, identifier.this) + + @classmethod + def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterable[DFSA]: + if any(pattern.matches(path) for pattern in DFSA_PATTERNS): + is_read = cls._is_read(expression) + is_write = cls._is_write(expression) + yield DFSA(source_type=DFSA.UNKNOWN, source_id=DFSA.UNKNOWN, path=path, is_read=is_read, is_write=is_write) + + @classmethod + def _is_read(cls, expression: SqlExpression | None) -> bool: + expression = cls._walk_up(expression) + return isinstance(expression, Select) + + @classmethod + def _is_write(cls, expression: SqlExpression | None) -> bool: + expression = cls._walk_up(expression) + return isinstance(expression, (Create, Alter, Drop, Insert, Delete)) + + @classmethod + def _walk_up(cls, expression: SqlExpression | None) -> 
SqlExpression | None: + if expression is None: + return None + if isinstance(expression, (Create, Alter, Drop, Insert, Delete, Select)): + return expression + return cls._walk_up(expression.parent) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index df70470041..24c6020077 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -1,9 +1,12 @@ import pytest +from databricks.labs.lsql.backends import MockBackend + from databricks.labs.ucx.hive_metastore.migration_status import ( MigrationStatus, ) from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler from databricks.labs.ucx.source_code.graph import DependencyResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader @@ -55,3 +58,8 @@ def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolv notebook_resolver = NotebookResolver(NotebookLoader()) import_resolver = ImportFileResolver(FileLoader(), allow_list) return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) + + +@pytest.fixture +def mock_dfsa_crawler() -> DfsaCrawler: + return DfsaCrawler(MockBackend(), "schema") diff --git a/tests/unit/source_code/samples/functional/file-access/create_location.py b/tests/unit/source_code/samples/functional/file-access/create_location.py index 940640f7ae..178ddf4c62 100644 --- a/tests/unit/source_code/samples/functional/file-access/create_location.py +++ b/tests/unit/source_code/samples/functional/file-access/create_location.py @@ -4,10 +4,12 @@ # COMMAND ---------- +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/ # MAGIC %sql # MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' # COMMAND ---------- +# ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_550/ # MAGIC %sql # MAGIC CREATE TABLE hive_metastore.indices_historical_data.sp_550 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_550/' diff --git a/tests/unit/source_code/test_dfsa_crawler.py b/tests/unit/source_code/test_dfsa_crawler.py index 93652c4aee..d442805c40 100644 --- a/tests/unit/source_code/test_dfsa_crawler.py +++ b/tests/unit/source_code/test_dfsa_crawler.py @@ -7,8 +7,9 @@ def test_crawler_appends_dfsas(): backend = MockBackend() crawler = DfsaCrawler(backend, "schema") - for path in ("a", "b", "c"): - dfsa = DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) - crawler.append(dfsa) + dfsas = list( + DFSA(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) for path in ("a", "b", "c") + ) + crawler.append(dfsas) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index ed2f6fe6dc..d4d5250058 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -249,7 +249,7 @@ def test_functional_with_parent( @pytest.mark.skip(reason="Used for troubleshooting failing tests") def 
test_one_functional(mock_path_lookup, simple_dependency_resolver, extended_test_index): - path = mock_path_lookup.resolve(Path("functional/table-migration/table-migration-notebook.sql")) + path = mock_path_lookup.resolve(Path("functional/file-access/create_location.py")) path_lookup = mock_path_lookup.change_directory(path.parent) sample = Functional(path) sample.verify(path_lookup, simple_dependency_resolver, extended_test_index) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 437649b8e6..e21af86cfa 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -229,11 +229,13 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( assert registered_notebooks == [expected_path_instance] -def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_lookup, empty_index, caplog): +def test_workflow_linter_lint_job_logs_problems( + dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler, caplog +): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) From 2646accefed35acc2c23da2a222bc1ba3a1356dd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 3 Sep 2024 18:27:30 +0200 Subject: [PATCH 26/80] fix failing tests --- tests/integration/source_code/test_jobs.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 2484d15a5c..e0f49d4a2b 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -122,10 +122,8 @@ def test_job_task_linter_library_installed_cluster( def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, make_notebook, make_random, caplog): expected_messages = { - 'second_notebook:3 [dbfs-usage] Deprecated file system path: /mnt/something', - 'second_notebook:3 [implicit-dbfs-usage] The use of default dbfs: references is deprecated: /mnt/something', - 'some_file.py:0 [dbfs-usage] Deprecated file system path: /mnt/foo/bar', - 'some_file.py:0 [implicit-dbfs-usage] The use of default dbfs: references is deprecated: /mnt/foo/bar', + 'some_file.py:0 [direct-filesystem-access] The use of direct filesystem references is deprecated: /mnt/foo/bar', + 'second_notebook:3 [direct-filesystem-access] The use of direct filesystem references is deprecated: /mnt/something', } entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}-{get_purge_suffix()}").expanduser() @@ -149,7 +147,8 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): problems = simple_ctx.workflow_linter.lint_job(j.job_id) - messages = {replace(p, path=Path(p.path).relative_to(entrypoint)).as_message() for p in problems} + root = Path(entrypoint.as_posix()) + messages = {replace(p, path=Path(p.path).relative_to(root)).as_message() for p in problems} assert messages == expected_messages last_messages = caplog.messages[-1].split("\n") From 
a5416c01fd541f0accaf7581267828f964ef4cb6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 11:39:36 +0200 Subject: [PATCH 27/80] finalize integration --- .../labs/ucx/contexts/application.py | 6 +++++ src/databricks/labs/ucx/source_code/jobs.py | 24 ++++++++++--------- .../labs/ucx/source_code/linters/dfsa.py | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 3d9e45aa61..c3b40e039b 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -15,6 +15,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator +from databricks.labs.ucx.source_code.dfsa_crawler import DfsaCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.errors import ResourceDoesNotExist @@ -425,9 +426,14 @@ def workflow_linter(self): self.dependency_resolver, self.path_lookup, MigrationIndex([]), # TODO: bring back self.tables_migrator.index() + self.dfsa_crawler, self.config.include_job_ids, ) + @cached_property + def dfsa_crawler(self): + return DfsaCrawler(self.sql_backend, self.inventory_database) + @cached_property def redash(self): return Redash( diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b8c534adb6..acff9be8e7 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -16,7 +16,6 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import compute, jobs -from databricks.sdk.service.workspace import Language from databricks.labs.ucx.assessment.crawlers import runtime_version_tuple from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex @@ -41,6 +40,7 @@ ) from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.dfsa import DfsaSqlLinter, DfsaPyLinter +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -477,18 +477,19 @@ def _process_dependency( if not language: logger.warning(f"Unknown language for {dependency.path}") return + cell_language = CellLanguage.of_language(language) source = dependency.path.read_text(guess_encoding(dependency.path)) if is_a_notebook(dependency.path): - yield from self._collect_from_notebook(source, language, dependency.path, inherited_tree) + yield from self._collect_from_notebook(source, cell_language, dependency.path, inherited_tree) elif dependency.path.is_file(): - yield from self._collect_from_source(source, language, dependency.path, inherited_tree) + yield from self._collect_from_source(source, cell_language, dependency.path, inherited_tree) def _collect_from_notebook( - self, source: str, language: Language, path: Path, inherited_tree: Tree | None + self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DFSA]: - notebook = Notebook.parse(path, 
source, language) + notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: - for dfsa in self._collect_from_source(cell.original_code, cell.language.language, path, inherited_tree): + for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): yield DFSA( source_type="NOTEBOOK", source_id=str(path), @@ -496,19 +497,19 @@ def _collect_from_notebook( is_read=dfsa.is_read, is_write=dfsa.is_write, ) - if cell.language.language is Language.PYTHON: + if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() tree = Tree.normalize_and_parse(cell.original_code) inherited_tree.append_tree(tree) def _collect_from_source( - self, source: str, language: Language, path: Path, inherited_tree: Tree | None + self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DFSA]: iterable: Iterable[DFSA] | None = None - if language is Language.SQL: + if language is CellLanguage.SQL: iterable = self._collect_from_sql(source) - if language is Language.PYTHON: + if language is CellLanguage.PYTHON: iterable = self._collect_from_python(source, inherited_tree) if iterable is None: logger.warning(f"Language {language.name} not supported yet!") @@ -520,7 +521,8 @@ def _collect_from_source( def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DFSA]: linter = DfsaPyLinter(self._session_state, prevent_spark_duplicates=False) - yield from linter.collect_dfsas(source, inherited_tree) + for dfsa_node in linter.collect_dfsas(source, inherited_tree): + yield dfsa_node.dfsa def _collect_from_sql(self, source: str) -> Iterable[DFSA]: linter = DfsaSqlLinter() diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/dfsa.py index 6fd87e41c1..9bd7caa6ff 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/dfsa.py @@ -146,7 +146,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: ) yield advisory - def collect_dfsas(self, python_code: str, inherited_tree: Tree | None) -> Iterable[DFSA]: + def collect_dfsas(self, python_code: str, inherited_tree: Tree | None) -> Iterable[DFSANode]: tree = Tree.new_module() if inherited_tree: tree.append_tree(inherited_tree) From bba91c9db3c1a8e31aa2f6ff19545f34cf6b2cb8 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 11:59:28 +0200 Subject: [PATCH 28/80] add logs --- src/databricks/labs/ucx/source_code/dfsa_crawler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py index fa3ce36f8d..f2b4109b3b 100644 --- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -1,9 +1,12 @@ +import logging from collections.abc import Sequence from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.source_code.base import DFSA from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk.errors import DatabricksError +logger = logging.getLogger(__name__) class DfsaCrawler(CrawlerBase): @@ -18,4 +21,8 @@ def __init__(self, backend: SqlBackend, schema: str): super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DFSA) def append(self, dfsas: Sequence[DFSA]): - self._append_records(dfsas) + try: + self._append_records(dfsas) + except DatabricksError 
as e:
+            logger.error("Failed to store DFSAs", exc_info=e)
+

From 2da934e79cc1b08c3fc0aae95fd1385758b9d58a Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 4 Sep 2024 12:18:05 +0200
Subject: [PATCH 29/80] enhance integration test for checking stored DFSAs

---
 src/databricks/labs/ucx/source_code/dfsa_crawler.py | 6 +++++-
 tests/integration/source_code/test_jobs.py | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py
index f2b4109b3b..e4abaa81c7 100644
--- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py
+++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py
@@ -1,5 +1,5 @@
 import logging
-from collections.abc import Sequence
+from collections.abc import Sequence, Iterable
 from databricks.labs.ucx.framework.crawlers import CrawlerBase
 from databricks.labs.ucx.source_code.base import DFSA
 from databricks.labs.lsql.backends import SqlBackend
@@ -26,3 +26,7 @@ def append(self, dfsas: Sequence[DFSA]):
         except DatabricksError as e:
             logger.error("Failed to store DFSAs", exc_info=e)
 
+    def snapshot(self) -> Iterable[DFSA]:
+        sql = f"SELECT * FROM {self.full_name}"
+        yield from self._backend.fetch(sql)
+
diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py
index e0f49d4a2b..6a4bd9d6ab 100644
--- a/tests/integration/source_code/test_jobs.py
+++ b/tests/integration/source_code/test_jobs.py
@@ -154,6 +154,9 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job,
     last_messages = caplog.messages[-1].split("\n")
     assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages)
 
+    dfsas = simple_ctx.dfsa_crawler.snapshot()
+    assert len(list(dfsas)) == 2
+
 
 def test_workflow_linter_lints_job_with_import_pypi_library(
     simple_ctx,

From 5c444ffef0ac7323cb56de0f7e8d9ac86383d366 Mon Sep 17 00:00:00 2001
From: Eric Vergnaud
Date: Wed, 4 Sep 2024 16:24:36 +0200
Subject: [PATCH 30/80] move 'magic'-related stuff to dedicated file

---
 .../labs/ucx/source_code/notebooks/cells.py | 8 +-
 .../labs/ucx/source_code/notebooks/magic.py | 103 ++++++++++++++++++
 .../labs/ucx/source_code/notebooks/sources.py | 2 +-
 .../ucx/source_code/python/python_analyzer.py | 93 +---------------
 .../unit/source_code/notebooks/test_cells.py | 2 +-
 5 files changed, 111 insertions(+), 97 deletions(-)
 create mode 100644 src/databricks/labs/ucx/source_code/notebooks/magic.py

diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py
index a2b994972a..0447d4c513 100644
--- a/src/databricks/labs/ucx/source_code/notebooks/cells.py
+++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py
@@ -20,12 +20,8 @@
     DependencyGraphContext,
     InheritedContext,
 )
-from databricks.labs.ucx.source_code.python.python_analyzer import (
-    PythonCodeAnalyzer,
-    MagicCommand,
-    MagicNode,
-    magic_command_factory,
-)
+from databricks.labs.ucx.source_code.python.python_analyzer import PythonCodeAnalyzer
+from databricks.labs.ucx.source_code.notebooks.magic import MagicNode, MagicCommand, magic_command_factory
 
 # use a specific logger for sqlglot warnings so we can disable them selectively
 sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot")
diff --git a/src/databricks/labs/ucx/source_code/notebooks/magic.py b/src/databricks/labs/ucx/source_code/notebooks/magic.py
new file mode 100644
index 0000000000..fd728faa8b
--- /dev/null
+++ b/src/databricks/labs/ucx/source_code/notebooks/magic.py
@@ -0,0 +1,103 @@
+from 
__future__ import annotations + +import logging +from abc import ABC, abstractmethod +from collections.abc import Callable +from pathlib import Path +from typing import TypeVar + +from astroid import NodeNG, Call, Name, Const # type: ignore + +from databricks.labs.ucx.source_code.graph import ( + DependencyGraph, + DependencyProblem, + DependencyGraphContext, + InheritedContext, +) +from databricks.labs.ucx.source_code.python.python_ast import NodeBase, Tree + + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + + +class MagicLine(NodeBase): + + @classmethod + def extract_from_tree( + cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] + ) -> tuple[list[MagicLine], list[T]]: + problems: list[T] = [] + commands: list[MagicLine] = [] + try: + nodes = tree.locate(Call, [("magic_command", Name)]) + for command in cls._make_commands_for_magic_command_call_nodes(nodes): + commands.append(command) + except Exception as e: # pylint: disable=broad-except + logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) + problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) + problems.append(problem) + return commands, problems + + @classmethod + def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): + for node in nodes: + arg = node.args[0] + if isinstance(arg, Const): + yield MagicLine(node, arg.value) + + def __init__(self, node: NodeNG, command: bytes): + super().__init__(node) + self._command = command.decode() + + def as_magic(self) -> MagicCommand | None: + for factory in _FACTORIES: + command = factory(self._command, self.node) + if command is not None: + return command + return None + + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + magic = self.as_magic() + if magic is not None: + return magic.build_dependency_graph(parent) + problem = DependencyProblem.from_node( + code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node + ) + return [problem] + + def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: + magic = self.as_magic() + if magic is not None: + return magic.build_inherited_context(context, child_path) + return InheritedContext(None, False) + + +class MagicNode(NodeNG): + pass + + +class MagicCommand(ABC): + + def __init__(self, node: NodeNG, code: str): + self._node = node + self._code = code + + @abstractmethod + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... 
+ + def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: + return InheritedContext(None, False) + + +_FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] + + +def magic_command_factory(func: Callable[[str, NodeNG], MagicCommand | None]): + _FACTORIES.append(func) + + def inner(command: str, node: NodeNG) -> MagicCommand | None: + return func(command, node) + + return inner diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index ab7a51cf3c..c17f937cbe 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -35,7 +35,7 @@ SysPathChange, UnresolvedPath, ) -from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine +from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index d2a2454b2f..f2c52114ae 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -1,18 +1,16 @@ from __future__ import annotations import logging -from abc import ABC, abstractmethod -from collections.abc import Iterable, Callable +from collections.abc import Iterable from pathlib import Path -from typing import cast, TypeVar +from typing import cast -from astroid import AstroidSyntaxError, Call, Const, ImportFrom, NodeNG, Try, Name # type: ignore +from astroid import AstroidSyntaxError, ImportFrom, Try, Name # type: ignore from databricks.labs.ucx.source_code.graph import ( DependencyGraphContext, DependencyProblem, InheritedContext, - DependencyGraph, ) from databricks.labs.ucx.source_code.linters.imports import ( SysPathChange, @@ -21,6 +19,7 @@ NotebookRunCall, UnresolvedPath, ) +from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase logger = logging.getLogger(__name__) @@ -194,87 +193,3 @@ def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[DependencyProbl ) return change.apply_to(self._context.path_lookup) - - -T = TypeVar("T") - - -class MagicLine(NodeBase): - - @classmethod - def extract_from_tree( - cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], T] - ) -> tuple[list[MagicLine], list[T]]: - problems: list[T] = [] - commands: list[MagicLine] = [] - try: - nodes = tree.locate(Call, [("magic_command", Name)]) - for command in cls._make_commands_for_magic_command_call_nodes(nodes): - commands.append(command) - except Exception as e: # pylint: disable=broad-except - logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True) - problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) - problems.append(problem) - return commands, problems - - @classmethod - def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): - for node in nodes: - arg = node.args[0] - if isinstance(arg, Const): - yield MagicLine(node, arg.value) - - def __init__(self, node: NodeNG, command: bytes): - super().__init__(node) - self._command = command.decode() - - def as_magic(self) -> MagicCommand | None: - for 
factory in _FACTORIES: - command = factory(self._command, self.node) - if command is not None: - return command - return None - - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - magic = self.as_magic() - if magic is not None: - return magic.build_dependency_graph(parent) - problem = DependencyProblem.from_node( - code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node - ) - return [problem] - - def build_inherited_context(self, context: DependencyGraphContext, child_path: Path) -> InheritedContext: - magic = self.as_magic() - if magic is not None: - return magic.build_inherited_context(context, child_path) - return InheritedContext(None, False) - - -class MagicNode(NodeNG): - pass - - -class MagicCommand(ABC): - - def __init__(self, node: NodeNG, code: str): - self._node = node - self._code = code - - @abstractmethod - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: ... - - def build_inherited_context(self, _context: DependencyGraphContext, _child_path: Path) -> InheritedContext: - return InheritedContext(None, False) - - -_FACTORIES: list[Callable[[str, NodeNG], MagicCommand | None]] = [] - - -def magic_command_factory(func: Callable[[str, NodeNG], MagicCommand | None]): - _FACTORIES.append(func) - - def inner(command: str, node: NodeNG) -> MagicCommand | None: - return func(command, node) - - return inner diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 25f794f2eb..9d1988cd12 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -8,7 +8,7 @@ from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, DependencyProblem from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver -from databricks.labs.ucx.source_code.python.python_analyzer import MagicLine +from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, From 7f93d107623d122f61ef892b6eadb94d509ac96b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 16:44:56 +0200 Subject: [PATCH 31/80] formatting --- src/databricks/labs/ucx/mixins/fixtures.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index 2e26293eef..ac06c8b6b7 100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -1248,7 +1248,9 @@ def create() -> Wait[ServingEndpointDetailed]: endpoint_name, EndpointCoreConfigInput( served_models=[ - ServedModelInput(model.name, "1", ServedModelInputWorkloadSize.SMALL, scale_to_zero_enabled=True) + ServedModelInput( + model.name, "1", workload_size=ServedModelInputWorkloadSize.SMALL, scale_to_zero_enabled=True + ) ] ), ) From 7192e851a6f7c8225f931f21dd7fe83ea19e26c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 16:57:13 +0200 Subject: [PATCH 32/80] fix failing tests --- .../samples/functional/file-access/create_location.sql | 1 + .../source_code/samples/functional/file-access/select_format.sql | 1 + .../samples/functional/file-access/select_read_files.sql | 1 + 3 files changed, 3 insertions(+) diff 
--git a/tests/unit/source_code/samples/functional/file-access/create_location.sql b/tests/unit/source_code/samples/functional/file-access/create_location.sql index 4f90fd669d..2b6b4b3aeb 100644 --- a/tests/unit/source_code/samples/functional/file-access/create_location.sql +++ b/tests/unit/source_code/samples/functional/file-access/create_location.sql @@ -1,2 +1,3 @@ -- Databricks notebook source +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/ CREATE TABLE hive_metastore.indices_historical_data.sp_500 LOCATION 's3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/' diff --git a/tests/unit/source_code/samples/functional/file-access/select_format.sql b/tests/unit/source_code/samples/functional/file-access/select_format.sql index 76d91894f2..d64358a23d 100644 --- a/tests/unit/source_code/samples/functional/file-access/select_format.sql +++ b/tests/unit/source_code/samples/functional/file-access/select_format.sql @@ -1,2 +1,3 @@ -- Databricks notebook source +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: hdfs://examples/src/main/resources/users.parquet SELECT * FROM parquet.`hdfs://examples/src/main/resources/users.parquet` diff --git a/tests/unit/source_code/samples/functional/file-access/select_read_files.sql b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql index e326eec5f5..cd2c86cbe1 100644 --- a/tests/unit/source_code/samples/functional/file-access/select_read_files.sql +++ b/tests/unit/source_code/samples/functional/file-access/select_read_files.sql @@ -1,2 +1,3 @@ -- Databricks notebook source +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/file.csv SELECT * FROM read_files("s3a://db-gtm-industry-solutions/data/fsi/capm/sp_500/file.csv") LIMIT 10 From d4be072df9cf007cc2b4af37e93992dd497cf708 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 4 Sep 2024 18:18:33 +0200 Subject: [PATCH 33/80] formatting --- src/databricks/labs/ucx/source_code/dfsa_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/dfsa_crawler.py b/src/databricks/labs/ucx/source_code/dfsa_crawler.py index e4abaa81c7..c552628dbe 100644 --- a/src/databricks/labs/ucx/source_code/dfsa_crawler.py +++ b/src/databricks/labs/ucx/source_code/dfsa_crawler.py @@ -8,6 +8,7 @@ logger = logging.getLogger(__name__) + class DfsaCrawler(CrawlerBase): def __init__(self, backend: SqlBackend, schema: str): @@ -29,4 +30,3 @@ def append(self, dfsas: Sequence[DFSA]): def snapshot(self) -> Iterable[DFSA]: sql = f"SELECT * FROM {self.full_name}" yield from self._backend.fetch(sql) - From e552e3efd6fec09607ab2ac0e568c95f61337cb0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:13:57 +0200 Subject: [PATCH 34/80] rename dfsa -> directfs --- .../labs/ucx/source_code/linters/context.py | 6 +- .../linters/{dfsa.py => directfs.py} | 58 +++++++++---------- .../labs/ucx/source_code/linters/pyspark.py | 4 +- .../{test_dfsa.py => test_directfs.py} | 18 +++--- 4 files changed, 43 insertions(+), 43 deletions(-) rename src/databricks/labs/ucx/source_code/linters/{dfsa.py => directfs.py} (80%) rename tests/unit/source_code/linters/{test_dfsa.py => test_directfs.py} (88%) diff --git a/src/databricks/labs/ucx/source_code/linters/context.py 
b/src/databricks/labs/ucx/source_code/linters/context.py index 1106b85612..9cec44b2eb 100644 --- a/src/databricks/labs/ucx/source_code/linters/context.py +++ b/src/databricks/labs/ucx/source_code/linters/context.py @@ -12,7 +12,7 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter +from databricks.labs.ucx.source_code.linters.directfs import DirectFsPyLinter, DirectFsSqlLinter from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter from databricks.labs.ucx.source_code.linters.pyspark import SparkSqlPyLinter @@ -40,12 +40,12 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe python_fixers.append(SparkSqlPyLinter(from_table, index, session_state)) python_linters += [ - DfsaPyLinter(session_state), + DirectFsPyLinter(session_state), DBRv8d0PyLinter(dbr_version=session_state.dbr_version), SparkConnectPyLinter(session_state), DbutilsPyLinter(session_state), ] - sql_linters.append(DfsaSqlLinter()) + sql_linters.append(DirectFsSqlLinter()) self._linters: dict[Language, list[SqlLinter] | list[PythonLinter]] = { Language.PYTHON: python_linters, diff --git a/src/databricks/labs/ucx/source_code/linters/dfsa.py b/src/databricks/labs/ucx/source_code/linters/directfs.py similarity index 80% rename from src/databricks/labs/ucx/source_code/linters/dfsa.py rename to src/databricks/labs/ucx/source_code/linters/directfs.py index da6b343252..b44465ddd6 100644 --- a/src/databricks/labs/ucx/source_code/linters/dfsa.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -class DFSAPattern(ABC): +class DirectFsPattern(ABC): def __init__(self, prefix: str, allowed_roots: list[str]): self._prefix = prefix @@ -33,43 +33,43 @@ def _matches_allowed_root(self, value: str): return any(value.startswith(f"{self._prefix}/{root}") for root in self._allowed_roots) -class RootPattern(DFSAPattern): +class RootPattern(DirectFsPattern): def _matches_allowed_root(self, value: str): return any(value.startswith(f"/{root}") for root in self._allowed_roots) # the below aims to implement https://docs.databricks.com/en/files/index.html -DFSA_PATTERNS = [ - DFSAPattern("dbfs:/", []), - DFSAPattern("file:/", ["Workspace/", "tmp/"]), - DFSAPattern("s3:/", []), - DFSAPattern("s3n:/", []), - DFSAPattern("s3a:/", []), - DFSAPattern("wasb:/", []), - DFSAPattern("wasbs:/", []), - DFSAPattern("abfs:/", []), - DFSAPattern("abfss:/", []), - DFSAPattern("hdfs:/", []), +DIRECT_FS_PATTERNS = [ + DirectFsPattern("dbfs:/", []), + DirectFsPattern("file:/", ["Workspace/", "tmp/"]), + DirectFsPattern("s3:/", []), + DirectFsPattern("s3n:/", []), + DirectFsPattern("s3a:/", []), + DirectFsPattern("wasb:/", []), + DirectFsPattern("wasbs:/", []), + DirectFsPattern("abfs:/", []), + DirectFsPattern("abfss:/", []), + DirectFsPattern("hdfs:/", []), # "/mnt/" is detected by the below pattern, RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), ] @dataclass -class DFSA: +class DirectFsAccess: """A DFSA is a record describing a Direct File System Access""" path: str @dataclass -class DFSANode: - dfsa: DFSA +class DirectFsNode: + dfsa: DirectFsAccess node: NodeNG -class _DetectDfsaVisitor(TreeVisitor): +class _DetectDirectFsVisitor(TreeVisitor): """ Visitor that detects file system paths in Python code and checks them against a list of known deprecated paths. 
@@ -77,7 +77,7 @@ class _DetectDfsaVisitor(TreeVisitor): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: self._session_state = session_state - self._dfsa_nodes: list[DFSANode] = [] + self._directfs_nodes: list[DirectFsNode] = [] self._reported_locations: set[tuple[int, int]] = set() self._allow_spark_duplicates = allow_spark_duplicates @@ -107,8 +107,8 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): - self._dfsa_nodes.append(DFSANode(DFSA(value), source_node)) + if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): + self._directfs_nodes.append(DirectFsNode(DirectFsAccess(value), source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -116,11 +116,11 @@ def _already_reported(self, source_node: NodeNG, inferred: InferredValue): return any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes) @property - def dfsa_nodes(self): - return self._dfsa_nodes + def directfs_nodes(self): + return self._directfs_nodes -class DfsaPyLinter(PythonLinter): +class DirectFsPyLinter(PythonLinter): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): self._session_state = session_state @@ -137,18 +137,18 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDfsaVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDirectFsVisitor(self._session_state, self._allow_spark_duplicates) visitor.visit(tree.node) - for dfsa_node in visitor.dfsa_nodes: + for directfs_node in visitor.directfs_nodes: advisory = Deprecation.from_node( code='direct-filesystem-access', - message=f"The use of direct filesystem references is deprecated: {dfsa_node.dfsa.path}", - node=dfsa_node.node, + message=f"The use of direct filesystem references is deprecated: {directfs_node.dfsa.path}", + node=directfs_node.node, ) yield advisory -class DfsaSqlLinter(SqlLinter): +class DirectFsSqlLinter(SqlLinter): @staticmethod def name() -> str: @@ -164,7 +164,7 @@ def _check_dfsa(self, table: Table) -> Iterable[Advice]: Check if the table is a DBFS table or reference in some way and yield a deprecation message if it is """ - if any(pattern.matches(table.name) for pattern in DFSA_PATTERNS): + if any(pattern.matches(table.name) for pattern in DIRECT_FS_PATTERNS): yield Deprecation( code='direct-filesystem-access-in-sql-query', message=f"The use of direct filesystem references is deprecated: {table.name}", diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index a537757add..4cbe11d506 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,7 +13,7 @@ CurrentSessionState, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.dfsa import DFSA_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, 
TreeHelper @@ -201,7 +201,7 @@ def lint( logger.debug(f"Could not infer value of {table_arg.as_string()}") continue value = inferred.as_string() - if any(pattern.matches(value) for pattern in DFSA_PATTERNS): + if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): yield Deprecation.from_node( code='direct-filesystem-access', message=f"The use of direct filesystem references is deprecated: {value}", diff --git a/tests/unit/source_code/linters/test_dfsa.py b/tests/unit/source_code/linters/test_directfs.py similarity index 88% rename from tests/unit/source_code/linters/test_dfsa.py rename to tests/unit/source_code/linters/test_directfs.py index 11cb5a1d08..44e38406e9 100644 --- a/tests/unit/source_code/linters/test_dfsa.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -1,7 +1,7 @@ import pytest from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure -from databricks.labs.ucx.source_code.linters.dfsa import DfsaPyLinter, DfsaSqlLinter, DFSA_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS, DirectFsPyLinter, DirectFsSqlLinter @pytest.mark.parametrize( @@ -17,7 +17,7 @@ ) def test_matches_dfsa_pattern(path, matches): """see https://github.com/databrickslabs/ucx/issues/2350""" - matched = any(pattern.matches(path) for pattern in DFSA_PATTERNS) + matched = any(pattern.matches(path) for pattern in DIRECT_FS_PATTERNS) assert matches == matched @@ -33,7 +33,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -63,7 +63,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_dfsa_usage_linter(code, expected): - linter = DfsaPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) count = 0 for advice in advices: @@ -73,7 +73,7 @@ def test_dfsa_usage_linter(code, expected): def test_dfsa_name(): - linter = DfsaPyLinter(CurrentSessionState()) + linter = DirectFsPyLinter(CurrentSessionState()) assert linter.name() == "dfsa-usage" @@ -87,7 +87,7 @@ def test_dfsa_name(): ], ) def test_non_dfsa_triggers_nothing(query): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() assert not list(ftf.lint(query)) @@ -109,7 +109,7 @@ def test_non_dfsa_triggers_nothing(query): ], ) def test_dfsa_tables_trigger_messages_param(query: str, table: str): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Deprecation( @@ -130,7 +130,7 @@ def test_dfsa_tables_trigger_messages_param(query: str, table: str): ], ) def test_dfsa_queries_failure(query: str): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Failure( @@ -145,5 +145,5 @@ def test_dfsa_queries_failure(query: str): def test_dfsa_queries_name(): - ftf = DfsaSqlLinter() + ftf = DirectFsSqlLinter() assert ftf.name() == 'dfsa-query' From e30ccfa4a45fcafae2fce76d137eef7209472442 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:28:51 +0200 Subject: [PATCH 35/80] improve naming and drop /tmp/ exclusion --- .../labs/ucx/source_code/linters/context.py | 6 +- .../labs/ucx/source_code/linters/directfs.py | 57 ++++++++----------- 
.../labs/ucx/source_code/linters/pyspark.py | 4 +- .../unit/source_code/linters/test_directfs.py | 35 +++++------- .../unit/source_code/linters/test_pyspark.py | 15 ++++- 5 files changed, 54 insertions(+), 63 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/context.py b/src/databricks/labs/ucx/source_code/linters/context.py index 9cec44b2eb..7f9f589680 100644 --- a/src/databricks/labs/ucx/source_code/linters/context.py +++ b/src/databricks/labs/ucx/source_code/linters/context.py @@ -12,7 +12,7 @@ PythonLinter, SqlLinter, ) -from databricks.labs.ucx.source_code.linters.directfs import DirectFsPyLinter, DirectFsSqlLinter +from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter from databricks.labs.ucx.source_code.linters.pyspark import SparkSqlPyLinter @@ -40,12 +40,12 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe python_fixers.append(SparkSqlPyLinter(from_table, index, session_state)) python_linters += [ - DirectFsPyLinter(session_state), + DirectFsAccessPyLinter(session_state), DBRv8d0PyLinter(dbr_version=session_state.dbr_version), SparkConnectPyLinter(session_state), DbutilsPyLinter(session_state), ] - sql_linters.append(DirectFsSqlLinter()) + sql_linters.append(DirectFsAccessSqlLinter()) self._linters: dict[Language, list[SqlLinter] | list[PythonLinter]] = { Language.PYTHON: python_linters, diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index b44465ddd6..1ba4440b73 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -class DirectFsPattern(ABC): +class DirectFsAccessPattern(ABC): def __init__(self, prefix: str, allowed_roots: list[str]): self._prefix = prefix @@ -33,26 +33,26 @@ def _matches_allowed_root(self, value: str): return any(value.startswith(f"{self._prefix}/{root}") for root in self._allowed_roots) -class RootPattern(DirectFsPattern): +class RootPattern(DirectFsAccessPattern): def _matches_allowed_root(self, value: str): return any(value.startswith(f"/{root}") for root in self._allowed_roots) # the below aims to implement https://docs.databricks.com/en/files/index.html -DIRECT_FS_PATTERNS = [ - DirectFsPattern("dbfs:/", []), - DirectFsPattern("file:/", ["Workspace/", "tmp/"]), - DirectFsPattern("s3:/", []), - DirectFsPattern("s3n:/", []), - DirectFsPattern("s3a:/", []), - DirectFsPattern("wasb:/", []), - DirectFsPattern("wasbs:/", []), - DirectFsPattern("abfs:/", []), - DirectFsPattern("abfss:/", []), - DirectFsPattern("hdfs:/", []), +DIRECT_FS_ACCESS_PATTERNS = [ + DirectFsAccessPattern("dbfs:/", []), + DirectFsAccessPattern("file:/", ["Workspace/"]), + DirectFsAccessPattern("s3:/", []), + DirectFsAccessPattern("s3n:/", []), + DirectFsAccessPattern("s3a:/", []), + DirectFsAccessPattern("wasb:/", []), + DirectFsAccessPattern("wasbs:/", []), + DirectFsAccessPattern("abfs:/", []), + DirectFsAccessPattern("abfss:/", []), + DirectFsAccessPattern("hdfs:/", []), # "/mnt/" is detected by the below pattern, - RootPattern("/", ["Volumes/", "Workspace/", "tmp/"]), + RootPattern("/", ["Volumes/", "Workspace/"]), ] @@ -64,12 +64,12 @@ class DirectFsAccess: @dataclass -class DirectFsNode: +class DirectFsAccessNode: dfsa: DirectFsAccess node: NodeNG -class _DetectDirectFsVisitor(TreeVisitor): 
+class _DetectDirectFsAccessVisitor(TreeVisitor): """ Visitor that detects file system paths in Python code and checks them against a list of known deprecated paths. @@ -77,7 +77,7 @@ class _DetectDirectFsVisitor(TreeVisitor): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: self._session_state = session_state - self._directfs_nodes: list[DirectFsNode] = [] + self._directfs_nodes: list[DirectFsAccessNode] = [] self._reported_locations: set[tuple[int, int]] = set() self._allow_spark_duplicates = allow_spark_duplicates @@ -107,8 +107,8 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): - self._directfs_nodes.append(DirectFsNode(DirectFsAccess(value), source_node)) + if any(pattern.matches(value) for pattern in DIRECT_FS_ACCESS_PATTERNS): + self._directfs_nodes.append(DirectFsAccessNode(DirectFsAccess(value), source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -120,24 +120,17 @@ def directfs_nodes(self): return self._directfs_nodes -class DirectFsPyLinter(PythonLinter): +class DirectFsAccessPyLinter(PythonLinter): def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): self._session_state = session_state self._allow_spark_duplicates = allow_spark_duplicates - @staticmethod - def name() -> str: - """ - Returns the name of the linter, for reporting etc - """ - return 'dfsa-usage' - def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDirectFsVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDirectFsAccessVisitor(self._session_state, self._allow_spark_duplicates) visitor.visit(tree.node) for directfs_node in visitor.directfs_nodes: advisory = Deprecation.from_node( @@ -148,11 +141,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: yield advisory -class DirectFsSqlLinter(SqlLinter): - - @staticmethod - def name() -> str: - return 'dfsa-query' +class DirectFsAccessSqlLinter(SqlLinter): def lint_expression(self, expression: Expression): for table in expression.find_all(Table): @@ -164,7 +153,7 @@ def _check_dfsa(self, table: Table) -> Iterable[Advice]: Check if the table is a DBFS table or reference in some way and yield a deprecation message if it is """ - if any(pattern.matches(table.name) for pattern in DIRECT_FS_PATTERNS): + if any(pattern.matches(table.name) for pattern in DIRECT_FS_ACCESS_PATTERNS): yield Deprecation( code='direct-filesystem-access-in-sql-query', message=f"The use of direct filesystem references is deprecated: {table.name}", diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 4cbe11d506..7de8cbfbb6 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,7 +13,7 @@ CurrentSessionState, PythonLinter, ) -from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import 
FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper @@ -201,7 +201,7 @@ def lint( logger.debug(f"Could not infer value of {table_arg.as_string()}") continue value = inferred.as_string() - if any(pattern.matches(value) for pattern in DIRECT_FS_PATTERNS): + if any(pattern.matches(value) for pattern in DIRECT_FS_ACCESS_PATTERNS): yield Deprecation.from_node( code='direct-filesystem-access', message=f"The use of direct filesystem references is deprecated: {value}", diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index 44e38406e9..e14b87b242 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -1,7 +1,11 @@ import pytest from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure -from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_PATTERNS, DirectFsPyLinter, DirectFsSqlLinter +from databricks.labs.ucx.source_code.linters.directfs import ( + DIRECT_FS_ACCESS_PATTERNS, + DirectFsAccessPyLinter, + DirectFsAccessSqlLinter, +) @pytest.mark.parametrize( @@ -11,13 +15,13 @@ ("dbfs:/mnt/foo/bar", True), ("s3a://bucket1/folder1", True), ("/dbfs/mnt/foo/bar", True), - ("/tmp/foo", False), + ("/tmp/foo", True), ("table.we.know.nothing.about", False), ], ) def test_matches_dfsa_pattern(path, matches): """see https://github.com/databrickslabs/ucx/issues/2350""" - matched = any(pattern.matches(path) for pattern in DIRECT_FS_PATTERNS) + matched = any(pattern.matches(path) for pattern in DIRECT_FS_ACCESS_PATTERNS) assert matches == matched @@ -33,7 +37,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -63,7 +67,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_dfsa_usage_linter(code, expected): - linter = DirectFsPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) count = 0 for advice in advices: @@ -72,11 +76,6 @@ def test_dfsa_usage_linter(code, expected): assert count == expected -def test_dfsa_name(): - linter = DirectFsPyLinter(CurrentSessionState()) - assert linter.name() == "dfsa-usage" - - @pytest.mark.parametrize( "query", [ @@ -87,7 +86,7 @@ def test_dfsa_name(): ], ) def test_non_dfsa_triggers_nothing(query): - ftf = DirectFsSqlLinter() + ftf = DirectFsAccessSqlLinter() assert not list(ftf.lint(query)) @@ -98,10 +97,7 @@ def test_non_dfsa_triggers_nothing(query): ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), - ( - "MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", - "/dbfs/...", - ), + ("MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", "/dbfs/..."), ("SELECT * FROM json.`s3a://abc/d/e/f`", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://foo/bar`", "s3a://foo/bar"), @@ -109,7 +105,7 @@ def test_non_dfsa_triggers_nothing(query): ], ) def 
test_dfsa_tables_trigger_messages_param(query: str, table: str): - ftf = DirectFsSqlLinter() + ftf = DirectFsAccessSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Deprecation( @@ -130,7 +126,7 @@ def test_dfsa_tables_trigger_messages_param(query: str, table: str): ], ) def test_dfsa_queries_failure(query: str): - ftf = DirectFsSqlLinter() + ftf = DirectFsAccessSqlLinter() actual = list(ftf.lint(query)) assert actual == [ Failure( @@ -142,8 +138,3 @@ def test_dfsa_queries_failure(query: str): end_col=1024, ), ] - - -def test_dfsa_queries_name(): - ftf = DirectFsSqlLinter() - assert ftf.name() == 'dfsa-query' diff --git a/tests/unit/source_code/linters/test_pyspark.py b/tests/unit/source_code/linters/test_pyspark.py index 639763d549..bd50b9dc0f 100644 --- a/tests/unit/source_code/linters/test_pyspark.py +++ b/tests/unit/source_code/linters/test_pyspark.py @@ -559,12 +559,23 @@ def test_spark_cloud_direct_access(empty_index, code, expected): @pytest.mark.parametrize("fs_function", FS_FUNCTIONS) -def test_direct_cloud_access_to_tmp_reports_nothing(empty_index, fs_function): +def test_direct_cloud_access_to_workspace_reports_nothing(empty_index, fs_function): session_state = CurrentSessionState() ftf = FromTableSqlLinter(empty_index, session_state) sqf = SparkSqlPyLinter(ftf, empty_index, session_state) # ls function calls have to be from dbutils.fs, or we ignore them - code = f"""spark.{fs_function}("/tmp/bucket/path")""" + code = f"""spark.{fs_function}("/Workspace/bucket/path")""" + advisories = list(sqf.lint(code)) + assert not advisories + + +@pytest.mark.parametrize("fs_function", FS_FUNCTIONS) +def test_direct_cloud_access_to_volumes_reports_nothing(empty_index, fs_function): + session_state = CurrentSessionState() + ftf = FromTableSqlLinter(empty_index, session_state) + sqf = SparkSqlPyLinter(ftf, empty_index, session_state) + # ls function calls have to be from dbutils.fs, or we ignore them + code = f"""spark.{fs_function}("/Volumes/bucket/path")""" advisories = list(sqf.lint(code)) assert not advisories From 4c48951052069ca9b4718d4f7ec367ec44a97d0a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:43:06 +0200 Subject: [PATCH 36/80] Update docs --- README.md | 52 +++++++++++++++++----------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index bb1292dc49..7045365640 100644 --- a/README.md +++ b/README.md @@ -64,12 +64,10 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`cannot-autofix-table-reference`](#cannot-autofix-table-reference) * [`catalog-api-in-shared-clusters`](#catalog-api-in-shared-clusters) * [`changed-result-format-in-uc`](#changed-result-format-in-uc) - * [`dbfs-read-from-sql-query`](#dbfs-read-from-sql-query) - * [`dbfs-usage`](#dbfs-usage) + * [`direct-filesystem-access`](#direct-filesystem-access) + * [`direct-filesystem-access-in-sql-query`](#direct-filesystem-access-in-sql-query) * [`default-format-changed-in-dbr8`](#default-format-changed-in-dbr8) * [`dependency-not-found`](#dependency-not-found) - * [`direct-filesystem-access`](#direct-filesystem-access) - * [`implicit-dbfs-usage`](#implicit-dbfs-usage) * [`jvm-access-in-shared-clusters`](#jvm-access-in-shared-clusters) * [`legacy-context-in-shared-clusters`](#legacy-context-in-shared-clusters) * [`not-supported`](#not-supported) @@ -766,24 +764,32 @@ you need to make sure that `do_stuff_with_table` can handle the new format. 
[[back to top](#databricks-labs-ucx)]
 
-#### `dbfs-read-from-sql-query`
+#### `direct-filesystem-access-in-sql-query`
 
-DBFS access is not allowed in Unity Catalog, so if you have code like this:
+Direct filesystem access is deprecated in Unity Catalog.
+DBFS is no longer supported, so if you have code like this:
 
 ```python
-df = spark.sql("SELECT * FROM parquet.`/mnt/foo/path/to/file`")
+df = spark.sql("SELECT * FROM parquet.`/mnt/foo/path/to/parquet.file`")
 ```
 
 you need to change it to use UC tables.
 
 [[back to top](#databricks-labs-ucx)]
 
-#### `dbfs-usage`
+#### `direct-filesystem-access`
 
-DBFS does not work in Unity Catalog, so if you have code like this:
+Direct filesystem access is deprecated in Unity Catalog.
+DBFS is no longer supported, so if you have code like this:
 
 ```python
-display(spark.read.csv('/mnt/things/e/f/g'))
+display(spark.read.csv('/mnt/things/data.csv'))
+```
+
+or this:
+
+```python
+display(spark.read.csv('s3:/bucket/folder/data.csv'))
 ```
 
 You need to change it to use UC tables or UC volumes.
@@ -798,31 +804,7 @@ means an error in the user code.
 
 [[back to top](#databricks-labs-ucx)]
 
-#### `direct-filesystem-access`
-
-It's not allowed to access the filesystem directly in Unity Catalog, so if you have code like this:
-
-```python
-spark.read.csv("s3://bucket/path")
-```
-
-you need to change it to use UC tables or UC volumes.
-
-[[back to top](#databricks-labs-ucx)]
-
-#### `implicit-dbfs-usage`
-
-The use of DBFS is not allowed in Unity Catalog, so if you have code like this:
-
-```python
-display(spark.read.csv('/mnt/things/e/f/g'))
-```
-
-you need to change it to use UC tables or UC volumes.
-
-[[back to top](#databricks-labs-ucx)]
-
-#### `jvm-access-in-shared-clusters`
+### `jvm-access-in-shared-clusters`
 
 You cannot access Spark Driver JVM on Unity Catalog clusters in Shared Access mode.
If you have code like this: From 6e07c9bb2290a324a0a4bec2b4013d549512cc80 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:51:34 +0200 Subject: [PATCH 37/80] move to functional test --- tests/unit/source_code/linters/test_directfs.py | 13 ++----------- .../samples/functional/file-access/direct-fs3.py | 8 ++++++++ 2 files changed, 10 insertions(+), 11 deletions(-) create mode 100644 tests/unit/source_code/samples/functional/file-access/direct-fs3.py diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index e14b87b242..931973a60b 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -55,18 +55,9 @@ def test_detects_dfsa_paths(code, expected): ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), - ( - """ -DBFS1="dbfs:/mnt/foo/bar1" -systems=[DBFS1, "dbfs:/mnt/foo/bar2"] -for system in systems: - spark.read.parquet(system) -""", - 2, - ), ], ) -def test_dfsa_usage_linter(code, expected): +def test_directfs_linter(code, expected): linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) advices = linter.lint(code) count = 0 @@ -97,7 +88,7 @@ def test_non_dfsa_triggers_nothing(query): ("SELECT * FROM delta.`/mnt/...` WHERE foo > 6", "/mnt/..."), ("SELECT * FROM json.`/a/b/c` WHERE foo > 6", "/a/b/c"), ("DELETE FROM json.`/...` WHERE foo = 'bar'", "/..."), - ("MERGE INTO delta.`/dbfs/...` t USING source ON t.key = source.key WHEN MATCHED THEN DELETE", "/dbfs/..."), + ("MERGE INTO delta.`/dbfs/...` t USING src ON t.key = src.key WHEN MATCHED THEN DELETE", "/dbfs/..."), ("SELECT * FROM json.`s3a://abc/d/e/f`", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://abc/d/e/f` WHERE foo > 6", "s3a://abc/d/e/f"), ("SELECT * FROM delta.`s3a://foo/bar`", "s3a://foo/bar"), diff --git a/tests/unit/source_code/samples/functional/file-access/direct-fs3.py b/tests/unit/source_code/samples/functional/file-access/direct-fs3.py new file mode 100644 index 0000000000..0db9d9a2f1 --- /dev/null +++ b/tests/unit/source_code/samples/functional/file-access/direct-fs3.py @@ -0,0 +1,8 @@ +# ucx[direct-filesystem-access:+1:6:+1:26] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar1 +DBFS1="dbfs:/mnt/foo/bar1" +# ucx[direct-filesystem-access:+1:16:+1:36] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar2 +systems=[DBFS1, "dbfs:/mnt/foo/bar2"] +for system in systems: + # ucx[direct-filesystem-access:+2:4:+2:30] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar1 + # ucx[direct-filesystem-access:+1:4:+1:30] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar2 + spark.read.parquet(system) From cd3b115534e533405c265cb04b3a3b64f7450497 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 11:56:12 +0200 Subject: [PATCH 38/80] update docs --- CONTRIBUTING.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c165715eba..48b00fc086 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -286,12 +286,10 @@ $ python tests/integration/source_code/message_codes.py cannot-autofix-table-reference catalog-api-in-shared-clusters changed-result-format-in-uc -dbfs-read-from-sql-query -dbfs-usage +direct-filesystem-access +direct-filesystem-access-in-sql-query default-format-changed-in-dbr8 dependency-not-found 
-direct-filesystem-access -implicit-dbfs-usage jvm-access-in-shared-clusters legacy-context-in-shared-clusters not-supported From 8fea3eb0bf565ba75e87c9fba2493f93d98368d6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 14:11:27 +0200 Subject: [PATCH 39/80] improve naming and comments --- .../labs/ucx/source_code/linters/directfs.py | 21 ++++++++++--------- .../unit/source_code/linters/test_directfs.py | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 1ba4440b73..2af6af8840 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -58,8 +58,6 @@ def _matches_allowed_root(self, value: str): @dataclass class DirectFsAccess: - """A DFSA is a record describing a Direct File System Access""" - path: str @@ -75,11 +73,11 @@ class _DetectDirectFsAccessVisitor(TreeVisitor): against a list of known deprecated paths. """ - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates: bool) -> None: + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates: bool) -> None: self._session_state = session_state self._directfs_nodes: list[DirectFsAccessNode] = [] self._reported_locations: set[tuple[int, int]] = set() - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates def visit_call(self, node: Call): for arg in node.args: @@ -104,11 +102,14 @@ def _check_str_constant(self, source_node, inferred: InferredValue): if self._already_reported(source_node, inferred): return # avoid duplicate advices that are reported by SparkSqlPyLinter - if Tree(source_node).is_from_module("spark") and not self._allow_spark_duplicates: + if self._prevent_spark_duplicates and Tree(source_node).is_from_module("spark"): return value = inferred.as_string() - if any(pattern.matches(value) for pattern in DIRECT_FS_ACCESS_PATTERNS): - self._directfs_nodes.append(DirectFsAccessNode(DirectFsAccess(value), source_node)) + for pattern in DIRECT_FS_ACCESS_PATTERNS: + if not pattern.matches(value): + continue + dfsa = DirectFsAccess(path=value) + self.directfs_nodes.append(DirectFsAccessNode(dfsa, source_node)) self._reported_locations.add((source_node.lineno, source_node.col_offset)) def _already_reported(self, source_node: NodeNG, inferred: InferredValue): @@ -122,15 +123,15 @@ def directfs_nodes(self): class DirectFsAccessPyLinter(PythonLinter): - def __init__(self, session_state: CurrentSessionState, allow_spark_duplicates=False): + def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates=True): self._session_state = session_state - self._allow_spark_duplicates = allow_spark_duplicates + self._prevent_spark_duplicates = prevent_spark_duplicates def lint_tree(self, tree: Tree) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - visitor = _DetectDirectFsAccessVisitor(self._session_state, self._allow_spark_duplicates) + visitor = _DetectDirectFsAccessVisitor(self._session_state, self._prevent_spark_duplicates) visitor.visit(tree.node) for directfs_node in visitor.directfs_nodes: advisory = Deprecation.from_node( diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index 931973a60b..70b933ea00 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ 
b/tests/unit/source_code/linters/test_directfs.py @@ -37,7 +37,7 @@ def test_matches_dfsa_pattern(path, matches): ], ) def test_detects_dfsa_paths(code, expected): - linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = list(linter.lint(code)) for advice in advices: assert isinstance(advice, Advice) @@ -58,7 +58,7 @@ def test_detects_dfsa_paths(code, expected): ], ) def test_directfs_linter(code, expected): - linter = DirectFsAccessPyLinter(CurrentSessionState(), allow_spark_duplicates=True) + linter = DirectFsAccessPyLinter(CurrentSessionState(), prevent_spark_duplicates=False) advices = linter.lint(code) count = 0 for advice in advices: From 64de1e0eaa535ce7872c76285218595bf624abcc Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 5 Sep 2024 14:56:07 +0200 Subject: [PATCH 40/80] fix failing test --- tests/integration/source_code/test_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index e82384ea09..2c989753ef 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -154,7 +154,7 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = simple_ctx.dfsa_crawler.snapshot() + dfsas = simple_ctx.directfs_access_crawler.snapshot() assert len(list(dfsas)) == 2 From 6ce8f862381ae320bf4fcdc8d0f3f5029ad77252 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 10:29:25 +0200 Subject: [PATCH 41/80] Merge branch 'main' into store-dfsa-records # Conflicts: # README.md # src/databricks/labs/ucx/source_code/base.py # src/databricks/labs/ucx/source_code/jobs.py # src/databricks/labs/ucx/source_code/linters/directfs.py # src/databricks/labs/ucx/source_code/linters/pyspark.py # tests/unit/source_code/linters/test_directfs.py --- src/databricks/labs/ucx/source_code/linters/directfs.py | 5 ----- src/databricks/labs/ucx/source_code/linters/pyspark.py | 3 +-- tests/unit/source_code/linters/test_directfs.py | 1 - 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index a553d69e5b..1043b8b6fc 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -57,11 +57,6 @@ def _matches_allowed_root(self, value: str): ] -@dataclass -class DirectFsAccess: - path: str - - @dataclass class DirectFsAccessNode: dfsa: DirectFsAccess diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 8c30341afb..fd94ed513a 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,9 +13,8 @@ Fixer, CurrentSessionState, PythonLinter, - DirectFsAccess, ) -from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessNode, DIRECT_FS_ACCESS_PATTERNS +from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS from databricks.labs.ucx.source_code.python.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTableSqlLinter from 
databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper diff --git a/tests/unit/source_code/linters/test_directfs.py b/tests/unit/source_code/linters/test_directfs.py index 083592701e..70b933ea00 100644 --- a/tests/unit/source_code/linters/test_directfs.py +++ b/tests/unit/source_code/linters/test_directfs.py @@ -3,7 +3,6 @@ from databricks.labs.ucx.source_code.base import Deprecation, Advice, CurrentSessionState, Failure from databricks.labs.ucx.source_code.linters.directfs import ( DIRECT_FS_ACCESS_PATTERNS, - DirectFsAccessSqlLinter, DirectFsAccessPyLinter, DirectFsAccessSqlLinter, ) From 5b980df967483f95fe0c1443473c27378d65a79e Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 12:53:33 +0200 Subject: [PATCH 42/80] store DFSAs for paths and queries in dedicated tables --- .../labs/ucx/contexts/application.py | 8 ++++---- .../source_code/directfs_access_crawler.py | 19 ++++++++++++++++--- src/databricks/labs/ucx/source_code/jobs.py | 8 ++++---- tests/unit/source_code/conftest.py | 6 +++--- .../test_directfs_access_crawler.py | 4 ++-- tests/unit/source_code/test_jobs.py | 4 ++-- 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index aaf3beada0..373bbbc82d 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.errors import ResourceDoesNotExist @@ -426,13 +426,13 @@ def workflow_linter(self): self.dependency_resolver, self.path_lookup, MigrationIndex([]), # TODO: bring back self.tables_migrator.index() - self.directfs_access_crawler, + self.directfs_access_crawlers, self.config.include_job_ids, ) @cached_property - def directfs_access_crawler(self): - return DirectFsAccessCrawler(self.sql_backend, self.inventory_database) + def directfs_access_crawlers(self): + return DirectFsAccessCrawlers(self.sql_backend, self.inventory_database) @cached_property def redash(self): diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 63b99a2f75..8512e8da08 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -10,9 +10,9 @@ logger = logging.getLogger(__name__) -class DirectFsAccessCrawler(CrawlerBase): +class _DirectFsAccessCrawler(CrawlerBase): - def __init__(self, backend: SqlBackend, schema: str): + def __init__(self, backend: SqlBackend, schema: str, table: str): """ Initializes a DFSACrawler instance. @@ -20,7 +20,7 @@ def __init__(self, backend: SqlBackend, schema: str): backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. 
""" - super().__init__(backend, "hive_metastore", schema, "direct_file_system_access", DirectFsAccess) + super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) def append(self, dfsas: Sequence[DirectFsAccess]): try: @@ -31,3 +31,16 @@ def append(self, dfsas: Sequence[DirectFsAccess]): def snapshot(self) -> Iterable[DirectFsAccess]: sql = f"SELECT * FROM {self.full_name}" yield from self._backend.fetch(sql) + + +class DirectFsAccessCrawlers: + + def __init__(self, backend: SqlBackend, schema: str): + self._backend = backend + self._schema = schema + + def for_paths(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_paths") + + def for_queries(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_queries") diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 47633b7145..b4f080e662 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -28,7 +28,7 @@ guess_encoding, DirectFsAccess, ) -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import ( Dependency, DependencyGraph, @@ -330,14 +330,14 @@ def __init__( resolver: DependencyResolver, path_lookup: PathLookup, migration_index: MigrationIndex, - directfs_crawler: DirectFsAccessCrawler, + directfs_crawlers: DirectFsAccessCrawlers, include_job_ids: list[int] | None = None, ): self._ws = ws self._resolver = resolver self._path_lookup = path_lookup self._migration_index = migration_index - self._directfs_crawler = directfs_crawler + self._directfs_crawlers = directfs_crawlers self._include_job_ids = include_job_ids def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): @@ -426,7 +426,7 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> yield from walker collector = DfsaCollector(graph, set(), self._path_lookup, session_state) dfsas = list(dfsa for dfsa in collector) - self._directfs_crawler.append(dfsas) + self._directfs_crawlers.for_paths().append(dfsas) class LintingWalker(DependencyGraphWalker[LocatedAdvice]): diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index 46884c251c..a4942c24ce 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -6,7 +6,7 @@ MigrationStatus, ) from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import DependencyResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader @@ -61,5 +61,5 @@ def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolv @pytest.fixture -def mock_dfsa_crawler() -> DirectFsAccessCrawler: - return DirectFsAccessCrawler(MockBackend(), "schema") +def mock_dfsa_crawlers() -> DirectFsAccessCrawlers: + return DirectFsAccessCrawlers(MockBackend(), "schema") diff --git a/tests/unit/source_code/test_directfs_access_crawler.py 
b/tests/unit/source_code/test_directfs_access_crawler.py index 1cc6da6be6..1964c13238 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -1,12 +1,12 @@ from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.source_code.base import DirectFsAccess -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers def test_crawler_appends_dfsas(): backend = MockBackend() - crawler = DirectFsAccessCrawler(backend, "schema") + crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( DirectFsAccess(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) for path in ("a", "b", "c") diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index e21af86cfa..7dff7a206b 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -230,12 +230,12 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( def test_workflow_linter_lint_job_logs_problems( - dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler, caplog + dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers, caplog ): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawler) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) From cdcc3e1e4faeac428dfa9531a92e411fa7dcb383 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 14:40:03 +0200 Subject: [PATCH 43/80] support lineage when walking dependency graph --- src/databricks/labs/ucx/source_code/graph.py | 3 +++ tests/unit/source_code/test_graph.py | 28 ++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 4841fe904c..f2f5f9b9bc 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -593,6 +593,7 @@ def __init__(self, graph: DependencyGraph, walked_paths: set[Path], path_lookup: self._graph = graph self._walked_paths = walked_paths self._path_lookup = path_lookup + self._lineage: list[Dependency] = [] def __iter__(self) -> Iterator[T]: for dependency in self._graph.root_dependencies: @@ -604,6 +605,7 @@ def __iter__(self) -> Iterator[T]: def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: Path) -> Iterable[T]: if dependency.path in self._walked_paths: return + self._lineage.append(dependency) self._walked_paths.add(dependency.path) self._log_walk_one(dependency) if dependency.path.is_file() or is_a_notebook(dependency.path): @@ -616,6 +618,7 @@ def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: P child_graph = maybe_graph.graph for child_dependency in child_graph.local_dependencies: yield from self._iter_one(child_dependency, child_graph, root_path) + self._lineage.pop() def _log_walk_one(self, dependency: Dependency): logger.debug(f'Analyzing 
dependency: {dependency}') diff --git a/tests/unit/source_code/test_graph.py b/tests/unit/source_code/test_graph.py index 06bc24cdbb..a9f620b7bc 100644 --- a/tests/unit/source_code/test_graph.py +++ b/tests/unit/source_code/test_graph.py @@ -1,11 +1,15 @@ from pathlib import Path +from typing import Iterable import pytest from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver, FolderLoader -from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, InheritedContext +from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, InheritedContext, \ + DependencyGraphWalker, T from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader +from databricks.labs.ucx.source_code.path_lookup import PathLookup +from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import KnownList @@ -61,7 +65,6 @@ def build_inherited_context(self, root: Path, leaf: Path) -> InheritedContext: @pytest.fixture() def dependency_graph_factory(mock_path_lookup, simple_dependency_resolver): - def new_test_dependency_graph(dependency: Dependency) -> _TestDependencyGraph: return _TestDependencyGraph( dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState() ) @@ -181,3 +184,24 @@ def test_graph_builds_inherited_context(mock_path_lookup, simple_dependency_reso assert inference_context.tree is not None assert inference_context.tree.has_global("some_table_name") assert not inference_context.tree.has_global("other_table_name") + + +def test_graph_walker_captures_lineage(mock_path_lookup, simple_dependency_resolver): + grand_parent = mock_path_lookup.cwd / "functional/grand_parent_that_magic_runs_parent_that_magic_runs_child.py" + child = mock_path_lookup.cwd / "functional/_child_that_uses_value_from_parent.py" + root_dependency = Dependency(NotebookLoader(), grand_parent) + root_graph = DependencyGraph(root_dependency, None, simple_dependency_resolver, mock_path_lookup, + CurrentSessionState()) + container = root_dependency.load(mock_path_lookup) + container.build_dependency_graph(root_graph) + + class _TestWalker(DependencyGraphWalker): + def _process_dependency(self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None) -> Iterable[None]: + if dependency.path.as_posix().endswith(grand_parent.as_posix()): + assert len(self._lineage) == 1 + if dependency.path.as_posix().endswith(child.as_posix()): + assert len(self._lineage) == 3 # there's a parent between grand_parent and child + return [] + + walker = _TestWalker(root_graph, set(), mock_path_lookup) + _ = list(_ for _ in walker) From 1515342918fd2095a29250814e48044c73b89773 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 16:14:40 +0200 Subject: [PATCH 44/80] store dfsa lineage --- src/databricks/labs/ucx/source_code/base.py | 34 +++++++++++-------- src/databricks/labs/ucx/source_code/graph.py | 14 ++++++-- src/databricks/labs/ucx/source_code/jobs.py | 20 ++++------- .../labs/ucx/source_code/linters/directfs.py | 2 ++ tests/integration/source_code/test_jobs.py | 10 ++++-- .../test_directfs_access_crawler.py | 4 ++- tests/unit/source_code/test_graph.py | 22 ++++++++---- 7 files changed, 65 insertions(+), 41 deletions(-) diff --git
a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 19c1035d35..8ce91c24fc 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -16,7 +16,6 @@ from databricks.labs.blueprint.paths import WorkspacePath -from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.source_code.python.python_ast import Tree # Code mapping between LSP, PyLint, and our own diagnostics: @@ -340,26 +339,31 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool: @dataclass class DirectFsAccess: - """A DFSA is a record describing a Direct File System Access""" + """A record describing a Direct File System Access""" UNKNOWN = "unknown" source_type: str source_id: str + source_lineage: str path: str is_read: bool is_write: bool - @property - def key(self) -> str: - return f"{self.source_type}.{self.source_id}.{self.path}".lower() # TODO for now - - @property - def safe_sql_key(self) -> str: - return escape_sql_identifier(self.key) - - def __hash__(self) -> int: - return hash(self.key) - - def __eq__(self, other) -> bool: - return isinstance(other, DirectFsAccess) and self.key == other.key + def replace( + self, + source_type: str | None = None, + source_id: str | None = None, + source_lineage: str | None = None, + path: str | None = None, + is_read: bool | None = None, + is_write: bool | None = None, + ): + return DirectFsAccess( + source_type=source_type or self.source_type, + source_id=source_id or self.source_id, + source_lineage=source_lineage or self.source_lineage, + path=path or self.path, + is_read=is_read or self.is_read, + is_write=is_write or self.is_write, + ) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index f2f5f9b9bc..0f4804db67 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -289,12 +289,13 @@ class DependencyGraphContext: session_state: CurrentSessionState -class Dependency(abc.ABC): +class Dependency: - def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True): + def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True, lineage_str: str | None = None): self._loader = loader self._path = path self._inherits_context = inherits_context + self._lineage_str = lineage_str or '"' + self._path.as_posix() + '"' @property def path(self) -> Path: @@ -316,6 +317,10 @@ def load(self, path_lookup: PathLookup) -> SourceContainer | None: def __repr__(self): return f"Dependency<{self.path}>" + @property + def lineage_str(self): + return self._lineage_str + class SourceContainer(abc.ABC): @@ -627,3 +632,8 @@ def _log_walk_one(self, dependency: Dependency): def _process_dependency( self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None ) -> Iterable[T]: ... 
+ + @property + def lineage_str(self): + parts = [dependency.lineage_str for dependency in self._lineage] + return "->".join(parts) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b4f080e662..00e1e1cdf5 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -68,8 +68,10 @@ def as_message(self) -> str: class WorkflowTask(Dependency): def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): + # concat job and task for lineage, see DependencyGraphWalker.lineage_str + lineage_str = f'"job:{job.job_id}"->"task:{task.task_key}"' loader = WrappingLoader(WorkflowTaskContainer(ws, task, job)) - super().__init__(loader, Path(f'/jobs/{task.task_key}'), False) + super().__init__(loader, Path(f'/jobs/{task.task_key}'), inherits_context=False, lineage_str=lineage_str) self._task = task self._job = job @@ -424,7 +426,7 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker - collector = DfsaCollector(graph, set(), self._path_lookup, session_state) + collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) dfsas = list(dfsa for dfsa in collector) self._directfs_crawlers.for_paths().append(dfsas) @@ -458,7 +460,7 @@ def _process_dependency( yield LocatedAdvice(advice, dependency.path) -class DfsaCollector(DependencyGraphWalker[DirectFsAccess]): +class DfsaCollectorWalker(DependencyGraphWalker[DirectFsAccess]): def __init__( self, @@ -490,13 +492,7 @@ def _collect_from_notebook( notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield DirectFsAccess( - source_type="NOTEBOOK", - source_id=str(path), - path=dfsa.path, - is_read=dfsa.is_read, - is_write=dfsa.is_write, - ) + yield dfsa.replace(source_type="NOTEBOOK", source_id=str(path), source_lineage=self.lineage_str) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -515,9 +511,7 @@ def _collect_from_source( logger.warning(f"Language {language.name} not supported yet!") return for dfsa in iterable: - yield DirectFsAccess( - source_type="FILE", source_id=str(path), path=dfsa.path, is_read=dfsa.is_read, is_write=dfsa.is_write - ) + yield dfsa.replace(source_type="FILE", source_id=str(path), source_lineage=self.lineage_str) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 1043b8b6fc..51f409624c 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -109,6 +109,7 @@ def _check_str_constant(self, source_node, inferred: InferredValue): dfsa = DirectFsAccess( source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, + source_lineage=DirectFsAccess.UNKNOWN, path=value, is_read=True, is_write=False, @@ -206,6 +207,7 @@ def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterab yield DirectFsAccess( source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, + source_lineage=DirectFsAccess.UNKNOWN, 
path=path, is_read=is_read, is_write=is_write, diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 2c989753ef..b9470c4a59 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -20,7 +20,7 @@ from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.fixtures import get_purge_suffix, factory -from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.base import CurrentSessionState, DirectFsAccess from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.known import UNKNOWN, KnownList from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter, FileLoader, FolderLoader @@ -154,8 +154,12 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = simple_ctx.directfs_access_crawler.snapshot() - assert len(list(dfsas)) == 2 + dfsas = list(simple_ctx.directfs_access_crawler.snapshot()) + assert len(dfsas) == 2 + for dfsa in dfsas: + assert dfsa.source_type != DirectFsAccess.UNKNOWN + assert dfsa.source_id != DirectFsAccess.UNKNOWN + assert dfsa.source_lineage != DirectFsAccess.UNKNOWN def test_workflow_linter_lints_job_with_import_pypi_library( diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access_crawler.py index 1964c13238..2a9459c511 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -8,7 +8,9 @@ def test_crawler_appends_dfsas(): backend = MockBackend() crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( - DirectFsAccess(source_type="SOURCE", source_id="ID", path=path, is_read=False, is_write=False) + DirectFsAccess( + source_type="SOURCE", source_id="ID", source_lineage="LINEAGE", path=path, is_read=False, is_write=False + ) for path in ("a", "b", "c") ) crawler.append(dfsas) diff --git a/tests/unit/source_code/test_graph.py b/tests/unit/source_code/test_graph.py index a9f620b7bc..0dd78b4d8e 100644 --- a/tests/unit/source_code/test_graph.py +++ b/tests/unit/source_code/test_graph.py @@ -1,12 +1,17 @@ +from collections.abc import Iterable from pathlib import Path -from typing import Iterable import pytest from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver, FolderLoader -from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, InheritedContext, \ - DependencyGraphWalker, T +from databricks.labs.ucx.source_code.graph import ( + Dependency, + DependencyGraph, + DependencyResolver, + InheritedContext, + DependencyGraphWalker, +) from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.python.python_ast import Tree @@ -190,17 +195,20 @@ def test_graph_walker_captures_lineage(mock_path_lookup, simple_dependency_resol grand_parent = mock_path_lookup.cwd / "functional/grand_parent_that_magic_runs_parent_that_magic_runs_child.py" child = mock_path_lookup.cwd / 
"functional/_child_that_uses_value_from_parent.py" root_dependency = Dependency(NotebookLoader(), grand_parent) - root_graph = DependencyGraph(root_dependency, None, simple_dependency_resolver, mock_path_lookup, - CurrentSessionState()) + root_graph = DependencyGraph( + root_dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState() + ) container = root_dependency.load(mock_path_lookup) container.build_dependency_graph(root_graph) class _TestWalker(DependencyGraphWalker): - def _process_dependency(self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None) -> Iterable[None]: + def _process_dependency( + self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None + ) -> Iterable[None]: if dependency.path.as_posix().endswith(grand_parent.as_posix()): assert len(self._lineage) == 1 if dependency.path.as_posix().endswith(child.as_posix()): - assert len(self._lineage) == 3 # there's a parent between grand_parent and child + assert len(self._lineage) == 3 # there's a parent between grand_parent and child return [] walker = _TestWalker(root_graph, set(), mock_path_lookup) From 1c88c97304bcff06852a319468f9c0bae2af8a99 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 16:28:12 +0200 Subject: [PATCH 45/80] fix merge issues --- .../labs/ucx/source_code/directfs_access_crawler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 8512e8da08..09f442cd04 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -1,7 +1,7 @@ import logging from collections.abc import Sequence, Iterable -from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.errors import DatabricksError @@ -28,10 +28,13 @@ def append(self, dfsas: Sequence[DirectFsAccess]): except DatabricksError as e: logger.error("Failed to store DFSAs", exc_info=e) - def snapshot(self) -> Iterable[DirectFsAccess]: + def _try_fetch(self) -> Iterable[DirectFsAccess]: sql = f"SELECT * FROM {self.full_name}" yield from self._backend.fetch(sql) + def _crawl(self) -> Iterable[Result]: + return [] + class DirectFsAccessCrawlers: From ef261a6cd7101447d773e6eacf8b766493d198aa Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 16:49:03 +0200 Subject: [PATCH 46/80] capture and store source_timestamp --- src/databricks/labs/ucx/source_code/base.py | 3 +++ src/databricks/labs/ucx/source_code/jobs.py | 14 ++++++++++++-- .../labs/ucx/source_code/linters/directfs.py | 2 ++ tests/integration/source_code/test_jobs.py | 3 ++- .../source_code/test_directfs_access_crawler.py | 8 +++++++- 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 8ce91c24fc..b2b369d6fa 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -346,6 +346,7 @@ class DirectFsAccess: source_type: str source_id: str source_lineage: str + source_timestamp: int path: str is_read: bool is_write: bool @@ -355,6 +356,7 @@ def replace( source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, + source_timestamp: int 
| None = None, path: str | None = None, is_read: bool | None = None, is_write: bool | None = None, @@ -363,6 +365,7 @@ def replace( source_type=source_type or self.source_type, source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, + source_timestamp=source_timestamp or self.source_timestamp, path=path or self.path, is_read=is_read or self.is_read, is_write=is_write or self.is_write, diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 00e1e1cdf5..6d74f9380b 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -491,8 +491,13 @@ def _collect_from_notebook( ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: + src_timestamp = int(path.stat().st_mtime) + src_id = str(path) + src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace(source_type="NOTEBOOK", source_id=str(path), source_lineage=self.lineage_str) + yield dfsa.replace( + source_type="NOTEBOOK", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp + ) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -510,8 +515,13 @@ def _collect_from_source( if iterable is None: logger.warning(f"Language {language.name} not supported yet!") return + src_id = str(path) + src_lineage = self.lineage_str + src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: - yield dfsa.replace(source_type="FILE", source_id=str(path), source_lineage=self.lineage_str) + yield dfsa.replace( + source_type="FILE", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp + ) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 51f409624c..cf8a347a19 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -110,6 +110,7 @@ def _check_str_constant(self, source_node, inferred: InferredValue): source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, source_lineage=DirectFsAccess.UNKNOWN, + source_timestamp=-1, path=value, is_read=True, is_write=False, @@ -208,6 +209,7 @@ def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterab source_type=DirectFsAccess.UNKNOWN, source_id=DirectFsAccess.UNKNOWN, source_lineage=DirectFsAccess.UNKNOWN, + source_timestamp=-1, path=path, is_read=is_read, is_write=is_write, diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index b9470c4a59..248e452acc 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -154,12 +154,13 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = list(simple_ctx.directfs_access_crawler.snapshot()) + dfsas = list(simple_ctx.directfs_access_crawlers.for_paths().snapshot()) assert len(dfsas) == 2 for dfsa in dfsas: assert dfsa.source_type != 
DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN + assert dfsa.source_timestamp != -1 def test_workflow_linter_lints_job_with_import_pypi_library( diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access_crawler.py index 2a9459c511..e34efad717 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -9,7 +9,13 @@ def test_crawler_appends_dfsas(): crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( DirectFsAccess( - source_type="SOURCE", source_id="ID", source_lineage="LINEAGE", path=path, is_read=False, is_write=False + source_type="SOURCE", + source_id="ID", + source_lineage="LINEAGE", + source_timestamp=7452, + path=path, + is_read=False, + is_write=False, ) for path in ("a", "b", "c") ) From d951286621f92c0de9a447da46539534ba38e8bc Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:03:15 +0200 Subject: [PATCH 47/80] simplify --- src/databricks/labs/ucx/source_code/base.py | 20 +++++++++---------- .../labs/ucx/source_code/linters/directfs.py | 8 -------- tests/integration/source_code/test_jobs.py | 3 +++ 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index b2b369d6fa..ed5a0d11c1 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -343,30 +343,30 @@ class DirectFsAccess: UNKNOWN = "unknown" - source_type: str - source_id: str - source_lineage: str - source_timestamp: int path: str is_read: bool is_write: bool + source_type: str = UNKNOWN + source_id: str = UNKNOWN + source_lineage: str = UNKNOWN + source_timestamp: int = -1 def replace( self, + path: str | None = None, + is_read: bool | None = None, + is_write: bool | None = None, source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, - path: str | None = None, - is_read: bool | None = None, - is_write: bool | None = None, ): return DirectFsAccess( + path=path or self.path, + is_read=is_read or self.is_read, + is_write=is_write or self.is_write, source_type=source_type or self.source_type, source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, - path=path or self.path, - is_read=is_read or self.is_read, - is_write=is_write or self.is_write, ) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index cf8a347a19..4a8c01fa67 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -107,10 +107,6 @@ def _check_str_constant(self, source_node, inferred: InferredValue): # since we're normally filtering out spark calls, we're dealing with dfsas we know little about # notable we don't know is_read or is_write dfsa = DirectFsAccess( - source_type=DirectFsAccess.UNKNOWN, - source_id=DirectFsAccess.UNKNOWN, - source_lineage=DirectFsAccess.UNKNOWN, - source_timestamp=-1, path=value, is_read=True, is_write=False, @@ -206,10 +202,6 @@ def _collect_dfsa_from_node(cls, expression: SqlExpression, path: str) -> Iterab is_read = cls._is_read(expression) is_write = cls._is_write(expression) yield DirectFsAccess( - 
source_type=DirectFsAccess.UNKNOWN, - source_id=DirectFsAccess.UNKNOWN, - source_lineage=DirectFsAccess.UNKNOWN, - source_timestamp=-1, path=path, is_read=is_read, is_write=is_write, diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 248e452acc..21b9e6ead9 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -156,7 +156,10 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, dfsas = list(simple_ctx.directfs_access_crawlers.for_paths().snapshot()) assert len(dfsas) == 2 + task_keys = set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: + assert dfsa.job_id == j.job_id + assert dfsa.task_key in task_keys assert dfsa.source_type != DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN From ea74ba68b7d50dcaba8123ac384679602f2126a7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:18:20 +0200 Subject: [PATCH 48/80] capture and store job/task infos --- src/databricks/labs/ucx/source_code/base.py | 10 ++++++++++ src/databricks/labs/ucx/source_code/jobs.py | 6 +++++- tests/integration/source_code/test_jobs.py | 5 +++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index ed5a0d11c1..ac29971ce2 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -350,7 +350,11 @@ class DirectFsAccess: source_id: str = UNKNOWN source_lineage: str = UNKNOWN source_timestamp: int = -1 + job_id: int = -1 + job_name: str = UNKNOWN + task_key: str = UNKNOWN + # pylint: disable=too-many-arguments def replace( self, path: str | None = None, @@ -360,6 +364,9 @@ def replace( source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, + job_id: int | None = None, + job_name: str | None = None, + task_key: str | None = None, ): return DirectFsAccess( path=path or self.path, @@ -369,4 +376,7 @@ def replace( source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, + job_id=job_id or self.job_id, + job_name=job_name or self.job_name, + task_key=task_key or self.task_key, ) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 6d74f9380b..36f5954dcc 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -427,7 +427,11 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> ) yield from walker collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) - dfsas = list(dfsa for dfsa in collector) + dfsas: list[DirectFsAccess] = [] + job_name = "" if job.settings is None else job.settings.name + for dfsa in collector: + dfsa = dfsa.replace(job_id=job.job_id, job_name=job_name, task_key=task.task_key) + dfsas.append(dfsa) self._directfs_crawlers.for_paths().append(dfsas) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 21b9e6ead9..c99328a7e2 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -158,12 +158,13 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, assert len(dfsas) == 2 task_keys = 
set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: - assert dfsa.job_id == j.job_id - assert dfsa.task_key in task_keys assert dfsa.source_type != DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN assert dfsa.source_timestamp != -1 + assert dfsa.job_id == j.job_id + assert dfsa.job_name == j.job_name + assert dfsa.task_key in task_keys def test_workflow_linter_lints_job_with_import_pypi_library( From e75e07d9d85c58227daa616fe7b7f0fa959b7635 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:23:37 +0200 Subject: [PATCH 49/80] simplify --- src/databricks/labs/ucx/source_code/base.py | 34 ++++++++++++++------- src/databricks/labs/ucx/source_code/jobs.py | 6 ++-- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index ac29971ce2..56ec62ce66 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -354,28 +354,40 @@ class DirectFsAccess: job_name: str = UNKNOWN task_key: str = UNKNOWN - # pylint: disable=too-many-arguments - def replace( + def replace_source( self, - path: str | None = None, - is_read: bool | None = None, - is_write: bool | None = None, source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, - job_id: int | None = None, - job_name: str | None = None, - task_key: str | None = None, ): return DirectFsAccess( - path=path or self.path, - is_read=is_read or self.is_read, - is_write=is_write or self.is_write, + path=self.path, + is_read=self.is_read, + is_write=self.is_write, source_type=source_type or self.source_type, source_id=source_id or self.source_id, source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + ) + + def replace_job_infos( + self, + job_id: int | None = None, + job_name: str | None = None, + task_key: str | None = None, + ): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_type=self.source_type, + source_id=self.source_id, + source_lineage=self.source_lineage, + source_timestamp=self.source_timestamp, job_id=job_id or self.job_id, job_name=job_name or self.job_name, task_key=task_key or self.task_key, diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 36f5954dcc..f598ad81ac 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -430,7 +430,7 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> dfsas: list[DirectFsAccess] = [] job_name = "" if job.settings is None else job.settings.name for dfsa in collector: - dfsa = dfsa.replace(job_id=job.job_id, job_name=job_name, task_key=task.task_key) + dfsa = dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) dfsas.append(dfsa) self._directfs_crawlers.for_paths().append(dfsas) @@ -499,7 +499,7 @@ def _collect_from_notebook( src_id = str(path) src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace( + yield dfsa.replace_source( source_type="NOTEBOOK", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp ) if 
cell.language is CellLanguage.PYTHON: @@ -523,7 +523,7 @@ def _collect_from_source( src_lineage = self.lineage_str src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: - yield dfsa.replace( + yield dfsa.replace_source( source_type="FILE", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp ) From e62fd185dbc509f0b039602934aa3f4b11345437 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 17:50:21 +0200 Subject: [PATCH 50/80] capture and store assessment start/stop, also drop source_type --- src/databricks/labs/ucx/source_code/base.py | 31 ++++++++++++++----- src/databricks/labs/ucx/source_code/jobs.py | 15 ++++----- tests/integration/source_code/test_jobs.py | 2 ++ .../test_directfs_access_crawler.py | 12 ++++--- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 56ec62ce66..c37937e59f 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -346,17 +346,17 @@ class DirectFsAccess: path: str is_read: bool is_write: bool - source_type: str = UNKNOWN source_id: str = UNKNOWN - source_lineage: str = UNKNOWN source_timestamp: int = -1 + source_lineage: str = UNKNOWN job_id: int = -1 job_name: str = UNKNOWN task_key: str = UNKNOWN + assessment_start_timestamp: int = -1 + assessment_end_timestamp: int = -1 def replace_source( self, - source_type: str | None = None, source_id: str | None = None, source_lineage: str | None = None, source_timestamp: int | None = None, @@ -365,13 +365,14 @@ def replace_source( path=self.path, is_read=self.is_read, is_write=self.is_write, - source_type=source_type or self.source_type, source_id=source_id or self.source_id, - source_lineage=source_lineage or self.source_lineage, source_timestamp=source_timestamp or self.source_timestamp, + source_lineage=source_lineage or self.source_lineage, job_id=self.job_id, job_name=self.job_name, task_key=self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, ) def replace_job_infos( @@ -384,11 +385,27 @@ def replace_job_infos( path=self.path, is_read=self.is_read, is_write=self.is_write, - source_type=self.source_type, source_id=self.source_id, - source_lineage=self.source_lineage, source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, job_id=job_id or self.job_id, job_name=job_name or self.job_name, task_key=task_key or self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, + ) + + def replace_assessment_infos(self, assessment_start: int | None = None, assessment_end: int | None = None): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=self.source_id, + source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + assessment_start_timestamp=assessment_start or self.assessment_start_timestamp, + assessment_end_timestamp=assessment_end or self.assessment_start_timestamp, ) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index f598ad81ac..1840d478a2 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -3,6 +3,7 @@ import logging import shutil import tempfile +import time from 
collections.abc import Generator, Iterable from contextlib import contextmanager from dataclasses import dataclass @@ -426,11 +427,15 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker + assessment_start = int(time.mktime(time.gmtime())) collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) + assessment_end = int(time.mktime(time.gmtime())) dfsas: list[DirectFsAccess] = [] - job_name = "" if job.settings is None else job.settings.name + assert job.settings is not None # as already done in _lint_job + job_name = job.settings.name for dfsa in collector: dfsa = dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) + dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) dfsas.append(dfsa) self._directfs_crawlers.for_paths().append(dfsas) @@ -499,9 +504,7 @@ def _collect_from_notebook( src_id = str(path) src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace_source( - source_type="NOTEBOOK", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp - ) + yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -523,9 +526,7 @@ def _collect_from_source( src_lineage = self.lineage_str src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: - yield dfsa.replace_source( - source_type="FILE", source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp - ) + yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index c99328a7e2..e1073110b7 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -165,6 +165,8 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, assert dfsa.job_id == j.job_id assert dfsa.job_name == j.job_name assert dfsa.task_key in task_keys + assert dfsa.assessment_start_timestamp != -1 + assert dfsa.assessment_end_timestamp != -1 def test_workflow_linter_lints_job_with_import_pypi_library( diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access_crawler.py index e34efad717..cd38435e66 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access_crawler.py @@ -9,13 +9,17 @@ def test_crawler_appends_dfsas(): crawler = DirectFsAccessCrawlers(backend, "schema").for_paths() dfsas = list( DirectFsAccess( - source_type="SOURCE", - source_id="ID", - source_lineage="LINEAGE", - source_timestamp=7452, path=path, is_read=False, is_write=False, + source_id="ID", + source_timestamp=7452, + source_lineage="LINEAGE", + job_id=222, + job_name="JOB", + task_key="TASK", + assessment_start_timestamp=123, + assessment_end_timestamp=234, ) for path in ("a", "b", "c") ) From b58f47dd2c9c84cf25de580fa06e65386d058689 Mon Sep 17 00:00:00 2001 From: 
Eric Vergnaud Date: Fri, 6 Sep 2024 18:03:30 +0200 Subject: [PATCH 51/80] drop mock_dfsa_crawlers --- tests/unit/source_code/conftest.py | 5 ----- tests/unit/source_code/test_jobs.py | 6 ++++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index a4942c24ce..a5be3047ee 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -58,8 +58,3 @@ def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolv notebook_resolver = NotebookResolver(NotebookLoader()) import_resolver = ImportFileResolver(FileLoader(), allow_list) return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) - - -@pytest.fixture -def mock_dfsa_crawlers() -> DirectFsAccessCrawlers: - return DirectFsAccessCrawlers(MockBackend(), "schema") diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 7dff7a206b..8b9714a326 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -10,6 +10,7 @@ from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.sdk import WorkspaceClient @@ -230,12 +231,13 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( def test_workflow_linter_lint_job_logs_problems( - dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers, caplog + dependency_resolver, mock_path_lookup, empty_index, caplog ): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, mock_dfsa_crawlers) + crawlers = create_autospec(DirectFsAccessCrawlers) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, crawlers) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) From 5a5d4ffc8c7334618c2b07e02359c2a14bcb4367 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 18:04:15 +0200 Subject: [PATCH 52/80] rename _backend -> _sql_backend --- .../labs/ucx/source_code/directfs_access_crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 09f442cd04..59423d453f 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -38,12 +38,12 @@ def _crawl(self) -> Iterable[Result]: class DirectFsAccessCrawlers: - def __init__(self, backend: SqlBackend, schema: str): - self._backend = backend + def __init__(self, sql_backend: SqlBackend, schema: str): + self._sql_backend = sql_backend self._schema = schema def for_paths(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_paths") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, 
"direct_file_system_access_in_paths") def for_queries(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._backend, self._schema, "direct_file_system_access_in_queries") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") From 404f6cdd6a55ecb1cc0112039004af295c716334 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 18:05:20 +0200 Subject: [PATCH 53/80] rename _backend -> _sql_backend --- src/databricks/labs/ucx/source_code/directfs_access_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py index 59423d453f..a05c2079db 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py @@ -17,7 +17,7 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): Initializes a DFSACrawler instance. Args: - backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) + sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. """ super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) From cb6e45d409a29efaf249caedf2638a59991b1bf5 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 6 Sep 2024 18:14:38 +0200 Subject: [PATCH 54/80] hdfs is irrelevant, replace with dbfs --- .../samples/functional/file-access/select_format.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/samples/functional/file-access/select_format.sql b/tests/unit/source_code/samples/functional/file-access/select_format.sql index d64358a23d..d2a16bbe4c 100644 --- a/tests/unit/source_code/samples/functional/file-access/select_format.sql +++ b/tests/unit/source_code/samples/functional/file-access/select_format.sql @@ -1,3 +1,3 @@ -- Databricks notebook source --- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: hdfs://examples/src/main/resources/users.parquet -SELECT * FROM parquet.`hdfs://examples/src/main/resources/users.parquet` +-- ucx[direct-filesystem-access-in-sql-query:+0:0:+0:1024] The use of direct filesystem references is deprecated: dbfs://examples/src/main/resources/users.parquet +SELECT * FROM parquet.`dbfs://examples/src/main/resources/users.parquet` From 733deccbdab5e35ed165ed1f5345d41dacd73c30 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 10:45:32 +0200 Subject: [PATCH 55/80] drop mock of DirectFsAccessCrawlers --- tests/unit/source_code/conftest.py | 2 -- tests/unit/source_code/test_jobs.py | 10 ++++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index a5be3047ee..4b86de4f13 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -1,12 +1,10 @@ import pytest -from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.hive_metastore.migration_status import ( MigrationStatus, ) from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import DependencyResolver from databricks.labs.ucx.source_code.known import KnownList from 
databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 8b9714a326..ba8b9287a8 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -230,14 +230,13 @@ def test_workflow_task_container_builds_dependency_graph_spark_python_task( assert registered_notebooks == [expected_path_instance] -def test_workflow_linter_lint_job_logs_problems( - dependency_resolver, mock_path_lookup, empty_index, caplog -): +def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_lookup, empty_index, caplog): expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - crawlers = create_autospec(DirectFsAccessCrawlers) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, crawlers) + # pylint: disable=mock-no-usage + dfsas = create_autospec(DirectFsAccessCrawlers) + linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, dfsas) libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] task = jobs.Task(task_key="test-task", libraries=libraries) @@ -245,7 +244,6 @@ def test_workflow_linter_lint_job_logs_problems( job = jobs.Job(job_id=1234, settings=settings) ws.jobs.get.return_value = job - with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): linter.lint_job(1234) From 6338af6680f64e0b6bd2f85e93e5f8890b003ba3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 10:46:05 +0200 Subject: [PATCH 56/80] gather and store dfsas from refresh_report --- src/databricks/labs/ucx/source_code/jobs.py | 72 ++++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 1840d478a2..d59dc394f9 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -1,5 +1,4 @@ import functools -import itertools import logging import shutil import tempfile @@ -353,42 +352,51 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): continue tasks.append(functools.partial(self.lint_job, job.job_id)) logger.info(f"Running {tasks} linting tasks in parallel...") - job_problems, errors = Threads.gather('linting workflows', tasks) - job_problems_flattened = list(itertools.chain(*job_problems)) - logger.info(f"Saving {len(job_problems_flattened)} linting problems...") + job_results, errors = Threads.gather('linting workflows', tasks) + job_problems: list[JobProblem] = [] + job_dfsas: list[DirectFsAccess] = [] + for problems, dfsas in job_results: + job_problems.extend(problems) + job_dfsas.extend(dfsas) + logger.info(f"Saving {len(job_problems)} linting problems...") sql_backend.save_table( f'{inventory_database}.workflow_problems', - job_problems_flattened, + job_problems, JobProblem, mode='overwrite', ) + self._directfs_crawlers.for_paths().append(job_dfsas) if len(errors) > 0: raise ManyError(errors) - def lint_job(self, job_id: int) -> list[JobProblem]: + def lint_job(self, job_id: int) -> tuple[list[JobProblem], list[DirectFsAccess]]: try: job = self._ws.jobs.get(job_id) except NotFound: logger.warning(f'Could not find job: {job_id}') - return [] + return ([], []) - problems = self._lint_job(job) + problems, dfsas = self._lint_job(job) if 
len(problems) > 0: problem_messages = "\n".join([problem.as_message() for problem in problems]) logger.warning(f"Found job problems:\n{problem_messages}") - return problems + return problems, dfsas _UNKNOWN = Path('') - def _lint_job(self, job: jobs.Job) -> list[JobProblem]: + def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAccess]]: problems: list[JobProblem] = [] + dfsas: list[DirectFsAccess] = [] assert job.job_id is not None assert job.settings is not None assert job.settings.name is not None assert job.settings.tasks is not None linted_paths: set[Path] = set() for task in job.settings.tasks: - for advice in self._lint_task(task, job, linted_paths): + graph, advices, session_state = self._build_task_dependency_graph(task, job) + if not advices: + advices = self._lint_task(task, graph, session_state, linted_paths) + for advice in advices: absolute_path = advice.path.absolute().as_posix() if advice.path != self._UNKNOWN else 'UNKNOWN' job_problem = JobProblem( job_id=job.job_id, @@ -403,9 +411,17 @@ def _lint_job(self, job: jobs.Job) -> list[JobProblem]: end_col=advice.advice.end_col, ) problems.append(job_problem) - return problems - - def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> Iterable[LocatedAdvice]: + assessment_start = int(time.mktime(time.gmtime())) + task_dfsas = self._collect_task_dfsas(task, job, graph, session_state) + assessment_end = int(time.mktime(time.gmtime())) + for dfsa in task_dfsas: + dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) + dfsas.append(dfsa) + return problems, dfsas + + def _build_task_dependency_graph( + self, task: jobs.Task, job: jobs.Job + ) -> tuple[DependencyGraph, Iterable[LocatedAdvice], CurrentSessionState]: root_dependency: Dependency = WorkflowTask(self._ws, task, job) # we can load it without further preparation since the WorkflowTask is merely a wrapper container = root_dependency.load(self._path_lookup) @@ -418,25 +434,33 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job, linted_paths: set[Path]) -> ) graph = DependencyGraph(root_dependency, None, self._resolver, self._path_lookup, session_state) problems = container.build_dependency_graph(graph) - if problems: - for problem in problems: - source_path = self._UNKNOWN if problem.is_path_missing() else problem.source_path - yield LocatedAdvice(problem.as_advisory(), source_path) - return + located_advices: list[LocatedAdvice] = [] + for problem in problems: + source_path = self._UNKNOWN if problem.is_path_missing() else problem.source_path + located_advices.append(LocatedAdvice(problem.as_advisory(), source_path)) + return graph, located_advices, session_state + + def _lint_task( + self, + task: jobs.Task, + graph: DependencyGraph, + session_state: CurrentSessionState, + linted_paths: set[Path], + ) -> Iterable[LocatedAdvice]: walker = LintingWalker( graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index ) yield from walker - assessment_start = int(time.mktime(time.gmtime())) + + def _collect_task_dfsas( + self, task: jobs.Task, job: jobs.Job, graph: DependencyGraph, session_state: CurrentSessionState + ) -> Iterable[DirectFsAccess]: collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) - assessment_end = int(time.mktime(time.gmtime())) dfsas: list[DirectFsAccess] = [] assert job.settings is not None # as already done in _lint_job job_name = job.settings.name for dfsa in collector: - dfsa = 
dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) - dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) - dfsas.append(dfsa) + yield dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) self._directfs_crawlers.for_paths().append(dfsas) From 166e34ca62877faeeeab1a22d7ea1fd524331cbf Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 12:20:34 +0200 Subject: [PATCH 57/80] prevent pylint warning --- tests/unit/source_code/test_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index ba8b9287a8..e2293bd375 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -234,7 +234,6 @@ def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_l expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] 'pip --disable-pip-version-check install unknown-library" ws = create_autospec(WorkspaceClient) - # pylint: disable=mock-no-usage dfsas = create_autospec(DirectFsAccessCrawlers) linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index, dfsas) @@ -247,6 +246,7 @@ def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_l with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): linter.lint_job(1234) + dfsas.assert_not_called() assert any(message.startswith(expected_message) for message in caplog.messages) From 024931063c4c7f1f5092122dad6efb4d56ae81d2 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:03:21 +0200 Subject: [PATCH 58/80] fix failing tests --- tests/integration/source_code/test_jobs.py | 43 +++++++++++----------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index e1073110b7..d63941a93e 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -69,7 +69,7 @@ def test_linter_from_context(simple_ctx, make_job, make_notebook): def test_job_linter_no_problems(simple_ctx, make_job): j = make_job() - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len(problems) == 0 @@ -92,7 +92,7 @@ def test_job_task_linter_library_not_installed_cluster( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 1 @@ -116,7 +116,7 @@ def test_job_task_linter_library_installed_cluster( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -145,7 +145,7 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, some_file.write_text('display(spark.read.parquet("/mnt/foo/bar"))') with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) root = Path(entrypoint.as_posix()) messages = {replace(p, 
path=Path(p.path).relative_to(root)).as_message() for p in problems} @@ -154,7 +154,6 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, last_messages = caplog.messages[-1].split("\n") assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) - dfsas = list(simple_ctx.directfs_access_crawlers.for_paths().snapshot()) assert len(dfsas) == 2 task_keys = set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: @@ -187,14 +186,14 @@ def test_workflow_linter_lints_job_with_import_pypi_library( make_notebook(path=notebook, content=b"import greenlet") job_without_pytest_library = make_job(notebook_path=notebook) - problems = simple_ctx.workflow_linter.lint_job(job_without_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_without_pytest_library.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) > 0 library = compute.Library(pypi=compute.PythonPyPiLibrary(package="greenlet")) job_with_pytest_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -305,7 +304,7 @@ def test_workflow_linter_lints_job_with_workspace_requirements_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=python_code.encode("utf-8")) job_with_pytest_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) messages = tuple(problem.message for problem in problems) expected_messages = ( "ERROR: Could not find a version that satisfies the requirement a_package_that_does_not_exist", @@ -339,7 +338,7 @@ def test_workflow_linter_lints_job_with_dbfs_requirements_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=python_code.encode("utf-8")) job_with_pytest_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) messages = tuple(problem.message for problem in problems) expected_messages = ( "ERROR: Could not find a version that satisfies the requirement a_package_that_does_not_exist", @@ -371,7 +370,7 @@ def test_workflow_linter_lints_job_with_workspace_egg_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import thingy\n") job_with_egg_dependency = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) assert not [problem for problem in problems if problem.message == expected_problem_message] @@ -396,7 +395,7 @@ def test_workflow_linter_lints_job_with_dbfs_egg_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import thingy\n") job_with_egg_dependency = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) + problems,_dfsas = 
simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) assert not [problem for problem in problems if problem.message == expected_problem_message] @@ -414,7 +413,7 @@ def test_workflow_linter_lints_job_with_missing_library(simple_ctx, make_job, ma notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=b"import databricks.labs.ucx") job_without_ucx_library = make_job(notebook_path=notebook) - problems = simple_ctx.workflow_linter.lint_job(job_without_ucx_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_without_ucx_library.job_id) assert len([problem for problem in problems if problem.message == expected_problem_message]) > 0 allow_list.module_compatibility.assert_called_once_with("databricks.labs.ucx") @@ -435,7 +434,7 @@ def test_workflow_linter_lints_job_with_wheel_dependency(simple_ctx, make_job, m notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=b"import databricks.labs.ucx") job_with_ucx_library = make_job(notebook_path=notebook, libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) assert len([problem for problem in problems if problem.message == expected_problem_message]) == 0 @@ -463,7 +462,7 @@ def test_job_spark_python_task_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -484,7 +483,7 @@ def test_job_spark_python_task_linter_unhappy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 1 @@ -516,7 +515,7 @@ def test_workflow_linter_lints_python_wheel_task(simple_ctx, ws, make_job, make_ ) job_with_ucx_library = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id) assert len([problem for problem in problems if problem.code == "library-dist-info-not-found"]) == 0 assert len([problem for problem in problems if problem.code == "library-entrypoint-not-found"]) == 0 @@ -542,7 +541,7 @@ def test_job_spark_python_task_workspace_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert not [problem for problem in problems if problem.message == "Could not locate import: greenlet"] @@ -565,7 +564,7 @@ def test_job_spark_python_task_dbfs_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert not [problem for problem in problems if problem.message == "Could not locate import: greenlet"] @@ -593,7 +592,7 @@ def test_job_spark_python_task_linter_notebook_handling( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) # The notebook being linted has 'import greenlet' in a cell that should be ignored, but will trigger this problem if processed. 
assert not [problem for problem in problems if problem.message == "Could not locate import: greenlet"] @@ -618,7 +617,7 @@ def test_job_dlt_task_linter_unhappy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 1 @@ -643,7 +642,7 @@ def test_job_dlt_task_linter_happy_path( ) j = make_job(tasks=[task]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0 @@ -657,7 +656,7 @@ def test_job_dependency_problem_egg_dbr14plus(make_job, make_directory, make_not j = make_job(libraries=[library]) - problems = simple_ctx.workflow_linter.lint_job(j.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(j.job_id) assert ( len( [ From b605b777beb73b2fe3172c7236a043e230e81a14 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:15:38 +0200 Subject: [PATCH 59/80] formatting --- tests/integration/source_code/test_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index d63941a93e..db53bbe9c7 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -395,7 +395,7 @@ def test_workflow_linter_lints_job_with_dbfs_egg_dependency( notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import thingy\n") job_with_egg_dependency = make_job(notebook_path=notebook, libraries=[library]) - problems,_dfsas = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) + problems, _dfsas = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id) assert not [problem for problem in problems if problem.message == expected_problem_message] From 7f9fa061ea714d2915e38500d3ae5a27fe12a5d1 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:27:09 +0200 Subject: [PATCH 60/80] fix failing tests --- src/databricks/labs/ucx/source_code/jobs.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index d59dc394f9..a42b499a11 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -11,7 +11,7 @@ from urllib import parse from databricks.labs.blueprint.parallel import ManyError, Threads -from databricks.labs.blueprint.paths import DBFSPath +from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound @@ -524,7 +524,12 @@ def _collect_from_notebook( ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) for cell in notebook.cells: - src_timestamp = int(path.stat().st_mtime) + if isinstance(path, WorkspacePath): + # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 + # pylint: disable=protected-access + src_timestamp = path._object_info.modified_at + else: + src_timestamp = int(path.stat().st_mtime) src_id = str(path) src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, 
path, inherited_tree): @@ -548,7 +553,12 @@ def _collect_from_source( return src_id = str(path) src_lineage = self.lineage_str - src_timestamp = int(path.stat().st_mtime) + if isinstance(path, WorkspacePath): + # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 + # pylint: disable=protected-access + src_timestamp = path._object_info.modified_at + else: + src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) From f7108cb35fa7ed04ef38f474ad81da066b5df659 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:46:45 +0200 Subject: [PATCH 61/80] fix failing tests --- src/databricks/labs/ucx/mixins/fixtures.py | 2 +- src/databricks/labs/ucx/source_code/jobs.py | 2 +- tests/integration/source_code/test_jobs.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index abc6556fec..f4de9e0351 100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -831,7 +831,7 @@ def create(notebook_path: str | Path | None = None, **kwargs): job = ws.jobs.create(**kwargs) logger.info(f"Job: {ws.config.host}#job/{job.job_id}") - return job + return ws.jobs.get(job.job_id) yield from factory("job", create, lambda item: ws.jobs.delete(item.job_id)) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index a42b499a11..b09ff7088a 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -554,7 +554,7 @@ def _collect_from_source( src_id = str(path) src_lineage = self.lineage_str if isinstance(path, WorkspacePath): - # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access src_timestamp = path._object_info.modified_at else: diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index db53bbe9c7..11fde9dfe8 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -157,12 +157,11 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, assert len(dfsas) == 2 task_keys = set(task.task_key for task in j.settings.tasks) for dfsa in dfsas: - assert dfsa.source_type != DirectFsAccess.UNKNOWN assert dfsa.source_id != DirectFsAccess.UNKNOWN assert dfsa.source_lineage != DirectFsAccess.UNKNOWN assert dfsa.source_timestamp != -1 assert dfsa.job_id == j.job_id - assert dfsa.job_name == j.job_name + assert dfsa.job_name == j.settings.name assert dfsa.task_key in task_keys assert dfsa.assessment_start_timestamp != -1 assert dfsa.assessment_end_timestamp != -1 From abcab8773be2b0ca466fa15503bcbc4539f385af Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 9 Sep 2024 17:54:43 +0200 Subject: [PATCH 62/80] fix failing tests --- src/databricks/labs/ucx/source_code/jobs.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b09ff7088a..6cb3daf4b1 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -527,7 +527,11 @@ def _collect_from_notebook( 
if isinstance(path, WorkspacePath): # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at + src_timestamp = path._object_info.modified_at or -1 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + src_timestamp = path._file_info.modification_time or -1 else: src_timestamp = int(path.stat().st_mtime) src_id = str(path) @@ -556,7 +560,11 @@ def _collect_from_source( if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at + src_timestamp = path._object_info.modified_at or -1 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + src_timestamp = path._file_info.modification_time or -1 else: src_timestamp = int(path.stat().st_mtime) for dfsa in iterable: From eb603e4e65e3c5eb837477dfae3af9fbe7b775be Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 09:37:25 +0200 Subject: [PATCH 63/80] catch infinite recursion --- src/databricks/labs/ucx/source_code/graph.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 0f4804db67..1ed9e2a360 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -203,6 +203,12 @@ def root_relative_names(self) -> set[str]: # when visit_node returns True it interrupts the visit def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: + try: + return self._visit(visit_node, visited) + except RecursionError as e: + return False + + def _visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: """provide visited set if you want to ensure nodes are only visited once""" if visited is not None: path = self.dependency.path From 837fe6e9473d9f2a4b58d11e4577abffbe9db44f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 09:37:44 +0200 Subject: [PATCH 64/80] drop legacy code --- src/databricks/labs/ucx/source_code/jobs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 6cb3daf4b1..a43de3af16 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -456,12 +456,10 @@ def _collect_task_dfsas( self, task: jobs.Task, job: jobs.Job, graph: DependencyGraph, session_state: CurrentSessionState ) -> Iterable[DirectFsAccess]: collector = DfsaCollectorWalker(graph, set(), self._path_lookup, session_state) - dfsas: list[DirectFsAccess] = [] assert job.settings is not None # as already done in _lint_job job_name = job.settings.name for dfsa in collector: yield dfsa.replace_job_infos(job_id=job.job_id, job_name=job_name, task_key=task.task_key) - self._directfs_crawlers.for_paths().append(dfsas) class LintingWalker(DependencyGraphWalker[LocatedAdvice]): From e80b6f034bbf30dbb4b75b655623433d008086c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 11:54:44 +0200 Subject: [PATCH 65/80] Revert "catch infinite recursion" This reverts 
commit eb603e4e65e3c5eb837477dfae3af9fbe7b775be. --- src/databricks/labs/ucx/source_code/graph.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 1ed9e2a360..0f4804db67 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -203,12 +203,6 @@ def root_relative_names(self) -> set[str]: # when visit_node returns True it interrupts the visit def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: - try: - return self._visit(visit_node, visited) - except RecursionError as e: - return False - - def _visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path] | None) -> bool: """provide visited set if you want to ensure nodes are only visited once""" if visited is not None: path = self.dependency.path From f8cdc22030750424fa0b0988eadb8d7d6d3370c7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 10 Sep 2024 16:50:44 +0200 Subject: [PATCH 66/80] Use structured lineage for DependencyGraph (#2556) ## Changes Use structured lineage for DependencyGraph ### Linked issues Resolves #2550 ### Functionality None ### Tests - [x] added unit tests --------- Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/graph.py | 32 ++++++++++++---- src/databricks/labs/ucx/source_code/jobs.py | 40 +++++++++++--------- tests/unit/source_code/test_jobs.py | 32 ++++++++++++++-- 3 files changed, 77 insertions(+), 27 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 0f4804db67..6ad90965d6 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -1,6 +1,8 @@ from __future__ import annotations import abc +import itertools +import json import logging from dataclasses import dataclass from pathlib import Path @@ -291,11 +293,10 @@ class DependencyGraphContext: class Dependency: - def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True, lineage_str: str | None = None): + def __init__(self, loader: DependencyLoader, path: Path, inherits_context=True): self._loader = loader self._path = path self._inherits_context = inherits_context - self._lineage_str = lineage_str or '"' + self._path.as_posix() + '"' @property def path(self) -> Path: @@ -318,8 +319,8 @@ def __repr__(self): return f"Dependency<{self.path}>" @property - def lineage_str(self): - return self._lineage_str + def lineage(self) -> list[LineageAtom]: + return [LineageAtom("path", str(self.path))] class SourceContainer(abc.ABC): @@ -589,6 +590,23 @@ def finalize(self) -> InheritedContext: return InheritedContext(tree, self.found) +@dataclass +class LineageAtom: + + @staticmethod + def atoms_to_json_string(atoms: list[LineageAtom]): + json_lists = list(lineage.as_objects() for lineage in atoms) + json_obj = list(itertools.chain(*json_lists)) + return json.dumps(json_obj) + + object_type: str + object_id: str + other: dict[str, str] | None = None + + def as_objects(self) -> list[dict[str, str]]: + return [{"object_type": self.object_type, "object_id": self.object_id, **(self.other or {})}] + + T = TypeVar("T") @@ -634,6 +652,6 @@ def _process_dependency( ) -> Iterable[T]: ... 
@property - def lineage_str(self): - parts = [dependency.lineage_str for dependency in self._lineage] - return "->".join(parts) + def lineage(self) -> list[LineageAtom]: + lineages = [dependency.lineage for dependency in self._lineage] + return list(itertools.chain(*lineages)) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index a43de3af16..8cf02441a5 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -37,6 +37,7 @@ SourceContainer, WrappingLoader, DependencyGraphWalker, + LineageAtom, ) from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter @@ -68,10 +69,8 @@ def as_message(self) -> str: class WorkflowTask(Dependency): def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): - # concat job and task for lineage, see DependencyGraphWalker.lineage_str - lineage_str = f'"job:{job.job_id}"->"task:{task.task_key}"' loader = WrappingLoader(WorkflowTaskContainer(ws, task, job)) - super().__init__(loader, Path(f'/jobs/{task.task_key}'), inherits_context=False, lineage_str=lineage_str) + super().__init__(loader, Path(f'/jobs/{task.task_key}'), inherits_context=False) self._task = task self._job = job @@ -81,6 +80,13 @@ def load(self, path_lookup: PathLookup) -> SourceContainer | None: def __repr__(self): return f'WorkflowTask<{self._task.task_key} of {self._job.settings.name}>' + @property + def lineage(self) -> list[LineageAtom]: + job_name = ("" if self._job.settings is None else self._job.settings.name) or "unknown job" + job_lineage = LineageAtom("job", str(self._job.job_id), {"name": job_name}) + task_lineage = LineageAtom("task", self._task.task_key) + return [job_lineage, task_lineage] + class WorkflowTaskContainer(SourceContainer): def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): @@ -521,19 +527,19 @@ def _collect_from_notebook( self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) + if isinstance(path, WorkspacePath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 + # pylint: disable=protected-access + src_timestamp = path._object_info.modified_at or -1 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + src_timestamp = path._file_info.modification_time or -1 + else: + src_timestamp = int(path.stat().st_mtime) + src_id = str(path) + src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for cell in notebook.cells: - if isinstance(path, WorkspacePath): - # TODO add modified_at property in lsql, see https://github.com/databrickslabs/lsql/issues/268 - # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at or -1 - elif isinstance(path, DBFSPath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 - # pylint: disable=protected-access - src_timestamp = path._file_info.modification_time or -1 - else: - src_timestamp = int(path.stat().st_mtime) - src_id = str(path) - src_lineage = self.lineage_str for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): yield dfsa.replace_source(source_id=src_id, 
source_lineage=src_lineage, source_timestamp=src_timestamp) if cell.language is CellLanguage.PYTHON: @@ -553,8 +559,6 @@ def _collect_from_source( if iterable is None: logger.warning(f"Language {language.name} not supported yet!") return - src_id = str(path) - src_lineage = self.lineage_str if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access @@ -565,6 +569,8 @@ def _collect_from_source( src_timestamp = path._file_info.modification_time or -1 else: src_timestamp = int(path.stat().st_mtime) + src_id = str(path) + src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for dfsa in iterable: yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index e2293bd375..7b441764b2 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -1,11 +1,12 @@ import io +import itertools import logging import textwrap from pathlib import Path from unittest.mock import create_autospec import pytest -from databricks.sdk.service.jobs import Job, SparkPythonTask +from databricks.sdk.service.jobs import Job, SparkPythonTask, JobSettings, Task from databricks.sdk.service.pipelines import NotebookLibrary, GetPipelineResponse, PipelineLibrary, FileLibrary from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath @@ -18,8 +19,13 @@ from databricks.sdk.service.workspace import ExportFormat from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver -from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver -from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer +from databricks.labs.ucx.source_code.graph import ( + Dependency, + DependencyGraph, + DependencyResolver, + LineageAtom, +) +from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer, WorkflowTask from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader @@ -512,3 +518,23 @@ def test_xxx(graph): assert workflow_task_container.spark_conf == {"spark.databricks.cluster.profile": "singleNode"} ws.assert_not_called() + + +def test_full_lineage_is_converted_to_json(): + ws = create_autospec(WorkspaceClient) + ws.assert_not_called() + task = Task(task_key="task-key") + settings = JobSettings(name="job-name") + job = create_autospec(jobs.Job) + job.job_id = "job-id" + job.settings = settings + wtask = WorkflowTask(ws, task, job) + full_lineage = list(itertools.chain(wtask.lineage, [LineageAtom("path", "abc"), LineageAtom("path", "xyz")])) + json_str = LineageAtom.atoms_to_json_string(full_lineage) + job.assert_not_called() + assert json_str == ( + '[{"object_type": "job", "object_id": "job-id", "name": "job-name"}, ' + '{"object_type": "task", "object_id": "task-key"}, ' + '{"object_type": "path", "object_id": "abc"}, ' + '{"object_type": "path", "object_id": "xyz"}]' + ) From 8cb4ac04ae9c05df94d5d1ef71a1af8d77e5a1df Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 11 Sep 2024 16:54:30 +0200 Subject: [PATCH 67/80] fix merge issues --- src/databricks/labs/ucx/source_code/linters/pyspark.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py 
index c442af9fd6..86cc274b87 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -238,10 +238,10 @@ def __init__(self, dfsa_matchers_only: bool): # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.SparkSession.html # spark.sql is handled by a dedicated linter - spark_session_matchers = [SparkCallMatcher("table", 1, 1, 0)] + spark_session_matchers: list[_TableNameMatcher] = [SparkCallMatcher("table", 1, 1, 0)] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Catalog.html - spark_catalog_matchers = [ + spark_catalog_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("cacheTable", 1, 2, 0, "tableName"), SparkCallMatcher("createTable", 1, 1000, 0, "tableName"), SparkCallMatcher("createExternalTable", 1, 1000, 0, "tableName"), @@ -256,7 +256,7 @@ def __init__(self, dfsa_matchers_only: bool): ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html - spark_dataframe_matchers = [ + spark_dataframe_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("writeTo", 1, 1, 0), ] @@ -270,12 +270,12 @@ def __init__(self, dfsa_matchers_only: bool): # nothing to migrate in Window, see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Window.html # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html - spark_dataframereader_matchers = [ + spark_dataframereader_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("table", 1, 1, 0), # TODO good example of collision, see spark_session_calls ] # see https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html - spark_dataframewriter_matchers = [ + spark_dataframewriter_matchers: list[_TableNameMatcher] = [ SparkCallMatcher("insertInto", 1, 2, 0, "tableName"), # TODO jdbc: could the url be a databricks url, raise warning ? 
SparkCallMatcher("saveAsTable", 1, 4, 0, "name"), From 185d06094977330abdb7118d5634a54017361b6b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 09:33:49 +0200 Subject: [PATCH 68/80] Update src/databricks/labs/ucx/source_code/base.py Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> --- src/databricks/labs/ucx/source_code/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index a99b3a5a86..1ee2917d21 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -349,13 +349,13 @@ class DirectFsAccess: is_read: bool is_write: bool source_id: str = UNKNOWN - source_timestamp: int = -1 - source_lineage: str = UNKNOWN + source_timestamp: datetime.datetime + source_lineage: list[LineageAtom] job_id: int = -1 job_name: str = UNKNOWN task_key: str = UNKNOWN - assessment_start_timestamp: int = -1 - assessment_end_timestamp: int = -1 + assessment_start_timestamp: datetime.datetime + assessment_end_timestamp: datetime.datetime def replace_source( self, From 1742415787038a27098ca14912fc183a0009eebd Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 10:36:47 +0200 Subject: [PATCH 69/80] refactor DirectFsAccess --- .../labs/ucx/contexts/application.py | 2 +- src/databricks/labs/ucx/source_code/base.py | 74 ---------- .../labs/ucx/source_code/directfs_access.py | 136 ++++++++++++++++++ .../source_code/directfs_access_crawler.py | 49 ------- src/databricks/labs/ucx/source_code/graph.py | 23 +-- src/databricks/labs/ucx/source_code/jobs.py | 28 ++-- .../labs/ucx/source_code/linters/directfs.py | 2 +- tests/integration/source_code/test_jobs.py | 3 +- ...ess_crawler.py => test_directfs_access.py} | 13 +- tests/unit/source_code/test_jobs.py | 28 +--- 10 files changed, 165 insertions(+), 193 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/directfs_access.py delete mode 100644 src/databricks/labs/ucx/source_code/directfs_access_crawler.py rename tests/unit/source_code/{test_directfs_access_crawler.py => test_directfs_access.py} (60%) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 6ba45ef469..6dc9a649d1 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.errors import ResourceDoesNotExist diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 1ee2917d21..1a21264d5d 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -337,77 +337,3 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool: logger.warning(f"Could not read file {path}") return False return file_header == magic_header - - -@dataclass -class DirectFsAccess: - """A record 
describing a Direct File System Access""" - - UNKNOWN = "unknown" - - path: str - is_read: bool - is_write: bool - source_id: str = UNKNOWN - source_timestamp: datetime.datetime - source_lineage: list[LineageAtom] - job_id: int = -1 - job_name: str = UNKNOWN - task_key: str = UNKNOWN - assessment_start_timestamp: datetime.datetime - assessment_end_timestamp: datetime.datetime - - def replace_source( - self, - source_id: str | None = None, - source_lineage: str | None = None, - source_timestamp: int | None = None, - ): - return DirectFsAccess( - path=self.path, - is_read=self.is_read, - is_write=self.is_write, - source_id=source_id or self.source_id, - source_timestamp=source_timestamp or self.source_timestamp, - source_lineage=source_lineage or self.source_lineage, - job_id=self.job_id, - job_name=self.job_name, - task_key=self.task_key, - assessment_start_timestamp=self.assessment_start_timestamp, - assessment_end_timestamp=self.assessment_start_timestamp, - ) - - def replace_job_infos( - self, - job_id: int | None = None, - job_name: str | None = None, - task_key: str | None = None, - ): - return DirectFsAccess( - path=self.path, - is_read=self.is_read, - is_write=self.is_write, - source_id=self.source_id, - source_timestamp=self.source_timestamp, - source_lineage=self.source_lineage, - job_id=job_id or self.job_id, - job_name=job_name or self.job_name, - task_key=task_key or self.task_key, - assessment_start_timestamp=self.assessment_start_timestamp, - assessment_end_timestamp=self.assessment_start_timestamp, - ) - - def replace_assessment_infos(self, assessment_start: int | None = None, assessment_end: int | None = None): - return DirectFsAccess( - path=self.path, - is_read=self.is_read, - is_write=self.is_write, - source_id=self.source_id, - source_timestamp=self.source_timestamp, - source_lineage=self.source_lineage, - job_id=self.job_id, - job_name=self.job_name, - task_key=self.task_key, - assessment_start_timestamp=assessment_start or self.assessment_start_timestamp, - assessment_end_timestamp=assessment_end or self.assessment_start_timestamp, - ) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py new file mode 100644 index 0000000000..954d5094a2 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -0,0 +1,136 @@ +from __future__ import annotations + + +import logging +from collections.abc import Sequence, Iterable +from dataclasses import dataclass, field +from datetime import datetime + +from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result +from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk.errors import DatabricksError + +logger = logging.getLogger(__name__) + + +@dataclass +class LineageAtom: + + object_type: str + object_id: str + other: dict[str, str] | None = None + + +@dataclass +class DirectFsAccess: + """A record describing a Direct File System Access""" + + UNKNOWN = "unknown" + + path: str + is_read: bool + is_write: bool + source_id: str = UNKNOWN + source_timestamp: datetime = datetime.fromtimestamp(-1) + source_lineage: list[LineageAtom] = field(default_factory=list) + job_id: int = -1 + job_name: str = UNKNOWN + task_key: str = UNKNOWN + assessment_start_timestamp: datetime = datetime.fromtimestamp(-1) + assessment_end_timestamp: datetime = datetime.fromtimestamp(-1) + + def replace_source( + self, + source_id: str | None = None, + source_lineage: list[LineageAtom] | None = None, + source_timestamp: datetime | None = None, + ): 
+ return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=source_id or self.source_id, + source_timestamp=source_timestamp or self.source_timestamp, + source_lineage=source_lineage or self.source_lineage, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, + ) + + def replace_job_infos( + self, + job_id: int | None = None, + job_name: str | None = None, + task_key: str | None = None, + ): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=self.source_id, + source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, + job_id=job_id or self.job_id, + job_name=job_name or self.job_name, + task_key=task_key or self.task_key, + assessment_start_timestamp=self.assessment_start_timestamp, + assessment_end_timestamp=self.assessment_start_timestamp, + ) + + def replace_assessment_infos( + self, assessment_start: datetime | None = None, assessment_end: datetime | None = None + ): + return DirectFsAccess( + path=self.path, + is_read=self.is_read, + is_write=self.is_write, + source_id=self.source_id, + source_timestamp=self.source_timestamp, + source_lineage=self.source_lineage, + job_id=self.job_id, + job_name=self.job_name, + task_key=self.task_key, + assessment_start_timestamp=assessment_start or self.assessment_start_timestamp, + assessment_end_timestamp=assessment_end or self.assessment_start_timestamp, + ) + + +class _DirectFsAccessCrawler(CrawlerBase): + + def __init__(self, backend: SqlBackend, schema: str, table: str): + """ + Initializes a DFSACrawler instance. + + Args: + sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) + schema: The schema name for the inventory persistence. 
+ """ + super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) + + def append(self, dfsas: Sequence[DirectFsAccess]): + try: + self._append_records(dfsas) + except DatabricksError as e: + logger.error("Failed to store DFSAs", exc_info=e) + + def _try_fetch(self) -> Iterable[DirectFsAccess]: + sql = f"SELECT * FROM {self.full_name}" + yield from self._backend.fetch(sql) + + def _crawl(self) -> Iterable[Result]: + return [] + + +class DirectFsAccessCrawlers: + + def __init__(self, sql_backend: SqlBackend, schema: str): + self._sql_backend = sql_backend + self._schema = schema + + def for_paths(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_paths") + + def for_queries(self) -> _DirectFsAccessCrawler: + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") diff --git a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py b/src/databricks/labs/ucx/source_code/directfs_access_crawler.py deleted file mode 100644 index a05c2079db..0000000000 --- a/src/databricks/labs/ucx/source_code/directfs_access_crawler.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging -from collections.abc import Sequence, Iterable - -from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result -from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk.errors import DatabricksError - -from databricks.labs.ucx.source_code.base import DirectFsAccess - -logger = logging.getLogger(__name__) - - -class _DirectFsAccessCrawler(CrawlerBase): - - def __init__(self, backend: SqlBackend, schema: str, table: str): - """ - Initializes a DFSACrawler instance. - - Args: - sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) - schema: The schema name for the inventory persistence. 
- """ - super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) - - def append(self, dfsas: Sequence[DirectFsAccess]): - try: - self._append_records(dfsas) - except DatabricksError as e: - logger.error("Failed to store DFSAs", exc_info=e) - - def _try_fetch(self) -> Iterable[DirectFsAccess]: - sql = f"SELECT * FROM {self.full_name}" - yield from self._backend.fetch(sql) - - def _crawl(self) -> Iterable[Result]: - return [] - - -class DirectFsAccessCrawlers: - - def __init__(self, sql_backend: SqlBackend, schema: str): - self._sql_backend = sql_backend - self._schema = schema - - def for_paths(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_paths") - - def for_queries(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index e316125c93..e044b7e26a 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -2,7 +2,6 @@ import abc import itertools -import json import logging from dataclasses import dataclass from pathlib import Path @@ -13,6 +12,7 @@ NodeNG, ) from databricks.labs.ucx.source_code.base import Advisory, CurrentSessionState, is_a_notebook +from databricks.labs.ucx.source_code.directfs_access import LineageAtom from databricks.labs.ucx.source_code.python.python_ast import Tree from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -605,23 +605,6 @@ def finalize(self) -> InheritedContext: return InheritedContext(tree, self.found) -@dataclass -class LineageAtom: - - @staticmethod - def atoms_to_json_string(atoms: list[LineageAtom]): - json_lists = list(lineage.as_objects() for lineage in atoms) - json_obj = list(itertools.chain(*json_lists)) - return json.dumps(json_obj) - - object_type: str - object_id: str - other: dict[str, str] | None = None - - def as_objects(self) -> list[dict[str, str]]: - return [{"object_type": self.object_type, "object_id": self.object_id, **(self.other or {})}] - - T = TypeVar("T") @@ -668,5 +651,5 @@ def _process_dependency( @property def lineage(self) -> list[LineageAtom]: - lineages = [dependency.lineage for dependency in self._lineage] - return list(itertools.chain(*lineages)) + lists: list[list[LineageAtom]] = [dependency.lineage for dependency in self._lineage] + return list(itertools.chain(*lists)) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 8cf02441a5..144cf0304a 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -2,10 +2,10 @@ import logging import shutil import tempfile -import time from collections.abc import Generator, Iterable from contextlib import contextmanager from dataclasses import dataclass +from datetime import datetime from importlib import metadata from pathlib import Path from urllib import parse @@ -26,9 +26,8 @@ is_a_notebook, file_language, guess_encoding, - DirectFsAccess, ) -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, LineageAtom, DirectFsAccessCrawlers from databricks.labs.ucx.source_code.graph import ( Dependency, DependencyGraph, @@ -37,7 +36,6 @@ SourceContainer, WrappingLoader, DependencyGraphWalker, - LineageAtom, ) from 
databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter @@ -417,9 +415,9 @@ def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAcces end_col=advice.advice.end_col, ) problems.append(job_problem) - assessment_start = int(time.mktime(time.gmtime())) + assessment_start = datetime.now() task_dfsas = self._collect_task_dfsas(task, job, graph, session_state) - assessment_end = int(time.mktime(time.gmtime())) + assessment_end = datetime.now() for dfsa in task_dfsas: dfsa = dfsa.replace_assessment_infos(assessment_start=assessment_start, assessment_end=assessment_end) dfsas.append(dfsa) @@ -530,18 +528,17 @@ def _collect_from_notebook( if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at or -1 + src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - src_timestamp = path._file_info.modification_time or -1 + src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) else: - src_timestamp = int(path.stat().st_mtime) + src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) src_id = str(path) - src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for cell in notebook.cells: for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): - yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) + yield dfsa.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp) if cell.language is CellLanguage.PYTHON: if inherited_tree is None: inherited_tree = Tree.new_module() @@ -562,17 +559,16 @@ def _collect_from_source( if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - src_timestamp = path._object_info.modified_at or -1 + src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - src_timestamp = path._file_info.modification_time or -1 + src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) else: - src_timestamp = int(path.stat().st_mtime) + src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) src_id = str(path) - src_lineage = LineageAtom.atoms_to_json_string(self.lineage) for dfsa in iterable: - yield dfsa.replace_source(source_id=src_id, source_lineage=src_lineage, source_timestamp=src_timestamp) + yield dfsa.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp) def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]: linter = DirectFsAccessPyLinter(self._session_state, prevent_spark_duplicates=False) diff --git a/src/databricks/labs/ucx/source_code/linters/directfs.py b/src/databricks/labs/ucx/source_code/linters/directfs.py index 4a8c01fa67..2eb14b03db 100644 --- a/src/databricks/labs/ucx/source_code/linters/directfs.py +++ 
b/src/databricks/labs/ucx/source_code/linters/directfs.py @@ -13,8 +13,8 @@ CurrentSessionState, PythonLinter, SqlLinter, - DirectFsAccess, ) +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor from databricks.labs.ucx.source_code.python.python_infer import InferredValue diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 11fde9dfe8..c741322748 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -20,7 +20,8 @@ from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.fixtures import get_purge_suffix, factory -from databricks.labs.ucx.source_code.base import CurrentSessionState, DirectFsAccess +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.known import UNKNOWN, KnownList from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter, FileLoader, FolderLoader diff --git a/tests/unit/source_code/test_directfs_access_crawler.py b/tests/unit/source_code/test_directfs_access.py similarity index 60% rename from tests/unit/source_code/test_directfs_access_crawler.py rename to tests/unit/source_code/test_directfs_access.py index cd38435e66..6adfff7cc8 100644 --- a/tests/unit/source_code/test_directfs_access_crawler.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -1,7 +1,8 @@ +from datetime import datetime + from databricks.labs.lsql.backends import MockBackend -from databricks.labs.ucx.source_code.base import DirectFsAccess -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, DirectFsAccessCrawlers, LineageAtom def test_crawler_appends_dfsas(): @@ -13,13 +14,13 @@ def test_crawler_appends_dfsas(): is_read=False, is_write=False, source_id="ID", - source_timestamp=7452, - source_lineage="LINEAGE", + source_timestamp=datetime.now(), + source_lineage=[LineageAtom(object_type="LINEAGE", object_id="ID")], job_id=222, job_name="JOB", task_key="TASK", - assessment_start_timestamp=123, - assessment_end_timestamp=234, + assessment_start_timestamp=datetime.now(), + assessment_end_timestamp=datetime.now(), ) for path in ("a", "b", "c") ) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 7b441764b2..9dc67f07a3 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -1,17 +1,16 @@ import io -import itertools import logging import textwrap from pathlib import Path from unittest.mock import create_autospec import pytest -from databricks.sdk.service.jobs import Job, SparkPythonTask, JobSettings, Task +from databricks.sdk.service.jobs import Job, SparkPythonTask from databricks.sdk.service.pipelines import NotebookLibrary, GetPipelineResponse, PipelineLibrary, FileLibrary from databricks.labs.blueprint.paths import DBFSPath, WorkspacePath from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.directfs_access_crawler import DirectFsAccessCrawlers +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawlers from databricks.labs.ucx.source_code.python_libraries import 
PythonLibraryResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.sdk import WorkspaceClient @@ -23,9 +22,8 @@ Dependency, DependencyGraph, DependencyResolver, - LineageAtom, ) -from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer, WorkflowTask +from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader @@ -518,23 +516,3 @@ def test_xxx(graph): assert workflow_task_container.spark_conf == {"spark.databricks.cluster.profile": "singleNode"} ws.assert_not_called() - - -def test_full_lineage_is_converted_to_json(): - ws = create_autospec(WorkspaceClient) - ws.assert_not_called() - task = Task(task_key="task-key") - settings = JobSettings(name="job-name") - job = create_autospec(jobs.Job) - job.job_id = "job-id" - job.settings = settings - wtask = WorkflowTask(ws, task, job) - full_lineage = list(itertools.chain(wtask.lineage, [LineageAtom("path", "abc"), LineageAtom("path", "xyz")])) - json_str = LineageAtom.atoms_to_json_string(full_lineage) - job.assert_not_called() - assert json_str == ( - '[{"object_type": "job", "object_id": "job-id", "name": "job-name"}, ' - '{"object_type": "task", "object_id": "task-key"}, ' - '{"object_type": "path", "object_id": "abc"}, ' - '{"object_type": "path", "object_id": "xyz"}]' - ) From c5987bb982fb67e92e908c9976a26f1eae701e67 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 10:41:39 +0200 Subject: [PATCH 70/80] add view --- src/databricks/labs/ucx/queries/views/direct_fs_access.sql | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/databricks/labs/ucx/queries/views/direct_fs_access.sql diff --git a/src/databricks/labs/ucx/queries/views/direct_fs_access.sql b/src/databricks/labs/ucx/queries/views/direct_fs_access.sql new file mode 100644 index 0000000000..86ce1d26c8 --- /dev/null +++ b/src/databricks/labs/ucx/queries/views/direct_fs_access.sql @@ -0,0 +1,7 @@ +SELECT + * +FROM direct_file_system_access_in_paths +UNION +SELECT + * +FROM direct_file_system_access_in_queries From b6fbeb47777d57259ae19d3f3bb31a53b353f07b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 12:30:15 +0200 Subject: [PATCH 71/80] fix failing tests --- .../labs/ucx/source_code/directfs_access.py | 6 +-- src/databricks/labs/ucx/source_code/jobs.py | 39 +++++++++---------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 954d5094a2..76041a2688 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -31,13 +31,13 @@ class DirectFsAccess: is_read: bool is_write: bool source_id: str = UNKNOWN - source_timestamp: datetime = datetime.fromtimestamp(-1) + source_timestamp: datetime = datetime.fromtimestamp(0) source_lineage: list[LineageAtom] = field(default_factory=list) job_id: int = -1 job_name: str = UNKNOWN task_key: str = UNKNOWN - assessment_start_timestamp: datetime = datetime.fromtimestamp(-1) - assessment_end_timestamp: datetime = datetime.fromtimestamp(-1) + assessment_start_timestamp: datetime = datetime.fromtimestamp(0) + assessment_end_timestamp: datetime = datetime.fromtimestamp(0) def replace_source( self, diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 
144cf0304a..88029616fa 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -5,7 +5,7 @@ from collections.abc import Generator, Iterable from contextlib import contextmanager from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from importlib import metadata from pathlib import Path from urllib import parse @@ -495,6 +495,21 @@ def _process_dependency( yield LocatedAdvice(advice, dependency.path) +def _get_path_modified_datetime(path: Path) -> datetime: + unix_time = 0.0 + if isinstance(path, WorkspacePath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 + # pylint: disable=protected-access + unix_time += float(path._object_info.modified_at) / 1000.0 or 0.0 + elif isinstance(path, DBFSPath): + # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 + # pylint: disable=protected-access + unix_time += float(path._file_info.modification_time) / 1000.0 or 0.0 + else: + unix_time = path.stat().st_mtime + return datetime.fromtimestamp(unix_time, timezone.utc) + + class DfsaCollectorWalker(DependencyGraphWalker[DirectFsAccess]): def __init__( @@ -525,16 +540,7 @@ def _collect_from_notebook( self, source: str, language: CellLanguage, path: Path, inherited_tree: Tree | None ) -> Iterable[DirectFsAccess]: notebook = Notebook.parse(path, source, language.language) - if isinstance(path, WorkspacePath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) - elif isinstance(path, DBFSPath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) - else: - src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) + src_timestamp = _get_path_modified_datetime(path) src_id = str(path) for cell in notebook.cells: for dfsa in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree): @@ -556,16 +562,7 @@ def _collect_from_source( if iterable is None: logger.warning(f"Language {language.name} not supported yet!") return - if isinstance(path, WorkspacePath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._object_info.modified_at or -1) - elif isinstance(path, DBFSPath): - # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 - # pylint: disable=protected-access - src_timestamp = datetime.fromtimestamp(path._file_info.modification_time or -1) - else: - src_timestamp = datetime.fromtimestamp(path.stat().st_mtime) + src_timestamp = _get_path_modified_datetime(path) src_id = str(path) for dfsa in iterable: yield dfsa.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp) From cc194b605230713fb2ca0ab4e4626630fbebcd88 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 12:36:13 +0200 Subject: [PATCH 72/80] formatting --- src/databricks/labs/ucx/source_code/jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 
88029616fa..50256f02b4 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -500,11 +500,11 @@ def _get_path_modified_datetime(path: Path) -> datetime: if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - unix_time += float(path._object_info.modified_at) / 1000.0 or 0.0 + unix_time += float(path._object_info.modified_at) / 1000.0 if path._object_info.modified_at else 0.0 elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - unix_time += float(path._file_info.modification_time) / 1000.0 or 0.0 + unix_time += float(path._file_info.modification_time) / 1000.0 if path._file_info.modification_time else 0.0 else: unix_time = path.stat().st_mtime return datetime.fromtimestamp(unix_time, timezone.utc) From a5ada24144a21e6be5f51577db3846c715dc719b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 15:09:57 +0200 Subject: [PATCH 73/80] install added table --- src/databricks/labs/ucx/install.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index 42981c768f..b1631deb6b 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -75,6 +75,7 @@ from databricks.labs.ucx.installer.workflows import WorkflowsDeployment from databricks.labs.ucx.recon.migration_recon import ReconResult from databricks.labs.ucx.runtime import Workflows +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess from databricks.labs.ucx.source_code.jobs import JobProblem from databricks.labs.ucx.workspace_access.base import Permissions from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo @@ -120,6 +121,7 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): functools.partial(table, "udfs", Udf), functools.partial(table, "logs", LogRecord), functools.partial(table, "recon_results", ReconResult), + functools.partial(table, "direct_file_system_access_in_paths", DirectFsAccess), # direct_file_system_access_in_queries will be added in upcoming PR ], ) deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql") @@ -128,6 +130,7 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): deployer.deploy_view("misc_patterns", "queries/views/misc_patterns.sql") deployer.deploy_view("code_patterns", "queries/views/code_patterns.sql") deployer.deploy_view("reconciliation_results", "queries/views/reconciliation_results.sql") + # direct_file_system_access view will be added in upcoming PR def extract_major_minor(version_string): From 2ce8e42163e1e9b2e086de8c4e5258b70c62eecb Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 12 Sep 2024 17:34:32 +0200 Subject: [PATCH 74/80] address verbal comments from @asnare --- src/databricks/labs/ucx/source_code/directfs_access.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 81c59bb485..104cb634d1 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -111,6 +111,7 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): def append(self, dfsas: Sequence[DirectFsAccess]): try: + # TODO until 
we historize data, we append all DFSAs self._update_snapshot(dfsas, mode="append") except DatabricksError as e: logger.error("Failed to store DFSAs", exc_info=e) @@ -120,7 +121,7 @@ def _try_fetch(self) -> Iterable[DirectFsAccess]: yield from self._backend.fetch(sql) def _crawl(self) -> Iterable[DirectFsAccess]: - return [] + raise NotImplementedError() class DirectFsAccessCrawlers: From 145fbae075524af485b45db6dbbe8ba9a4265c96 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 10:33:37 +0200 Subject: [PATCH 75/80] rename table and drop unused view --- src/databricks/labs/ucx/install.py | 4 ++-- src/databricks/labs/ucx/queries/views/direct_fs_access.sql | 7 ------- src/databricks/labs/ucx/source_code/directfs_access.py | 4 ++-- 3 files changed, 4 insertions(+), 11 deletions(-) delete mode 100644 src/databricks/labs/ucx/queries/views/direct_fs_access.sql diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index 0888aee1b2..0529c1b94d 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -122,8 +122,8 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): functools.partial(table, "logs", LogRecord), functools.partial(table, "recon_results", ReconResult), functools.partial( - table, "direct_file_system_access_in_paths", DirectFsAccess - ), # direct_file_system_access_in_queries will be added in upcoming PR + table, "directfs_in_paths", DirectFsAccess + ), # directfs_in_queries will be added in upcoming PR ], ) deployer.deploy_view("grant_detail", "queries/views/grant_detail.sql") diff --git a/src/databricks/labs/ucx/queries/views/direct_fs_access.sql b/src/databricks/labs/ucx/queries/views/direct_fs_access.sql deleted file mode 100644 index 86ce1d26c8..0000000000 --- a/src/databricks/labs/ucx/queries/views/direct_fs_access.sql +++ /dev/null @@ -1,7 +0,0 @@ -SELECT - * -FROM direct_file_system_access_in_paths -UNION -SELECT - * -FROM direct_file_system_access_in_queries diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 104cb634d1..8d8abf8f90 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -131,7 +131,7 @@ def __init__(self, sql_backend: SqlBackend, schema: str): self._schema = schema def for_paths(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_paths") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "directfs_in_paths") def for_queries(self) -> _DirectFsAccessCrawler: - return _DirectFsAccessCrawler(self._sql_backend, self._schema, "direct_file_system_access_in_queries") + return _DirectFsAccessCrawler(self._sql_backend, self._schema, "directfs_in_queries") From 810e3566f04aca022afabe6c6de8e718b40622c9 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 10:36:39 +0200 Subject: [PATCH 76/80] rename method that is not yet in line with new crawler design --- src/databricks/labs/ucx/source_code/directfs_access.py | 2 +- src/databricks/labs/ucx/source_code/jobs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 8d8abf8f90..841bb837a8 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -109,7 +109,7 @@ def 
__init__(self, backend: SqlBackend, schema: str, table: str): """ super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) - def append(self, dfsas: Sequence[DirectFsAccess]): + def dump_all(self, dfsas: Sequence[DirectFsAccess]): try: # TODO until we historize data, we append all DFSAs self._update_snapshot(dfsas, mode="append") diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index a58048aabe..de931cad2b 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -369,7 +369,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): JobProblem, mode='overwrite', ) - self._directfs_crawlers.for_paths().append(job_dfsas) + self._directfs_crawlers.for_paths().dump_all(job_dfsas) if len(errors) > 0: raise ManyError(errors) From 9f24f9795acfadb53b5d44b15c6a7af8825a5067 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 10:42:07 +0200 Subject: [PATCH 77/80] rename method that is not yet in line with new crawler design --- tests/unit/source_code/test_directfs_access.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index 6adfff7cc8..7807e05f24 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -24,6 +24,6 @@ def test_crawler_appends_dfsas(): ) for path in ("a", "b", "c") ) - crawler.append(dfsas) + crawler.dump_all(dfsas) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 From b93b565907d40fc57c2d546741e438905031cbb0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 11:18:43 +0200 Subject: [PATCH 78/80] Update src/databricks/labs/ucx/source_code/jobs.py Co-authored-by: Andrew Snare --- src/databricks/labs/ucx/source_code/jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index de931cad2b..82610f8dbb 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -80,7 +80,7 @@ def __repr__(self): @property def lineage(self) -> list[LineageAtom]: - job_name = ("" if self._job.settings is None else self._job.settings.name) or "unknown job" + job_name = (None if self._job.settings is None else self._job.settings.name) or "unknown job" job_lineage = LineageAtom("job", str(self._job.job_id), {"name": job_name}) task_lineage = LineageAtom("task", self._task.task_key) return [job_lineage, task_lineage] From 3f846a85a7ba362ccf2eafc253de1f1fa6248a4c Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 11:24:23 +0200 Subject: [PATCH 79/80] document design decision --- src/databricks/labs/ucx/source_code/directfs_access.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 841bb837a8..342cbd7104 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -110,6 +110,10 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): super().__init__(backend, "hive_metastore", schema, table, DirectFsAccess) def dump_all(self, dfsas: Sequence[DirectFsAccess]): + """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not 
just one + It's not **bad** because all records are pushed at once. + Providing a multi-entity crawler is out-of-scope of this PR + """ try: # TODO until we historize data, we append all DFSAs self._update_snapshot(dfsas, mode="append") From 67f9d6846cac05b52657c68f0230d4fb00a80f12 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 16 Sep 2024 11:34:19 +0200 Subject: [PATCH 80/80] simplify --- src/databricks/labs/ucx/source_code/jobs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index de931cad2b..4f98e24bec 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -496,15 +496,14 @@ def _process_dependency( def _get_path_modified_datetime(path: Path) -> datetime: - unix_time = 0.0 if isinstance(path, WorkspacePath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/142 # pylint: disable=protected-access - unix_time += float(path._object_info.modified_at) / 1000.0 if path._object_info.modified_at else 0.0 + unix_time = float(path._object_info.modified_at) / 1000.0 if path._object_info.modified_at else 0.0 elif isinstance(path, DBFSPath): # TODO add stats method in blueprint, see https://github.com/databrickslabs/blueprint/issues/143 # pylint: disable=protected-access - unix_time += float(path._file_info.modification_time) / 1000.0 if path._file_info.modification_time else 0.0 + unix_time = float(path._file_info.modification_time) / 1000.0 if path._file_info.modification_time else 0.0 else: unix_time = path.stat().st_mtime return datetime.fromtimestamp(unix_time, timezone.utc)
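
Note on the timestamp handling converged on by the "fix failing tests" and "simplify" patches above: workspace and DBFS object metadata report modification times in milliseconds since the Unix epoch, while Path.stat() reports seconds, and the helper normalises both into a timezone-aware UTC datetime. The conversion is plain standard-library Python; the sketch below restates the pattern outside the UCX codebase (the helper name and the sample value are illustrative, not taken from the patches):

from datetime import datetime, timezone

def millis_to_utc_datetime(millis: int | None) -> datetime:
    # Workspace/DBFS metadata carries modification times in milliseconds since
    # the Unix epoch; a missing value falls back to the epoch itself.
    unix_time = float(millis) / 1000.0 if millis else 0.0
    return datetime.fromtimestamp(unix_time, timezone.utc)

# Local files already report seconds, so they can be passed straight through:
#   datetime.fromtimestamp(path.stat().st_mtime, timezone.utc)
print(millis_to_utc_datetime(1726480459000))  # an aware datetime in UTC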
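
Note on the lineage refactoring earlier in the series: LineageAtom moves out of graph.py into directfs_access.py, and DirectFsAccess now carries source_lineage as a list of atoms instead of a pre-serialised JSON string. The sketch below is a minimal reconstruction of that data shape based on the dataclass removed from graph.py and the job/task lineage built by WorkflowTask.lineage; the lineage_to_json helper is only an illustration of how a flat JSON form could still be derived downstream and is not part of the patches:

import json
from dataclasses import dataclass

@dataclass
class LineageAtom:
    object_type: str
    object_id: str
    other: dict[str, str] | None = None

# A job -> task chain, mirroring what WorkflowTask.lineage produces above.
lineage = [
    LineageAtom("job", "1234", {"name": "job-name"}),
    LineageAtom("task", "task-key"),
]

def lineage_to_json(atoms: list[LineageAtom]) -> str:
    # Illustrative only: the patches pass the list around as-is rather than
    # serialising it eagerly at collection time.
    objects = [
        {"object_type": a.object_type, "object_id": a.object_id, **(a.other or {})}
        for a in atoms
    ]
    return json.dumps(objects)

print(lineage_to_json(lineage))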