"""Generate pre-commit hooks for Java and Python projects.

This script scans a repository for Java and Python projects and generates appropriate
pre-commit hooks for linting and formatting. It also merges in additional hooks from
an override file.
"""

import datetime
import os
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import Optional

import yaml


class ProjectType(Enum):
    """Types of projects supported for hook generation."""

    JAVA = auto()
    PYTHON = auto()


@dataclass
class Project:
    """Represents a project found in the repository."""

    # Directory of the project relative to the repository root, using "/" separators.
    path: str
    # Kind of project; selects which hook is generated for it.
    type: ProjectType

    @property
    def gradle_path(self) -> str:
        """Convert path to Gradle task format (``a/b`` -> ``:a:b``)."""
        return ":" + self.path.replace("/", ":")

    @property
    def project_id(self) -> str:
        """Generate an identifier for the project (``/`` and ``.`` become ``-``)."""
        return self.path.replace("/", "-").replace(".", "-")


class ProjectFinder:
    """Find Java and Python projects in a repository."""

    # Substrings whose presence in a Gradle build file marks it as a Java project.
    JAVA_PATTERNS = [
        "plugins.hasPlugin('java')",
        "apply plugin: 'java'",
        "id 'java'",
        "id 'java-library'",
        "plugins.hasPlugin('java-library')",
        "apply plugin: 'java-library'",
        "plugins.hasPlugin('pegasus')",
        "org.springframework.boot",
    ]

    # Directory names that are never searched for projects.
    EXCLUDED_DIRS = {".git", "build", "node_modules", ".tox", "venv"}
    # A Java project must contain at least one file with one of these extensions.
    SOURCE_EXTENSIONS = {".java", ".kt", ".groovy"}

    def __init__(self, root_dir: str):
        """Create a finder that scans the tree rooted at *root_dir*."""
        self.root_path = Path(root_dir)

    def find_all_projects(self) -> list[Project]:
        """Find all Java and Python projects in the repository, sorted by path."""
        all_projects = [
            Project(path=p, type=ProjectType.JAVA) for p in self._find_java_projects()
        ]
        all_projects.extend(
            Project(path=p, type=ProjectType.PYTHON)
            for p in self._find_python_projects()
        )
        return sorted(all_projects, key=lambda p: p.path)

    def _find_java_projects(self) -> set[str]:
        """Find all Java projects by checking build.gradle files."""
        java_projects = set()

        # Search both build.gradle and build.gradle.kts
        for pattern in ["build.gradle", "build.gradle.kts"]:
            for gradle_file in self.root_path.rglob(pattern):
                if self._should_skip_directory(gradle_file.parent):
                    continue

                if self._is_java_project(gradle_file):
                    java_projects.add(self._get_relative_path(gradle_file.parent))

        return {
            p
            for p in java_projects
            if "buildSrc" not in p and "spark-smoke-test" not in p and p != "."
        }

    def _find_python_projects(self) -> set[str]:
        """Find all Python projects by checking for setup.py or pyproject.toml."""
        python_projects = set()

        for file_name in ["setup.py", "pyproject.toml"]:
            for path in self.root_path.rglob(file_name):
                if self._should_skip_directory(path.parent):
                    continue

                rel_path = self._get_relative_path(path.parent)
                # The repository root ("." ) cannot be expressed as a Gradle project
                # path (it would become ":."), so it is excluded just like it is for
                # Java projects.
                if rel_path != "." and "examples" not in rel_path:
                    python_projects.add(rel_path)

        return python_projects

    def _should_skip_directory(self, path: Path) -> bool:
        """Check if directory should be skipped.

        Only the path components *below* the repository root are inspected, so the
        location of the checkout itself (for example ``~/.cache/repo`` or
        ``/build/repo``) never causes every project to be skipped.
        """
        try:
            parts = path.relative_to(self.root_path).parts
        except ValueError:
            # Not located under the root; fall back to inspecting the whole path.
            parts = path.parts

        return any(
            part in self.EXCLUDED_DIRS or part.startswith(".") for part in parts
        )

    def _is_java_project(self, gradle_file: Path) -> bool:
        """Check if a Gradle file represents a Java project.

        The build file must contain one of ``JAVA_PATTERNS`` and its directory tree
        must contain at least one file with a ``SOURCE_EXTENSIONS`` suffix. Read
        errors are reported as a warning and treated as "not a Java project".
        """
        try:
            content = gradle_file.read_text(encoding="utf-8")
            if not any(pattern in content for pattern in self.JAVA_PATTERNS):
                return False

            # Verify presence of source files; stop at the first match instead of
            # materialising the full list of files for every extension.
            project_dir = gradle_file.parent
            return any(
                next(project_dir.rglob(f"*{ext}"), None) is not None
                for ext in self.SOURCE_EXTENSIONS
            )
        except (OSError, UnicodeDecodeError) as e:
            print(f"Warning: Error reading {gradle_file}: {e}")
            return False

    def _get_relative_path(self, path: Path) -> str:
        """Get relative path from root, normalized with forward slashes."""
        return path.relative_to(self.root_path).as_posix()


class HookGenerator:
    """Generate pre-commit hooks for projects."""

    def __init__(self, projects: list[Project], override_file: Optional[str] = None):
        """Store the projects to generate hooks for and the optional override file."""
        self.projects = projects
        self.override_file = override_file

    def generate_config(self) -> dict:
        """Generate the complete pre-commit config.

        Returns a dict with a single ``local`` repo holding one hook per project,
        extended with whatever the override file contributes.
        """
        hooks = [
            self._generate_lint_fix_hook(project)
            if project.type == ProjectType.PYTHON
            else self._generate_spotless_hook(project)  # ProjectType.JAVA
            for project in self.projects
        ]

        config = {"repos": [{"repo": "local", "hooks": hooks}]}
        self._merge_override_hooks(config)
        return config

    def _merge_override_hooks(self, config: dict) -> None:
        """Merge repos/hooks from the override file into *config* in place."""
        if not self.override_file or not os.path.exists(self.override_file):
            return

        try:
            with open(self.override_file, "r", encoding="utf-8") as f:
                override_config = yaml.safe_load(f)

            if override_config and "repos" in override_config:
                for override_repo in override_config["repos"]:
                    matching_repo = next(
                        (
                            repo
                            for repo in config["repos"]
                            if repo["repo"] == override_repo["repo"]
                        ),
                        None,
                    )

                    if matching_repo:
                        # "hooks:" with no value loads as None; treat it as empty.
                        matching_repo["hooks"].extend(override_repo.get("hooks") or [])
                    else:
                        config["repos"].append(override_repo)

            print(f"Merged additional hooks from {self.override_file}")
        except Exception as e:
            # Deliberately best-effort: a broken override file must not prevent the
            # generated hooks from being written, so report and carry on.
            print(f"Warning: Error reading override file {self.override_file}: {e}")

    # NOTE: in both hook builders project.path is interpolated into the "files"
    # regex unescaped, so a path containing regex metacharacters (e.g. ".") matches
    # slightly more than the literal directory.

    def _generate_lint_fix_hook(self, project: Project) -> dict:
        """Generate a lint-fix hook for Python projects."""
        return {
            "id": f"{project.project_id}-lint-fix",
            "name": f"{project.path} Lint Fix",
            "entry": f"./gradlew {project.gradle_path}:lintFix",
            "language": "system",
            "files": f"^{project.path}/.*\\.py$",
            "pass_filenames": False,
        }

    def _generate_spotless_hook(self, project: Project) -> dict:
        """Generate a spotless hook for Java projects."""
        return {
            "id": f"{project.project_id}-spotless",
            "name": f"{project.path} Spotless Apply",
            "entry": f"./gradlew {project.gradle_path}:spotlessApply",
            "language": "system",
            "files": f"^{project.path}/.*\\.java$",
            "pass_filenames": False,
        }


class PrecommitDumper(yaml.Dumper):
    """Custom YAML dumper that maintains proper indentation."""

    def increase_indent(self, flow=False, *args, **kwargs):
        """Always indent nested sequences (forces ``indentless=False``)."""
        return super().increase_indent(flow=flow, indentless=False)


def write_yaml_with_spaces(file_path: str, data: dict) -> None:
    """Write YAML file with extra spacing between hooks and a timestamp header.

    The complete file content is rendered before the target is opened, so a failure
    while dumping *data* cannot leave a truncated config behind.
    """
    # Timestamp header
    current_time = datetime.datetime.now(datetime.timezone.utc)
    formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S %Z")
    header = (
        f"# Auto-generated by .github/scripts/generate_pre_commit.py at {formatted_time}\n"
        "# Do not edit this file directly. Run the script to regenerate.\n"
        "# Add additional hooks in .github/scripts/pre-commit-override.yaml\n"
    )

    # Render the YAML content
    yaml_str = yaml.dump(
        data, Dumper=PrecommitDumper, sort_keys=False, default_flow_style=False
    )

    # Add extra newline between hooks
    result = []
    in_hook = False

    for line in yaml_str.split("\n"):
        stripped = line.strip()
        if stripped.startswith("- id:"):
            if in_hook:  # If we were already in a hook, add extra newline
                result.append("")
            in_hook = True
        elif not stripped and in_hook:
            in_hook = False

        result.append(line)

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(header)
        f.write("\n".join(result))


def main():
    """Scan the current directory and (re)write ``.pre-commit-config.yaml``."""
    root_dir = os.path.abspath(os.curdir)
    override_file = ".github/scripts/pre-commit-override.yaml"

    # Find projects
    finder = ProjectFinder(root_dir)
    projects = finder.find_all_projects()

    # Print summary
    print("Found projects:")
    print("\nJava projects:")
    for project in projects:
        if project.type == ProjectType.JAVA:
            print(f"  - {project.path}")

    print("\nPython projects:")
    for project in projects:
        if project.type == ProjectType.PYTHON:
            print(f"  - {project.path}")

    # Generate and write config
    generator = HookGenerator(projects, override_file)
    config = generator.generate_config()
    write_yaml_with_spaces(".pre-commit-config.yaml", config)

    print("\nGenerated .pre-commit-config.yaml")


if __name__ == "__main__":
    main()
:metadata-integration:java:spark-lineage:compileJava - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: always() with: name: Test Results (build) @@ -134,6 +134,11 @@ jobs: flags: ${{ matrix.timezone }} name: ${{ matrix.command }} verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} quickstart-compose-validation: runs-on: ubuntu-latest @@ -152,7 +157,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/close-stale-issues.yml b/.github/workflows/close-stale-issues.yml index 98e3041f28804..005f41b767ea6 100644 --- a/.github/workflows/close-stale-issues.yml +++ b/.github/workflows/close-stale-issues.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@v6 + - uses: actions/stale@v9 with: ascending: true operations-per-run: 100 diff --git a/.github/workflows/contributor-open-pr-comment.yml b/.github/workflows/contributor-open-pr-comment.yml index decc7ab27a411..fe60601b0159b 100644 --- a/.github/workflows/contributor-open-pr-comment.yml +++ b/.github/workflows/contributor-open-pr-comment.yml @@ -17,12 +17,12 @@ jobs: - name: Get and Format Username (PR only) if: github.event_name == 'pull_request' run: | - formatted_username=$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g') - echo "FORMATTED_USERNAME=$formatted_username" >> $GITHUB_ENV + formatted_username="$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')" + echo "FORMATTED_USERNAME=${formatted_username}" >> "$GITHUB_ENV" - name: Create Comment (PR only) if: github.event_name == 'pull_request' - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | if 
(context.payload.pull_request) { diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index ae9a0b1605cdf..c29e72367c53c 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -74,6 +74,11 @@ jobs: flags: dagster-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} name: pytest-dagster verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index a5200c7e917d8..e44e6b11c6d05 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -1253,19 +1253,19 @@ jobs: TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}" source .github/scripts/docker_logs.sh - name: Upload logs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }} path: "docker_logs/*.log" retention-days: 5 - name: Upload screenshots - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }} path: smoke-test/tests/cypress/cypress/screenshots/ - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: always() with: name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }} diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml index 2fd814a076485..825f8beda2f56 100644 --- a/.github/workflows/gx-plugin.yml +++ b/.github/workflows/gx-plugin.yml @@ -78,6 +78,11 @@ jobs: flags: gx-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} name: pytest-gx verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + 
with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 106cba1473982..aa404c4c35c50 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -41,9 +41,6 @@ jobs: "testIntegrationBatch1", "testIntegrationBatch2", ] - include: - - python-version: "3.8" - - python-version: "3.11" fail-fast: false steps: - name: Free up disk space @@ -92,15 +89,20 @@ jobs: **/junit.*.xml !**/binary/** - name: Upload coverage to Codecov - if: ${{ always() && matrix.python-version == '3.10' }} + if: ${{ always() }} uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} directory: ./build/coverage-reports/ fail_ci_if_error: false - flags: pytest-${{ matrix.command }} - name: pytest-${{ matrix.python-version }}-${{ matrix.command }} + flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }} + name: pytest-ingestion verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index 2225baecde64c..bcadc641ee2f7 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -70,7 +70,7 @@ jobs: - name: Gradle build (and test) run: | ./gradlew :metadata-io:test - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: always() with: name: Test Results (metadata-io) @@ -90,12 +90,17 @@ jobs: fail_ci_if_error: false name: metadata-io-test verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 
with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index d77142a1f00de..0bce4d5ef19f3 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -67,9 +67,14 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} directory: ./build/coverage-reports/ fail_ci_if_error: false - flags: prefect,prefect-${{ matrix.python-version }} - name: pytest-prefect-${{ matrix.python-version }} + flags: prefect-${{ matrix.python-version }} + name: pytest-prefect verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 23413336404f2..e6a6705a72879 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -72,14 +72,14 @@ jobs: docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true - name: Upload logs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: docker logs path: | "**/build/container-logs/*.log" "*.log" - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: always() with: name: Test Results (smoke tests) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 898e3d262b394..3697efa37770e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,26 +1,445 @@ -exclude: ^$ -files: ^(docs/|docs-website/|metadata-ingestion/) +# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-01-09 10:08:09 UTC +# Do not edit this file directly. Run the script to regenerate. 
+# Add additional hooks in .github/scripts/pre-commit-override.yaml repos: - - repo: https://github.com/pre-commit/mirrors-isort - rev: v5.10.1 + - repo: local hooks: - - id: isort - - repo: https://github.com/ambv/black - rev: 23.1.0 - hooks: - - id: black - - repo: https://github.com/myint/autoflake - rev: v1.4 - hooks: - - id: autoflake - args: - - --in-place - - --remove-unused-variables - - --remove-all-unused-imports - - --expand-star-imports - - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.0.0-alpha.6" # Use the sha or tag you want to point at - hooks: - - id: prettier - args: - - --write \ No newline at end of file + - id: datahub-graphql-core-spotless + name: datahub-graphql-core Spotless Apply + entry: ./gradlew :datahub-graphql-core:spotlessApply + language: system + files: ^datahub-graphql-core/.*\.java$ + pass_filenames: false + + - id: datahub-upgrade-spotless + name: datahub-upgrade Spotless Apply + entry: ./gradlew :datahub-upgrade:spotlessApply + language: system + files: ^datahub-upgrade/.*\.java$ + pass_filenames: false + + - id: entity-registry-spotless + name: entity-registry Spotless Apply + entry: ./gradlew :entity-registry:spotlessApply + language: system + files: ^entity-registry/.*\.java$ + pass_filenames: false + + - id: ingestion-scheduler-spotless + name: ingestion-scheduler Spotless Apply + entry: ./gradlew :ingestion-scheduler:spotlessApply + language: system + files: ^ingestion-scheduler/.*\.java$ + pass_filenames: false + + - id: li-utils-spotless + name: li-utils Spotless Apply + entry: ./gradlew :li-utils:spotlessApply + language: system + files: ^li-utils/.*\.java$ + pass_filenames: false + + - id: metadata-auth-auth-api-spotless + name: metadata-auth/auth-api Spotless Apply + entry: ./gradlew :metadata-auth:auth-api:spotlessApply + language: system + files: ^metadata-auth/auth-api/.*\.java$ + pass_filenames: false + + - id: metadata-dao-impl-kafka-producer-spotless + name: metadata-dao-impl/kafka-producer 
Spotless Apply + entry: ./gradlew :metadata-dao-impl:kafka-producer:spotlessApply + language: system + files: ^metadata-dao-impl/kafka-producer/.*\.java$ + pass_filenames: false + + - id: metadata-events-mxe-avro-spotless + name: metadata-events/mxe-avro Spotless Apply + entry: ./gradlew :metadata-events:mxe-avro:spotlessApply + language: system + files: ^metadata-events/mxe-avro/.*\.java$ + pass_filenames: false + + - id: metadata-events-mxe-registration-spotless + name: metadata-events/mxe-registration Spotless Apply + entry: ./gradlew :metadata-events:mxe-registration:spotlessApply + language: system + files: ^metadata-events/mxe-registration/.*\.java$ + pass_filenames: false + + - id: metadata-events-mxe-schemas-spotless + name: metadata-events/mxe-schemas Spotless Apply + entry: ./gradlew :metadata-events:mxe-schemas:spotlessApply + language: system + files: ^metadata-events/mxe-schemas/.*\.java$ + pass_filenames: false + + - id: metadata-events-mxe-utils-avro-spotless + name: metadata-events/mxe-utils-avro Spotless Apply + entry: ./gradlew :metadata-events:mxe-utils-avro:spotlessApply + language: system + files: ^metadata-events/mxe-utils-avro/.*\.java$ + pass_filenames: false + + - id: metadata-ingestion-lint-fix + name: metadata-ingestion Lint Fix + entry: ./gradlew :metadata-ingestion:lintFix + language: system + files: ^metadata-ingestion/.*\.py$ + pass_filenames: false + + - id: metadata-ingestion-modules-airflow-plugin-lint-fix + name: metadata-ingestion-modules/airflow-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:airflow-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/airflow-plugin/.*\.py$ + pass_filenames: false + + - id: metadata-ingestion-modules-dagster-plugin-lint-fix + name: metadata-ingestion-modules/dagster-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:dagster-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/dagster-plugin/.*\.py$ + pass_filenames: false + + - id: 
metadata-ingestion-modules-gx-plugin-lint-fix + name: metadata-ingestion-modules/gx-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:gx-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/gx-plugin/.*\.py$ + pass_filenames: false + + - id: metadata-ingestion-modules-prefect-plugin-lint-fix + name: metadata-ingestion-modules/prefect-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:prefect-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/prefect-plugin/.*\.py$ + pass_filenames: false + + - id: metadata-integration-java-acryl-spark-lineage-spotless + name: metadata-integration/java/acryl-spark-lineage Spotless Apply + entry: ./gradlew :metadata-integration:java:acryl-spark-lineage:spotlessApply + language: system + files: ^metadata-integration/java/acryl-spark-lineage/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-datahub-client-spotless + name: metadata-integration/java/datahub-client Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-client:spotlessApply + language: system + files: ^metadata-integration/java/datahub-client/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-datahub-event-spotless + name: metadata-integration/java/datahub-event Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-event:spotlessApply + language: system + files: ^metadata-integration/java/datahub-event/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-datahub-protobuf-spotless + name: metadata-integration/java/datahub-protobuf Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-protobuf:spotlessApply + language: system + files: ^metadata-integration/java/datahub-protobuf/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-datahub-schematron-cli-spotless + name: metadata-integration/java/datahub-schematron/cli Spotless Apply + entry: ./gradlew 
:metadata-integration:java:datahub-schematron:cli:spotlessApply + language: system + files: ^metadata-integration/java/datahub-schematron/cli/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-datahub-schematron-lib-spotless + name: metadata-integration/java/datahub-schematron/lib Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-schematron:lib:spotlessApply + language: system + files: ^metadata-integration/java/datahub-schematron/lib/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-examples-spotless + name: metadata-integration/java/examples Spotless Apply + entry: ./gradlew :metadata-integration:java:examples:spotlessApply + language: system + files: ^metadata-integration/java/examples/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-openlineage-converter-spotless + name: metadata-integration/java/openlineage-converter Spotless Apply + entry: ./gradlew :metadata-integration:java:openlineage-converter:spotlessApply + language: system + files: ^metadata-integration/java/openlineage-converter/.*\.java$ + pass_filenames: false + + - id: metadata-integration-java-spark-lineage-legacy-spotless + name: metadata-integration/java/spark-lineage-legacy Spotless Apply + entry: ./gradlew :metadata-integration:java:spark-lineage-legacy:spotlessApply + language: system + files: ^metadata-integration/java/spark-lineage-legacy/.*\.java$ + pass_filenames: false + + - id: metadata-io-spotless + name: metadata-io Spotless Apply + entry: ./gradlew :metadata-io:spotlessApply + language: system + files: ^metadata-io/.*\.java$ + pass_filenames: false + + - id: metadata-io-metadata-io-api-spotless + name: metadata-io/metadata-io-api Spotless Apply + entry: ./gradlew :metadata-io:metadata-io-api:spotlessApply + language: system + files: ^metadata-io/metadata-io-api/.*\.java$ + pass_filenames: false + + - id: metadata-jobs-common-spotless + name: metadata-jobs/common Spotless Apply + entry: ./gradlew 
:metadata-jobs:common:spotlessApply + language: system + files: ^metadata-jobs/common/.*\.java$ + pass_filenames: false + + - id: metadata-jobs-mae-consumer-spotless + name: metadata-jobs/mae-consumer Spotless Apply + entry: ./gradlew :metadata-jobs:mae-consumer:spotlessApply + language: system + files: ^metadata-jobs/mae-consumer/.*\.java$ + pass_filenames: false + + - id: metadata-jobs-mae-consumer-job-spotless + name: metadata-jobs/mae-consumer-job Spotless Apply + entry: ./gradlew :metadata-jobs:mae-consumer-job:spotlessApply + language: system + files: ^metadata-jobs/mae-consumer-job/.*\.java$ + pass_filenames: false + + - id: metadata-jobs-mce-consumer-spotless + name: metadata-jobs/mce-consumer Spotless Apply + entry: ./gradlew :metadata-jobs:mce-consumer:spotlessApply + language: system + files: ^metadata-jobs/mce-consumer/.*\.java$ + pass_filenames: false + + - id: metadata-jobs-mce-consumer-job-spotless + name: metadata-jobs/mce-consumer-job Spotless Apply + entry: ./gradlew :metadata-jobs:mce-consumer-job:spotlessApply + language: system + files: ^metadata-jobs/mce-consumer-job/.*\.java$ + pass_filenames: false + + - id: metadata-jobs-pe-consumer-spotless + name: metadata-jobs/pe-consumer Spotless Apply + entry: ./gradlew :metadata-jobs:pe-consumer:spotlessApply + language: system + files: ^metadata-jobs/pe-consumer/.*\.java$ + pass_filenames: false + + - id: metadata-models-spotless + name: metadata-models Spotless Apply + entry: ./gradlew :metadata-models:spotlessApply + language: system + files: ^metadata-models/.*\.java$ + pass_filenames: false + + - id: metadata-models-custom-spotless + name: metadata-models-custom Spotless Apply + entry: ./gradlew :metadata-models-custom:spotlessApply + language: system + files: ^metadata-models-custom/.*\.java$ + pass_filenames: false + + - id: metadata-models-validator-spotless + name: metadata-models-validator Spotless Apply + entry: ./gradlew :metadata-models-validator:spotlessApply + language: system + files: 
^metadata-models-validator/.*\.java$ + pass_filenames: false + + - id: metadata-operation-context-spotless + name: metadata-operation-context Spotless Apply + entry: ./gradlew :metadata-operation-context:spotlessApply + language: system + files: ^metadata-operation-context/.*\.java$ + pass_filenames: false + + - id: metadata-service-auth-config-spotless + name: metadata-service/auth-config Spotless Apply + entry: ./gradlew :metadata-service:auth-config:spotlessApply + language: system + files: ^metadata-service/auth-config/.*\.java$ + pass_filenames: false + + - id: metadata-service-auth-filter-spotless + name: metadata-service/auth-filter Spotless Apply + entry: ./gradlew :metadata-service:auth-filter:spotlessApply + language: system + files: ^metadata-service/auth-filter/.*\.java$ + pass_filenames: false + + - id: metadata-service-auth-impl-spotless + name: metadata-service/auth-impl Spotless Apply + entry: ./gradlew :metadata-service:auth-impl:spotlessApply + language: system + files: ^metadata-service/auth-impl/.*\.java$ + pass_filenames: false + + - id: metadata-service-auth-servlet-impl-spotless + name: metadata-service/auth-servlet-impl Spotless Apply + entry: ./gradlew :metadata-service:auth-servlet-impl:spotlessApply + language: system + files: ^metadata-service/auth-servlet-impl/.*\.java$ + pass_filenames: false + + - id: metadata-service-configuration-spotless + name: metadata-service/configuration Spotless Apply + entry: ./gradlew :metadata-service:configuration:spotlessApply + language: system + files: ^metadata-service/configuration/.*\.java$ + pass_filenames: false + + - id: metadata-service-factories-spotless + name: metadata-service/factories Spotless Apply + entry: ./gradlew :metadata-service:factories:spotlessApply + language: system + files: ^metadata-service/factories/.*\.java$ + pass_filenames: false + + - id: metadata-service-graphql-servlet-impl-spotless + name: metadata-service/graphql-servlet-impl Spotless Apply + entry: ./gradlew 
:metadata-service:graphql-servlet-impl:spotlessApply + language: system + files: ^metadata-service/graphql-servlet-impl/.*\.java$ + pass_filenames: false + + - id: metadata-service-openapi-analytics-servlet-spotless + name: metadata-service/openapi-analytics-servlet Spotless Apply + entry: ./gradlew :metadata-service:openapi-analytics-servlet:spotlessApply + language: system + files: ^metadata-service/openapi-analytics-servlet/.*\.java$ + pass_filenames: false + + - id: metadata-service-openapi-entity-servlet-spotless + name: metadata-service/openapi-entity-servlet Spotless Apply + entry: ./gradlew :metadata-service:openapi-entity-servlet:spotlessApply + language: system + files: ^metadata-service/openapi-entity-servlet/.*\.java$ + pass_filenames: false + + - id: metadata-service-openapi-entity-servlet-generators-spotless + name: metadata-service/openapi-entity-servlet/generators Spotless Apply + entry: ./gradlew :metadata-service:openapi-entity-servlet:generators:spotlessApply + language: system + files: ^metadata-service/openapi-entity-servlet/generators/.*\.java$ + pass_filenames: false + + - id: metadata-service-openapi-servlet-spotless + name: metadata-service/openapi-servlet Spotless Apply + entry: ./gradlew :metadata-service:openapi-servlet:spotlessApply + language: system + files: ^metadata-service/openapi-servlet/.*\.java$ + pass_filenames: false + + - id: metadata-service-openapi-servlet-models-spotless + name: metadata-service/openapi-servlet/models Spotless Apply + entry: ./gradlew :metadata-service:openapi-servlet:models:spotlessApply + language: system + files: ^metadata-service/openapi-servlet/models/.*\.java$ + pass_filenames: false + + - id: metadata-service-plugin-spotless + name: metadata-service/plugin Spotless Apply + entry: ./gradlew :metadata-service:plugin:spotlessApply + language: system + files: ^metadata-service/plugin/.*\.java$ + pass_filenames: false + + - id: metadata-service-plugin-src-test-sample-test-plugins-spotless + name: 
metadata-service/plugin/src/test/sample-test-plugins Spotless Apply + entry: ./gradlew :metadata-service:plugin:src:test:sample-test-plugins:spotlessApply + language: system + files: ^metadata-service/plugin/src/test/sample-test-plugins/.*\.java$ + pass_filenames: false + + - id: metadata-service-restli-client-spotless + name: metadata-service/restli-client Spotless Apply + entry: ./gradlew :metadata-service:restli-client:spotlessApply + language: system + files: ^metadata-service/restli-client/.*\.java$ + pass_filenames: false + + - id: metadata-service-restli-client-api-spotless + name: metadata-service/restli-client-api Spotless Apply + entry: ./gradlew :metadata-service:restli-client-api:spotlessApply + language: system + files: ^metadata-service/restli-client-api/.*\.java$ + pass_filenames: false + + - id: metadata-service-restli-servlet-impl-spotless + name: metadata-service/restli-servlet-impl Spotless Apply + entry: ./gradlew :metadata-service:restli-servlet-impl:spotlessApply + language: system + files: ^metadata-service/restli-servlet-impl/.*\.java$ + pass_filenames: false + + - id: metadata-service-schema-registry-api-spotless + name: metadata-service/schema-registry-api Spotless Apply + entry: ./gradlew :metadata-service:schema-registry-api:spotlessApply + language: system + files: ^metadata-service/schema-registry-api/.*\.java$ + pass_filenames: false + + - id: metadata-service-schema-registry-servlet-spotless + name: metadata-service/schema-registry-servlet Spotless Apply + entry: ./gradlew :metadata-service:schema-registry-servlet:spotlessApply + language: system + files: ^metadata-service/schema-registry-servlet/.*\.java$ + pass_filenames: false + + - id: metadata-service-services-spotless + name: metadata-service/services Spotless Apply + entry: ./gradlew :metadata-service:services:spotlessApply + language: system + files: ^metadata-service/services/.*\.java$ + pass_filenames: false + + - id: metadata-service-servlet-spotless + name: 
metadata-service/servlet Spotless Apply + entry: ./gradlew :metadata-service:servlet:spotlessApply + language: system + files: ^metadata-service/servlet/.*\.java$ + pass_filenames: false + + - id: metadata-utils-spotless + name: metadata-utils Spotless Apply + entry: ./gradlew :metadata-utils:spotlessApply + language: system + files: ^metadata-utils/.*\.java$ + pass_filenames: false + + - id: mock-entity-registry-spotless + name: mock-entity-registry Spotless Apply + entry: ./gradlew :mock-entity-registry:spotlessApply + language: system + files: ^mock-entity-registry/.*\.java$ + pass_filenames: false + + - id: smoke-test-lint-fix + name: smoke-test Lint Fix + entry: ./gradlew :smoke-test:lintFix + language: system + files: ^smoke-test/.*\.py$ + pass_filenames: false + + - id: test-models-spotless + name: test-models Spotless Apply + entry: ./gradlew :test-models:spotlessApply + language: system + files: ^test-models/.*\.java$ + pass_filenames: false + + - id: smoke-test-cypress-lint-fix + name: smoke-test cypress Lint Fix + entry: ./gradlew :smoke-test:cypressLintFix + language: system + files: ^smoke-test/tests/cypress/.*$ diff --git a/build.gradle b/build.gradle index 8929b4e644972..eff36ee3a7977 100644 --- a/build.gradle +++ b/build.gradle @@ -211,7 +211,7 @@ project.ext.externalDependency = [ 'mockitoInline': 'org.mockito:mockito-inline:4.11.0', 'mockServer': 'org.mock-server:mockserver-netty:5.11.2', 'mockServerClient': 'org.mock-server:mockserver-client-java:5.11.2', - 'mysqlConnector': 'mysql:mysql-connector-java:8.0.28', + 'mysqlConnector': 'com.mysql:mysql-connector-j:8.4.0', 'neo4jHarness': 'org.neo4j.test:neo4j-harness:' + neo4jTestVersion, 'neo4jJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jVersion, 'neo4jTestJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jTestVersion, @@ -235,7 +235,7 @@ project.ext.externalDependency = [ 'playFilters': "com.typesafe.play:filters-helpers_$playScalaVersion:$playVersion", 'pac4j': 
'org.pac4j:pac4j-oidc:6.0.6', 'playPac4j': "org.pac4j:play-pac4j_$playScalaVersion:12.0.0-PLAY2.8", - 'postgresql': 'org.postgresql:postgresql:42.3.9', + 'postgresql': 'org.postgresql:postgresql:42.7.4', 'protobuf': 'com.google.protobuf:protobuf-java:3.25.5', 'grpcProtobuf': 'io.grpc:grpc-protobuf:1.53.0', 'rangerCommons': 'org.apache.ranger:ranger-plugins-common:2.3.0', @@ -474,10 +474,6 @@ subprojects { if (compileJavaTask != null) { spotlessJavaTask.dependsOn compileJavaTask } - // TODO - Do not run this in CI. How? - // tasks.withType(JavaCompile) { - // finalizedBy(tasks.findByName('spotlessApply')) - // } } } diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java index b95515684f01f..3de0170fc7038 100644 --- a/datahub-frontend/app/auth/AuthModule.java +++ b/datahub-frontend/app/auth/AuthModule.java @@ -181,7 +181,12 @@ protected OperationContext provideOperationContext( final Authentication systemAuthentication, final ConfigurationProvider configurationProvider) { ActorContext systemActorContext = - ActorContext.builder().systemAuth(true).authentication(systemAuthentication).build(); + ActorContext.builder() + .systemAuth(true) + .authentication(systemAuthentication) + .enforceExistenceEnabled( + configurationProvider.getAuthentication().isEnforceExistenceEnabled()) + .build(); OperationContextConfig systemConfig = OperationContextConfig.builder() .viewAuthorizationConfiguration(configurationProvider.getAuthorization().getView()) @@ -197,7 +202,9 @@ protected OperationContext provideOperationContext( .entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY)) .validationContext(ValidationContext.builder().alternateValidation(false).build()) .retrieverContext(RetrieverContext.EMPTY) - .build(systemAuthentication); + .build( + systemAuthentication, + configurationProvider.getAuthentication().isEnforceExistenceEnabled()); } @Provides diff --git 
a/datahub-frontend/app/config/ConfigurationProvider.java b/datahub-frontend/app/config/ConfigurationProvider.java index 97e916769a6c4..9bc28be1bfc89 100644 --- a/datahub-frontend/app/config/ConfigurationProvider.java +++ b/datahub-frontend/app/config/ConfigurationProvider.java @@ -1,5 +1,6 @@ package config; +import com.datahub.authentication.AuthenticationConfiguration; import com.datahub.authorization.AuthorizationConfiguration; import com.linkedin.metadata.config.VisualConfiguration; import com.linkedin.metadata.config.cache.CacheConfiguration; @@ -30,4 +31,7 @@ public class ConfigurationProvider { /** Configuration for authorization */ private AuthorizationConfiguration authorization; + + /** Configuration for authentication */ + private AuthenticationConfiguration authentication; } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java index fdd84da6044f7..d0493019a40af 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java @@ -194,7 +194,8 @@ protected OperationContext javaSystemOperationContext( ValidationContext.builder() .alternateValidation( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) - .build()); + .build(), + true); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); diff --git a/docs/api/tutorials/structured-properties.md b/docs/api/tutorials/structured-properties.md index 2caa015e20659..ed270811b82e9 100644 --- a/docs/api/tutorials/structured-properties.md +++ b/docs/api/tutorials/structured-properties.md @@ -6,7 +6,7 @@ import TabItem from '@theme/TabItem'; ## Why Would You Use Structured Properties? 
Structured properties are a structured, named set of properties that can be attached to logical entities like Datasets, DataJobs, etc. -Structured properties have values that are types. Conceptually, they are like “field definitions”. +Structured properties have values that are typed and support constraints. Learn more about structured properties in the [Structured Properties Feature Guide](../../../docs/features/feature-guides/properties/overview.md). @@ -15,6 +15,7 @@ Learn more about structured properties in the [Structured Properties Feature Gui This guide will show you how to execute the following actions with structured properties. - Create structured properties +- List structured properties - Read structured properties - Delete structured properties - Add structured properties to a dataset @@ -32,7 +33,8 @@ Additionally, you need to have the following tools installed according to the me -Install the relevant CLI version. Forms are available as of CLI version `0.13.1`. The corresponding DataHub Cloud release version is `v0.2.16.5` +Install the relevant CLI version. +Structured Properties were introduced in version `0.13.1`, but we continuously improve and add new functionality, so you should always [upgrade](https://datahubproject.io/docs/cli/#installation) to the latest cli for best results. Connect to your instance via [init](https://datahubproject.io/docs/cli/#init): - Run `datahub init` to update the instance you want to load into. @@ -56,33 +58,8 @@ Requirements for OpenAPI are: The following code will create a structured property `io.acryl.privacy.retentionTime`. 
- -```graphql -mutation createStructuredProperty { - createStructuredProperty( - input: { - id: "retentionTime", - qualifiedName:"retentionTime", - displayName: "Retention Time", - description: "Retention Time is used to figure out how long to retain records in a dataset", - valueType: "urn:li:dataType:datahub.number", - allowedValues: [ - {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"}, - {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"}, - {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} - ], - cardinality: SINGLE, - entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], - } - ) { - urn - } -} -``` - - - + Create a yaml file representing the properties you’d like to load. For example, below file represents a property `io.acryl.privacy.retentionTime`. You can see the full example [here](https://github.com/datahub-project/datahub/blob/example-yaml-sp/metadata-ingestion/examples/structured_properties/struct_props.yaml). @@ -108,13 +85,41 @@ For example, below file represents a property `io.acryl.privacy.retentionTime`. 
``` Use the CLI to create your properties: -```commandline +```shell datahub properties upsert -f {properties_yaml} ``` If successful, you should see `Created structured property urn:li:structuredProperty:...` + + + +```graphql +mutation createStructuredProperty { + createStructuredProperty( + input: { + id: "retentionTime", + qualifiedName:"retentionTime", + displayName: "Retention Time", + description: "Retention Time is used to figure out how long to retain records in a dataset", + valueType: "urn:li:dataType:datahub.number", + allowedValues: [ + {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"}, + {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"}, + {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} + ], + cardinality: SINGLE, + entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], + } + ) { + urn + } +} +``` + + + ```shell @@ -236,9 +241,182 @@ Example Response: -## Read Structured Properties +## List Structured Properties + +You can list all structured properties in your DataHub instance using the following methods: + + + + +```shell +datahub properties list +``` + +This will show all properties with their full details. 
+ +Example Response: +```json +{ + "urn": "urn:li:structuredProperty:clusterName", + "qualified_name": "clusterName", + "type": "urn:li:dataType:datahub.string", + "description": "Test Cluster Name Property", + "display_name": "Cluster's name", + "entity_types": [ + "urn:li:entityType:datahub.dataset" + ], + "cardinality": "SINGLE" +} +{ + "urn": "urn:li:structuredProperty:projectNames", + "qualified_name": "projectNames", + "type": "urn:li:dataType:datahub.string", + "description": "Test property for project name", + "display_name": "Project Name", + "entity_types": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow" + ], + "cardinality": "MULTIPLE", + "allowed_values": [ + { + "value": "Tracking", + "description": "test value 1 for project" + }, + { + "value": "DataHub", + "description": "test value 2 for project" + } + ] +} +``` + + +If you only want to see the URNs, you can use: + +```shell +datahub properties list --no-details +``` + +Example Response: +``` +[2025-01-08 22:23:00,625] INFO {datahub.cli.specific.structuredproperties_cli:134} - Listing structured property urns only, use --details for more information +urn:li:structuredProperty:clusterName +urn:li:structuredProperty:clusterType +urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate +urn:li:structuredProperty:projectNames +``` + +To download all the structured property definitions into a single file that you can use with the `upsert` command as described in the [create section](#create-structured-properties), you can run the list command with the `--to-file` option. 
+ +```shell +datahub properties list --to-file structured_properties.yaml +``` + +Example Response: +```yaml + - urn: urn:li:structuredProperty:clusterName + qualified_name: clusterName + type: urn:li:dataType:datahub.string + description: Test Cluster Name Property + display_name: Cluster's name + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: SINGLE + - urn: urn:li:structuredProperty:clusterType + qualified_name: clusterType + type: urn:li:dataType:datahub.string + description: Test Cluster Type Property + display_name: Cluster's type + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: SINGLE + - urn: urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate + qualified_name: io.acryl.dataManagement.deprecationDate + type: urn:li:dataType:datahub.date + display_name: Deprecation Date + entity_types: + - urn:li:entityType:datahub.dataset + - urn:li:entityType:datahub.dataFlow + - urn:li:entityType:datahub.dataJob + - urn:li:entityType:datahub.schemaField + cardinality: SINGLE + - urn: urn:li:structuredProperty:io.acryl.privacy.enumProperty5712 + qualified_name: io.acryl.privacy.enumProperty5712 + type: urn:li:dataType:datahub.string + description: The retention policy for the dataset + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: MULTIPLE + allowed_values: + - value: foo + - value: bar +... etc. 
+``` + + + + + +Example Request: +```bash +curl -X 'GET' \ + 'http://localhost:9002/openapi/v3/entity/structuredproperty?systemMetadata=false&includeSoftDelete=false&skipCache=false&aspects=structuredPropertySettings&aspects=propertyDefinition&aspects=institutionalMemory&aspects=structuredPropertyKey&aspects=status&count=10&sortCriteria=urn&sortOrder=ASCENDING&query=*' \ + -H 'accept: application/json' +``` + +Example Response: +```json +{ + "scrollId": "...", + "entities": [ + { + "urn": "urn:li:structuredProperty:clusterName", + "propertyDefinition": { + "value": { + "immutable": false, + "qualifiedName": "clusterName", + "displayName": "Cluster's name", + "valueType": "urn:li:dataType:datahub.string", + "description": "Test Cluster Name Property", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "cardinality": "SINGLE" + } + }, + "structuredPropertyKey": { + "value": { + "id": "clusterName" + } + } + } + ] +} +``` + +Key Query Parameters: +- `count`: Number of results to return per page (default: 10) +- `sortCriteria`: Field to sort by (default: urn) +- `sortOrder`: Sort order (ASCENDING or DESCENDING) +- `query`: Search query to filter properties (* for all) + + + + +The list endpoint returns all structured properties in your DataHub instance. Each property includes: +- URN: Unique identifier for the property +- Qualified Name: The property's qualified name +- Type: The data type of the property (string, number, date, etc.) 
+- Description: A description of the property's purpose +- Display Name: Human-readable name for the property +- Entity Types: The types of entities this property can be applied to +- Cardinality: Whether the property accepts single (SINGLE) or multiple (MULTIPLE) values +- Allowed Values: If specified, the list of allowed values for this property -You can see the properties you created by running the following command: +## Read a single Structured Property + +You can read an individual property you created by running the following command: @@ -279,6 +457,91 @@ If successful, you should see metadata about your properties returned. } ``` + + + +Example Request: +```graphql +query { + structuredProperty(urn: "urn:li:structuredProperty:projectNames") { + urn + type + definition { + qualifiedName + displayName + description + cardinality + allowedValues { + value { + ... on StringValue { + stringValue + } + ... on NumberValue { + numberValue + } + } + description + } + entityTypes { + urn + info { + type + qualifiedName + } + } + } + } +} +``` + +Example Response: +```json +{ + "data": { + "structuredProperty": { + "urn": "urn:li:structuredProperty:projectNames", + "type": "STRUCTURED_PROPERTY", + "definition": { + "qualifiedName": "projectNames", + "displayName": "Project Name", + "description": "Test property for project name", + "cardinality": "MULTIPLE", + "allowedValues": [ + { + "value": { + "stringValue": "Tracking" + }, + "description": "test value 1 for project" + }, + { + "value": { + "stringValue": "DataHub" + }, + "description": "test value 2 for project" + } + ], + "entityTypes": [ + { + "urn": "urn:li:entityType:datahub.dataset", + "info": { + "type": "DATASET", + "qualifiedName": "datahub.dataset" + } + }, + { + "urn": "urn:li:entityType:datahub.dataFlow", + "info": { + "type": "DATA_FLOW", + "qualifiedName": "datahub.dataFlow" + } + } + ] + } + } + }, + "extensions": {} +} +``` @@ -389,7 +652,7 @@ Example Response: This action will set/replace all 
structured properties on the entity. See PATCH operations to add/remove a single property. - + ```graphql mutation upsertStructuredProperties { @@ -537,7 +800,7 @@ datahub dataset get --urn {urn} For reading all structured properties from a dataset: - + ```graphql query getDataset { diff --git a/docs/authentication/guides/add-users.md b/docs/authentication/guides/add-users.md index 30da5c9f229f9..dbd44b6308678 100644 --- a/docs/authentication/guides/add-users.md +++ b/docs/authentication/guides/add-users.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Onboarding Users to DataHub New user accounts can be provisioned on DataHub in 3 ways: @@ -94,6 +97,11 @@ using this mechanism. It is highly recommended that admins change or remove the ## Adding new users using a user.props file +:::NOTE +Adding users via the `user.props` will require disabling existence checks on GMS using the `METADATA_SERVICE_AUTH_ENFORCE_EXISTENCE_ENABLED=false` environment variable or using the API to enable the user prior to login. +The directions below demonstrate using the API to enable the user. +::: + To define a set of username / password combinations that should be allowed to log in to DataHub (in addition to the root 'datahub' user), create a new file called `user.props` at the file path `${HOME}/.datahub/plugins/frontend/auth/user.props` within the `datahub-frontend-react` container or pod. @@ -107,6 +115,28 @@ janesmith:janespassword johndoe:johnspassword ``` +In order to enable the user access with the credential defined in `user.props`, set the `status` aspect on the user with an Admin user. This can be done using an API call or via the [OpenAPI UI interface](/docs/api/openapi/openapi-usage-guide.md). + + + + +Example enabling login for the `janesmith` user from the example above. Make sure to update the example with your access token. 
+ +```shell +curl -X 'POST' \ + 'http://localhost:9002/openapi/v3/entity/corpuser/urn%3Ali%3Acorpuser%3Ajanesmith/status?async=false&systemMetadata=false&createIfEntityNotExists=false&createIfNotExists=true' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer ' \ + -d '{ + "value": { + "removed": false + } +}' +``` + + + Once you've saved the file, simply start the DataHub containers & navigate to `http://localhost:9002/login` to verify that your new credentials work. diff --git a/docs/businessattributes.md b/docs/businessattributes.md index 3e912e7e60980..2359c2ac85b58 100644 --- a/docs/businessattributes.md +++ b/docs/businessattributes.md @@ -1,5 +1,10 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + # Business Attributes + + +>**Note:** This is BETA feature ## What are Business Attributes A Business Attribute, as its name implies, is an attribute with a business focus. It embodies the traits or properties of an entity within a business framework. This attribute is a crucial piece of data for a business, utilised to define or control the entity throughout the organisation. If a business process or concept is depicted as a comprehensive logical model, then each Business Attribute can be considered as an individual component within that model. While business names and descriptions are generally managed through glossary terms, Business Attributes encompass additional characteristics such as data quality rules/assertions, data privacy markers, data usage protocols, standard tags, and supplementary documentation, alongside Names and Descriptions. @@ -70,9 +75,11 @@ Description inherited from business attribute is greyed out to differentiate bet

### Enable Business Attributes Feature -By default, business attribute is disabled. To enable Business Attributes feature, set the following configuration in [application.yaml](../metadata-service/configuration/src/main/resources/application.yaml) - -businessAttributeEntityEnabled : true +By default, the Business Attributes feature is disabled. To enable it, export the following environment variable +(this may be done via `extraEnvs` for the GMS deployment): +```shell +BUSINESS_ATTRIBUTE_ENTITY_ENABLED=true +``` ### What updates are planned for the Business Attributes feature? diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md index e36940bf39835..1b1a9952f7898 100644 --- a/docs/how/delete-metadata.md +++ b/docs/how/delete-metadata.md @@ -97,6 +97,21 @@ The start and end time fields filter on the `timestampMillis` field of the times - `ddddddddd` (e.g. `1684384045`): a unix timestamp - `min`, `max`, `now`: special keywords +#### Undoing soft deletion of entities + +You can restore soft-deleted entities using the `undo-by-filter` command. This reverts the effect of a soft delete.
+ +```shell +# Restore (un-soft-delete) a single soft-deleted entity +datahub delete undo-by-filter --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)" + +# Restore all soft-deleted entities from a specific platform +datahub delete undo-by-filter --platform snowflake + +# You can adjust the batch size (default 3000, max 10000) for better performance +datahub delete undo-by-filter --platform snowflake --batch-size 5000 +``` + ## Delete CLI Examples :::note diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index c404c1863dc7c..68b41c907c6ad 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -18,6 +18,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ## Next - #12191 - Configs `include_view_lineage` and `include_view_column_lineage` are removed from snowflake ingestion source. View and External Table DDL lineage will always be ingested when definitions are available. +- #12181 - Configs `include_view_lineage`, `include_view_column_lineage` and `lineage_parse_view_ddl` are removed from bigquery ingestion source. View and Snapshot lineage will always be ingested when definitions are available. - #11560 - The PowerBI ingestion source configuration option include_workspace_name_in_dataset_urn determines whether the workspace name is included in the PowerBI dataset's URN.
PowerBI allows semantic models and their tables to have identical names across workspaces, which will overwrite the semantic model in the case of multi-workspace ingestion.
Entity urn with `include_workspace_name_in_dataset_urn: false` @@ -66,6 +67,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe changed to NOT fill out `created` and `lastModified` auditstamps by default for input and output dataset edges. This should not have any user-observable impact (time-based lineage viz will still continue working based on observed time), but could break assumptions previously being made by clients. +- #12158 - Users provisioned with `user.props` will need to be enabled before login in order to be granted access to DataHub. ### Potential Downtime diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 2693aab0700da..d07063dbffc5c 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -119,6 +119,7 @@ def get_long_description(): "pendulum<3.0", "Flask-Session<0.6.0", "connexion<3.0", + "marshmallow<3.24.0", }, } diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index fc1409fbed74e..ac8658bd86927 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -127,6 +127,9 @@ task lintFix(type: Exec, dependsOn: installDev) { "mypy --show-traceback --show-error-codes src/ tests/ examples/" } +def pytest_default_env = "PYTHONDEVMODE=1" +def pytest_default_args = "--durations=30 -vv --continue-on-collection-errors" + task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJsonSchema']) { // We can't enforce the coverage requirements if we run a subset of the tests. 
inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) @@ -135,7 +138,7 @@ task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJso def cvg_arg = get_coverage_args("quick") commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "pytest ${cvg_arg} tests/unit --random-order --durations=20 -m 'not integration' -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" + "${pytest_default_env} pytest ${cvg_arg} tests/unit ${pytest_default_args} --random-order -m 'not integration' --junit-xml=junit.quick.xml" } task installDevTest(type: Exec, dependsOn: [install]) { @@ -155,7 +158,7 @@ task testSingle(dependsOn: [installDevTest]) { if (testFile != 'unknown') { exec { commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && pytest ${testFile}" + "source ${venv_name}/bin/activate && ${pytest_default_env} pytest ${testFile} ${pytest_default_args}" } } else { throw new GradleException("No file provided. Use -PtestFile=") @@ -167,25 +170,25 @@ task testIntegrationBatch0(type: Exec, dependsOn: [installDevTest]) { def cvg_arg = get_coverage_args("intBatch0") commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "pytest ${cvg_arg} --durations=50 -m 'integration_batch_0' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch0.xml" + "${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_0' --junit-xml=junit.integrationbatch0.xml" } task testIntegrationBatch1(type: Exec, dependsOn: [installDevTest]) { def cvg_arg = get_coverage_args("intBatch1") commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "pytest ${cvg_arg} --durations=50 -m 'integration_batch_1' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch1.xml" + "${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_1' --junit-xml=junit.integrationbatch1.xml" } task testIntegrationBatch2(type: Exec, dependsOn: 
[installDevTest]) { def cvg_arg = get_coverage_args("intBatch2") commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "pytest ${cvg_arg} --durations=20 -m 'integration_batch_2' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch2.xml" + "${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_2' --junit-xml=junit.integrationbatch2.xml" } task testFull(type: Exec, dependsOn: [installDevTest]) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "pytest --durations=50 -vv --continue-on-collection-errors --junit-xml=junit.full.xml" + "${pytest_default_env} pytest ${pytest_default_args} --junit-xml=junit.full.xml" } task specGen(type: Exec, dependsOn: [codegen, installDevTest]) { diff --git a/metadata-ingestion/docs/dev_guides/classification.md b/metadata-ingestion/docs/dev_guides/classification.md index 39eac229a6601..457725b6783e5 100644 --- a/metadata-ingestion/docs/dev_guides/classification.md +++ b/metadata-ingestion/docs/dev_guides/classification.md @@ -7,10 +7,10 @@ The classification feature enables sources to be configured to automatically pre Note that a `.` is used to denote nested fields in the YAML recipe. 
| Field | Required | Type | Description | Default | -| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | +| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |------------------------------------------------------------| | enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False | | sample_size | | int | Number of sample values used for classification. | 100 | -| max_workers | | int | Number of worker processes to use for classification. Set to 1 to disable. | Number of cpu cores or 4 | +| max_workers | | int | Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable. | 1 | | info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. | | classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedance. 
| [{'type': 'datahub', 'config': None}] | | table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} | diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml index 7e8dbcff88e1c..3226f23c963dd 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml @@ -4,6 +4,9 @@ source: # This option is recommended to be used to ingest all lineage ignore_start_time_lineage: true + # This flag tells the snowflake ingestion to use the more advanced query parsing. This will become the default eventually. + use_queries_v2: true + # Coordinates account_id: "abc48144" warehouse: "COMPUTE_WH" diff --git a/metadata-ingestion/docs/sources/tableau/tableau_pre.md b/metadata-ingestion/docs/sources/tableau/tableau_pre.md index aeb67f85b241b..65ff08367fdc8 100644 --- a/metadata-ingestion/docs/sources/tableau/tableau_pre.md +++ b/metadata-ingestion/docs/sources/tableau/tableau_pre.md @@ -3,9 +3,24 @@ In order to ingest metadata from Tableau, you will need: - Tableau Server Version 2021.1.10 and above. It may also work for older versions. -- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if its not already enabled. 
-- Tableau Credentials (Username/Password or [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens)) -- The user or token must have **Site Administrator Explorer** permissions. +- [Enable the Tableau Metadata API](https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server) for Tableau Server, if it's not already enabled. This is always enabled for Tableau Cloud. + +### Authentication + +DataHub supports two authentication methods: + +1. Username/Password +2. [Personal Access Token](https://help.tableau.com/current/pro/desktop/en-us/useracct.htm#create-and-revoke-personal-access-tokens) + +Either way, the user/token must have the **Site Administrator Explorer** site role. + +:::info + +We need the `Site Administrator Explorer` site role in order to get complete metadata from Tableau. + +With any lower role, the Tableau Metadata API returns missing/partial metadata. This particularly affects data source fields and definitions, which impacts our ability to extract columns and generate column lineage. As such, other site roles like `Viewer` are insufficient with the current Tableau Metadata API.
+ +::: ### Ingestion through UI @@ -46,8 +61,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce | Source Concept | DataHub Concept | Notes | | --------------------------- | ------------------------------------------------------------- | --------------------------------- | -| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) | -| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` | +| `"Tableau"` | [Data Platform](../../metamodel/entities/dataPlatform.md) | +| Project | [Container](../../metamodel/entities/container.md) | SubType `"Project"` | | Embedded DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Embedded Data Source"` | | Published DataSource | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Published Data Source"` | | Custom SQL Table | [Dataset](../../metamodel/entities/dataset.md) | SubTypes `"View"`, `"Custom SQL"` | @@ -75,14 +90,15 @@ Lineage is emitted as received from Tableau's metadata API for ### Troubleshooting -### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project? +#### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project? This may happen when the Tableau API returns NODE_LIMIT_EXCEEDED error in response to metadata query and returns partial results with message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, consider - reducing the page size using the `page_size` config param in datahub recipe (Defaults to 10). - increasing tableau configuration [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) to higher value. 
-### `PERMISSIONS_MODE_SWITCHED` error in ingestion report +#### `PERMISSIONS_MODE_SWITCHED` error in ingestion report + This error occurs if the Tableau site is using external assets. For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm). Follow the below steps to enable the derived permissions: diff --git a/metadata-ingestion/examples/mce_files/bootstrap_mce.json b/metadata-ingestion/examples/mce_files/bootstrap_mce.json index bc218e5e8c2d5..d4e3d3aa5d8c4 100644 --- a/metadata-ingestion/examples/mce_files/bootstrap_mce.json +++ b/metadata-ingestion/examples/mce_files/bootstrap_mce.json @@ -3394,7 +3394,7 @@ "changeType":"UPSERT", "aspectName":"datasetProfile", "aspect":{ - "value":"{\"timestampMillis\": 1723488954865, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}", + "value":"{\"timestampMillis\": 1735823280000, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}", "contentType":"application/json" }, "systemMetadata":null @@ -3418,7 +3418,7 @@ "changeType":"UPSERT", "aspectName":"operation", "aspect":{ - "value":"{\"timestampMillis\": 1679515693000, \"operationType\": \"INSERT\", \"lastUpdatedTimestamp\": 1629097200001 }", + 
"value":"{\"timestampMillis\": 1711138093000, \"operationType\": \"INSERT\", \"lastUpdatedTimestamp\": 1629097200001 }", "contentType":"application/json" }, "systemMetadata":null @@ -3584,7 +3584,7 @@ "changeType": "UPSERT", "aspectName": "assertionRunEvent", "aspect": { - "value": "{\"timestampMillis\": 1675155843000, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"{\\\"category\\\": \\\"catA\\\"}\"}, \"runId\": \"2021-12-28T12:00:00Z\", \"assertionUrn\": \"urn:li:assertion:358c683782c93c2fc2bd4bdd4fdb0153\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)\", \"batchSpec\": {\"customProperties\": {\"data_asset_name\": \"data__foo1__asset\", \"datasource_name\": \"my_hive_datasource\"}, \"nativeBatchId\": \"c8f12129f2e57412eee5fb8656154d05\", \"limit\": 10}, \"status\": \"COMPLETE\", \"result\": {\"type\": \"SUCCESS\", \"nativeResults\": {}}}", + "value": "{\"timestampMillis\": 1730554659000, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"{\\\"category\\\": \\\"catA\\\"}\"}, \"runId\": \"2021-12-28T12:00:00Z\", \"assertionUrn\": \"urn:li:assertion:358c683782c93c2fc2bd4bdd4fdb0153\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)\", \"batchSpec\": {\"customProperties\": {\"data_asset_name\": \"data__foo1__asset\", \"datasource_name\": \"my_hive_datasource\"}, \"nativeBatchId\": \"c8f12129f2e57412eee5fb8656154d05\", \"limit\": 10}, \"status\": \"COMPLETE\", \"result\": {\"type\": \"SUCCESS\", \"nativeResults\": {}}}", "contentType": "application/json" }, "systemMetadata": null diff --git a/metadata-ingestion/examples/structured_properties/list_structured_properties.py b/metadata-ingestion/examples/structured_properties/list_structured_properties.py new file mode 100644 index 0000000000000..66ac90c1228a3 --- /dev/null +++ b/metadata-ingestion/examples/structured_properties/list_structured_properties.py @@ -0,0 +1,12 @@ +# Usage: python3 list_structured_properties.py 
+# Expected Output: List of structured properties +# This script lists all structured properties in DataHub +from datahub.api.entities.structuredproperties.structuredproperties import ( + StructuredProperties, +) +from datahub.ingestion.graph.client import get_default_graph + +with get_default_graph() as graph: + structuredproperties = StructuredProperties.list(graph) + for structuredproperty in structuredproperties: + print(structuredproperty.dict()) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 8357262537bcf..d5dbb98d3cb17 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -207,7 +207,7 @@ # Clickhouse 0.8.3 adds support for SQLAlchemy 1.4.x "sqlalchemy-redshift>=0.8.3", "GeoAlchemy2", - "redshift-connector>=2.1.0", + "redshift-connector>=2.1.5", *path_spec_common, } @@ -461,7 +461,7 @@ "mssql-odbc": sql_common | mssql_common | {"pyodbc"}, "mysql": mysql, # mariadb should have same dependency as mysql - "mariadb": sql_common | {"pymysql>=1.0.2"}, + "mariadb": sql_common | mysql, "okta": {"okta~=1.7.0", "nest-asyncio"}, "oracle": sql_common | {"oracledb"}, "postgres": sql_common | postgres_common, diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index 619f69b016262..179dbdb231c91 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -1,7 +1,7 @@ import logging from enum import Enum from pathlib import Path -from typing import List, Optional +from typing import Iterable, List, Optional import yaml from pydantic import validator @@ -226,3 +226,14 @@ def to_yaml( yaml.indent(mapping=2, sequence=4, offset=2) yaml.default_flow_style = False yaml.dump(self.dict(), fp) + + @staticmethod + def list_urns(graph: DataHubGraph) -> 
Iterable[str]: + return graph.get_urns_by_filter( + entity_types=["structuredProperty"], + ) + + @staticmethod + def list(graph: DataHubGraph) -> Iterable["StructuredProperties"]: + for urn in StructuredProperties.list_urns(graph): + yield StructuredProperties.from_datahub(graph, urn) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index f80181192ba58..f6b5ba6176c59 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -3,7 +3,7 @@ import time import typing from datetime import datetime -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union import click import requests @@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]: return next((el for el in ls if el is not None and el.strip() != ""), None) +_T = TypeVar("_T") + + +def get_or_else(value: Optional[_T], default: _T) -> _T: + # Normally we'd use `value or default`. However, that runs into issues + # when value is falsey but not None. 
+ return value if value is not None else default + + def parse_run_restli_response(response: requests.Response) -> dict: response_json = response.json() if response.status_code != 200: @@ -321,6 +330,8 @@ def get_frontend_session_login_as( def _ensure_valid_gms_url_acryl_cloud(url: str) -> str: if "acryl.io" not in url: return url + if url.endswith(":8080"): + url = url.replace(":8080", "") if url.startswith("http://"): url = url.replace("http://", "https://") if url.endswith("acryl.io"): diff --git a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py index 42285cf13a5dd..5cd28516a076d 100644 --- a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py @@ -1,9 +1,11 @@ import json import logging from pathlib import Path +from typing import Iterable import click from click_default_group import DefaultGroup +from ruamel.yaml import YAML from datahub.api.entities.structuredproperties.structuredproperties import ( StructuredProperties, @@ -61,3 +63,85 @@ def get(urn: str, to_file: str) -> None: ) else: click.secho(f"Structured property {urn} does not exist") + + +@properties.command( + name="list", +) +@click.option("--details/--no-details", is_flag=True, default=True) +@click.option("--to-file", required=False, type=str) +@telemetry.with_telemetry() +def list(details: bool, to_file: str) -> None: + """List structured properties in DataHub""" + + def to_yaml_list( + objects: Iterable[StructuredProperties], # iterable of objects to dump + file: Path, + ) -> None: + # if file exists, first we read it + yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip) + yaml.indent(mapping=2, sequence=4, offset=2) + yaml.default_flow_style = False + serialized_objects = [] + if file.exists(): + with open(file, "r") as fp: + existing_objects = yaml.load(fp) # this is a list of 
dicts + existing_objects = [ + StructuredProperties.parse_obj(obj) for obj in existing_objects + ] + objects = [obj for obj in objects] + # do a positional update of the existing objects + existing_urns = {obj.urn for obj in existing_objects} + # existing_urns = {obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" for obj in existing_objects} + for i, obj in enumerate(existing_objects): + # existing_urn = obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" + existing_urn = obj.urn + # breakpoint() + if existing_urn in {obj.urn for obj in objects}: + existing_objects[i] = next( + obj.dict(exclude_unset=True, exclude_none=True) + for obj in objects + if obj.urn == existing_urn + ) + new_objects = [ + obj.dict(exclude_unset=True, exclude_none=True) + for obj in objects + if obj.urn not in existing_urns + ] + serialized_objects = existing_objects + new_objects + else: + serialized_objects = [ + obj.dict(exclude_unset=True, exclude_none=True) for obj in objects + ] + + with open(file, "w") as fp: + yaml.dump(serialized_objects, fp) + + with get_default_graph() as graph: + if details: + logger.info( + "Listing structured properties with details. 
Use --no-details for urns only" + ) + structuredproperties = StructuredProperties.list(graph) + if to_file: + to_yaml_list(structuredproperties, Path(to_file)) + else: + for structuredproperty in structuredproperties: + click.secho( + f"{json.dumps(structuredproperty.dict(exclude_unset=True, exclude_none=True), indent=2)}" + ) + else: + logger.info( + "Listing structured property urns only, use --details for more information" + ) + structured_property_urns = StructuredProperties.list_urns(graph) + if to_file: + with open(to_file, "w") as f: + for urn in structured_property_urns: + f.write(f"{urn}\n") + click.secho( + f"Structured property urns written to {to_file}", fg="green" + ) + else: + for urn in structured_property_urns: + click.secho(f"{urn}") diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 110624aa61cb8..f095fffbaea6b 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -24,6 +24,7 @@ import typing_inspect from avrogen.dict_wrapper import DictWrapper +from typing_extensions import assert_never from datahub.emitter.enum_helpers import get_enum_options from datahub.metadata.schema_classes import ( @@ -269,9 +270,8 @@ def make_owner_urn(owner: str, owner_type: OwnerType) -> str: return make_user_urn(owner) elif owner_type == OwnerType.GROUP: return make_group_urn(owner) - # This should pretty much never happen. - # TODO: With Python 3.11, we can use typing.assert_never() here. 
- return f"urn:li:{owner_type.value}:{owner}" + else: + assert_never(owner_type) def make_ownership_type_urn(type: str) -> str: diff --git a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py index 1ed8ce1d5a615..17026a4114c12 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py @@ -2,7 +2,19 @@ import time from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Sequence, Union +from typing import ( + Any, + Dict, + List, + Literal, + Optional, + Protocol, + Tuple, + Union, + runtime_checkable, +) + +from typing_extensions import LiteralString from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE from datahub.emitter.serialization_helper import pre_json_transform @@ -19,25 +31,36 @@ from datahub.utilities.urns.urn import guess_entity_type +@runtime_checkable +class SupportsToObj(Protocol): + def to_obj(self) -> Any: + ... + + def _recursive_to_obj(obj: Any) -> Any: if isinstance(obj, list): return [_recursive_to_obj(v) for v in obj] - elif hasattr(obj, "to_obj"): + elif isinstance(obj, SupportsToObj): return obj.to_obj() else: return obj +PatchPath = Tuple[Union[LiteralString, Urn], ...] 
+PatchOp = Literal["add", "remove", "replace"] + + @dataclass -class _Patch: - op: str # one of ['add', 'remove', 'replace']; we don't support move, copy or test - path: str +class _Patch(SupportsToObj): + op: PatchOp + path: PatchPath value: Any def to_obj(self) -> Dict: + quoted_path = "/" + "/".join(MetadataPatchProposal.quote(p) for p in self.path) return { "op": self.op, - "path": self.path, + "path": quoted_path, "value": _recursive_to_obj(self.value), } @@ -63,15 +86,16 @@ def __init__( # Json Patch quoting based on https://jsonpatch.com/#json-pointer @classmethod - def quote(cls, value: str) -> str: - return value.replace("~", "~0").replace("/", "~1") + def quote(cls, value: Union[str, Urn]) -> str: + return str(value).replace("~", "~0").replace("/", "~1") def _add_patch( - self, aspect_name: str, op: str, path: Union[str, Sequence[str]], value: Any + self, + aspect_name: str, + op: PatchOp, + path: PatchPath, + value: Any, ) -> None: - if not isinstance(path, str): - path = "/" + "/".join(self.quote(p) for p in path) - # TODO: Validate that aspectName is a valid aspect for this entityType self.patches[aspect_name].append(_Patch(op, path, value)) diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 04242c8bf45d2..74b8ade7da445 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -1,9 +1,21 @@ +from __future__ import annotations + import functools import json import logging import os from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) import requests from deprecated import deprecated @@ -12,8 +24,13 @@ from datahub import nice_version_name from datahub.cli import config_utils -from datahub.cli.cli_utils import 
ensure_has_system_metadata, fixup_gms_url -from datahub.configuration.common import ConfigurationError, OperationalError +from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else +from datahub.cli.env_utils import get_boolean_env_variable +from datahub.configuration.common import ( + ConfigModel, + ConfigurationError, + OperationalError, +) from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.request_helper import make_curl_command @@ -30,10 +47,8 @@ logger = logging.getLogger(__name__) -_DEFAULT_CONNECT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect -_DEFAULT_READ_TIMEOUT_SEC = ( - 30 # Any ingest call taking longer than 30 seconds should be abandoned -) +_DEFAULT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect +_TIMEOUT_LOWER_BOUND_SEC = 1 # if below this, we log a warning _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on 429, 500, @@ -46,6 +61,8 @@ os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4") ) +_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False) + # The limit is 16mb. We will use a max of 15mb to have some space # for overhead like request headers. # This applies to pretty much all calls to GMS. 
@@ -60,15 +77,76 @@ ) +class RequestsSessionConfig(ConfigModel): + timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC + + retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES + retry_methods: List[str] = _DEFAULT_RETRY_METHODS + retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES + + extra_headers: Dict[str, str] = {} + + ca_certificate_path: Optional[str] = None + client_certificate_path: Optional[str] = None + disable_ssl_verification: bool = False + + def build_session(self) -> requests.Session: + session = requests.Session() + + if self.extra_headers: + session.headers.update(self.extra_headers) + + if self.client_certificate_path: + session.cert = self.client_certificate_path + + if self.ca_certificate_path: + session.verify = self.ca_certificate_path + + if self.disable_ssl_verification: + session.verify = False + + try: + # Set raise_on_status to False to propagate errors: + # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception + # Must call `raise_for_status` after making a request, which we do + retry_strategy = Retry( + total=self.retry_max_times, + status_forcelist=self.retry_status_codes, + backoff_factor=2, + allowed_methods=self.retry_methods, + raise_on_status=False, + ) + except TypeError: + # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`. + retry_strategy = Retry( + total=self.retry_max_times, + status_forcelist=self.retry_status_codes, + backoff_factor=2, + method_whitelist=self.retry_methods, + raise_on_status=False, + ) + + adapter = HTTPAdapter( + pool_connections=100, pool_maxsize=100, max_retries=retry_strategy + ) + session.mount("http://", adapter) + session.mount("https://", adapter) + + if self.timeout is not None: + # Shim session.request to apply default timeout values. + # Via https://stackoverflow.com/a/59317604. 
+ session.request = functools.partial( # type: ignore + session.request, + timeout=self.timeout, + ) + + return session + + class DataHubRestEmitter(Closeable, Emitter): _gms_server: str _token: Optional[str] _session: requests.Session - _connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC - _read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC - _retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES - _retry_methods: List[str] = _DEFAULT_RETRY_METHODS - _retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES def __init__( self, @@ -99,15 +177,13 @@ def __init__( self._session = requests.Session() - self._session.headers.update( - { - "X-RestLi-Protocol-Version": "2.0.0", - "X-DataHub-Py-Cli-Version": nice_version_name(), - "Content-Type": "application/json", - } - ) + headers = { + "X-RestLi-Protocol-Version": "2.0.0", + "X-DataHub-Py-Cli-Version": nice_version_name(), + "Content-Type": "application/json", + } if token: - self._session.headers.update({"Authorization": f"Bearer {token}"}) + headers["Authorization"] = f"Bearer {token}" else: # HACK: When no token is provided but system auth env variables are set, we use them. # Ideally this should simply get passed in as config, instead of being sneakily injected @@ -116,75 +192,43 @@ def __init__( # rest emitter, and the rest sink uses the rest emitter under the hood. 
system_auth = config_utils.get_system_auth() if system_auth is not None: - self._session.headers.update({"Authorization": system_auth}) - - if extra_headers: - self._session.headers.update(extra_headers) - - if client_certificate_path: - self._session.cert = client_certificate_path + headers["Authorization"] = system_auth - if ca_certificate_path: - self._session.verify = ca_certificate_path - - if disable_ssl_verification: - self._session.verify = False - - self._connect_timeout_sec = ( - connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC - ) - self._read_timeout_sec = ( - read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC - ) - - if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1: - logger.warning( - f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s" - ) - - if retry_status_codes is not None: # Only if missing. Empty list is allowed - self._retry_status_codes = retry_status_codes - - if retry_methods is not None: - self._retry_methods = retry_methods - - if retry_max_times: - self._retry_max_times = retry_max_times - - try: - # Set raise_on_status to False to propagate errors: - # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception - # Must call `raise_for_status` after making a request, which we do - retry_strategy = Retry( - total=self._retry_max_times, - status_forcelist=self._retry_status_codes, - backoff_factor=2, - allowed_methods=self._retry_methods, - raise_on_status=False, - ) - except TypeError: - # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`. 
- retry_strategy = Retry( - total=self._retry_max_times, - status_forcelist=self._retry_status_codes, - backoff_factor=2, - method_whitelist=self._retry_methods, - raise_on_status=False, + timeout: float | tuple[float, float] + if connect_timeout_sec is not None or read_timeout_sec is not None: + timeout = ( + connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC, + read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC, ) + if ( + timeout[0] < _TIMEOUT_LOWER_BOUND_SEC + or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC + ): + logger.warning( + f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds" + ) + else: + timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC) + if timeout < _TIMEOUT_LOWER_BOUND_SEC: + logger.warning( + f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds" + ) - adapter = HTTPAdapter( - pool_connections=100, pool_maxsize=100, max_retries=retry_strategy - ) - self._session.mount("http://", adapter) - self._session.mount("https://", adapter) - - # Shim session.request to apply default timeout values. - # Via https://stackoverflow.com/a/59317604. 
- self._session.request = functools.partial( # type: ignore - self._session.request, - timeout=(self._connect_timeout_sec, self._read_timeout_sec), + self._session_config = RequestsSessionConfig( + timeout=timeout, + retry_status_codes=get_or_else( + retry_status_codes, _DEFAULT_RETRY_STATUS_CODES + ), + retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS), + retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES), + extra_headers={**headers, **(extra_headers or {})}, + ca_certificate_path=ca_certificate_path, + client_certificate_path=client_certificate_path, + disable_ssl_verification=disable_ssl_verification, ) + self._session = self._session_config.build_session() + def test_connection(self) -> None: url = f"{self._gms_server}/config" response = self._session.get(url) @@ -291,7 +335,8 @@ def emit_mcps( mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], async_flag: Optional[bool] = None, ) -> int: - logger.debug("Attempting to emit batch mcps") + if _DATAHUB_EMITTER_TRACE: + logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}") url = f"{self._gms_server}/aspects?action=ingestProposalBatch" for mcp in mcps: ensure_has_system_metadata(mcp) @@ -304,22 +349,25 @@ def emit_mcps( current_chunk_size = INGEST_MAX_PAYLOAD_BYTES for mcp_obj in mcp_objs: mcp_obj_size = len(json.dumps(mcp_obj)) - logger.debug( - f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}" - ) + if _DATAHUB_EMITTER_TRACE: + logger.debug( + f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}" + ) if ( mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH ): - logger.debug("Decided to create new chunk") + if _DATAHUB_EMITTER_TRACE: + logger.debug("Decided to create new chunk") mcp_obj_chunks.append([]) current_chunk_size = 0 mcp_obj_chunks[-1].append(mcp_obj) current_chunk_size += mcp_obj_size - 
logger.debug( - f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks" - ) + if len(mcp_obj_chunks) > 0: + logger.debug( + f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks" + ) for mcp_obj_chunk in mcp_obj_chunks: # TODO: We're calling json.dumps on each MCP object twice, once to estimate diff --git a/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py index 559f0b77f59df..b63c96b617ff0 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +++ b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py @@ -1,10 +1,9 @@ import json import logging -from typing import Iterable, List +from typing import TYPE_CHECKING, Iterable, List from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES from datahub.emitter.serialization_helper import pre_json_transform -from datahub.ingestion.api.source import SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.schema_classes import ( DatasetProfileClass, @@ -12,12 +11,15 @@ SchemaMetadataClass, ) +if TYPE_CHECKING: + from datahub.ingestion.api.source import SourceReport + logger = logging.getLogger(__name__) class EnsureAspectSizeProcessor: def __init__( - self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES + self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES ): self.report = report self.payload_constraint = payload_constraint diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index c3638635b19aa..53cb1b0ecad4e 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -31,6 +31,9 @@ from 
datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import ( auto_patch_last_modified, ) +from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( + EnsureAspectSizeProcessor, +) from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit from datahub.ingestion.api.report import Report @@ -331,6 +334,8 @@ def as_obj(self) -> dict: } def compute_stats(self) -> None: + super().compute_stats() + duration = datetime.datetime.now() - self.start_time workunits_produced = self.events_produced if duration.total_seconds() > 0: @@ -450,6 +455,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: browse_path_processor, partial(auto_workunit_reporter, self.get_report()), auto_patch_last_modified, + EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size, ] @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py index ddcb74e354613..bdcdcb8990eba 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py @@ -1,4 +1,3 @@ -import os from abc import ABCMeta, abstractmethod from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel): ) max_workers: int = Field( - default=(os.cpu_count() or 4), - description="Number of worker processes to use for classification. Set to 1 to disable.", + default=1, + description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. 
Set to 1 to disable.", ) table_pattern: AllowDenyPattern = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index ca9a41172e5b6..7de6e8130a7ab 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -179,21 +179,24 @@ def frontend_base_url(self) -> str: @classmethod def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph": + session_config = emitter._session_config + if isinstance(session_config.timeout, tuple): + # TODO: This is slightly lossy. Eventually, we want to modify the emitter + # to accept a tuple for timeout_sec, and then we'll be able to remove this. + timeout_sec: Optional[float] = session_config.timeout[0] + else: + timeout_sec = session_config.timeout return cls( DatahubClientConfig( server=emitter._gms_server, token=emitter._token, - timeout_sec=emitter._read_timeout_sec, - retry_status_codes=emitter._retry_status_codes, - retry_max_times=emitter._retry_max_times, - extra_headers=emitter._session.headers, - disable_ssl_verification=emitter._session.verify is False, - ca_certificate_path=( - emitter._session.verify - if isinstance(emitter._session.verify, str) - else None - ), - client_certificate_path=emitter._session.cert, + timeout_sec=timeout_sec, + retry_status_codes=session_config.retry_status_codes, + retry_max_times=session_config.retry_max_times, + extra_headers=session_config.extra_headers, + disable_ssl_verification=session_config.disable_ssl_verification, + ca_certificate_path=session_config.ca_certificate_path, + client_certificate_path=session_config.client_certificate_path, ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/config.py b/metadata-ingestion/src/datahub/ingestion/graph/config.py index 5f269e14e1a4a..8f0a5844c97c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/config.py +++ 
b/metadata-ingestion/src/datahub/ingestion/graph/config.py @@ -10,7 +10,7 @@ class DatahubClientConfig(ConfigModel): # by callers / the CLI, but the actual client should not have any magic. server: str token: Optional[str] = None - timeout_sec: Optional[int] = None + timeout_sec: Optional[float] = None retry_status_codes: Optional[List[int]] = None retry_max_times: Optional[int] = None extra_headers: Optional[Dict[str, str]] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 7a5ed154d40bc..a0bed4ae9a758 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1054,49 +1054,66 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.gen_database_containers(database) for table in tables: - database_name = table["DatabaseName"] table_name = table["Name"] - full_table_name = f"{database_name}.{table_name}" - self.report.report_table_scanned() - if not self.source_config.database_pattern.allowed( - database_name - ) or not self.source_config.table_pattern.allowed(full_table_name): - self.report.report_table_dropped(full_table_name) - continue + try: + yield from self._gen_table_wu(table=table) + except KeyError as e: + self.report.report_failure( + message="Failed to extract workunit for table", + context=f"Table: {table_name}", + exc=e, + ) + if self.extract_transforms: + yield from self._transform_extraction() - dataset_urn = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=full_table_name, - env=self.env, - platform_instance=self.source_config.platform_instance, - ) + def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]: + database_name = table["DatabaseName"] + table_name = table["Name"] + full_table_name = f"{database_name}.{table_name}" + self.report.report_table_scanned() + if not 
self.source_config.database_pattern.allowed( + database_name + ) or not self.source_config.table_pattern.allowed(full_table_name): + self.report.report_table_dropped(full_table_name) + return + + dataset_urn = make_dataset_urn_with_platform_instance( + platform=self.platform, + name=full_table_name, + env=self.env, + platform_instance=self.source_config.platform_instance, + ) - mce = self._extract_record(dataset_urn, table, full_table_name) - yield MetadataWorkUnit(full_table_name, mce=mce) + mce = self._extract_record(dataset_urn, table, full_table_name) + yield MetadataWorkUnit(full_table_name, mce=mce) - # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not - # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp. - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]), - ).as_workunit() + # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not + # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp. 
+ yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]), + ).as_workunit() - yield from self._get_domain_wu( - dataset_name=full_table_name, - entity_urn=dataset_urn, - ) - yield from self.add_table_to_database_container( - dataset_urn=dataset_urn, db_name=database_name - ) + yield from self._get_domain_wu( + dataset_name=full_table_name, + entity_urn=dataset_urn, + ) + yield from self.add_table_to_database_container( + dataset_urn=dataset_urn, db_name=database_name + ) - wu = self.get_lineage_if_enabled(mce) - if wu: - yield wu + wu = self.get_lineage_if_enabled(mce) + if wu: + yield wu + try: yield from self.get_profile_if_enabled(mce, database_name, table_name) - - if self.extract_transforms: - yield from self._transform_extraction() + except KeyError as e: + self.report.report_failure( + message="Failed to extract profile for table", + context=f"Table: {dataset_urn}", + exc=e, + ) def _transform_extraction(self) -> Iterable[MetadataWorkUnit]: dags: Dict[str, Optional[Dict[str, Any]]] = {} diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 16a5268a2dea7..508b4bbaa277d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -206,9 +206,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport: def _init_schema_resolver(self) -> SchemaResolver: schema_resolution_required = ( - self.config.use_queries_v2 - or self.config.lineage_parse_view_ddl - or self.config.lineage_use_sql_parser + self.config.use_queries_v2 or self.config.lineage_use_sql_parser ) schema_ingestion_enabled = ( self.config.include_schema_metadata @@ -255,10 +253,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: for project in projects: yield from 
self.bq_schema_extractor.get_project_workunits(project) - if self.config.use_queries_v2: - # Always ingest View and Snapshot lineage with schema ingestion - self.report.set_ingestion_stage("*", "View and Snapshot Lineage") - + with self.report.new_stage("*: View and Snapshot Lineage"): yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots( [p.id for p in projects], self.bq_schema_extractor.view_refs_by_project, @@ -267,6 +262,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.bq_schema_extractor.snapshots_by_ref, ) + if self.config.use_queries_v2: # if both usage and lineage are disabled then skip queries extractor piece if ( not self.config.include_usage_statistics @@ -274,29 +270,29 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ): return - self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - - with BigQueryQueriesExtractor( - connection=self.config.get_bigquery_client(), - schema_api=self.bq_schema_extractor.schema_api, - config=BigQueryQueriesExtractorConfig( - window=self.config, - user_email_pattern=self.config.usage.user_email_pattern, - include_lineage=self.config.include_table_lineage, - include_usage_statistics=self.config.include_usage_statistics, - include_operations=self.config.usage.include_operational_stats, - top_n_queries=self.config.usage.top_n_queries, - region_qualifiers=self.config.region_qualifiers, - ), - structured_report=self.report, - filters=self.filters, - identifiers=self.identifiers, - schema_resolver=self.sql_parser_schema_resolver, - discovered_tables=self.bq_schema_extractor.table_refs, - ) as queries_extractor: - self.report.queries_extractor = queries_extractor.report - yield from queries_extractor.get_workunits_internal() - + with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"): + with BigQueryQueriesExtractor( + connection=self.config.get_bigquery_client(), + schema_api=self.bq_schema_extractor.schema_api, + config=BigQueryQueriesExtractorConfig( + 
window=self.config, + user_email_pattern=self.config.usage.user_email_pattern, + include_lineage=self.config.include_table_lineage, + include_usage_statistics=self.config.include_usage_statistics, + include_operations=self.config.usage.include_operational_stats, + include_queries=self.config.include_queries, + include_query_usage_statistics=self.config.include_query_usage_statistics, + top_n_queries=self.config.usage.top_n_queries, + region_qualifiers=self.config.region_qualifiers, + ), + structured_report=self.report, + filters=self.filters, + identifiers=self.identifiers, + schema_resolver=self.sql_parser_schema_resolver, + discovered_tables=self.bq_schema_extractor.table_refs, + ) as queries_extractor: + self.report.queries_extractor = queries_extractor.report + yield from queries_extractor.get_workunits_internal() else: if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( @@ -306,10 +302,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.config.include_table_lineage: yield from self.lineage_extractor.get_lineage_workunits( [p.id for p in projects], - self.bq_schema_extractor.view_refs_by_project, - self.bq_schema_extractor.view_definitions, - self.bq_schema_extractor.snapshot_refs_by_project, - self.bq_schema_extractor.snapshots_by_ref, self.bq_schema_extractor.table_refs, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 4af41921c9fa3..afbe919df4dca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -447,6 +447,14 @@ class BigQueryV2Config( default=False, description="If enabled, uses the new queries extractor to extract queries from bigquery.", ) + include_queries: bool = Field( + default=True, + description="If enabled, generate query 
entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.", + ) + include_query_usage_statistics: bool = Field( + default=True, + description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.", + ) @property def have_table_data_read_permission(self) -> bool: @@ -463,10 +471,6 @@ def have_table_data_read_permission(self) -> bool: default=True, description="Use sql parser to resolve view/table lineage.", ) - lineage_parse_view_ddl: bool = Field( - default=True, - description="Sql parse view ddl to get lineage.", - ) lineage_sql_parser_use_raw_names: bool = Field( default=False, @@ -572,11 +576,9 @@ def have_table_data_read_permission(self) -> bool: "See [this](https://cloud.google.com/bigquery/docs/information-schema-jobs#scope_and_syntax) for details.", ) - # include_view_lineage and include_view_column_lineage are inherited from SQLCommonConfig - # but not used in bigquery so we hide them from docs. - include_view_lineage: bool = Field(default=True, hidden_from_docs=True) - - include_view_column_lineage: bool = Field(default=True, hidden_from_docs=True) + _include_view_lineage = pydantic_removed_field("include_view_lineage") + _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage") + _lineage_parse_view_ddl = pydantic_removed_field("lineage_parse_view_ddl") @root_validator(pre=True) def set_include_schema_metadata(cls, values: Dict) -> Dict: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 06842da67f76c..8e55d81aac5fe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -190,6 +190,3 @@ class BigQueryV2Report( num_skipped_external_table_lineage: int = 0 queries_extractor: 
Optional[BigQueryQueriesExtractorReport] = None - - def set_ingestion_stage(self, project_id: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{project_id}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 4a3b47f6b543a..56e930dfb811f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -248,9 +248,9 @@ def modified_base32decode(self, text_to_decode: str) -> str: def get_project_workunits( self, project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage(project.id, METADATA_EXTRACTION) - logger.info(f"Processing project: {project.id}") - yield from self._process_project(project) + with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"): + logger.info(f"Processing project: {project.id}") + yield from self._process_project(project) def get_dataplatform_instance_aspect( self, dataset_urn: str, project_id: str @@ -405,11 +405,11 @@ def _process_project( if self.config.is_profiling_enabled(): logger.info(f"Starting profiling project {project_id}") - self.report.set_ingestion_stage(project_id, PROFILING) - yield from self.profiler.get_workunits( - project_id=project_id, - tables=db_tables, - ) + with self.report.new_stage(f"{project_id}: {PROFILING}"): + yield from self.profiler.get_workunits( + project_id=project_id, + tables=db_tables, + ) def _process_project_datasets( self, @@ -653,14 +653,11 @@ def _process_view( self.report.report_dropped(table_identifier.raw_table_name()) return - if self.store_table_refs: - table_ref = str( - BigQueryTableRef(table_identifier).get_sanitized_table_ref() - ) - self.table_refs.add(table_ref) - if self.config.lineage_parse_view_ddl and view.view_definition: - 
self.view_refs_by_project[project_id].add(table_ref) - self.view_definitions[table_ref] = view.view_definition + table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref()) + self.table_refs.add(table_ref) + if view.view_definition: + self.view_refs_by_project[project_id].add(table_ref) + self.view_definitions[table_ref] = view.view_definition view.column_count = len(columns) if not view.column_count: @@ -701,14 +698,11 @@ def _process_snapshot( f"Snapshot doesn't have any column or unable to get columns for snapshot: {table_identifier}" ) - if self.store_table_refs: - table_ref = str( - BigQueryTableRef(table_identifier).get_sanitized_table_ref() - ) - self.table_refs.add(table_ref) - if snapshot.base_table_identifier: - self.snapshot_refs_by_project[project_id].add(table_ref) - self.snapshots_by_ref[table_ref] = snapshot + table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref()) + self.table_refs.add(table_ref) + if snapshot.base_table_identifier: + self.snapshot_refs_by_project[project_id].add(table_ref) + self.snapshots_by_ref[table_ref] = snapshot yield from self.gen_snapshot_dataset_workunits( table=snapshot, @@ -1148,7 +1142,7 @@ def gen_schema_metadata( foreignKeys=foreign_keys if foreign_keys else None, ) - if self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser: + if self.config.lineage_use_sql_parser: self.sql_parser_schema_resolver.add_schema_metadata( dataset_urn, schema_metadata ) @@ -1209,9 +1203,9 @@ def get_tables_for_dataset( report=self.report, ) - self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = round( - timer.elapsed_seconds(), 2 - ) + self.report.metadata_extraction_sec[ + f"{project_id}.{dataset.name}" + ] = timer.elapsed_seconds(digits=2) def get_core_table_details( self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 321b1b6207fab..433282a21fdb6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -291,16 +291,15 @@ def get_lineage_workunits_for_views_and_snapshots( snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot], ) -> Iterable[MetadataWorkUnit]: for project in projects: - if self.config.lineage_parse_view_ddl: - for view in view_refs_by_project[project]: - self.datasets_skip_audit_log_lineage.add(view) - self.aggregator.add_view_definition( - view_urn=self.identifiers.gen_dataset_urn_from_raw_ref( - BigQueryTableRef.from_string_name(view) - ), - view_definition=view_definitions[view], - default_db=project, - ) + for view in view_refs_by_project[project]: + self.datasets_skip_audit_log_lineage.add(view) + self.aggregator.add_view_definition( + view_urn=self.identifiers.gen_dataset_urn_from_raw_ref( + BigQueryTableRef.from_string_name(view) + ), + view_definition=view_definitions[view], + default_db=project, + ) for snapshot_ref in snapshot_refs_by_project[project]: snapshot = snapshots_by_ref[snapshot_ref] @@ -322,32 +321,20 @@ def get_lineage_workunits_for_views_and_snapshots( def get_lineage_workunits( self, projects: List[str], - view_refs_by_project: Dict[str, Set[str]], - view_definitions: FileBackedDict[str], - snapshot_refs_by_project: Dict[str, Set[str]], - snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: if not self._should_ingest_lineage(): return - yield from self.get_lineage_workunits_for_views_and_snapshots( - projects, - view_refs_by_project, - view_definitions, - snapshot_refs_by_project, - snapshots_by_ref, - ) - if self.config.use_exported_bigquery_audit_metadata: projects = ["*"] # project_id not used when using exported metadata for project in projects: - self.report.set_ingestion_stage(project, 
LINEAGE_EXTRACTION) - yield from self.generate_lineage( - project, - table_refs, - ) + with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"): + yield from self.generate_lineage( + project, + table_refs, + ) if self.redundant_run_skip_handler: # Update the checkpoint state for this run. @@ -381,8 +368,8 @@ def generate_lineage( self.report.lineage_metadata_entries[project_id] = len(lineage) logger.info(f"Built lineage map containing {len(lineage)} entries.") logger.debug(f"lineage metadata is {lineage}") - self.report.lineage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds( + digits=2 ) self.report.lineage_mem_size[project_id] = humanfriendly.format_size( memory_footprint.total_size(lineage) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 876ffab85ba31..f2f6cc731858d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -495,62 +495,62 @@ def _ingest_events( def _generate_operational_workunits( self, usage_state: BigQueryUsageState, table_refs: Collection[str] ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS) - for audit_event in usage_state.standalone_events(): - try: - operational_wu = self._create_operation_workunit( - audit_event, table_refs - ) - if operational_wu: - yield operational_wu - self.report.num_operational_stats_workunits_emitted += 1 - except Exception as e: - self.report.warning( - message="Unable to generate operation workunit", - context=f"{audit_event}", - exc=e, - ) + with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"): + for audit_event in usage_state.standalone_events(): + try: + operational_wu = self._create_operation_workunit( + audit_event, table_refs + 
) + if operational_wu: + yield operational_wu + self.report.num_operational_stats_workunits_emitted += 1 + except Exception as e: + self.report.warning( + message="Unable to generate operation workunit", + context=f"{audit_event}", + exc=e, + ) def _generate_usage_workunits( self, usage_state: BigQueryUsageState ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) - top_n = ( - self.config.usage.top_n_queries - if self.config.usage.include_top_n_queries - else 0 - ) - for entry in usage_state.usage_statistics(top_n=top_n): - try: - query_freq = [ - ( - self.uuid_to_query.get( - query_hash, usage_state.queries[query_hash] - ), - count, + with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"): + top_n = ( + self.config.usage.top_n_queries + if self.config.usage.include_top_n_queries + else 0 + ) + for entry in usage_state.usage_statistics(top_n=top_n): + try: + query_freq = [ + ( + self.uuid_to_query.get( + query_hash, usage_state.queries[query_hash] + ), + count, + ) + for query_hash, count in entry.query_freq + ] + yield make_usage_workunit( + bucket_start_time=datetime.fromisoformat(entry.timestamp), + resource=BigQueryTableRef.from_string_name(entry.resource), + query_count=entry.query_count, + query_freq=query_freq, + user_freq=entry.user_freq, + column_freq=entry.column_freq, + bucket_duration=self.config.bucket_duration, + resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref, + top_n_queries=self.config.usage.top_n_queries, + format_sql_queries=self.config.usage.format_sql_queries, + queries_character_limit=self.config.usage.queries_character_limit, + ) + self.report.num_usage_workunits_emitted += 1 + except Exception as e: + self.report.warning( + message="Unable to generate usage statistics workunit", + context=f"{entry.timestamp}, {entry.resource}", + exc=e, ) - for query_hash, count in entry.query_freq - ] - yield make_usage_workunit( - 
bucket_start_time=datetime.fromisoformat(entry.timestamp), - resource=BigQueryTableRef.from_string_name(entry.resource), - query_count=entry.query_count, - query_freq=query_freq, - user_freq=entry.user_freq, - column_freq=entry.column_freq, - bucket_duration=self.config.bucket_duration, - resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref, - top_n_queries=self.config.usage.top_n_queries, - format_sql_queries=self.config.usage.format_sql_queries, - queries_character_limit=self.config.usage.queries_character_limit, - ) - self.report.num_usage_workunits_emitted += 1 - except Exception as e: - self.report.warning( - message="Unable to generate usage statistics workunit", - context=f"{entry.timestamp}, {entry.resource}", - exc=e, - ) def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: if self.config.use_exported_bigquery_audit_metadata: @@ -559,10 +559,10 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: for project_id in projects: with PerfTimer() as timer: try: - self.report.set_ingestion_stage( - project_id, USAGE_EXTRACTION_INGESTION - ) - yield from self._get_parsed_bigquery_log_events(project_id) + with self.report.new_stage( + f"{project_id}: {USAGE_EXTRACTION_INGESTION}" + ): + yield from self._get_parsed_bigquery_log_events(project_id) except Exception as e: self.report.usage_failed_extraction.append(project_id) self.report.warning( @@ -572,8 +572,8 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: ) self.report_status(f"usage-extraction-{project_id}", False) - self.report.usage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 + self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds( + digits=2 ) def _store_usage_event( diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py index d8ab62f1d6d91..7bf1d66f618a4 
100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_profiling.py @@ -70,30 +70,30 @@ def get_workunits( ) -> Iterable[MetadataWorkUnit]: for keyspace_name in cassandra_data.keyspaces: tables = cassandra_data.tables.get(keyspace_name, []) - self.report.set_ingestion_stage(keyspace_name, PROFILING) - with ThreadPoolExecutor( - max_workers=self.config.profiling.max_workers - ) as executor: - future_to_dataset = { - executor.submit( - self.generate_profile, - keyspace_name, - table_name, - cassandra_data.columns.get(table_name, []), - ): table_name - for table_name in tables - } - for future in as_completed(future_to_dataset): - table_name = future_to_dataset[future] - try: - yield from future.result() - except Exception as exc: - self.report.profiling_skipped_other[table_name] += 1 - self.report.failure( - message="Failed to profile for table", - context=f"{keyspace_name}.{table_name}", - exc=exc, - ) + with self.report.new_stage(f"{keyspace_name}: {PROFILING}"): + with ThreadPoolExecutor( + max_workers=self.config.profiling.max_workers + ) as executor: + future_to_dataset = { + executor.submit( + self.generate_profile, + keyspace_name, + table_name, + cassandra_data.columns.get(table_name, []), + ): table_name + for table_name in tables + } + for future in as_completed(future_to_dataset): + table_name = future_to_dataset[future] + try: + yield from future.result() + except Exception as exc: + self.report.profiling_skipped_other[table_name] += 1 + self.report.failure( + message="Failed to profile for table", + context=f"{keyspace_name}.{table_name}", + exc=exc, + ) def generate_profile( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py index 41d4ac7ced603..75a0ba0c61773 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_utils.py @@ -54,9 +54,6 @@ def report_entity_scanned(self, name: str, ent_type: str = "View") -> None: else: raise KeyError(f"Unknown entity {ent_type}.") - def set_ingestion_stage(self, keyspace: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{keyspace}: {stage}") - # TODO Need to create seperate common config for profiling report profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict) profiling_skipped_table_profile_pattern: TopKDict[str, int] = field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index cd3c2146e6d84..09f38913f11b1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -1,6 +1,7 @@ import os from typing import Optional, Set +import pydantic from pydantic import Field, root_validator from datahub.configuration.common import AllowDenyPattern @@ -119,3 +120,12 @@ def check_ingesting_data(cls, values): " Please specify at least one of `database_connection` or `kafka_connection`, ideally both." 
) return values + + @pydantic.validator("database_connection") + def validate_mysql_scheme( + cls, v: SQLAlchemyConnectionConfig + ) -> SQLAlchemyConnectionConfig: + if "mysql" in v.scheme: + if v.scheme != "mysql+pymysql": + raise ValueError("For MySQL, the scheme must be mysql+pymysql.") + return v diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py index 80906ca63115f..ee105f4862cab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -151,8 +151,10 @@ def execute_server_cursor( self, query: str, params: Dict[str, Any] ) -> Iterable[Dict[str, Any]]: with self.engine.connect() as conn: - if self.engine.dialect.name == "postgresql": + if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]: with conn.begin(): # Transaction required for PostgreSQL server-side cursor + # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects. 
+ # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results conn = conn.execution_options( stream_results=True, yield_per=self.config.database_query_batch_size, @@ -160,22 +162,6 @@ def execute_server_cursor( result = conn.execute(query, params) for row in result: yield dict(row) - elif self.engine.dialect.name == "mysql": # MySQL - import MySQLdb - - with contextlib.closing( - conn.connection.cursor(MySQLdb.cursors.SSCursor) - ) as cursor: - logger.debug(f"Using Cursor type: {cursor.__class__.__name__}") - cursor.execute(query, params) - - columns = [desc[0] for desc in cursor.description] - while True: - rows = cursor.fetchmany(self.config.database_query_batch_size) - if not rows: - break # Use break instead of return in generator - for row in rows: - yield dict(zip(columns, row)) else: raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index cb72441344088..12daba298a201 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -130,7 +130,7 @@ def _get_database_workunits( self._commit_progress(i) def _get_kafka_workunits( - self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = [] + self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] ) -> Iterable[MetadataWorkUnit]: if self.config.kafka_connection is None: return diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py index c8eb035461ca1..9712d4ddc6799 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py @@ 
-45,6 +45,3 @@ def report_entity_scanned(self, name: str, ent_type: str = "View") -> None: self.views_scanned += 1 else: raise KeyError(f"Unknown entity {ent_type}.") - - def set_ingestion_stage(self, dataset: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{dataset}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py index 319290d25169a..6d34e86be6282 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py @@ -472,8 +472,8 @@ def generate_profiles( env=self.config.env, platform_instance=self.config.platform_instance, ) - self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING) - yield from self.profiler.get_workunits(dataset_info, dataset_urn) + with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"): + yield from self.profiler.get_workunits(dataset_info, dataset_urn) def generate_view_lineage( self, dataset_urn: str, parents: List[str] diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 168b787b85e8b..b4cc5423277c5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -141,40 +141,36 @@ def get_workunits_internal( ) -> Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: try: - self.report.report_ingestion_stage_start("Expired Token Cleanup") - self.revoke_expired_tokens() + with self.report.new_stage("Expired Token Cleanup"): + self.revoke_expired_tokens() except Exception as e: self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: try: - self.report.report_ingestion_stage_start("Truncate Indices") - self.truncate_indices() + with 
self.report.new_stage("Truncate Indices"): + self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) if self.config.soft_deleted_entities_cleanup.enabled: try: - self.report.report_ingestion_stage_start( - "Soft Deleted Entities Cleanup" - ) - self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() + with self.report.new_stage("Soft Deleted Entities Cleanup"): + self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) if self.config.dataprocess_cleanup.enabled: try: - self.report.report_ingestion_stage_start("Data Process Cleanup") - yield from self.dataprocess_cleanup.get_workunits_internal() + with self.report.new_stage("Data Process Cleanup"): + yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: self.report.failure("While trying to cleanup data process ", exc=e) if self.config.execution_request_cleanup.enabled: try: - self.report.report_ingestion_stage_start("Execution request Cleanup") - self.execution_request_cleanup.run() + with self.report.new_stage("Execution request Cleanup"): + self.execution_request_cleanup.run() except Exception as e: self.report.failure("While trying to cleanup execution request ", exc=e) - # Otherwise last stage's duration does not get calculated. 
- self.report.report_ingestion_stage_start("End") yield from [] def truncate_indices(self) -> None: @@ -188,6 +184,9 @@ def truncate_indices(self) -> None: self._truncate_timeseries_helper( aspect_name="dashboardUsageStatistics", entity_type="dashboard" ) + self._truncate_timeseries_helper( + aspect_name="queryusagestatistics", entity_type="query" + ) def _truncate_timeseries_helper(self, aspect_name: str, entity_type: str) -> None: self._truncate_timeseries_with_watch_optional( diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 170a6ada3e336..f9a00d7f00905 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -141,7 +141,9 @@ def _scroll_execution_requests( break if self.report.ergc_read_errors >= self.config.max_read_errors: self.report.failure( - f"ergc({self.instance_id}): too many read errors, aborting." 
+ title="Too many read errors, aborting", + message="Too many read errors, aborting", + context=str(self.instance_id), ) break try: @@ -158,8 +160,11 @@ def _scroll_execution_requests( break params["scrollId"] = document["scrollId"] except Exception as e: - logger.error( - f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}" + self.report.failure( + title="Failed to fetch next batch of execution requests", + message="Failed to fetch next batch of execution requests", + context=str(self.instance_id), + exc=e, ) self.report.ergc_read_errors += 1 @@ -231,8 +236,11 @@ def _delete_entry(self, entry: CleanupRecord) -> None: self.graph.delete_entity(entry.urn, True) except Exception as e: self.report.ergc_delete_errors += 1 - logger.error( - f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}" + self.report.failure( + title="Failed to delete ExecutionRequest", + message="Failed to delete ExecutionRequest", + context=str(self.instance_id), + exc=e, ) def _reached_runtime_limit(self) -> bool: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py index 4c0355834f9b4..0a52b7e17bf71 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py @@ -19,8 +19,8 @@ logger = logging.getLogger(__name__) -QUERY_QUERY_ENTITY = """ -query listQueries($input: ScrollAcrossEntitiesInput!) { +QUERY_ENTITIES = """ +query listEntities($input: ScrollAcrossEntitiesInput!) { scrollAcrossEntities(input: $input) { nextScrollId count @@ -29,6 +29,9 @@ ... on QueryEntity { urn } + ... 
on DataProcessInstance { + urn + } } } } @@ -105,6 +108,8 @@ class SoftDeletedEntitiesReport(SourceReport): sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field( default_factory=TopKDict ) + runtime_limit_reached: bool = False + deletion_limit_reached: bool = False class SoftDeletedEntitiesCleanup: @@ -163,6 +168,8 @@ def delete_entity(self, urn: str) -> None: f"Dry run is on otherwise it would have deleted {urn} with hard deletion" ) return + if self._deletion_limit_reached() or self._times_up(): + return self._increment_removal_started_count() self.ctx.graph.delete_entity(urn=urn, hard=True) self.ctx.graph.delete_references_to_urn( @@ -203,11 +210,10 @@ def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]: for future in done: self._print_report() if future.exception(): - logger.error( - f"Failed to delete entity {futures[future]}: {future.exception()}" - ) self.report.failure( - f"Failed to delete entity {futures[future]}", + title="Failed to delete entity", + message="Failed to delete entity", + context=futures[future], exc=future.exception(), ) self.report.num_soft_deleted_entity_processed += 1 @@ -222,16 +228,16 @@ def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]: time.sleep(self.config.delay) return futures - def _get_soft_deleted_queries(self) -> Iterable[str]: + def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]: assert self.ctx.graph scroll_id: Optional[str] = None while True: try: result = self.ctx.graph.execute_graphql( - QUERY_QUERY_ENTITY, + graphql_query, { "input": { - "types": ["QUERY"], + "types": [entity_type], "query": "*", "scrollId": scroll_id if scroll_id else None, "count": self.config.batch_size, @@ -251,11 +257,11 @@ def _get_soft_deleted_queries(self) -> Iterable[str]: ) except Exception as e: self.report.failure( - f"While trying to get queries with {scroll_id}", exc=e + f"While trying to get {entity_type} with {scroll_id}", exc=e ) 
break scroll_across_entities = result.get("scrollAcrossEntities") - if not scroll_across_entities: + if not scroll_across_entities or not scroll_across_entities.get("count"): break scroll_id = scroll_across_entities.get("nextScrollId") self.report.num_queries_found += scroll_across_entities.get("count") @@ -272,7 +278,28 @@ def _get_urns(self) -> Iterable[str]: status=RemovedStatusFilter.ONLY_SOFT_DELETED, batch_size=self.config.batch_size, ) - yield from self._get_soft_deleted_queries() + yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY") + yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE") + + def _times_up(self) -> bool: + if ( + self.config.runtime_limit_seconds + and time.time() - self.start_time > self.config.runtime_limit_seconds + ): + with self._report_lock: + self.report.runtime_limit_reached = True + return True + return False + + def _deletion_limit_reached(self) -> bool: + if ( + self.config.limit_entities_delete + and self.report.num_hard_deleted > self.config.limit_entities_delete + ): + with self._report_lock: + self.report.deletion_limit_reached = True + return True + return False def cleanup_soft_deleted_entities(self) -> None: if not self.config.enabled: @@ -285,24 +312,8 @@ def cleanup_soft_deleted_entities(self) -> None: self._print_report() while len(futures) >= self.config.futures_max_at_time: futures = self._process_futures(futures) - if ( - self.config.limit_entities_delete - and self.report.num_hard_deleted > self.config.limit_entities_delete - ): - logger.info( - f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more." - ) - break - if ( - self.config.runtime_limit_seconds - and time.time() - self.start_time - > self.config.runtime_limit_seconds - ): - logger.info( - f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures." 
- ) + if self._deletion_limit_reached() or self._times_up(): break - future = executor.submit(self.delete_soft_deleted_entity, urn) futures[future] = urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index bfae3060013d5..4e9d0f68928a4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -300,11 +300,16 @@ class LookerDashboardSourceConfig( folder_path_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), - description="Allow or deny dashboards from specific folders. " + description="Allow or deny dashboards from specific folders using their fully qualified paths. " "For example: \n" "deny: \n" - " - sales/deprecated \n" - "This pattern will deny the ingestion of all dashboards and looks within the sales/deprecated folder. \n" + " - Shared/deprecated \n" + "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n" + "allow: \n" + " - Shared/sales \n" + "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n" + "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. " + "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. 
" "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 49f7941563c1a..5371017a2a321 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -423,10 +423,10 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit database = self.config.database logger.info(f"Processing db {database}") - self.report.report_ingestion_stage_start(METADATA_EXTRACTION) - self.db_tables[database] = defaultdict() - self.db_views[database] = defaultdict() - self.db_schemas.setdefault(database, {}) + with self.report.new_stage(METADATA_EXTRACTION): + self.db_tables[database] = defaultdict() + self.db_views[database] = defaultdict() + self.db_schemas.setdefault(database, {}) # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping # this fallback. For now, this gets us broad coverage quickly. 
@@ -462,12 +462,12 @@ def _extract_metadata( self.process_schemas(connection, database) ) - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage_v2( - connection=connection, - database=database, - lineage_extractor=lineage_extractor, - ) + with self.report.new_stage(LINEAGE_EXTRACTION): + yield from self.extract_lineage_v2( + connection=connection, + database=database, + lineage_extractor=lineage_extractor, + ) all_tables = self.get_all_tables() else: @@ -480,25 +480,25 @@ def _extract_metadata( or self.config.include_view_lineage or self.config.include_copy_lineage ): - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage( - connection=connection, all_tables=all_tables, database=database - ) + with self.report.new_stage(LINEAGE_EXTRACTION): + yield from self.extract_lineage( + connection=connection, all_tables=all_tables, database=database + ) - self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION) if self.config.include_usage_statistics: - yield from self.extract_usage( - connection=connection, all_tables=all_tables, database=database - ) + with self.report.new_stage(USAGE_EXTRACTION_INGESTION): + yield from self.extract_usage( + connection=connection, all_tables=all_tables, database=database + ) if self.config.is_profiling_enabled(): - self.report.report_ingestion_stage_start(PROFILING) - profiler = RedshiftProfiler( - config=self.config, - report=self.report, - state_handler=self.profiling_state_handler, - ) - yield from profiler.get_workunits(self.db_tables) + with self.report.new_stage(PROFILING): + profiler = RedshiftProfiler( + config=self.config, + report=self.report, + state_handler=self.profiling_state_handler, + ) + yield from profiler.get_workunits(self.db_tables) def process_schemas(self, connection, database): for schema in self.data_dictionary.get_schemas( @@ -633,8 +633,8 @@ def process_schema( else: logger.info("View processing disabled, skipping") - 
self.report.metadata_extraction_sec[report_key] = round( - timer.elapsed_seconds(), 2 + self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds( + digits=2 ) def _process_table( @@ -986,9 +986,7 @@ def extract_usage( yield from usage_extractor.get_usage_workunits(all_tables=all_tables) - self.report.usage_extraction_sec[database] = round( - timer.elapsed_seconds(), 2 - ) + self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2) def extract_lineage( self, @@ -1011,8 +1009,8 @@ def extract_lineage( database=database, connection=connection, all_tables=all_tables ) - self.report.lineage_extraction_sec[f"{database}"] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds( + digits=2 ) yield from self.generate_lineage( database, lineage_extractor=lineage_extractor @@ -1042,8 +1040,8 @@ def extract_lineage_v2( yield from lineage_extractor.generate() - self.report.lineage_extraction_sec[f"{database}"] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds( + digits=2 ) if self.redundant_lineage_run_skip_handler: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py index e0bf8b23dd0f7..d66a1ee18be40 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py @@ -182,38 +182,38 @@ def _get_workunits_internal( self.report.num_operational_stats_filtered = 0 if self.config.include_operational_stats: - self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS) - with PerfTimer() as timer: - # Generate operation aspect workunits - yield from self._gen_operation_aspect_workunits( - self.connection, all_tables - ) - self.report.operational_metadata_extraction_sec[ - self.config.database - ] = round(timer.elapsed_seconds(), 2) + 
with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS): + with PerfTimer() as timer: + # Generate operation aspect workunits + yield from self._gen_operation_aspect_workunits( + self.connection, all_tables + ) + self.report.operational_metadata_extraction_sec[ + self.config.database + ] = timer.elapsed_seconds(digits=2) # Generate aggregate events - self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION) - query: str = self.queries.usage_query( - start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT), - end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), - database=self.config.database, - ) - access_events_iterable: Iterable[ - RedshiftAccessEvent - ] = self._gen_access_events_from_history_query( - query, connection=self.connection, all_tables=all_tables - ) + with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION): + query: str = self.queries.usage_query( + start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT), + end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), + database=self.config.database, + ) + access_events_iterable: Iterable[ + RedshiftAccessEvent + ] = self._gen_access_events_from_history_query( + query, connection=self.connection, all_tables=all_tables + ) - aggregated_events: AggregatedAccessEvents = self._aggregate_access_events( - access_events_iterable - ) - # Generate usage workunits from aggregated events. - for time_bucket in aggregated_events.values(): - for aggregate in time_bucket.values(): - wu: MetadataWorkUnit = self._make_usage_stat(aggregate) - self.report.num_usage_workunits_emitted += 1 - yield wu + aggregated_events: AggregatedAccessEvents = self._aggregate_access_events( + access_events_iterable + ) + # Generate usage workunits from aggregated events. 
+ for time_bucket in aggregated_events.values(): + for aggregate in time_bucket.values(): + wu: MetadataWorkUnit = self._make_usage_stat(aggregate) + self.report.num_usage_workunits_emitted += 1 + yield wu def _gen_operation_aspect_workunits( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 12e5fb72b00de..2d61ce5985777 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -221,6 +221,14 @@ class SnowflakeV2Config( default=False, description="If enabled, uses the new queries extractor to extract queries from snowflake.", ) + include_queries: bool = Field( + default=True, + description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.", + ) + include_query_usage_statistics: bool = Field( + default=True, + description="If enabled, generate query popularity statistics. 
Only applicable if `use_queries_v2` is enabled.", + ) lazy_schema_resolver: bool = Field( default=True, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 6b200590d7ab6..e93ecf30171f6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -40,6 +40,7 @@ ColumnRef, DownstreamColumnRef, ) +from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.time import ts_millis_to_datetime @@ -239,6 +240,9 @@ def get_known_query_lineage( downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name) known_lineage = KnownQueryLineageInfo( + query_id=get_query_fingerprint( + query.query_text, self.identifiers.platform, fast=True + ), query_text=query.query_text, downstream=downstream_table_urn, upstreams=self.map_query_result_upstreams( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 36825dc33fe7d..b82734cbbe84e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -61,6 +61,7 @@ ColumnRef, DownstreamColumnRef, ) +from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList from datahub.utilities.perf_timer import PerfTimer @@ -475,10 +476,11 @@ def _parse_audit_log_row( entry = PreparsedQuery( # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better - # job at eliminating redundant / repetitive queries. 
As such, we don't include the fingerprint - # here so that the aggregator auto-generates one. - # query_id=res["query_fingerprint"], - query_id=None, + # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint + # here + query_id=get_query_fingerprint( + res["query_text"], self.identifiers.platform, fast=True + ), query_text=res["query_text"], upstreams=upstreams, downstream=downstream, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py index 030b2d43be81f..b24471f8666af 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py @@ -166,6 +166,3 @@ def _is_tag_scanned(self, tag_name: str) -> bool: def report_tag_processed(self, tag_name: str) -> None: self._processed_tags.add(tag_name) - - def set_ingestion_stage(self, database: str, stage: str) -> None: - self.report_ingestion_stage_start(f"{database}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 8a1bf15b7a7bc..6f09c26b08da2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -216,21 +216,23 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: try: for snowflake_db in self.databases: - self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION) - yield from self._process_database(snowflake_db) + with self.report.new_stage( + f"{snowflake_db.name}: {METADATA_EXTRACTION}" + ): + yield from self._process_database(snowflake_db) - self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE) - discovered_tables: List[str] = [ - 
self.identifiers.get_dataset_identifier( - table_name, schema.name, db.name - ) - for db in self.databases - for schema in db.schemas - for table_name in schema.tables - ] - if self.aggregator: - for entry in self._external_tables_ddl_lineage(discovered_tables): - self.aggregator.add(entry) + with self.report.new_stage(f"*: {EXTERNAL_TABLE_DDL_LINEAGE}"): + discovered_tables: List[str] = [ + self.identifiers.get_dataset_identifier( + table_name, schema.name, db.name + ) + for db in self.databases + for schema in db.schemas + for table_name in schema.tables + ] + if self.aggregator: + for entry in self._external_tables_ddl_lineage(discovered_tables): + self.aggregator.add(entry) except SnowflakePermissionError as e: self.structured_reporter.failure( @@ -332,8 +334,8 @@ def _process_database( yield from self._process_db_schemas(snowflake_db, db_tables) if self.profiler and db_tables: - self.report.set_ingestion_stage(snowflake_db.name, PROFILING) - yield from self.profiler.get_workunits(snowflake_db, db_tables) + with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"): + yield from self.profiler.get_workunits(snowflake_db, db_tables) def _process_db_schemas( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 4bdf559f293b5..85e4071aec07d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -146,59 +146,58 @@ def get_usage_workunits( if not self._should_ingest_usage(): return - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) - if self.report.edition == SnowflakeEdition.STANDARD.value: - logger.info( - "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported." 
- ) - return + with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"): + if self.report.edition == SnowflakeEdition.STANDARD.value: + logger.info( + "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported." + ) + return - logger.info("Checking usage date ranges") + logger.info("Checking usage date ranges") - self._check_usage_date_ranges() + self._check_usage_date_ranges() - # If permission error, execution returns from here - if ( - self.report.min_access_history_time is None - or self.report.max_access_history_time is None - ): - return + # If permission error, execution returns from here + if ( + self.report.min_access_history_time is None + or self.report.max_access_history_time is None + ): + return - # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation - # Now, we report the usage as well as operation metadata even if user email is absent + # NOTE: In earlier `snowflake-usage` connector, users with no email were not considered in usage counts as well as in operation + # Now, we report the usage as well as operation metadata even if user email is absent - if self.config.include_usage_stats: - yield from auto_empty_dataset_usage_statistics( - self._get_workunits_internal(discovered_datasets), - config=BaseTimeWindowConfig( - start_time=self.start_time, - end_time=self.end_time, - bucket_duration=self.config.bucket_duration, - ), - dataset_urns={ - self.identifiers.gen_dataset_urn(dataset_identifier) - for dataset_identifier in discovered_datasets - }, - ) + if self.config.include_usage_stats: + yield from auto_empty_dataset_usage_statistics( + self._get_workunits_internal(discovered_datasets), + config=BaseTimeWindowConfig( + start_time=self.start_time, + end_time=self.end_time, + bucket_duration=self.config.bucket_duration, + ), + dataset_urns={ + self.identifiers.gen_dataset_urn(dataset_identifier) + for dataset_identifier in 
discovered_datasets + }, + ) - self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS) + with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"): + if self.config.include_operational_stats: + # Generate the operation workunits. + access_events = self._get_snowflake_history() + for event in access_events: + yield from self._get_operation_aspect_work_unit( + event, discovered_datasets + ) - if self.config.include_operational_stats: - # Generate the operation workunits. - access_events = self._get_snowflake_history() - for event in access_events: - yield from self._get_operation_aspect_work_unit( - event, discovered_datasets + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. + self.redundant_run_skip_handler.update_state( + self.config.start_time, + self.config.end_time, + self.config.bucket_duration, ) - if self.redundant_run_skip_handler: - # Update the checkpoint state for this run. - self.redundant_run_skip_handler.update_state( - self.config.start_time, - self.config.end_time, - self.config.bucket_duration, - ) - def _get_workunits_internal( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: @@ -386,7 +385,7 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]: ) self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False) return - self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2) + self.report.access_history_query_secs = timer.elapsed_seconds(digits=2) for row in results: yield from self._process_snowflake_history_row(row) @@ -434,8 +433,8 @@ def _check_usage_date_ranges(self) -> None: self.report.max_access_history_time = db_row["MAX_TIME"].astimezone( tz=timezone.utc ) - self.report.access_history_range_query_secs = round( - timer.elapsed_seconds(), 2 + self.report.access_history_range_query_secs = timer.elapsed_seconds( + digits=2 ) def _get_operation_aspect_work_unit( diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 954e8a29c1a1b..c0385a8d5af30 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -480,8 +480,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, ) - self.report.set_ingestion_stage("*", METADATA_EXTRACTION) - yield from schema_extractor.get_workunits_internal() + with self.report.new_stage(f"*: {METADATA_EXTRACTION}"): + yield from schema_extractor.get_workunits_internal() databases = schema_extractor.databases @@ -513,45 +513,46 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: discovered_datasets = discovered_tables + discovered_views if self.config.use_queries_v2: - self.report.set_ingestion_stage("*", VIEW_PARSING) - yield from auto_workunit(self.aggregator.gen_metadata()) - - self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - - schema_resolver = self.aggregator._schema_resolver - - queries_extractor = SnowflakeQueriesExtractor( - connection=self.connection, - config=SnowflakeQueriesExtractorConfig( - window=self.config, - temporary_tables_pattern=self.config.temporary_tables_pattern, - include_lineage=self.config.include_table_lineage, - include_usage_statistics=self.config.include_usage_stats, - include_operations=self.config.include_operational_stats, - user_email_pattern=self.config.user_email_pattern, - ), - structured_report=self.report, - filters=self.filters, - identifiers=self.identifiers, - schema_resolver=schema_resolver, - discovered_tables=discovered_datasets, - graph=self.ctx.graph, - ) + with self.report.new_stage(f"*: {VIEW_PARSING}"): + yield from auto_workunit(self.aggregator.gen_metadata()) - # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different 
configs - # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors, - # it should be pretty straightforward to refactor this and only initialize the aggregator once. - self.report.queries_extractor = queries_extractor.report - yield from queries_extractor.get_workunits_internal() - queries_extractor.close() + with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"): + schema_resolver = self.aggregator._schema_resolver + + queries_extractor = SnowflakeQueriesExtractor( + connection=self.connection, + config=SnowflakeQueriesExtractorConfig( + window=self.config, + temporary_tables_pattern=self.config.temporary_tables_pattern, + include_lineage=self.config.include_table_lineage, + include_usage_statistics=self.config.include_usage_stats, + include_operations=self.config.include_operational_stats, + include_queries=self.config.include_queries, + include_query_usage_statistics=self.config.include_query_usage_statistics, + user_email_pattern=self.config.user_email_pattern, + ), + structured_report=self.report, + filters=self.filters, + identifiers=self.identifiers, + schema_resolver=schema_resolver, + discovered_tables=discovered_datasets, + graph=self.ctx.graph, + ) + + # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs + # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors, + # it should be pretty straightforward to refactor this and only initialize the aggregator once. 
+ self.report.queries_extractor = queries_extractor.report + yield from queries_extractor.get_workunits_internal() + queries_extractor.close() else: if self.lineage_extractor: - self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) - self.lineage_extractor.add_time_based_lineage_to_aggregator( - discovered_tables=discovered_tables, - discovered_views=discovered_views, - ) + with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"): + self.lineage_extractor.add_time_based_lineage_to_aggregator( + discovered_tables=discovered_tables, + discovered_views=discovered_views, + ) # This would emit view and external table ddl lineage # as well as query lineage via lineage_extractor diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index e42564975c3d1..5b76fe41d92e9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -878,7 +878,7 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit urns = self.schema_resolver.get_urns() if self.config.include_table_lineage or self.config.include_usage_statistics: - self.report.report_ingestion_stage_start("audit log extraction") - yield from self.get_audit_log_mcps(urns=urns) + with self.report.new_stage("Audit log extraction"): + yield from self.get_audit_log_mcps(urns=urns) yield from self.builder.gen_workunits() diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index d47e10c9eb5c6..2543cbe653ba7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2,9 +2,9 @@ import logging import re import time -from collections import OrderedDict -from dataclasses import dataclass -from datetime import datetime +from collections import 
OrderedDict, defaultdict +from dataclasses import dataclass, field as dataclass_field +from datetime import datetime, timedelta, timezone from functools import lru_cache from typing import ( Any, @@ -109,6 +109,7 @@ make_filter, make_fine_grained_lineage_class, make_upstream_class, + optimize_query_filter, published_datasource_graphql_query, query_metadata_cursor_based_pagination, sheet_graphql_query, @@ -117,6 +118,7 @@ ) from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo from datahub.ingestion.source.tableau.tableau_validation import check_user_role +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.metadata.com.linkedin.pegasus2avro.common import ( AuditStamp, ChangeAuditStamps, @@ -169,6 +171,8 @@ create_lineage_sql_parsed_result, ) from datahub.utilities import config_clean +from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.stats_collections import TopKDict from datahub.utilities.urns.dataset_urn import DatasetUrn try: @@ -195,6 +199,11 @@ 504, # Gateway Timeout ] +# From experience, this expiry time typically ranges from 50 minutes +# to 2 hours but might as well be configurable. 
We will allow upto +# 10 minutes of such expiry time +REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10) + logger: logging.Logger = logging.getLogger(__name__) # Replace / with | @@ -636,12 +645,42 @@ class SiteIdContentUrl: site_content_url: str -class TableauSourceReport(StaleEntityRemovalSourceReport): +@dataclass +class TableauSourceReport( + StaleEntityRemovalSourceReport, + IngestionStageReport, +): get_all_datasources_query_failed: bool = False num_get_datasource_query_failures: int = 0 num_datasource_field_skipped_no_name: int = 0 num_csql_field_skipped_no_name: int = 0 num_table_field_skipped_no_name: int = 0 + # timers + extract_usage_stats_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + populate_projects_registry_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_embedded_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_published_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_upstream_tables_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) # lineage num_tables_with_upstream_lineage: int = 0 num_upstream_table_lineage: int = 0 @@ -652,7 +691,15 @@ class TableauSourceReport(StaleEntityRemovalSourceReport): num_upstream_table_lineage_failed_parse_sql: int = 0 num_upstream_fine_grained_lineage_failed_parse_sql: int = 0 num_hidden_assets_skipped: 
int = 0 - logged_in_user: List[UserInfo] = [] + logged_in_user: List[UserInfo] = dataclass_field(default_factory=list) + + last_authenticated_at: Optional[datetime] = None + + num_expected_tableau_metadata_queries: int = 0 + num_actual_tableau_metadata_queries: int = 0 + tableau_server_error_stats: Dict[str, int] = dataclass_field( + default_factory=(lambda: defaultdict(int)) + ) def report_user_role(report: TableauSourceReport, server: Server) -> None: @@ -723,6 +770,7 @@ def _authenticate(self, site_content_url: str) -> None: try: logger.info(f"Authenticated to Tableau site: '{site_content_url}'") self.server = self.config.make_tableau_client(site_content_url) + self.report.last_authenticated_at = datetime.now(timezone.utc) report_user_role(report=self.report, server=self.server) # Note that we're not catching ConfigurationError, since we want that to throw. except ValueError as e: @@ -806,16 +854,20 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: site_source = TableauSiteSource( config=self.config, ctx=self.ctx, - site=site - if site - else SiteIdContentUrl( - site_id=self.server.site_id, site_content_url=self.config.site + site=( + site + if site + else SiteIdContentUrl( + site_id=self.server.site_id, + site_content_url=self.config.site, + ) ), report=self.report, server=self.server, platform=self.platform, ) yield from site_source.ingest_tableau_site() + except MetadataQueryException as md_exception: self.report.failure( title="Failed to Retrieve Tableau Metadata", @@ -924,6 +976,7 @@ def _re_authenticate(self) -> None: # Sign-in again may not be enough because Tableau sometimes caches invalid sessions # so we need to recreate the Tableau Server object self.server = self.config.make_tableau_client(self.site_content_url) + self.report.last_authenticated_at = datetime.now(timezone.utc) def _populate_usage_stat_registry(self) -> None: if self.server is None: @@ -1189,6 +1242,7 @@ def get_connection_object_page( ) try: assert self.server is not 
None + self.report.num_actual_tableau_metadata_queries += 1 query_data = query_metadata_cursor_based_pagination( server=self.server, main_query=query, @@ -1198,25 +1252,36 @@ def get_connection_object_page( qry_filter=query_filter, ) - except REAUTHENTICATE_ERRORS: - if not retry_on_auth_error: + except REAUTHENTICATE_ERRORS as e: + self.report.tableau_server_error_stats[e.__class__.__name__] += 1 + if not retry_on_auth_error or retries_remaining <= 0: raise - # If ingestion has been running for over 2 hours, the Tableau - # temporary credentials will expire. If this happens, this exception - # will be thrown, and we need to re-authenticate and retry. - self._re_authenticate() + # We have been getting some irregular authorization errors like below well before the expected expiry time + # - within few seconds of initial authentication . We'll retry without re-auth for such cases. + # : + # b'{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}' + if self.report.last_authenticated_at and ( + datetime.now(timezone.utc) - self.report.last_authenticated_at + > REGULAR_AUTH_EXPIRY_PERIOD + ): + # If ingestion has been running for over 2 hours, the Tableau + # temporary credentials will expire. If this happens, this exception + # will be thrown, and we need to re-authenticate and retry. + self._re_authenticate() + return self.get_connection_object_page( query=query, connection_type=connection_type, query_filter=query_filter, fetch_size=fetch_size, current_cursor=current_cursor, - retry_on_auth_error=False, + retry_on_auth_error=True, retries_remaining=retries_remaining - 1, ) except InternalServerError as ise: + self.report.tableau_server_error_stats[InternalServerError.__name__] += 1 # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry. # Extended with other retryable errors. 
if ise.code in RETRIABLE_ERROR_CODES: @@ -1229,13 +1294,14 @@ def get_connection_object_page( query_filter=query_filter, fetch_size=fetch_size, current_cursor=current_cursor, - retry_on_auth_error=False, + retry_on_auth_error=True, retries_remaining=retries_remaining - 1, ) else: raise ise except OSError: + self.report.tableau_server_error_stats[OSError.__name__] += 1 # In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04), # the request logic was changed to use threads. # https://github.com/tableau/server-client-python/commit/307d8a20a30f32c1ce615cca7c6a78b9b9bff081 @@ -1250,7 +1316,7 @@ def get_connection_object_page( query_filter=query_filter, fetch_size=fetch_size, current_cursor=current_cursor, - retry_on_auth_error=False, + retry_on_auth_error=True, retries_remaining=retries_remaining - 1, ) @@ -1338,7 +1404,7 @@ def get_connection_object_page( query_filter=query_filter, fetch_size=fetch_size, current_cursor=current_cursor, - retry_on_auth_error=False, + retry_on_auth_error=True, retries_remaining=retries_remaining, ) raise RuntimeError(f"Query {connection_type} error: {errors}") @@ -1363,6 +1429,8 @@ def get_connection_objects( query_filter: dict = {}, page_size_override: Optional[int] = None, ) -> Iterable[dict]: + query_filter = optimize_query_filter(query_filter) + # Calls the get_connection_object_page function to get the objects, # and automatically handles pagination. 
page_size = page_size_override or self.config.page_size @@ -1374,6 +1442,7 @@ def get_connection_objects( while has_next_page: filter_: str = make_filter(filter_page) + self.report.num_expected_tableau_metadata_queries += 1 ( connection_objects, current_cursor, @@ -3454,33 +3523,87 @@ def _create_workbook_properties( return {"permissions": json.dumps(groups)} if len(groups) > 0 else None def ingest_tableau_site(self): - # Initialise the dictionary to later look-up for chart and dashboard stat - if self.config.extract_usage_stats: - self._populate_usage_stat_registry() - - if self.config.permission_ingestion: - self._fetch_groups() - - # Populate the map of database names and database hostnames to be used later to map - # databases to platform instances. - if self.config.database_hostname_to_platform_instance_map: - self._populate_database_server_hostname_map() - - self._populate_projects_registry() - - if self.config.add_site_container: - yield from self.emit_site_container() - yield from self.emit_project_containers() - yield from self.emit_workbooks() - if self.sheet_ids: - yield from self.emit_sheets() - if self.dashboard_ids: - yield from self.emit_dashboards() - if self.embedded_datasource_ids_being_used: - yield from self.emit_embedded_datasources() - if self.datasource_ids_being_used: - yield from self.emit_published_datasources() - if self.custom_sql_ids_being_used: - yield from self.emit_custom_sql_datasources() - if self.database_tables: - yield from self.emit_upstream_tables() + with self.report.new_stage( + f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}" + ): + # Initialise the dictionary to later look-up for chart and dashboard stat + if self.config.extract_usage_stats: + with PerfTimer() as timer: + self._populate_usage_stat_registry() + self.report.extract_usage_stats_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.config.permission_ingestion: + with PerfTimer() as timer: + self._fetch_groups() + 
self.report.fetch_groups_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + # Populate the map of database names and database hostnames to be used later to map + # databases to platform instances. + if self.config.database_hostname_to_platform_instance_map: + with PerfTimer() as timer: + self._populate_database_server_hostname_map() + self.report.populate_database_server_hostname_map_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + with PerfTimer() as timer: + self._populate_projects_registry() + self.report.populate_projects_registry_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.config.add_site_container: + yield from self.emit_site_container() + yield from self.emit_project_containers() + + with PerfTimer() as timer: + yield from self.emit_workbooks() + self.report.emit_workbooks_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.sheet_ids: + with PerfTimer() as timer: + yield from self.emit_sheets() + self.report.emit_sheets_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.dashboard_ids: + with PerfTimer() as timer: + yield from self.emit_dashboards() + self.report.emit_dashboards_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.embedded_datasource_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_embedded_datasources() + self.report.emit_embedded_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.datasource_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_published_datasources() + self.report.emit_published_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.custom_sql_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_custom_sql_datasources() + self.report.emit_custom_sql_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + 
+ if self.database_tables: + with PerfTimer() as timer: + yield from self.emit_upstream_tables() + self.report.emit_upstream_tables_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py index 61b56c4bee5bd..8f9d81eb9a18c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py @@ -1,3 +1,4 @@ +import copy import html import json import logging @@ -35,6 +36,7 @@ UpstreamClass, ) from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult +from datahub.utilities.ordered_set import OrderedSet logger = logging.getLogger(__name__) @@ -1000,3 +1002,19 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]: ] return filter_pages + + +def optimize_query_filter(query_filter: dict) -> dict: + """ + Duplicates in the filter cause duplicates in the result, + leading to entities/aspects being emitted multiple times unnecessarily + """ + optimized_query = copy.deepcopy(query_filter) + + if query_filter.get(c.ID_WITH_IN): + optimized_query[c.ID_WITH_IN] = list(OrderedSet(query_filter[c.ID_WITH_IN])) + if query_filter.get(c.PROJECT_NAME_WITH_IN): + optimized_query[c.PROJECT_NAME_WITH_IN] = list( + OrderedSet(query_filter[c.PROJECT_NAME_WITH_IN]) + ) + return optimized_query diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 7bfa7fdb28aaf..43bd788f809c3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -26,9 +26,6 @@ gen_containers, ) from datahub.emitter.sql_parsing_builder import SqlParsingBuilder -from 
datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( - EnsureAspectSizeProcessor, -) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -263,90 +260,89 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, - EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - self.report.report_ingestion_stage_start("Ingestion Setup") - wait_on_warehouse = None - if self.config.include_hive_metastore: - self.report.report_ingestion_stage_start("Start warehouse") - # Can take several minutes, so start now and wait later - wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() - if wait_on_warehouse is None: - self.report.report_failure( - "initialization", - f"SQL warehouse {self.config.profiling.warehouse_id} not found", - ) - return - else: - # wait until warehouse is started - wait_on_warehouse.result() + with self.report.new_stage("Ingestion Setup"): + wait_on_warehouse = None + if self.config.include_hive_metastore: + with self.report.new_stage("Start warehouse"): + # Can take several minutes, so start now and wait later + wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() + if wait_on_warehouse is None: + self.report.report_failure( + "initialization", + f"SQL warehouse {self.config.profiling.warehouse_id} not found", + ) + return + else: + # wait until warehouse is started + wait_on_warehouse.result() if self.config.include_ownership: - self.report.report_ingestion_stage_start("Ingest service principals") - self.build_service_principal_map() - self.build_groups_map() + with self.report.new_stage("Ingest service principals"): + self.build_service_principal_map() + self.build_groups_map() if self.config.include_notebooks: - self.report.report_ingestion_stage_start("Ingest 
notebooks") - yield from self.process_notebooks() + with self.report.new_stage("Ingest notebooks"): + yield from self.process_notebooks() yield from self.process_metastores() yield from self.get_view_lineage() if self.config.include_notebooks: - self.report.report_ingestion_stage_start("Notebook lineage") - for notebook in self.notebooks.values(): - wu = self._gen_notebook_lineage(notebook) - if wu: - yield wu + with self.report.new_stage("Notebook lineage"): + for notebook in self.notebooks.values(): + wu = self._gen_notebook_lineage(notebook) + if wu: + yield wu if self.config.include_usage_statistics: - self.report.report_ingestion_stage_start("Ingest usage") - usage_extractor = UnityCatalogUsageExtractor( - config=self.config, - report=self.report, - proxy=self.unity_catalog_api_proxy, - table_urn_builder=self.gen_dataset_urn, - user_urn_builder=self.gen_user_urn, - ) - yield from usage_extractor.get_usage_workunits( - self.table_refs | self.view_refs - ) - - if self.config.is_profiling_enabled(): - self.report.report_ingestion_stage_start("Start warehouse") - # Need to start the warehouse again for profiling, - # as it may have been stopped after ingestion might take - # longer time to complete - wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() - if wait_on_warehouse is None: - self.report.report_failure( - "initialization", - f"SQL warehouse {self.config.profiling.warehouse_id} not found", + with self.report.new_stage("Ingest usage"): + usage_extractor = UnityCatalogUsageExtractor( + config=self.config, + report=self.report, + proxy=self.unity_catalog_api_proxy, + table_urn_builder=self.gen_dataset_urn, + user_urn_builder=self.gen_user_urn, + ) + yield from usage_extractor.get_usage_workunits( + self.table_refs | self.view_refs ) - return - else: - # wait until warehouse is started - wait_on_warehouse.result() - self.report.report_ingestion_stage_start("Profiling") - if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig): - 
yield from UnityCatalogAnalyzeProfiler( - self.config.profiling, - self.report, - self.unity_catalog_api_proxy, - self.gen_dataset_urn, - ).get_workunits(self.table_refs) - elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig): - yield from UnityCatalogGEProfiler( - sql_common_config=self.config, - profiling_config=self.config.profiling, - report=self.report, - ).get_workunits(list(self.tables.values())) - else: - raise ValueError("Unknown profiling config method") + if self.config.is_profiling_enabled(): + with self.report.new_stage("Start warehouse"): + # Need to start the warehouse again for profiling, + # as it may have been stopped after ingestion might take + # longer time to complete + wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() + if wait_on_warehouse is None: + self.report.report_failure( + "initialization", + f"SQL warehouse {self.config.profiling.warehouse_id} not found", + ) + return + else: + # wait until warehouse is started + wait_on_warehouse.result() + + with self.report.new_stage("Profiling"): + if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig): + yield from UnityCatalogAnalyzeProfiler( + self.config.profiling, + self.report, + self.unity_catalog_api_proxy, + self.gen_dataset_urn, + ).get_workunits(self.table_refs) + elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig): + yield from UnityCatalogGEProfiler( + sql_common_config=self.config, + profiling_config=self.config.profiling, + report=self.report, + ).get_workunits(list(self.tables.values())) + else: + raise ValueError("Unknown profiling config method") def build_service_principal_map(self) -> None: try: @@ -466,11 +462,11 @@ def process_schemas(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]: self.report.schemas.dropped(schema.id) continue - self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}") - yield from self.gen_schema_containers(schema) - yield from self.process_tables(schema) + with 
self.report.new_stage(f"Ingest schema {schema.id}"): + yield from self.gen_schema_containers(schema) + yield from self.process_tables(schema) - self.report.schemas.processed(schema.id) + self.report.schemas.processed(schema.id) def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]: for table in self.unity_catalog_api_proxy.tables(schema=schema): diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py index 2b7aae8330905..95c2345232a1e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py @@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str: return builder.make_user_urn(email.split("@")[0]) +def extract_user_email(user: str) -> Optional[str]: + """Extracts user email from user input + + >>> extract_user_email('urn:li:corpuser:abc@xyz.com') + 'abc@xyz.com' + >>> extract_user_email('urn:li:corpuser:abc') + >>> extract_user_email('abc@xyz.com') + 'abc@xyz.com' + """ + if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")): + user = user.split(":")[-1] + return user if "@" in user else None + + def make_usage_workunit( bucket_start_time: datetime, resource: ResourceType, @@ -104,7 +118,7 @@ def make_usage_workunit( DatasetUserUsageCountsClass( user=user_urn_builder(user), count=count, - userEmail=user if "@" in user else None, + userEmail=extract_user_email(user), ) for user, count in user_freq ], diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index ce683e64b3f46..130a36e254fef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -1,7 +1,7 @@ import logging +from contextlib import AbstractContextManager 
from dataclasses import dataclass, field from datetime import datetime, timezone -from typing import Optional from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.stats_collections import TopKDict @@ -22,25 +22,29 @@ @dataclass class IngestionStageReport: - ingestion_stage: Optional[str] = None ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) - _timer: Optional[PerfTimer] = field( - default=None, init=False, repr=False, compare=False - ) - - def report_ingestion_stage_start(self, stage: str) -> None: - if self._timer: - elapsed = round(self._timer.elapsed_seconds(), 2) - logger.info( - f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds", - stacklevel=2, - ) - if self.ingestion_stage: - self.ingestion_stage_durations[self.ingestion_stage] = elapsed - else: - self._timer = PerfTimer() - - self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" - logger.info(f"Stage started: {self.ingestion_stage}") + def new_stage(self, stage: str) -> "IngestionStageContext": + return IngestionStageContext(stage, self) + + +@dataclass +class IngestionStageContext(AbstractContextManager): + def __init__(self, stage: str, report: IngestionStageReport): + self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" + self._timer: PerfTimer = PerfTimer() + self._report = report + + def __enter__(self) -> "IngestionStageContext": + logger.info(f"Stage started: {self._ingestion_stage}") self._timer.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + elapsed = self._timer.elapsed_seconds(digits=2) + logger.info( + f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds", + stacklevel=2, + ) + self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed + return None diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/__init__.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/__init__.py new file mode 100644 index 
0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py new file mode 100644 index 0000000000000..1fd1585a91358 --- /dev/null +++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/custom_properties.py @@ -0,0 +1,79 @@ +from abc import abstractmethod +from typing import Dict, Optional, Tuple + +from typing_extensions import Self + +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath + + +class HasCustomPropertiesPatch(MetadataPatchProposal): + @classmethod + @abstractmethod + def _custom_properties_location(self) -> Tuple[str, PatchPath]: + ... + + def add_custom_property(self, key: str, value: str) -> Self: + """Add a custom property to the entity. + + Args: + key: The key of the custom property. + value: The value of the custom property. + + Returns: + The patch builder instance. + """ + aspect_name, path = self._custom_properties_location() + self._add_patch( + aspect_name, + "add", + path=(*path, key), + value=value, + ) + return self + + def add_custom_properties( + self, custom_properties: Optional[Dict[str, str]] = None + ) -> Self: + if custom_properties is not None: + for key, value in custom_properties.items(): + self.add_custom_property(key, value) + return self + + def remove_custom_property(self, key: str) -> Self: + """Remove a custom property from the entity. + + Args: + key: The key of the custom property to remove. + + Returns: + The patch builder instance. + """ + aspect_name, path = self._custom_properties_location() + self._add_patch( + aspect_name, + "remove", + path=(*path, key), + value={}, + ) + return self + + def set_custom_properties(self, custom_properties: Dict[str, str]) -> Self: + """Sets the custom properties of the entity. + + This method replaces all existing custom properties with the given dictionary. 
+ + Args: + custom_properties: A dictionary containing the custom properties to be set. + + Returns: + The patch builder instance. + """ + + aspect_name, path = self._custom_properties_location() + self._add_patch( + aspect_name, + "add", + path=path, + value=custom_properties, + ) + return self diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/ownership.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/ownership.py new file mode 100644 index 0000000000000..1e2c789c7def3 --- /dev/null +++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/ownership.py @@ -0,0 +1,67 @@ +from typing import List, Optional + +from typing_extensions import Self + +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.metadata.schema_classes import ( + OwnerClass, + OwnershipClass, + OwnershipTypeClass, +) + + +class HasOwnershipPatch(MetadataPatchProposal): + def add_owner(self, owner: OwnerClass) -> Self: + """Add an owner to the entity. + + Args: + owner: The Owner object to add. + + Returns: + The patch builder instance. + """ + self._add_patch( + OwnershipClass.ASPECT_NAME, + "add", + path=("owners", owner.owner, str(owner.type)), + value=owner, + ) + return self + + def remove_owner( + self, owner: str, owner_type: Optional[OwnershipTypeClass] = None + ) -> Self: + """Remove an owner from the entity. + + If owner_type is not provided, the owner will be removed regardless of ownership type. + + Args: + owner: The owner to remove. + owner_type: The ownership type of the owner (optional). + + Returns: + The patch builder instance. + """ + self._add_patch( + OwnershipClass.ASPECT_NAME, + "remove", + path=("owners", owner) + ((str(owner_type),) if owner_type else ()), + value=owner, + ) + return self + + def set_owners(self, owners: List[OwnerClass]) -> Self: + """Set the owners of the entity. + + This will effectively replace all existing owners with the new list - it doesn't really patch things. 
+ + Args: + owners: The list of owners to set. + + Returns: + The patch builder instance. + """ + self._add_patch( + OwnershipClass.ASPECT_NAME, "add", path=("owners",), value=owners + ) + return self diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/structured_properties.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/structured_properties.py new file mode 100644 index 0000000000000..48050bbad8e50 --- /dev/null +++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/structured_properties.py @@ -0,0 +1,72 @@ +from typing import List, Union + +from typing_extensions import Self + +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.metadata.schema_classes import ( + StructuredPropertiesClass, + StructuredPropertyValueAssignmentClass, +) +from datahub.utilities.urns.structured_properties_urn import ( + make_structured_property_urn, +) + + +class HasStructuredPropertiesPatch(MetadataPatchProposal): + def set_structured_property( + self, key: str, value: Union[str, float, List[Union[str, float]]] + ) -> Self: + """Add or update a structured property. + + Args: + key: the name of the property (either bare or urn form) + value: the value of the property (for multi-valued properties, this can be a list) + + Returns: + The patch builder instance. + """ + self.remove_structured_property(key) + self.add_structured_property(key, value) + return self + + def remove_structured_property(self, key: str) -> Self: + """Remove a structured property. + + Args: + key: the name of the property (either bare or urn form) + + Returns: + The patch builder instance. + """ + + self._add_patch( + StructuredPropertiesClass.ASPECT_NAME, + "remove", + path=("properties", make_structured_property_urn(key)), + value={}, + ) + return self + + def add_structured_property( + self, key: str, value: Union[str, float, List[Union[str, float]]] + ) -> Self: + """Add a structured property. 
+ + Args: + key: the name of the property (either bare or urn form) + value: the value of the property (for multi-valued properties, this value will be appended to the list) + + Returns: + The patch builder instance. + """ + + self._add_patch( + StructuredPropertiesClass.ASPECT_NAME, + "add", + path=("properties", make_structured_property_urn(key)), + value=StructuredPropertyValueAssignmentClass( + propertyUrn=make_structured_property_urn(key), + values=value if isinstance(value, list) else [value], + ), + ) + return self diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/tags.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/tags.py new file mode 100644 index 0000000000000..afbc9115ca6e2 --- /dev/null +++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/tags.py @@ -0,0 +1,42 @@ +from typing import Union + +from typing_extensions import Self + +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.metadata.schema_classes import ( + GlobalTagsClass as GlobalTags, + TagAssociationClass as Tag, +) +from datahub.metadata.urns import TagUrn, Urn + + +class HasTagsPatch(MetadataPatchProposal): + def add_tag(self, tag: Tag) -> Self: + """Adds a tag to the entity. + + Args: + tag: The Tag object representing the tag to be added. + + Returns: + The patch builder instance. + """ + + # TODO: Make this support raw strings, in addition to Tag objects. + self._add_patch( + GlobalTags.ASPECT_NAME, "add", path=("tags", tag.tag), value=tag + ) + return self + + def remove_tag(self, tag: Union[str, Urn]) -> Self: + """Removes a tag from the entity. + + Args: + tag: The tag to remove, specified as a string or Urn object. + + Returns: + The patch builder instance. 
+ """ + if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): + tag = TagUrn.create_from_id(tag) + self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=("tags", tag), value={}) + return self diff --git a/metadata-ingestion/src/datahub/specific/aspect_helpers/terms.py b/metadata-ingestion/src/datahub/specific/aspect_helpers/terms.py new file mode 100644 index 0000000000000..ae199124372b4 --- /dev/null +++ b/metadata-ingestion/src/datahub/specific/aspect_helpers/terms.py @@ -0,0 +1,43 @@ +from typing import Union + +from typing_extensions import Self + +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.metadata.schema_classes import ( + GlossaryTermAssociationClass as Term, + GlossaryTermsClass, +) +from datahub.metadata.urns import GlossaryTermUrn, Urn + + +class HasTermsPatch(MetadataPatchProposal): + def add_term(self, term: Term) -> Self: + """Adds a glossary term to the entity. + + Args: + term: The Term object representing the glossary term to be added. + + Returns: + The patch builder instance. + """ + # TODO: Make this support raw strings, in addition to Term objects. + self._add_patch( + GlossaryTermsClass.ASPECT_NAME, "add", path=("terms", term.urn), value=term + ) + return self + + def remove_term(self, term: Union[str, Urn]) -> Self: + """Removes a glossary term from the entity. + + Args: + term: The term to remove, specified as a string or Urn object. + + Returns: + The patch builder instance. 
+ """ + if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): + term = GlossaryTermUrn(term) + self._add_patch( + GlossaryTermsClass.ASPECT_NAME, "remove", path=("terms", term), value={} + ) + return self diff --git a/metadata-ingestion/src/datahub/specific/chart.py b/metadata-ingestion/src/datahub/specific/chart.py index 104a7c21a07e2..f44a2ffc0d68a 100644 --- a/metadata-ingestion/src/datahub/specific/chart.py +++ b/metadata-ingestion/src/datahub/specific/chart.py @@ -1,28 +1,29 @@ -from typing import Dict, List, Optional, Union +from typing import List, Optional, Tuple, Union -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath from datahub.metadata.schema_classes import ( AccessLevelClass, ChangeAuditStampsClass, ChartInfoClass as ChartInfo, ChartTypeClass, EdgeClass as Edge, - GlobalTagsClass as GlobalTags, - GlossaryTermAssociationClass as Term, - GlossaryTermsClass as GlossaryTerms, KafkaAuditHeaderClass, - OwnerClass as Owner, - OwnershipTypeClass, SystemMetadataClass, - TagAssociationClass as Tag, ) -from datahub.specific.custom_properties import CustomPropertiesPatchHelper -from datahub.specific.ownership import OwnershipPatchHelper -from datahub.utilities.urns.tag_urn import TagUrn +from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch +from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch +from datahub.specific.aspect_helpers.tags import HasTagsPatch +from datahub.specific.aspect_helpers.terms import HasTermsPatch from datahub.utilities.urns.urn import Urn -class ChartPatchBuilder(MetadataPatchProposal): +class ChartPatchBuilder( + HasOwnershipPatch, + HasCustomPropertiesPatch, + HasTagsPatch, + HasTermsPatch, + MetadataPatchProposal, +): def __init__( self, urn: str, @@ -40,55 +41,10 @@ def __init__( super().__init__( urn, system_metadata=system_metadata, audit_header=audit_header ) - 
self.custom_properties_patch_helper = CustomPropertiesPatchHelper( - self, ChartInfo.ASPECT_NAME - ) - self.ownership_patch_helper = OwnershipPatchHelper(self) - - def add_owner(self, owner: Owner) -> "ChartPatchBuilder": - """ - Adds an owner to the ChartPatchBuilder. - - Args: - owner: The Owner object to add. - - Returns: - The ChartPatchBuilder instance. - """ - self.ownership_patch_helper.add_owner(owner) - return self - def remove_owner( - self, owner: str, owner_type: Optional[OwnershipTypeClass] = None - ) -> "ChartPatchBuilder": - """ - Removes an owner from the ChartPatchBuilder. - - Args: - owner: The owner to remove. - owner_type: The ownership type of the owner (optional). - - Returns: - The ChartPatchBuilder instance. - - Notes: - `owner_type` is optional. - """ - self.ownership_patch_helper.remove_owner(owner, owner_type) - return self - - def set_owners(self, owners: List[Owner]) -> "ChartPatchBuilder": - """ - Sets the owners of the ChartPatchBuilder. - - Args: - owners: A list of Owner objects. - - Returns: - The ChartPatchBuilder instance. 
- """ - self.ownership_patch_helper.set_owners(owners) - return self + @classmethod + def _custom_properties_location(cls) -> Tuple[str, PatchPath]: + return ChartInfo.ASPECT_NAME, ("customProperties",) def add_input_edge(self, input: Union[Edge, Urn, str]) -> "ChartPatchBuilder": """ @@ -120,7 +76,7 @@ def add_input_edge(self, input: Union[Edge, Urn, str]) -> "ChartPatchBuilder": self._add_patch( ChartInfo.ASPECT_NAME, "add", - path=f"/inputEdges/{self.quote(input_urn)}", + path=("inputEdges", input_urn), value=input_urn, ) return self @@ -138,7 +94,7 @@ def remove_input_edge(self, input: Union[str, Urn]) -> "ChartPatchBuilder": self._add_patch( ChartInfo.ASPECT_NAME, "remove", - path=f"/inputEdges/{self.quote(str(input))}", + path=("inputEdges", str(input)), value={}, ) return self @@ -159,129 +115,17 @@ def set_input_edges(self, inputs: List[Edge]) -> "ChartPatchBuilder": self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/inputEdges", + path=("inputEdges",), value=inputs, ) return self - def add_tag(self, tag: Tag) -> "ChartPatchBuilder": - """ - Adds a tag to the ChartPatchBuilder. - - Args: - tag: The Tag object representing the tag to be added. - - Returns: - The ChartPatchBuilder instance. - """ - self._add_patch( - GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag - ) - return self - - def remove_tag(self, tag: Union[str, Urn]) -> "ChartPatchBuilder": - """ - Removes a tag from the ChartPatchBuilder. - - Args: - tag: The tag to remove, specified as a string or Urn object. - - Returns: - The ChartPatchBuilder instance. - """ - if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): - tag = TagUrn.create_from_id(tag) - self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={}) - return self - - def add_term(self, term: Term) -> "ChartPatchBuilder": - """ - Adds a glossary term to the ChartPatchBuilder. - - Args: - term: The Term object representing the glossary term to be added. 
- - Returns: - The ChartPatchBuilder instance. - """ - self._add_patch( - GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term - ) - return self - - def remove_term(self, term: Union[str, Urn]) -> "ChartPatchBuilder": - """ - Removes a glossary term from the ChartPatchBuilder. - - Args: - term: The term to remove, specified as a string or Urn object. - - Returns: - The ChartPatchBuilder instance. - """ - if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): - term = "urn:li:glossaryTerm:" + term - self._add_patch( - GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={} - ) - return self - - def set_custom_properties( - self, custom_properties: Dict[str, str] - ) -> "ChartPatchBuilder": - """ - Sets the custom properties for the ChartPatchBuilder. - - Args: - custom_properties: A dictionary containing the custom properties to be set. - - Returns: - The ChartPatchBuilder instance. - - Notes: - This method replaces all existing custom properties with the given dictionary. - """ - self._add_patch( - ChartInfo.ASPECT_NAME, - "add", - path="/customProperties", - value=custom_properties, - ) - return self - - def add_custom_property(self, key: str, value: str) -> "ChartPatchBuilder": - """ - Adds a custom property to the ChartPatchBuilder. - - Args: - key: The key of the custom property. - value: The value of the custom property. - - Returns: - The ChartPatchBuilder instance. - """ - self.custom_properties_patch_helper.add_property(key, value) - return self - - def remove_custom_property(self, key: str) -> "ChartPatchBuilder": - """ - Removes a custom property from the ChartPatchBuilder. - - Args: - key: The key of the custom property to remove. - - Returns: - The ChartPatchBuilder instance. 
- """ - self.custom_properties_patch_helper.remove_property(key) - return self - def set_title(self, title: str) -> "ChartPatchBuilder": assert title, "ChartInfo title should not be None" self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/title", + path=("title",), value=title, ) @@ -292,7 +136,7 @@ def set_description(self, description: str) -> "ChartPatchBuilder": self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/description", + path=("description",), value=description, ) @@ -303,7 +147,7 @@ def set_last_refreshed(self, last_refreshed: Optional[int]) -> "ChartPatchBuilde self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/lastRefreshed", + path=("lastRefreshed",), value=last_refreshed, ) @@ -316,7 +160,7 @@ def set_last_modified( self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/lastModified", + path=("lastModified",), value=last_modified, ) @@ -327,7 +171,7 @@ def set_external_url(self, external_url: Optional[str]) -> "ChartPatchBuilder": self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/externalUrl", + path=("externalUrl",), value=external_url, ) return self @@ -337,7 +181,7 @@ def set_chart_url(self, dashboard_url: Optional[str]) -> "ChartPatchBuilder": self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/chartUrl", + path=("chartUrl",), value=dashboard_url, ) @@ -350,7 +194,7 @@ def set_type( self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/type", + path=("type",), value=type, ) @@ -363,7 +207,7 @@ def set_access( self._add_patch( ChartInfo.ASPECT_NAME, "add", - path="/access", + path=("access",), value=access, ) @@ -375,7 +219,7 @@ def add_inputs(self, input_urns: Optional[List[str]]) -> "ChartPatchBuilder": self._add_patch( aspect_name=ChartInfo.ASPECT_NAME, op="add", - path=f"/inputs/{urn}", + path=("inputs", urn), value=urn, ) diff --git a/metadata-ingestion/src/datahub/specific/custom_properties.py b/metadata-ingestion/src/datahub/specific/custom_properties.py deleted file mode 100644 index 
d399a448cc0c2..0000000000000 --- a/metadata-ingestion/src/datahub/specific/custom_properties.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import Generic, TypeVar - -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal - -_Parent = TypeVar("_Parent", bound=MetadataPatchProposal) - - -class CustomPropertiesPatchHelper(Generic[_Parent]): - def __init__( - self, - parent: _Parent, - aspect_name: str, - ) -> None: - self.aspect_name = aspect_name - self._parent = parent - self.aspect_field = "customProperties" - - def parent(self) -> _Parent: - return self._parent - - def add_property(self, key: str, value: str) -> "CustomPropertiesPatchHelper": - self._parent._add_patch( - self.aspect_name, - "add", - path=f"/{self.aspect_field}/{key}", - value=value, - ) - return self - - def remove_property(self, key: str) -> "CustomPropertiesPatchHelper": - self._parent._add_patch( - self.aspect_name, - "remove", - path=f"/{self.aspect_field}/{key}", - value={}, - ) - return self diff --git a/metadata-ingestion/src/datahub/specific/dashboard.py b/metadata-ingestion/src/datahub/specific/dashboard.py index da5abbfd1dc12..515fcf0c6da95 100644 --- a/metadata-ingestion/src/datahub/specific/dashboard.py +++ b/metadata-ingestion/src/datahub/specific/dashboard.py @@ -1,27 +1,28 @@ -from typing import Dict, List, Optional, Union +from typing import List, Optional, Tuple, Union -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath from datahub.metadata.schema_classes import ( AccessLevelClass, ChangeAuditStampsClass, DashboardInfoClass as DashboardInfo, EdgeClass as Edge, - GlobalTagsClass as GlobalTags, - GlossaryTermAssociationClass as Term, - GlossaryTermsClass as GlossaryTerms, KafkaAuditHeaderClass, - OwnerClass as Owner, - OwnershipTypeClass, SystemMetadataClass, - TagAssociationClass as Tag, ) -from datahub.specific.custom_properties import CustomPropertiesPatchHelper -from 
datahub.specific.ownership import OwnershipPatchHelper -from datahub.utilities.urns.tag_urn import TagUrn +from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch +from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch +from datahub.specific.aspect_helpers.tags import HasTagsPatch +from datahub.specific.aspect_helpers.terms import HasTermsPatch from datahub.utilities.urns.urn import Urn -class DashboardPatchBuilder(MetadataPatchProposal): +class DashboardPatchBuilder( + HasOwnershipPatch, + HasCustomPropertiesPatch, + HasTagsPatch, + HasTermsPatch, + MetadataPatchProposal, +): def __init__( self, urn: str, @@ -39,55 +40,10 @@ def __init__( super().__init__( urn, system_metadata=system_metadata, audit_header=audit_header ) - self.custom_properties_patch_helper = CustomPropertiesPatchHelper( - self, DashboardInfo.ASPECT_NAME - ) - self.ownership_patch_helper = OwnershipPatchHelper(self) - - def add_owner(self, owner: Owner) -> "DashboardPatchBuilder": - """ - Adds an owner to the DashboardPatchBuilder. - - Args: - owner: The Owner object to add. - - Returns: - The DashboardPatchBuilder instance. - """ - self.ownership_patch_helper.add_owner(owner) - return self - - def remove_owner( - self, owner: str, owner_type: Optional[OwnershipTypeClass] = None - ) -> "DashboardPatchBuilder": - """ - Removes an owner from the DashboardPatchBuilder. - - Args: - owner: The owner to remove. - owner_type: The ownership type of the owner (optional). - - Returns: - The DashboardPatchBuilder instance. - - Notes: - `owner_type` is optional. - """ - self.ownership_patch_helper.remove_owner(owner, owner_type) - return self - - def set_owners(self, owners: List[Owner]) -> "DashboardPatchBuilder": - """ - Sets the owners of the DashboardPatchBuilder. - Args: - owners: A list of Owner objects. - - Returns: - The DashboardPatchBuilder instance. 
- """ - self.ownership_patch_helper.set_owners(owners) - return self + @classmethod + def _custom_properties_location(cls) -> Tuple[str, PatchPath]: + return DashboardInfo.ASPECT_NAME, ("customProperties",) def add_dataset_edge( self, dataset: Union[Edge, Urn, str] @@ -126,7 +82,7 @@ def add_dataset_edge( self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path=f"/datasetEdges/{self.quote(dataset_urn)}", + path=("datasetEdges", dataset_urn), value=dataset_edge, ) return self @@ -144,7 +100,7 @@ def remove_dataset_edge(self, dataset: Union[str, Urn]) -> "DashboardPatchBuilde self._add_patch( DashboardInfo.ASPECT_NAME, "remove", - path=f"/datasetEdges/{dataset}", + path=("datasetEdges", dataset), value={}, ) return self @@ -169,7 +125,7 @@ def set_dataset_edges(self, datasets: List[Edge]) -> "DashboardPatchBuilder": self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/datasetEdges", + path=("datasetEdges",), value=datasets, ) return self @@ -209,7 +165,7 @@ def add_chart_edge(self, chart: Union[Edge, Urn, str]) -> "DashboardPatchBuilder self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path=f"/chartEdges/{self.quote(chart_urn)}", + path=("chartEdges", chart_urn), value=chart_edge, ) return self @@ -227,7 +183,7 @@ def remove_chart_edge(self, chart: Union[str, Urn]) -> "DashboardPatchBuilder": self._add_patch( DashboardInfo.ASPECT_NAME, "remove", - path=f"/chartEdges/{chart}", + path=("chartEdges", chart), value={}, ) return self @@ -252,129 +208,17 @@ def set_chart_edges(self, charts: List[Edge]) -> "DashboardPatchBuilder": self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/chartEdges", + path=("chartEdges",), value=charts, ) return self - def add_tag(self, tag: Tag) -> "DashboardPatchBuilder": - """ - Adds a tag to the DashboardPatchBuilder. - - Args: - tag: The Tag object representing the tag to be added. - - Returns: - The DashboardPatchBuilder instance. 
- """ - self._add_patch( - GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag - ) - return self - - def remove_tag(self, tag: Union[str, Urn]) -> "DashboardPatchBuilder": - """ - Removes a tag from the DashboardPatchBuilder. - - Args: - tag: The tag to remove, specified as a string or Urn object. - - Returns: - The DashboardPatchBuilder instance. - """ - if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): - tag = TagUrn.create_from_id(tag) - self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={}) - return self - - def add_term(self, term: Term) -> "DashboardPatchBuilder": - """ - Adds a glossary term to the DashboardPatchBuilder. - - Args: - term: The Term object representing the glossary term to be added. - - Returns: - The DashboardPatchBuilder instance. - """ - self._add_patch( - GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term - ) - return self - - def remove_term(self, term: Union[str, Urn]) -> "DashboardPatchBuilder": - """ - Removes a glossary term from the DashboardPatchBuilder. - - Args: - term: The term to remove, specified as a string or Urn object. - - Returns: - The DashboardPatchBuilder instance. - """ - if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): - term = "urn:li:glossaryTerm:" + term - self._add_patch( - GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={} - ) - return self - - def set_custom_properties( - self, custom_properties: Dict[str, str] - ) -> "DashboardPatchBuilder": - """ - Sets the custom properties for the DashboardPatchBuilder. - - Args: - custom_properties: A dictionary containing the custom properties to be set. - - Returns: - The DashboardPatchBuilder instance. - - Notes: - This method replaces all existing custom properties with the given dictionary. 
- """ - self._add_patch( - DashboardInfo.ASPECT_NAME, - "add", - path="/customProperties", - value=custom_properties, - ) - return self - - def add_custom_property(self, key: str, value: str) -> "DashboardPatchBuilder": - """ - Adds a custom property to the DashboardPatchBuilder. - - Args: - key: The key of the custom property. - value: The value of the custom property. - - Returns: - The DashboardPatchBuilder instance. - """ - self.custom_properties_patch_helper.add_property(key, value) - return self - - def remove_custom_property(self, key: str) -> "DashboardPatchBuilder": - """ - Removes a custom property from the DashboardPatchBuilder. - - Args: - key: The key of the custom property to remove. - - Returns: - The DashboardPatchBuilder instance. - """ - self.custom_properties_patch_helper.remove_property(key) - return self - def set_title(self, title: str) -> "DashboardPatchBuilder": assert title, "DashboardInfo title should not be None" self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/title", + path=("title",), value=title, ) @@ -385,27 +229,18 @@ def set_description(self, description: str) -> "DashboardPatchBuilder": self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/description", + path=("description",), value=description, ) return self - def add_custom_properties( - self, custom_properties: Optional[Dict[str, str]] = None - ) -> "DashboardPatchBuilder": - if custom_properties: - for key, value in custom_properties.items(): - self.custom_properties_patch_helper.add_property(key, value) - - return self - def set_external_url(self, external_url: Optional[str]) -> "DashboardPatchBuilder": if external_url: self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/externalUrl", + path=("externalUrl",), value=external_url, ) return self @@ -416,7 +251,7 @@ def add_charts(self, chart_urns: Optional[List[str]]) -> "DashboardPatchBuilder" self._add_patch( aspect_name=DashboardInfo.ASPECT_NAME, op="add", - path=f"/charts/{urn}", + path=("charts", 
urn), value=urn, ) @@ -430,7 +265,7 @@ def add_datasets( self._add_patch( aspect_name=DashboardInfo.ASPECT_NAME, op="add", - path=f"/datasets/{urn}", + path=("datasets", urn), value=urn, ) @@ -443,7 +278,7 @@ def set_dashboard_url( self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/dashboardUrl", + path=("dashboardUrl",), value=dashboard_url, ) @@ -456,7 +291,7 @@ def set_access( self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/access", + path=("access",), value=access, ) @@ -469,7 +304,7 @@ def set_last_refreshed( self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/lastRefreshed", + path=("lastRefreshed",), value=last_refreshed, ) @@ -482,7 +317,7 @@ def set_last_modified( self._add_patch( DashboardInfo.ASPECT_NAME, "add", - path="/lastModified", + path=("lastModified",), value=last_modified, ) diff --git a/metadata-ingestion/src/datahub/specific/datajob.py b/metadata-ingestion/src/datahub/specific/datajob.py index 6ff4741b09c26..fd826c6dd59ca 100644 --- a/metadata-ingestion/src/datahub/specific/datajob.py +++ b/metadata-ingestion/src/datahub/specific/datajob.py @@ -1,25 +1,27 @@ -from typing import Dict, List, Optional, Union +from typing import List, Optional, Tuple, Union -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath from datahub.metadata.schema_classes import ( DataJobInfoClass as DataJobInfo, DataJobInputOutputClass as DataJobInputOutput, EdgeClass as Edge, - GlobalTagsClass as GlobalTags, - GlossaryTermAssociationClass as Term, - GlossaryTermsClass as GlossaryTerms, KafkaAuditHeaderClass, - OwnerClass as Owner, - OwnershipTypeClass, SystemMetadataClass, - TagAssociationClass as Tag, ) -from datahub.metadata.urns import SchemaFieldUrn, TagUrn, Urn -from datahub.specific.custom_properties import CustomPropertiesPatchHelper -from datahub.specific.ownership import OwnershipPatchHelper - - -class 
DataJobPatchBuilder(MetadataPatchProposal): +from datahub.metadata.urns import SchemaFieldUrn, Urn +from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch +from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch +from datahub.specific.aspect_helpers.tags import HasTagsPatch +from datahub.specific.aspect_helpers.terms import HasTermsPatch + + +class DataJobPatchBuilder( + HasOwnershipPatch, + HasCustomPropertiesPatch, + HasTagsPatch, + HasTermsPatch, + MetadataPatchProposal, +): def __init__( self, urn: str, @@ -37,55 +39,10 @@ def __init__( super().__init__( urn, system_metadata=system_metadata, audit_header=audit_header ) - self.custom_properties_patch_helper = CustomPropertiesPatchHelper( - self, DataJobInfo.ASPECT_NAME - ) - self.ownership_patch_helper = OwnershipPatchHelper(self) - - def add_owner(self, owner: Owner) -> "DataJobPatchBuilder": - """ - Adds an owner to the DataJobPatchBuilder. - - Args: - owner: The Owner object to add. - - Returns: - The DataJobPatchBuilder instance. - """ - self.ownership_patch_helper.add_owner(owner) - return self - - def remove_owner( - self, owner: str, owner_type: Optional[OwnershipTypeClass] = None - ) -> "DataJobPatchBuilder": - """ - Removes an owner from the DataJobPatchBuilder. - - Args: - owner: The owner to remove. - owner_type: The ownership type of the owner (optional). - - Returns: - The DataJobPatchBuilder instance. - - Notes: - `owner_type` is optional. - """ - self.ownership_patch_helper.remove_owner(owner, owner_type) - return self - - def set_owners(self, owners: List[Owner]) -> "DataJobPatchBuilder": - """ - Sets the owners of the DataJobPatchBuilder. - - Args: - owners: A list of Owner objects. - Returns: - The DataJobPatchBuilder instance. 
- """ - self.ownership_patch_helper.set_owners(owners) - return self + @classmethod + def _custom_properties_location(cls) -> Tuple[str, PatchPath]: + return DataJobInfo.ASPECT_NAME, ("customProperties",) def add_input_datajob(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilder": """ @@ -120,7 +77,7 @@ def add_input_datajob(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilde self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path=f"/inputDatajobEdges/{self.quote(input_urn)}", + path=("inputDatajobEdges", input_urn), value=input_edge, ) return self @@ -138,7 +95,7 @@ def remove_input_datajob(self, input: Union[str, Urn]) -> "DataJobPatchBuilder": self._add_patch( DataJobInputOutput.ASPECT_NAME, "remove", - path=f"/inputDatajobEdges/{input}", + path=("inputDatajobEdges", input), value={}, ) return self @@ -163,7 +120,7 @@ def set_input_datajobs(self, inputs: List[Edge]) -> "DataJobPatchBuilder": self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path="/inputDatajobEdges", + path=("inputDatajobEdges",), value=inputs, ) return self @@ -201,7 +158,7 @@ def add_input_dataset(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilde self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path=f"/inputDatasetEdges/{self.quote(input_urn)}", + path=("inputDatasetEdges", input_urn), value=input_edge, ) return self @@ -219,7 +176,7 @@ def remove_input_dataset(self, input: Union[str, Urn]) -> "DataJobPatchBuilder": self._add_patch( DataJobInputOutput.ASPECT_NAME, "remove", - path=f"/inputDatasetEdges/{self.quote(str(input))}", + path=("inputDatasetEdges", input), value={}, ) return self @@ -244,7 +201,7 @@ def set_input_datasets(self, inputs: List[Edge]) -> "DataJobPatchBuilder": self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path="/inputDatasetEdges", + path=("inputDatasetEdges",), value=inputs, ) return self @@ -284,7 +241,7 @@ def add_output_dataset( self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - 
path=f"/outputDatasetEdges/{self.quote(output_urn)}", + path=("outputDatasetEdges", output_urn), value=output_edge, ) return self @@ -302,7 +259,7 @@ def remove_output_dataset(self, output: Union[str, Urn]) -> "DataJobPatchBuilder self._add_patch( DataJobInputOutput.ASPECT_NAME, "remove", - path=f"/outputDatasetEdges/{self.quote(str(output))}", + path=("outputDatasetEdges", output), value={}, ) return self @@ -327,7 +284,7 @@ def set_output_datasets(self, outputs: List[Edge]) -> "DataJobPatchBuilder": self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path="/outputDatasetEdges", + path=("outputDatasetEdges",), value=outputs, ) return self @@ -351,7 +308,7 @@ def add_input_dataset_field(self, input: Union[Urn, str]) -> "DataJobPatchBuilde self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path=f"/inputDatasetFields/{self.quote(input_urn)}", + path=("inputDatasetFields", input_urn), value={}, ) return self @@ -372,7 +329,7 @@ def remove_input_dataset_field( self._add_patch( DataJobInputOutput.ASPECT_NAME, "remove", - path=f"/inputDatasetFields/{self.quote(input_urn)}", + path=("inputDatasetFields", input_urn), value={}, ) return self @@ -397,7 +354,7 @@ def set_input_dataset_fields(self, inputs: List[Edge]) -> "DataJobPatchBuilder": self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path="/inputDatasetFields", + path=("inputDatasetFields",), value=inputs, ) return self @@ -423,7 +380,7 @@ def add_output_dataset_field( self._add_patch( DataJobInputOutput.ASPECT_NAME, "add", - path=f"/outputDatasetFields/{self.quote(output_urn)}", + path=("outputDatasetFields", output_urn), value={}, ) return self @@ -444,7 +401,7 @@ def remove_output_dataset_field( self._add_patch( DataJobInputOutput.ASPECT_NAME, "remove", - path=f"/outputDatasetFields/{self.quote(output_urn)}", + path=("outputDatasetFields", output_urn), value={}, ) return self @@ -469,119 +426,7 @@ def set_output_dataset_fields(self, outputs: List[Edge]) -> "DataJobPatchBuilder self._add_patch( 
DataJobInputOutput.ASPECT_NAME, "add", - path="/outputDatasetFields", + path=("outputDatasetFields",), value=outputs, ) return self - - def add_tag(self, tag: Tag) -> "DataJobPatchBuilder": - """ - Adds a tag to the DataJobPatchBuilder. - - Args: - tag: The Tag object representing the tag to be added. - - Returns: - The DataJobPatchBuilder instance. - """ - self._add_patch( - GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag - ) - return self - - def remove_tag(self, tag: Union[str, Urn]) -> "DataJobPatchBuilder": - """ - Removes a tag from the DataJobPatchBuilder. - - Args: - tag: The tag to remove, specified as a string or Urn object. - - Returns: - The DataJobPatchBuilder instance. - """ - if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): - tag = TagUrn.create_from_id(tag) - self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={}) - return self - - def add_term(self, term: Term) -> "DataJobPatchBuilder": - """ - Adds a glossary term to the DataJobPatchBuilder. - - Args: - term: The Term object representing the glossary term to be added. - - Returns: - The DataJobPatchBuilder instance. - """ - self._add_patch( - GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term - ) - return self - - def remove_term(self, term: Union[str, Urn]) -> "DataJobPatchBuilder": - """ - Removes a glossary term from the DataJobPatchBuilder. - - Args: - term: The term to remove, specified as a string or Urn object. - - Returns: - The DataJobPatchBuilder instance. - """ - if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): - term = "urn:li:glossaryTerm:" + term - self._add_patch( - GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={} - ) - return self - - def set_custom_properties( - self, custom_properties: Dict[str, str] - ) -> "DataJobPatchBuilder": - """ - Sets the custom properties for the DataJobPatchBuilder. 
- - Args: - custom_properties: A dictionary containing the custom properties to be set. - - Returns: - The DataJobPatchBuilder instance. - - Notes: - This method replaces all existing custom properties with the given dictionary. - """ - self._add_patch( - DataJobInfo.ASPECT_NAME, - "add", - path="/customProperties", - value=custom_properties, - ) - return self - - def add_custom_property(self, key: str, value: str) -> "DataJobPatchBuilder": - """ - Adds a custom property to the DataJobPatchBuilder. - - Args: - key: The key of the custom property. - value: The value of the custom property. - - Returns: - The DataJobPatchBuilder instance. - """ - self.custom_properties_patch_helper.add_property(key, value) - return self - - def remove_custom_property(self, key: str) -> "DataJobPatchBuilder": - """ - Removes a custom property from the DataJobPatchBuilder. - - Args: - key: The key of the custom property to remove. - - Returns: - The DataJobPatchBuilder instance. - """ - self.custom_properties_patch_helper.remove_property(key) - return self diff --git a/metadata-ingestion/src/datahub/specific/dataproduct.py b/metadata-ingestion/src/datahub/specific/dataproduct.py index f9830a4b23df0..d38d2d4156315 100644 --- a/metadata-ingestion/src/datahub/specific/dataproduct.py +++ b/metadata-ingestion/src/datahub/specific/dataproduct.py @@ -1,25 +1,25 @@ -from typing import Dict, List, Optional, Union +from typing import List, Optional, Tuple -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath from datahub.metadata.schema_classes import ( DataProductAssociationClass as DataProductAssociation, DataProductPropertiesClass as DataProductProperties, - GlobalTagsClass as GlobalTags, - GlossaryTermAssociationClass as Term, - GlossaryTermsClass as GlossaryTerms, KafkaAuditHeaderClass, - OwnerClass as Owner, - OwnershipTypeClass, SystemMetadataClass, - TagAssociationClass as Tag, ) -from 
datahub.specific.custom_properties import CustomPropertiesPatchHelper -from datahub.specific.ownership import OwnershipPatchHelper -from datahub.utilities.urns.tag_urn import TagUrn -from datahub.utilities.urns.urn import Urn - - -class DataProductPatchBuilder(MetadataPatchProposal): +from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch +from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch +from datahub.specific.aspect_helpers.tags import HasTagsPatch +from datahub.specific.aspect_helpers.terms import HasTermsPatch + + +class DataProductPatchBuilder( + HasOwnershipPatch, + HasCustomPropertiesPatch, + HasTagsPatch, + HasTermsPatch, + MetadataPatchProposal, +): def __init__( self, urn: str, @@ -31,59 +31,16 @@ def __init__( system_metadata=system_metadata, audit_header=audit_header, ) - self.custom_properties_patch_helper = CustomPropertiesPatchHelper( - self, DataProductProperties.ASPECT_NAME - ) - self.ownership_patch_helper = OwnershipPatchHelper(self) - - def add_owner(self, owner: Owner) -> "DataProductPatchBuilder": - self.ownership_patch_helper.add_owner(owner) - return self - - def remove_owner( - self, owner: str, owner_type: Optional[OwnershipTypeClass] = None - ) -> "DataProductPatchBuilder": - """ - param: owner_type is optional - """ - self.ownership_patch_helper.remove_owner(owner, owner_type) - return self - - def set_owners(self, owners: List[Owner]) -> "DataProductPatchBuilder": - self.ownership_patch_helper.set_owners(owners) - return self - - def add_tag(self, tag: Tag) -> "DataProductPatchBuilder": - self._add_patch( - GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag - ) - return self - - def remove_tag(self, tag: Union[str, Urn]) -> "DataProductPatchBuilder": - if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): - tag = TagUrn.create_from_id(tag) - self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={}) - return self - - def add_term(self, 
term: Term) -> "DataProductPatchBuilder": - self._add_patch( - GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term - ) - return self - def remove_term(self, term: Union[str, Urn]) -> "DataProductPatchBuilder": - if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): - term = "urn:li:glossaryTerm:" + term - self._add_patch( - GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={} - ) - return self + @classmethod + def _custom_properties_location(cls) -> Tuple[str, PatchPath]: + return DataProductProperties.ASPECT_NAME, ("customProperties",) def set_name(self, name: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path="/name", + path=("name",), value=name, ) return self @@ -92,37 +49,18 @@ def set_description(self, description: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path="/description", + path=("description",), value=description, ) return self - def set_custom_properties( - self, custom_properties: Dict[str, str] - ) -> "DataProductPatchBuilder": - self._add_patch( - DataProductProperties.ASPECT_NAME, - "add", - path="/customProperties", - value=custom_properties, - ) - return self - - def add_custom_property(self, key: str, value: str) -> "DataProductPatchBuilder": - self.custom_properties_patch_helper.add_property(key, value) - return self - - def remove_custom_property(self, key: str) -> "DataProductPatchBuilder": - self.custom_properties_patch_helper.remove_property(key) - return self - def set_assets( self, assets: List[DataProductAssociation] ) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path="/assets", + path=("assets",), value=assets, ) return self @@ -131,7 +69,7 @@ def add_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path=f"/assets/{self.quote(asset_urn)}", + path=("assets", 
asset_urn), value=DataProductAssociation(destinationUrn=asset_urn), ) return self @@ -140,7 +78,7 @@ def remove_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "remove", - path=f"/assets/{self.quote(asset_urn)}", + path=("assets", asset_urn), value={}, ) return self @@ -149,7 +87,7 @@ def set_external_url(self, external_url: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path="/externalUrl", + path=("externalUrl",), value=external_url, ) return self diff --git a/metadata-ingestion/src/datahub/specific/dataset.py b/metadata-ingestion/src/datahub/specific/dataset.py index b171dc4cc2939..6332386684bbf 100644 --- a/metadata-ingestion/src/datahub/specific/dataset.py +++ b/metadata-ingestion/src/datahub/specific/dataset.py @@ -1,27 +1,27 @@ -from typing import Dict, Generic, List, Optional, Tuple, TypeVar, Union +from typing import Generic, List, Optional, Tuple, TypeVar, Union -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp from datahub.metadata.schema_classes import ( DatasetPropertiesClass as DatasetProperties, EditableDatasetPropertiesClass as EditableDatasetProperties, EditableSchemaMetadataClass as EditableSchemaMetadata, FineGrainedLineageClass as FineGrainedLineage, - GlobalTagsClass as GlobalTags, GlossaryTermAssociationClass as Term, - GlossaryTermsClass as GlossaryTerms, KafkaAuditHeaderClass, - OwnerClass as Owner, - OwnershipTypeClass, SchemaMetadataClass, SystemMetadataClass, TagAssociationClass as Tag, UpstreamClass as Upstream, UpstreamLineageClass as UpstreamLineage, ) -from datahub.specific.custom_properties import CustomPropertiesPatchHelper -from datahub.specific.ownership import OwnershipPatchHelper -from datahub.specific.structured_properties import 
StructuredPropertiesPatchHelper +from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch +from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch +from datahub.specific.aspect_helpers.structured_properties import ( + HasStructuredPropertiesPatch, +) +from datahub.specific.aspect_helpers.tags import HasTagsPatch +from datahub.specific.aspect_helpers.terms import HasTermsPatch from datahub.utilities.urns.tag_urn import TagUrn from datahub.utilities.urns.urn import Urn @@ -48,7 +48,7 @@ def add_tag(self, tag: Tag) -> "FieldPatchHelper": self._parent._add_patch( self.aspect_name, "add", - path=f"/{self.aspect_field}/{self.field_path}/globalTags/tags/{tag.tag}", + path=(self.aspect_field, self.field_path, "globalTags", "tags", tag.tag), value=tag, ) return self @@ -59,7 +59,7 @@ def remove_tag(self, tag: Union[str, Urn]) -> "FieldPatchHelper": self._parent._add_patch( self.aspect_name, "remove", - path=f"/{self.aspect_field}/{self.field_path}/globalTags/tags/{tag}", + path=(self.aspect_field, self.field_path, "globalTags", "tags", tag), value={}, ) return self @@ -68,7 +68,13 @@ def add_term(self, term: Term) -> "FieldPatchHelper": self._parent._add_patch( self.aspect_name, "add", - path=f"/{self.aspect_field}/{self.field_path}/glossaryTerms/terms/{term.urn}", + path=( + self.aspect_field, + self.field_path, + "glossaryTerms", + "terms", + term.urn, + ), value=term, ) return self @@ -79,7 +85,7 @@ def remove_term(self, term: Union[str, Urn]) -> "FieldPatchHelper": self._parent._add_patch( self.aspect_name, "remove", - path=f"/{self.aspect_field}/{self.field_path}/glossaryTerms/terms/{term}", + path=(self.aspect_field, self.field_path, "glossaryTerms", "terms", term), value={}, ) return self @@ -88,7 +94,14 @@ def parent(self) -> _Parent: return self._parent -class DatasetPatchBuilder(MetadataPatchProposal): +class DatasetPatchBuilder( + HasOwnershipPatch, + HasCustomPropertiesPatch, + HasStructuredPropertiesPatch, + 
HasTagsPatch, + HasTermsPatch, + MetadataPatchProposal, +): def __init__( self, urn: str, @@ -98,34 +111,16 @@ def __init__( super().__init__( urn, system_metadata=system_metadata, audit_header=audit_header ) - self.custom_properties_patch_helper = CustomPropertiesPatchHelper( - self, DatasetProperties.ASPECT_NAME - ) - self.ownership_patch_helper = OwnershipPatchHelper(self) - self.structured_properties_patch_helper = StructuredPropertiesPatchHelper(self) - - def add_owner(self, owner: Owner) -> "DatasetPatchBuilder": - self.ownership_patch_helper.add_owner(owner) - return self - def remove_owner( - self, owner: str, owner_type: Optional[OwnershipTypeClass] = None - ) -> "DatasetPatchBuilder": - """ - param: owner_type is optional - """ - self.ownership_patch_helper.remove_owner(owner, owner_type) - return self - - def set_owners(self, owners: List[Owner]) -> "DatasetPatchBuilder": - self.ownership_patch_helper.set_owners(owners) - return self + @classmethod + def _custom_properties_location(cls) -> Tuple[str, PatchPath]: + return DatasetProperties.ASPECT_NAME, ("customProperties",) def add_upstream_lineage(self, upstream: Upstream) -> "DatasetPatchBuilder": self._add_patch( UpstreamLineage.ASPECT_NAME, "add", - path=f"/upstreams/{self.quote(upstream.dataset)}", + path=("upstreams", upstream.dataset), value=upstream, ) return self @@ -136,14 +131,14 @@ def remove_upstream_lineage( self._add_patch( UpstreamLineage.ASPECT_NAME, "remove", - path=f"/upstreams/{dataset}", + path=("upstreams", dataset), value={}, ) return self def set_upstream_lineages(self, upstreams: List[Upstream]) -> "DatasetPatchBuilder": self._add_patch( - UpstreamLineage.ASPECT_NAME, "add", path="/upstreams", value=upstreams + UpstreamLineage.ASPECT_NAME, "add", path=("upstreams",), value=upstreams ) return self @@ -159,7 +154,7 @@ def add_fine_grained_upstream_lineage( self._add_patch( UpstreamLineage.ASPECT_NAME, "add", - path=DatasetPatchBuilder.quote_fine_grained_path( + 
path=self._build_fine_grained_path( transform_op, downstream_urn, query_id, upstream_urn ), value={"confidenceScore": fine_grained_lineage.confidenceScore}, @@ -179,12 +174,15 @@ def get_fine_grained_key( return transform_op, downstream_urn, query_id @classmethod - def quote_fine_grained_path( + def _build_fine_grained_path( cls, transform_op: str, downstream_urn: str, query_id: str, upstream_urn: str - ) -> str: + ) -> PatchPath: return ( - f"/fineGrainedLineages/{cls.quote(transform_op)}/" - f"{cls.quote(downstream_urn)}/{cls.quote(query_id)}/{cls.quote(upstream_urn)}" + "fineGrainedLineages", + transform_op, + downstream_urn, + query_id, + upstream_urn, ) def remove_fine_grained_upstream_lineage( @@ -199,7 +197,7 @@ def remove_fine_grained_upstream_lineage( self._add_patch( UpstreamLineage.ASPECT_NAME, "remove", - path=DatasetPatchBuilder.quote_fine_grained_path( + path=self._build_fine_grained_path( transform_op, downstream_urn, query_id, upstream_urn ), value={}, @@ -212,37 +210,11 @@ def set_fine_grained_upstream_lineages( self._add_patch( UpstreamLineage.ASPECT_NAME, "add", - path="/fineGrainedLineages", + path=("fineGrainedLineages",), value=fine_grained_lineages, ) return self - def add_tag(self, tag: Tag) -> "DatasetPatchBuilder": - self._add_patch( - GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag - ) - return self - - def remove_tag(self, tag: Union[str, Urn]) -> "DatasetPatchBuilder": - if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): - tag = TagUrn.create_from_id(tag) - self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={}) - return self - - def add_term(self, term: Term) -> "DatasetPatchBuilder": - self._add_patch( - GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term - ) - return self - - def remove_term(self, term: Union[str, Urn]) -> "DatasetPatchBuilder": - if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): - term = "urn:li:glossaryTerm:" + term - 
self._add_patch( - GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={} - ) - return self - def for_field( self, field_path: str, editable: bool = True ) -> FieldPatchHelper["DatasetPatchBuilder"]: @@ -269,38 +241,11 @@ def set_description( else EditableDatasetProperties.ASPECT_NAME ), "add", - path="/description", + path=("description",), value=description, ) return self - def set_custom_properties( - self, custom_properties: Dict[str, str] - ) -> "DatasetPatchBuilder": - self._add_patch( - DatasetProperties.ASPECT_NAME, - "add", - path="/customProperties", - value=custom_properties, - ) - return self - - def add_custom_property(self, key: str, value: str) -> "DatasetPatchBuilder": - self.custom_properties_patch_helper.add_property(key, value) - return self - - def add_custom_properties( - self, custom_properties: Optional[Dict[str, str]] = None - ) -> "DatasetPatchBuilder": - if custom_properties is not None: - for key, value in custom_properties.items(): - self.custom_properties_patch_helper.add_property(key, value) - return self - - def remove_custom_property(self, key: str) -> "DatasetPatchBuilder": - self.custom_properties_patch_helper.remove_property(key) - return self - def set_display_name( self, display_name: Optional[str] = None ) -> "DatasetPatchBuilder": @@ -308,7 +253,7 @@ def set_display_name( self._add_patch( DatasetProperties.ASPECT_NAME, "add", - path="/name", + path=("name",), value=display_name, ) return self @@ -320,7 +265,7 @@ def set_qualified_name( self._add_patch( DatasetProperties.ASPECT_NAME, "add", - path="/qualifiedName", + path=("qualifiedName",), value=qualified_name, ) return self @@ -332,7 +277,7 @@ def set_created( self._add_patch( DatasetProperties.ASPECT_NAME, "add", - path="/created", + path=("created",), value=timestamp, ) return self @@ -344,37 +289,7 @@ def set_last_modified( self._add_patch( DatasetProperties.ASPECT_NAME, "add", - path="/lastModified", + path=("lastModified",), value=timestamp, ) return self 
- - def set_structured_property( - self, property_name: str, value: Union[str, float, List[Union[str, float]]] - ) -> "DatasetPatchBuilder": - """ - This is a helper method to set a structured property. - @param property_name: the name of the property (either bare or urn form) - @param value: the value of the property (for multi-valued properties, this can be a list) - """ - self.structured_properties_patch_helper.set_property(property_name, value) - return self - - def add_structured_property( - self, property_name: str, value: Union[str, float] - ) -> "DatasetPatchBuilder": - """ - This is a helper method to add a structured property. - @param property_name: the name of the property (either bare or urn form) - @param value: the value of the property (for multi-valued properties, this value will be appended to the list) - """ - self.structured_properties_patch_helper.add_property(property_name, value) - return self - - def remove_structured_property(self, property_name: str) -> "DatasetPatchBuilder": - """ - This is a helper method to remove a structured property. 
- @param property_name: the name of the property (either bare or urn form) - """ - self.structured_properties_patch_helper.remove_property(property_name) - return self diff --git a/metadata-ingestion/src/datahub/specific/form.py b/metadata-ingestion/src/datahub/specific/form.py index 78182c202f716..281b3cac99b2c 100644 --- a/metadata-ingestion/src/datahub/specific/form.py +++ b/metadata-ingestion/src/datahub/specific/form.py @@ -5,15 +5,13 @@ FormInfoClass as FormInfo, FormPromptClass, KafkaAuditHeaderClass, - OwnerClass as Owner, - OwnershipTypeClass, SystemMetadataClass, ) -from datahub.specific.ownership import OwnershipPatchHelper +from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch from datahub.utilities.urns.urn import Urn -class FormPatchBuilder(MetadataPatchProposal): +class FormPatchBuilder(HasOwnershipPatch, MetadataPatchProposal): def __init__( self, urn: str, @@ -23,31 +21,13 @@ def __init__( super().__init__( urn, system_metadata=system_metadata, audit_header=audit_header ) - self.ownership_patch_helper = OwnershipPatchHelper(self) - - def add_owner(self, owner: Owner) -> "FormPatchBuilder": - self.ownership_patch_helper.add_owner(owner) - return self - - def remove_owner( - self, owner: str, owner_type: Optional[OwnershipTypeClass] = None - ) -> "FormPatchBuilder": - """ - param: owner_type is optional - """ - self.ownership_patch_helper.remove_owner(owner, owner_type) - return self - - def set_owners(self, owners: List[Owner]) -> "FormPatchBuilder": - self.ownership_patch_helper.set_owners(owners) - return self def set_name(self, name: Optional[str] = None) -> "FormPatchBuilder": if name is not None: self._add_patch( FormInfo.ASPECT_NAME, "add", - path="/name", + path=("name",), value=name, ) return self @@ -57,7 +37,7 @@ def set_description(self, description: Optional[str] = None) -> "FormPatchBuilde self._add_patch( FormInfo.ASPECT_NAME, "add", - path="/description", + path=("description",), value=description, ) return self @@ 
-67,7 +47,7 @@ def set_type(self, type: Optional[str] = None) -> "FormPatchBuilder": self._add_patch( FormInfo.ASPECT_NAME, "add", - path="/type", + path=("type",), value=type, ) return self @@ -76,7 +56,7 @@ def add_prompt(self, prompt: FormPromptClass) -> "FormPatchBuilder": self._add_patch( FormInfo.ASPECT_NAME, "add", - path=f"/prompts/{self.quote(prompt.id)}", + path=("prompts", prompt.id), value=prompt, ) return self @@ -90,7 +70,7 @@ def remove_prompt(self, prompt_id: str) -> "FormPatchBuilder": self._add_patch( FormInfo.ASPECT_NAME, "remove", - path=f"/prompts/{self.quote(prompt_id)}", + path=("prompts", prompt_id), value=prompt_id, ) return self @@ -104,7 +84,7 @@ def set_ownership_form(self, is_ownership: bool) -> "FormPatchBuilder": self._add_patch( FormInfo.ASPECT_NAME, "add", - path="/actors/owners", + path=("actors", "owners"), value=is_ownership, ) return self @@ -113,7 +93,7 @@ def add_assigned_user(self, user_urn: Union[str, Urn]) -> "FormPatchBuilder": self._add_patch( FormInfo.ASPECT_NAME, "add", - path=f"/actors/users/{self.quote(str(user_urn))}", + path=("actors", "users", user_urn), value=user_urn, ) return self @@ -122,7 +102,7 @@ def remove_assigned_user(self, user_urn: Union[str, Urn]) -> "FormPatchBuilder": self._add_patch( FormInfo.ASPECT_NAME, "remove", - path=f"/actors/users/{self.quote(str(user_urn))}", + path=("actors", "users", user_urn), value=user_urn, ) return self @@ -131,7 +111,7 @@ def add_assigned_group(self, group_urn: Union[str, Urn]) -> "FormPatchBuilder": self._add_patch( FormInfo.ASPECT_NAME, "add", - path=f"/actors/groups/{self.quote(str(group_urn))}", + path=("actors", "groups", group_urn), value=group_urn, ) return self @@ -140,7 +120,7 @@ def remove_assigned_group(self, group_urn: Union[str, Urn]) -> "FormPatchBuilder self._add_patch( FormInfo.ASPECT_NAME, "remove", - path=f"/actors/groups/{self.quote(str(group_urn))}", + path=("actors", "groups", group_urn), value=group_urn, ) return self diff --git 
a/metadata-ingestion/src/datahub/specific/ownership.py b/metadata-ingestion/src/datahub/specific/ownership.py deleted file mode 100644 index b377a8814f38a..0000000000000 --- a/metadata-ingestion/src/datahub/specific/ownership.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Generic, List, Optional, TypeVar - -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal -from datahub.metadata.schema_classes import ( - OwnerClass, - OwnershipClass, - OwnershipTypeClass, -) - -_Parent = TypeVar("_Parent", bound=MetadataPatchProposal) - - -class OwnershipPatchHelper(Generic[_Parent]): - def __init__(self, parent: _Parent) -> None: - self._parent = parent - self.aspect_field = OwnershipClass.ASPECT_NAME - - def parent(self) -> _Parent: - return self._parent - - def add_owner(self, owner: OwnerClass) -> "OwnershipPatchHelper": - self._parent._add_patch( - OwnershipClass.ASPECT_NAME, - "add", - path=f"/owners/{owner.owner}/{owner.type}", - value=owner, - ) - return self - - def remove_owner( - self, owner: str, owner_type: Optional[OwnershipTypeClass] = None - ) -> "OwnershipPatchHelper": - """ - param: owner_type is optional - """ - self._parent._add_patch( - OwnershipClass.ASPECT_NAME, - "remove", - path=f"/owners/{owner}" + (f"/{owner_type}" if owner_type else ""), - value=owner, - ) - return self - - def set_owners(self, owners: List[OwnerClass]) -> "OwnershipPatchHelper": - self._parent._add_patch( - OwnershipClass.ASPECT_NAME, "add", path="/owners", value=owners - ) - return self diff --git a/metadata-ingestion/src/datahub/specific/structured_properties.py b/metadata-ingestion/src/datahub/specific/structured_properties.py deleted file mode 100644 index 17d896249c474..0000000000000 --- a/metadata-ingestion/src/datahub/specific/structured_properties.py +++ /dev/null @@ -1,53 +0,0 @@ -from typing import Generic, List, TypeVar, Union - -from datahub.emitter.mcp_patch_builder import MetadataPatchProposal -from datahub.metadata.schema_classes import 
StructuredPropertyValueAssignmentClass -from datahub.utilities.urns.structured_properties_urn import ( - make_structured_property_urn, -) - -_Parent = TypeVar("_Parent", bound=MetadataPatchProposal) - - -class StructuredPropertiesPatchHelper(Generic[_Parent]): - def __init__( - self, - parent: _Parent, - aspect_name: str = "structuredProperties", - ) -> None: - self.aspect_name = aspect_name - self._parent = parent - self.aspect_field = "properties" - - def parent(self) -> _Parent: - return self._parent - - def set_property( - self, key: str, value: Union[str, float, List[Union[str, float]]] - ) -> "StructuredPropertiesPatchHelper": - self.remove_property(key) - self.add_property(key, value) - return self - - def remove_property(self, key: str) -> "StructuredPropertiesPatchHelper": - self._parent._add_patch( - self.aspect_name, - "remove", - path=(self.aspect_field, make_structured_property_urn(key)), - value={}, - ) - return self - - def add_property( - self, key: str, value: Union[str, float, List[Union[str, float]]] - ) -> "StructuredPropertiesPatchHelper": - self._parent._add_patch( - self.aspect_name, - "add", - path=(self.aspect_field, make_structured_property_urn(key)), - value=StructuredPropertyValueAssignmentClass( - propertyUrn=make_structured_property_urn(key), - values=value if isinstance(value, list) else [value], - ), - ) - return self diff --git a/metadata-ingestion/src/datahub/specific/structured_property.py b/metadata-ingestion/src/datahub/specific/structured_property.py index 50f1f079c2aa7..bcae174ed3c4f 100644 --- a/metadata-ingestion/src/datahub/specific/structured_property.py +++ b/metadata-ingestion/src/datahub/specific/structured_property.py @@ -29,7 +29,7 @@ def set_qualified_name( self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path="/qualifiedName", + path=("qualifiedName",), value=qualified_name, ) return self @@ -41,7 +41,7 @@ def set_display_name( self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - 
path="/displayName", + path=("displayName",), value=display_name, ) return self @@ -53,7 +53,7 @@ def set_value_type( self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path="/valueType", + path=("valueType",), value=value_type, ) return self @@ -66,7 +66,7 @@ def set_type_qualifier( self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path="/typeQualifier", + path=("typeQualifier",), value=type_qualifier, ) return self @@ -78,7 +78,7 @@ def add_allowed_value( self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path=f"/allowedValues/{str(allowed_value.get('value'))}", + path=("allowedValues", str(allowed_value.get("value"))), value=allowed_value, ) return self @@ -87,7 +87,7 @@ def set_cardinality(self, cardinality: str) -> "StructuredPropertyPatchBuilder": self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path="/cardinality", + path=("cardinality",), value=cardinality, ) return self @@ -98,7 +98,7 @@ def add_entity_type( self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path=f"/entityTypes/{self.quote(str(entity_type))}", + path=("entityTypes", str(entity_type)), value=entity_type, ) return self @@ -110,7 +110,7 @@ def set_description( self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path="/description", + path=("description",), value=description, ) return self @@ -119,7 +119,7 @@ def set_immutable(self, immutable: bool) -> "StructuredPropertyPatchBuilder": self._add_patch( StructuredPropertyDefinition.ASPECT_NAME, "add", - path="/immutable", + path=("immutable",), value=immutable, ) return self diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index f81eb291e89e1..25b63ffac45f9 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -165,6 +165,7 @@ class 
KnownQueryLineageInfo: timestamp: Optional[datetime] = None session_id: Optional[str] = None query_type: QueryType = QueryType.UNKNOWN + query_id: Optional[str] = None @dataclasses.dataclass @@ -198,7 +199,7 @@ def id(self) -> str: @dataclasses.dataclass class PreparsedQuery: - # If not provided, we will generate one using the fast fingerprint generator. + # If not provided, we will generate one using the fingerprint generator. query_id: Optional[QueryId] query_text: str @@ -618,12 +619,13 @@ def add_known_query_lineage( self.report.num_known_query_lineage += 1 # Generate a fingerprint for the query. - with self.report.sql_fingerprinting_timer: - query_fingerprint = get_query_fingerprint( - known_query_lineage.query_text, - platform=self.platform.platform_name, - fast=True, - ) + query_fingerprint = known_query_lineage.query_id + if not query_fingerprint: + with self.report.sql_fingerprinting_timer: + query_fingerprint = get_query_fingerprint( + known_query_lineage.query_text, + platform=self.platform.platform_name, + ) formatted_query = self._maybe_format_query(known_query_lineage.query_text) # Register the query. @@ -848,7 +850,6 @@ def add_preparsed_query( query_fingerprint = get_query_fingerprint( parsed.query_text, platform=self.platform.platform_name, - fast=True, ) # Format the query. diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index b8c27666d7f53..fb028605c35b7 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -243,7 +243,7 @@ def __post_init__(self) -> None: # This was added in 3.24.0 from 2018-06-04. 
# See https://www.sqlite.org/lang_conflict.html if OVERRIDE_SQLITE_VERSION_REQUIREMENT: - self.use_sqlite_on_conflict = False + self._use_sqlite_on_conflict = False else: raise RuntimeError("SQLite version 3.24.0 or later is required") diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 9488683d6d8ca..fc1b1ed58244c 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -57,7 +57,7 @@ def __exit__( self.finish() return None - def elapsed_seconds(self) -> float: + def elapsed_seconds(self, digits: int = 4) -> float: """ Returns the elapsed time in seconds. """ @@ -65,11 +65,18 @@ def elapsed_seconds(self) -> float: return self._past_active_time if self.end_time is None: - return (time.perf_counter() - self.start_time) + (self._past_active_time) + elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time) else: - return (self.end_time - self.start_time) + self._past_active_time + elapsed = (self.end_time - self.start_time) + self._past_active_time + + return round(elapsed, digits) def assert_timer_is_running(self) -> None: + if not self.is_running(): + self._error_state = True + logger.warning("Did you forget to start the timer ?") + + def is_running(self) -> bool: """ Returns true if timer is in running state. Timer is in NOT in running state if @@ -77,9 +84,7 @@ def assert_timer_is_running(self) -> None: 2. it is in paused state. 3. it had been started and finished in the past but not started again. 
""" - if self.start_time is None or self.paused or self.end_time: - self._error_state = True - logger.warning("Did you forget to start the timer ?") + return self.start_time is not None and not self.paused and self.end_time is None def __repr__(self) -> str: return repr(self.as_obj()) diff --git a/metadata-ingestion/tests/integration/athena/athena_mce_golden.json b/metadata-ingestion/tests/integration/athena/athena_mce_golden.json new file mode 100644 index 0000000000000..1b3fdb0bdb253 --- /dev/null +++ b/metadata-ingestion/tests/integration/athena/athena_mce_golden.json @@ -0,0 +1,1362 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "athena", + "env": "PROD", + "database": "test_schema" + }, + "name": "test_schema", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:athena" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "key": "value", + "table_type": "EXTERNAL_TABLE", + "is_view": "True", + "view_definition": "CREATE VIEW \"test_schema\".test_view_1 AS\nSELECT *\nFROM\n \"test_schema\".\"test_table\"" + }, + "name": "test_table", + "description": "Test table description", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test_schema.test_table", + "platform": "urn:li:dataPlatform:athena", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=string].employee_id", + 
"nullable": false, + "description": "Unique identifier for the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}" + }, + { + "fieldPath": "[version=2.0].[type=long].annual_salary", + "nullable": true, + "description": "Annual salary of the employee in USD", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "BIGINT", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"BIGINT\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].employee_name", + "nullable": false, + "description": "Full name of the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history", + "nullable": true, + "description": "Job history map: year to details (company, role)", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": "string", + "valueType": "record" + } + } + }, + "nativeDataType": "MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=int].year", 
+ "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"INTEGER\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].company", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].role", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=long].department_budgets", + "nullable": true, + "description": "Map of department names to their respective budgets", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": "string", + "valueType": "long" + } + } + }, + "nativeDataType": "MapType(String(), BIGINT())", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"MapType(String(), BIGINT())\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].skills", + "nullable": true, + "description": "List of skills possessed by the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "string" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + 
"isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "urn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "key": "value", + "table_type": "EXTERNAL_TABLE", + "is_view": "True", + "view_definition": "CREATE VIEW 
\"test_schema\".test_view_2 AS\nSELECT employee_id, employee_name, skills\nFROM\n \"test_schema\".\"test_view_1\"" + }, + "name": "test_view_1", + "description": "Test table description", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test_schema.test_view_1", + "platform": "urn:li:dataPlatform:athena", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=string].employee_id", + "nullable": false, + "description": "Unique identifier for the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}" + }, + { + "fieldPath": "[version=2.0].[type=long].annual_salary", + "nullable": true, + "description": "Annual salary of the employee in USD", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "BIGINT", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"BIGINT\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].employee_name", + "nullable": false, + "description": "Full name of the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history", + "nullable": true, + "description": "Job 
history map: year to details (company, role)", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": "string", + "valueType": "record" + } + } + }, + "nativeDataType": "MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=int].year", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"INTEGER\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].company", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].role", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=long].department_budgets", + "nullable": true, + "description": "Map of department names to their respective budgets", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": "string", + 
"valueType": "long" + } + } + }, + "nativeDataType": "MapType(String(), BIGINT())", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"MapType(String(), BIGINT())\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].skills", + "nullable": true, + "description": "List of skills possessed by the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "string" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW \"test_schema\".test_view_1 AS\nSELECT *\nFROM\n \"test_schema\".\"test_table\"", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": 
"browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "urn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "key": "value", + "table_type": "EXTERNAL_TABLE", + "is_view": "True", + "view_definition": "CREATE VIEW \"test_schema\".test_view_2 AS\nSELECT employee_id, employee_name, skills\nFROM\n \"test_schema\".\"test_view_1\"" + }, + "name": "test_view_2", + "description": "Test table description", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test_schema.test_view_2", + "platform": "urn:li:dataPlatform:athena", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=string].employee_id", + "nullable": false, + "description": "Unique identifier for the employee", + "type": { + "type": { 
+ "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}" + }, + { + "fieldPath": "[version=2.0].[type=long].annual_salary", + "nullable": true, + "description": "Annual salary of the employee in USD", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "BIGINT", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"BIGINT\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].employee_name", + "nullable": false, + "description": "Full name of the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": false}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history", + "nullable": true, + "description": "Job history map: year to details (company, role)", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": "string", + "valueType": "record" + } + } + }, + "nativeDataType": "MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String()))\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=int].year", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + 
} + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"INTEGER\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].company", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=struct].job_history.[type=string].role", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=long].department_budgets", + "nullable": true, + "description": "Map of department names to their respective budgets", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": "string", + "valueType": "long" + } + } + }, + "nativeDataType": "MapType(String(), BIGINT())", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"MapType(String(), BIGINT())\", \"key_type\": {\"type\": \"string\", \"native_data_type\": \"VARCHAR\", \"_nullable\": true}, \"key_native_data_type\": \"VARCHAR\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].skills", + "nullable": true, + "description": "List of skills possessed by the employee", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "string" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "isPartitioningKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + 
] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW \"test_schema\".test_view_2 AS\nSELECT employee_id, employee_name, skills\nFROM\n \"test_schema\".\"test_view_1\"", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67", + "urn": "urn:li:container:28d9272f625e7a366dfdc276b6ce4a67" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1671098400000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD)", + "type": "COPY" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),employee_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),annual_salary)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),annual_salary)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),employee_name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),job_history)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),job_history)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),department_budgets)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),department_budgets)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/test_table,PROD),skills)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),skills)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1671098400000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),annual_salary)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),annual_salary)" + ], + "confidenceScore": 0.9, + "query": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),job_history)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),job_history)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),department_budgets)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),department_budgets)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),skills)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)" + ], + "confidenceScore": 0.9, + "query": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW \"test_schema\".test_view_1 AS\nSELECT\n *\nFROM \"test_schema\".\"test_table\"", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1671098400000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),annual_salary)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),department_budgets)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),employee_name)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),job_history)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_table,PROD),skills)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),annual_salary)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),job_history)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),department_budgets)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:athena" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1671098400000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_id)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_name)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),skills)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29", + 
"changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW \"test_schema\".test_view_2 AS\nSELECT\n employee_id,\n employee_name,\n skills\nFROM \"test_schema\".\"test_view_1\"", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1671098400000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),employee_name)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_1,PROD),skills)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),employee_name)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:athena,test_schema.test_view_2,PROD),skills)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:athena" + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aathena%2Ctest_schema.test_view_2%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1671098400000, + "runId": "athena-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/athena/test_athena_source.py b/metadata-ingestion/tests/integration/athena/test_athena_source.py new file mode 100644 index 0000000000000..56e7cbe6b3e2d --- /dev/null +++ b/metadata-ingestion/tests/integration/athena/test_athena_source.py @@ -0,0 +1,163 @@ +from unittest.mock import MagicMock, patch + +from freezegun import freeze_time +from sqlalchemy import ARRAY, BIGINT, INTEGER, String +from sqlalchemy_bigquery import STRUCT + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.sql.athena import AthenaSource +from datahub.utilities.sqlalchemy_type_converter import MapType +from tests.test_helpers import ( # Ensure mce_helpers is available for validation. 
# Timestamp the test clock is frozen at (passed to ``freeze_time`` below).
FROZEN_TIME = "2022-12-15 10:00:00"


@freeze_time(FROZEN_TIME)
def test_athena_source_ingestion(pytestconfig, tmp_path):
    """Run an Athena ingestion pipeline on mocked metadata and check the golden file.

    ``AthenaSource.get_inspectors`` and ``AthenaSource.get_table_properties``
    are patched. The mocked inspector reports one schema, one table, two views
    and a fixed list of six columns. The pipeline writes through the ``file``
    sink into ``tmp_path`` and that output is compared with
    ``athena_mce_golden.json`` under ``tests/integration/athena``.
    """
    resources_dir = pytestconfig.rootpath / "tests/integration/athena"
    output_path = f"{tmp_path}/{'athena_mce_output.json'}"
    golden_path = f"{resources_dir}/{'athena_mce_golden.json'}"

    # SQL returned by the mocked inspector, keyed by view name.
    view_definitions = {
        "test_view_1": (
            'CREATE VIEW "test_schema".test_view_1 AS\n'
            "SELECT *\n"
            "FROM\n"
            ' "test_schema"."test_table"'
        ),
        "test_view_2": (
            'CREATE VIEW "test_schema".test_view_2 AS\n'
            "SELECT employee_id, employee_name, skills\n"
            "FROM\n"
            ' "test_schema"."test_view_1"'
        ),
    }

    def mock_get_view_definition(view_name, schema):
        # Unknown views yield an empty definition.
        return view_definitions.get(view_name, "")

    def column(name, column_type, nullable, comment):
        # One entry of the mocked ``get_columns`` result; only these four
        # fields differ between the columns used in this test.
        return {
            "name": name,
            "type": column_type,
            "nullable": nullable,
            "default": None,
            "autoincrement": False,
            "comment": comment,
            "dialect_options": {"awsathena_partition": None},
        }

    columns = [
        column("employee_id", String(), False, "Unique identifier for the employee"),
        column("annual_salary", BIGINT(), True, "Annual salary of the employee in USD"),
        column("employee_name", String(), False, "Full name of the employee"),
        column(
            "job_history",
            MapType(String(), STRUCT(year=INTEGER(), company=String(), role=String())),
            True,
            "Job history map: year to details (company, role)",
        ),
        column(
            "department_budgets",
            MapType(String(), BIGINT()),
            True,
            "Map of department names to their respective budgets",
        ),
        column(
            "skills",
            ARRAY(String()),
            True,
            "List of skills possessed by the employee",
        ),
    ]

    # Engine mock with an empty database name on its URL.
    engine = MagicMock()
    engine.url.database = ""

    # Inspector mock exposing the schema, table, view and column metadata.
    inspector = MagicMock()
    inspector.engine = engine
    inspector.get_schema_names.return_value = ["test_schema"]
    inspector.get_table_names.return_value = ["test_table"]
    inspector.get_view_names.return_value = ["test_view_1", "test_view_2"]
    inspector.get_view_definition.side_effect = mock_get_view_definition
    inspector.get_columns.return_value = columns

    # Value handed back by the patched ``get_table_properties``.
    table_properties = (
        "Test table description",
        {"key": "value", "table_type": "EXTERNAL_TABLE"},
        make_s3_urn("s3://test-bucket/test_table", "PROD"),
    )

    # Pipeline recipe: Athena source into a file sink under ``tmp_path``.
    source_config = {
        "aws_region": "us-east-1",
        "work_group": "primary",
        "query_result_location": "s3://athena-query-results/",
        "catalog_name": "awsdatacatalog",
        "include_views": True,
        "include_tables": True,
        "profiling": {
            "enabled": False,
        },
    }
    config_dict = {
        "run_id": "athena-test",
        "source": {
            "type": "athena",
            "config": source_config,
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": output_path,
            },
        },
    }

    with patch.object(AthenaSource, "get_inspectors") as mock_get_inspectors:
        with patch.object(
            AthenaSource, "get_table_properties"
        ) as mock_get_table_properties:
            mock_get_inspectors.return_value = [inspector]
            mock_get_table_properties.return_value = table_properties

            # Run the pipeline while both patches are active.
            pipeline = Pipeline.create(config_dict)
            pipeline.run()
            pipeline.raise_from_status()

            # Compare the emitted file with the checked-in golden file.
            mce_helpers.check_golden_file(
                pytestconfig=pytestconfig,
                output_path=output_path,
                golden_path=golden_path,
            )
+ "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1", + "dataset_id": "bigquery-dataset-1", + "location": "US" + }, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m4!1m3!3m2!1sproject-id-1!2sbigquery-dataset-1", + "name": "bigquery-dataset-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + 
"entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": 
"urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1", + "name": "table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + 
"systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + 
"systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.view-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3sview-1", + "name": "view-1", + "qualifiedName": 
"project-id-1.bigquery-dataset-1.view-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view `bigquery-dataset-1.view-1` as select email from `bigquery-dataset-1.table-1`", + "viewLanguage": "SQL" + } + }, + 
"systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": 
"INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3ssnapshot-table-1", + "name": "snapshot-table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": 
"dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Bigquery Table Snapshot" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "COPY" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),age)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),email)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `bigquery-dataset-1.view-1` AS\nSELECT\n email\nFROM `bigquery-dataset-1.table-1`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + } + ] + } 
+ }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Test Policy Tag", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Test Policy Tag" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-0mn4n3", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_2.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_2.json new file mode 100644 index 0000000000000..26abc09569ccf --- /dev/null +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_lineage_golden_2.json @@ -0,0 +1,1064 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + 
"platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1" + }, + "name": "project-id-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1", + "dataset_id": "bigquery-dataset-1", + "location": "US" + }, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m4!1m3!3m2!1sproject-id-1!2sbigquery-dataset-1", + "name": "bigquery-dataset-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + 
} + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1", + "name": "table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.view-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + 
"tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3sview-1", + "name": "view-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.view-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view `bigquery-dataset-1.view-1` as select email from `bigquery-dataset-1.table-1`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": 
"bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3ssnapshot-table-1", + "name": "snapshot-table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Bigquery Table Snapshot" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + 
"runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "COPY" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),age)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),email)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": 
"urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW 
`bigquery-dataset-1.view-1` AS\nSELECT\n email\nFROM `bigquery-dataset-1.table-1`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetUsageStatistics", + "aspect": { + "json": { + "timestampMillis": 1643760000000, + "eventGranularity": { + "unit": "DAY", + "multiple": 1 + }, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "uniqueUserCount": 0, + "totalSqlQueries": 0, + "topSqlQueries": [], + "userCounts": [], + "fieldCounts": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetUsageStatistics", + "aspect": { + "json": { + "timestampMillis": 1643760000000, + "eventGranularity": { + "unit": "DAY", + "multiple": 1 + }, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "uniqueUserCount": 0, + "totalSqlQueries": 0, + "topSqlQueries": [], + "userCounts": [], + "fieldCounts": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetUsageStatistics", + "aspect": { + "json": { + "timestampMillis": 1643760000000, + "eventGranularity": { + "unit": "DAY", + "multiple": 1 + }, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "uniqueUserCount": 0, + "totalSqlQueries": 0, + "topSqlQueries": [], + "userCounts": [], + "fieldCounts": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Test Policy Tag", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Test Policy Tag" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-k4o1z9", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 1f14688636161..2dd320041a113 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -4,6 +4,7 @@ from typing import Any, Dict, Optional from unittest.mock import MagicMock, patch +import pytest from freezegun import freeze_time from google.cloud.bigquery.table import TableListItem @@ -577,3 +578,147 @@ def test_bigquery_queries_v2_lineage_usage_ingest( output_path=mcp_output_path, golden_path=mcp_golden_path, ) + + +@freeze_time(FROZEN_TIME) +@patch.object(BigQuerySchemaApi, "get_snapshots_for_dataset") +@patch.object(BigQuerySchemaApi, "get_views_for_dataset") +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQuerySchemaGenerator, "get_core_table_details") +@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") +@patch.object(BigQuerySchemaApi, "get_columns_for_dataset") +@patch.object(BigQueryDataReader, "get_sample_data_for_table") +@patch("google.cloud.bigquery.Client") 
+@patch("google.cloud.datacatalog_v1.PolicyTagManagerClient") +@patch("google.cloud.resourcemanager_v3.ProjectsClient") +@pytest.mark.parametrize( + "use_queries_v2, include_table_lineage, include_usage_statistics, golden_file", + [ + (True, False, False, "bigquery_mcp_lineage_golden_1.json"), + (True, True, False, "bigquery_mcp_lineage_golden_1.json"), + (False, False, True, "bigquery_mcp_lineage_golden_2.json"), + (False, True, True, "bigquery_mcp_lineage_golden_2.json"), + ], +) +def test_bigquery_lineage_v2_ingest_view_snapshots( + client, + policy_tag_manager_client, + projects_client, + get_sample_data_for_table, + get_columns_for_dataset, + get_datasets_for_project_id, + get_core_table_details, + get_tables_for_dataset, + get_views_for_dataset, + get_snapshots_for_dataset, + pytestconfig, + tmp_path, + use_queries_v2, + include_table_lineage, + include_usage_statistics, + golden_file, +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" + mcp_golden_path = f"{test_resources_dir}/{golden_file}" + mcp_output_path = "{}/{}_output.json".format(tmp_path, golden_file) + + dataset_name = "bigquery-dataset-1" + get_datasets_for_project_id.return_value = [ + BigqueryDataset(name=dataset_name, location="US") + ] + + table_list_item = TableListItem( + {"tableReference": {"projectId": "", "datasetId": "", "tableId": ""}} + ) + table_name = "table-1" + snapshot_table_name = "snapshot-table-1" + view_name = "view-1" + get_core_table_details.return_value = {table_name: table_list_item} + columns = [ + BigqueryColumn( + name="age", + ordinal_position=1, + is_nullable=False, + field_path="col_1", + data_type="INT", + comment="comment", + is_partition_column=False, + cluster_column_position=None, + policy_tags=["Test Policy Tag"], + ), + BigqueryColumn( + name="email", + ordinal_position=1, + is_nullable=False, + field_path="col_2", + data_type="STRING", + comment="comment", + is_partition_column=False, + cluster_column_position=None, + ), + ] + 
+ get_columns_for_dataset.return_value = { + table_name: columns, + snapshot_table_name: columns, + view_name: columns, + } + get_sample_data_for_table.return_value = { + "age": [random.randint(1, 80) for i in range(20)], + "email": [random_email() for i in range(20)], + } + + bigquery_table = BigqueryTable( + name=table_name, + comment=None, + created=None, + last_altered=None, + size_in_bytes=None, + rows_count=None, + ) + get_tables_for_dataset.return_value = iter([bigquery_table]) + + bigquery_view = BigqueryView( + name=view_name, + comment=None, + created=None, + view_definition=f"create view `{dataset_name}.view-1` as select email from `{dataset_name}.table-1`", + last_altered=None, + size_in_bytes=None, + rows_count=None, + materialized=False, + ) + + get_views_for_dataset.return_value = iter([bigquery_view]) + snapshot_table = BigqueryTableSnapshot( + name=snapshot_table_name, + comment=None, + created=None, + last_altered=None, + size_in_bytes=None, + rows_count=None, + base_table_identifier=BigqueryTableIdentifier( + project_id="project-id-1", + dataset="bigquery-dataset-1", + table="table-1", + ), + ) + get_snapshots_for_dataset.return_value = iter([snapshot_table]) + + pipeline_config_dict: Dict[str, Any] = recipe( + mcp_output_path=mcp_output_path, + source_config_override={ + "use_queries_v2": use_queries_v2, + "include_table_lineage": include_table_lineage, + "include_usage_statistics": include_usage_statistics, + "classification": {"enabled": False}, + }, + ) + + run_and_get_pipeline(pipeline_config_dict) + + mce_helpers.check_golden_file( + pytestconfig, + output_path=mcp_output_path, + golden_path=mcp_golden_path, + ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 71e5ad10c2fc5..d7868038a40aa 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ 
b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1,6 +1,6 @@ import json import pathlib -from typing import Any, Dict, List, cast +from typing import Any, Dict, List, Union, cast from unittest import mock import pytest @@ -13,10 +13,15 @@ GroupItem, ProjectItem, SiteItem, + UserItem, ViewItem, WorkbookItem, ) from tableauserverclient.models.reference_item import ResourceReference +from tableauserverclient.server.endpoint.exceptions import ( + NonXMLResponseError, + TableauError, +) from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -270,7 +275,7 @@ def side_effect_site_get_by_id(id, *arg, **kwargs): def mock_sdk_client( - side_effect_query_metadata_response: List[dict], + side_effect_query_metadata_response: List[Union[dict, TableauError]], datasources_side_effect: List[dict], sign_out_side_effect: List[dict], ) -> mock.MagicMock: @@ -1312,6 +1317,61 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): ) +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_retry_on_error(pytestconfig, tmp_path, mock_datahub_graph): + with mock.patch( + "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", + mock_datahub_graph, + ) as mock_checkpoint: + mock_checkpoint.return_value = mock_datahub_graph + + with mock.patch("datahub.ingestion.source.tableau.tableau.Server") as mock_sdk: + mock_client = mock_sdk_client( + side_effect_query_metadata_response=[ + NonXMLResponseError( + """{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}""" + ), + *mock_data(), + ], + sign_out_side_effect=[{}], + datasources_side_effect=[{}], + ) + mock_client.users = mock.Mock() + mock_client.users.get_by_id.side_effect = [ + UserItem( + name="name", site_role=UserItem.Roles.SiteAdministratorExplorer + ) + ] + mock_sdk.return_value = mock_client + + reporter = 
TableauSourceReport() + tableau_source = TableauSiteSource( + platform="tableau", + config=mock.MagicMock(), + ctx=mock.MagicMock(), + site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"), + server=mock_sdk.return_value, + report=reporter, + ) + + tableau_source.get_connection_object_page( + query=mock.MagicMock(), + connection_type=mock.MagicMock(), + query_filter=mock.MagicMock(), + current_cursor=None, + retries_remaining=1, + fetch_size=10, + ) + + assert reporter.num_actual_tableau_metadata_queries == 2 + assert reporter.tableau_server_error_stats + assert reporter.tableau_server_error_stats["NonXMLResponseError"] == 1 + + assert reporter.warnings == [] + assert reporter.failures == [] + + @freeze_time(FROZEN_TIME) @pytest.mark.parametrize( "extract_project_hierarchy, allowed_projects", diff --git a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py index 9cb80ff02657b..24460f3829806 100644 --- a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py +++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py @@ -26,14 +26,14 @@ def run_test(): report = BigQueryV2Report() - report.set_ingestion_stage("All", "Seed Data Generation") - seed_metadata = generate_data( - num_containers=2000, - num_tables=20000, - num_views=2000, - time_range=timedelta(days=7), - ) - all_tables = seed_metadata.all_tables + with report.new_stage("All: Seed Data Generation"): + seed_metadata = generate_data( + num_containers=2000, + num_tables=20000, + num_views=2000, + time_range=timedelta(days=7), + ) + all_tables = seed_metadata.all_tables config = BigQueryV2Config( start_time=seed_metadata.start_time, @@ -51,42 +51,45 @@ def run_test(): schema_resolver=SchemaResolver(platform="bigquery"), identifiers=BigQueryIdentifierBuilder(config, report), ) - report.set_ingestion_stage("All", "Event Generation") - - num_projects = 100 - projects = [f"project-{i}" 
for i in range(num_projects)] - table_to_project = {table.name: random.choice(projects) for table in all_tables} - table_refs = {str(ref_from_table(table, table_to_project)) for table in all_tables} + with report.new_stage("All: Event Generation"): + num_projects = 100 + projects = [f"project-{i}" for i in range(num_projects)] + table_to_project = {table.name: random.choice(projects) for table in all_tables} + table_refs = { + str(ref_from_table(table, table_to_project)) for table in all_tables + } - queries = list( - generate_queries( - seed_metadata, - num_selects=240_000, - num_operations=800_000, - num_unique_queries=50_000, - num_users=2000, - query_length=NormalDistribution(2000, 500), + queries = list( + generate_queries( + seed_metadata, + num_selects=240_000, + num_operations=800_000, + num_unique_queries=50_000, + num_users=2000, + query_length=NormalDistribution(2000, 500), + ) ) - ) - queries.sort(key=lambda q: q.timestamp) - events = list(generate_events(queries, projects, table_to_project, config=config)) - print(f"Events generated: {len(events)}") - pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss - print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}") + queries.sort(key=lambda q: q.timestamp) + events = list( + generate_events(queries, projects, table_to_project, config=config) + ) + print(f"Events generated: {len(events)}") + pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss + print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}") - report.set_ingestion_stage("All", "Event Ingestion") - with PerfTimer() as timer: - workunits = usage_extractor._get_workunits_internal(events, table_refs) - num_workunits, peak_memory_usage = workunit_sink(workunits) - report.set_ingestion_stage("All", "Done") - print(f"Workunits Generated: {num_workunits}") - print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + with report.new_stage("All: Event Ingestion"): + with PerfTimer() as timer: + workunits = 
usage_extractor._get_workunits_internal(events, table_refs) + num_workunits, peak_memory_usage = workunit_sink(workunits) + with report.new_stage("All: Done"): + print(f"Workunits Generated: {num_workunits}") + print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") - print( - f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" - ) - print(f"Disk Used: {report.processing_perf.usage_state_size}") - print(f"Hash collisions: {report.num_usage_query_hash_collisions}") + print( + f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" + ) + print(f"Disk Used: {report.processing_perf.usage_state_size}") + print(f"Hash collisions: {report.num_usage_query_hash_collisions}") if __name__ == "__main__": diff --git a/metadata-ingestion/tests/performance/databricks/test_unity.py b/metadata-ingestion/tests/performance/databricks/test_unity.py index ddd19804ba184..71192dc5b509b 100644 --- a/metadata-ingestion/tests/performance/databricks/test_unity.py +++ b/metadata-ingestion/tests/performance/databricks/test_unity.py @@ -59,7 +59,7 @@ def run_test(): workunits = source.get_workunits() num_workunits, peak_memory_usage = workunit_sink(workunits) print(f"Workunits Generated: {num_workunits}") - print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") print( f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 984d9e4295745..a940cce46a8f7 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -53,7 +53,7 @@ def run_test(): workunits = source.get_workunits() num_workunits, peak_memory_usage = workunit_sink(workunits) logging.info(f"Workunits Generated: 
{num_workunits}") - logging.info(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + logging.info(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") logging.info(source.get_report().as_string()) logging.info( diff --git a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py index 5f783efc559bc..f09047c0ec4a4 100644 --- a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py +++ b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py @@ -12,12 +12,14 @@ def run_test() -> None: for i in range(N): if i % 50 == 0: print( - f"Running iteration {i}, elapsed time: {timer.elapsed_seconds():.2f} seconds" + f"Running iteration {i}, elapsed time: {timer.elapsed_seconds(digits=2)} seconds" ) try_format_query.__wrapped__(large_sql_query, platform="snowflake") - print(f"Total time taken for {N} iterations: {timer.elapsed_seconds():.2f} seconds") + print( + f"Total time taken for {N} iterations: {timer.elapsed_seconds(digits=2)} seconds" + ) if __name__ == "__main__": diff --git a/metadata-ingestion/tests/unit/cli/test_cli_utils.py b/metadata-ingestion/tests/unit/cli/test_cli_utils.py index c9693c75d96fe..c430f585200e5 100644 --- a/metadata-ingestion/tests/unit/cli/test_cli_utils.py +++ b/metadata-ingestion/tests/unit/cli/test_cli_utils.py @@ -70,6 +70,10 @@ def test_fixup_gms_url(): cli_utils.fixup_gms_url("http://abc.acryl.io/api/gms") == "https://abc.acryl.io/gms" ) + assert ( + cli_utils.fixup_gms_url("http://abcd.acryl.io:8080") + == "https://abcd.acryl.io/gms" + ) def test_guess_frontend_url_from_gms_url(): diff --git a/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py new file mode 100644 index 0000000000000..8bae38eaa7444 --- /dev/null +++ b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py @@ -0,0 +1,42 @@ +import time + +from 
datahub.ingestion.source_report.ingestion_stage import IngestionStageReport + + +def test_ingestion_stage_context_records_duration(): + report = IngestionStageReport() + with report.new_stage(stage="Test Stage"): + pass + assert len(report.ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report.ingestion_stage_durations.keys())) + + +def test_ingestion_stage_context_handles_exceptions(): + report = IngestionStageReport() + try: + with report.new_stage(stage="Test Stage"): + raise ValueError("Test Exception") + except ValueError: + pass + assert len(report.ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report.ingestion_stage_durations)) + + +def test_ingestion_stage_context_report_handles_multiple_stages(): + report = IngestionStageReport() + with report.new_stage(stage="Test Stage 1"): + time.sleep(0.1) + with report.new_stage(stage="Test Stage 2"): + time.sleep(0.1) + with report.new_stage(stage="Test Stage 3"): + time.sleep(0.1) + assert len(report.ingestion_stage_durations) == 3 + assert all( + isinstance(duration, float) and duration > 0.0 + for duration in report.ingestion_stage_durations.values() + ) + + sorted_stages = list(sorted(report.ingestion_stage_durations.keys())) + assert "Test Stage 1" in sorted_stages[0] + assert "Test Stage 2" in sorted_stages[1] + assert "Test Stage 3" in sorted_stages[2] diff --git a/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py index b4d7cb17b66f5..81120dfc87aba 100644 --- a/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py +++ b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py @@ -4,39 +4,41 @@ MOCK_GMS_ENDPOINT = "http://fakegmshost:8080" -def test_datahub_rest_emitter_construction(): +def test_datahub_rest_emitter_construction() -> None: emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT) - assert emitter._connect_timeout_sec == rest_emitter._DEFAULT_CONNECT_TIMEOUT_SEC - assert emitter._read_timeout_sec == 
rest_emitter._DEFAULT_READ_TIMEOUT_SEC - assert emitter._retry_status_codes == rest_emitter._DEFAULT_RETRY_STATUS_CODES - assert emitter._retry_max_times == rest_emitter._DEFAULT_RETRY_MAX_TIMES + assert emitter._session_config.timeout == rest_emitter._DEFAULT_TIMEOUT_SEC + assert ( + emitter._session_config.retry_status_codes + == rest_emitter._DEFAULT_RETRY_STATUS_CODES + ) + assert ( + emitter._session_config.retry_max_times == rest_emitter._DEFAULT_RETRY_MAX_TIMES + ) -def test_datahub_rest_emitter_timeout_construction(): +def test_datahub_rest_emitter_timeout_construction() -> None: emitter = DatahubRestEmitter( MOCK_GMS_ENDPOINT, connect_timeout_sec=2, read_timeout_sec=4 ) - assert emitter._connect_timeout_sec == 2 - assert emitter._read_timeout_sec == 4 + assert emitter._session_config.timeout == (2, 4) -def test_datahub_rest_emitter_general_timeout_construction(): +def test_datahub_rest_emitter_general_timeout_construction() -> None: emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT, timeout_sec=2, read_timeout_sec=4) - assert emitter._connect_timeout_sec == 2 - assert emitter._read_timeout_sec == 4 + assert emitter._session_config.timeout == (2, 4) -def test_datahub_rest_emitter_retry_construction(): +def test_datahub_rest_emitter_retry_construction() -> None: emitter = DatahubRestEmitter( MOCK_GMS_ENDPOINT, retry_status_codes=[418], retry_max_times=42, ) - assert emitter._retry_status_codes == [418] - assert emitter._retry_max_times == 42 + assert emitter._session_config.retry_status_codes == [418] + assert emitter._session_config.retry_max_times == 42 -def test_datahub_rest_emitter_extra_params(): +def test_datahub_rest_emitter_extra_params() -> None: emitter = DatahubRestEmitter( MOCK_GMS_ENDPOINT, extra_headers={"key1": "value1", "key2": "value2"} ) diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json 
index 0d8822736c95e..31d7419b2c8cc 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json @@ -18,7 +18,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f" + "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae" } ], "fineGrainedLineages": [ @@ -32,7 +32,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f" + "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae" }, { "upstreamType": "FIELD_SET", @@ -44,7 +44,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" ], "confidenceScore": 1.0, - "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f" + "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae" }, { "upstreamType": "FIELD_SET", @@ -56,7 +56,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" ], "confidenceScore": 1.0, - "query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f" + "query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae" } ] } @@ -64,7 +64,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f", + "entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -87,7 +87,7 @@ }, { "entityType": "query", - "entityUrn": 
"urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f", + "entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -114,7 +114,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f", + "entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -137,7 +137,7 @@ }, "operationType": "INSERT", "customProperties": { - "query_urn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f" + "query_urn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae" }, "lastUpdatedTimestamp": 20000 } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index fd8475090f009..e22947fd96ce4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -133,7 +133,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" + "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b" } ], "fineGrainedLineages": [ @@ -147,7 +147,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" + "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b" }, { "upstreamType": "FIELD_SET", @@ -159,7 +159,7 @@ 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" ], "confidenceScore": 1.0, - "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" + "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b" }, { "upstreamType": "FIELD_SET", @@ -171,7 +171,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" ], "confidenceScore": 1.0, - "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" + "query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b" } ] } @@ -179,7 +179,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4", + "entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -202,7 +202,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4", + "entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -229,7 +229,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4", + "entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json index a4ac349c3c455..b657b46476cbb 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json +++ 
b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename_with_temp.json @@ -133,7 +133,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332" + "query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e" } ], "fineGrainedLineages": [ @@ -147,7 +147,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" ], "confidenceScore": 0.2, - "query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332" + "query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e" }, { "upstreamType": "FIELD_SET", @@ -159,7 +159,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" ], "confidenceScore": 0.2, - "query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332" + "query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e" } ] } @@ -167,7 +167,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332", + "entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -190,7 +190,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332", + "entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -217,7 +217,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332", + 
"entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json index d9d46a4b14a14..09a98a81f2602 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json @@ -133,7 +133,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" + "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300" } ], "fineGrainedLineages": [ @@ -147,7 +147,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" + "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300" }, { "upstreamType": "FIELD_SET", @@ -159,7 +159,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)" ], "confidenceScore": 1.0, - "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" + "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300" }, { "upstreamType": "FIELD_SET", @@ -171,7 +171,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)" ], "confidenceScore": 1.0, - "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" + "query": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300" } ] } @@ -179,7 
+179,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405", + "entityUrn": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -202,7 +202,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405", + "entityUrn": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -229,7 +229,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405", + "entityUrn": "urn:li:query:1ed34195f33514203e8359ca22772e03a3588b669e0db00b1681e1a8d0862300", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -257,7 +257,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559" + "query": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c" } ] } @@ -265,7 +265,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559", + "entityUrn": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -288,7 +288,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559", + "entityUrn": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -306,7 +306,7 @@ }, { "entityType": "query", - "entityUrn": 
"urn:li:query:6f71602f39d01a39b3f8bd411c74c5ac08dc4b90bc3d49b257089acb19fa8559", + "entityUrn": "urn:li:query:76f0a8e1da90c4d33b5741c6e1014251ce2d1650ba0f58ab136ebaf1bb64dc8c", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -334,7 +334,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae" + "query": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d" } ] } @@ -342,7 +342,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", + "entityUrn": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -365,7 +365,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", + "entityUrn": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -383,7 +383,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:4b1fad909083e1ed5c47c146bd01247ed4d6295d175c34f9065b8fc6000fc7ae", + "entityUrn": "urn:li:query:37c14a3bbb67360d19d1666fa4e11b67ef81926e1e2bcd46b87ea239d27a549d", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -411,7 +411,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904" + "query": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943" }, { "auditStamp": { @@ -424,7 +424,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_incremental,PROD)", "type": "TRANSFORMED", - 
"query": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f" + "query": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2" } ] } @@ -432,7 +432,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904", + "entityUrn": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -455,7 +455,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904", + "entityUrn": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -473,7 +473,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904", + "entityUrn": "urn:li:query:f4eb748a53291bbea59e080f6d415b08dfd7003d0b7c3d538d02f4e404b30943", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -484,7 +484,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", + "entityUrn": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -507,7 +507,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", + "entityUrn": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -525,7 +525,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:481d0392ffeffdafd198d94e0a9f778dd722b60daa47083a32800b99ea21f86f", + "entityUrn": "urn:li:query:29935c31db1f06edf50d62a59d2874a86c51570256ab3b3102984439c03be1f2", "changeType": "UPSERT", 
"aspectName": "dataPlatformInstance", "aspect": { diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json index b4eaf76a14933..69bcd8eb10e95 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json @@ -133,7 +133,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3" + "query": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df" }, { "auditStamp": { @@ -146,7 +146,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3" + "query": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df" } ], "fineGrainedLineages": [ @@ -161,7 +161,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3" + "query": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df" } ] } @@ -169,7 +169,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3", + "entityUrn": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -192,7 +192,7 @@ }, { "entityType": "query", - "entityUrn": 
"urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3", + "entityUrn": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -219,7 +219,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3", + "entityUrn": "urn:li:query:composite_a10e266957d5007837642526d09f058ca461e42e2159ff45c328ebd069c112df", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -247,7 +247,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80" + "query": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544" }, { "auditStamp": { @@ -260,7 +260,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80" + "query": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544" } ], "fineGrainedLineages": [ @@ -275,7 +275,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_backup,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80" + "query": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544" } ] } @@ -283,7 +283,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80", + "entityUrn": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544", "changeType": "UPSERT", "aspectName": 
"queryProperties", "aspect": { @@ -306,7 +306,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80", + "entityUrn": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -330,7 +330,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_49daa72ac1d22734879a6bed1224daa7f8c1293750d6d7b8a24a0aa0e9f74d80", + "entityUrn": "urn:li:query:composite_5d8360cfc2f57f023d9945749848ad52227674fefc9fec568e7fbb1787cfd544", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py index dbe24ade6944f..c3c3a4a15d915 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py @@ -186,3 +186,15 @@ def test_query_fingerprint(): assert get_query_fingerprint( "select 1 + 1", platform="postgres" ) != get_query_fingerprint("select 2", platform="postgres") + + +def test_redshift_query_fingerprint(): + query1 = "insert into insert_into_table (select * from base_table);" + query2 = "INSERT INTO insert_into_table (SELECT * FROM base_table)" + + assert get_query_fingerprint(query1, "redshift") == get_query_fingerprint( + query2, "redshift" + ) + assert get_query_fingerprint(query1, "redshift", True) != get_query_fingerprint( + query2, "redshift", True + ) diff --git a/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py new file mode 100644 index 0000000000000..d03b08b77d5a9 --- /dev/null +++ b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py @@ -0,0 +1,213 @@ +from unittest.mock import Mock + +import 
pytest +import yaml + +from datahub.api.entities.structuredproperties.structuredproperties import ( + AllowedValue, + StructuredProperties, + TypeQualifierAllowedTypes, +) +from datahub.ingestion.graph.client import DataHubGraph +from datahub.metadata.schema_classes import ( + PropertyValueClass, + StructuredPropertyDefinitionClass, +) + + +@pytest.fixture +def sample_yaml_content(): + return """ +- id: test_property + type: string + description: Test description + display_name: Test Property + entity_types: + - dataset + cardinality: SINGLE + allowed_values: + - value: test_value + description: Test value description +""" + + +@pytest.fixture +def sample_yaml_file(tmp_path, sample_yaml_content): + yaml_file = tmp_path / "test_properties.yaml" + yaml_file.write_text(sample_yaml_content) + return str(yaml_file) + + +@pytest.fixture +def mock_graph(): + return Mock(spec=DataHubGraph) + + +def test_structured_properties_basic_creation(): + props = StructuredProperties( + id="test_prop", type="string", description="Test description" + ) + assert props.id == "test_prop" + assert props.type == "urn:li:dataType:datahub.string" + assert props.description == "Test description" + assert props.urn == "urn:li:structuredProperty:test_prop" + + +def test_structured_properties_validate_type(): + # Test valid types + props = StructuredProperties(id="test", type="string") + assert props.type == "urn:li:dataType:datahub.string" + + # Test invalid type + with pytest.raises(ValueError, match="Type .* is not allowed"): + StructuredProperties(id="test", type="invalid_type") + + +def test_structured_properties_validate_entity_types(): + # Test valid entity type + props = StructuredProperties(id="test", type="string", entity_types=["dataset"]) + assert props.entity_types + assert "urn:li:entityType:datahub.dataset" in props.entity_types + + # Test invalid entity type + with pytest.raises(ValueError, match="not a valid entity type"): + StructuredProperties(id="test", type="string", 
entity_types=["invalid_entity"]) + + +def test_structured_properties_from_yaml(sample_yaml_file): + props = StructuredProperties.from_yaml(sample_yaml_file) + assert len(props) == 1 + assert props[0].id == "test_property" + assert props[0].type == "urn:li:dataType:datahub.string" + assert props[0].description == "Test description" + assert props[0].display_name + assert props[0].display_name == "Test Property" + assert props[0].allowed_values + assert len(props[0].allowed_values) == 1 + assert props[0].allowed_values[0].value == "test_value" + + +def test_structured_properties_generate_mcps(): + props = StructuredProperties( + id="test_prop", + type="string", + description="Test description", + display_name="Test Property", + entity_types=["dataset"], + allowed_values=[ + AllowedValue(value="test_value", description="Test value description") + ], + ) + + mcps = props.generate_mcps() + assert len(mcps) == 1 + mcp = mcps[0] + + assert mcp.entityUrn == "urn:li:structuredProperty:test_prop" + assert isinstance(mcp.aspect, StructuredPropertyDefinitionClass) + assert mcp.aspect.valueType == "urn:li:dataType:datahub.string" + assert mcp.aspect.description == "Test description" + assert mcp.aspect.allowedValues + assert len(mcp.aspect.allowedValues) == 1 + assert mcp.aspect.allowedValues[0].value == "test_value" + + +def test_structured_properties_from_datahub(mock_graph): + mock_aspect = StructuredPropertyDefinitionClass( + qualifiedName="test_prop", + valueType="urn:li:dataType:datahub.string", + displayName="Test Property", + description="Test description", + entityTypes=["urn:li:entityType:datahub.dataset"], + cardinality="SINGLE", + allowedValues=[ + PropertyValueClass(value="test_value", description="Test description") + ], + ) + + mock_graph.get_aspect.return_value = mock_aspect + + props = StructuredProperties.from_datahub( + mock_graph, "urn:li:structuredProperty:test_prop" + ) + + assert props.qualified_name == "test_prop" + assert props.type == 
"urn:li:dataType:datahub.string" + assert props.display_name == "Test Property" + assert props.allowed_values + assert len(props.allowed_values) == 1 + assert props.allowed_values[0].value == "test_value" + + +def test_structured_properties_to_yaml(tmp_path): + props = StructuredProperties( + id="test_prop", + type="string", + description="Test description", + allowed_values=[ + AllowedValue(value="test_value", description="Test value description") + ], + ) + + yaml_file = tmp_path / "output.yaml" + props.to_yaml(yaml_file) + + # Verify the yaml file was created and contains expected content + assert yaml_file.exists() + with open(yaml_file) as f: + content = yaml.safe_load(f) + assert content["id"] == "test_prop" + assert content["type"] == "urn:li:dataType:datahub.string" + assert content["description"] == "Test description" + + +@pytest.mark.parametrize( + "input_type,expected_type", + [ + ("string", "urn:li:dataType:datahub.string"), + ("STRING", "urn:li:dataType:datahub.string"), + ("number", "urn:li:dataType:datahub.number"), + ("date", "urn:li:dataType:datahub.date"), + ], +) +def test_structured_properties_type_normalization(input_type, expected_type): + props = StructuredProperties(id="test_prop", type=input_type) + assert props.type == expected_type + + +def test_structured_properties_type_qualifier(): + props = StructuredProperties( + id="test_prop", + type="urn", + type_qualifier=TypeQualifierAllowedTypes(allowed_types=["dataset"]), + ) + + mcps = props.generate_mcps() + assert mcps[0].aspect + assert mcps[0].aspect.typeQualifier["allowedTypes"] == [ # type: ignore + "urn:li:entityType:datahub.dataset" + ] + + +def test_structured_properties_list(mock_graph): + mock_graph.get_urns_by_filter.return_value = [ + "urn:li:structuredProperty:prop1", + "urn:li:structuredProperty:prop2", + ] + + mock_aspect = StructuredPropertyDefinitionClass( + qualifiedName="test_prop", + valueType="urn:li:dataType:string", + entityTypes=["urn:li:entityType:datahub.dataset"], 
+ ) + mock_graph.get_aspect.return_value = mock_aspect + + props = list(StructuredProperties.list(mock_graph)) + + # Verify get_urns_by_filter was called with correct arguments + mock_graph.get_urns_by_filter.assert_called_once_with( + entity_types=["structuredProperty"] + ) + + assert len(props) == 2 + assert all(isinstance(prop, StructuredProperties) for prop in props) diff --git a/metadata-ingestion/tests/unit/test_gc.py b/metadata-ingestion/tests/unit/test_gc.py index 8f00d5e064db8..fde9a3f2e0cf0 100644 --- a/metadata-ingestion/tests/unit/test_gc.py +++ b/metadata-ingestion/tests/unit/test_gc.py @@ -9,6 +9,34 @@ DataProcessCleanupConfig, DataProcessCleanupReport, ) +from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import ( + SoftDeletedEntitiesCleanup, + SoftDeletedEntitiesCleanupConfig, + SoftDeletedEntitiesReport, +) + + +class TestSoftDeletedEntitiesCleanup(unittest.TestCase): + def setUp(self): + self.ctx = PipelineContext(run_id="test_run") + self.ctx.graph = MagicMock() + self.config = SoftDeletedEntitiesCleanupConfig() + self.report = SoftDeletedEntitiesReport() + self.cleanup = SoftDeletedEntitiesCleanup( + self.ctx, self.config, self.report, dry_run=True + ) + + def test_update_report(self): + self.cleanup._update_report( + urn="urn:li:dataset:1", + entity_type="dataset", + ) + self.assertEqual(1, self.report.num_hard_deleted) + self.assertEqual(1, self.report.num_hard_deleted_by_type["dataset"]) + + def test_increment_retained_count(self): + self.cleanup._increment_retained_count() + self.assertEqual(1, self.report.num_soft_deleted_retained_due_to_age) class TestDataProcessCleanup(unittest.TestCase): diff --git a/metadata-ingestion/tests/unit/test_tableau_source.py b/metadata-ingestion/tests/unit/test_tableau_source.py index 44e59decaecbd..227519fdb464a 100644 --- a/metadata-ingestion/tests/unit/test_tableau_source.py +++ b/metadata-ingestion/tests/unit/test_tableau_source.py @@ -1,4 +1,4 @@ -from typing import Any, Dict +from typing 
import Any, Dict, List import pytest @@ -7,6 +7,7 @@ from datahub.ingestion.source.tableau.tableau_common import ( get_filter_pages, make_filter, + optimize_query_filter, tableau_field_to_schema_field, ) from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField @@ -203,3 +204,46 @@ def test_get_filter_pages_id_filter_splits_into_multiple_filters(): {c.ID_WITH_IN: filter_dict[c.ID_WITH_IN][i : i + page_size]} for i in range(0, num_ids, page_size) ] + + +def test_optimize_query_filter_removes_duplicates(): + query_filter = { + c.ID_WITH_IN: ["id1", "id2", "id1"], + c.PROJECT_NAME_WITH_IN: ["project1", "project2", "project1"], + } + result = optimize_query_filter(query_filter) + assert len(result) == 2 + assert result[c.ID_WITH_IN] == ["id1", "id2"] + assert result[c.PROJECT_NAME_WITH_IN] == ["project1", "project2"] + + +def test_optimize_query_filter_handles_empty_lists(): + query_filter: Dict[str, List[str]] = {c.ID_WITH_IN: [], c.PROJECT_NAME_WITH_IN: []} + result = optimize_query_filter(query_filter) + assert len(result) == 2 + assert result[c.ID_WITH_IN] == [] + assert result[c.PROJECT_NAME_WITH_IN] == [] + + +def test_optimize_query_filter_handles_missing_keys(): + query_filter: Dict[str, List[str]] = {} + result = optimize_query_filter(query_filter) + assert result == {} + + +def test_optimize_query_filter_handles_other_keys(): + query_filter = {"any_other_key": ["id1", "id2", "id1"]} + result = optimize_query_filter(query_filter) + assert len(result) == 1 + assert result["any_other_key"] == ["id1", "id2", "id1"] + + +def test_optimize_query_filter_handles_no_duplicates(): + query_filter = { + c.ID_WITH_IN: ["id1", "id2"], + c.PROJECT_NAME_WITH_IN: ["project1", "project2"], + } + result = optimize_query_filter(query_filter) + assert len(result) == 2 + assert result[c.ID_WITH_IN] == ["id1", "id2"] + assert result[c.PROJECT_NAME_WITH_IN] == ["project1", "project2"] diff --git a/metadata-ingestion/tests/unit/test_usage_common.py 
b/metadata-ingestion/tests/unit/test_usage_common.py index e01f0ea77df83..bd6d194835dd9 100644 --- a/metadata-ingestion/tests/unit/test_usage_common.py +++ b/metadata-ingestion/tests/unit/test_usage_common.py @@ -5,6 +5,7 @@ from freezegun import freeze_time from pydantic import ValidationError +import datahub.ingestion.source.usage.usage_common from datahub.configuration.common import AllowDenyPattern from datahub.configuration.time_window_config import BucketDuration, get_time_bucket from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance @@ -28,6 +29,7 @@ UserUsageCountsClass, WindowDurationClass, ) +from datahub.testing.doctest import assert_doctest _TestTableRef = str @@ -373,3 +375,7 @@ def test_convert_usage_aggregation_class(): eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.MONTH), ), ) + + +def test_extract_user_email(): + assert_doctest(datahub.ingestion.source.usage.usage_common) diff --git a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py index 6230c2e37edc6..7e1627151c6eb 100644 --- a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py +++ b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py @@ -5,6 +5,7 @@ import sqlite3 from dataclasses import dataclass from typing import Counter, Dict +from unittest.mock import patch import pytest @@ -15,6 +16,36 @@ ) +def test_set_use_sqlite_on_conflict(): + with patch("sqlite3.sqlite_version_info", (3, 24, 0)): + cache = FileBackedDict[int]( + tablename="cache", + cache_max_size=10, + cache_eviction_batch_size=10, + ) + assert cache._use_sqlite_on_conflict is True + + with pytest.raises(RuntimeError): + with patch("sqlite3.sqlite_version_info", (3, 23, 1)): + cache = FileBackedDict[int]( + tablename="cache", + cache_max_size=10, + cache_eviction_batch_size=10, + ) + assert cache._use_sqlite_on_conflict is False + + with 
patch("sqlite3.sqlite_version_info", (3, 23, 1)), patch( + "datahub.utilities.file_backed_collections.OVERRIDE_SQLITE_VERSION_REQUIREMENT", + True, + ): + cache = FileBackedDict[int]( + tablename="cache", + cache_max_size=10, + cache_eviction_batch_size=10, + ) + assert cache._use_sqlite_on_conflict is False + + @pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) def test_file_dict(use_sqlite_on_conflict: bool) -> None: cache = FileBackedDict[int]( diff --git a/metadata-integration/java/acryl-spark-lineage/README.md b/metadata-integration/java/acryl-spark-lineage/README.md index 97851e90e860e..e51c884c297d7 100644 --- a/metadata-integration/java/acryl-spark-lineage/README.md +++ b/metadata-integration/java/acryl-spark-lineage/README.md @@ -24,7 +24,7 @@ When running jobs using spark-submit, the agent needs to be configured in the co ```text #Configuring DataHub spark agent jar -spark.jars.packages io.acryl:acryl-spark-lineage:0.2.16 +spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17 spark.extraListeners datahub.spark.DatahubSparkListener spark.datahub.rest.server http://localhost:8080 ``` @@ -32,7 +32,7 @@ spark.datahub.rest.server http://localhost:8080 ## spark-submit command line ```sh -spark-submit --packages io.acryl:acryl-spark-lineage:0.2.16 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py +spark-submit --packages io.acryl:acryl-spark-lineage:0.2.17 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py ``` ### Configuration Instructions: Amazon EMR @@ -41,7 +41,7 @@ Set the following spark-defaults configuration properties as it stated [here](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html) ```text -spark.jars.packages io.acryl:acryl-spark-lineage:0.2.16 +spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17 spark.extraListeners datahub.spark.DatahubSparkListener spark.datahub.rest.server https://your_datahub_host/gms #If you have 
authentication set up then you also need to specify the Datahub access token @@ -56,7 +56,7 @@ When running interactive jobs from a notebook, the listener can be configured wh spark = SparkSession.builder .master("spark://spark-master:7077") .appName("test-application") -.config("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.16") +.config("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.17") .config("spark.extraListeners", "datahub.spark.DatahubSparkListener") .config("spark.datahub.rest.server", "http://localhost:8080") .enableHiveSupport() @@ -79,7 +79,7 @@ appName("test-application") config("spark.master","spark://spark-master:7077") . -config("spark.jars.packages","io.acryl:acryl-spark-lineage:0.2.16") +config("spark.jars.packages","io.acryl:acryl-spark-lineage:0.2.17") . config("spark.extraListeners","datahub.spark.DatahubSparkListener") @@ -158,45 +158,47 @@ information like tokens. ## Configuration Options -| Field | Required | Default | Description | -|--------------------------------------------------------|----------|-----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| spark.jars.packages | ✅ | | Set with latest/required version io.acryl:acryl-spark-lineage:0.2.15 | -| spark.extraListeners | ✅ | | datahub.spark.DatahubSparkListener | -| spark.datahub.emitter | | rest | Specify the ways to emit metadata. By default it sends to DataHub using REST emitter. Valid options are rest, kafka or file | -| spark.datahub.rest.server | | http://localhost:8080 | Datahub server url eg: | -| spark.datahub.rest.token | | | Authentication token. | -| spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! | -| spark.datahub.rest.disable_chunked_encoding | | false | Disable Chunked Transfer Encoding. 
In some environment chunked encoding causes issues. With this config option it can be disabled. || -| spark.datahub.rest.max_retries | | 0 | Number of times a request retried if failed | -| spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries | -| spark.datahub.file.filename | | | The file where metadata will be written if file emitter is set | -| spark.datahub.kafka.bootstrap | | | The Kafka bootstrap server url to use if the Kafka emitter is set | -| spark.datahub.kafka.schema_registry_url | | | The Schema registry url to use if the Kafka emitter is set | -| spark.datahub.kafka.schema_registry_config. | | | Additional config to pass in to the Schema Registry Client | -| spark.datahub.kafka.producer_config. | | | Additional config to pass in to the Kafka producer. For example: `--conf "spark.datahub.kafka.producer_config.client.id=my_client_id"` | -| spark.datahub.metadata.pipeline.platformInstance | | | Pipeline level platform instance | -| spark.datahub.metadata.dataset.platformInstance | | | dataset level platform instance (it is usefult to set if you have it in your glue ingestion) | -| spark.datahub.metadata.dataset.env | | PROD | [Supported values](https://datahubproject.io/docs/graphql/enums#fabrictype). In all other cases, will fallback to PROD | -| spark.datahub.metadata.dataset.hivePlatformAlias | | hive | By default, datahub assigns Hive-like tables to the Hive platform. 
If you are using Glue as your Hive metastore, set this config flag to `glue` | +| Field | Required | Default | Description | +|--------------------------------------------------------|----------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| spark.jars.packages | ✅ | | Set with latest/required version io.acryl:acryl-spark-lineage:0.2.15 | +| spark.extraListeners | ✅ | | datahub.spark.DatahubSparkListener | +| spark.datahub.emitter | | rest | Specify the ways to emit metadata. By default it sends to DataHub using REST emitter. Valid options are rest, kafka or file | +| spark.datahub.rest.server | | http://localhost:8080 | Datahub server url eg: | +| spark.datahub.rest.token | | | Authentication token. | +| spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! | +| spark.datahub.rest.disable_chunked_encoding | | false | Disable Chunked Transfer Encoding. In some environment chunked encoding causes issues. With this config option it can be disabled. || +| spark.datahub.rest.max_retries | | 0 | Number of times a request retried if failed | +| spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries | +| spark.datahub.file.filename | | | The file where metadata will be written if file emitter is set | +| spark.datahub.kafka.bootstrap | | | The Kafka bootstrap server url to use if the Kafka emitter is set | +| spark.datahub.kafka.schema_registry_url | | | The Schema registry url to use if the Kafka emitter is set | +| spark.datahub.kafka.schema_registry_config. | | | Additional config to pass in to the Schema Registry Client | +| spark.datahub.kafka.producer_config. | | | Additional config to pass in to the Kafka producer. 
For example: `--conf "spark.datahub.kafka.producer_config.client.id=my_client_id"` | +| spark.datahub.metadata.pipeline.platformInstance | | | Pipeline level platform instance | +| spark.datahub.metadata.dataset.platformInstance | | | dataset level platform instance (it is useful to set if you have it in your glue ingestion) | +| spark.datahub.metadata.dataset.env | | PROD | [Supported values](https://datahubproject.io/docs/graphql/enums#fabrictype). In all other cases, will fallback to PROD | +| spark.datahub.metadata.dataset.hivePlatformAlias | | hive | By default, datahub assigns Hive-like tables to the Hive platform. If you are using Glue as your Hive metastore, set this config flag to `glue` | | spark.datahub.metadata.include_scheme | | true | Include scheme from the path URI (e.g. hdfs://, s3://) in the dataset URN. We recommend setting this value to false, it is set to true for backwards compatibility with previous versions | -| spark.datahub.metadata.remove_partition_pattern | | | Remove partition pattern. (e.g. /partition=\d+) It change database/table/partition=123 to database/table | -| spark.datahub.coalesce_jobs | | true | Only one datajob(task) will be emitted containing all input and output datasets for the spark application | -| spark.datahub.parent.datajob_urn | | | Specified dataset will be set as upstream dataset for datajob created. Effective only when spark.datahub.coalesce_jobs is set to true | -| spark.datahub.metadata.dataset.materialize | | false | Materialize Datasets in DataHub | -| spark.datahub.platform.s3.path_spec_list | | | List of pathspec per platform | -| spark.datahub.metadata.dataset.include_schema_metadata | false | | Emit dataset schema metadata based on the spark execution. 
It is recommended to get schema information from platform specific DataHub sources as this is less reliable | -| spark.datahub.flow_name | | | If it is set it will be used as the DataFlow name otherwise it uses spark app name as flow_name | -| spark.datahub.file_partition_regexp | | | Strip partition part from the path if path end matches with the specified regexp. Example `year=.*/month=.*/day=.*` | -| spark.datahub.tags | | | Comma separated list of tags to attach to the DataFlow | -| spark.datahub.domains | | | Comma separated list of domain urns to attach to the DataFlow | -| spark.datahub.stage_metadata_coalescing | | | Normally it coalesces and sends metadata at the onApplicationEnd event which is never called on Databricks or on Glue. You should enable this on Databricks if you want coalesced run. | -| spark.datahub.patch.enabled | | false | Set this to true to send lineage as a patch, which appends rather than overwrites existing Dataset lineage edges. By default, it is disabled. | -| spark.datahub.metadata.dataset.lowerCaseUrns | | false | Set this to true to lowercase dataset urns. By default, it is disabled. | -| spark.datahub.disableSymlinkResolution | | false | Set this to true if you prefer using the s3 location instead of the Hive table. By default, it is disabled. | -| spark.datahub.s3.bucket | | | The name of the bucket where metadata will be written if s3 emitter is set | -| spark.datahub.s3.prefix | | | The prefix for the file where metadata will be written on s3 if s3 emitter is set | -| spark.datahub.s3.filename | | | The name of the file where metadata will be written if it is not set random filename will be used on s3 if s3 emitter is set | - +| spark.datahub.metadata.remove_partition_pattern | | | Remove partition pattern. (e.g. 
/partition=\d+) It changes database/table/partition=123 to database/table | +| spark.datahub.coalesce_jobs | | true | Only one datajob(task) will be emitted containing all input and output datasets for the spark application | +| spark.datahub.parent.datajob_urn | | | Specified dataset will be set as upstream dataset for datajob created. Effective only when spark.datahub.coalesce_jobs is set to true | +| spark.datahub.metadata.dataset.materialize | | false | Materialize Datasets in DataHub | +| spark.datahub.platform.s3.path_spec_list | | | List of pathspec per platform | +| spark.datahub.metadata.dataset.include_schema_metadata | | false | Emit dataset schema metadata based on the spark execution. It is recommended to get schema information from platform specific DataHub sources as this is less reliable | +| spark.datahub.flow_name | | | If it is set, it will be used as the DataFlow name; otherwise the spark app name is used as flow_name | +| spark.datahub.file_partition_regexp | | | Strip partition part from the path if the end of the path matches the specified regexp. Example `year=.*/month=.*/day=.*` | +| spark.datahub.tags | | | Comma separated list of tags to attach to the DataFlow | +| spark.datahub.domains | | | Comma separated list of domain urns to attach to the DataFlow | +| spark.datahub.stage_metadata_coalescing | | | Normally it coalesces and sends metadata at the onApplicationEnd event which is never called on Databricks or on Glue. You should enable this on Databricks if you want a coalesced run. | +| spark.datahub.patch.enabled | | false | Set this to true to send lineage as a patch, which appends rather than overwrites existing Dataset lineage edges. By default, it is disabled. | +| spark.datahub.metadata.dataset.lowerCaseUrns | | false | Set this to true to lowercase dataset urns. By default, it is disabled. | +| spark.datahub.disableSymlinkResolution | | false | Set this to true if you prefer using the s3 location instead of the Hive table. 
By default, it is disabled. | +| spark.datahub.s3.bucket | | | The name of the bucket where metadata will be written if s3 emitter is set | +| spark.datahub.s3.prefix | | | The prefix for the file where metadata will be written on s3 if s3 emitter is set | +| spark.datahub.s3.filename | | | The name of the file where metadata will be written on s3 if s3 emitter is set; if it is not set, a random filename will be used | +| spark.datahub.s3.filename | | | The name of the file where metadata will be written on s3 if s3 emitter is set; if it is not set, a random filename will be used | +|spark.datahub.log.mcps | | true | Set this to true to log MCPs to the log. By default, it is enabled. | +|spark.datahub.legacyLineageCleanup.enabled| | false | Set this to true to remove legacy lineages from older Spark Plugin runs. This will remove those lineages from the Datasets which it adds to DataJob. By default, it is disabled. | ## What to Expect: The Metadata Model @@ -358,6 +360,19 @@ Use Java 8 to build the project. The project uses Gradle as the build tool. To b + ## Changelog +### Version 0.2.17 +- *Major changes*: + - Fine-grained lineage is now emitted on the DataJob and not on the emitted Datasets. This is the correct behaviour; it was incorrect in earlier versions. As a result, fine-grained lineages emitted by earlier versions will not be overwritten by the new ones. + You can remove the old lineages by setting `spark.datahub.legacyLineageCleanup.enabled=true`. Make sure you are running the latest server if you enable it together with patch support (this option was introduced in 0.2.17-rc5). + +- *Changes*: + - OpenLineage 1.25.0 upgrade + - Add option to disable chunked encoding in the datahub rest sink -> `spark.datahub.rest.disable_chunked_encoding` + - Add option to specify the mcp kafka topic for the datahub kafka sink -> `spark.datahub.kafka.mcp_topic` + - Add option to remove legacy lineages from older Spark Plugin runs. 
This will remove those lineages from the Datasets which it adds to DataJob -> `spark.datahub.legacyLineageCleanup.enabled` +- *Fixes*: + - Fix handling map transformation in the lineage. Earlier it generated wrong lineage for map transformation. + ### Version 0.2.16 - Remove logging DataHub config into logs diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java index 0bcc7db9e8740..84f397226ce91 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubEventEmitter.java @@ -1,12 +1,18 @@ package datahub.spark; +import static com.linkedin.metadata.Constants.*; import static datahub.spark.converter.SparkStreamingEventToDatahub.*; import static io.datahubproject.openlineage.converter.OpenLineageToDataHub.*; import static io.datahubproject.openlineage.utils.DatahubUtils.*; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.StreamReadConstraints; +import com.fasterxml.jackson.databind.ObjectMapper; import com.linkedin.common.GlobalTags; import com.linkedin.common.UrnArray; import com.linkedin.common.urn.DataJobUrn; +import com.linkedin.data.DataMap; +import com.linkedin.data.template.JacksonDataTemplateCodec; import com.linkedin.data.template.StringMap; import com.linkedin.dataprocess.DataProcessInstanceRelationships; import com.linkedin.dataprocess.RunResultType; @@ -62,12 +68,23 @@ public class DatahubEventEmitter extends EventEmitter { private final Map schemaMap = new HashMap<>(); private SparkLineageConf datahubConf; private static final int DEFAULT_TIMEOUT_SEC = 10; + private final ObjectMapper objectMapper; + private final JacksonDataTemplateCodec dataTemplateCodec; private final EventFormatter eventFormatter = new EventFormatter(); 
public DatahubEventEmitter(SparkOpenLineageConfig config, String applicationJobName) throws URISyntaxException { super(config, applicationJobName); + objectMapper = new ObjectMapper().setSerializationInclusion(JsonInclude.Include.NON_NULL); + int maxSize = + Integer.parseInt( + System.getenv() + .getOrDefault(INGESTION_MAX_SERIALIZED_STRING_LENGTH, MAX_JACKSON_STRING_SIZE)); + objectMapper + .getFactory() + .setStreamReadConstraints(StreamReadConstraints.builder().maxStringLength(maxSize).build()); + dataTemplateCodec = new JacksonDataTemplateCodec(objectMapper.getFactory()); } private Optional getEmitter() { @@ -407,7 +424,14 @@ protected void emitMcps(List mcps) { .map( mcp -> { try { - log.info("emitting mcpw: " + mcp); + if (this.datahubConf.isLogMcps()) { + DataMap map = mcp.data(); + String serializedMCP = dataTemplateCodec.mapToString(map); + log.info("emitting mcpw: {}", serializedMCP); + } else { + log.info( + "emitting aspect: {} for urn: {}", mcp.getAspectName(), mcp.getEntityUrn()); + } return emitter.get().emit(mcp); } catch (IOException ioException) { log.error("Failed to emit metadata to DataHub", ioException); diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java index 3860285083c4b..824cd1a687b26 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java @@ -31,6 +31,7 @@ public class SparkConfigParser { public static final String FILE_EMITTER_FILE_NAME = "file.filename"; public static final String DISABLE_SSL_VERIFICATION_KEY = "rest.disable_ssl_verification"; public static final String REST_DISABLE_CHUNKED_ENCODING = "rest.disable_chunked_encoding"; + public static final String CONFIG_LOG_MCPS = "log.mcps"; public static 
final String MAX_RETRIES = "rest.max_retries"; public static final String RETRY_INTERVAL_IN_SEC = "rest.retry_interval_in_sec"; @@ -51,6 +52,7 @@ public class SparkConfigParser { public static final String COALESCE_KEY = "coalesce_jobs"; public static final String PATCH_ENABLED = "patch.enabled"; + public static final String LEGACY_LINEAGE_CLEANUP = "legacyLineageCleanup.enabled"; public static final String DISABLE_SYMLINK_RESOLUTION = "disableSymlinkResolution"; public static final String STAGE_METADATA_COALESCING = "stage_metadata_coalescing"; @@ -158,6 +160,7 @@ public static DatahubOpenlineageConfig sparkConfigToDatahubOpenlineageConf( Config sparkConfig, SparkAppContext sparkAppContext) { DatahubOpenlineageConfig.DatahubOpenlineageConfigBuilder builder = DatahubOpenlineageConfig.builder(); + builder.isSpark(true); builder.filePartitionRegexpPattern( SparkConfigParser.getFilePartitionRegexpPattern(sparkConfig)); builder.fabricType(SparkConfigParser.getCommonFabricType(sparkConfig)); @@ -172,6 +175,7 @@ public static DatahubOpenlineageConfig sparkConfigToDatahubOpenlineageConf( builder.commonDatasetPlatformInstance(SparkConfigParser.getCommonPlatformInstance(sparkConfig)); builder.hivePlatformAlias(SparkConfigParser.getHivePlatformAlias(sparkConfig)); builder.usePatch(SparkConfigParser.isPatchEnabled(sparkConfig)); + builder.removeLegacyLineage(SparkConfigParser.isLegacyLineageCleanupEnabled(sparkConfig)); builder.disableSymlinkResolution(SparkConfigParser.isDisableSymlinkResolution(sparkConfig)); builder.lowerCaseDatasetUrns(SparkConfigParser.isLowerCaseDatasetUrns(sparkConfig)); try { @@ -311,6 +315,13 @@ public static boolean isDatasetMaterialize(Config datahubConfig) { && datahubConfig.getBoolean(DATASET_MATERIALIZE_KEY); } + public static boolean isLogMcps(Config datahubConfig) { + if (datahubConfig.hasPath(CONFIG_LOG_MCPS)) { + return datahubConfig.getBoolean(CONFIG_LOG_MCPS); + } + return true; + } + public static boolean isIncludeSchemaMetadata(Config 
datahubConfig) { if (datahubConfig.hasPath(DATASET_INCLUDE_SCHEMA_METADATA)) { return datahubConfig.getBoolean(DATASET_INCLUDE_SCHEMA_METADATA); @@ -352,6 +363,14 @@ public static boolean isPatchEnabled(Config datahubConfig) { return datahubConfig.hasPath(PATCH_ENABLED) && datahubConfig.getBoolean(PATCH_ENABLED); } + public static boolean isLegacyLineageCleanupEnabled(Config datahubConfig) { + if (!datahubConfig.hasPath(LEGACY_LINEAGE_CLEANUP)) { + return false; + } + return datahubConfig.hasPath(LEGACY_LINEAGE_CLEANUP) + && datahubConfig.getBoolean(LEGACY_LINEAGE_CLEANUP); + } + public static boolean isDisableSymlinkResolution(Config datahubConfig) { if (!datahubConfig.hasPath(DISABLE_SYMLINK_RESOLUTION)) { return false; diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java index 014cff873bbde..96afe729b82c0 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkLineageConf.java @@ -17,6 +17,7 @@ public class SparkLineageConf { final DatahubOpenlineageConfig openLineageConf; @Builder.Default final boolean coalesceEnabled = true; @Builder.Default final boolean emitCoalescePeriodically = false; + @Builder.Default final boolean logMcps = true; final SparkAppContext sparkAppContext; final DatahubEmitterConfig datahubEmitterConfig; @Builder.Default final List tags = new LinkedList<>(); @@ -32,6 +33,7 @@ public static SparkLineageConf toSparkLineageConf( SparkConfigParser.sparkConfigToDatahubOpenlineageConf(sparkConfig, sparkAppContext); builder.openLineageConf(datahubOpenlineageConfig); builder.coalesceEnabled(SparkConfigParser.isCoalesceEnabled(sparkConfig)); + builder.logMcps(SparkConfigParser.isLogMcps(sparkConfig)); if (SparkConfigParser.getTags(sparkConfig) 
!= null) { builder.tags(Arrays.asList(Objects.requireNonNull(SparkConfigParser.getTags(sparkConfig)))); } diff --git a/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java b/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java index ef2b17e9932f2..b9a142364d4e8 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java +++ b/metadata-integration/java/acryl-spark-lineage/src/test/java/datahub/spark/OpenLineageEventToDatahubTest.java @@ -814,4 +814,32 @@ public void testProcessGCSInputsOutputs() throws URISyntaxException, IOException dataset.getUrn().toString()); } } + + public void testProcessMappartitionJob() throws URISyntaxException, IOException { + DatahubOpenlineageConfig.DatahubOpenlineageConfigBuilder builder = + DatahubOpenlineageConfig.builder(); + builder.fabricType(FabricType.DEV); + builder.lowerCaseDatasetUrns(true); + builder.materializeDataset(true); + builder.includeSchemaMetadata(true); + builder.isSpark(true); + + String olEvent = + IOUtils.toString( + this.getClass().getResourceAsStream("/ol_events/map_partition_job.json"), + StandardCharsets.UTF_8); + + OpenLineage.RunEvent runEvent = OpenLineageClientUtils.runEventFromJson(olEvent); + DatahubJob datahubJob = OpenLineageToDataHub.convertRunEventToJob(runEvent, builder.build()); + + assertNotNull(datahubJob); + + assertEquals(1, datahubJob.getInSet().size()); + for (DatahubDataset dataset : datahubJob.getInSet()) { + assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/my_dir/my_file.csv,DEV)", + dataset.getUrn().toString()); + } + assertEquals(0, datahubJob.getOutSet().size()); + } } diff --git a/metadata-integration/java/acryl-spark-lineage/src/test/resources/ol_events/map_partition_job.json b/metadata-integration/java/acryl-spark-lineage/src/test/resources/ol_events/map_partition_job.json new file mode 100644 
index 0000000000000..39560a782840c --- /dev/null +++ b/metadata-integration/java/acryl-spark-lineage/src/test/resources/ol_events/map_partition_job.json @@ -0,0 +1,66 @@ +{ + "eventTime": "2024-11-20T12:59:29.059Z", + "producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark", + "schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunEvent", + "eventType": "START", + "run": { + "runId": "01902a1e-0b05-750e-b38d-439998f7a853", + "facets": { + "parent": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-1/ParentRunFacet.json#/$defs/ParentRunFacet", + "run": { + "runId": "01902a1e-0b05-750e-b38d-439998f7a853" + }, + "job": { + "namespace": "default", + "name": "spark_context_session" + } + }, + "processing_engine": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-1-1/ProcessingEngineRunFacet.json#/$defs/ProcessingEngineRunFacet", + "version": "3.4.2", + "name": "spark" + }, + "spark_jobDetails": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "jobId": 0 + }, + "spark_properties": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "properties": { + "spark.master": "yarn", + "spark.app.name": "SparkContextSession" + } + } + } + }, + "job": { + "namespace": "default", + "name": "spark_context_session.map_partitions_parallel_collection", + "facets": { + "jobType": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.24.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/2-0-3/JobTypeJobFacet.json#/$defs/JobTypeJobFacet", + 
"processingType": "BATCH", + "integration": "SPARK", + "jobType": "RDD_JOB" + } + } + }, + "inputs": [ + { + "namespace": "s3://my-bucket", + "name": "my_dir/my_file.csv" + } + ], + "outputs": [ + { + "namespace": "s3://my-bucket", + "name": "my_dir/my_file.csv" + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java index 5abb3c90d232b..c725673eae47b 100644 --- a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java @@ -16,6 +16,7 @@ @Getter @ToString public class DatahubOpenlineageConfig { + @Builder.Default private final boolean isSpark = false; @Builder.Default private final boolean isStreaming = false; @Builder.Default private final String pipelineName = null; private final String platformInstance; @@ -34,6 +35,7 @@ public class DatahubOpenlineageConfig { @Builder.Default private Map urnAliases = new HashMap<>(); @Builder.Default private final boolean disableSymlinkResolution = false; @Builder.Default private final boolean lowerCaseDatasetUrns = false; + @Builder.Default private final boolean removeLegacyLineage = false; public List getPathSpecsForPlatform(String platform) { if ((pathSpecs == null) || (pathSpecs.isEmpty())) { diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java index 9237ee60f473b..9fcfc68bd03f5 100644 --- 
a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java @@ -675,9 +675,30 @@ private static void convertJobToDataJob( datahubJob.setJobInfo(dji); DataJobInputOutput inputOutput = new DataJobInputOutput(); + boolean inputsEqualOutputs = false; + if ((datahubConf.isSpark()) + && ((event.getInputs() != null && event.getOutputs() != null) + && (event.getInputs().size() == event.getOutputs().size()))) { + inputsEqualOutputs = + event.getInputs().stream() + .map(OpenLineage.Dataset::getName) + .collect(Collectors.toSet()) + .equals( + event.getOutputs().stream() + .map(OpenLineage.Dataset::getName) + .collect(Collectors.toSet())); + if (inputsEqualOutputs) { + log.info( + "Inputs equals Outputs: {}. This is most probably because of an rdd map operation and we only process Inputs", + inputsEqualOutputs); + } + } + processJobInputs(datahubJob, event, datahubConf); - processJobOutputs(datahubJob, event, datahubConf); + if (!inputsEqualOutputs) { + processJobOutputs(datahubJob, event, datahubConf); + } DataProcessInstanceRunEvent dpire = processDataProcessInstanceResult(event); datahubJob.setDataProcessInstanceRunEvent(dpire); diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java index 60caaae359677..e2aa2c3a04c40 100644 --- a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java @@ -28,7 +28,10 @@ import com.linkedin.dataprocess.DataProcessInstanceRelationships; import 
com.linkedin.dataprocess.DataProcessInstanceRunEvent; import com.linkedin.dataset.FineGrainedLineage; +import com.linkedin.dataset.FineGrainedLineageArray; import com.linkedin.dataset.Upstream; +import com.linkedin.dataset.UpstreamArray; +import com.linkedin.dataset.UpstreamLineage; import com.linkedin.domain.Domains; import com.linkedin.metadata.aspect.patch.builder.DataJobInputOutputPatchBuilder; import com.linkedin.metadata.aspect.patch.builder.GlobalTagsPatchBuilder; @@ -167,11 +170,34 @@ public List toMcps(DatahubOpenlineageConfig config) thro return mcps; } + private FineGrainedLineageArray mergeFinegrainedLineages() { + FineGrainedLineageArray fgls = new FineGrainedLineageArray(); + + for (DatahubDataset dataset : inSet) { + if (dataset.lineage != null && dataset.lineage.getFineGrainedLineages() != null) { + dataset.lineage.getFineGrainedLineages().stream() + .filter(Objects::nonNull) + .forEach(fgls::add); + } + } + + for (DatahubDataset dataset : outSet) { + if (dataset.lineage != null && dataset.lineage.getFineGrainedLineages() != null) { + dataset.lineage.getFineGrainedLineages().stream() + .filter(Objects::nonNull) + .forEach(fgls::add); + } + } + + return fgls; + } + private void generateDataJobInputOutputMcp( EdgeArray inputEdges, EdgeArray outputEdges, DatahubOpenlineageConfig config, List mcps) { + DataJobInputOutput dataJobInputOutput = new DataJobInputOutput(); log.info("Adding DataJob edges to {}", jobUrn); if (config.isUsePatch() && (!parentJobs.isEmpty() || !inSet.isEmpty() || !outSet.isEmpty())) { @@ -186,6 +212,27 @@ private void generateDataJobInputOutputMcp( for (DataJobUrn parentJob : parentJobs) { dataJobInputOutputPatchBuilder.addInputDatajobEdge(parentJob); } + + FineGrainedLineageArray fgls = mergeFinegrainedLineages(); + fgls.forEach( + fgl -> { + Objects.requireNonNull(fgl.getUpstreams()) + .forEach( + upstream -> { + Objects.requireNonNull(fgl.getDownstreams()) + .forEach( + downstream -> { + 
dataJobInputOutputPatchBuilder.addFineGrainedUpstreamField( + upstream, + fgl.getConfidenceScore(), + StringUtils.defaultIfEmpty( + fgl.getTransformOperation(), "TRANSFORM"), + downstream, + fgl.getQuery()); + }); + }); + }); + MetadataChangeProposal dataJobInputOutputMcp = dataJobInputOutputPatchBuilder.build(); log.info( "dataJobInputOutputMcp: {}", @@ -195,6 +242,8 @@ private void generateDataJobInputOutputMcp( mcps.add(dataJobInputOutputPatchBuilder.build()); } else { + FineGrainedLineageArray fgls = mergeFinegrainedLineages(); + dataJobInputOutput.setFineGrainedLineages(fgls); dataJobInputOutput.setInputDatasetEdges(inputEdges); dataJobInputOutput.setInputDatasets(new DatasetUrnArray()); dataJobInputOutput.setOutputDatasetEdges(outputEdges); @@ -235,6 +284,49 @@ private void generateDataProcessInstanceMcp( generateDataProcessInstanceRelationship(mcps); } + private void deleteOldDatasetLineage( + DatahubDataset dataset, DatahubOpenlineageConfig config, List mcps) { + if (dataset.getLineage() != null) { + if (config.isUsePatch()) { + if (!dataset.getLineage().getUpstreams().isEmpty()) { + UpstreamLineagePatchBuilder upstreamLineagePatchBuilder = + new UpstreamLineagePatchBuilder().urn(dataset.getUrn()); + for (Upstream upstream : dataset.getLineage().getUpstreams()) { + upstreamLineagePatchBuilder.removeUpstream(upstream.getDataset()); + } + + log.info("Removing FineGrainedLineage to {}", dataset.getUrn()); + for (FineGrainedLineage fineGrainedLineage : + Objects.requireNonNull(dataset.getLineage().getFineGrainedLineages())) { + for (Urn upstream : Objects.requireNonNull(fineGrainedLineage.getUpstreams())) { + for (Urn downstream : Objects.requireNonNull(fineGrainedLineage.getDownstreams())) { + upstreamLineagePatchBuilder.removeFineGrainedUpstreamField( + upstream, + StringUtils.defaultIfEmpty( + fineGrainedLineage.getTransformOperation(), "TRANSFORM"), + downstream, + null); + } + } + } + MetadataChangeProposal mcp = upstreamLineagePatchBuilder.build(); + 
log.info( + "upstreamLineagePatch: {}", + mcp.getAspect().getValue().asString(Charset.defaultCharset())); + mcps.add(mcp); + } + } else { + if (!dataset.getLineage().getUpstreams().isEmpty()) { + // Remove earlier created UpstreamLineage which most probably was created by the plugin. + UpstreamLineage upstreamLineage = new UpstreamLineage(); + upstreamLineage.setUpstreams(new UpstreamArray()); + upstreamLineage.setFineGrainedLineages(new FineGrainedLineageArray()); + addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, upstreamLineage, mcps); + } + } + } + } + private Pair processDownstreams( DatahubOpenlineageConfig config, List mcps) { UrnArray outputUrnArray = new UrnArray(); @@ -263,43 +355,13 @@ private Pair processDownstreams( dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getSchemaMetadata(), mcps); } - if (dataset.getLineage() != null) { - if (config.isUsePatch()) { - if (!dataset.getLineage().getUpstreams().isEmpty()) { - UpstreamLineagePatchBuilder upstreamLineagePatchBuilder = - new UpstreamLineagePatchBuilder().urn(dataset.getUrn()); - for (Upstream upstream : dataset.getLineage().getUpstreams()) { - upstreamLineagePatchBuilder.addUpstream( - upstream.getDataset(), upstream.getType()); - } - - log.info("Adding FineGrainedLineage to {}", dataset.getUrn()); - for (FineGrainedLineage fineGrainedLineage : - Objects.requireNonNull(dataset.getLineage().getFineGrainedLineages())) { - for (Urn upstream : Objects.requireNonNull(fineGrainedLineage.getUpstreams())) { - for (Urn downstream : - Objects.requireNonNull(fineGrainedLineage.getDownstreams())) { - upstreamLineagePatchBuilder.addFineGrainedUpstreamField( - upstream, - fineGrainedLineage.getConfidenceScore(), - StringUtils.defaultIfEmpty( - fineGrainedLineage.getTransformOperation(), "TRANSFORM"), - downstream, - null); - } - } - } - MetadataChangeProposal mcp = upstreamLineagePatchBuilder.build(); - log.info( - "upstreamLineagePatch: {}", - mcp.getAspect().getValue().asString(Charset.defaultCharset())); 
- mcps.add(mcp); - } - } else { - addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getLineage(), mcps); - } + // Remove lineage which was added by older plugin that set lineage on Datasets and not on + // DataJobs + if (config.isRemoveLegacyLineage()) { + deleteOldDatasetLineage(dataset, config, mcps); } }); + return Pair.of(outputUrnArray, outputEdges); } @@ -330,10 +392,6 @@ private Pair processUpstreams( addAspectToMcps( dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getSchemaMetadata(), mcps); } - - if (dataset.getLineage() != null) { - addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getLineage(), mcps); - } }); return Pair.of(inputUrnArray, inputEdges); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 3d35f5956b0f4..35d133c74c069 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -775,7 +775,8 @@ public List batchIngestProposals( List updatedUrns = new ArrayList<>(); Iterators.partition( - metadataChangeProposals.iterator(), Math.max(1, entityClientConfig.getBatchGetV2Size())) + metadataChangeProposals.iterator(), + Math.max(1, entityClientConfig.getBatchIngestSize())) .forEachRemaining( batch -> { AspectsBatch aspectsBatch = diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java index 6de79b6c4b181..792e67e69f2da 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java @@ -411,6 +411,8 @@ private void reindex(ReindexConfig indexState) throws Throwable { 
boolean reindexTaskCompleted = false; Pair documentCounts = getDocumentCounts(indexState.name(), tempIndexName); long documentCountsLastUpdated = System.currentTimeMillis(); + long previousDocCount = documentCounts.getSecond(); + long estimatedMinutesRemaining = 0; while (System.currentTimeMillis() < timeoutAt) { log.info( @@ -421,8 +423,22 @@ private void reindex(ReindexConfig indexState) throws Throwable { Pair tempDocumentsCount = getDocumentCounts(indexState.name(), tempIndexName); if (!tempDocumentsCount.equals(documentCounts)) { - documentCountsLastUpdated = System.currentTimeMillis(); + long currentTime = System.currentTimeMillis(); + long timeElapsed = currentTime - documentCountsLastUpdated; + long docsIndexed = tempDocumentsCount.getSecond() - previousDocCount; + + // Calculate indexing rate (docs per millisecond) + double indexingRate = timeElapsed > 0 ? (double) docsIndexed / timeElapsed : 0; + + // Calculate remaining docs and estimated time + long remainingDocs = tempDocumentsCount.getFirst() - tempDocumentsCount.getSecond(); + long estimatedMillisRemaining = + indexingRate > 0 ? (long) (remainingDocs / indexingRate) : 0; + estimatedMinutesRemaining = estimatedMillisRemaining / (1000 * 60); + + documentCountsLastUpdated = currentTime; documentCounts = tempDocumentsCount; + previousDocCount = documentCounts.getSecond(); } if (documentCounts.getFirst().equals(documentCounts.getSecond())) { @@ -435,12 +451,15 @@ private void reindex(ReindexConfig indexState) throws Throwable { break; } else { + float progressPercentage = + 100 * (1.0f * documentCounts.getSecond()) / documentCounts.getFirst(); log.warn( - "Task: {} - Document counts do not match {} != {}. Complete: {}%", + "Task: {} - Document counts do not match {} != {}. Complete: {}%. 
Estimated time remaining: {} minutes", parentTaskId, documentCounts.getFirst(), documentCounts.getSecond(), - 100 * (1.0f * documentCounts.getSecond()) / documentCounts.getFirst()); + progressPercentage, + estimatedMinutesRemaining); long lastUpdateDelta = System.currentTimeMillis() - documentCountsLastUpdated; if (lastUpdateDelta > (300 * 1000)) { diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java index 5e387d7d88292..968f0dd4dd61e 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java @@ -137,7 +137,7 @@ protected OperationContext sampleDataOperationContext( return testOpContext.toBuilder() .searchContext(SearchContext.builder().indexConvention(indexConvention).build()) - .build(testOpContext.getSessionAuthentication()); + .build(testOpContext.getSessionAuthentication(), true); } @Bean(name = "longTailOperationContext") @@ -148,7 +148,7 @@ protected OperationContext longTailOperationContext( return testOpContext.toBuilder() .searchContext(SearchContext.builder().indexConvention(indexConvention).build()) - .build(testOpContext.getSessionAuthentication()); + .build(testOpContext.getSessionAuthentication(), true); } protected EntityIndexBuilders entityIndexBuildersHelper(OperationContext opContext) { diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java index b7b698c73ddac..26443e019829b 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java +++ 
b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java @@ -162,7 +162,7 @@ protected OperationContext searchLineageOperationContext( return testOpContext.toBuilder() .searchContext(SearchContext.builder().indexConvention(indexConvention).build()) - .build(testOpContext.getSessionAuthentication()); + .build(testOpContext.getSessionAuthentication(), true); } @Bean(name = "searchLineageESIndexBuilder") diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index f16c9dbd82e74..c92749385145d 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -95,7 +95,8 @@ public OperationContext operationContext( mock(ServicesRegistryContext.class), indexConvention, TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry), - mock(ValidationContext.class)); + mock(ValidationContext.class), + true); } @MockBean SpringStandardPluginConfiguration springStandardPluginConfiguration; diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java index c08b7fad4dee3..11e38dfb179e0 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java @@ -29,23 +29,31 @@ @EqualsAndHashCode public class ActorContext implements ContextInterface { - public static ActorContext asSystem(Authentication systemAuthentication) { - return 
ActorContext.builder().systemAuth(true).authentication(systemAuthentication).build(); + public static ActorContext asSystem( + Authentication systemAuthentication, boolean enforceExistenceEnabled) { + return ActorContext.builder() + .systemAuth(true) + .authentication(systemAuthentication) + .enforceExistenceEnabled(enforceExistenceEnabled) + .build(); } public static ActorContext asSessionRestricted( Authentication authentication, Set dataHubPolicySet, - Collection groupMembership) { + Collection groupMembership, + boolean enforceExistenceEnabled) { return ActorContext.builder() .systemAuth(false) .authentication(authentication) .policyInfoSet(dataHubPolicySet) .groupMembership(groupMembership) + .enforceExistenceEnabled(enforceExistenceEnabled) .build(); } private final Authentication authentication; + private final boolean enforceExistenceEnabled; @EqualsAndHashCode.Exclude @Builder.Default private final Set policyInfoSet = Collections.emptySet(); @@ -79,7 +87,7 @@ public boolean isActive(AspectRetriever aspectRetriever) { Map aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of()); - if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) { + if (enforceExistenceEnabled && !aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) { // user is hard deleted return false; } diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java index 9158129235b39..30255f7ebcac3 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java @@ -152,7 +152,8 @@ public static OperationContext asSystem( @Nullable ServicesRegistryContext servicesRegistryContext, @Nullable IndexConvention indexConvention, @Nullable RetrieverContext retrieverContext, - @Nonnull ValidationContext 
validationContext) { + @Nonnull ValidationContext validationContext, + boolean enforceExistenceEnabled) { return asSystem( config, systemAuthentication, @@ -161,7 +162,8 @@ public static OperationContext asSystem( indexConvention, retrieverContext, validationContext, - ObjectMapperContext.DEFAULT); + ObjectMapperContext.DEFAULT, + enforceExistenceEnabled); } public static OperationContext asSystem( @@ -172,10 +174,15 @@ public static OperationContext asSystem( @Nullable IndexConvention indexConvention, @Nullable RetrieverContext retrieverContext, @Nonnull ValidationContext validationContext, - @Nonnull ObjectMapperContext objectMapperContext) { + @Nonnull ObjectMapperContext objectMapperContext, + boolean enforceExistenceEnabled) { ActorContext systemActorContext = - ActorContext.builder().systemAuth(true).authentication(systemAuthentication).build(); + ActorContext.builder() + .systemAuth(true) + .authentication(systemAuthentication) + .enforceExistenceEnabled(enforceExistenceEnabled) + .build(); OperationContextConfig systemConfig = config.toBuilder().allowSystemAuthentication(true).build(); SearchContext systemSearchContext = @@ -457,13 +464,16 @@ public int hashCode() { public static class OperationContextBuilder { @Nonnull - public OperationContext build(@Nonnull Authentication sessionAuthentication) { - return build(sessionAuthentication, false); + public OperationContext build( + @Nonnull Authentication sessionAuthentication, boolean enforceExistenceEnabled) { + return build(sessionAuthentication, false, enforceExistenceEnabled); } @Nonnull public OperationContext build( - @Nonnull Authentication sessionAuthentication, boolean skipCache) { + @Nonnull Authentication sessionAuthentication, + boolean skipCache, + boolean enforceExistenceEnabled) { final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr()); final ActorContext sessionActor = ActorContext.builder() @@ -476,6 +486,7 @@ public OperationContext build( 
.equals(sessionAuthentication.getActor())) .policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn)) .groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn)) + .enforceExistenceEnabled(enforceExistenceEnabled) .build(); return build(sessionActor, skipCache); } diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index 4abfbb196f067..92d62d42295b9 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -260,7 +260,8 @@ public static OperationContext systemContext( servicesRegistryContext, indexConvention, retrieverContext, - validationContext); + validationContext, + true); if (postConstruct != null) { postConstruct.accept(operationContext); diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java index 15fe2bc277b9b..de6f71408e258 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/ActorContextTest.java @@ -87,42 +87,43 @@ public void actorContextId() { Authentication userAuth = new Authentication(new Actor(ActorType.USER, "USER"), ""); assertEquals( - ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of()).getCacheKeyComponent(), - ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of()).getCacheKeyComponent(), + ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of(), true).getCacheKeyComponent(), + ActorContext.asSessionRestricted(userAuth, 
Set.of(), Set.of(), true).getCacheKeyComponent(), "Expected equality across instances"); assertEquals( - ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of()).getCacheKeyComponent(), + ActorContext.asSessionRestricted(userAuth, Set.of(), Set.of(), true).getCacheKeyComponent(), ActorContext.asSessionRestricted( - userAuth, Set.of(), Set.of(UrnUtils.getUrn("urn:li:corpGroup:group1"))) + userAuth, Set.of(), Set.of(UrnUtils.getUrn("urn:li:corpGroup:group1")), true) .getCacheKeyComponent(), "Expected no impact to cache context from group membership"); assertEquals( - ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of()) + ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of(), true) .getCacheKeyComponent(), - ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of()) + ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of(), true) .getCacheKeyComponent(), "Expected equality when non-ownership policies are identical"); assertNotEquals( - ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC_RESOURCE, POLICY_D), Set.of()) + ActorContext.asSessionRestricted( + userAuth, Set.of(POLICY_ABC_RESOURCE, POLICY_D), Set.of(), true) .getCacheKeyComponent(), - ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of()) + ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_ABC, POLICY_D), Set.of(), true) .getCacheKeyComponent(), "Expected differences with non-identical resource policy"); assertNotEquals( - ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER), Set.of()) + ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER), Set.of(), true) .getCacheKeyComponent(), - ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of()) + ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of(), true) .getCacheKeyComponent(), "Expected differences with ownership policy"); assertNotEquals( - 
ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER_TYPE), Set.of()) + ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D_OWNER_TYPE), Set.of(), true) .getCacheKeyComponent(), - ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of()) + ActorContext.asSessionRestricted(userAuth, Set.of(POLICY_D), Set.of(), true) .getCacheKeyComponent(), "Expected differences with ownership type policy"); } diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java index f77b244d8f2d8..a2575c1c56220 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java @@ -27,7 +27,8 @@ public void testSystemPrivilegeEscalation() { mock(ServicesRegistryContext.class), null, TestOperationContexts.emptyActiveUsersRetrieverContext(null), - mock(ValidationContext.class)); + mock(ValidationContext.class), + true); OperationContext opContext = systemOpContext.asSession(RequestContext.TEST, Authorizer.EMPTY, userAuth); @@ -51,7 +52,7 @@ public void testSystemPrivilegeEscalation() { systemOpContext.getOperationContextConfig().toBuilder() .allowSystemAuthentication(false) .build()) - .build(userAuth); + .build(userAuth, true); assertEquals( opContextNoSystem.getAuthentication(), diff --git a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java index 442263bbd6b43..81cc5e60552a7 100644 --- a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java +++ b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java 
@@ -9,6 +9,9 @@ public class AuthenticationConfiguration { /** Whether authentication is enabled */ private boolean enabled; + /** Whether user existence is enforced */ + private boolean enforceExistenceEnabled; + /** * List of configurations for {@link com.datahub.plugins.auth.authentication.Authenticator}s to be * registered diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java index 4437682bfeb0a..ce9c636be16ac 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java @@ -320,7 +320,8 @@ public void setupTest() throws Exception { mock(ServicesRegistryContext.class), mock(IndexConvention.class), mock(RetrieverContext.class), - mock(ValidationContext.class)); + mock(ValidationContext.class), + true); _dataHubAuthorizer = new DataHubAuthorizer( diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index f6fa4a37fdadb..c029cb4648d01 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -6,6 +6,9 @@ authentication: # Enable if you want all requests to the Metadata Service to be authenticated. enabled: ${METADATA_SERVICE_AUTH_ENABLED:true} + # Disable if you want to skip validation of deleted user's tokens + enforceExistenceEnabled: ${METADATA_SERVICE_AUTH_ENFORCE_EXISTENCE_ENABLED:true} + # Required if enabled is true! A configurable chain of Authenticators authenticators: # Required for authenticating requests with DataHub-issued Access Tokens - best not to remove. 
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java index 3e2823591e168..78107cc0ecc90 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java @@ -79,7 +79,8 @@ protected OperationContext javaSystemOperationContext( ValidationContext.builder() .alternateValidation( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) - .build()); + .build(), + configurationProvider.getAuthentication().isEnforceExistenceEnabled()); entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); @@ -134,7 +135,8 @@ protected OperationContext restliSystemOperationContext( ValidationContext.builder() .alternateValidation( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) - .build()); + .build(), + configurationProvider.getAuthentication().isEnforceExistenceEnabled()); entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); diff --git a/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java b/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java index cc21819cf4ab5..b47c779f768a9 100644 --- a/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java +++ b/metadata-service/factories/src/test/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStepTest.java @@ -87,7 +87,7 @@ public void testExecuteChecksKeySpecForAllUrns() 
throws Exception { mockOpContext = mockOpContext.toBuilder() .entityRegistryContext(spyEntityRegistryContext) - .build(mockOpContext.getSessionAuthentication()); + .build(mockOpContext.getSessionAuthentication(), true); mockDBWithWorkToDo(migrationsDao, countOfCorpUserEntities, countOfChartEntities); diff --git a/settings.gradle b/settings.gradle index b0c2c707d566c..77d0706549a43 100644 --- a/settings.gradle +++ b/settings.gradle @@ -78,3 +78,52 @@ include ':metadata-operation-context' include ':metadata-service:openapi-servlet:models' include ':metadata-integration:java:datahub-schematron:lib' include ':metadata-integration:java:datahub-schematron:cli' + +def installPreCommitHooks() { + def preCommitInstalled = false + try { + def process = ["which", "pre-commit"].execute() + def stdout = new StringBuilder() + def stderr = new StringBuilder() + process.waitForProcessOutput(stdout, stderr) + preCommitInstalled = (process.exitValue() == 0) + println "Pre-commit check: ${stdout}" + } catch (Exception e) { + println "Error checking pre-commit: ${e.message}" + return + } + + if (!preCommitInstalled) { + try { + def installProcess = ["python", "-m", "pip", "install", "pre-commit"].execute() + def stdout = new StringBuilder() + def stderr = new StringBuilder() + installProcess.waitForProcessOutput(stdout, stderr) + if (installProcess.exitValue() != 0) { + println "Failed to install pre-commit: ${stderr}" + return + } + println "Install output: ${stdout}" + } catch (Exception e) { + println "Error installing pre-commit: ${e.message}" + return + } + } + + try { + def installHooksProcess = ["python", "-m", "pre_commit", "install"].execute() + def stdout = new StringBuilder() + def stderr = new StringBuilder() + installHooksProcess.waitForProcessOutput(stdout, stderr) + if (installHooksProcess.exitValue() != 0) { + println "Failed to install hooks: ${stderr}" + return + } + println "Hooks output: ${stdout}" + } catch (Exception e) { + println "Error installing hooks: 
${e.message}" + return + } +} + +installPreCommitHooks() \ No newline at end of file diff --git a/smoke-test/tests/structured_properties/test_structured_properties.py b/smoke-test/tests/structured_properties/test_structured_properties.py index 533a03a55735a..e3c33aa406efc 100644 --- a/smoke-test/tests/structured_properties/test_structured_properties.py +++ b/smoke-test/tests/structured_properties/test_structured_properties.py @@ -839,3 +839,49 @@ def validate_search(qualified_name, expected): # Validate search works for property #1 & #2 validate_search(property1.qualified_name, expected=[]) validate_search(property2.qualified_name, expected=[dataset_urns[0]]) + + +def test_structured_properties_list(ingest_cleanup_data, graph_client, caplog): + # Create property, assign value to target dataset urn + def create_property(): + property_name = f"listTest{randint(10, 10000)}Property" + value_type = "string" + property_urn = f"urn:li:structuredProperty:{default_namespace}.{property_name}" + + create_property_definition( + property_name=property_name, + graph=graph_client, + value_type=value_type, + cardinality="SINGLE", + ) + + test_property = StructuredProperties.from_datahub( + graph=graph_client, urn=property_urn + ) + assert test_property is not None + + return test_property + + # create 2 structured properties + property1 = create_property() + property2 = create_property() + wait_for_writes_to_sync() + + # validate that urns are in the list + structured_properties_urns = [ + u for u in StructuredProperties.list_urns(graph_client) + ] + assert property1.urn in structured_properties_urns + assert property2.urn in structured_properties_urns + + # list structured properties (full) + structured_properties = StructuredProperties.list(graph_client) + matched_properties = [ + p for p in structured_properties if p.urn in [property1.urn, property2.urn] + ] + assert len(matched_properties) == 2 + retrieved_property1 = next(p for p in matched_properties if p.urn == 
property1.urn) + retrieved_property2 = next(p for p in matched_properties if p.urn == property2.urn) + + assert property1.dict() == retrieved_property1.dict() + assert property2.dict() == retrieved_property2.dict()