
Commit

Merge branch 'master' into feature/cus-3546-get-connection-object-pagination-metrics
sgomezvillamor authored Jan 13, 2025
2 parents 5faf0ab + 457f96e commit 507a4f5
Showing 22 changed files with 905 additions and 712 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/build-and-test.yml
@@ -118,10 +118,12 @@ jobs:
run: |
echo "BACKEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(metadata-models|entity-registry|datahuyb-graphql-core|metadata-io|metadata-jobs|metadata-utils|metadata-service|medata-dao-impl|metadata-operation|li-utils|metadata-integration|metadata-events|metadata-auth|ingestion-scheduler|notifications|datahub-upgrade)' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
echo "FRONTEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(datahub-frontend|datahub-web-react).*\.(xml|json)$' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
- name: Generate tz artifact name
run: echo "NAME_TZ=$(echo ${{ matrix.timezone }} | tr '/' '-')" >> $GITHUB_ENV
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (build) - ${{ matrix.command}}-${{ matrix.timezone }}
name: Test Results (build) - ${{ matrix.command}}-${{ env.NAME_TZ }}
path: |
**/build/reports/tests/test/**
**/build/test-results/test/**
1 change: 1 addition & 0 deletions build.gradle
@@ -379,6 +379,7 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {

resolutionStrategy.force externalDependency.antlr4Runtime
resolutionStrategy.force externalDependency.antlr4
resolutionStrategy.force 'org.apache.mina:mina-core:2.2.4'
}
}

4 changes: 0 additions & 4 deletions docker/datahub-upgrade/Dockerfile
@@ -34,16 +34,12 @@ ARG MAVEN_CENTRAL_REPO_URL
RUN if [ "${ALPINE_REPO_URL}" != "http://dl-cdn.alpinelinux.org/alpine" ] ; then sed -i "s#http.*://dl-cdn.alpinelinux.org/alpine#${ALPINE_REPO_URL}#g" /etc/apk/repositories ; fi

ENV JMX_VERSION=0.18.0
ENV JETTY_VERSION=11.0.21

# Upgrade Alpine and base packages
# PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762
RUN apk --no-cache --update-cache --available upgrade \
&& apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat snappy \
&& apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-runner/${JETTY_VERSION}/jetty-runner-${JETTY_VERSION}.jar --output jetty-runner.jar \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-jmx/${JETTY_VERSION}/jetty-jmx-${JETTY_VERSION}.jar --output jetty-jmx.jar \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-util/${JETTY_VERSION}/jetty-util-${JETTY_VERSION}.jar --output jetty-util.jar \
&& wget --no-verbose ${GITHUB_REPO_URL}/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \
&& wget --no-verbose ${MAVEN_CENTRAL_REPO_URL}/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \
&& cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
24 changes: 20 additions & 4 deletions docs-website/README.md
@@ -130,7 +130,6 @@ The purpose of this section is to provide developers & technical users with conc

This section aims to provide plain-language feature overviews for both technical and non-technical readers alike.


## Docs Generation Features

**Includes all markdown files**
@@ -145,16 +144,33 @@ You can suppress this check by adding the path to the file in a comment in `side

Use an "inline" directive to include code snippets from other files. The `show_path_as_comment` option will include the path to the file as a comment at the top of the snippet.

```python
{{ inline /metadata-ingestion/examples/library/data_quality_mcpw_rest.py show_path_as_comment }}
```
```python
{{ inline /metadata-ingestion/examples/library/data_quality_mcpw_rest.py show_path_as_comment }}
```
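
With `show_path_as_comment`, the rendered snippet begins with the file path as a comment, roughly like this (an illustrative sketch; the body is the contents of the referenced file):

```python
# /metadata-ingestion/examples/library/data_quality_mcpw_rest.py
# ... contents of the referenced example file are inlined here ...
```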

**Command Output**

Use the `{{ command-output cmd }}` directive to run subprocesses and inject the outputs into the final markdown.

{{ command-output python -c 'print("Hello world")' }}

This also works for multi-line scripts.

{{ command-output
source metadata-ingestion/venv/bin/activate
python -m <something>
}}

Regardless of the location of the markdown file, the subcommands will be executed with the working directory set to the repo root.

Only the stdout of the subprocess will be injected. The stderr, if any, will be included as a comment in the markdown.
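
A rough Python sketch of the behavior described above (hypothetical, for illustration only; the real implementation is the TypeScript `markdown_process_command_output` function in `docs-website/generateDocsDir.ts`, shown later in this diff):

```python
import subprocess
from pathlib import Path

# Assumption for this sketch: the script lives one directory below the repo root.
REPO_ROOT = Path(__file__).resolve().parent.parent


def run_command_output_directive(command: str) -> str:
    """Run a {{ command-output ... }} command and return the text to inject."""
    # Commands always run from the repo root, regardless of the markdown file's location.
    result = subprocess.run(
        command, shell=True, cwd=REPO_ROOT, capture_output=True, text=True
    )
    output = result.stdout.strip()
    if result.returncode != 0 or result.stderr:
        # stderr (or a failure) is surfaced as a markdown comment, not as visible output.
        output += f"\n<!-- Error: {result.stderr.strip()} -->"
    return output


# Example: this would inject "Hello world" into the generated markdown.
print(run_command_output_directive('echo "Hello world"'))
```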

## Docs site generation process

This process is orchestrated by a combination of Gradle and Yarn tasks. The main entrypoint is the `docs-website:yarnGenerate` task, which in turn eventually runs `yarn run generate`.

Steps:

1. Generate the combined GraphQL schema using the `docs-website:generateGraphQLSchema` Gradle task. This generates `./graphql/combined.graphql`.
2. Generate docs for ingestion sources using the `:metadata-ingestion:docGen` gradle task.
3. Generate docs for our metadata model using the `:metadata-ingestion:modelDocGen` gradle task.
37 changes: 37 additions & 0 deletions docs-website/generateDocsDir.ts
@@ -439,6 +439,42 @@ function markdown_process_inline_directives(
contents.content = new_content;
}

function markdown_process_command_output(
contents: matter.GrayMatterFile<string>,
filepath: string
): void {
const new_content = contents.content.replace(
/^{{\s*command-output\s*([\s\S]*?)\s*}}$/gm,
(_, command: string) => {
try {
// Change to repo root directory before executing command
const repoRoot = path.resolve(__dirname, "..");

console.log(`Executing command: ${command}`);

// Execute the command and capture output
const output = execSync(command, {
cwd: repoRoot,
encoding: "utf8",
stdio: ["pipe", "pipe", "pipe"],
});

// Return the command output
return output.trim();
} catch (error: any) {
// If there's an error, include it as a comment
const errorMessage = error.stderr
? error.stderr.toString()
: error.message;
return `${
error.stdout ? error.stdout.toString().trim() : ""
}\n<!-- Error: ${errorMessage.trim()} -->`;
}
}
);
contents.content = new_content;
}

function markdown_sanitize_and_linkify(content: string): string {
// MDX escaping
content = content.replace(/</g, "&lt;");
@@ -602,6 +638,7 @@ function copy_python_wheels(): void {
markdown_rewrite_urls(contents, filepath);
markdown_enable_specials(contents, filepath);
markdown_process_inline_directives(contents, filepath);
markdown_process_command_output(contents, filepath);
//copy_platform_logos();
// console.log(contents);

36 changes: 36 additions & 0 deletions docs/managed-datahub/subscription-and-notification.md
@@ -17,9 +17,30 @@ Email will work out of box. For installing the DataHub Slack App, see:
This feature is especially useful in helping you stay on top of any upstream changes that could impact the assets you or your stakeholders rely on. It eliminates the need for you and your team to manually check for upstream changes, or for upstream stakeholders to identify and notify impacted users.
As a user, you can subscribe to and receive notifications about changes such as deprecations, schema changes, changes in ownership, assertions, or incidents. You'll always be in the know about potential data quality issues so you can proactively manage your data resources.


## Platform Admin Notifications

DataHub provides three levels of notifications:

- **Platform-level**
- **Group-level** (described in other sections)
- **User-level** (described in other sections)

**Setting Platform-Level Notifications:**
This requires appropriate permissions. Go to `Settings` > `Notifications` (under the `Platform` section, not `My Notifications`).

**Platform-level Notifications:**
Platform-level notifications apply to all assets within DataHub.
Example: If "An owner is added or removed from a data asset" is ticked, the designated Slack channel or email will receive notifications for any such changes across all assets.

**Our Recommendations:**

Notifying on tag changes for every asset in the platform would be noisy, so we recommend using platform-level notifications only where appropriate. For example, route ingestion-failure notifications to a central Slack channel or email; this will help you proactively ensure your DataHub metadata stays fresh.

## Prerequisites

Once you have [configured Slack within your DataHub instance](slack/saas-slack-setup.md), you will be able to subscribe to any Entity in DataHub and begin receiving notifications via DM.

To begin receiving personal notifications, go to Settings > "My Notifications". From here, toggle on Slack Notifications and input your Slack Member ID.

If you want to create and manage group-level Subscriptions for your team, you will need [the following privileges](../../docs/authorization/roles.md#role-privileges):
@@ -162,6 +183,21 @@ You can unsubscribe from any asset to stop receiving notifications about it. On
What if I want to be notified about different changes?
</summary>
To modify your subscription, use the dropdown menu next to the Subscribe button to change which updates you want to be notified about.
</details>
<details>
<summary>
I want to configure multiple channels. How many Slack channels or emails can I configure to get notified?
</summary>
At the platform level, you can configure one email and one Slack channel.

At the user and group levels, you can configure one default email and Slack channel, and override that email/channel when you subscribe to a specific asset.

To configure multiple channels, first ensure you have the appropriate privileges. Then:

1. Create a DataHub group for each channel you want notifications for.
2. Add yourself as a member of each group.
3. When you visit an asset and go to subscribe, you'll see the option "Manage Group Subscriptions".

</details>

## Reference
@@ -0,0 +1,16 @@
### Configuration Notes

1. See the [Microsoft Grant user access to a Report Server doc](https://docs.microsoft.com/en-us/sql/reporting-services/security/grant-user-access-to-a-report-server?view=sql-server-ver16) for granting access to the Report Server.
2. Use the user credentials from the previous step in your YAML recipe file.

### Concept mapping

| Power BI Report Server | DataHub     |
| ---------------------- | ----------- |
| `Paginated Report` | `Dashboard` |
| `Power BI Report` | `Dashboard` |
| `Mobile Report` | `Dashboard` |
| `Linked Report` | `Dashboard` |
| `Dataset, Datasource` | `N/A` |

This file was deleted.

38 changes: 25 additions & 13 deletions metadata-ingestion/scripts/avro_codegen.py
@@ -346,7 +346,7 @@ def write_urn_classes(key_aspects: List[dict], urn_dir: Path) -> None:
code = """
# This file contains classes corresponding to entity URNs.
from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union
import functools
from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -547,10 +547,31 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
assert fields[0]["type"] == ["null", "string"]
fields[0]["type"] = "string"

field_urn_type_classes = {}
for field in fields:
# Figure out if urn types are valid for each field.
field_urn_type_class = None
if field_name(field) == "platform":
field_urn_type_class = "DataPlatformUrn"
elif field.get("Urn"):
if len(field.get("entityTypes", [])) == 1:
field_entity_type = field["entityTypes"][0]
field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn"
else:
field_urn_type_class = "Urn"

field_urn_type_classes[field_name(field)] = field_urn_type_class

_init_arg_parts: List[str] = []
for field in fields:
field_urn_type_class = field_urn_type_classes[field_name(field)]

default = '"PROD"' if field_name(field) == "env" else None
_arg_part = f"{field_name(field)}: {field_type(field)}"

type_hint = field_type(field)
if field_urn_type_class:
type_hint = f'Union["{field_urn_type_class}", str]'
_arg_part = f"{field_name(field)}: {type_hint}"
if default:
_arg_part += f" = {default}"
_init_arg_parts.append(_arg_part)
@@ -579,16 +600,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
init_validation += f'if not {field_name(field)}:\n raise InvalidUrnError("{class_name} {field_name(field)} cannot be empty")\n'

# Generalized mechanism for validating embedded urns.
field_urn_type_class = None
if field_name(field) == "platform":
field_urn_type_class = "DataPlatformUrn"
elif field.get("Urn"):
if len(field.get("entityTypes", [])) == 1:
field_entity_type = field["entityTypes"][0]
field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn"
else:
field_urn_type_class = "Urn"

field_urn_type_class = field_urn_type_classes[field_name(field)]
if field_urn_type_class:
init_validation += f"{field_name(field)} = str({field_name(field)})\n"
init_validation += (
@@ -608,7 +620,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
init_coercion += " platform_name = DataPlatformUrn.from_string(platform_name).platform_name\n"

if field_name(field) == "platform":
init_coercion += "platform = DataPlatformUrn(platform).urn()\n"
init_coercion += "platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()\n"
elif field_urn_type_class is None:
# For all non-urns, run the value through the UrnEncoder.
init_coercion += (
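
The net effect of the `avro_codegen.py` changes above is that urn-typed constructor arguments on the generated URN classes accept either a typed Urn instance or a plain string. A minimal usage sketch, assuming the generated `DatasetUrn` and `DataPlatformUrn` classes are importable from `datahub.metadata.urns`:

```python
from datahub.metadata.urns import DataPlatformUrn, DatasetUrn

# Previous behavior, still supported: pass the platform as a plain string.
urn_from_str = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")

# Enabled by the new Union["DataPlatformUrn", str] type hint: pass a DataPlatformUrn instance.
urn_from_obj = DatasetUrn(platform=DataPlatformUrn("snowflake"), name="db.schema.table")

# Both forms are coerced to the same platform urn internally.
assert urn_from_str.urn() == urn_from_obj.urn()
```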