Merge branch 'datahub-project:master' into master

anshbansal authored Nov 25, 2024
2 parents c898913 + 56a5dca commit cc809a6

Showing 46 changed files with 1,280 additions and 318 deletions.
@@ -84,8 +84,21 @@ private TimeSeriesChart getActiveUsersTimeSeriesChart(
final DateTime end,
final String title,
final DateInterval interval) {
final DateRange dateRange =
new DateRange(String.valueOf(beginning.getMillis()), String.valueOf(end.getMillis()));

final DateRange dateRange;

// adjust month to show 1st of month rather than last day of previous month
if (interval == DateInterval.MONTH) {
dateRange =
new DateRange(
String.valueOf(beginning.plusDays(1).getMillis()), // Shift start by 1 day
String.valueOf(end.plusDays(1).getMillis()) // Shift end by 1 day
);
} else {
// weeks display starting on Sundays
dateRange =
new DateRange(String.valueOf(beginning.getMillis()), String.valueOf(end.getMillis()));
}

final List<NamedLine> timeSeriesLines =
_analyticsService.getTimeseriesChart(
@@ -96,6 +109,7 @@ private TimeSeriesChart getActiveUsersTimeSeriesChart(
ImmutableMap.of(),
Collections.emptyMap(),
Optional.of("browserId"));

return TimeSeriesChart.builder()
.setTitle(title)
.setDateRange(dateRange)
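A sketch of the boundary arithmetic this branch relies on, assuming (as the DateUtil test below suggests) that chart boundaries are produced as "start of month minus 1 millisecond". This is illustrative Python, not the Joda-Time code above:

```python
from datetime import datetime, timedelta

# Sketch (not DataHub code): a month boundary arrives as "start of month
# minus 1 ms", e.g. Nov 30 23:59:59.999 for a December window, so a naive
# label shows the previous month.
boundary = datetime(2024, 11, 30, 23, 59, 59, 999000)

# Shifting the boundary forward one day relabels it as Dec 1, matching the
# month the data point actually describes.
adjusted = boundary + timedelta(days=1)
assert (adjusted.year, adjusted.month, adjusted.day) == (2024, 12, 1)
```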
@@ -47,4 +47,26 @@ public void testStartOfNextWeek() {
Mockito.when(dateUtil.getNow()).thenReturn(setTimeParts(8, false));
assertEqualStartOfNextWeek(dateUtil, 9);
}

// validates the date adjustments used to display correct dates in the MAU chart
@Test
public void testDateAdjustmentsForMonth() {
DateUtil dateUtil = Mockito.spy(DateUtil.class);

Mockito.when(dateUtil.getNow()).thenReturn(new DateTime(2024, 11, 15, 0, 0, 0));

// start date should be next month minus a day
// but we want to display Dec 1 instead of Nov 30, so add a day and verify it's Dec
DateTime startOfNextMonthMinus12 = dateUtil.getStartOfNextMonth().minusMonths(12);
DateTime adjustedStart = startOfNextMonthMinus12.minusMillis(1).plusDays(1);
assertEquals(12, adjustedStart.getMonthOfYear()); // Verify it is December
assertEquals(2023, adjustedStart.getYear()); // Verify it is 2023

// verify that the end date displays correctly
// the chart will display Oct 1 as the last month because we don't show current month
DateTime startOfThisMonth = dateUtil.getStartOfThisMonth();
DateTime adjustedEnd = startOfThisMonth.minusMillis(1).plusDays(1);
assertEquals(11, adjustedEnd.getMonthOfYear()); // Verify it is November
assertEquals(2024, adjustedEnd.getYear()); // Verify it is 2024
}
}
@@ -284,7 +284,7 @@
"name": "csv-enricher",
"displayName": "CSV",
"description": "Import metadata from a formatted CSV.",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv'",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher",
"recipe": "source: \n type: csv-enricher \n config: \n # URL of your csv file to ingest \n filename: \n array_delimiter: '|' \n delimiter: ',' \n write_semantics: PATCH"
},
{
2 changes: 1 addition & 1 deletion datahub-web-react/src/app/ingest/source/conf/csv/csv.ts
@@ -15,7 +15,7 @@ const csvConfig: SourceConfig = {
type: 'csv-enricher',
placeholderRecipe,
displayName: 'CSV',
docsUrl: 'https://datahubproject.io/docs/generated/ingestion/sources/csv',
docsUrl: 'https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher',
logoUrl: csvLogo,
};

2 changes: 2 additions & 0 deletions docker/profiles/docker-compose.gms.yml
@@ -101,6 +101,7 @@ x-datahub-gms-service: &datahub-gms-service
<<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml}
ALTERNATE_MCP_VALIDATION: ${ALTERNATE_MCP_VALIDATION:-true}
STRICT_URN_VALIDATION_ENABLED: ${STRICT_URN_VALIDATION_ENABLED:-true}
healthcheck:
test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
start_period: 90s
@@ -183,6 +184,7 @@ x-datahub-mce-consumer-service: &datahub-mce-consumer-service
environment: &datahub-mce-consumer-env
<<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
ALTERNATE_MCP_VALIDATION: ${ALTERNATE_MCP_VALIDATION:-true}
STRICT_URN_VALIDATION_ENABLED: ${STRICT_URN_VALIDATION_ENABLED:-true}

x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev
<<: *datahub-mce-consumer-service
13 changes: 7 additions & 6 deletions docs/deploy/environment-vars.md
@@ -9,12 +9,13 @@ DataHub works.

## Feature Flags

| Variable | Default | Unit/Type | Components | Description |
|--------------------------------------------------|---------|-----------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
| `UI_INGESTION_ENABLED` | `true` | boolean | [`GMS`, `MCE Consumer`] | Enable UI based ingestion. |
| `DATAHUB_ANALYTICS_ENABLED` | `true` | boolean | [`Frontend`, `GMS`] | Collect DataHub usage to populate the analytics dashboard. |
| `BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE` | `true` | boolean | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Do not wait for the `system-update` to complete before starting. This should typically only be disabled during development. |
| `ER_MODEL_RELATIONSHIP_FEATURE_ENABLED` | `false` | boolean | [`Frontend`, `GMS`] | Enable ER Model Relation Feature that shows Relationships Tab within a Dataset UI. |
| Variable | Default | Unit/Type | Components | Description |
|--------------------------------------------------|----------|-----------|------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
| `UI_INGESTION_ENABLED` | `true` | boolean | [`GMS`, `MCE Consumer`] | Enable UI based ingestion. |
| `DATAHUB_ANALYTICS_ENABLED` | `true` | boolean | [`Frontend`, `GMS`] | Collect DataHub usage to populate the analytics dashboard. |
| `BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE` | `true` | boolean | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Do not wait for the `system-update` to complete before starting. This should typically only be disabled during development. |
| `ER_MODEL_RELATIONSHIP_FEATURE_ENABLED` | `false` | boolean | [`Frontend`, `GMS`] | Enable ER Model Relation Feature that shows Relationships Tab within a Dataset UI. |
| `STRICT_URN_VALIDATION_ENABLED`                  | `false`  | boolean   | [`GMS`, `MCE Consumer`, `MAE Consumer`]  | Enable stricter URN validation logic.                                                                                        |


## Ingestion
2 changes: 1 addition & 1 deletion docs/how/updating-datahub.md
@@ -38,7 +38,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

### Breaking Changes

- #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
- #11486 - The Criterion `value` parameter was previously deprecated. Use of `value` instead of `values` is no longer supported and will be removed entirely in the next major version (a sketch follows this list).
- #11484 - Metadata service authentication enabled by default
- #11484 - Rest API authorization enabled by default
- #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
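For illustration only — the payload shape and the `platform` field below are placeholders, not a verified DataHub API surface — the migration amounts to replacing a comma-delimited `value` string with an explicit `values` list:

```python
# No longer supported: multiple values packed into a comma-delimited `value`
legacy_criterion = {
    "field": "platform",
    "condition": "EQUAL",
    "value": "urn:li:dataPlatform:snowflake,urn:li:dataPlatform:bigquery",
}

# Supported: an explicit list in `values`
criterion = {
    "field": "platform",
    "condition": "EQUAL",
    "values": [
        "urn:li:dataPlatform:snowflake",
        "urn:li:dataPlatform:bigquery",
    ],
}
```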
2 changes: 1 addition & 1 deletion docs/managed-datahub/release-notes/v_0_3_7.md
@@ -32,7 +32,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
datahub:
timezone: 'America/Los_Angeles'
```
- #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
- #11486 - The Criterion `value` parameter was previously deprecated. Use of `value` instead of `values` is no longer supported and will be removed entirely in the next major version.
- #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
- #11619 - schema field/column paths can no longer be empty strings
- #11619 - schema field/column paths can no longer be duplicated within the schema
3 changes: 2 additions & 1 deletion docs/what/urn.md
@@ -38,7 +38,8 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)
There are a few restrictions when creating an urn:

1. Commas are reserved characters in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `( , )`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character ``

Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
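A minimal sketch of the URL-encoding approach (the dataset name below is invented):

```python
from urllib.parse import quote

# Percent-encode the reserved characters (',', '(', ')', ':') before
# embedding a value in an urn field; the raw name here is an arbitrary example.
raw_name = "events(2024):daily,rollup"
encoded = quote(raw_name, safe="")

urn = f"urn:li:dataset:(urn:li:dataPlatform:hdfs,{encoded},PROD)"
print(urn)
# urn:li:dataset:(urn:li:dataPlatform:hdfs,events%282024%29%3Adaily%2Crollup,PROD)
```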
@@ -9,6 +9,7 @@
import com.linkedin.util.Pair;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -49,7 +50,8 @@ default List<MCPItem> getMCPItems() {
* various hooks
*/
Pair<Map<String, Set<String>>, List<ChangeMCP>> toUpsertBatchItems(
Map<String, Map<String, SystemAspect>> latestAspects);
Map<String, Map<String, SystemAspect>> latestAspects,
Map<String, Map<String, Long>> nextVersions);

/**
* Apply read mutations to batch
@@ -227,4 +229,39 @@ static String toAbbreviatedString(Collection<? extends BatchItem> items, int max
+ StringUtils.abbreviate(itemsAbbreviated.toString(), maxWidth)
+ '}';
}

/**
* Increment an aspect's version within a batch, tracking both the next aspect version and the
* most recent aspect value so that later items in the same batch see earlier writes
*
* @param changeMCP changeMCP to be incremented
* @param latestAspects latest aspects within the batch
* @param nextVersions next version for the aspects in the batch
* @return the incremented changeMCP
*/
static ChangeMCP incrementBatchVersion(
ChangeMCP changeMCP,
Map<String, Map<String, SystemAspect>> latestAspects,
Map<String, Map<String, Long>> nextVersions) {
long nextVersion =
nextVersions
.getOrDefault(changeMCP.getUrn().toString(), Collections.emptyMap())
.getOrDefault(changeMCP.getAspectName(), 0L);

changeMCP.setPreviousSystemAspect(
latestAspects
.getOrDefault(changeMCP.getUrn().toString(), Collections.emptyMap())
.getOrDefault(changeMCP.getAspectName(), null));

changeMCP.setNextAspectVersion(nextVersion);

// support inner-batch upserts
latestAspects
.computeIfAbsent(changeMCP.getUrn().toString(), key -> new HashMap<>())
.put(changeMCP.getAspectName(), changeMCP.getSystemAspect(nextVersion));
nextVersions
.computeIfAbsent(changeMCP.getUrn().toString(), key -> new HashMap<>())
.put(changeMCP.getAspectName(), nextVersion + 1);

return changeMCP;
}
}
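The bookkeeping above can be summarized with a short sketch (illustrative Python with invented names, not the Java implementation): each upsert reads the next version for its (urn, aspect) pair, captures the previous value, then advances both maps so later items in the same batch observe the earlier write.

```python
# Sketch of the inner-batch version bookkeeping:
latest = {}         # (urn, aspect) -> most recent value within the batch
next_versions = {}  # (urn, aspect) -> version to assign next

def increment_batch_version(urn, aspect, value):
    key = (urn, aspect)
    version = next_versions.get(key, 0)  # first version defaults to 0
    previous = latest.get(key)           # prior value, if any, in this batch
    latest[key] = value                  # later items see this write
    next_versions[key] = version + 1     # and will receive the next version
    return version, previous

# Two upserts of the same aspect in one batch:
print(increment_batch_version("urn:li:x", "status", "a"))  # (0, None)
print(increment_batch_version("urn:li:x", "status", "b"))  # (1, 'a')
```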
@@ -10,6 +10,7 @@ public class Constants {
public static final String INTERNAL_DELEGATED_FOR_ACTOR_HEADER_NAME = "X-DataHub-Delegated-For";
public static final String INTERNAL_DELEGATED_FOR_ACTOR_TYPE = "X-DataHub-Delegated-For-";

public static final String URN_LI_PREFIX = "urn:li:";
public static final String DATAHUB_ACTOR = "urn:li:corpuser:datahub"; // Super user.
public static final String SYSTEM_ACTOR =
"urn:li:corpuser:__datahub_system"; // DataHub internal service principal.
27 changes: 0 additions & 27 deletions metadata-ingestion/examples/mce_files/bootstrap_mce.json
@@ -3613,33 +3613,6 @@
},
"systemMetadata": null
},
{
"entityType": "post",
"entityUrn": "urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8de",
"changeType": "UPSERT",
"aspectName": "postInfo",
"aspect": {
"json": {
"type": "HOME_PAGE_ANNOUNCEMENT",
"content": {
"title": "Join Metadata & AI Summit 2024",
"type": "LINK",
"link": "http://www.acryldata.io/conference?utm_source=datahub_quickstart&utm_medium=metadata_ai_2024&utm_campaign=pinned_announcement",
"media": {
"type": "IMAGE",
"location": "https://formulatedby.com/wp-content/uploads/2024/07/0193320a6d93e7508d1598f7b24662f75a87e92f-352x456-1.svg"
}
},
"created": 1712547125049,
"lastModified": 1712547125049
}
},
"systemMetadata": {
"lastObserved": 1712548844816,
"runId": "datahub-2024_04_08-13_00_44",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "post",
"entityUrn": "urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8dd",
@@ -30,6 +30,9 @@ def _resolve_oauth_callback(self) -> None:

call_back = self.get_call_back_attribute()

assert call_back # to silent lint
assert isinstance(call_back, str), (
"oauth_cb must be a string representing python function reference "
"in the format <python-module>:<function-name>."
)
# Set the callback
self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
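For context, a sketch of the `<python-module>:<function-name>` reference format that this assertion enforces (module and function names are invented, not real DataHub symbols):

```python
# Hypothetical consumer configuration; "my_company.auth" and
# "fetch_oauth_token" are placeholders.
consumer_config = {
    "oauth_cb": "my_company.auth:fetch_oauth_token",
}

# import_path(...) above resolves such a string to a callable; the split
# below shows the shape the assertion guards against getting wrong.
module_name, func_name = consumer_config["oauth_cb"].split(":", 1)
assert module_name and func_name
```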
@@ -8,7 +8,7 @@

from datahub.ingestion.api.report import Report
from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
@@ -77,7 +77,7 @@ class BigQueryQueriesExtractorReport(Report):

@dataclass
class BigQueryV2Report(
ProfilingSqlReport,
SQLSourceReport,
IngestionStageReport,
BaseTimeWindowReport,
ClassificationReportMixin,
@@ -152,6 +152,21 @@ class BigqueryDataset:
snapshots: List[BigqueryTableSnapshot] = field(default_factory=list)
columns: List[BigqueryColumn] = field(default_factory=list)

# Some INFORMATION_SCHEMA views are not available for BigLake tables
# based on Amazon S3 and Blob Storage data.
# https://cloud.google.com/bigquery/docs/omni-introduction#limitations
# Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
def is_biglake_dataset(self) -> bool:
return self.location is not None and self.location.lower().startswith(
("aws-", "azure-")
)

def supports_table_constraints(self) -> bool:
return not self.is_biglake_dataset()

def supports_table_partitions(self) -> bool:
return not self.is_biglake_dataset()
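
A brief usage sketch of the new helpers; the import path and constructor arguments are assumptions inferred from the fields visible here, not verified against the full class:

```python
# Assumed module path for the dataclass defined above.
from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigqueryDataset

# A location prefixed with "aws-" or "azure-" marks a BigLake / Omni dataset,
# for which constraint and partition views are unavailable.
aws_dataset = BigqueryDataset(name="sales", location="aws-us-east-1")
assert aws_dataset.is_biglake_dataset()
assert not aws_dataset.supports_table_partitions()

us_dataset = BigqueryDataset(name="sales", location="US")
assert us_dataset.supports_table_constraints()
```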


@dataclass
class BigqueryProject:
@@ -541,18 +556,26 @@ def get_table_constraints_for_dataset(
table_name=constraint.table_name,
type=constraint.constraint_type,
field_path=constraint.column_name,
referenced_project_id=constraint.referenced_catalog
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_dataset=constraint.referenced_schema
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_table_name=constraint.referenced_table
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_column_name=constraint.referenced_column
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_project_id=(
constraint.referenced_catalog
if constraint.constraint_type == "FOREIGN KEY"
else None
),
referenced_dataset=(
constraint.referenced_schema
if constraint.constraint_type == "FOREIGN KEY"
else None
),
referenced_table_name=(
constraint.referenced_table
if constraint.constraint_type == "FOREIGN KEY"
else None
),
referenced_column_name=(
constraint.referenced_column
if constraint.constraint_type == "FOREIGN KEY"
else None
),
)
)
self.report.num_get_table_constraints_for_dataset_api_requests += 1
@@ -498,7 +498,10 @@ def _process_schema(
report=self.report,
rate_limiter=rate_limiter,
)
if self.config.include_table_constraints:
if (
self.config.include_table_constraints
and bigquery_dataset.supports_table_constraints()
):
constraints = self.schema_api.get_table_constraints_for_dataset(
project_id=project_id, dataset_name=dataset_name, report=self.report
)
@@ -1157,9 +1160,11 @@ def gen_schema_metadata(
# fields=[],
fields=self.gen_schema_fields(
columns,
table.constraints
if (isinstance(table, BigqueryTable) and table.constraints)
else [],
(
table.constraints
if (isinstance(table, BigqueryTable) and table.constraints)
else []
),
),
foreignKeys=foreign_keys if foreign_keys else None,
)
@@ -1180,13 +1185,9 @@ def get_tables_for_dataset(
) -> Iterable[BigqueryTable]:
# In bigquery there is no way to query all tables in a Project id
with PerfTimer() as timer:
# PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables
# based on Amazon S3 and Blob Storage data.
# https://cloud.google.com/bigquery/docs/omni-introduction#limitations
# Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
with_partitions = self.config.have_table_data_read_permission and not (
dataset.location
and dataset.location.lower().startswith(("aws-", "azure-"))
with_partitions = (
self.config.have_table_data_read_permission
and dataset.supports_table_partitions()
)

# Partitions view throw exception if we try to query partition info for too many tables