Merge branch 'datahub-project:master' into master

anshbansal authored Nov 25, 2024
2 parents c898913 + 56a5dca commit cc809a6

Showing 46 changed files with 1,280 additions and 318 deletions.
@@ -84,8 +84,21 @@ private TimeSeriesChart getActiveUsersTimeSeriesChart(
final DateTime end,
final String title,
final DateInterval interval) {
final DateRange dateRange =
new DateRange(String.valueOf(beginning.getMillis()), String.valueOf(end.getMillis()));

final DateRange dateRange;

// adjust month to show 1st of month rather than last day of previous month
if (interval == DateInterval.MONTH) {
dateRange =
new DateRange(
String.valueOf(beginning.plusDays(1).getMillis()), // Shift start by 1 day
String.valueOf(end.plusDays(1).getMillis()) // Shift end by 1 day
);
} else {
// weeks display starting on Sundays
dateRange =
new DateRange(String.valueOf(beginning.getMillis()), String.valueOf(end.getMillis()));
}

final List<NamedLine> timeSeriesLines =
_analyticsService.getTimeseriesChart(
@@ -96,6 +109,7 @@ private TimeSeriesChart getActiveUsersTimeSeriesChart(
ImmutableMap.of(),
Collections.emptyMap(),
Optional.of("browserId"));

return TimeSeriesChart.builder()
.setTitle(title)
.setDateRange(dateRange)
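A sketch of the boundary arithmetic this branch relies on, assuming (as the DateUtil test below suggests) that chart boundaries are produced as "start of month minus 1 millisecond". This is illustrative Python, not the Joda-Time code above:

```python
from datetime import datetime, timedelta

# Sketch (not DataHub code): a month boundary arrives as "start of month
# minus 1 ms", e.g. Nov 30 23:59:59.999 for a December window, so a naive
# label shows the previous month.
boundary = datetime(2024, 11, 30, 23, 59, 59, 999000)

# Shifting the boundary forward one day relabels it as Dec 1, matching the
# month the data point actually describes.
adjusted = boundary + timedelta(days=1)
assert (adjusted.year, adjusted.month, adjusted.day) == (2024, 12, 1)
```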
@@ -47,4 +47,26 @@ public void testStartOfNextWeek() {
Mockito.when(dateUtil.getNow()).thenReturn(setTimeParts(8, false));
assertEqualStartOfNextWeek(dateUtil, 9);
}

// validates the date adjustments used to display correct dates in the MAU chart
@Test
public void testDateAdjustmentsForMonth() {
DateUtil dateUtil = Mockito.spy(DateUtil.class);

Mockito.when(dateUtil.getNow()).thenReturn(new DateTime(2024, 11, 15, 0, 0, 0));

// start date should be next month minus a day
// but we want to display Dec 1 instead of Nov 30, so add a day and verify it's Dec
DateTime startOfNextMonthMinus12 = dateUtil.getStartOfNextMonth().minusMonths(12);
DateTime adjustedStart = startOfNextMonthMinus12.minusMillis(1).plusDays(1);
assertEquals(12, adjustedStart.getMonthOfYear()); // Verify it is December
assertEquals(2023, adjustedStart.getYear()); // Verify it is 2023

// verify that the end date displays correctly
// the chart will display Oct 1 as the last month because we don't show current month
DateTime startOfThisMonth = dateUtil.getStartOfThisMonth();
DateTime adjustedEnd = startOfThisMonth.minusMillis(1).plusDays(1);
assertEquals(11, adjustedEnd.getMonthOfYear()); // Verify it is November
assertEquals(2024, adjustedEnd.getYear()); // Verify it is 2024
}
}
@@ -284,7 +284,7 @@
"name": "csv-enricher",
"displayName": "CSV",
"description": "Import metadata from a formatted CSV.",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv'",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher",
"recipe": "source: \n type: csv-enricher \n config: \n # URL of your csv file to ingest \n filename: \n array_delimiter: '|' \n delimiter: ',' \n write_semantics: PATCH"
},
{
2 changes: 1 addition & 1 deletion datahub-web-react/src/app/ingest/source/conf/csv/csv.ts
@@ -15,7 +15,7 @@ const csvConfig: SourceConfig = {
type: 'csv-enricher',
placeholderRecipe,
displayName: 'CSV',
docsUrl: 'https://datahubproject.io/docs/generated/ingestion/sources/csv',
docsUrl: 'https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher',
logoUrl: csvLogo,
};

2 changes: 2 additions & 0 deletions docker/profiles/docker-compose.gms.yml
@@ -101,6 +101,7 @@ x-datahub-gms-service: &datahub-gms-service
<<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml}
ALTERNATE_MCP_VALIDATION: ${ALTERNATE_MCP_VALIDATION:-true}
STRICT_URN_VALIDATION_ENABLED: ${STRICT_URN_VALIDATION_ENABLED:-true}
healthcheck:
test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
start_period: 90s
@@ -183,6 +184,7 @@ x-datahub-mce-consumer-service: &datahub-mce-consumer-service
environment: &datahub-mce-consumer-env
<<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
ALTERNATE_MCP_VALIDATION: ${ALTERNATE_MCP_VALIDATION:-true}
STRICT_URN_VALIDATION_ENABLED: ${STRICT_URN_VALIDATION_ENABLED:-true}

x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev
<<: *datahub-mce-consumer-service
13 changes: 7 additions & 6 deletions docs/deploy/environment-vars.md
@@ -9,12 +9,13 @@ DataHub works.

## Feature Flags

| Variable | Default | Unit/Type | Components | Description |
|--------------------------------------------------|---------|-----------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
| `UI_INGESTION_ENABLED` | `true` | boolean | [`GMS`, `MCE Consumer`] | Enable UI based ingestion. |
| `DATAHUB_ANALYTICS_ENABLED` | `true` | boolean | [`Frontend`, `GMS`] | Collect DataHub usage to populate the analytics dashboard. |
| `BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE` | `true` | boolean | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Do not wait for the `system-update` to complete before starting. This should typically only be disabled during development. |
| `ER_MODEL_RELATIONSHIP_FEATURE_ENABLED` | `false` | boolean | [`Frontend`, `GMS`] | Enable ER Model Relation Feature that shows Relationships Tab within a Dataset UI. |
| Variable | Default | Unit/Type | Components | Description |
|--------------------------------------------------|----------|-----------|------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
| `UI_INGESTION_ENABLED` | `true` | boolean | [`GMS`, `MCE Consumer`] | Enable UI based ingestion. |
| `DATAHUB_ANALYTICS_ENABLED` | `true` | boolean | [`Frontend`, `GMS`] | Collect DataHub usage to populate the analytics dashboard. |
| `BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE` | `true` | boolean | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Do not wait for the `system-update` to complete before starting. This should typically only be disabled during development. |
| `ER_MODEL_RELATIONSHIP_FEATURE_ENABLED` | `false` | boolean | [`Frontend`, `GMS`] | Enable ER Model Relation Feature that shows Relationships Tab within a Dataset UI. |
| `STRICT_URN_VALIDATION_ENABLED`                  | `false`  | boolean   | [`GMS`, `MCE Consumer`, `MAE Consumer`]  | Enable stricter URN validation logic.                                                                                        |


## Ingestion
2 changes: 1 addition & 1 deletion docs/how/updating-datahub.md
@@ -38,7 +38,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

### Breaking Changes

- #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
- #11486 - The Criterion `value` parameter was previously deprecated. Use of `value` instead of `values` is no longer supported and will be removed entirely in the next major version (a sketch follows this list).
- #11484 - Metadata service authentication enabled by default
- #11484 - Rest API authorization enabled by default
- #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
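For illustration only — the payload shape and the `platform` field below are placeholders, not a verified DataHub API surface — the migration amounts to replacing a comma-delimited `value` string with an explicit `values` list:

```python
# No longer supported: multiple values packed into a comma-delimited `value`
legacy_criterion = {
    "field": "platform",
    "condition": "EQUAL",
    "value": "urn:li:dataPlatform:snowflake,urn:li:dataPlatform:bigquery",
}

# Supported: an explicit list in `values`
criterion = {
    "field": "platform",
    "condition": "EQUAL",
    "values": [
        "urn:li:dataPlatform:snowflake",
        "urn:li:dataPlatform:bigquery",
    ],
}
```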
2 changes: 1 addition & 1 deletion docs/managed-datahub/release-notes/v_0_3_7.md
@@ -32,7 +32,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
datahub:
timezone: 'America/Los_Angeles'
```
- #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
- #11486 - The Criterion `value` parameter was previously deprecated. Use of `value` instead of `values` is no longer supported and will be removed entirely in the next major version.
- #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
- #11619 - schema field/column paths can no longer be empty strings
- #11619 - schema field/column paths can no longer be duplicated within the schema
3 changes: 2 additions & 1 deletion docs/what/urn.md
@@ -38,7 +38,8 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)
There are a few restrictions when creating an urn:

1. Commas are reserved characters in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `( , )`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character ``

Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
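A minimal sketch of the URL-encoding approach (the dataset name below is invented):

```python
from urllib.parse import quote

# Percent-encode the reserved characters (',', '(', ')', ':') before
# embedding a value in an urn field; the raw name here is an arbitrary example.
raw_name = "events(2024):daily,rollup"
encoded = quote(raw_name, safe="")

urn = f"urn:li:dataset:(urn:li:dataPlatform:hdfs,{encoded},PROD)"
print(urn)
# urn:li:dataset:(urn:li:dataPlatform:hdfs,events%282024%29%3Adaily%2Crollup,PROD)
```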
@@ -9,6 +9,7 @@
import com.linkedin.util.Pair;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -49,7 +50,8 @@ default List<MCPItem> getMCPItems() {
* various hooks
*/
Pair<Map<String, Set<String>>, List<ChangeMCP>> toUpsertBatchItems(
Map<String, Map<String, SystemAspect>> latestAspects);
Map<String, Map<String, SystemAspect>> latestAspects,
Map<String, Map<String, Long>> nextVersions);

/**
* Apply read mutations to batch
@@ -227,4 +229,39 @@ static String toAbbreviatedString(Collection<? extends BatchItem> items, int max
+ StringUtils.abbreviate(itemsAbbreviated.toString(), maxWidth)
+ '}';
}

/**
* Increment an aspect's version within a batch, tracking both the next aspect version and the
* most recent aspect value so that later items in the same batch see earlier writes
*
* @param changeMCP changeMCP to be incremented
* @param latestAspects latest aspects within the batch
* @param nextVersions next version for the aspects in the batch
* @return the incremented changeMCP
*/
static ChangeMCP incrementBatchVersion(
ChangeMCP changeMCP,
Map<String, Map<String, SystemAspect>> latestAspects,
Map<String, Map<String, Long>> nextVersions) {
long nextVersion =
nextVersions
.getOrDefault(changeMCP.getUrn().toString(), Collections.emptyMap())
.getOrDefault(changeMCP.getAspectName(), 0L);

changeMCP.setPreviousSystemAspect(
latestAspects
.getOrDefault(changeMCP.getUrn().toString(), Collections.emptyMap())
.getOrDefault(changeMCP.getAspectName(), null));

changeMCP.setNextAspectVersion(nextVersion);

// support inner-batch upserts
latestAspects
.computeIfAbsent(changeMCP.getUrn().toString(), key -> new HashMap<>())
.put(changeMCP.getAspectName(), changeMCP.getSystemAspect(nextVersion));
nextVersions
.computeIfAbsent(changeMCP.getUrn().toString(), key -> new HashMap<>())
.put(changeMCP.getAspectName(), nextVersion + 1);

return changeMCP;
}
}
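The bookkeeping above can be summarized with a short sketch (illustrative Python with invented names, not the Java implementation): each upsert reads the next version for its (urn, aspect) pair, captures the previous value, then advances both maps so later items in the same batch observe the earlier write.

```python
# Sketch of the inner-batch version bookkeeping:
latest = {}         # (urn, aspect) -> most recent value within the batch
next_versions = {}  # (urn, aspect) -> version to assign next

def increment_batch_version(urn, aspect, value):
    key = (urn, aspect)
    version = next_versions.get(key, 0)  # first version defaults to 0
    previous = latest.get(key)           # prior value, if any, in this batch
    latest[key] = value                  # later items see this write
    next_versions[key] = version + 1     # and will receive the next version
    return version, previous

# Two upserts of the same aspect in one batch:
print(increment_batch_version("urn:li:x", "status", "a"))  # (0, None)
print(increment_batch_version("urn:li:x", "status", "b"))  # (1, 'a')
```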
@@ -10,6 +10,7 @@ public class Constants {
public static final String INTERNAL_DELEGATED_FOR_ACTOR_HEADER_NAME = "X-DataHub-Delegated-For";
public static final String INTERNAL_DELEGATED_FOR_ACTOR_TYPE = "X-DataHub-Delegated-For-";

public static final String URN_LI_PREFIX = "urn:li:";
public static final String DATAHUB_ACTOR = "urn:li:corpuser:datahub"; // Super user.
public static final String SYSTEM_ACTOR =
"urn:li:corpuser:__datahub_system"; // DataHub internal service principal.
27 changes: 0 additions & 27 deletions metadata-ingestion/examples/mce_files/bootstrap_mce.json
@@ -3613,33 +3613,6 @@
},
"systemMetadata": null
},
{
"entityType": "post",
"entityUrn": "urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8de",
"changeType": "UPSERT",
"aspectName": "postInfo",
"aspect": {
"json": {
"type": "HOME_PAGE_ANNOUNCEMENT",
"content": {
"title": "Join Metadata & AI Summit 2024",
"type": "LINK",
"link": "http://www.acryldata.io/conference?utm_source=datahub_quickstart&utm_medium=metadata_ai_2024&utm_campaign=pinned_announcement",
"media": {
"type": "IMAGE",
"location": "https://formulatedby.com/wp-content/uploads/2024/07/0193320a6d93e7508d1598f7b24662f75a87e92f-352x456-1.svg"
}
},
"created": 1712547125049,
"lastModified": 1712547125049
}
},
"systemMetadata": {
"lastObserved": 1712548844816,
"runId": "datahub-2024_04_08-13_00_44",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "post",
"entityUrn": "urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8dd",
@@ -30,6 +30,9 @@ def _resolve_oauth_callback(self) -> None:

call_back = self.get_call_back_attribute()

assert call_back # to silent lint
assert isinstance(call_back, str), (
"oauth_cb must be a string representing python function reference "
"in the format <python-module>:<function-name>."
)
# Set the callback
self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
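For context, a sketch of the `<python-module>:<function-name>` reference format that this assertion enforces (module and function names are invented, not real DataHub symbols):

```python
# Hypothetical consumer configuration; "my_company.auth" and
# "fetch_oauth_token" are placeholders.
consumer_config = {
    "oauth_cb": "my_company.auth:fetch_oauth_token",
}

# import_path(...) above resolves such a string to a callable; the split
# below shows the shape the assertion guards against getting wrong.
module_name, func_name = consumer_config["oauth_cb"].split(":", 1)
assert module_name and func_name
```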
@@ -8,7 +8,7 @@

from datahub.ingestion.api.report import Report
from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
@@ -77,7 +77,7 @@ class BigQueryQueriesExtractorReport(Report):

@dataclass
class BigQueryV2Report(
ProfilingSqlReport,
SQLSourceReport,
IngestionStageReport,
BaseTimeWindowReport,
ClassificationReportMixin,
@@ -152,6 +152,21 @@ class BigqueryDataset:
snapshots: List[BigqueryTableSnapshot] = field(default_factory=list)
columns: List[BigqueryColumn] = field(default_factory=list)

# Some INFORMATION_SCHEMA views are not available for BigLake tables
# based on Amazon S3 and Blob Storage data.
# https://cloud.google.com/bigquery/docs/omni-introduction#limitations
# Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
def is_biglake_dataset(self) -> bool:
return self.location is not None and self.location.lower().startswith(
("aws-", "azure-")
)

def supports_table_constraints(self) -> bool:
return not self.is_biglake_dataset()

def supports_table_partitions(self) -> bool:
return not self.is_biglake_dataset()
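
A brief usage sketch of the new helpers; the import path and constructor arguments are assumptions inferred from the fields visible here, not verified against the full class:

```python
# Assumed module path for the dataclass defined above.
from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigqueryDataset

# A location prefixed with "aws-" or "azure-" marks a BigLake / Omni dataset,
# for which constraint and partition views are unavailable.
aws_dataset = BigqueryDataset(name="sales", location="aws-us-east-1")
assert aws_dataset.is_biglake_dataset()
assert not aws_dataset.supports_table_partitions()

us_dataset = BigqueryDataset(name="sales", location="US")
assert us_dataset.supports_table_constraints()
```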


@dataclass
class BigqueryProject:
@@ -541,18 +556,26 @@ def get_table_constraints_for_dataset(
table_name=constraint.table_name,
type=constraint.constraint_type,
field_path=constraint.column_name,
referenced_project_id=constraint.referenced_catalog
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_dataset=constraint.referenced_schema
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_table_name=constraint.referenced_table
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_column_name=constraint.referenced_column
if constraint.constraint_type == "FOREIGN KEY"
else None,
referenced_project_id=(
constraint.referenced_catalog
if constraint.constraint_type == "FOREIGN KEY"
else None
),
referenced_dataset=(
constraint.referenced_schema
if constraint.constraint_type == "FOREIGN KEY"
else None
),
referenced_table_name=(
constraint.referenced_table
if constraint.constraint_type == "FOREIGN KEY"
else None
),
referenced_column_name=(
constraint.referenced_column
if constraint.constraint_type == "FOREIGN KEY"
else None
),
)
)
self.report.num_get_table_constraints_for_dataset_api_requests += 1
@@ -498,7 +498,10 @@ def _process_schema(
report=self.report,
rate_limiter=rate_limiter,
)
if self.config.include_table_constraints:
if (
self.config.include_table_constraints
and bigquery_dataset.supports_table_constraints()
):
constraints = self.schema_api.get_table_constraints_for_dataset(
project_id=project_id, dataset_name=dataset_name, report=self.report
)
@@ -1157,9 +1160,11 @@ def gen_schema_metadata(
# fields=[],
fields=self.gen_schema_fields(
columns,
table.constraints
if (isinstance(table, BigqueryTable) and table.constraints)
else [],
(
table.constraints
if (isinstance(table, BigqueryTable) and table.constraints)
else []
),
),
foreignKeys=foreign_keys if foreign_keys else None,
)
@@ -1180,13 +1185,9 @@ def get_tables_for_dataset(
) -> Iterable[BigqueryTable]:
# In bigquery there is no way to query all tables in a Project id
with PerfTimer() as timer:
# PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables
# based on Amazon S3 and Blob Storage data.
# https://cloud.google.com/bigquery/docs/omni-introduction#limitations
# Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
with_partitions = self.config.have_table_data_read_permission and not (
dataset.location
and dataset.location.lower().startswith(("aws-", "azure-"))
with_partitions = (
self.config.have_table_data_read_permission
and dataset.supports_table_partitions()
)

# Partitions view throw exception if we try to query partition info for too many tables