
Commit 322d8a0

Merge branch 'master' into master

githendrik authored Nov 14, 2023
2 parents 1e2fc3a + f1b6aa7

Showing 46 changed files with 2,723 additions and 717 deletions.
5 changes: 3 additions & 2 deletions .dockerignore
@@ -1,6 +1,7 @@
 **/node_modules/
-datahub-frontend/build/
-metadata-ingestion/venv/
+*/build/
+*/*/build/
+*/venv/
 out
 **/*.class
 # Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars
28 changes: 20 additions & 8 deletions .github/scripts/check_policies.py
@@ -20,7 +20,7 @@
     elif urn == "urn:li:dataHubPolicy:editor-platform-policy":
         editor_platform_policy_privileges = policy["info"]["privileges"]
     elif urn == "urn:li:dataHubPolicy:7":
-        all_user_platform_policy_privilges = policy["info"]["privileges"]
+        all_user_platform_policy_privileges = policy["info"]["privileges"]
     try:
         doc_type = policy["info"]["type"]
         privileges = policy["info"]["privileges"]
@@ -54,10 +54,22 @@
 )
 assert len(diff_policies) == 0, f"Missing privileges for root user are {diff_policies}"

-diff_policies = set(editor_platform_policy_privileges).difference(
-    set(all_user_platform_policy_privilges)
-)
-assert "MANAGE_POLICIES" not in all_user_platform_policy_privilges
-assert (
-    len(diff_policies) == 0
-), f"Missing privileges for all user policies are {diff_policies}"
+# All users privileges checks
+assert "MANAGE_POLICIES" not in all_user_platform_policy_privileges
+assert "MANAGE_USERS_AND_GROUPS" not in all_user_platform_policy_privileges
+assert "MANAGE_SECRETS" not in all_user_platform_policy_privileges
+assert "MANAGE_USER_CREDENTIALS" not in all_user_platform_policy_privileges
+assert "MANAGE_ACCESS_TOKENS" not in all_user_platform_policy_privileges
+assert "EDIT_ENTITY" not in all_user_platform_policy_privileges
+assert "DELETE_ENTITY" not in all_user_platform_policy_privileges
+
+# Editor checks
+assert "MANAGE_POLICIES" not in editor_platform_policy_privileges
+assert "MANAGE_USERS_AND_GROUPS" not in editor_platform_policy_privileges
+assert "MANAGE_SECRETS" not in editor_platform_policy_privileges
+assert "MANAGE_USER_CREDENTIALS" not in editor_platform_policy_privileges
+assert "MANAGE_ACCESS_TOKENS" not in editor_platform_policy_privileges
+# These don't prevent a user from modifying entities they are an asset owner of, i.e. their own profile info
+assert "EDIT_CONTACT_INFO" not in editor_platform_policy_privileges
+assert "EDIT_USER_PROFILE" not in editor_platform_policy_privileges
+assert "EDIT_ENTITY_OWNERS" not in editor_platform_policy_privileges
1 change: 1 addition & 0 deletions README.md
@@ -142,6 +142,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to
 - [SpotHero](https://spothero.com)
 - [Stash](https://www.stash.com)
 - [Shanghai HuaRui Bank](https://www.shrbank.com)
+- [s7 Airlines](https://www.s7.ru/)
 - [ThoughtWorks](https://www.thoughtworks.com)
 - [TypeForm](http://typeform.com)
 - [Udemy](https://www.udemy.com/)
3 changes: 2 additions & 1 deletion datahub-frontend/build.gradle
@@ -77,10 +77,11 @@ docker {
     version "v${version}"
     dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include 'docker/monitoring/*'
         include "docker/${docker_dir}/*"
     }.exclude {
-        i -> i.file.isHidden() || i.file == buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }
     tag("Debug", "${docker_registry}/${docker_repo}:debug")

3 changes: 2 additions & 1 deletion datahub-upgrade/build.gradle
@@ -88,10 +88,11 @@ docker {
     dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
     files bootJar.outputs.files
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include "docker/${docker_repo}/*"
         include 'metadata-models/src/main/resources/*'
     }.exclude {
-        i -> i.file.isHidden() || i.file == buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }
     tag("Debug", "${docker_registry}/${docker_repo}:debug")

8 changes: 5 additions & 3 deletions docker/datahub-ingestion-base/build.gradle
@@ -10,18 +10,20 @@ ext {
     docker_repo = 'datahub-ingestion-base'
     docker_dir = 'datahub-ingestion-base'
     docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
+    docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"

     revision = 2 // increment to trigger rebuild
 }

 docker {
-    name "${docker_registry}/${docker_repo}:v${version}-${docker_target}"
-    version "v${version}-${docker_target}"
+    name "${docker_registry}/${docker_repo}:v${docker_version}"
+    version "v${docker_version}"
     dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include "docker/${docker_dir}/*"
     }.exclude {
-        i -> i.file.isHidden() || i.file == buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }

     def dockerBuildArgs = [APP_ENV: docker_target]
4 changes: 2 additions & 2 deletions docker/datahub-ingestion/Dockerfile
@@ -41,8 +41,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh .
 RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
 RUN pip install --no-cache --user ".[base]" && \
     pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
-    pip install --no-cache --user ".[all]" && \
-    ./pyspark_jars.sh
+    pip install --no-cache --user ".[all]"
+RUN ./pyspark_jars.sh

 FROM base as full-install

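
Splitting `./pyspark_jars.sh` into its own `RUN` step gives the jar patching a separate image layer, so editing the script no longer invalidates the cached pip-install layer. A sketch of building this stage locally (the tag is illustrative; `PIP_MIRROR_URL` appears to default to `null`, which the Dockerfile treats as "no mirror"):

```
# Build from the repository root so the COPY paths resolve
docker build \
  -f docker/datahub-ingestion/Dockerfile \
  --build-arg PIP_MIRROR_URL=null \
  -t datahub-ingestion:local .
```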
7 changes: 7 additions & 0 deletions docker/datahub-ingestion/README.md
@@ -2,3 +2,10 @@
 [![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml)

 Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service.
+
+## Slim vs Full Image Build
+
+There are two versions of this image. The full version includes pyspark and Oracle dependencies and is larger due to the Java dependencies.
+
+Running the standard build generates the `slim` image, without pyspark, by default. To build the full
+image with pyspark, use the project property `-PdockerTarget=full`.
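
For example, assuming the module's standard Gradle docker task (the task path is an assumption, not shown in this diff):

```
# Default build: slim image, no pyspark
./gradlew :docker:datahub-ingestion:docker

# Full image with pyspark and Oracle dependencies
./gradlew :docker:datahub-ingestion:docker -PdockerTarget=full
```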
20 changes: 10 additions & 10 deletions docker/datahub-ingestion/build.gradle
@@ -9,6 +9,8 @@ ext {
     docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry
     docker_repo = 'datahub-ingestion'
     docker_dir = 'datahub-ingestion'
+    docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
+    docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"

     revision = 2 // increment to trigger rebuild
 }
@@ -19,22 +21,20 @@ dependencies {
 }

 docker {
-    name "${docker_registry}/${docker_repo}:v${version}-slim"
-    version "v${version}-slim"
-    dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only")
+    name "${docker_registry}/${docker_repo}:v${docker_version}"
+    version "v${docker_version}"
+    dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}")
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include "docker/${docker_dir}/*"
         include "metadata-ingestion/**"
         include "metadata-ingestion-modules/**"
     }.exclude {
-        i -> i.file.isHidden() ||
-            i.file == buildDir ||
-            i.file == project(':metadata-ingestion').buildDir ||
-            i.file == project(':metadata-ingestion-modules').buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }

-    def dockerBuildArgs = [DOCKER_VERSION: version,
-                           RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')]
+    def dockerBuildArgs = [DOCKER_VERSION: version,
+                           RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')]

     // Add build args if they are defined (needed for some CI or enterprise environments)
     if (project.hasProperty('pipMirrorUrl')) {
40 changes: 26 additions & 14 deletions docker/datahub-ingestion/pyspark_jars.sh
@@ -2,21 +2,33 @@

 set -ex

-HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}"
-ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
 PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"

-# Remove conflicting versions
-echo "Removing version conflicts from $PYSPARK_JARS"
-CONFLICTS="zookeeper hadoop- slf4j-"
-for jar in $CONFLICTS; do
-    rm "$PYSPARK_JARS/$jar"*.jar
-done
+function replace_jar {
+    JAR_PREFIX=$1
+    TRANSITIVE=$2
+    DEPENDENCY=$3

-# Fetch dependencies
-mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY"
-mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY"
+    echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
+    ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
+    rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
+    rm -r "$HOME/.m2" || true

-# Move to pyspark location
-echo "Moving jars to $PYSPARK_JARS"
-find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \;
+    if [ ! -z "$DEPENDENCY" ]; then
+        echo "Resolving $DEPENDENCY"
+        mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null
+
+        echo "Moving jars to $PYSPARK_JARS"
+        find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
+        find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
+    fi
+}
+
+replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
+replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
+replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
+replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
+replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
+replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
+replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
+replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"
5 changes: 3 additions & 2 deletions docker/elasticsearch-setup/build.gradle
@@ -15,10 +15,11 @@ docker {
     version "v${version}"
     dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include "docker/${docker_dir}/*"
-        include "metadata-service/restli-servlet-impl/src/main/resources/index/**"
+        include 'metadata-service/restli-servlet-impl/src/main/resources/index/**'
     }.exclude {
-        i -> i.file.isHidden() || i.file == buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }
     tag("Debug", "${docker_registry}/${docker_repo}:debug")

3 changes: 2 additions & 1 deletion docker/kafka-setup/build.gradle
@@ -15,9 +15,10 @@ docker {
     version "v${version}"
     dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include "docker/${docker_dir}/*"
     }.exclude {
-        i -> i.file.isHidden() || i.file == buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }
     tag("Debug", "${docker_registry}/${docker_repo}:debug")

3 changes: 2 additions & 1 deletion docker/mysql-setup/build.gradle
@@ -16,9 +16,10 @@ docker {
     version "v${version}"
     dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include "docker/${docker_dir}/*"
     }.exclude {
-        i -> i.file.isHidden() || i.file == buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }
     tag("Debug", "${docker_registry}/${docker_repo}:debug")

3 changes: 2 additions & 1 deletion docker/postgres-setup/build.gradle
@@ -16,9 +16,10 @@ docker {
     version "v${version}"
     dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
     files fileTree(rootProject.projectDir) {
+        include '.dockerignore'
         include "docker/${docker_dir}/*"
     }.exclude {
-        i -> i.file.isHidden() || i.file == buildDir
+        i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
     }
     tag("Debug", "${docker_registry}/${docker_repo}:debug")

3 changes: 2 additions & 1 deletion docs/actions/README.md
@@ -162,7 +162,8 @@ datahub actions -c <config-1.yaml> -c <config-2.yaml>

 ### Running in debug mode

-Simply append the `--debug` flag to the CLI to run your action in debug mode.
+Simply append the `--debug` flag to the CLI to run your action in debug mode. NOTE: This will reveal sensitive information in the logs; do not share the logs with outside parties, and ensure untrusted
+users will not have access to logs through UI ingestion before enabling this on an instance.

 ```
 datahub actions -c <config.yaml> --debug
2 changes: 1 addition & 1 deletion docs/cli.md
@@ -134,7 +134,7 @@ The environment variables listed below take precedence over the DataHub CLI conf
 - `DATAHUB_GMS_TOKEN` (default `None`) - Used for communicating with DataHub Cloud.
 - `DATAHUB_TELEMETRY_ENABLED` (default `true`) - Set to `false` to disable telemetry. If CLI is being run in an environment with no access to public internet then this should be disabled.
 - `DATAHUB_TELEMETRY_TIMEOUT` (default `10`) - Set to a custom integer value to specify timeout in secs when sending telemetry.
-- `DATAHUB_DEBUG` (default `false`) - Set to `true` to enable debug logging for CLI. Can also be achieved through `--debug` option of the CLI.
+- `DATAHUB_DEBUG` (default `false`) - Set to `true` to enable debug logging for CLI. Can also be achieved through the `--debug` option of the CLI. This exposes sensitive information in logs; avoid enabling it on production instances, especially if UI ingestion is in use, as logs can be made available for runs through the UI.
 - `DATAHUB_VERSION` (default `head`) - Set to a specific version to run quickstart with the particular version of docker images.
 - `ACTIONS_VERSION` (default `head`) - Set to a specific version to run quickstart with that image tag of `datahub-actions` container.
 - `DATAHUB_ACTIONS_IMAGE` (default `acryldata/datahub-actions`) - Set to `-slim` to run a slimmer actions container without pyspark/deequ features.
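
Taken together, these variables let a single run stay both quiet and safe; a sketch (the recipe path is a placeholder):

```
# One-off ingestion with telemetry off; leave DATAHUB_DEBUG unset on production
DATAHUB_TELEMETRY_ENABLED=false datahub ingest -c ./recipe.yaml
```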
22 changes: 20 additions & 2 deletions docs/deploy/telemetry.md
@@ -4,7 +4,25 @@

 To effectively build and maintain the DataHub Project, we must understand how end-users work within DataHub. Beginning in version 0.8.35, DataHub collects anonymous usage statistics and errors to inform our roadmap priorities and to enable us to proactively address errors.

-Deployments are assigned a UUID which is sent along with event details, Java version, OS, and timestamp; telemetry collection is enabled by default and can be disabled by setting `DATAHUB_TELEMETRY_ENABLED=false` in your Docker Compose config.
+Both the DataHub backend and the ingestion framework collect telemetry.

+## DataHub Backend Telemetry
+
-The source code is available [here.](../../metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/TelemetryUtils.java)
+Deployments are assigned a UUID which is sent along with event details, Java version, OS, and timestamp.
+The source code is available [here](../../metadata-service/factories/src/main/java/com/linkedin/gms/factory/telemetry/TelemetryUtils.java).

+## Ingestion Framework Telemetry
+
+The ingestion framework collects telemetry including CLI invocations, source/sink types, error types, versions, and timestamps. If you run with `datahub --debug`, all telemetry calls will be logged.
+
+On first invocation, the CLI will generate a randomized UUID, which will be sent alongside every telemetry event. This config is stored in `~/.datahub/telemetry-config.json`.
+
+The source code is available [here](../../metadata-ingestion/src/datahub/telemetry/telemetry.py).
+
+## Disabling Telemetry
+
+Telemetry is enabled by default. While we are careful to anonymize all telemetry data and encourage users to keep it enabled so that we can improve DataHub, we understand that some users may wish to disable it.
+
+You can disable backend telemetry by setting the `DATAHUB_TELEMETRY_ENABLED` environment variable to `false`. You'll need to set this on both the datahub-gms and datahub-actions containers.
+
+If you're using the DataHub CLI, ingestion framework telemetry will be disabled when the `DATAHUB_TELEMETRY_ENABLED` environment variable is set to `false`. To persist this change for your machine, run `datahub telemetry disable`.
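
As a concrete sketch of the opt-out described above (the Compose snippet assumes a typical deployment; service names may differ):

```
# Backend: set on both the datahub-gms and datahub-actions containers, e.g.
#   environment:
#     - DATAHUB_TELEMETRY_ENABLED=false

# CLI / ingestion framework, for the current shell:
export DATAHUB_TELEMETRY_ENABLED=false
# ...or persist the opt-out for this machine:
datahub telemetry disable
```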
2 changes: 1 addition & 1 deletion metadata-events/mxe-schemas/rename-namespace.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash

 SCRIPT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]:-$0}" )" >/dev/null && pwd )"

2 changes: 1 addition & 1 deletion metadata-ingestion-modules/airflow-plugin/setup.py
@@ -14,7 +14,7 @@ def get_long_description():
     return pathlib.Path(os.path.join(root, "README.md")).read_text()


-_version = package_metadata["__version__"]
+_version: str = package_metadata["__version__"]
 _self_pin = f"=={_version}" if not _version.endswith("dev0") else ""


11 changes: 3 additions & 8 deletions metadata-ingestion/scripts/docgen.py
@@ -7,11 +7,10 @@
 import sys
 import textwrap
 from importlib.metadata import metadata, requires
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional

 import click
 from pydantic import BaseModel, Field
-from pydantic.dataclasses import dataclass

 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.decorators import (
@@ -94,7 +93,6 @@ class Component(BaseModel):

     @staticmethod
     def map_field_path_to_components(field_path: str) -> List[Component]:
-
         m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path)
         v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path)
         components: List[FieldRow.Component] = []
@@ -197,7 +195,7 @@ def get_checkbox(self) -> str:
             # Using a non-breaking space to prevent the checkbox from being
             # broken into a new line.
             if not self.parent:  # None and empty string both count
-                return f'&nbsp;<abbr title="Required">✅</abbr>'
+                return '&nbsp;<abbr title="Required">✅</abbr>'
             else:
                 return f'&nbsp;<abbr title="Required if {self.parent} is set">❓</abbr>'
         else:
@@ -356,7 +354,6 @@ def priority_value(path: str) -> str:


 def gen_md_table_from_struct(schema_dict: Dict[str, Any]) -> List[str]:
-
     from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator

     # we don't want default field values to be injected into the description of the field
@@ -460,7 +457,6 @@ def get_additional_deps_for_extra(extra_name: str) -> List[str]:


 def relocate_path(orig_path: str, relative_path: str, relocated_path: str) -> str:
-
     newPath = os.path.join(os.path.dirname(orig_path), relative_path)
     assert os.path.exists(newPath)

@@ -515,7 +511,6 @@ def generate(

     if extra_docs:
         for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True):
-
             m = re.search("/docs/sources/(.*)/(.*).md", path)
             if m:
                 platform_name = m.group(1).lower()
@@ -741,7 +736,7 @@ def generate(
             i += 1
             f.write(f"---\nsidebar_position: {i}\n---\n\n")
             f.write(
-                f"import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n"
+                "import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n"
             )
             f.write(f"# {platform_docs['name']}\n")
