diff --git a/README.md b/README.md index 71b709cd..6e929b4b 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,10 @@ Refer to specific use-case: - [Standard setup guide](./docs/setup.md) -- [Terraform](./terraform/README.md) +- Terraform + - [AWS](./terraform/aws/TERRAFORM_AWS.md) + - [Azure](./terraform/azure/TERRAFORM_Azure.md) + - [GCP](./terraform/gcp/TERRAFORM_GCP.md) - [Deprecated: Manual setup](./docs/deprecated_old_setup.md) ## Introduction diff --git a/configs/security_best_practices.csv b/configs/security_best_practices.csv index 94ef3a32..604c3a92 100644 --- a/configs/security_best_practices.csv +++ b/configs/security_best_practices.csv @@ -6,13 +6,13 @@ id,check_id,category,check,evaluation_value,severity,recommendation,aws,azure,gc 5,DP-5,Data Protection,Downloading results is disabled,-1,Medium,Disable download button for notebook results,1,1,1,1,0,Check workspace-conf for enableResultsDownloading setting,curl -n -X GET 'https:///api/2.0/preview/workspace-conf?keys=enableResultsDownloading',https://docs.databricks.com/administration-guide/workspace/notebooks.html#manage-download-results,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/workspace/notebooks#manage-download-results,https://docs.gcp.databricks.com/administration-guide/workspace/notebooks.html#manage-download-results 6,GOV-1,Governance,Cluster policies consistently applied,-1,High,Configure cluster policies to enforce data access patterns and control costs,1,1,1,1,0,Check if policy_id is set for clusters,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/administration-guide/clusters/policies.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/clusters/policies,https://docs.gcp.databricks.com/administration-guide/clusters/policies.html 7,GOV-2,Governance,PAT tokens are about to expire,7,High,Set an expiration day (maximum lifetime) for tokens. 
Also regularly review PAT tokens to avoid expired tokens.,1,1,1,1,0,Check each token expiry_time and report if the expiry_time is within configured days,curl --netrc -X GET \ https:///api/2.0/token/list | jq .,https://docs.databricks.com/administration-guide/access-control/tokens.html#manage-personal-access-tokens,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/access-control/tokens#manage-personal-access-tokens,https://docs.gcp.databricks.com/administration-guide/access-control/tokens.html#manage-personal-access-tokens -8,GOV-3,Governance,Log delivery configurations,-1,High,Configure Databricks audit log delivery,1,1,0,1,0,"Check account log-delivery configuration and look for audit log config with log_type set as ""AUDIT_LOGS"" and status set as ""ENABLED""",curl -netrc -X GET \ 'https://accounts.cloud.databricks.com/api/2.0/accounts//log-delivery',https://docs.databricks.com/administration-guide/account-settings/audit-logs.html#configure-audit-log-delivery,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/account-settings/audit-logs#configure-audit-log-delivery,https://docs.gcp.databricks.com/administration-guide/account-settings/audit-logs.html#configure-audit-log-delivery +8,GOV-3,Governance,Log delivery configurations,-1,High,Configure Databricks audit log delivery (or see GOV-34),1,1,0,1,0,"Check account log-delivery configuration and look for audit log config with log_type set as ""AUDIT_LOGS"" and status set as ""ENABLED""",curl -netrc -X GET \ 'https://accounts.cloud.databricks.com/api/2.0/accounts//log-delivery',https://docs.databricks.com/administration-guide/account-settings/audit-logs.html#configure-audit-log-delivery,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/account-settings/audit-logs#configure-audit-log-delivery,https://docs.gcp.databricks.com/administration-guide/account-settings/audit-logs.html#configure-audit-log-delivery 9,GOV-4,Governance,Long-running clusters,24,Medium,Restart clusters on a regular schedule to use the latest available base image and container image.,1,1,1,1,0,Check each running cluster's last restart time till now and report on clusters that were running longer than the configured number of days without a restart,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/clusters/clusters-manage.html#restart-a-cluster-to-update-it-with-the-latest-images,https://learn.microsoft.com/en-us/azure/databricks/clusters/clusters-manage#--restart-a-cluster-to-update-it-with-the-latest-images,https://docs.gcp.databricks.com/clusters/clusters-manage.html#restart-a-cluster-to-update-it-with-the-latest-images 10,GOV-5,Governance,Deprecated versions of Databricks runtimes,-1,High,Deprecated runtime version detected. 
Please update your cluster runtimes to Databricks supported runtimes,1,1,1,1,0,List clusters with spark version that is not in the supported spark versions,curl --netrc -X GET \ https:///api/2.0/clusters/spark-versions,https://docs.databricks.com/release-notes/runtime/releases.html,https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases,https://docs.gcp.databricks.com/release-notes/runtime/releases.html -11,GOV-6,Governance,All-purpose cluster custom tags,-1,Low,Configure cluster tagging to monitor usage and enable chargebacks,1,1,1,1,0,check if custom_tags is set for clusters,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/administration-guide/account-settings/usage-detail-tags-aws.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/account-settings/usage-detail-tags-azure,https://docs.gcp.databricks.com/administration-guide/account-settings-gcp/usage-detail-tags-gcp.html -12,GOV-7,Governance,Job cluster custom tags,-1,Low,Configure job tagging to monitor usage and enable chargebacks,1,1,1,1,0,Check if settings.new_cluster.custom_tags is not null for job clusters,curl --netrc -X GET \ https:///api/2.0/jobs/list \ | jq,https://docs.databricks.com/administration-guide/account-settings/usage-detail-tags-aws.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/account-settings/usage-detail-tags-azure,https://docs.gcp.databricks.com/administration-guide/account-settings-gcp/usage-detail-tags-gcp.html -13,GOV-8,Governance,All-purpose cluster log configuration,-1,Low,Configure Databricks cluster log delivery,1,1,1,1,0,Check if cluster_log_conf is set for clusters,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/clusters/configure.html#cluster-log-delivery-1,https://learn.microsoft.com/en-us/azure/databricks/clusters/configure#cluster-log-delivery,https://docs.gcp.databricks.com/clusters/configure.html#cluster-log-delivery-1 -14,GOV-9,Governance,Job cluster log configuration,-1,Low,Configure Databricks job custer log delivery,1,1,1,1,0,Check if cluster_log_conf is set for job clusters,curl --netrc -X GET \ https:///api/2.0/jobs/list \ | jq,https://docs.databricks.com/clusters/configure.html#cluster-log-delivery-1,https://learn.microsoft.com/en-us/azure/databricks/clusters/configure#cluster-log-delivery,https://docs.gcp.databricks.com/clusters/configure.html#cluster-log-delivery-1 +11,GOV-6,Governance,All-purpose cluster custom tags,-1,Low,Configure cluster tagging to monitor usage and enable chargebacks,1,1,1,0,0,check if custom_tags is set for clusters,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/administration-guide/account-settings/usage-detail-tags-aws.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/account-settings/usage-detail-tags-azure,https://docs.gcp.databricks.com/administration-guide/account-settings-gcp/usage-detail-tags-gcp.html +12,GOV-7,Governance,Job cluster custom tags,-1,Low,Configure job tagging to monitor usage and enable chargebacks,1,1,1,0,0,Check if settings.new_cluster.custom_tags is not null for job clusters,curl --netrc -X GET \ https:///api/2.0/jobs/list \ | 
jq,https://docs.databricks.com/administration-guide/account-settings/usage-detail-tags-aws.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/account-settings/usage-detail-tags-azure,https://docs.gcp.databricks.com/administration-guide/account-settings-gcp/usage-detail-tags-gcp.html +13,GOV-8,Governance,All-purpose cluster log configuration,-1,Low,Configure Databricks cluster log delivery,1,1,1,0,0,Check if cluster_log_conf is set for clusters,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/clusters/configure.html#cluster-log-delivery-1,https://learn.microsoft.com/en-us/azure/databricks/clusters/configure#cluster-log-delivery,https://docs.gcp.databricks.com/clusters/configure.html#cluster-log-delivery-1 +14,GOV-9,Governance,Job cluster log configuration,-1,Low,Configure Databricks job custer log delivery,1,1,1,0,0,Check if cluster_log_conf is set for job clusters,curl --netrc -X GET \ https:///api/2.0/jobs/list \ | jq,https://docs.databricks.com/clusters/configure.html#cluster-log-delivery-1,https://learn.microsoft.com/en-us/azure/databricks/clusters/configure#cluster-log-delivery,https://docs.gcp.databricks.com/clusters/configure.html#cluster-log-delivery-1 15,GOV-10,Governance,Managed tables in DBFS root,1,Low,The DBFS root is not intended for production customer data,1,1,1,1,0,Check the /user/hive/warehouse/ folder for any data folders stored more than the configured value,"curl --netrc -X GET \ https:///api/2.0/dbfs/list \ --data '{ ""path"": ""/user/hive/warehouse/"" }'",https://docs.databricks.com/data/databricks-file-system.html#configuration-and-usage-recommendations,https://learn.microsoft.com/en-us/azure/databricks/data/databricks-file-system#configuration-and-usage-recommendations,https://docs.gcp.databricks.com/dbfs/index.html#configuration-and-usage-recommendations 16,GOV-11,Governance,DBFS mounts,1,Low,Avoid DBFS mounts for accessing production data,1,1,1,1,0,Check for mnt paths in dbutils.fs.mounts() and report if there are datasources loaded as FUSE mounts to the workspace than the configured value,dbutils.fs.mounts(),https://docs.databricks.com/data/databricks-file-system.html#configuration-and-usage-recommendations,https://learn.microsoft.com/en-us/azure/databricks/data/databricks-file-system#configuration-and-usage-recommendations,https://docs.gcp.databricks.com/dbfs/index.html#configuration-and-usage-recommendations 17,GOV-12,Governance,Unity Catalog enabled clusters,-1,High,Use Unity Catalog enabled clusters,1,1,0,1,0,Check if there are clusters without data_security_mode as (USER_ISOLATION or SINGLE_USER) or data_security_mode as none,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/data-governance/unity-catalog/index.html#cluster-access-modes-for-unity-catalog,https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/#cluster-access-modes-for-unity-catalog,https://docs.gcp.databricks.com/data-governance/unity-catalog/index.html#cluster-access-modes-for-unity-catalog @@ -33,9 +33,9 @@ id,check_id,category,check,evaluation_value,severity,recommendation,aws,azure,gc 32,INFO-11,Informational,Workspace for supporting Git repos,-1,High,It is recommended to store code in Git repos,1,1,1,1,0,Check workspace-conf for enableProjectTypeInWorkspace setting,curl -n -X GET 
'https:///api/2.0/preview/workspace-conf?keys=enableProjectTypeInWorkspace',https://docs.databricks.com/repos/index.html,https://learn.microsoft.com/en-us/azure/databricks/repos/index,https://docs.gcp.databricks.com/repos/index.html 33,NS-1,Network Security,Public keys for all-purpose clusters,-1,High,"Remote SSH access to clusters is discouraged, use web terminal instead",1,1,1,1,0,Check if ssh_public_keys is configured on any cluster,curl --netrc -X GET \ https:///api/2.0/clusters/list \ | jq .,https://docs.databricks.com/clusters/web-terminal.html,https://learn.microsoft.com/en-us/azure/databricks/clusters/web-terminal,https://docs.gcp.databricks.com/clusters/web-terminal.html 34,NS-2,Network Security,Public keys for job cluster,-1,High,"Remote SSH access to clusters is discouraged, use web terminal instead",1,1,1,1,0,Check if ssh_public_keys is configured on any job cluster,curl --netrc -X GET \ https:///api/2.0/jobs/list \ | jq,https://docs.databricks.com/clusters/web-terminal.html,https://learn.microsoft.com/en-us/azure/databricks/clusters/web-terminal,https://docs.gcp.databricks.com/clusters/web-terminal.html -35,NS-3,Network Security,Front-end private connectivity,-1,High,"Configure private network connectivity for accessing the web application and REST APIs. You can configure AWS PrivateLink, Azure Private Link, or Google Private Service Connect. Note that enabling and requiring front-end private connectivity are different, see the documentation for details.",1,1,1,1,0,Check if private_access_settings_id is set for the workspace,curl -n -X GET 'https://accounts.cloud.databricks.com/api/2.0/accounts//workspaces',https://docs.databricks.com/administration-guide/cloud-configurations/aws/privatelink.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/cloud-configurations/azure/private-link,https://cloud.google.com/vpc/docs/private-access-options +35,NS-3,Network Security,Front-end private connectivity,-1,Medium,"Configure either private network connectivity or IP access lists (NS-5) for workspace access and REST APIs. You can configure AWS PrivateLink, Azure Private Link, or Google Private Service Connect. 
Note that enabling and requiring front-end private connectivity are different, see the documentation for details.",1,1,1,1,0,Check if private_access_settings_id is set for the workspace,curl -n -X GET 'https://accounts.cloud.databricks.com/api/2.0/accounts//workspaces',https://docs.databricks.com/administration-guide/cloud-configurations/aws/privatelink.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/cloud-configurations/azure/private-link,https://cloud.google.com/vpc/docs/private-access-options 36,NS-4,Network Security,"Workspace uses a customer-managed VPC (AWS, GCP) or enables VNet injection (Azure)",-1,Medium,"Deploy with a customer-managed VPC (AWS, GCP) or use VNet injection (Azure) to help enforce data exfiltration protections",1,1,1,1,0,Check if network_id is set for this workspace,curl -n -X GET 'https://accounts.cloud.databricks.com/api/2.0/accounts//workspaces',https://docs.databricks.com/administration-guide/cloud-configurations/aws/customer-managed-vpc.html,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/cloud-configurations/azure/vnet-inject,https://docs.gcp.databricks.com/administration-guide/cloud-configurations/gcp/customer-managed-vpc.html -37,NS-5,Network Security,IP access lists for workspace access,-1,Medium,Configure IP access lists for workspaces that restrict the IP addresses that can authenticate to help protect against data exfiltration and account takeover,1,1,1,1,0,Check if ip-access-lists are configured and enabled,curl --netrc -X GET \ https:///api/2.0/ip-access-lists,https://docs.databricks.com/security/network/ip-access-list.html#add-an-ip-access-list,https://learn.microsoft.com/en-us/azure/databricks/security/network/front-end/ip-access-list,https://docs.gcp.databricks.com/security/network/ip-access-list.html +37,NS-5,Network Security,IP access lists for workspace access,-1,Medium,Configure IP access lists or private network connectivity (NS-3) for workspaces that restrict the IP addresses that can authenticate to help protect against data exfiltration and account takeover,1,1,1,1,0,Check if ip-access-lists are configured and enabled,curl --netrc -X GET \ https:///api/2.0/ip-access-lists,https://docs.databricks.com/security/network/ip-access-list.html#add-an-ip-access-list,https://learn.microsoft.com/en-us/azure/databricks/security/network/front-end/ip-access-list,https://docs.gcp.databricks.com/security/network/ip-access-list.html 38,IA-5,Identity & Access,Maximum lifetime of new tokens to something other than unlimited,-1,Medium,Configure maximum lifetime for all future tokens to a value other than unlimited,1,1,1,1,0,Check workspace-conf for maxTokenLifetimeDays In Workspace setting,curl -n -X GET 'https:///api/2.0/preview/workspace-conf?keys=maxTokenLifetimeDays',https://docs.databricks.com/administration-guide/access-control/tokens.html#lifetime,https://learn.microsoft.com/en-us/azure/databricks/administration-guide/access-control/tokens#--set-maximum-lifetime-of-new-tokens-rest-api-only,https://docs.gcp.databricks.com/administration-guide/access-control/tokens.html#lifetime 39,NS-6,Network Security,Secure cluster connectivity (on Azure this is also called NoPublicIp),-1,Medium,"Configure secure cluster connectivity (On Azure, this is called NoPublicIP / NPIP)",0,1,0,1,0,Check if enableNoPublicIp are configured and enabled,curl --netrc -X GET \ https://accounts.azuredatabricks.net/api/2.0/accounts//workspaces,N/A,https://learn.microsoft.com/en-us/azure/databricks/security/secure-cluster-connectivity, 
40,GOV-13,Governance,Enforce User Isolation,-1,Medium,Enforce user isolation cluster types on a workspace,1,1,1,1,0,Check workspace-conf for enforceUserIsolation In Workspace setting,curl -n -X GET 'https:///api/2.0/preview/workspace-conf?keys= enforceUserIsolation',https://docs.databricks.com/security/enforce-user-isolation.html,https://learn.microsoft.com/en-us/azure/databricks/security/enforce-user-isolation,https://docs.gcp.databricks.com/security/enforce-user-isolation.html @@ -70,7 +70,7 @@ id,check_id,category,check,evaluation_value,severity,recommendation,aws,azure,gc 101,DP-14,Data Protection,Store and retrieve embeddings securely,-1,Low,Store and retrieve embeddings securely using the Vector Search,1,1,0,1,0,List all the Vector Search endpoints and see if at least one endpoint is configured,curl -n -X GET 'https:///api/2.0/vector-search/endpoints',https://docs.databricks.com/en/generative-ai/vector-search.html,https://learn.microsoft.com/en-us/azure/databricks/generative-ai/vector-search,N/A 103,INFO-37,Informational,Compliance security profile for new workspaces,-1,Low,Validate and deploy on a platform that has put in place controls to meet the unique compliance needs of highly regulated industries,1,0,0,1,0,Check if compliance security profile for new workspaces is enabled,curl -n -X GET 'https://accounts.cloud.databricks.com/api/2.0/accounts///accounts/{accountid}/settings/types/shield_csp_enablement_ac/names/default',https://docs.databricks.com/en/security/privacy/security-profile.html,https://learn.microsoft.com/en-us/azure/databricks/security/privacy/security-profile,N/A 104,INFO-38,Informational,Third-party library control,-1,Low,Add libraries and init scripts to the allowlist in Unity Catalog,1,1,1,1,0,Get the artifact allowlist of and check if any allowed artifacts are configured,curl -n -X GET 'https:///api/2.1/unity-catalog/artifact-allowlists/{artifact_type},https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/allowlist.html,https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/manage-privileges/allowlist,https://docs.gcp.databricks.com/en/data-governance/unity-catalog/manage-privileges/allowlist.html -105,GOV-34,Governance,Monitor audit logs with system tables,-1,High,Configure system tables and set up automated monitoring and alerting,1,1,1,1,0,Get the the systemschemas for the metastores and see if the access schema is enabled,curl -n -X GET 'https:///api/2.0/unity-catalog/metastores//systemschemas',https://docs.databricks.com/en/admin/system-tables/index.html,https://learn.microsoft.com/en-us/azure/databricks/admin/system-tables/,https://docs.gcp.databricks.com/en/admin/system-tables/index.html +105,GOV-34,Governance,Monitor audit logs with system tables (or see GOV-3),-1,High,Configure system tables and set up automated monitoring and alerting,1,1,1,1,0,Get the the systemschemas for the metastores and see if the access schema is enabled,curl -n -X GET 'https:///api/2.0/unity-catalog/metastores//systemschemas',https://docs.databricks.com/en/admin/system-tables/index.html,https://learn.microsoft.com/en-us/azure/databricks/admin/system-tables/,https://docs.gcp.databricks.com/en/admin/system-tables/index.html 106,GOV-35,Governance,Restrict workspace admins,-1,Medium,Restrict workspace admins to only change a job owner to themselves and the job run as setting to a service principal that they have the Service Principal User role on,1,1,1,1,0,Get the restrict workspace admins setting and check if 
RestrictWorkspaceAdmins set to ALLOW_ALL,curl -n -X GET 'https:///api/2.0/settings/types/restrict_workspace_admins/names/default',https://docs.databricks.com/en/admin/workspace-settings/restrict-workspace-admins.html,https://learn.microsoft.com/en-us/azure/databricks/admin/workspace-settings/restrict-workspace-admins,https://docs.gcp.databricks.com/en/admin/workspace-settings/restrict-workspace-admins.html 107,GOV-36,Governance,Automatic cluster update,-1,Medium,Ensure that all the clusters in a workspace are periodically updated to the latest host OS image and security updates,1,1,0,1,0,Get the automatic cluster update setting and check the value is set to true,curl --netrc -X GET \ https:///api/2.0/settings/types/automatic_cluster_update/names/default \ | jq,https://docs.databricks.com/en/admin/clusters/automatic-cluster-update.html,https://learn.microsoft.com/en-us/azure/databricks/admin/clusters/automatic-cluster-update,N/A 108,INFO-39,Informational,Compliance security profile for the workspace,-1,Low,Validate and deploy on a platform that has put in place controls to meet the unique compliance needs of highly regulated industries,1,1,0,1,0,Check if compliance security profile for new workspaces is enabled,curl -n -X GET 'https:////api/2.0/settings/types/shield_csp_enablement_ws_db/names/default',https://docs.databricks.com/en/security/privacy/security-profile.html,https://learn.microsoft.com/en-us/azure/databricks/security/privacy/security-profile,N/A diff --git a/dabs/dabs_template/initialize.py.tmpl b/dabs/dabs_template/initialize.py.tmpl deleted file mode 100644 index b3eb6cef..00000000 --- a/dabs/dabs_template/initialize.py.tmpl +++ /dev/null @@ -1,139 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC **Notebook name:** initialize -# MAGIC **Functionality:** initializes the necessary configruation values for the rest of the process into a json - -# COMMAND ---------- - -# MAGIC %run ./common - -# COMMAND ---------- - -# replace values for accounts exec -hostname = ( - dbutils.notebook.entry_point.getDbutils() - .notebook() - .getContext() - .apiUrl() - .getOrElse(None) -) -cloud_type = getCloudType(hostname) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ##### Modify JSON values -# MAGIC * **account_id** Account ID. Can get this from the accounts console -# MAGIC * **sql_warehouse_id** SQL Warehouse ID to import dashboard -# MAGIC * **verbosity** (optional). debug, info, warning, error, critical -# MAGIC * **master_name_scope** Secret Scope for Account Name -# MAGIC * **master_name_key** Secret Key for Account Name -# MAGIC * **master_pwd_scope** Secret Scope for Account Password -# MAGIC * **master_pwd_key** Secret Key for Account Password -# MAGIC * **workspace_pat_scope** Secret Scope for Workspace PAT -# MAGIC * **workspace_pat_token_prefix** Secret Key prefix for Workspace PAT. 
Workspace ID will automatically be appended to this per workspace -# MAGIC * **use_mastercreds** (optional) Use master account credentials for all workspaces - -# COMMAND ---------- - -import json - -json_ = { - "account_id": dbutils.secrets.get(scope="sat_scope", key="account-console-id"), - "sql_warehouse_id": dbutils.secrets.get(scope="sat_scope", key="sql-warehouse-id"), - "analysis_schema_name": "{{.catalog}}.security_analysis", - "verbosity": "info", -} - -# COMMAND ---------- - -json_.update( - { - "master_name_scope": "sat_scope", - "master_name_key": "user", - "master_pwd_scope": "sat_scope", - "master_pwd_key": "pass", - "workspace_pat_scope": "sat_scope", - "workspace_pat_token_prefix": "sat-token", - "dashboard_id": "317f4809-8d9d-4956-a79a-6eee51412217", - "dashboard_folder": f"{basePath()}/dashboards/", - "dashboard_tag": "SAT", - "use_mastercreds": True, - "use_parallel_runs": True, - } -) - - -# COMMAND ---------- - -# DBTITLE 1,GCP configurations -if cloud_type == "gcp": - json_.update( - { - "service_account_key_file_path": dbutils.secrets.get( - scope="sat_scope", key="gs-path-to-json" - ), - "impersonate_service_account": dbutils.secrets.get( - scope="sat_scope", key="impersonate-service-account" - ), - "use_mastercreds": False, - } - ) - - -# COMMAND ---------- - -# DBTITLE 1,Azure configurations -if cloud_type == "azure": - json_.update( - { - "account_id": "azure", - "subscription_id": dbutils.secrets.get( - scope="sat_scope", key="subscription-id" - ), # Azure subscriptionId - "tenant_id": dbutils.secrets.get( - scope="sat_scope", key="tenant-id" - ), # The Directory (tenant) ID for the application registered in Azure AD. - "client_id": dbutils.secrets.get( - scope="sat_scope", key="client-id" - ), # The Application (client) ID for the application registered in Azure AD. 
- "client_secret_key": "client-secret", # The secret generated by AAD during your confidential app registration - "use_mastercreds": True, - } - ) - - -# COMMAND ---------- - -# DBTITLE 1,AWS configurations -if cloud_type == "aws": - sp_auth = { - "use_sp_auth": "False", - "client_id": "", - "client_secret_key": "client-secret", - } - try: - use_sp_auth = ( - dbutils.secrets.get(scope="sat_scope", key="use-sp-auth").lower() == "true" - ) - if use_sp_auth: - sp_auth["use_sp_auth"] = "True" - sp_auth["client_id"] = dbutils.secrets.get( - scope="sat_scope", key="client-id" - ) - except: - pass - json_.update(sp_auth) - -# COMMAND ---------- - -create_schema() -create_security_checks_table() -create_account_info_table() -create_account_workspaces_table() -create_workspace_run_complete_table() - -# COMMAND ---------- - -# Initialize best practices if not already loaded into database -readBestPracticesConfigsFile() \ No newline at end of file diff --git a/dabs/sat/config.py b/dabs/sat/config.py index d5a2a069..fb7c0eab 100644 --- a/dabs/sat/config.py +++ b/dabs/sat/config.py @@ -43,13 +43,37 @@ def form(): ignore=lambda x: not x["enable_uc"], default="hive_metastore", ), + Text( + name="security_analysis_schema", + message="Schema name for SAT", + default="security_analysis", + ), List( name="warehouse", message="Select warehouse", choices=loading(get_warehouses, client=client), ), ] - questions = questions + cloud_specific_questions(client) + proxies = [ + Confirm( + name="use_proxy", + message="Want to use a proxy?", + default=False, + ), + Text( + name="http", + message="HTTP Proxy", + ignore=lambda x: not x["use_proxy"], + default="", + ), + Text( + name="https", + message="HTTPS Proxy", + ignore=lambda x: not x["use_proxy"], + default="", + ), + ] + questions = questions + cloud_specific_questions(client) + proxies return client, prompt(questions), profile @@ -116,15 +140,6 @@ def generate_secrets(client: WorkspaceClient, answers: dict, cloud_type: str): client.secrets.create_scope(scope_name) - token = client.tokens.create( - lifetime_seconds=86400 * 90, - comment="Security Analysis Tool", - ) - client.secrets.put_secret( - scope=scope_name, - key=f"sat-token-{client.get_workspace_id()}", - string_value=token.token_value, - ) client.secrets.put_secret( scope=scope_name, key="account-console-id", @@ -135,6 +150,29 @@ def generate_secrets(client: WorkspaceClient, answers: dict, cloud_type: str): key="sql-warehouse-id", string_value=answers["warehouse"]["id"], ) + client.secrets.put_secret( + scope=scope_name, + key="analysis_schema_name", + string_value=f'{answers["catalog"]}.{answers["security_analysis_schema"]}', + ) + + if answers["use_proxy"]: + client.secrets.put_secret( + scope=scope_name, + key="proxies", + string_value=json.dumps( + { + "http": answers["http"], + "https": answers["https"], + } + ), + ) + else: + client.secrets.put_secret( + scope=scope_name, + key="proxies", + string_value="{}", + ) if cloud_type == "aws": client.secrets.put_secret( diff --git a/dabs/setup.sh b/dabs/setup.sh index 80a29783..e77821e3 100644 --- a/dabs/setup.sh +++ b/dabs/setup.sh @@ -6,8 +6,6 @@ config_file=$3 cp -r ../configs ../notebooks ../dashboards ./dabs_template/template/tmp -rm ./dabs_template/template/tmp/notebooks/Utils/initialize.py -cp ./dabs_template/initialize.py.tmpl ./dabs_template/template/tmp/notebooks/Utils/initialize.py.tmpl databricks bundle init ./dabs_template -p $profile --config-file $config_file rm -rf $config_file diff --git a/docs/deprecated_old_setup.md 
b/docs/deprecated_old_setup.md index bbddb382..7aaef135 100644 --- a/docs/deprecated_old_setup.md +++ b/docs/deprecated_old_setup.md @@ -206,6 +206,7 @@ Please gather the following information before you start setting up: * Set the PAT token value for the workspace_id * Set the value for the account_id * Set the value for the sql_warehouse_id + * Set the value for the analysis_schema_name to store SAT analysis results (for UC, use the catalog.schema name; for Hive, use the schema name) ``` @@ -219,6 +220,10 @@ Please gather the following information before you start setting up: ``` databricks --profile e2-sat secrets put-secret sat_scope sql-warehouse-id ``` + + ``` + databricks --profile e2-sat secrets put-secret sat_scope analysis_schema_name + ``` * In your environment where you imported SAT project from git (Refer to Step 4 in Prerequisites) Open the \/notebooks/Utils/initialize notebook and modify the JSON string with : @@ -230,7 +235,8 @@ Please gather the following information before you start setting up: ``` { "account_id": dbutils.secrets.get(scope="sat_scope", key="account-console-id"), - "sql_warehouse_id": dbutils.secrets.get(scope="sat_scope", key="sql-warehouse-id") + "sql_warehouse_id": dbutils.secrets.get(scope="sat_scope", key="sql-warehouse-id"), + "analysis_schema_name": dbutils.secrets.get(scope="sat_scope", key="analysis_schema_name"), "verbosity":"info" }
diff --git a/docs/images/gcp_ws.png b/docs/images/gcp_ws.png new file mode 100644 index 00000000..0d8d13ea Binary files /dev/null and b/docs/images/gcp_ws.png differ
diff --git a/docs/setup.md b/docs/setup.md index 26c831c8..af4a0f08 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -1,5 +1,7 @@ # Setup Guide +> **SAT v0.2.0 or higher** brings full support for Unity Catalog. You can now pick your catalog instead of hive_metastore and choose your own schema name. + Follow this guide to setup the Security Analysis Tool (SAT) on your Databricks workspace. ## Prerequisites @@ -27,12 +29,13 @@ SAT creates a new security_analysis database and Delta tables. If you are an exi ### Unity Catalog based schema ```sql - drop database .security_analysis cascade; + drop database . cascade; ``` ## Setup -> SAT is a productivity tool to help verify security configurations of Databricks deployments, it's not meant to be used as certification or attestation of your deployments. SAT project is regularly updated to improve the correctness of checks, add new checks, and fix bugs. Please send your feedback and comments to sat@databricks.com. +> SAT is a productivity tool to help verify security configurations of Databricks deployments; it is not meant to be used as certification or attestation of your deployments. The SAT project is regularly updated to improve the correctness of checks, add new checks, and fix bugs. You will need a single SAT install per Databricks account in AWS and GCP, and a single install per Azure subscription in Azure. Add the service principal as described in the detailed steps to analyze the rest of the workspaces from the workspace where SAT is installed. You can choose not to add the SP to any workspace you wish to ignore. +> Please send your feedback and comments to sat@databricks.com.
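Since SAT analyzes only the workspaces that the service principal has been added to, it can help to review those assignments before running the jobs. Below is a minimal sketch using the `databricks-sdk` account client; the account host, account ID, and service principal display name are placeholders rather than values from this project, and the call signatures should be checked against your SDK version:

```python
# Illustrative sketch (not part of SAT): list which workspaces the SAT
# service principal has been assigned to, so you know what will be analyzed.
from databricks.sdk import AccountClient

# Assumes account-admin credentials are available via environment or profile.
# The host differs per cloud (accounts.cloud.databricks.com for AWS,
# accounts.azuredatabricks.net for Azure, accounts.gcp.databricks.com for GCP).
acct = AccountClient(host="https://accounts.cloud.databricks.com", account_id="<account-id>")

sat_sp_name = "sat-service-principal"  # placeholder display name

for ws in acct.workspaces.list():
    assigned = any(
        a.principal is not None and a.principal.display_name == sat_sp_name
        for a in acct.workspace_assignment.list(workspace_id=ws.workspace_id)
    )
    print(f"{ws.workspace_name} ({ws.workspace_id}): "
          f"{'will be analyzed' if assigned else 'skipped (SP not assigned)'}")
```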
SAT can be setup on any of the cloud providers where Databricks is hosted. Follow the setup guide for the cloud provider you are using: @@ -70,10 +73,10 @@ You now have two jobs (SAT Initializer Notebook & SAT Driver Notebook). Run SAT ### 2. Access Databricks SQL Dashboards - > **Note:** You can also use Lakeview Dashboards to view the results, instead of classic Dashboards. + > **Note:** You can also use Lakeview Dashboards to view the results. -In DBSQL find "SAT - Security Analysis Tool" dashboard to see the report. You can filter the dashboard by **SAT** tag. +In DBSQL, find the "SAT - Security Analysis Tool" dashboard to see the report. You can filter the dashboard by the **SAT** tag. (The legacy classic dashboard can be found in Workspace -> Home -> SAT_dashboard.)
diff --git a/docs/setup/aws.md b/docs/setup/aws.md index 18eabf2a..bded12c2 100644 --- a/docs/setup/aws.md +++ b/docs/setup/aws.md @@ -30,9 +30,9 @@ The first step is to create a Service Principal in Databricks. This will allow S - Save the `Secret` and `Client ID` - To deploy SAT in a workspace, you must add the Service Principal to the workspace. ![AWS_SP_Workspace](../images/aws_ws.png) -> The Service Principle requires an [Accounts Admin role](https://docs.databricks.com/en/admin/users-groups/service-principals.html#assign-account-admin-roles-to-a-service-principal), [Admin role](https://docs.databricks.com/en/admin/users-groups/service-principals.html#assign-a-service-principal-to-a-workspace-using-the-account-console) for **each workspace** and needs to be a member of the [metastore admin group](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/admin-privileges.html#who-has-metastore-admin-privileges) is required to analyze many of the APIs +> The Service Principal requires an [Accounts Admin role](https://docs.databricks.com/en/admin/users-groups/service-principals.html#assign-account-admin-roles-to-a-service-principal), an [Admin role](https://docs.databricks.com/en/admin/users-groups/service-principals.html#assign-a-service-principal-to-a-workspace-using-the-account-console) for **each workspace**, and must be a member of the [metastore admin group](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/admin-privileges.html#who-has-metastore-admin-privileges) to analyze many of the APIs ## Installation @@ -61,6 +61,7 @@ To execute the SAT follow these steps on your workstation or a compatible VM tha ./install.sh ``` +> **Proxies are now supported in SAT. Provide your HTTP and HTTPS proxy URLs during setup; see the sketch below.** ![](../gif/terminal-aws.gif)
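The installer stores these proxy settings as a JSON string in a `proxies` secret (see the `generate_secrets` changes in `dabs/sat/config.py` above). A minimal sketch of writing and spot-checking that secret with the `databricks-sdk`, assuming the scope is named `sat_scope`; the proxy URLs are placeholders:

```python
# Illustrative sketch (not part of SAT): set and verify the proxy secret
# in the same shape the installer writes it.
import json

from databricks.sdk import WorkspaceClient

client = WorkspaceClient()  # assumes a configured profile or environment variables

proxies = {
    "http": "http://proxy.example.com:8080",   # placeholder proxy URL
    "https": "http://proxy.example.com:8080",  # placeholder proxy URL
}

# An empty JSON object ("{}") means "no proxy", mirroring the installer's default.
client.secrets.put_secret(
    scope="sat_scope",  # assumed scope name
    key="proxies",
    string_value=json.dumps(proxies),
)

# Confirm the key exists in the scope.
print([s.key for s in client.secrets.list_secrets(scope="sat_scope")])
```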
diff --git a/docs/setup/faqs_and_troubleshooting.md b/docs/setup/faqs_and_troubleshooting.md index dfd47661..2db023d5 100644 --- a/docs/setup/faqs_and_troubleshooting.md +++ b/docs/setup/faqs_and_troubleshooting.md @@ -16,7 +16,7 @@ We created diagnosis notebooks for respective clouds to help troubleshoot your S * [SAT Azure troubleshooting notebook](https://github.com/databricks-industry-solutions/security-analysis-tool/blob/main/notebooks/diagnosis/sat_diagnosis_azure.py) * [SAT GCP troubleshooting notebook](https://github.com/databricks-industry-solutions/security-analysis-tool/blob/main/notebooks/diagnosis/sat_diagnosis_gcp.py) -### 1. Incorrectly configured secrets +### Incorrectly configured secrets * **Error:** @@ -31,19 +31,7 @@ We created diagnosis notebooks for respective clouds to help troubleshoot your S databricks --profile e2-sat secrets list-secrets sat_scope ``` -### 2. Invalid access token - -* **Error:** - - ``` - Error 403 Invalid access token. - ``` - -* **Resolution:** - - Check your PAT token configuration for the `workspace_pat_token` key. - -### 3. Firewall blocking Databricks accounts console * **Error:** @@ -86,7 +74,7 @@ We created diagnosis notebooks for respective clouds to help troubleshoot your S If you don’t see a JSON with a clean listing of workspaces, you are likely having a firewall issue that is blocking calls to the accounts console. Please have your infrastructure team add `accounts.cloud.databricks.com` to the allow-list. Ensure that the private IPv4 address from the NAT gateway is added to the IP allow list. -### 4. Offline install of libraries in case of no PyPI access +### Offline install of libraries in case of no PyPI access * **Steps:**
diff --git a/docs/setup/gcp.md b/docs/setup/gcp.md index 51700b28..2ea0307f 100644 --- a/docs/setup/gcp.md +++ b/docs/setup/gcp.md @@ -5,6 +5,7 @@ This guide will help you setup the Security Analysis Tool (SAT) on GCP Databrick - [GCP Setup Guide](#gcp-setup-guide) - [Prerequisites](#prerequisites) - [Service Accounts](#service-accounts) + - [Databricks Service Principal](#databricks-service-principal) - [Installation](#installation) - [Credentials Needed](#credentials-needed) - [Troubleshooting](#troubleshooting) @@ -27,6 +28,21 @@ The first step is to create a Service Principal in GCP. This will allow SAT to a - Assign the Workspace Admin Role: The Service Principal must be assigned the `Workspace Admin` role for each workspace it will manage. This role provides the ability to manage workspace-level settings and permissions. - Add to the Metastore Admin Group: The Service Principal must be added to the `Metastore Admin` group or role. This role provides the ability to manage metastore-level settings and permissions. +### Databricks Service Principal + +Next, create a Service Principal in Databricks. This will allow SAT to authenticate with the other workspaces. Follow these steps (a scripted sketch follows the list): + +- Go to the [Account Console](https://accounts.gcp.databricks.com) +- On the left side bar menu, click on `User management` +- Select `Service Principal` and then `Add service principal` +- Type a new name for the service principal. +- The Service Principal must be granted the `Account Admin` role. This role provides the ability to manage account-level settings and permissions. +- Assign the Workspace Admin Role: The Service Principal must be assigned the `Workspace Admin` role for each workspace it will manage. This role provides the ability to manage workspace-level settings and permissions. +- Add to the Metastore Admin Group: The Service Principal must be added to the `Metastore Admin` group or role. This role provides the ability to manage metastore-level settings and permissions. +- Create a new OAuth Secret. +- Save the `Secret` and `Client ID` +- To deploy SAT in a workspace, you must add the Service Principal to the workspace.
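The console steps above can also be scripted with the account-level SDK. A minimal sketch, assuming the `databricks-sdk` and GCP account-admin credentials; the display name is a placeholder, the workspace-admin assignment is illustrative, and granting the `Account Admin` role and creating the OAuth secret are left to the account console:

```python
# Illustrative sketch (not part of SAT): create the SAT service principal
# and assign it workspace admin on each workspace SAT should analyze.
from databricks.sdk import AccountClient
from databricks.sdk.service import iam

acct = AccountClient(
    host="https://accounts.gcp.databricks.com",  # GCP account console
    account_id="<account-id>",                   # placeholder
)

# Create the service principal SAT will authenticate as.
sp = acct.service_principals.create(display_name="sat-service-principal")
print(f"Client ID (application_id): {sp.application_id}")

# Grant workspace admin on every workspace (restrict this loop as needed).
for ws in acct.workspaces.list():
    acct.workspace_assignment.update(
        workspace_id=ws.workspace_id,
        principal_id=int(sp.id),
        permissions=[iam.WorkspacePermission.ADMIN],
    )
```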
+ ## Installation ### Credentials Needed diff --git a/notebooks/Includes/install_sat_sdk.py b/notebooks/Includes/install_sat_sdk.py index 09cb0985..c9e071d0 100644 --- a/notebooks/Includes/install_sat_sdk.py +++ b/notebooks/Includes/install_sat_sdk.py @@ -22,7 +22,7 @@ # COMMAND ---------- -SDK_VERSION='0.1.34' +SDK_VERSION='0.1.35' # COMMAND ---------- diff --git a/notebooks/Includes/workspace_analysis.py b/notebooks/Includes/workspace_analysis.py index 9d844650..c30ffcbb 100644 --- a/notebooks/Includes/workspace_analysis.py +++ b/notebooks/Includes/workspace_analysis.py @@ -357,7 +357,7 @@ def rbac_rule(df): # DBTITLE 1,Token Management check_id='21' #PAT Token with no lifetime limit enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -expiry_limit_evaluation_value = int(sbp_rec['evaluation_value']) +expiry_limit_evaluation_value = sbp_rec['evaluation_value'] def token_rule(df): #Check for count of tokens that are either set to expire in over 90 days from today or set to never expire. if df is not None and not df.rdd.isEmpty() and len(df.collect()) > 1: @@ -383,7 +383,7 @@ def token_rule(df): check_id='7' # PAT Tokens About to Expire enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -expiry_limit_evaluation_value = int(sbp_rec['evaluation_value']) +expiry_limit_evaluation_value = sbp_rec['evaluation_value'] def token_rule(df): #Check for count of tokens that expiring in expiry_limit_evaluation_value days from today. if df is not None and not df.rdd.isEmpty() and len(df.collect()) > 1: @@ -449,7 +449,7 @@ def token_max_life_rule(df): # DBTITLE 1,Admin count check_id='27' #Admin Count enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -admin_count_evaluation_value = int(sbp_rec['evaluation_value']) +admin_count_evaluation_value = sbp_rec['evaluation_value'] def admin_rule(df): if df is not None and not df.rdd.isEmpty() and len(df.collect()) > admin_count_evaluation_value: df = df.rdd.map(lambda x: (re.sub('[\"\'\\\\]', '_', x['Admins']),)).toDF(['Admins']) @@ -474,7 +474,7 @@ def admin_rule(df): check_id='42' #Use service principals enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -service_principals_evaluation_value = int(sbp_rec['evaluation_value']) +service_principals_evaluation_value = sbp_rec['evaluation_value'] def use_service_principals(df): if df is not None and not df.rdd.isEmpty() and len(df.collect()) >= service_principals_evaluation_value: return (check_id, 0, {'SPs': len(df.collect())}) @@ -502,7 +502,7 @@ def use_service_principals(df): # DBTITLE 1,Secrets Management check_id='1' #Secrets Management enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -secrets_count_evaluation_value = int(sbp_rec['evaluation_value']) +secrets_count_evaluation_value = sbp_rec['evaluation_value'] def secrets_rule(df): if df is not None and not df.rdd.isEmpty() and df.collect()[0][0] >= secrets_count_evaluation_value: num_secrets = df.collect()[0][0] @@ -773,7 +773,7 @@ def logconf_check_job(df): # DBTITLE 1,DBFS /user/hive/warehouse - managed tables check_id='15' #Managed Tables enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -dbfs_warehouses_evaluation_value = int(sbp_rec['evaluation_value']) +dbfs_warehouses_evaluation_value = sbp_rec['evaluation_value'] def dbfs_check(df): if df is not None and not df.rdd.isEmpty() and len(df.collect()) >= dbfs_warehouses_evaluation_value: @@ -797,7 +797,7 @@ def dbfs_check(df): # DBTITLE 1,DBFS /mnt check check_id='16' #Mounts enabled, 
sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -dbfs_fuse_mnt_evaluation_value = int(sbp_rec['evaluation_value']) +dbfs_fuse_mnt_evaluation_value = sbp_rec['evaluation_value'] def dbfs_mnt_check(df): if df is not None and not df.rdd.isEmpty() and len(df.collect())>=dbfs_fuse_mnt_evaluation_value: @@ -896,7 +896,7 @@ def pool_check(df): # DBTITLE 1,jobs - max concurrent runs >=5 (Denial of Service) check_id='23' #Max concurrent runs enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -max_concurrent_runs_evaluation_value = int(sbp_rec['evaluation_value']) +max_concurrent_runs_evaluation_value = sbp_rec['evaluation_value'] def mcr_check(df): if df is not None and not df.rdd.isEmpty(): mcr = df.collect() @@ -947,7 +947,7 @@ def lib_check(df): # DBTITLE 1,Multiple users have cluster create privileges check_id='25' #User Privileges enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -max_cluster_create_count_evaluation_value = int(sbp_rec['evaluation_value']) +max_cluster_create_count_evaluation_value = sbp_rec['evaluation_value'] # Report on Clusters that do not have a policy id associated with them def cc_check(df): if df is not None and not df.rdd.isEmpty() and len(df.collect())>max_cluster_create_count_evaluation_value: @@ -1003,7 +1003,7 @@ def log_check(df): # DBTITLE 1,How long since the last cluster restart check_id='9' #Long running clusters enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type) -days_since_restart_evaluation_value = int(sbp_rec['evaluation_value']) +days_since_restart_evaluation_value = sbp_rec['evaluation_value'] def time_check(df): if df is not None and not df.rdd.isEmpty() and len(df.collect())>=1: timlst = df.collect() diff --git a/notebooks/Setup/3. test_connections.py b/notebooks/Setup/3. test_connections.py index fef2011e..324237a8 100644 --- a/notebooks/Setup/3. test_connections.py +++ b/notebooks/Setup/3. test_connections.py @@ -54,6 +54,7 @@ def test_connection(jsonarg, accounts_test=False): db_client = SatDBClient(jsonarg) + token = db_client.get_temporary_oauth_token() if accounts_test == True: hostname='Accounts' workspace_id='Accounts_Cred' @@ -182,17 +183,7 @@ def renewWorkspaceTokens(): masterpwd = dbutils.secrets.get(json_['master_pwd_scope'], json_['master_pwd_key']) json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd}) - - # if the worspace we are testing is the current workspace, - # We need the current workspace connection tested with the token to configure alerts and dashboard later - if ws.workspace_id == current_workspace: - loggr.info(f"\033[1mCurrent workspace {ws.workspace_id} detected. Using PAT.\033[0m") - tokenscope = json_['workspace_pat_scope'] - tokenkey = ws.ws_token #already has prefix in config file - token = dbutils.secrets.get(tokenscope, tokenkey) - json_.update({'token':token}) - #for all other non-current workspaces - elif (json_['use_mastercreds']) is False: + if (json_['use_mastercreds']) is False: tokenscope = json_['workspace_pat_scope'] tokenkey = f"{json_['workspace_pat_token_prefix']}-{json_['workspace_id']}" try: @@ -214,7 +205,3 @@ def renewWorkspaceTokens(): # COMMAND ---------- dbutils.notebook.exit('OK') - -# COMMAND ---------- - - diff --git a/notebooks/Setup/5. import_dashboard_template.py b/notebooks/Setup/5. import_dashboard_template.py index 193f7132..0cc19ca9 100644 --- a/notebooks/Setup/5. import_dashboard_template.py +++ b/notebooks/Setup/5. 
import_dashboard_template.py @@ -28,6 +28,12 @@ # COMMAND ---------- +hostname = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) +cloud_type = getCloudType(hostname) +clusterid = spark.conf.get("spark.databricks.clusterUsageTags.clusterId") + +# COMMAND ---------- + import json context = json.loads(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()) current_workspace = context['tags']['orgId'] @@ -40,15 +46,46 @@ display(workspacedf) ws = (workspacedf.collect())[0] +# COMMAND ---------- + +from core.dbclient import SatDBClient +json_.update({'url':'https://' + ws.deployment_url, 'workspace_id': ws.workspace_id, 'clusterid':clusterid, 'cloud_type':cloud_type}) + + +token = '' +if cloud_type =='azure': #client secret always needed + client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"]) + json_.update({'token':token, 'client_secret': client_secret}) +elif (cloud_type =='aws' and json_['use_sp_auth'].lower() == 'true'): + client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"]) + json_.update({'token':token, 'client_secret': client_secret}) + mastername = ' ' + masterpwd = ' ' # we still need to send empty user/pwd. + json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd}) +else: #lets populate master key for accounts api + mastername = dbutils.secrets.get(json_['master_name_scope'], json_['master_name_key']) + masterpwd = dbutils.secrets.get(json_['master_pwd_scope'], json_['master_pwd_key']) + json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd}) + +if (json_['use_mastercreds']) is False: + tokenscope = json_['workspace_pat_scope'] + tokenkey = f"{json_['workspace_pat_token_prefix']}-{json_['workspace_id']}" + token = dbutils.secrets.get(tokenscope, tokenkey) + json_.update({'token':token}) + +db_client = SatDBClient(json_) +token = db_client.get_temporary_oauth_token() + + # COMMAND ---------- import requests + DOMAIN = ws.deployment_url -TOKEN = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) loggr.info(f"Looking for data_source_id for : {json_['sql_warehouse_id']}!") response = requests.get( 'https://%s/api/2.0/preview/sql/data_sources' % (DOMAIN), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json=None, timeout=60 ) @@ -64,8 +101,8 @@ if (found == False): dbutils.notebook.exit("The configured SQL Warehouse Endpoint is not found.") else: - loggr.info(f"Error with PAT token, {response.text}") - dbutils.notebook.exit("Invalid access token, check PAT configuration value for this workspace.") + loggr.info(f"Error with token, {response.text}") + dbutils.notebook.exit("Invalid access token, check configuration value for this workspace.") # COMMAND ---------- @@ -73,6 +110,7 @@ #todo: Add parent folder to all SQL assets, expose name in _json (default SAT) #create a folder to house all SAT sql artifacts import requests +from core.dbclient import SatDBClient from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry session = requests.Session() @@ -84,8 +122,6 @@ def create_ws_folder(ws, dir_name): #delete tthe WS folder if it exists delete_ws_folder(ws, dir_name) - - token = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) url = "https://"+ ws.deployment_url headers = {"Authorization": "Bearer " + token, 'Content-type': 'application/json'} path = "/Users/"+context['tags']['user']+"/"+ dir_name @@ -107,7 +143,6 @@ 
def get_ws_folder_object_id(ws, dir_name): #Use the workspace list API to get the object_id for the folder you want to use. #Here’s an example for how to get it for a folder called “/Users/me@example.com”: - token = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) url = "https://"+ ws.deployment_url headers = {"Authorization": "Bearer " + token, 'Content-type': 'application/json'} path = "/Users/"+context['tags']['user']+"/" @@ -129,8 +164,6 @@ def get_ws_folder_object_id(ws, dir_name): #delete folder that houses all SAT sql artifacts def delete_ws_folder(ws, dir_name): - - token = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) url = "https://"+ ws.deployment_url headers = {"Authorization": "Bearer " + token, 'Content-type': 'application/json'} path = "/Users/"+context['tags']['user']+"/"+ dir_name @@ -329,7 +362,8 @@ def load_dashboard(target_client: Client, dashboard_id, dashboard_state, folder_ # COMMAND ---------- -target_client, dashboard_id_to_load,dashboard_folder = get_client(ws, dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token)) +from core.dbclient import SatDBClient +target_client, dashboard_id_to_load,dashboard_folder = get_client(ws, token) workspace_state = {} loggr.info(f"Loading dashboard to master workspace {ws.workspace_id} from dashboard folder {dashboard_folder}") load_dashboard(target_client, dashboard_id_to_load, workspace_state, dashboard_folder) diff --git a/notebooks/Setup/5. import_dashboard_template_lakeview.py b/notebooks/Setup/5. import_dashboard_template_lakeview.py index f3bc86f0..d59f15a3 100644 --- a/notebooks/Setup/5. import_dashboard_template_lakeview.py +++ b/notebooks/Setup/5. import_dashboard_template_lakeview.py @@ -34,21 +34,58 @@ # COMMAND ---------- +hostname = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) +cloud_type = getCloudType(hostname) +clusterid = spark.conf.get("spark.databricks.clusterUsageTags.clusterId") + +# COMMAND ---------- + workspacedf = spark.sql("select * from `global_temp`.`all_workspaces` where workspace_id='" + current_workspace + "'" ) if (workspacedf.rdd.isEmpty()): dbutils.notebook.exit("The current workspace is not found in configured list of workspaces for analysis.") display(workspacedf) ws = (workspacedf.collect())[0] +# COMMAND ---------- + +from core.dbclient import SatDBClient +json_.update({'url':'https://' + ws.deployment_url, 'workspace_id': ws.workspace_id, 'clusterid':clusterid, 'cloud_type':cloud_type}) + + +token = '' +if cloud_type =='azure': #client secret always needed + client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"]) + json_.update({'token':token, 'client_secret': client_secret}) +elif (cloud_type =='aws' and json_['use_sp_auth'].lower() == 'true'): + client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"]) + json_.update({'token':token, 'client_secret': client_secret}) + mastername = ' ' + masterpwd = ' ' # we still need to send empty user/pwd. 
+ json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd}) +else: #lets populate master key for accounts api + mastername = dbutils.secrets.get(json_['master_name_scope'], json_['master_name_key']) + masterpwd = dbutils.secrets.get(json_['master_pwd_scope'], json_['master_pwd_key']) + json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd}) + +if (json_['use_mastercreds']) is False: + tokenscope = json_['workspace_pat_scope'] + tokenkey = f"{json_['workspace_pat_token_prefix']}-{json_['workspace_id']}" + token = dbutils.secrets.get(tokenscope, tokenkey) + json_.update({'token':token}) + +db_client = SatDBClient(json_) +token = db_client.get_temporary_oauth_token() + + # COMMAND ---------- import requests + DOMAIN = ws.deployment_url -TOKEN = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) loggr.info(f"Looking for data_source_id for : {json_['sql_warehouse_id']}!") response = requests.get( 'https://%s/api/2.0/preview/sql/data_sources' % (DOMAIN), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json=None, timeout=60 ) @@ -64,8 +101,8 @@ if (found == False): dbutils.notebook.exit("The configured SQL Warehouse Endpoint is not found.") else: - loggr.info(f"Error with PAT token, {response.text}") - dbutils.notebook.exit("Invalid access token, check PAT configuration value for this workspace.") + loggr.info(f"Error with token, {response.text}") + dbutils.notebook.exit("Invalid access token, check configuration value for this workspace.") # COMMAND ---------- @@ -121,7 +158,7 @@ def replace_string(obj, old_str, new_str): loggr.info(f"Getting Dashboard") response = requests.get( 'https://%s/api/2.0/workspace/get-status' % (DOMAIN), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json=BODY, timeout=60 ) @@ -145,7 +182,7 @@ def replace_string(obj, old_str, new_str): loggr.info(f"Deleting Dashboard") response = requests.delete( 'https://%s/api/2.0/lakeview/dashboards/%s' % (DOMAIN, dashboard_id), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json=BODY, timeout=60 ) @@ -172,7 +209,7 @@ def replace_string(obj, old_str, new_str): loggr.info(f"Creating Dashboard") response = requests.post( 'https://%s/api/2.0/lakeview/dashboards' % (DOMAIN), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json=BODY, timeout=60 ) @@ -204,7 +241,7 @@ def replace_string(obj, old_str, new_str): loggr.info(f"Publishing the Dashboard using the SAT SQL Warehouse") response = requests.post( URL, - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json=BODY, timeout=60 ) diff --git a/notebooks/Setup/6. configure_alerts_template.py b/notebooks/Setup/6. configure_alerts_template.py index ac916419..f9450294 100644 --- a/notebooks/Setup/6. configure_alerts_template.py +++ b/notebooks/Setup/6. 
configure_alerts_template.py @@ -28,6 +28,12 @@ # COMMAND ---------- +hostname = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) +cloud_type = getCloudType(hostname) +clusterid = spark.conf.get("spark.databricks.clusterUsageTags.clusterId") + +# COMMAND ---------- + import json context = json.loads(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()) current_workspace = context['tags']['orgId'] @@ -40,6 +46,37 @@ display(workspacedf) ws = (workspacedf.collect())[0] +# COMMAND ---------- + +from core.dbclient import SatDBClient +json_.update({'url':'https://' + ws.deployment_url, 'workspace_id': ws.workspace_id, 'clusterid':clusterid, 'cloud_type':cloud_type}) + + +token = '' +if cloud_type =='azure': #client secret always needed + client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"]) + json_.update({'token':token, 'client_secret': client_secret}) +elif (cloud_type =='aws' and json_['use_sp_auth'].lower() == 'true'): + client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"]) + json_.update({'token':token, 'client_secret': client_secret}) + mastername = ' ' + masterpwd = ' ' # we still need to send empty user/pwd. + json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd}) +else: #lets populate master key for accounts api + mastername = dbutils.secrets.get(json_['master_name_scope'], json_['master_name_key']) + masterpwd = dbutils.secrets.get(json_['master_pwd_scope'], json_['master_pwd_key']) + json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd}) + +if (json_['use_mastercreds']) is False: + tokenscope = json_['workspace_pat_scope'] + tokenkey = f"{json_['workspace_pat_token_prefix']}-{json_['workspace_id']}" + token = dbutils.secrets.get(tokenscope, tokenkey) + json_.update({'token':token}) + +db_client = SatDBClient(json_) +token = db_client.get_temporary_oauth_token() + + # COMMAND ---------- workspacesdf = spark.sql('select * from `global_temp`.`all_workspaces`') @@ -51,6 +88,7 @@ #todo: Add parent folder to all SQL assets, expose name in _json (default SAT) #create a folder to house all SAT sql artifacts import requests + from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry session = requests.Session() @@ -62,8 +100,6 @@ def create_ws_folder(ws, dir_name): #delete tthe WS folder if it exists delete_ws_folder(ws, dir_name) - - token = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) url = "https://"+ ws.deployment_url headers = {"Authorization": "Bearer " + token, 'Content-type': 'application/json'} path = "/Users/"+context['tags']['user']+"/"+ dir_name @@ -85,7 +121,7 @@ def get_ws_folder_object_id(ws, dir_name): #Use the workspace list API to get the object_id for the folder you want to use. 
#Here’s an example for how to get it for a folder called “/Users/me@example.com”: - token = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) + url = "https://"+ ws.deployment_url headers = {"Authorization": "Bearer " + token, 'Content-type': 'application/json'} path = "/Users/"+context['tags']['user']+"/" @@ -107,8 +143,6 @@ def get_ws_folder_object_id(ws, dir_name): #delete folder that houses all SAT sql artifacts def delete_ws_folder(ws, dir_name): - - token = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) url = "https://"+ ws.deployment_url headers = {"Authorization": "Bearer " + token, 'Content-type': 'application/json'} path = "/Users/"+context['tags']['user']+"/"+ dir_name @@ -134,16 +168,18 @@ def delete_ws_folder(ws, dir_name): # COMMAND ---------- import requests +from core.dbclient import SatDBClient + data_source_id ='' user_id = None DOMAIN = ws.deployment_url -TOKEN = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) - + + loggr.info(f"Looking for data_source_id for : {json_['sql_warehouse_id']}!") response = requests.get( 'https://%s/api/2.0/preview/sql/data_sources' % (DOMAIN), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json=None, timeout=60 ) @@ -164,9 +200,6 @@ def delete_ws_folder(ws, dir_name): adapter = HTTPAdapter(max_retries=retry) session.mount('http://', adapter) session.mount('https://', adapter) - -DOMAIN = ws.deployment_url -TOKEN = dbutils.secrets.get(json_['workspace_pat_scope'], ws.ws_token) loggr.info(f"Creating alerts on: {DOMAIN}!") #load alerts templates for each workspace @@ -177,7 +210,7 @@ def delete_ws_folder(ws, dir_name): response = requests.get( 'https://%s/api/2.0/preview/sql/alerts' % (DOMAIN), json = body, - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, timeout=60) alerts = response.json() found = False @@ -194,7 +227,7 @@ def delete_ws_folder(ws, dir_name): continue response = session.post( 'https://%s/api/2.0/preview/sql/queries' % (DOMAIN), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json={ "data_source_id":data_source_id, "name": "sat_alert_"+ws_to_load.workspace_id, @@ -293,7 +326,7 @@ def delete_ws_folder(ws, dir_name): if query_id is not None: response = session.post( 'https://%s/api/2.0/preview/sql/alerts' % (DOMAIN), - headers={'Authorization': 'Bearer %s' % TOKEN}, + headers={'Authorization': 'Bearer %s' % token}, json={ "name":"sat_alerts_"+ws_to_load.workspace_id+"", "options":{ diff --git a/notebooks/Setup/7. update_sat_check_configuration.py b/notebooks/Setup/7. update_sat_check_configuration.py index d37c669a..8ee61699 100644 --- a/notebooks/Setup/7. update_sat_check_configuration.py +++ b/notebooks/Setup/7. update_sat_check_configuration.py @@ -1,8 +1,11 @@ # Databricks notebook source - # MAGIC %md # MAGIC **Notebook name:** 7. update_sat_check_configuration. # MAGIC **Functionality:** Optional notebooks for updating SAT checks customization. 
+# MAGIC +# MAGIC + +# COMMAND ---------- # MAGIC %pip install PyYAML diff --git a/notebooks/Setup/gcp/configure_sa_auth_tokens.py b/notebooks/Setup/gcp/configure_sa_auth_tokens.py index b340df61..12ca28b6 100644 --- a/notebooks/Setup/gcp/configure_sa_auth_tokens.py +++ b/notebooks/Setup/gcp/configure_sa_auth_tokens.py @@ -43,8 +43,6 @@ workspace_pat_scope = json_['workspace_pat_scope'] tokenscope = json_['workspace_pat_token_prefix'] -ws_pat_token = dbutils.secrets.get(workspace_pat_scope, tokenscope+"-"+current_workspace) - master_name_scope = json_["master_name_scope"] master_name_key = json_["master_name_key"] @@ -190,15 +188,17 @@ def storeTokenAsSecret(deployment_url, scope, key, PAT_token, token): if gcp_workspace_url is None or identity_token is None or access_token is None: dbutils.notebook.exit("Failed to create the necessary tokens, please check your Service key file and the Impersonation service account ") +ws_temp_token = generateGCPWSToken(gcp_workspace_url ,dbutils.secrets.get(scope=json_['master_name_scope'], key='gs-path-to-json'),dbutils.secrets.get(scope=json_['master_name_scope'], key='impersonate-service-account')) + if gcp_workspace_url: if identity_token: loggr.info(f"Storing identity token :- scope:{master_name_scope}, key:{master_name_key}, on workpace:{gcp_workspace_url}") - storeTokenAsSecret(gcp_workspace_url,master_name_scope, master_name_key,ws_pat_token,identity_token) + storeTokenAsSecret(gcp_workspace_url,master_name_scope, master_name_key,ws_temp_token,identity_token) if access_token: loggr.info(f"Storing identity token :- scope:{master_pwd_scope}, key:{master_pwd_key}, on workpace:{gcp_workspace_url}") - storeTokenAsSecret(gcp_workspace_url,master_pwd_scope, master_pwd_key,ws_pat_token,access_token) + storeTokenAsSecret(gcp_workspace_url,master_pwd_scope, master_pwd_key,ws_temp_token,access_token) # COMMAND ---------- diff --git a/notebooks/Setup/gcp/configure_tokens_for_worksaces.py b/notebooks/Setup/gcp/configure_tokens_for_worksaces.py index 0406bd0a..54abd541 100644 --- a/notebooks/Setup/gcp/configure_tokens_for_worksaces.py +++ b/notebooks/Setup/gcp/configure_tokens_for_worksaces.py @@ -46,9 +46,7 @@ workspace_pat_scope = json_['workspace_pat_scope'] tokenscope = json_['workspace_pat_token_prefix'] -ws_pat_token = dbutils.secrets.get(workspace_pat_scope, tokenscope+"-"+current_workspace) -account_id = json_["account_id"] workspace_id = None try: @@ -169,20 +167,19 @@ def storeTokenAsSecret(gcp_workspace_url, scope, key, PAT_token, token): json=None, timeout=600 ) - +ws_temp_token = generateGCPWSToken(gcp_workspace_url ,dbutils.secrets.get(scope=json_['master_name_scope'], key='gs-path-to-json'),dbutils.secrets.get(scope=json_['master_name_scope'], key='impersonate-service-account')) if response.status_code == 200: loggr.info("Workspaces query successful!") workspaces = response.json() #generate rest of the workspace tokens and store them in the secret store of the main workspace - #Do not renew token for the current workspace as the PAT token is already provided via config #Renew the token for specified workspace or all workspaces based on the workspace_id value for ws in workspaces: - if((workspace_id is None and (str(ws['workspace_id']) != current_workspace) and (ws['workspace_status'] == 'RUNNING')) or (workspace_id is not None and (str(ws['workspace_id']) == workspace_id) and (str(ws['workspace_id']) != current_workspace))): + if((workspace_id is None and (ws['workspace_status'] == 'RUNNING')) or (workspace_id is not None and 
(str(ws['workspace_id']) == workspace_id))): deployment_url = "https://"+ ws['deployment_name']+'.'+cloud_type+'.databricks.com' loggr.info(f" Getting token for Workspace : {deployment_url}") token = generateToken(deployment_url) if token: - storeTokenAsSecret(gcp_workspace_url, workspace_pat_scope, tokenscope+"-"+str(ws['workspace_id']), ws_pat_token, token) + storeTokenAsSecret(gcp_workspace_url, workspace_pat_scope, tokenscope+"-"+str(ws['workspace_id']), ws_temp_token, token) else: loggr.info(f"Error querying workspace API. Check account tokens: {response}") diff --git a/notebooks/Utils/common.py b/notebooks/Utils/common.py index 566c34f4..43760694 100644 --- a/notebooks/Utils/common.py +++ b/notebooks/Utils/common.py @@ -243,73 +243,74 @@ def getWorkspaceConfig(): # This is needed only on bootstrap, subsequetly the database is the master copy of the user configuration # Every time the values are altered, the _user file can be regenerated - but it is more as FYI def readBestPracticesConfigsFile(): - import shutil - from os.path import exists - - import pandas as pd - - hostname = ( - dbutils.notebook.entry_point.getDbutils() - .notebook() - .getContext() - .apiUrl() - .getOrElse(None) - ) - cloud_type = getCloudType(hostname) - doc_url = cloud_type + "_doc_url" - - prefix = getConfigPath() - origfile = f"{prefix}/security_best_practices.csv" - - schema_list = [ - "id", - "check_id", - "category", - "check", - "evaluation_value", - "severity", - "recommendation", - "aws", - "azure", - "gcp", - "enable", - "alert", - "logic", - "api", - doc_url, - ] - - schema = """id int, check_id string,category string,check string, evaluation_value string,severity string, - recommendation string,aws int,azure int,gcp int,enable int,alert int, logic string, api string, doc_url string""" - - security_best_practices_pd = pd.read_csv( - origfile, header=0, usecols=schema_list - ).rename(columns={doc_url: "doc_url"}) - - security_best_practices = spark.createDataFrame( - security_best_practices_pd, schema - ).select( - "id", - "check_id", - "category", - "check", - "evaluation_value", - "severity", - "recommendation", - "doc_url", - "aws", - "azure", - "gcp", - "enable", - "alert", - "logic", - "api", - ) - - security_best_practices.write.format("delta").mode("overwrite").saveAsTable( - json_["analysis_schema_name"] + ".security_best_practices" - ) - display(security_best_practices) + security_best_practices_exists = spark.catalog.tableExists( f'{json_["analysis_schema_name"]}.security_best_practices') + if not security_best_practices_exists: + import shutil + from os.path import exists + + import pandas as pd + + hostname = ( + dbutils.notebook.entry_point.getDbutils() + .notebook() + .getContext() + .apiUrl() + .getOrElse(None) + ) + cloud_type = getCloudType(hostname) + doc_url = cloud_type + "_doc_url" + + prefix = getConfigPath() + origfile = f"{prefix}/security_best_practices.csv" + + schema_list = [ + "id", + "check_id", + "category", + "check", + "evaluation_value", + "severity", + "recommendation", + "aws", + "azure", + "gcp", + "enable", + "alert", + "logic", + "api", + doc_url, + ] + + schema = """id int, check_id string,category string,check string, evaluation_value int,severity string, + recommendation string,aws int,azure int,gcp int,enable int,alert int, logic string, api string, doc_url string""" + + security_best_practices_pd = pd.read_csv( + origfile, header=0, usecols=schema_list + ).rename(columns={doc_url: "doc_url"}) + + security_best_practices = spark.createDataFrame( + 
security_best_practices_pd, schema + ).select( + "id", + "check_id", + "category", + "check", + "evaluation_value", + "severity", + "recommendation", + "doc_url", + "aws", + "azure", + "gcp", + "enable", + "alert", + "logic", + "api", + ) + security_best_practices.write.format("delta").mode("overwrite").saveAsTable( + json_["analysis_schema_name"] + ".security_best_practices" + ) + display(security_best_practices) # COMMAND ---------- @@ -377,16 +378,6 @@ def basePath(): # COMMAND ---------- -"""%sql -CREATE DATABASE IF NOT EXISTS security_analysis; -CREATE TABLE IF NOT EXISTS security_analysis.run_number_table ( - runID BIGINT GENERATED ALWAYS AS IDENTITY, - check_time TIMESTAMP -) -USING DELTA""" - -# COMMAND ---------- - def create_schema(): df = spark.sql(f'CREATE DATABASE IF NOT EXISTS {json_["analysis_schema_name"]}') @@ -504,6 +495,41 @@ def create_workspace_run_complete_table(): ) +# COMMAND ---------- + +def generateGCPWSToken(deployment_url, cred_file_path,target_principal): + from google.oauth2 import service_account + import gcsfs + import json + gcp_accounts_url = 'https://accounts.gcp.databricks.com' + target_scopes = [deployment_url] + print(target_scopes) + # Reading gcs files with gcsfs + gcs_file_system = gcsfs.GCSFileSystem(project="gcp_project_name") + gcs_json_path = cred_file_path + with gcs_file_system.open(gcs_json_path) as f: + json_dict = json.load(f) + key = json.dumps(json_dict) + source_credentials = service_account.Credentials.from_service_account_info(json_dict,scopes=target_scopes) + from google.auth import impersonated_credentials + from google.auth.transport.requests import AuthorizedSession + + target_credentials = impersonated_credentials.Credentials( + source_credentials=source_credentials, + target_principal=target_principal, + target_scopes = target_scopes, + lifetime=36000) + + creds = impersonated_credentials.IDTokenCredentials( + target_credentials, + target_audience=deployment_url, + include_email=True) + + authed_session = AuthorizedSession(creds) + resp = authed_session.get(gcp_accounts_url) + return creds.token + + # COMMAND ---------- # For testing diff --git a/notebooks/Utils/initialize.py b/notebooks/Utils/initialize.py index b3d63eac..47fc3f3f 100644 --- a/notebooks/Utils/initialize.py +++ b/notebooks/Utils/initialize.py @@ -41,8 +41,11 @@ json_ = { "account_id": dbutils.secrets.get(scope="sat_scope", key="account-console-id"), "sql_warehouse_id": dbutils.secrets.get(scope="sat_scope", key="sql-warehouse-id"), - "analysis_schema_name": "security_analysis", + "analysis_schema_name": dbutils.secrets.get( + scope="sat_scope", key="analysis_schema_name" + ), "verbosity": "info", + "proxies": json.loads(dbutils.secrets.get(scope="sat_scope", key="proxies")), } # COMMAND ---------- @@ -135,10 +138,10 @@ # COMMAND ---------- -# Initialize best practices +# Initialize best practices readBestPracticesConfigsFile() # COMMAND ---------- -#Initialize sat dasf mapping +# Initialize sat dasf mapping load_sat_dasf_mapping() diff --git a/notebooks/diagnosis/sat_diagnosis_aws.py b/notebooks/diagnosis/sat_diagnosis_aws.py index d51448dd..3d0c29a1 100644 --- a/notebooks/diagnosis/sat_diagnosis_aws.py +++ b/notebooks/diagnosis/sat_diagnosis_aws.py @@ -56,8 +56,7 @@ dbutils.secrets.get(scope=json_['master_name_scope'], key='client-id') dbutils.secrets.get(scope=json_['master_name_scope'], key='client-secret') dbutils.secrets.get(scope=json_['master_name_scope'], key='use-sp-auth') - tokenkey = f"{json_['workspace_pat_token_prefix']}-{current_workspace}" - 
dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) + dbutils.secrets.get(scope=json_['master_name_scope'], key="analysis_schema_name") print("Your SAT configuration is has required secret names") except Exception as e: dbutils.notebook.exit(f'Your SAT configuration is missing required secret, please review setup instructions {e}') @@ -79,45 +78,10 @@ # COMMAND ---------- -# MAGIC %md -# MAGIC ### Check to see if the PAT token are valid - -# COMMAND ---------- - -import requests - -access_token = dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) - # Define the URL and headers workspaceUrl = spark.conf.get('spark.databricks.workspaceUrl') - -url = f'https://{workspaceUrl}/api/2.0/clusters/spark-versions' -headers = { - 'Authorization': f'Bearer {access_token}' -} - -# Make the GET request -response = requests.get(url, headers=headers) - -# Print the response -print(response.json()) - - -# COMMAND ---------- - -url = f'https://{workspaceUrl}/api/2.1/unity-catalog/models' -headers = { - 'Authorization': f'Bearer {access_token}' -} - -# Make the GET request -response = requests.get(url, headers=headers) - -# Print the response -print(response.json()) - -# COMMAND ---------- +import requests def getAWSTokenwithOAuth(source, baccount, client_id, client_secret): '''generates OAuth token for Service Principal authentication flow''' @@ -142,7 +106,7 @@ def getAWSTokenwithOAuth(source, baccount, client_id, client_secret): if response is not None and response.status_code == 200: return response.json()['access_token'] - LOGGR.debug(json.dumps(response.json())) + display(json.dumps(response.json())) return None @@ -161,7 +125,6 @@ def getAWSTokenwithOAuth(source, baccount, client_id, client_secret): import requests -access_token = dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) # Define the URL and headers workspaceUrl = spark.conf.get('spark.databricks.workspaceUrl') @@ -183,7 +146,7 @@ def getAWSTokenwithOAuth(source, baccount, client_id, client_secret): import requests -access_token = dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) + # Define the URL and headers workspaceUrl = spark.conf.get('spark.databricks.workspaceUrl') @@ -209,7 +172,7 @@ def getAWSTokenwithOAuth(source, baccount, client_id, client_secret): # MAGIC %sh # MAGIC -# MAGIC curl --header 'Authorization: Bearer ' -X GET 'https://sfe.cloud.databricks.com/api/2.0/clusters/spark-versions' +# MAGIC curl --header 'Authorization: Bearer ' -X GET 'https://.cloud.databricks.com/api/2.0/clusters/spark-versions' # COMMAND ---------- diff --git a/notebooks/diagnosis/sat_diagnosis_azure.py b/notebooks/diagnosis/sat_diagnosis_azure.py index 0dff97df..8e110ad0 100644 --- a/notebooks/diagnosis/sat_diagnosis_azure.py +++ b/notebooks/diagnosis/sat_diagnosis_azure.py @@ -58,8 +58,7 @@ dbutils.secrets.get(scope=json_['master_name_scope'], key='tenant-id') dbutils.secrets.get(scope=json_['master_name_scope'], key='client-id') dbutils.secrets.get(scope=json_['master_name_scope'], key='client-secret') - tokenkey = f"{json_['workspace_pat_token_prefix']}-{current_workspace}" - dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) + dbutils.secrets.get(scope=json_['master_name_scope'], key="analysis_schema_name") print("Your SAT configuration has required secret names") except Exception as e: dbutils.notebook.exit(f'Your SAT configuration is missing required secret, please review setup instructions {e}') @@ -79,33 +78,6 @@ print(" ".join(secretvalue)) -# COMMAND 
---------- - -# MAGIC %md -# MAGIC ### Check to see if the PAT token are valid - -# COMMAND ---------- - -import requests - -access_token = dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) - -# Define the URL and headers -workspaceUrl = spark.conf.get('spark.databricks.workspaceUrl') - - -url = f'https://{workspaceUrl}/api/2.0/clusters/spark-versions' -headers = { - 'Authorization': f'Bearer {access_token}' -} - -# Make the GET request -response = requests.get(url, headers=headers) - -# Print the response -print(response.json()) - - # COMMAND ---------- # MAGIC %md @@ -194,7 +166,7 @@ # Define the URL and headers DATABRICKS_ACCOUNT_ID = dbutils.secrets.get(scope=sat_scope, key="account-console-id") -url = f'https://accounts.azuredatabricks.net/api/2.0/accounts/{DATABRICKS_ACCOUNT_ID}' +url = f'https://accounts.azuredatabricks.net/api/2.0/accounts/{DATABRICKS_ACCOUNT_ID}/workspaces' ## Note: The access token should be generated for a SP which is an account admin to run this command. diff --git a/notebooks/diagnosis/sat_diagnosis_gcp.py b/notebooks/diagnosis/sat_diagnosis_gcp.py index 2abc49b0..ac5cce8f 100644 --- a/notebooks/diagnosis/sat_diagnosis_gcp.py +++ b/notebooks/diagnosis/sat_diagnosis_gcp.py @@ -39,14 +39,6 @@ -# COMMAND ---------- - -import json -#Get current workspace id -context = json.loads(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()) -current_workspace = context['tags']['orgId'] -print(current_workspace) - # COMMAND ---------- # MAGIC %md @@ -59,27 +51,19 @@ dbutils.secrets.get(scope=json_['master_name_scope'], key='sql-warehouse-id') dbutils.secrets.get(scope=json_['master_name_scope'], key='gs-path-to-json') dbutils.secrets.get(scope=json_['master_name_scope'], key='impersonate-service-account') - tokenkey = f"{json_['workspace_pat_token_prefix']}-{current_workspace}" - dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) + dbutils.secrets.get(scope=json_['master_name_scope'], key="analysis_schema_name") print("Your SAT configuration has required secret names") except Exception as e: dbutils.notebook.exit(f'Your SAT configuration is missing required secret, please review setup instructions {e}') # COMMAND ---------- -# MAGIC %md -# MAGIC ### Validate the following Values and make sure they are correct - -# COMMAND ---------- - -sat_scope = json_['master_name_scope'] +import requests,json -for key in dbutils.secrets.list(sat_scope): - if key.key == tokenkey or not key.key.startswith("sat-token-"): - print(key.key) - secretvalue = dbutils.secrets.get(scope=sat_scope, key=key.key) - print(" ".join(secretvalue)) +# Define the URL and headers +workspaceUrl = json.loads(dbutils.notebook.entry_point.getDbutils().notebook() \ + .getContext().toJson())['tags']['browserHostName'] # COMMAND ---------- @@ -90,25 +74,11 @@ import requests,json -access_token = dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) # Define the URL and headers workspaceUrl = json.loads(dbutils.notebook.entry_point.getDbutils().notebook() \ .getContext().toJson())['tags']['browserHostName'] - -url = f'https://{workspaceUrl}/api/2.0/clusters/spark-versions' -headers = { - 'Authorization': f'Bearer {access_token}' -} - -# Make the GET request -response = requests.get(url, headers=headers) - -# Print the response -print(response.json()) - - # COMMAND ---------- gcp_accounts_url = 'https://accounts.gcp.databricks.com' @@ -197,7 +167,7 @@ def getGCSAccessToken(cred_file_path,target_principal): import requests # Define the URL and 
headers -DATABRICKS_ACCOUNT_ID = dbutils.secrets.get(scope=sat_scope, key="account-console-id") +DATABRICKS_ACCOUNT_ID = dbutils.secrets.get(scope=json_['master_name_scope'], key="account-console-id") url = f'https://accounts.gcp.databricks.com/api/2.0/accounts/{DATABRICKS_ACCOUNT_ID}/workspaces' ## Note: The access token should be generated for a SP which is an account admin to run this command. @@ -274,7 +244,6 @@ def generateWSToken(deployment_url, cred_file_path,target_principal): import requests,json -access_token = dbutils.secrets.get(scope=json_['master_name_scope'], key=tokenkey) # Define the URL and headers workspaceUrl = json.loads(dbutils.notebook.entry_point.getDbutils().notebook() \ diff --git a/terraform/aws/TERRAFORM_AWS.md b/terraform/aws/TERRAFORM_AWS.md index 9653625e..18067c5e 100644 --- a/terraform/aws/TERRAFORM_AWS.md +++ b/terraform/aws/TERRAFORM_AWS.md @@ -1,5 +1,7 @@ ## Setting up Terraform +> **SAT v0.2.0 or higher** brings full support for Unity Catalog. Now you can pick your catalog instead of hive_metastore. Plus, you get to choose your own schema name. + Step 1: [Install Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) Step 2: [Install Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) on local machine @@ -8,12 +10,13 @@ Step 3: Git Clone Repo ```sh git clone https://github.com/databricks-industry-solutions/security-analysis-tool.git - ``` + ``` + Step 4: Change Directories ```sh cd security-analysis-tool/terraform// - ``` + ``` Step 5: Set values in `terraform.tfvars` file @@ -25,7 +28,13 @@ Further Documentation for some of the variables: [account_console_id](https://docs.databricks.com/administration-guide/account-settings/index.html#locate-your-account-id) -[workspace_PAT](https://docs.databricks.com/dev-tools/auth.html#personal-access-tokens-for-users) +> **Proxies are now supported as part of SAT. 
You can add your HTTP and HTTPS links to use your proxies.** +```json +{ + "http": "http://example.com", + "https": "https://example.com" +} +``` ## Run Terraform diff --git a/terraform/aws/provider.tf b/terraform/aws/provider.tf index 7bff95a2..21a07448 100644 --- a/terraform/aws/provider.tf +++ b/terraform/aws/provider.tf @@ -7,8 +7,9 @@ terraform { } provider "databricks" { - host = var.databricks_url - token = var.workspace_PAT + host = var.databricks_url + client_id = var.client_id + client_secret = var.client_secret } module "common" { @@ -16,4 +17,6 @@ module "common" { account_console_id = var.account_console_id workspace_id = var.workspace_id sqlw_id = var.sqlw_id + analysis_schema_name = var.analysis_schema_name + proxies = var.proxies } diff --git a/terraform/aws/template.tfvars b/terraform/aws/template.tfvars index 425691f0..4a251de9 100644 --- a/terraform/aws/template.tfvars +++ b/terraform/aws/template.tfvars @@ -2,14 +2,16 @@ databricks_url = "" workspace_PAT = "" workspace_id = "" account_console_id = "" +analysis_schema_name = "" #Uncomment and set sqlw_id if you have an existing SQL Warehouse you'd like to use, else leave it commented out #sqlw_id = "" ### AWS Specific Variables +client_id = "" // Databricks Service Principal Application ID +client_secret = "" //Databricks Service Principal ID Secret + -account_user = "" -account_pass = "" ### Azure Specific Variables diff --git a/terraform/aws/terraform.tfvars b/terraform/aws/terraform.tfvars index 751304c9..406871f7 100644 --- a/terraform/aws/terraform.tfvars +++ b/terraform/aws/terraform.tfvars @@ -1,10 +1,11 @@ -databricks_url = "" -workspace_PAT = "" -workspace_id = "" -account_console_id = "" +databricks_url = "" +workspace_id = "" +account_console_id = "" +analysis_schema_name = "" ### Databricks Service Principal -client_id = "" // Databricks Service Principal Application ID +client_id = "" // Databricks Service Principal Application ID client_secret = "" //Databricks Service Principal ID Secret +proxies = {} diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf index d1a4ee3d..14ac6f9c 100644 --- a/terraform/aws/variables.tf +++ b/terraform/aws/variables.tf @@ -8,11 +8,6 @@ variable "workspace_id" { type = string } -variable "workspace_PAT" { - description = "PAT should look like dapixxxxxxxxxxxxxxxxxxxx" - type = string -} - variable "account_console_id" { description = "Databricks Account Console ID" type = string @@ -59,3 +54,13 @@ variable "client_secret" { type = string default = "value" } + +variable "analysis_schema_name" { + type = string + description = "Name of the schema to be used for analysis" +} + +variable "proxies" { + type = map + description = "Proxies to be used for Databricks API calls" +} diff --git a/terraform/azure/TERRAFORM_Azure.md b/terraform/azure/TERRAFORM_Azure.md index 08ee7e13..a9ca42ee 100644 --- a/terraform/azure/TERRAFORM_Azure.md +++ b/terraform/azure/TERRAFORM_Azure.md @@ -1,5 +1,7 @@ ## Setting up Terraform +> **SAT v0.2.0 or higher** brings full support for Unity Catalog. Now you can pick your catalog instead of hive_metastore. Plus, you get to choose your own schema name. + Step 1: [Install Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) Step 2: [Install Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) on local machine @@ -39,6 +41,13 @@ Run the login command and sign in with your account credentials in the browser. az login ``` +> **Proxies are now supported as part of SAT. 
You can add your HTTP and HTTPS links to use your proxies.** +```json +{ + "http": "http://example.com", + "https": "https://example.com" +} +``` ## Run Terraform diff --git a/terraform/azure/provider.tf b/terraform/azure/provider.tf index 9274bf67..a94bb9d0 100644 --- a/terraform/azure/provider.tf +++ b/terraform/azure/provider.tf @@ -5,8 +5,7 @@ terraform { } provider "databricks" { - host = var.databricks_url -# auth_type = "azure-cli" + host = var.databricks_url } module "common" { @@ -14,4 +13,6 @@ module "common" { account_console_id = var.account_console_id workspace_id = var.workspace_id sqlw_id = var.sqlw_id + analysis_schema_name = var.analysis_schema_name + proxies = var.proxies } diff --git a/terraform/azure/template.tfvars b/terraform/azure/template.tfvars index c9935711..614821c0 100644 --- a/terraform/azure/template.tfvars +++ b/terraform/azure/template.tfvars @@ -2,6 +2,7 @@ databricks_url = "" workspace_PAT = "" workspace_id = "" account_console_id = "" +analysis_schema_name = "" #Uncomment and set sqlw_id if you have an existing SQL Warehouse you'd like to use, else leave it commented out #sqlw_id = "" diff --git a/terraform/azure/terraform.tfvars b/terraform/azure/terraform.tfvars index b1fcd810..4e726f52 100644 --- a/terraform/azure/terraform.tfvars +++ b/terraform/azure/terraform.tfvars @@ -1,10 +1,13 @@ databricks_url = "" workspace_id = "" account_console_id = "" +analysis_schema_name = "" ### Azure Specific Variables client_id = "" -tenant_id = "" client_secret = "" +tenant_id = "" subscription_id = "" + +proxies = {} diff --git a/terraform/azure/variables.tf b/terraform/azure/variables.tf index e0dfe626..e380b0bf 100644 --- a/terraform/azure/variables.tf +++ b/terraform/azure/variables.tf @@ -23,6 +23,17 @@ variable "sqlw_id" { default = "new" } +variable "analysis_schema_name" { + type = string + description = "Name of the schema to be used for analysis" +} + +variable "proxies" { + type = map + description = "Proxies to be used for Databricks API calls" +} + + ### Azure Specific Variables variable "client_id" { @@ -30,6 +41,11 @@ variable "client_id" { type = string } +variable "client_secret" { + description = "SP Secret" + type = string +} + variable "tenant_id" { description = "The Directory (tenant) ID for the application registered in Azure AD" type = string @@ -40,7 +56,4 @@ variable "subscription_id" { type = string } -variable "client_secret" { - description = "SP Secret" - type = string -} + diff --git a/terraform/common/secrets.tf b/terraform/common/secrets.tf index a27fbd71..30a35f63 100644 --- a/terraform/common/secrets.tf +++ b/terraform/common/secrets.tf @@ -8,17 +8,6 @@ resource "databricks_secret" "user_email" { scope = databricks_secret_scope.sat.id } -resource "databricks_token" "pat" { - lifetime_seconds = 86400 * 90 - comment = "Security Analysis Tool" -} - -resource "databricks_secret" "pat" { - key = "sat-token-${var.workspace_id}" - string_value = databricks_token.pat.token_value - scope = databricks_secret_scope.sat.id -} - resource "databricks_secret" "account_console_id" { key = "account-console-id" string_value = var.account_console_id @@ -30,3 +19,16 @@ resource "databricks_secret" "sql_warehouse_id" { string_value = var.sqlw_id == "new" ? 
databricks_sql_endpoint.new[0].id : data.databricks_sql_warehouse.old[0].id scope = databricks_secret_scope.sat.id } + +resource "databricks_secret" "analysis_schema_name" { + key = "analysis_schema_name" + string_value = var.analysis_schema_name + scope = databricks_secret_scope.sat.id +} + +resource "databricks_secret" "proxies" { + key = "proxies" + string_value = jsonencode(var.proxies) + scope = databricks_secret_scope.sat.id +} + diff --git a/terraform/common/variables.tf b/terraform/common/variables.tf index 9a992830..da59e2d7 100644 --- a/terraform/common/variables.tf +++ b/terraform/common/variables.tf @@ -34,3 +34,13 @@ variable "gcp_impersonate_service_account" { description = "GCP Service Account to impersonate (e.g. xyz-sa-2@project.iam.gserviceaccount.com)" default = "" } + +variable "analysis_schema_name" { + type = string + description = "Name of the schema to be used for analysis" +} + +variable "proxies" { + type = map + description = "Proxies to be used for Databricks API calls" +} diff --git a/terraform/gcp/TERRAFORM_GCP.md b/terraform/gcp/TERRAFORM_GCP.md index 4afe5e5d..fff77cf8 100644 --- a/terraform/gcp/TERRAFORM_GCP.md +++ b/terraform/gcp/TERRAFORM_GCP.md @@ -1,5 +1,7 @@ ## Setting up Terraform +> **SAT v0.2.0 or higher** brings full support for Unity Catalog. Now you can pick your catalog instead of hive_metastore. Plus, you get to choose your own schema name. + Step 1: [Install Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) Step 2: [Install Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) on local machine @@ -26,10 +28,16 @@ Further Documentation for some of the variables: [account_console_id](https://docs.gcp.databricks.com/administration-guide/account-settings/#locate-your-account-id) -[workspace_PAT](https://docs.gcp.databricks.com/dev-tools/auth.html#personal-access-tokens-for-users) - [GCP Specific variables](../../docs/setup.md#authentication-information) and navigate to the GCP section +> **Proxies are now supported as part of SAT. 
You can add your HTTP and HTTPS links to use your proxies.** +```json +{ + "http": "http://example.com", + "https": "https://example.com" +} +``` + ## Run Terraform Step 6: Terraform [Init](https://developer.hashicorp.com/terraform/cli/commands/init) diff --git a/terraform/gcp/provider.tf b/terraform/gcp/provider.tf index 7ed2259c..0b9e6921 100644 --- a/terraform/gcp/provider.tf +++ b/terraform/gcp/provider.tf @@ -7,8 +7,9 @@ terraform { } provider "databricks" { - host = var.databricks_url - token = var.workspace_PAT + host = var.databricks_url + client_id = var.client_id + client_secret = var.client_secret } module "common" { @@ -17,4 +18,6 @@ module "common" { workspace_id = var.workspace_id sqlw_id = var.sqlw_id gcp_impersonate_service_account = var.impersonate_service_account + analysis_schema_name = var.analysis_schema_name + proxies = var.proxies } diff --git a/terraform/gcp/template.tfvars b/terraform/gcp/template.tfvars index 4cc2487a..5e0e92f0 100644 --- a/terraform/gcp/template.tfvars +++ b/terraform/gcp/template.tfvars @@ -1,8 +1,9 @@ databricks_url = "" -workspace_PAT = "" workspace_id = "" account_console_id = "" +analysis_schema_name = "" +proxies = {} #Uncomment and set sqlw_id if you have an existing SQL Warehouse you'd like to use, else leave it commented out #sqlw_id = "" @@ -20,5 +21,7 @@ account_console_id = "" ### GCP Specific Variables +client_id = "" +client_secret = "" gs_path_to_json = "" -impersonate_service_account = "" +impersonate_service_account = "" \ No newline at end of file diff --git a/terraform/gcp/terraform.tfvars b/terraform/gcp/terraform.tfvars index 305becd4..c44a73eb 100644 --- a/terraform/gcp/terraform.tfvars +++ b/terraform/gcp/terraform.tfvars @@ -1,9 +1,14 @@ databricks_url = "" -workspace_PAT = "" workspace_id = "" account_console_id = "" +analysis_schema_name = "" +proxies = {} ### GCP Specific Variables - -gs_path_to_json = "" +client_id = "" +client_secret = "" +gs_path_to_json = "" impersonate_service_account = "" + + + diff --git a/terraform/gcp/variables.tf b/terraform/gcp/variables.tf index 4d314550..e75ca462 100644 --- a/terraform/gcp/variables.tf +++ b/terraform/gcp/variables.tf @@ -8,11 +8,6 @@ variable "workspace_id" { type = string } -variable "workspace_PAT" { - description = "PAT should look like dapixxxxxxxxxxxxxxxxxxxx" - type = string -} - variable "account_console_id" { description = "Databricks Account Console ID" type = string @@ -28,8 +23,28 @@ variable "sqlw_id" { } } +variable "analysis_schema_name" { + type = string + description = "Name of the schema to be used for analysis" +} + +variable "proxies" { + type = map + description = "Proxies to be used for Databricks API calls" +} + ### GCP Specific Variables +variable "client_id" { + description = "Service Principal (client) ID" + type = string +} + +variable "client_secret" { + description = "SP Secret" + type = string +} + variable "gs_path_to_json" { type = string description = "File path to this resource in Cloud Storage (as gs:////.json)"
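# --- Editor's note: illustrative sketch, not part of this diff ------------------
# With workspace_PAT removed, a filled-in terraform/gcp/terraform.tfvars would
# look roughly like the lines below. Every value is a placeholder; only the
# variable names are taken from this file and from template.tfvars.
#
#   databricks_url              = "https://<workspace>.gcp.databricks.com"
#   workspace_id                = "<workspace-id>"
#   account_console_id          = "<databricks-account-id>"
#   analysis_schema_name        = "security_analysis"
#   proxies                     = {} # or { http = "http://proxy:3128", https = "http://proxy:3128" }
#   client_id                   = "<service-principal-application-id>"
#   client_secret               = "<service-principal-secret>"
#   gs_path_to_json             = "gs://<bucket>/<path>/<sa-key>.json"
#   impersonate_service_account = "<sa-name>@<project>.iam.gserviceaccount.com"
# --------------------------------------------------------------------------------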