Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SPE-248 weekly script to find deprecated spells to drop in metastore #4953

Closed
wants to merge 11 commits into the base branch from the contributor's branch
51 changes: 51 additions & 0 deletions .github/workflows/drop_deprecated_spells.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# .github/workflows/drop_deprecated_spells.yml
#
# Weekly job that finds deprecated spells (abstractions present in the Hive
# metastore but no longer defined as dbt models) and prints DROP statements.
name: Drop Deprecated Spells

# FIX: the original file declared `on:` twice at the top level. Duplicate
# mapping keys are invalid YAML, and parsers that tolerate them keep only the
# last occurrence — so the pull_request trigger was dead configuration anyway.
# Merged into a single trigger block matching the effective behavior:
# scheduled weekly run plus manual dispatch.
on:
  schedule:
    - cron: '0 14 * * 1' # At 9 AM EST every Monday (14:00 UTC)
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  run-script:
    runs-on: [ self-hosted, linux, spellbook-trino-ci ]
    timeout-minutes: 10 # Timeout set to 10 minutes

    steps:
      - name: Check out repository code
        uses: actions/checkout@v3
        with:
          ref: main # always run against main regardless of trigger

      - name: Setup Python environment
        uses: actions/setup-python@v3
        with:
          python-version: '3.9' # or whichever version your script requires

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install psycopg2-binary # Add other dependencies here
          # NOTE(review): the script shells out to `psql` and `dbt`, neither of
          # which is installed here — confirm both exist on the self-hosted runner.

      - name: Set environment variables
        run: |
          echo "SH_METASTORE_PROD_PASS=${{ secrets.SH_METASTORE_PROD_PASS }}" >> $GITHUB_ENV
          # Set any other necessary environment variables here

      - name: Run Python script
        # FIX: the script lives under scripts/ (see the diff header), not the
        # repository root — the original path would fail with "No such file".
        run: python scripts/drop_deprecated_spells.py
109 changes: 109 additions & 0 deletions scripts/drop_deprecated_spells.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import subprocess
import json
import os

def _generate_spells_query(tbl_type):
    """Build the metastore SQL listing spellbook-owned abstractions.

    Shared by the table and view variants below, which were previously
    copy-paste duplicates differing only in the TBL_TYPE filter.

    Parameters
    ----------
    tbl_type : str
        Hive table type to filter on: 'EXTERNAL_TABLE' or 'VIRTUAL_VIEW'.

    Returns
    -------
    str
        A SELECT producing distinct 'schema.table' names, one per row.
    """
    return f"""
    SELECT DISTINCT d."NAME" || '.' || t."TBL_NAME"
    FROM "TBLS" t
    JOIN "DBS" d ON d."DB_ID" = t."DB_ID"
    JOIN "TABLE_PARAMS" tp ON tp."TBL_ID" = t."TBL_ID"
    WHERE tp."PARAM_KEY" = 'dune.data_explorer.category'
    AND tp."PARAM_VALUE" = 'abstraction'
    AND t."OWNER_TYPE" = 'USER'
    AND t."OWNER" = 'spellbook'
    AND t."TBL_TYPE" = '{tbl_type}';
    """


def generate_tables_query():
    """Return the query listing spellbook abstraction *tables* in the metastore."""
    return _generate_spells_query('EXTERNAL_TABLE')


def generate_views_query():
    """Return the query listing spellbook abstraction *views* in the metastore."""
    return _generate_spells_query('VIRTUAL_VIEW')

def run_psql_command(sql_query):
    """Run *sql_query* against the production Hive metastore via psql.

    Parameters
    ----------
    sql_query : str
        SQL statement passed to ``psql -c``. Output is tuples-only (``-t``),
        one value per line.

    Returns
    -------
    list[str]
        Raw (unstripped) stdout lines from psql; an empty list when the
        password is missing or psql exits non-zero.
    """
    postgres_host = "prod-metastore-db"
    postgres_password = os.environ.get("SH_METASTORE_PROD_PASS")

    # FIX: a missing secret made dict(os.environ, PGPASSWORD=None) raise a
    # TypeError (environment values must be strings). Fail with a clear
    # message instead.
    if not postgres_password:
        print("Error: SH_METASTORE_PROD_PASS environment variable is not set")
        return []

    psql_command = [
        'psql',
        '-h', postgres_host,
        '-p', '5432',
        '-U', 'hive',
        '-t',  # tuples only: suppress headers and row-count footer
        '-c', sql_query
    ]

    psql_process = subprocess.run(
        psql_command,
        text=True,
        env=dict(os.environ, PGPASSWORD=postgres_password),
        capture_output=True
    )

    if psql_process.returncode != 0:
        print("Error executing psql command:")
        print("psql_process.stderr:", psql_process.stderr)
        return []

    result_lines = psql_process.stdout.splitlines()

    # FIX: guard against empty stdout — result_lines[-1] raised IndexError
    # when psql produced no output at all.
    if result_lines and not result_lines[-1]:
        result_lines.pop()

    return result_lines

def _dbt_models_by_kind():
    """Return (view_names, table_names) as sets of 'schema.alias' from `dbt ls`.

    Runs ``dbt ls --resource-type model --output json`` and buckets every
    model by its materialization: 'view' vs 'table'/'incremental'.
    """
    dbt_command = ['dbt', 'ls', '--resource-type', 'model', '--output', 'json']
    dbt_output_str = subprocess.check_output(dbt_command).decode('utf-8')

    # dbt may emit log lines before/after the JSON objects; keep JSON only.
    dbt_data_list = [
        json.loads(line)
        for line in dbt_output_str.splitlines()
        if line.strip().startswith('{')
    ]

    view_models_dbt = set()
    table_models_dbt = set()
    for data in dbt_data_list:
        config = data.get('config', {})
        materialized = config.get('materialized', '').lower()
        name = f"{config.get('schema', 'schema_not_found')}.{config.get('alias', 'alias_not_found')}"
        if materialized == 'view':
            view_models_dbt.add(name)
        elif materialized in ('table', 'incremental'):
            table_models_dbt.add(name)
    return view_models_dbt, table_models_dbt


def main():
    """Print DROP statements for metastore spells no longer defined in dbt."""
    # FIX: using sets makes each membership test O(1); the original scanned a
    # list per metastore object, an O(n*m) comparison overall.
    view_models_dbt, table_models_dbt = _dbt_models_by_kind()

    # Current metastore state, whitespace-trimmed 'schema.table' names.
    psql_tables = [table.strip() for table in run_psql_command(generate_tables_query())]
    psql_views = [view.strip() for view in run_psql_command(generate_views_query())]

    # Anything in the metastore but absent from dbt is a deprecated spell.
    print("\nViews in PostgreSQL but not in DBT:")
    for view in psql_views:
        if view not in view_models_dbt:
            print(f"DROP VIEW IF EXISTS {view};")

    print("\nTables in PostgreSQL but not in DBT:")
    for table in psql_tables:
        if table not in table_models_dbt:
            print(f"DROP TABLE IF EXISTS {table};")


# FIX: guard the entry point so importing this module (e.g. from tests) does
# not immediately shell out to dbt/psql; the original ran at import time.
if __name__ == "__main__":
    main()
Loading