Skip to content

Commit

Permalink
tech detections per origins
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko committed Dec 20, 2024
1 parent d0c9143 commit a11e404
Showing 1 changed file with 88 additions and 71 deletions.
159 changes: 88 additions & 71 deletions definitions/output/wappalyzer/tech_detections.js
Original file line number Diff line number Diff line change
@@ -1,127 +1,144 @@
/*
 * tech_detections: month-over-month technology detection summary.
 *
 * Publishes the tech_detections dataset in the wappalyzer schema, partitioned
 * by date and tagged crawl_complete. The path and the publish() call suggest
 * this is a Dataform definition -- TODO confirm.
 *
 * What the query does:
 *   - source: distinct (date, root_page, technology) rows from crawl.pages
 *     with the technologies array unnested, limited to the previous and
 *     current month plus constants.devRankFilter.
 *   - tech_before / tech_current: the previous-month and current-month slices.
 *   - tech_before_summary: distinct count per technology for the previous month.
 *   - adopted_existing / adopted_new: technologies seen this month but not last
 *     month, split by whether the root page was present last month at all.
 *   - deprecated_existing / deprecated_gone: technologies seen last month but
 *     not this month, split by whether the root page is still present.
 *   - final SELECT: joins the summaries per technology; deprecated counts are
 *     emitted as negative numbers (0 - count) and samples default to "".
 *
 * The preOps step deletes rows whose date equals constants.currentMonth from
 * the target before the query runs.
 *
 * NOTE(review): this text is a commit diff rendered without +/- markers, so
 * many statements appear twice (two description keys, two type keys, paired
 * WHERE lines, page-based and origin-based column names). The page-based
 * lines appear to be the removed version and the origin-based lines the added
 * version -- confirm against the repository before running any of it.
 *
 * NOTE(review): the final ORDER BY names total_pages_persisted, while the
 * origin-based SELECT aliases that value as total_origins_persisted. If the
 * origin-based lines are the current version, the ORDER BY name would not
 * resolve -- confirm and rename.
 *
 * NOTE(review): every join in the final SELECT keys on
 * before_summary.technology. For a technology absent from tech_before_summary
 * that key is NULL, so rows coming from the two adopted CTEs and from
 * wappalyzer.apps for the same technology look like they would stay on
 * separate output rows instead of merging -- verify against real output.
 */
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_detections', {
type: 'table',
description: 'Used in dashboard: https://lookerstudio.google.com/u/7/reporting/1jh_ScPlCIbSYTf2r2Y6EftqmX9SQy4Gn/page/p_an38lbzywc/edit',
description: 'Used in dashboard: https://lookerstudio.google.com/u/7/reporting/1jh_ScPlCIbSYTf2r2Y6EftqmX9SQy4Gn/origin/p_an38lbzywc/edit',
schema: 'wappalyzer',
type: 'incremental',
protected: true,
bigquery: {
partitionBy: 'date',
},
tags: ['crawl_complete']
}).query(ctx => `
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.currentMonth}';
`).query(ctx => `
WITH source AS (
SELECT DISTINCT
date,
root_page AS page,
root_page AS origin,
tech.technology
FROM ${ctx.ref('crawl', 'pages')},
UNNEST(technologies) AS tech
WHERE date >= "${pastMonth}" ${constants.devRankFilter}
WHERE date IN ('${pastMonth}', '${constants.currentMonth}') ${constants.devRankFilter}
),
-- Technology in the previous month (August)
-- Technology in the previous month
tech_before AS (
SELECT
page,
origin,
technology
FROM source
WHERE date = "${pastMonth}"
WHERE date = '${pastMonth}'
),
-- Technology in the current month (September)
-- Technology in the current month
tech_current AS (
SELECT
page,
origin,
technology
FROM source
WHERE date = "${constants.currentMonth}"
WHERE date = '${constants.currentMonth}'
),
-- Summary of technology and categories per page in the previous month
-- Summary of technology per origin in the previous month
tech_before_summary AS (
SELECT
technology,
COUNT(DISTINCT page) AS total_pages_before
COUNT(DISTINCT origin) AS total_origins_before
FROM tech_before
GROUP BY technology
),
-- Pages that existed last month but introduced the technology in the current month
tech_introduced_existing_pages AS (
-- origins that persisted across both months and adopted the technology in the current month
tech_adopted_existing_origins AS (
SELECT
tech_current.technology,
COUNT(DISTINCT tech_current.page) AS total_pages_introduced_existing,
STRING_AGG(DISTINCT tech_current.page LIMIT 5) AS sample_pages_introduced_existing
FROM tech_current
JOIN tech_before
USING (page)
persisted_origins.technology,
COUNT(DISTINCT persisted_origins.origin) AS total_origins_adopted_existing,
STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_adopted_existing
FROM (
SELECT DISTINCT
tech_current.technology,
tech_current.origin
FROM tech_before
JOIN tech_current
USING (origin)
) as persisted_origins
LEFT JOIN tech_before AS tb
ON tech_current.page = tb.page AND tech_current.technology = tb.technology
WHERE tb.page IS NULL -- Technology was not detected last month
GROUP BY tech_current.technology
ON persisted_origins.origin = tb.origin AND persisted_origins.technology = tb.technology
WHERE tb.origin IS NULL -- Technology was not detected last month
GROUP BY 1
),
-- Pages that were not in the dataset last month but appeared this month with the technology
tech_introduced_new_pages AS (
-- origins that arrived to CrUX in the current month and their detected technologies
tech_adopted_new_origins AS (
SELECT
tech_current.technology,
COUNT(DISTINCT tech_current.page) AS total_pages_introduced_new,
STRING_AGG(DISTINCT tech_current.page LIMIT 5) AS sample_pages_introduced_new
COUNT(DISTINCT tech_current.origin) AS total_origins_adopted_new,
--STRING_AGG(DISTINCT tech_current.origin LIMIT 5) AS sample_origins_adopted_new
FROM tech_current
LEFT JOIN tech_before
USING (page)
WHERE tech_before.page IS NULL -- Page was not present last month
GROUP BY tech_current.technology
USING (origin)
WHERE tech_before.origin IS NULL -- origin was not present last month
GROUP BY 1
),
-- Pages that existed this month but no longer have the technology
tech_deprecated_existing_pages AS (
-- origins that persisted across both months and deprecated the technology usage in the current month
tech_deprecated_existing_origins AS (
SELECT
tech_before.technology,
COUNT(DISTINCT tech_before.page) AS total_pages_deprecated_existing,
STRING_AGG(DISTINCT tech_before.page LIMIT 5) AS sample_pages_deprecated_existing
FROM tech_before
JOIN tech_current
USING (page)
persisted_origins.technology,
COUNT(DISTINCT persisted_origins.origin) AS total_origins_deprecated_existing,
STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_deprecated_existing
FROM (
SELECT DISTINCT
tech_before.technology,
tech_before.origin
FROM tech_before
JOIN tech_current
USING (origin)
) as persisted_origins
LEFT JOIN tech_current AS tc
ON tech_before.page = tc.page AND tech_before.technology = tc.technology
WHERE tc.page IS NULL -- Technology is not detected in the current month
GROUP BY tech_before.technology
ON persisted_origins.origin = tc.origin AND persisted_origins.technology = tc.technology
WHERE tc.origin IS NULL -- Technology is not detected in the current month
GROUP BY 1
),
-- Pages that no longer exist in the current dataset
tech_deprecated_gone_pages AS (
-- origins that were dropped from CrUX in the current dataset, and thus the technology was not detected anymore
tech_deprecated_gone_origins AS (
SELECT
tech_before.technology,
COUNT(DISTINCT tech_before.page) AS total_pages_deprecated_gone,
STRING_AGG(DISTINCT tech_before.page LIMIT 5) AS sample_pages_deprecated_gone
COUNT(DISTINCT tech_before.origin) AS total_origins_deprecated_gone,
--STRING_AGG(DISTINCT tech_before.origin LIMIT 5) AS sample_origins_deprecated_gone
FROM tech_before
LEFT JOIN tech_current
USING (page)
WHERE tech_current.page IS NULL -- Page no longer exists in current dataset
GROUP BY tech_before.technology
USING (origin)
WHERE tech_current.origin IS NULL -- origin no longer exists in current dataset
GROUP BY 1
)
-- Final aggregation and comparison of technology adoption/deprecation metrics
-- aggregation of technology adoption/deprecation metrics
SELECT
COALESCE(before_summary.technology, tech_introduced_existing_pages.technology, tech_introduced_new_pages.technology, apps.name) AS technology,
DATE('${constants.currentMonth}') AS date,
COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, apps.name) AS technology,
-- Pages summary
0-COALESCE(total_pages_deprecated_existing, 0) AS total_pages_deprecated_existing,
0-COALESCE(total_pages_deprecated_gone, 0) AS total_pages_deprecated_gone,
-- origins summary
0-COALESCE(total_origins_deprecated_existing, 0) AS total_origins_deprecated_existing,
0-COALESCE(total_origins_deprecated_gone, 0) AS total_origins_deprecated_gone,
COALESCE(total_pages_before, 0) - COALESCE(total_pages_deprecated_existing, 0) - COALESCE(total_pages_deprecated_gone, 0) AS total_pages_persisted,
COALESCE(total_origins_before, 0) - COALESCE(total_origins_deprecated_existing, 0) - COALESCE(total_origins_deprecated_gone, 0) AS total_origins_persisted,
COALESCE(total_pages_introduced_existing, 0) AS total_pages_introduced_existing,
COALESCE(total_pages_introduced_new, 0) AS total_pages_introduced_new,
COALESCE(total_origins_adopted_existing, 0) AS total_origins_adopted_existing,
COALESCE(total_origins_adopted_new, 0) AS total_origins_adopted_new,
-- Sample pages
COALESCE(sample_pages_deprecated_existing, "") AS sample_pages_deprecated_existing,
COALESCE(sample_pages_deprecated_gone, "") AS sample_pages_deprecated_gone,
-- Sample origins
COALESCE(sample_origins_deprecated_existing, "") AS sample_origins_deprecated_existing,
--COALESCE(sample_origins_deprecated_gone, "") AS sample_origins_deprecated_gone,
COALESCE(tech_introduced_existing_pages.sample_pages_introduced_existing, "") AS sample_pages_introduced_existing,
COALESCE(tech_introduced_new_pages.sample_pages_introduced_new, "") AS sample_pages_introduced_new
COALESCE(tech_adopted_existing_origins.sample_origins_adopted_existing, "") AS sample_origins_adopted_existing,
--COALESCE(tech_adopted_new_origins.sample_origins_adopted_new, "") AS sample_origins_adopted_new
FROM tech_before_summary before_summary
FULL OUTER JOIN tech_introduced_existing_pages
ON before_summary.technology = tech_introduced_existing_pages.technology
FULL OUTER JOIN tech_introduced_new_pages
ON before_summary.technology = tech_introduced_new_pages.technology
LEFT JOIN tech_deprecated_existing_pages
ON before_summary.technology = tech_deprecated_existing_pages.technology
LEFT JOIN tech_deprecated_gone_pages
ON before_summary.technology = tech_deprecated_gone_pages.technology
FULL OUTER JOIN tech_adopted_existing_origins
ON before_summary.technology = tech_adopted_existing_origins.technology
FULL OUTER JOIN tech_adopted_new_origins
ON before_summary.technology = tech_adopted_new_origins.technology
LEFT JOIN tech_deprecated_existing_origins
ON before_summary.technology = tech_deprecated_existing_origins.technology
LEFT JOIN tech_deprecated_gone_origins
ON before_summary.technology = tech_deprecated_gone_origins.technology
FULL OUTER JOIN wappalyzer.apps
ON before_summary.technology = apps.name
ORDER BY total_pages_persisted DESC
`)

0 comments on commit a11e404

Please sign in to comment.