Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate views navigation metrics to ga4 bq #2005

Merged
merged 4 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions app/domain/etl/aggregations/monthly.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,18 @@ def aggregation_query
dimensions_edition_id,
pviews,
upviews,
entrances,
searches,
feedex,
satisfaction,
useful_yes,
useful_no,
exits,
updated_at,
created_at
)
SELECT '#{month.id}',
max(dimensions_edition_id),
sum(pviews),
sum(upviews),
sum(entrances),
sum(searches),
sum(feedex),
CASE
Expand All @@ -71,7 +68,6 @@ def aggregation_query
END AS satisfaction,
sum(useful_yes),
sum(useful_no),
sum(exits),
now(),
now()
FROM facts_metrics
Expand Down
25 changes: 25 additions & 0 deletions app/domain/etl/ga/bigquery.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require "google/cloud/bigquery"
require "googleauth"

# Factory for a Google Cloud BigQuery client authenticated with service-account
# credentials taken from the environment.
#
# Environment variables read:
# * BIGQUERY_PROJECT      - passed as the client's project_id
# * BIGQUERY_CLIENT_EMAIL - "client_email" entry of the generated JSON key
# * BIGQUERY_PRIVATE_KEY  - "private_key" entry of the generated JSON key
class Etl::GA::Bigquery
  include Google::Auth

  # Shorthand for `new.build`.
  def self.build
    new.build
  end

  # Assembles an in-memory JSON key from the environment, turns it into
  # service-account credentials limited to the BigQuery scope, and hands those
  # to Google::Cloud::Bigquery.new.
  #
  # @return the value of Google::Cloud::Bigquery.new
  #   (presumably a Google::Cloud::Bigquery::Project - confirm against the gem docs)
  def build
    key_json = {
      "client_email" => ENV["BIGQUERY_CLIENT_EMAIL"],
      "private_key" => ENV["BIGQUERY_PRIVATE_KEY"],
    }.to_json

    # make_creds takes an IO, so the JSON string is wrapped rather than written to disk.
    service_account = Google::Auth::ServiceAccountCredentials.make_creds(
      json_key_io: StringIO.new(key_json),
      scope: ["https://www.googleapis.com/auth/bigquery"],
    )

    Google::Cloud::Bigquery.new(
      project_id: ENV["BIGQUERY_PROJECT"],
      credentials: service_account,
    )
  end
end
10 changes: 1 addition & 9 deletions app/domain/etl/ga/views_and_navigation_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,10 @@ def load_metrics_query(date_to_s)
<<~SQL
UPDATE facts_metrics
SET upviews = s.upviews,
pviews = s.pviews,
entrances = s.entrances,
exits = s.exits,
bounces = s.bounces,
page_time = s.page_time
pviews = s.pviews
FROM (
SELECT pviews,
upviews,
entrances,
exits,
bounces,
page_time,
dimensions_editions.id
FROM events_gas, dimensions_editions
WHERE page_path = LOWER(base_path)
Expand Down
92 changes: 22 additions & 70 deletions app/domain/etl/ga/views_and_navigation_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,39 +8,20 @@ def self.find_in_batches(*args, **kwargs, &block)
def find_in_batches(date:, batch_size: 10_000, &block)
fetch_data(date:)
.lazy
.map(&:to_h)
.flat_map(&method(:extract_rows))
.map(&method(:extract_dimensions_and_metrics))
.map(&:stringify_keys)
.map(&method(:append_data_labels))
.reject(&method(:invalid_record?))
.map { |hash| set_date(hash, date) }
.each_slice(batch_size, &block)
end

def client
@client ||= Etl::GA::Client.build
@client ||= Etl::GA::Bigquery.build
end

private

# Stamps +hash+ in place with the given date under the "date" key.
#
# @param hash [Hash] record to mutate
# @param date [#strftime] date to store, formatted with "%F" (YYYY-MM-DD)
# @return [Hash] the same +hash+ object that was passed in
def set_date(hash, date)
  hash.tap { |record| record["date"] = date.strftime("%F") }
end

def append_data_labels(values)
page_path, pviews, upviews, entrances, exits, bounces, page_time = *values

{
"page_path" => page_path,
"pviews" => pviews,
"upviews" => upviews,
"process_name" => "views",
"entrances" => entrances,
"exits" => exits,
"bounces" => bounces,
"page_time" => page_time,
}
def append_data_labels(hash)
hash.merge("process_name" => "views")
end

def invalid_record?(data)
Expand All @@ -49,53 +30,24 @@ def invalid_record?(data)
true
end

# Flattens one report row into a single array: the row's dimensions followed
# by every metric value converted with #to_i.
#
# @param row [Hash] must contain :dimensions and :metrics; each metric must
#   contain :values
# @return [Array] dimensions + integer metric values
# @raise [KeyError] when :dimensions, :metrics or a metric's :values is missing
def extract_dimensions_and_metrics(row)
  # :dimensions is fetched first so a row missing both keys reports :dimensions.
  row.fetch(:dimensions) + row.fetch(:metrics).flat_map { |entry|
    entry.fetch(:values).map { |value| value.to_i }
  }
end

# Returns the :rows entry of a report.
#
# @param report [Hash] report containing a :rows key
# @raise [KeyError] when :rows is absent
def extract_rows(report) = report.fetch(:rows)

# Fetches the report for +date+ through the client's paginated fetch_all helper.
#
# The result is memoised in @fetch_data, so once a truthy result is stored a
# later call (even with a different date) returns that stored value.
#
# @param date the day to request; forwarded to build_request
def fetch_data(date:)
  @fetch_data ||= client.fetch_all(items: :data) do |page_token, service|
    # One report request per page, carrying the page token supplied by fetch_all.
    paged_request = Google::Apis::AnalyticsreportingV4::GetReportsRequest.new(
      report_requests: [build_request(date:).merge(page_token:)],
    )

    # Only one report request is sent, so only the first report is used.
    service.batch_get_reports(paged_request).reports.first
  end
end

def build_request(date:)
{
date_ranges: [
{ start_date: date.to_fs("%Y-%m-%d"), end_date: date.to_fs("%Y-%m-%d") },
],
dimensions: [
{ name: "ga:pagePath" },
],
hide_totals: true,
hide_value_ranges: true,
metrics: [
{ expression: "ga:pageviews" },
{ expression: "ga:uniquePageviews" },
{ expression: "ga:entrances" },
{ expression: "ga:exits" },
{ expression: "ga:avgTimeOnPage" },
{ expression: "ga:bounces" },
{ expression: "ga:timeOnPage" },
],
page_size: 10_000,
view_id: ENV["GOOGLE_ANALYTICS_GOVUK_VIEW_ID"],
}
@fetch_data ||= client
@date = date.strftime("%Y-%m-%d")

query = <<~SQL
WITH CTE1 AS (
SELECT *
FROM `govuk-content-data.dataform.GA4 dataform`
WHERE the_date = @date
)
SELECT
cleaned_page_location AS page_path,
unique_page_views AS upviews,
total_page_views AS pviews,
the_date AS date,
FROM CTE1
SQL

@fetch_data.query(query, params: { date: @date }).all
end
end
8 changes: 0 additions & 8 deletions config/metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,6 @@ daily:
description: "Number of times a user searches for something else on GOV.UK from the page"
source: 'Google Analytics'

- name: 'entrances'
description: "Number of session that began with this page"
source: 'Google Analytics'

- name: 'exits'
description: "Number of session that ended with this page"
source: 'Google Analytics'

edition:
- name: 'pdf_count'
description: "Number of .pdf attachments"
Expand Down
44 changes: 44 additions & 0 deletions spec/domain/etl/ga/bigquery_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Specs for the BigQuery client factory: checks the type of the built client
# and that credentials/scope are taken from the environment.
RSpec.describe Etl::GA::Bigquery do
  describe "Connecting to the Google Cloud BigQuery API" do
    # Built lazily, so examples that never reference `subject` do not build a client.
    subject { described_class.new.build }

    before do
      # Stub the OAuth token endpoint so no real token exchange is attempted.
      stub_request(:post, "https://www.googleapis.com/oauth2/v4/token").to_return(
        status: 200,
        body: "{}",
        headers: { "Content-Type" => "application/json" },
      )

      {
        "BIGQUERY_PROJECT" => "bigquery-project-123",
        "BIGQUERY_CLIENT_EMAIL" => "[email protected]",
        "BIGQUERY_PRIVATE_KEY" => "key",
      }.each { |name, value| ENV[name] = value }

      # "key" is not a parseable RSA key, so key construction is stubbed out.
      allow(OpenSSL::PKey::RSA).to receive(:new).and_return("key")
    end

    context "api client" do
      it "is an instance of Google::Cloud::Bigquery::Project" do
        expect(subject).to be_kind_of(Google::Cloud::Bigquery::Project)
      end
    end

    context "when setting up authorization" do
      let(:credentials) { subject.service.credentials }

      it "uses the given client email from the JSON contents" do
        expect(credentials.issuer).to eq("[email protected]")
      end

      it "uses the given private key from the JSON contents" do
        expect(credentials.signing_key).to eq("key")
      end

      it "uses the BigQuery scope" do
        expect(credentials.scope).to include("https://www.googleapis.com/auth/bigquery")
      end

      it "raises an error if the project_id is not set" do
        ENV["BIGQUERY_PROJECT"] = ""

        expect { described_class.new.build }.to raise_error(ArgumentError, "project_id is missing")
      end
    end
  end
end
26 changes: 3 additions & 23 deletions spec/domain/etl/ga/concerns/transform_path_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,11 @@ class EtlSpecDummy
:ga_event,
:with_views,
page_path: "/https://www.gov.uk/topics",
bounces: 1,
upviews: 1,
pviews: 1,
useful_no: 1,
useful_yes: 1,
searches: 1,
entrances: 1,
exits: 1,
page_time: 1,
)
end

Expand All @@ -40,20 +36,16 @@ class EtlSpecDummy
:ga_event,
:with_views,
page_path: "/topics",
bounces: 1000,
upviews: 100,
pviews: 200,
useful_no: 300,
useful_yes: 400,
searches: 500,
entrances: 600,
exits: 700,
page_time: 1000,
)
end

it "returns the correct number of metrics" do
names_to_exclude = %w[id bounces date page_path updated_at created_at process_name page_time]
names_to_exclude = %w[id bounces date page_path updated_at created_at process_name page_time entrances exits]
event_attributes = event2.attribute_names.reject { |name| names_to_exclude.include?(name) }

subject.format_events_with_invalid_prefix
Expand All @@ -66,13 +58,13 @@ class EtlSpecDummy
it "explicitly ignores bounces" do
subject.format_events_with_invalid_prefix

expect(event2.reload.bounces).to eq(1)
expect(event2.reload.bounces).to eq(0)
end

it "explicitly ignores page_time" do
subject.format_events_with_invalid_prefix

expect(event2.reload.page_time).to eq(1)
expect(event2.reload.page_time).to eq(0)
end

it "updates events with their combined upviews" do
Expand Down Expand Up @@ -105,18 +97,6 @@ class EtlSpecDummy
expect(event2.reload.searches).to eq 501
end

it "updates events with their combines entrances" do
subject.format_events_with_invalid_prefix

expect(event2.reload.entrances).to eq 601
end

it "updates events with their combines exits" do
subject.format_events_with_invalid_prefix

expect(event2.reload.exits).to eq 701
end

it "deletes the duplicated event" do
subject.format_events_with_invalid_prefix

Expand Down
Loading
Loading