feat: introduce genAI generated broken backlink fixes #348

Open · wants to merge 10 commits into main
40 changes: 26 additions & 14 deletions src/backlinks/handler.js
@@ -18,6 +18,7 @@ import AhrefsAPIClient from '@adobe/spacecat-shared-ahrefs-client';
import { AbortController, AbortError } from '@adobe/fetch';
import { retrieveSiteBySiteId } from '../utils/data-access.js';
import { enhanceBacklinksWithFixes, fetch } from '../support/utils.js';
import { obtainSitemapUrls } from '../sitemap/handler.js';

const TIMEOUT = 3000;

@@ -64,6 +65,8 @@ export default async function auditBrokenBacklinks(message, context) {
const { type, url: siteId, auditContext = {} } = message;
const { dataAccess, log, sqs } = context;
const {
AWS_REGION: region,
SPACECAT_STATISTICS_LAMBDA_ARN: statisticsServiceArn,
AUDIT_RESULTS_QUEUE_URL: queueUrl,
} = context.env;
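Reviewer note: the audit now reads two additional values from its environment. A minimal sketch of what a deployment or local test might set — both values below are placeholders, not real resources:

// Placeholder values for local testing only.
process.env.AWS_REGION = 'us-east-1';
process.env.SPACECAT_STATISTICS_LAMBDA_ARN = 'arn:aws:lambda:us-east-1:000000000000:function:spacecat-statistics-service';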

@@ -100,20 +103,7 @@
const filteredBacklinks = result?.backlinks?.filter(
(backlink) => !excludedURLs?.includes(backlink.url_to),
);
let brokenBacklinks = await filterOutValidBacklinks(filteredBacklinks, log);
try {
const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
const keywords = topPages.map(
(page) => ({
url: page.getURL(),
keyword: page.getTopKeyword(),
traffic: page.getTraffic(),
}),
);
brokenBacklinks = enhanceBacklinksWithFixes(brokenBacklinks, keywords, log);
} catch (e) {
log.error(`Enhancing backlinks with fixes for siteId ${siteId} failed with error: ${e.message}`, e);
}
const brokenBacklinks = await filterOutValidBacklinks(filteredBacklinks, log);

auditResult = {
finalUrl: auditContext.finalUrl,
@@ -143,8 +133,30 @@
auditContext,
auditResult,
};

await sqs.sendMessage(queueUrl, data);

try {
const baseUrl = site.getBaseURL();
const sitemaps = await obtainSitemapUrls(baseUrl);
if (sitemaps?.success && sitemaps?.paths) {
const sitemapUrls = Object.values(sitemaps.paths)
.reduce((acc, curr) => acc.concat(curr), []);
await enhanceBacklinksWithFixes(
siteId,
auditResult.brokenBacklinks,
sitemapUrls,
{
region,
statisticsServiceArn,
log,
},
);
}
} catch (e) {
log.error(`Enhancing backlinks with fixes for siteId ${siteId} failed with error: ${e.message}`, e);
}

log.info(`Successfully audited ${siteId} for ${type} type audit`);
return noContent();
} catch (e) {
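Reviewer note: the new block above flattens the per-sitemap page lists into a single array before invoking the fix enhancement. A minimal sketch of that consumption, assuming `paths` maps each sitemap URL to the page URLs it contains (the exact shape comes from `getBaseUrlPagesFromSitemaps`):

// Assumed shape of the obtainSitemapUrls result; only `success` and `paths` are used by the handler.
const sitemaps = {
  success: true,
  reasons: [],
  paths: {
    'https://example.com/sitemap.xml': ['https://example.com/a', 'https://example.com/b'],
    'https://example.com/news-sitemap.xml': ['https://example.com/news/c'],
  },
};

// Same flattening as in the handler: one flat array of candidate page URLs.
const sitemapUrls = Object.values(sitemaps.paths)
  .reduce((acc, curr) => acc.concat(curr), []);
// -> ['https://example.com/a', 'https://example.com/b', 'https://example.com/news/c']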
4 changes: 2 additions & 2 deletions src/sitemap/handler.js
@@ -234,7 +234,7 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls) {
* @param {string} inputUrl - The URL for which to find and validate the sitemap
* @returns {Promise<{success: boolean, reasons: Array<{value}>, paths?: any}>} result of sitemap
*/
export async function findSitemap(inputUrl) {
export async function obtainSitemapUrls(inputUrl) {
const logMessages = [];

const parsedUrl = extractDomainAndProtocol(inputUrl);
@@ -299,7 +299,7 @@ export async function sitemapAuditRunner(baseURL, context) {
const { log } = context;
log.info(`Received sitemap audit request for ${baseURL}`);
const startTime = process.hrtime();
const auditResult = await findSitemap(baseURL);
const auditResult = await obtainSitemapUrls(baseURL);

const endTime = process.hrtime(startTime);
const elapsedSeconds = endTime[0] + endTime[1] / 1e9;
102 changes: 39 additions & 63 deletions src/support/utils.js
@@ -15,6 +15,7 @@ import { hasText, resolveCustomerSecretsName } from '@adobe/spacecat-shared-util
import URI from 'urijs';
import { JSDOM } from 'jsdom';
import { GetSecretValueCommand, SecretsManagerClient } from '@aws-sdk/client-secrets-manager';
import { InvokeCommand, LambdaClient } from '@aws-sdk/client-lambda';

URI.preventInvalidHostname = true;

@@ -216,69 +217,44 @@ export const extractKeywordsFromUrl = (url, log) => {
};

/**
* Processes broken backlinks to find suggested URLs based on keywords.
*
* @param {Array} brokenBacklinks - The array of broken backlink objects to process.
* @param {Array} keywords - The array of keyword objects to match against.
* @param {Object} log - The logger object for logging messages.
* @returns {Array} A new array of backlink objects with suggested URLs added.
* Enhances the backlinks with fixes by triggering a Lambda function that calculates the fixes.
* @param siteId - The site ID.
* @param brokenBacklinks - The broken backlinks.
* @param sitemapUrls - The sitemap URLs.
* @param config - The configuration object.
* @param config.region - The AWS region.
* @param config.statisticsServiceArn - The ARN of the statistics service Lambda function.
* @param config.log - The logger.
* @returns {Promise<{status: string}>}
*/
export const enhanceBacklinksWithFixes = (brokenBacklinks, keywords, log) => {
const result = [];

for (const backlink of brokenBacklinks) {
log.info(`trying to find redirect for: ${backlink.url_to}`);
const extractedKeywords = extractKeywordsFromUrl(backlink.url_to, log);

const matchedData = [];

// Match keywords and include rank in the matched data
keywords.forEach((entry) => {
const matchingKeyword = extractedKeywords.find(
(keywordObj) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
return regex.test(entry.keyword);
},
);
if (matchingKeyword) {
matchedData.push({ ...entry, rank: matchingKeyword.rank });
}
});

// Try again with split keywords if no matches found
if (matchedData.length === 0) {
const splitKeywords = extractedKeywords
.map((keywordObj) => keywordObj.keyword.split(' ').map((k) => ({ keyword: k, rank: keywordObj.rank })))
.flat();
export async function enhanceBacklinksWithFixes(siteId, brokenBacklinks, sitemapUrls, config) {
const {
region, statisticsServiceArn, log,
} = config;
log.info(`Enhancing backlinks with fixes for site ${siteId}`);

const payload = {
type: 'broken-backlinks',
payload: {
siteId,
brokenBacklinks,
sitemapUrls,
},
};

const client = new LambdaClient({ region });
const command = new InvokeCommand({
FunctionName: statisticsServiceArn,
Payload: JSON.stringify(payload),
InvocationType: 'Event',
});

splitKeywords.forEach((keywordObj) => {
keywords.forEach((entry) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
if (regex.test(entry.keyword)) {
matchedData.push({ ...entry, rank: keywordObj.rank });
}
});
});
}

// Sort by rank and then by traffic
matchedData.sort((a, b) => {
if (b.rank === a.rank) {
return b.traffic - a.traffic; // Higher traffic ranks first
}
return a.rank - b.rank; // Higher rank ranks first (1 is highest)
});

const newBacklink = { ...backlink };

if (matchedData.length > 0) {
log.info(`found ${matchedData.length} keywords for backlink ${backlink.url_to}`);
newBacklink.url_suggested = matchedData[0].url;
} else {
log.info(`could not find suggested URL for backlink ${backlink.url_to} with keywords ${extractedKeywords.map((k) => k.keyword).join(', ')}`);
}

result.push(newBacklink);
try {
await client.send(command);
log.info(`Lambda function ${statisticsServiceArn} invoked successfully.`);
} catch (error) {
log.error(`Error invoking Lambda function ${statisticsServiceArn}:`, error);
}
return result;
};

return { status: 'Lambda function invoked' };
}
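Reviewer note: the statistics service itself is outside this PR; `enhanceBacklinksWithFixes` only fires an asynchronous `Event` invocation and does not wait for a result. A hypothetical sketch of a consumer on the statistics Lambda side, included only to illustrate the event shape built above:

// Hypothetical statistics-service handler (not part of this repository).
// It simply mirrors the payload constructed in enhanceBacklinksWithFixes.
export const handler = async (event) => {
  const { type, payload } = event;
  if (type === 'broken-backlinks') {
    const { siteId, brokenBacklinks, sitemapUrls } = payload;
    // Suggested fixes for each broken backlink would be computed here,
    // e.g. by matching each url_to against the provided sitemapUrls.
  }
  return { status: 'accepted' };
};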