Skip to content

Commit

Permalink
Feat(extractor): Export Perplexity Pages (#120)
Browse files Browse the repository at this point in the history
* feat: extract Perplexity Pages paragraphs

* fix: removing useless rule

* feat: exporting Pages title

* fix: too many newlines

* Perplexity refactoring source extraction function

* refactor: Perplexity source extraction genericized
(bug: tile list extraction fails, only the modal works)

* fix: selectors to get correct titles and not exporting intro sources tiles

* feat: extract Perplexity Pages sources

* fix: correctly exporting sources (no duplication or missing sources)

* docs: jsdoc

* fix: extraction isolation (safeExecute) + sleep 10s instead of 100s

* refactor: selector and newlines

* refactor: cleaning code

* fix: correcting conditions

* docs: fix jsdoc
  • Loading branch information
Hugo-COLLIN authored Jun 24, 2024
1 parent 7745296 commit a6033dd
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 51 deletions.
1 change: 1 addition & 0 deletions src/data/allowedDomains.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"PhindSearch": "www.phind.com/search",
"PhindChat": "www.phind.com/agent",
"Perplexity": "www.perplexity.ai/search",
"PerplexityPages": "www.perplexity.ai/page",
"MaxAIGoogle": "www.google.com/search"
}
}
172 changes: 121 additions & 51 deletions src/scripts/content/extractor/domains/Perplexity.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,74 +33,144 @@ export async function processMessage(content, format) {
return markdown;
}

async function extractSources(content, format) {
export async function extractSources(content, format) {
const SOURCES_HEADER = "---\n**Sources:**\n";
let res = SOURCES_HEADER;

async function extractFromModal() {
let i = 1;
for (const tile of document.querySelectorAll(".fixed > div > [class] > div > div > div > div > div > .flex.group")) {
res += await formatSources(i, format, tile);
i++;
}
}
// Open sources modal
res = await interactAndCatch(content, [
{open: ['button > div > svg[data-icon="ellipsis"]', '.cursor-pointer svg[data-icon="list-timeline"]'], close: ['[data-testid="close-modal"]'], selector: 'TODO'},
{open: ['div.grid > div.flex:nth-last-of-type(1)'], close: ['[data-testid="close-modal"]'], selector: 'TODO'},
], res, format);

// async function interactAndCatch() {
// const btnBottomExpand = content.querySelector('button > div > svg[data-icon="ellipsis"]');
// let btnBottomSources;
// const btnExpandSources = content.querySelector("div.grid > div.flex:nth-last-of-type(1)"); // Get the last button, useful when uploaded file div
// if (btnBottomExpand) {
// btnBottomExpand.parentNode?.click();
// await sleep(10);
// btnBottomSources = document.querySelector('.cursor-pointer svg[data-icon="list-timeline"]');
// if (btnBottomSources) btnBottomSources.parentNode?.click();
// }
// if (!btnBottomSources) {
// btnExpandSources?.click();
// }
// // console.log(btnExpandSources)
//
// // if there's a div tile and it contains multiple images (so it's not a file tile)
// if (btnBottomSources || (btnExpandSources && btnExpandSources.querySelectorAll("img").length > 0)) {
// await sleep(10);
//
// // Extract sources list from modal
// await safeExecute(extractFromModal.call(this));
//
// // Close sources modal
// const closeBtn = document.querySelector('[data-testid="close-modal"]');
// if (closeBtn) closeBtn.click();
// // if (btnBottomExpand) btnBottomExpand.parentNode?.click(); // causes bug
// } else
// await safeExecute(extractFromTileList.call(this));
// }
//
// await interactAndCatch.call(this);

async function extractFromTileList() {
let i = 1;
// Case the first tile is a file, not a link
const tilesNoLink = content.querySelectorAll("div.grid > div.flex");
for (const tile of tilesNoLink) {
if (tile.querySelectorAll("img").length === 0) {
res += await formatSources(i, format, tile);
i++;
}
}
// Don't export header if no sources
return res !== SOURCES_HEADER
? res
: "";
}

// Link tiles
for (const tile of content.querySelectorAll("div.grid > a")) {
async function extractFromModal(res, format) {
let i = 1;
for (const tile of document.querySelectorAll(".fixed > div > [class] > div > div > div > div > div > .flex.group")) {
res += await formatSources(i, format, tile);
i++;
}
return res;
}

async function extractFromTileList(res, format, content) {
let i = 1;
// Case the first tile is a file, not a link
const tilesNoLink = content.querySelectorAll("div.grid > div.flex");
for (const tile of tilesNoLink) {
if (tile.querySelectorAll("img").length === 0) {
res += await formatSources(i, format, tile);
i++;
}
}

// Open sources modal
// TODO: generic function using list of queryselectors (1 for open possibilities, 1 for close?) ; the first one that works is used
const btnBottomExpand = content.querySelector('button > div > svg[data-icon="ellipsis"]');
let btnBottomSources;
const btnExpandSources = content.querySelector("div.grid > div.flex:nth-last-of-type(1)"); // Get the last button, useful when uploaded file div
if (btnBottomExpand) {
btnBottomExpand.parentNode?.click();
await sleep(10);
btnBottomSources = document.querySelector('.cursor-pointer svg[data-icon="list-timeline"]');
if (btnBottomSources) btnBottomSources.parentNode?.click();
// Link tiles
for (const tile of content.querySelectorAll("div.grid > a")) {
res += await formatSources(i, format, tile);
i++;
}
if (!btnBottomSources) {
btnExpandSources?.click();
}
// console.log(btnExpandSources)
return res;
}

// if there's a div tile and it contains multiple images (so it's not a file tile)
if (btnBottomSources || (btnExpandSources && btnExpandSources.querySelectorAll("img").length > 0)) {
await sleep(10);

// Extract sources list from modal
await safeExecute(extractFromModal.call(this));
/**
* Generic function using list of queryselectors (1 for open possibilities, 1 for close) ; they are executed one after the other
* @param content
* @param {Array<{open: Array<string>, close: Array<string>, selector: string}>} selectors
* @param sources_header
* @param format
* @param afterActionSelector
* @returns {Promise<void>}
*/
export async function interactAndCatch(content, selectors, sources_header, format, afterActionSelector = null) {
let res = sources_header;
for (const {open, close, selector} of selectors) {
let btnBottomExpand;
// Open sources modal : each element in the open array is queryselected and clicked one after the other
for (const query of open) {
// TODO: find a way to make this more generic (like global/document: true / scope:document/parent/child/...)
if (query.includes('.cursor-pointer svg[data-icon="list-timeline"]')) {
btnBottomExpand = document.querySelector(query);
} else {
btnBottomExpand = content.querySelector(query);
}

// Close sources modal
const closeBtn = document.querySelector('[data-testid="close-modal"]');
if (closeBtn) closeBtn.click();
// if (btnBottomExpand) btnBottomExpand.parentNode?.click(); // causes bug
}
else
await safeExecute(extractFromTileList.call(this));
if (btnBottomExpand) {
// btnBottomExpand.click ? btnBottomExpand.click() : btnBottomExpand.parentNode?.click();
btnBottomExpand.click ? btnBottomExpand.click() : btnBottomExpand.parentNode?.click();
// btnBottomExpand?.parentNode?.click();
await sleep(10);
}
}

// Don't export header if no sources
return res !== SOURCES_HEADER
? res
: "";
// if (!btnBottomExpand) {
// console.warn("btnBottomExpand undefined");
// return;
// }

res = safeExecute(btnBottomExpand
? await extractFromModal(sources_header, format)
: await extractFromTileList(sources_header, format, content),
res);

// Close sources modal : each element in the close array is queryselected and clicked one after the other
for (const query of close) {
const btnClose = document.querySelector(query);
if (btnClose) {
btnClose.click();
await sleep(10);
}
}

if (res !== sources_header)
break;
}
const afterAction = document.querySelector(afterActionSelector);
if (afterAction) {
await sleep(100)
afterAction.click();
}
return res;
}

async function formatSources(i, format, tile) {
export async function formatSources(i, format, tile) {
const text = "(" + i + ") "
+ format(tile.querySelector("div.default").innerText
.replaceAll("\n", " ")
Expand Down
41 changes: 41 additions & 0 deletions src/scripts/content/extractor/domains/PerplexityPages.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import {safeExecute} from "../../../shared/utils/jsShorteners";
import {interactAndCatch} from "./Perplexity";

/**
 * Builds the markdown for one Perplexity Pages section: optional title,
 * formatted body, then the sources list.
 *
 * @param content DOM node of the section; queried with `hasChildNodes` and `querySelector`
 * @param {function(string): string} format converts an HTML string to markdown
 * @returns {Promise<string>} the section markdown, or '' when `content` has no child nodes
 */
export async function processMessage(content, format) {
  if (!content.hasChildNodes())
    return '';

  let markdown = '';

  const title = content.querySelector('h2 > span');
  if (title)
    markdown += `## ${title.innerText}\n`;

  // First selector targets the intro, second one the other article parts
  const answer = content.querySelector('.flex-col > div > .relative > :first-child, [class="group/section"] .prose');
  if (answer?.innerHTML)
    markdown += format(answer.innerHTML) + '\n\n';

  // Display sources.
  // The pending promise is handed to safeExecute un-awaited: awaiting it first
  // would let a rejection escape before safeExecute is even called, so a failed
  // sources extraction would discard the title and body as well.
  // NOTE(review): assumes safeExecute awaits its argument and absorbs the rejection — confirm in jsShorteners.
  const src = await safeExecute(extractSources(content, format));
  if (src)
    markdown += src + "\n";

  return markdown;
}

/**
 * Extracts the sources of a Perplexity Pages section through `interactAndCatch`.
 * The close-modal button selector is passed as the after-action selector rather
 * than in the `close` list, which is left empty.
 *
 * @param content DOM node of the section, forwarded to `interactAndCatch`
 * @param format markdown formatter, forwarded to `interactAndCatch`
 * @returns {Promise<string>} header followed by the sources, or "" when nothing was added to the header
 */
async function extractSources(content, format) {
  const SOURCES_HEADER = "---\n**Sources:**\n";

  // Selectors used to open the sources modal
  const modalSelectors = [
    {
      open: ['div.grid > div.flex:nth-last-of-type(1), .group\\/source'],
      close: [],
      selector: 'TODO',
    },
  ];

  const sources = await interactAndCatch(
    content,
    modalSelectors,
    SOURCES_HEADER,
    format,
    '[data-testid="close-modal"]'
  );

  // Don't export header if no sources
  if (sources === SOURCES_HEADER) {
    return "";
  }
  return sources;
}
19 changes: 19 additions & 0 deletions src/scripts/content/extractor/domains/PerplexityPages.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"domainName": "Perplexity Pages",
"contentSelector": "main .mx-auto > div > div > div > div > div.flex-col",
"turndown": {
"init": {
"blankReplacement": "getBlankReplacement_PerplexityPages"
},
"rules": {
"preserveLineBreaksInPre": {
"filter": "filter_PreserveLineBreaksInPre_Perplexity",
"replacement": "replacement_PreserveLineBreaksInPre_Perplexity"
},
"formatCitationsInAnswer": {
"filter": "filter_formatCitationsInAnswer_Perplexity",
"replacement": "replacement_formatCitationsInAnswer_Perplexity"
}
}
}
}
4 changes: 4 additions & 0 deletions src/scripts/content/extractor/extractPage.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ export async function extractPage(domain) {
module = require("./domains/Perplexity");
json = require("./domains/Perplexity.json");
break;
case "PerplexityPages":
module = require("./domains/PerplexityPages");
json = require("./domains/PerplexityPages.json");
break;
case "MaxAIGoogle":
module = require("./domains/MaxAIGoogle");
json = require("./domains/MaxAIGoogle.json");
Expand Down
2 changes: 2 additions & 0 deletions src/scripts/content/extractor/rules/applyRules.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ export function applyExtractorRules(turndownConfig) {

for (const rule in turndownConfig.rules)
turndownConverter.addRule(rule, turndownConfig.rules[rule]);

// console.log(turndownConfig.rules.length + " Turndown rules applied");
}

export function generateRules(configData) {
Expand Down
9 changes: 9 additions & 0 deletions src/scripts/content/extractor/rules/rules.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@ export function getBlankReplacement(content, node) {
}
}

/**
 * Blank replacement for Perplexity Pages: a SPAN whose class attribute
 * contains "block" becomes a paragraph break, any other node becomes nothing.
 * The class check is a substring test on the raw attribute value, so values
 * such as "inline-block" match as well.
 *
 * @param content unused
 * @param node node being replaced; read through `nodeName` and `getAttribute('class')`
 * @returns {string} '\n\n' for a matching span, '' otherwise
 */
export function getBlankReplacement_PerplexityPages(content, node) {
  const isSpan = node.nodeName === 'SPAN';
  const hasBlockClass = isSpan && node.getAttribute('class')?.includes('block');
  return hasBlockClass ? '\n\n' : '';
}

/**
 * Rule filter selecting PRE nodes that contain a DIV descendant.
 *
 * @param node candidate node; read through `nodeName` and `querySelector`
 * @returns `false` for non-PRE nodes, otherwise the result of
 *   `node.querySelector('div')` (the first matching element, or null)
 */
export function filter_PreserveLineBreaksInPre_Perplexity(node) {
  if (node.nodeName !== 'PRE') {
    return false;
  }
  return node.querySelector('div');
}
Expand Down

0 comments on commit a6033dd

Please sign in to comment.