Skip to content

Commit

Permalink
tried to implement puppeteer for aws
Browse files Browse the repository at this point in the history
  • Loading branch information
nuffertaylor committed Jul 26, 2024
1 parent ab7a891 commit 46fc57b
Show file tree
Hide file tree
Showing 5 changed files with 440 additions and 474 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ out
node_modules
cockroachdb_url.txt
cockroach-keys
lambda.zip
*.zip
.vscode
*.crt
1-1000GR.json
49 changes: 41 additions & 8 deletions aws_lambdas/js/fetchGRPuppeteer/goodreads-puppeteer-scraper.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import puppeteer from 'puppeteer';
// Or import puppeteer from 'puppeteer-core';
import puppeteer from 'puppeteer-core';
import chromium from 'chrome-aws-lambda';

const get_book_data_by_goodreads_id = async (goodreads_id) => {
// Launch the browser and open a new blank page
const browser = await puppeteer.launch({headless: true});
const browser = await puppeteer.launch({
args: chromium.args,
defaultViewport: chromium.defaultViewport,
executablePath: await chromium.executablePath,
headless: true
});
const page = await browser.newPage();

const GOODREADS_PRE = 'https://www.goodreads.com/book/show/';
Expand Down Expand Up @@ -42,8 +47,36 @@ const combine_title_and_series = ({title, series}) => {
return title + ' ' + formattedSeries;
}

// Ad-hoc run at module load: scrape the book with Goodreads id '2' and log
// whatever the scraper returned.
// NOTE(review): this page is a rendered diff; these statements look like the
// pre-change version that the `main` function replaces — confirm before
// relying on them.
const result = await get_book_data_by_goodreads_id('2');
console.log(result);
// When the scraped data carries a series, also log the title combined with it.
if (result?.series) {
console.log(combine_title_and_series(result));
}
/**
 * Return the last '/'-separated segment of `url`.
 *
 * When the final segment is empty (the string ends with '/'), the segment
 * before it is used instead. If no string segment is available, an empty
 * string is returned.
 *
 * @param {string} url - Text to split on '/'.
 * @returns {string} The selected segment, or "" when there is none.
 */
const get_last_sub_dir_from_url = (url) => {
  const segments = url.split('/');
  const last = segments.at(-1);
  // A trailing slash leaves an empty final segment; step back one position.
  const candidate = last === '' ? segments.at(-2) : last;
  return typeof candidate === "string" ? candidate : "";
};
/**
 * Drop everything from the first '?' onward.
 *
 * @param {string} url - Text that may contain a query string.
 * @returns {string} The part of `url` before the first '?'.
 */
const remove_query_string = (url) => {
  const [before_query] = url.split('?');
  return before_query;
};
/**
 * Keep only the text before the first '-' or '.'.
 *
 * @param {string} url - Text such as "5907.The_Hobbit" or "2-some-title".
 * @returns {string} The leading part of `url` up to the first '-' or '.'.
 */
const remove_text_title = (url) => {
  const [leading_part] = url.split(/-|\./);
  return leading_part;
};
/**
 * Strip every non-digit character from `str`.
 *
 * @param {string} str - Input text.
 * @returns {string} Only the digit characters of `str`, in their original order.
 */
const remove_non_numeric_char_from_str = (str) => str.replace(/\D/g, '');

/**
 * Resolve a Goodreads book URL to scraped book data.
 *
 * The numeric book id is extracted from the last path segment of `url`
 * (query string, text title and any non-digit characters removed) and passed
 * to `get_book_data_by_goodreads_id`. When the scraped data has a series, its
 * `title` is replaced by the output of `combine_title_and_series`.
 *
 * @param {string} url - Goodreads book URL (or any string ending in the book id).
 * @returns {Promise<object>} One of:
 *   - `{}` when `url` is missing or not a string,
 *   - `{ statusCode: 500 }` when no numeric id can be extracted,
 *   - `{ statusCode: 200, body }` where `body` is the scraped data plus `book_id`.
 */
const main = async (url) => {
  if (!url || typeof url !== "string") return {};

  // Strip the query string BEFORE taking the last path segment, so a trailing
  // slash ahead of '?' or a '/' inside the query cannot hide the id segment.
  const without_query = remove_query_string(url);
  const last_segment = get_last_sub_dir_from_url(without_query);
  const id_segment = remove_text_title(last_segment);
  const book_id = remove_non_numeric_char_from_str(id_segment);

  if (!book_id) return { statusCode: 500 };

  // Scrape the book that was actually requested; this call used to be
  // hard-coded to id '2', so every request returned the same book.
  const result = await get_book_data_by_goodreads_id(book_id);
  if (result?.series) {
    result.title = combine_title_and_series(result);
  }

  return {
    statusCode: 200,
    body: { ...result, book_id },
  };
};

export default main;
7 changes: 7 additions & 0 deletions aws_lambdas/js/fetchGRPuppeteer/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import main from './goodreads-puppeteer-scraper.js';

/**
 * Lambda entry point: forwards the `url` field of the incoming event to
 * `main` and resolves with whatever `main` produces.
 *
 * @param {object} event - Invocation payload; only `event.url` is read.
 * @returns {Promise<object>} The value resolved by `main`.
 */
export const handler = async (event) => {
  const { url } = event;
  const response = await main(url);
  return response;
};

// zip -r fetchGrPuppeteer.zip .
Loading

0 comments on commit 46fc57b

Please sign in to comment.