From 0228787860d98c88a034cbe12ca824e03455a553 Mon Sep 17 00:00:00 2001
From: Pavlo Ivashkov
Date: Thu, 26 Dec 2024 18:45:05 +0200
Subject: [PATCH] generalize for translations

---
 code/.gitignore              |  1 +
 code/crawler/core/crawler.py | 29 ++++++++++++++++++-----------
 code/crawler/crawl_fci.py    | 19 ++++++++++++-------
 code/genpage.py              |  5 ++++-
 4 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/code/.gitignore b/code/.gitignore
index eda9882..cba1d15 100644
--- a/code/.gitignore
+++ b/code/.gitignore
@@ -1,3 +1,4 @@
+.env
 /.venv
 /data*
 
diff --git a/code/crawler/core/crawler.py b/code/crawler/core/crawler.py
index 8de927d..8f104ed 100644
--- a/code/crawler/core/crawler.py
+++ b/code/crawler/core/crawler.py
@@ -7,6 +7,7 @@
 import requests
 import sys
 import time
+import tomllib
 from lxml import html
 from pathlib import Path
 from urllib.parse import urlsplit, urlunsplit
@@ -21,20 +22,30 @@ def jsondump(obj, fn):
 
 
 class Crawler:
-    def __init__(self, name, dir, url, parser, dumper, delay=0.01, user_agent=None):
+    def __init__(self, name, dir, url, parser, dumper, delay=0.01, headers=None):
         self.name = name or dir.name
         self.dumpDir = dir
         self.rootUrl = url
         self.parser = parser
         self.dumper = dumper
         self.delay = delay
-        self.user_agent = user_agent
+        self.headers = self._load_env()
+        if headers:
+            self.headers = (self.headers or dict()) | headers
         self.fringe = [self.rootUrl]
         self.visited = set()
         self.state = CrawlerState(fileName= '-'.join([self.name, 'crawler-state.json']))
         self.state.restore(self)
         self.req = requests.Session()
 
+    def _load_env(self):
+        headers = None
+        if (fn := Path('.env')).is_file():
+            with fn.open('rb') as fp:
+                config = tomllib.load(fp)
+            headers = config.get('crawler', dict()).get('headers')
+        return headers
+
     def crawl(self):
         while self.fringe:
             url = self.norm(self.fringe.pop(0))
@@ -53,9 +64,10 @@ def crawl(self):
             if self.dumper.exists(item):
                 continue
 
-            r = self.get(item['url'])
-            item_page = self.parser.getcontent(r)
-            item = self.parser.parse(item, item_page)
+            if item.get('_partial'):
+                r = self.get(item['url'])
+                item_page = self.parser.getcontent(r)
+                item = self.parser.parse(item, item_page)
 
             self.dumper.dump(item, self)
 
@@ -74,13 +86,8 @@ def norm(self, url):
 
     def get(self, url):
         time.sleep(self.delay)
-
-        headers = {}
-        if self.user_agent:
-            headers['User-Agent'] = self.user_agent
         print(url, file=sys.stderr)
-        return self.req.get(url, headers=headers)
+        return self.req.get(url, headers=self.headers)
 
     def download(self, url, fn):
         if url is None:
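
Note on the new _load_env: it reads the optional .env file as TOML (via tomllib)
and looks up the headers table under [crawler]. A minimal sketch of such a file,
with placeholder header values; anything here is merged with, and overridden by,
headers passed to the Crawler constructor, then sent on every requests call:

    # .env -- TOML read by Crawler._load_env; values below are examples only
    [crawler.headers]
    User-Agent = "my-crawler/1.0 (+https://example.com/contact)"
    Accept-Language = "en"
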
diff --git a/code/crawler/crawl_fci.py b/code/crawler/crawl_fci.py
index b5703c2..2949c93 100755
--- a/code/crawler/crawl_fci.py
+++ b/code/crawler/crawl_fci.py
@@ -29,7 +29,7 @@ def item(self, el, baseurl):
         url = el.get('href')
         if url:
             url = urljoin(baseurl, url)
-        return {'refid':refid, 'url':url}
+        return {'refid':refid, 'url':url, '_partial':True}
 
     def parse(self, item, page):
         lang = self.language.upper()
@@ -57,6 +57,7 @@ def url(xpath, skip=None):
         item['group'] = clean_group(text('//a[@id="ContentPlaceHolder1_GroupeHyperLink"]//text()'))
         item['section'] = text('//span[@id="ContentPlaceHolder1_SectionLabel"]/text()')
         item['country'] = text('//span[@id="ContentPlaceHolder1_PaysOrigineLabel"]/text()')
+        del item['_partial']
 
         def stdana(s): return s.startswith('/Nomenclature/Illustrations/STD-ANA-')
         imgUrl = url('//img[@id="ContentPlaceHolder1_IllustrationsRepeater_Image1_0"]/@src', stdana)
@@ -102,17 +103,21 @@ def reset(self):
         for fn in self.dumpDir.glob('**/entry.json'):
             fn.unlink()
 
+
 class FciCrawler:
-    def __init__(self, basedir, language):
+    def __init__(self, url, basedir, language='en', parser=None, dumper=None):
+        base_url = url or f'https://www.fci.be/{language}/nomenclature/'
         todir = Path(basedir) / 'fci'
-        self.craw = core.Crawler(name='fci', dir=todir, url=f'https://www.fci.be/{language}/nomenclature/',
-                                 parser=FciParser(language=language), dumper=FciDumper(todir))
+        parser = parser or FciParser(language=language)
+        dumper = dumper or FciDumper(todir)
+        self.engine = core.Crawler(name='fci', dir=todir, url=base_url,
+                                   parser=parser, dumper=dumper)
 
     def crawl(self):
-        return self.craw.crawl()
+        return self.engine.crawl()
 
     def reset(self):
-        self.craw.reset()
+        self.engine.reset()
 
 
 if __name__ == '__main__':
@@ -124,7 +129,7 @@ def reset(self):
     parser.add_argument('-l', '--language', default='en',
                         help='Language identifier, en|fr|de|es')
     args = parser.parse_args()
-    craw = FciCrawler(basedir=args.data_dir, language=args.language)
+    craw = FciCrawler(url=None, basedir=args.data_dir, language=args.language)
     if args.reset:
         craw.reset()
     craw.crawl()
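
With the widened constructor, callers can point the crawler at any translation
of the nomenclature, or swap in a custom parser/dumper; passing url=None falls
back to the f-string built from language. A usage sketch (the basedir value is
illustrative):

    # crawl the French nomenclature; FciParser/FciDumper are the defaults
    craw = FciCrawler(url='https://www.fci.be/fr/nomenclature/',
                      basedir='data', language='fr')
    craw.crawl()
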
diff --git a/code/genpage.py b/code/genpage.py
index 58e4b4b..a88bc43 100755
--- a/code/genpage.py
+++ b/code/genpage.py
@@ -35,6 +35,7 @@ def main(args):
     fin = args.file
     fout = args.output or sys.stdout
     lang = args.lang or 'en'
+    base_url = args.url
 
     reader = csv.DictReader(fin)
     gens = dict(id=gen_id, url=gen_link, image=gen_link, pdf=gen_link)
@@ -54,6 +55,7 @@ def entries():
     context = dict()
     context['timestamp'] = datetime.now(UTC).date().isoformat()
     context['lang'] = lang
+    context['base_url'] = base_url
     context['ncols'] = len(fieldnames)
     context['fieldnames'] = map(gen_th, fieldnames)
     context['entries'] = entries()
@@ -105,7 +107,7 @@ def entries():
 | es
 standwithukraine
-Data compiled from https://www.fci.be/&{lang}/nomenclature/.
+Data compiled from &{base_url}.
 Generated on &{timestamp}
 Download: CSV
@@ -127,6 +129,7 @@ def entries():
     parser.add_argument('file', nargs='?', default='fci-breeds.csv', type=argparse.FileType(),
                         help='the CSV file to process, default is fci-breeds.csv')
     parser.add_argument('-l', '--lang', help='language code')
+    parser.add_argument('--url', default='https://www.fci.be/en/nomenclature/', help='data origin URL')
     parser.add_argument('-o', '--output', type=argparse.FileType('w'), help='destination file name')
     args = parser.parse_args()
     main(args)
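
End to end, the new --url flag lets the generated page cite whichever origin
was actually crawled instead of the hardcoded English one. A plausible
invocation for a French build (the output file name is an example; flags for
crawl_fci.py other than -l/--language are not shown in this patch and are
assumed to keep their defaults):

    ./crawl_fci.py -l fr
    ./genpage.py --url https://www.fci.be/fr/nomenclature/ -l fr -o index.html
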