Commit 0228787

generalize for translations

paiv committed Dec 26, 2024
1 parent 32b4bcd
Showing 4 changed files with 35 additions and 19 deletions.
1 change: 1 addition & 0 deletions code/.gitignore
@@ -1,3 +1,4 @@
+.env
 /.venv
 /data*
29 changes: 18 additions & 11 deletions code/crawler/core/crawler.py
@@ -7,6 +7,7 @@
 import requests
 import sys
 import time
+import tomllib
 from lxml import html
 from pathlib import Path
 from urllib.parse import urlsplit, urlunsplit
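
Note: `tomllib` joined the standard library in Python 3.11, and the `dict | dict` header merge below needs 3.9+, so after this change the crawler assumes Python 3.11 or newer.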
@@ -21,20 +22,30 @@ def jsondump(obj, fn):


 class Crawler:
-    def __init__(self, name, dir, url, parser, dumper, delay=0.01, user_agent=None):
+    def __init__(self, name, dir, url, parser, dumper, delay=0.01, headers=None):
         self.name = name or dir.name
         self.dumpDir = dir
         self.rootUrl = url
         self.parser = parser
         self.dumper = dumper
         self.delay = delay
-        self.user_agent = user_agent
+        self.headers = self._load_env()
+        if headers:
+            self.headers = (self.headers or dict()) | headers
         self.fringe = [self.rootUrl]
         self.visited = set()
         self.state = CrawlerState(fileName='-'.join([self.name, 'crawler-state.json']))
         self.state.restore(self)
         self.req = requests.Session()

+    def _load_env(self):
+        headers = None
+        if (fn := Path('.env')).is_file():
+            with fn.open('rb') as fp:
+                config = tomllib.load(fp)
+            headers = config.get('crawler', dict()).get('headers')
+        return headers
+
     def crawl(self):
         while self.fringe:
             url = self.norm(self.fringe.pop(0))
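
For reference, a minimal `.env` that `_load_env` would pick up: the file is parsed as TOML, and only the `headers` value of the `[crawler]` table is read. The header names and values here are illustrative, not part of the commit:

[crawler.headers]
User-Agent = "fci-crawler/1.0"
Accept-Language = "fr"

Headers passed explicitly to the `Crawler` constructor are merged over the `.env` values with the dict union operator `|`, so per-run headers take precedence.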
@@ -53,9 +64,10 @@ def crawl(self):
             if self.dumper.exists(item):
                 continue

-            r = self.get(item['url'])
-            item_page = self.parser.getcontent(r)
-            item = self.parser.parse(item, item_page)
+            if item.get('_partial'):
+                r = self.get(item['url'])
+                item_page = self.parser.getcontent(r)
+                item = self.parser.parse(item, item_page)

             self.dumper.dump(item, self)
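
The `_partial` flag is a small contract between parser and crawler: `item()` marks listing-page stubs that still need a detail-page fetch, `parse()` deletes the flag once the record is complete, and the loop above skips the extra request for items that arrive whole. A minimal parser honoring the contract might look like this (a hypothetical sketch, not code from the repository):

class StubParser:
    '''Hypothetical parser illustrating the _partial contract.'''

    def getcontent(self, response):
        # decode the HTTP response for parse(); the real parsers build an lxml tree
        return response.text

    def item(self, el, baseurl):
        # listing-page stub: mark it as needing a detail-page fetch
        return {'url': el.get('href'), '_partial': True}

    def parse(self, item, page):
        # fill in detail fields, then drop the flag so the crawler
        # treats the item as complete
        item['length'] = len(page)
        del item['_partial']
        return item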

@@ -74,13 +86,8 @@ def norm(self, url):

     def get(self, url):
         time.sleep(self.delay)
-
-        headers = {}
-        if self.user_agent:
-            headers['User-Agent'] = self.user_agent
-
         print(url, file=sys.stderr)
-        return self.req.get(url, headers=headers)
+        return self.req.get(url, headers=self.headers)

     def download(self, url, fn):
         if url is None:
19 changes: 12 additions & 7 deletions code/crawler/crawl_fci.py
@@ -29,7 +29,7 @@ def item(self, el, baseurl):
         url = el.get('href')
         if url:
             url = urljoin(baseurl, url)
-        return {'refid':refid, 'url':url}
+        return {'refid':refid, 'url':url, '_partial':True}

     def parse(self, item, page):
         lang = self.language.upper()
@@ -57,6 +57,7 @@ def url(xpath, skip=None):
         item['group'] = clean_group(text('//a[@id="ContentPlaceHolder1_GroupeHyperLink"]//text()'))
         item['section'] = text('//span[@id="ContentPlaceHolder1_SectionLabel"]/text()')
         item['country'] = text('//span[@id="ContentPlaceHolder1_PaysOrigineLabel"]/text()')
+        del item['_partial']

         def stdana(s): return s.startswith('/Nomenclature/Illustrations/STD-ANA-')
         imgUrl = url('//img[@id="ContentPlaceHolder1_IllustrationsRepeater_Image1_0"]/@src', stdana)
@@ -102,17 +103,21 @@ def reset(self):
         for fn in self.dumpDir.glob('**/entry.json'):
             fn.unlink()

+
 class FciCrawler:
-    def __init__(self, basedir, language):
+    def __init__(self, url, basedir, language='en', parser=None, dumper=None):
+        base_url = url or f'https://www.fci.be/{language}/nomenclature/'
         todir = Path(basedir) / 'fci'
-        self.craw = core.Crawler(name='fci', dir=todir, url=f'https://www.fci.be/{language}/nomenclature/',
-            parser=FciParser(language=language), dumper=FciDumper(todir))
+        parser = parser or FciParser(language=language)
+        dumper = dumper or FciDumper(todir)
+        self.engine = core.Crawler(name='fci', dir=todir, url=base_url,
+            parser=parser, dumper=dumper)

     def crawl(self):
-        return self.craw.crawl()
+        return self.engine.crawl()

     def reset(self):
-        self.craw.reset()
+        self.engine.reset()


 if __name__ == '__main__':
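
With the widened constructor, a translated crawl (or a swapped-in parser/dumper) no longer requires editing this file. A hypothetical call site, assuming the translated pages share the English page structure and that the module is importable as `crawl_fci`:

# hypothetical usage; module and directory names are illustrative
from crawl_fci import FciCrawler

# derive the root URL from the language code...
craw = FciCrawler(url=None, basedir='data', language='fr')
craw.crawl()

# ...or point the same engine at an explicit root URL
craw = FciCrawler(url='https://www.fci.be/de/nomenclature/', basedir='data', language='de')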
@@ -124,7 +129,7 @@ def reset(self):
     parser.add_argument('-l', '--language', default='en', help='Language identifier, en|fr|de|es')
     args = parser.parse_args()

-    craw = FciCrawler(basedir=args.data_dir, language=args.language)
+    craw = FciCrawler(url=None, basedir=args.data_dir, language=args.language)
     if args.reset:
         craw.reset()
     craw.crawl()
5 changes: 4 additions & 1 deletion code/genpage.py
@@ -35,6 +35,7 @@ def main(args):
     fin = args.file
     fout = args.output or sys.stdout
     lang = args.lang or 'en'
+    base_url = args.url
     reader = csv.DictReader(fin)

     gens = dict(id=gen_id, url=gen_link, image=gen_link, pdf=gen_link)
@@ -54,6 +55,7 @@ def entries():
     context = dict()
     context['timestamp'] = datetime.now(UTC).date().isoformat()
     context['lang'] = lang
+    context['base_url'] = base_url
     context['ncols'] = len(fieldnames)
     context['fieldnames'] = map(gen_th, fieldnames)
     context['entries'] = entries()
@@ -105,7 +107,7 @@ def entries():
 | <a href="index-es.html">es</a>
 </div>
 <p><a href="https://ukrainewar.carrd.co/"><img src="StandWithUkraine.svg" alt="standwithukraine"></a></p>
-<p>Data compiled from <a href="https://www.fci.be/&lang/nomenclature/">https://www.fci.be/&lang/nomenclature/</a>.</p>
+<p>Data compiled from <a href="&{base_url}">&{base_url}</a>.</p>
 <p>Generated on &{timestamp}</p>
 <p>Download: <a href="https://github.com/paiv/fci-breeds/releases/latest/download/&{archive}">CSV</a></p>
 </div>
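
The new `&{base_url}` placeholder follows the same `&{...}` convention as the existing `&{timestamp}` and `&{archive}` markers, replacing the bare `&lang` form that hard-coded the fci.be URL. The substitution code itself is outside this diff; a regex-based renderer along these lines would behave equivalently (a sketch with assumed semantics, not the file's actual implementation):

import re

def render(template, context):
    # expand each &{name} marker from the context dict (sketch only)
    return re.sub(r'&\{(\w+)\}', lambda m: str(context[m.group(1)]), template)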
@@ -127,6 +129,7 @@ def entries():
     parser.add_argument('file', nargs='?', default='fci-breeds.csv', type=argparse.FileType(),
         help='the CSV file to process, default is fci-breeds.csv')
     parser.add_argument('-l', '--lang', help='language code')
+    parser.add_argument('--url', default='https://www.fci.be/en/nomenclature/', help='data origin URL')
     parser.add_argument('-o', '--output', type=argparse.FileType('w'), help='destination file name')
     args = parser.parse_args()
     main(args)
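
With the new flag, a translated page build might look like `python code/genpage.py fci-breeds-fr.csv -l fr --url https://www.fci.be/fr/nomenclature/ -o index-fr.html` (the CSV and output names here are illustrative).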
