-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcrawl.py
executable file
·80 lines (59 loc) · 2.44 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import logging
from langchain.embeddings import OpenAIEmbeddings
from embedding_search.academic_analytics import get_units, get_faculties, get_author
from embedding_search.vector_store import get_author as get_downloaded_author
from embedding_search.utils import AUTHORS_DIR, upload_blob
from embedding_search.data_model import Author
import argparse
logging.basicConfig(filename="main.log", level=logging.INFO)
def append_embeddings(author: Author) -> Author:
    """Attach OpenAI embeddings for every article of *author* and return it.

    The author's texts are embedded in one batched API call, producing an
    (m articles x n dimensions) matrix stored on `articles_embeddings`.
    """
    embedder = OpenAIEmbeddings()
    author.articles_embeddings = embedder.embed_documents(author.texts)
    return author
def download_one_author(id: int | str) -> None:
    """Download or update a single author's json record.

    An already-downloaded author is loaded from disk; otherwise a fresh
    `Author` is built from the Academic Analytics API response, then
    `update()` is called either way.
    """
    # NOTE(review): `id` shadows the builtin; kept to preserve the public
    # signature for keyword callers. Normalised to int once, up front
    # (int(x) is a no-op for ints).
    author_id = int(id)
    try:
        author = get_downloaded_author(author_id)
    except FileNotFoundError:
        # Not on disk yet — fetch the raw record and build the model.
        raw_author = get_author(author_id)
        author = Author(
            id=author_id,
            unit_id=raw_author["primaryUnitAffiliation"]["id"],
            first_name=raw_author["firstName"],
            last_name=raw_author["lastName"],
        )
    author.update()
def download_all_authors_in_unit(unit: int) -> None:
    """Download every faculty member belonging to a single unit.

    Args:
        unit: numeric id of the unit whose faculty list is fetched.

    Failures for individual faculty members are printed and skipped so a
    single bad record does not abort the rest of the unit (best-effort).
    """
    faculties = get_faculties(unit)
    for i, faculty in enumerate(faculties):
        try:
            print(f"Processing faculty {i+1}/{len(faculties)}: {faculty['id']}")
            download_one_author(faculty["id"])
        except Exception as e:
            # Deliberate broad catch: report and continue with the next author.
            print(f"Error downloading {faculty['id']}: {e}")
            continue
def download_authors(resume_from: int = 0) -> None:
    """Download authors from ORCID and their papers from CrossRef.

    Args:
        resume_from: index of the first unit to process; use a non-zero
            value to resume a previously interrupted crawl. 0 (default)
            processes every unit.
    """
    units = get_units()
    if resume_from:
        units = units[resume_from:]
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logging.info("Found %d units.", len(units))
    for i, unit in enumerate(units):
        print(f"Processing unit {i+1}/{len(units)}: {unit['unitId']}")
        download_all_authors_in_unit(int(unit["unitId"]))
def main():
    """CLI entry point: crawl all authors, optionally push results to GCS."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--push-to-bucket", action="store_true")
    parser.add_argument("--resume-from", type=int, default=0)
    cli_args = parser.parse_args()

    # Crawl (optionally resuming part-way through the unit list).
    download_authors(resume_from=cli_args.resume_from)

    # Optionally sync the downloaded author jsons to the raw-data bucket.
    if cli_args.push_to_bucket:
        upload_blob(bucket_name="community-search-raw", source_folder=AUTHORS_DIR)


if __name__ == "__main__":
    main()