diff --git a/reginald/models/models/llama_index.py b/reginald/models/models/llama_index.py index cc92526f..a83cf714 100644 --- a/reginald/models/models/llama_index.py +++ b/reginald/models/models/llama_index.py @@ -12,6 +12,7 @@ import nest_asyncio import pandas as pd from git import Repo +from httpx import HTTPError from langchain_community.embeddings import HuggingFaceEmbeddings from llama_index.core import ( Document, @@ -270,18 +271,27 @@ def _load_handbook(self, gh_token: str) -> None: owner = "alan-turing-institute" repo = "REG-handbook" - handbook_loader = GithubRepositoryReader( - GithubClient(gh_token), - owner=owner, - repo=repo, - verbose=False, - concurrent_requests=1, - timeout=60, - retries=3, - filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE), - filter_directories=(["content"], GithubRepositoryReader.FilterType.INCLUDE), - ) - self.documents.extend(handbook_loader.load_data(branch="main")) + try: + handbook_loader = GithubRepositoryReader( + GithubClient(gh_token), + owner=owner, + repo=repo, + verbose=False, + concurrent_requests=1, + timeout=60, + retries=3, + filter_file_extensions=( + [".md"], + GithubRepositoryReader.FilterType.INCLUDE, + ), + filter_directories=( + ["content"], + GithubRepositoryReader.FilterType.INCLUDE, + ), + ) + self.documents.extend(handbook_loader.load_data(branch="main")) + except HTTPError as e: + logging.error(f"Failed to load REG Handbook: {e}") def _load_rse_course(self, gh_token: str) -> None: """ @@ -297,20 +307,23 @@ def _load_rse_course(self, gh_token: str) -> None: owner = "alan-turing-institute" repo = "rse-course" - rse_course_loader = GithubRepositoryReader( - GithubClient(gh_token), - owner=owner, - repo=repo, - verbose=False, - concurrent_requests=1, - timeout=60, - retries=3, - filter_file_extensions=( - [".md", ".ipynb"], - GithubRepositoryReader.FilterType.INCLUDE, - ), - ) - self.documents.extend(rse_course_loader.load_data(branch="main")) + try: + rse_course_loader = GithubRepositoryReader( + GithubClient(gh_token), + owner=owner, + repo=repo, + verbose=False, + concurrent_requests=1, + timeout=60, + retries=3, + filter_file_extensions=( + [".md", ".ipynb"], + GithubRepositoryReader.FilterType.INCLUDE, + ), + ) + self.documents.extend(rse_course_loader.load_data(branch="main")) + except HTTPError as e: + logging.error(f"Failed to load RSE course: {e}") def _load_rds_course(self, gh_token: str) -> None: """ @@ -326,20 +339,23 @@ def _load_rds_course(self, gh_token: str) -> None: owner = "alan-turing-institute" repo = "rds-course" - rds_course_loader = GithubRepositoryReader( - GithubClient(gh_token), - owner=owner, - repo=repo, - verbose=False, - concurrent_requests=1, - timeout=60, - retries=3, - filter_file_extensions=( - [".md", ".ipynb"], - GithubRepositoryReader.FilterType.INCLUDE, - ), - ) - self.documents.extend(rds_course_loader.load_data(branch="develop")) + try: + rds_course_loader = GithubRepositoryReader( + GithubClient(gh_token), + owner=owner, + repo=repo, + verbose=False, + concurrent_requests=1, + timeout=60, + retries=3, + filter_file_extensions=( + [".md", ".ipynb"], + GithubRepositoryReader.FilterType.INCLUDE, + ), + ) + self.documents.extend(rds_course_loader.load_data(branch="develop")) + except HTTPError as e: + logging.error(f"Failed to load RDS course: {e}") def _load_turing_way(self, gh_token: str) -> None: """ @@ -355,17 +371,23 @@ def _load_turing_way(self, gh_token: str) -> None: owner = "the-turing-way" repo = "the-turing-way" - turing_way_loader = GithubRepositoryReader( - GithubClient(gh_token), - owner=owner, - repo=repo, - verbose=False, - concurrent_requests=1, - timeout=60, - retries=3, - filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE), - ) - self.documents.extend(turing_way_loader.load_data(branch="main")) + try: + turing_way_loader = GithubRepositoryReader( + GithubClient(gh_token), + owner=owner, + repo=repo, + verbose=False, + concurrent_requests=1, + timeout=60, + retries=3, + filter_file_extensions=( + [".md"], + GithubRepositoryReader.FilterType.INCLUDE, + ), + ) + self.documents.extend(turing_way_loader.load_data(branch="main")) + except HTTPError as e: + logging.error(f"Failed to load The Turing Way: {e}") def _load_hut23(self, gh_token: str) -> None: """ @@ -381,47 +403,54 @@ def _load_hut23(self, gh_token: str) -> None: owner = "alan-turing-institute" repo = "Hut23" - # load repo - hut23_repo_loader = GithubRepositoryReader( - GithubClient(gh_token), - owner=owner, - repo=repo, - verbose=False, - concurrent_requests=1, - timeout=60, - retries=3, - filter_file_extensions=( - [".md", ".ipynb"], - GithubRepositoryReader.FilterType.INCLUDE, - ), - filter_directories=( - [ - "JDs", - "development", - "newsletters", - "objectives", - "project-appraisal", - "rfc", - "team-meetings", - ], # we can adjust these - GithubRepositoryReader.FilterType.INCLUDE, - ), - ) - self.documents.extend(hut23_repo_loader.load_data(branch="main")) + try: + # load repo + hut23_repo_loader = GithubRepositoryReader( + GithubClient(gh_token), + owner=owner, + repo=repo, + verbose=False, + concurrent_requests=1, + timeout=60, + retries=3, + filter_file_extensions=( + [".md", ".ipynb"], + GithubRepositoryReader.FilterType.INCLUDE, + ), + filter_directories=( + [ + "JDs", + "development", + "newsletters", + "objectives", + "project-appraisal", + "rfc", + "team-meetings", + ], # we can adjust these + GithubRepositoryReader.FilterType.INCLUDE, + ), + ) + self.documents.extend(hut23_repo_loader.load_data(branch="main")) + except HTTPError as e: + logging.error(f"Failed to load Hut23 repo: {e}") - # load_issues - hut23_issues_loader = GitHubRepositoryIssuesReader( - GitHubIssuesClient(gh_token), - owner=owner, - repo=repo, - verbose=True, - ) + try: + # load_issues + hut23_issues_loader = GitHubRepositoryIssuesReader( + GitHubIssuesClient(gh_token), + owner=owner, + repo=repo, + verbose=True, + ) + + issue_docs = hut23_issues_loader.load_data() + for doc in issue_docs: + doc.metadata["api_url"] = str(doc.metadata["url"]) + doc.metadata["url"] = doc.metadata["source"] + self.documents.extend(issue_docs) - issue_docs = hut23_issues_loader.load_data() - for doc in issue_docs: - doc.metadata["api_url"] = str(doc.metadata["url"]) - doc.metadata["url"] = doc.metadata["source"] - self.documents.extend(issue_docs) + except HTTPError as e: + logging.error(f"Failed to load Hut23 issues: {e}") # load collaborators # hut23_collaborators_loader = GitHubRepositoryCollaboratorsReader(