Skip to content

Commit

Permalink
Add try/except for GitHub readers
Browse files (browse the repository at this point in the history)
  • Loading branch information
rwood-97 committed Apr 8, 2024
1 parent 59ae021 commit e7e4c02
Showing 1 changed file with 119 additions and 90 deletions.
209 changes: 119 additions & 90 deletions reginald/models/models/llama_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import nest_asyncio
import pandas as pd
from git import Repo
from httpx import HTTPError
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import (
Document,
Expand Down Expand Up @@ -270,18 +271,27 @@ def _load_handbook(self, gh_token: str) -> None:
owner = "alan-turing-institute"
repo = "REG-handbook"

handbook_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
filter_directories=(["content"], GithubRepositoryReader.FilterType.INCLUDE),
)
self.documents.extend(handbook_loader.load_data(branch="main"))
try:
handbook_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md"],
GithubRepositoryReader.FilterType.INCLUDE,
),
filter_directories=(
["content"],
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(handbook_loader.load_data(branch="main"))
except HTTPError as e:
logging.error(f"Failed to load REG Handbook: {e}")

def _load_rse_course(self, gh_token: str) -> None:
"""
Expand All @@ -297,20 +307,23 @@ def _load_rse_course(self, gh_token: str) -> None:
owner = "alan-turing-institute"
repo = "rse-course"

rse_course_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md", ".ipynb"],
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(rse_course_loader.load_data(branch="main"))
try:
rse_course_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md", ".ipynb"],
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(rse_course_loader.load_data(branch="main"))
except HTTPError as e:
logging.error(f"Failed to load RSE course: {e}")

def _load_rds_course(self, gh_token: str) -> None:
"""
Expand All @@ -326,20 +339,23 @@ def _load_rds_course(self, gh_token: str) -> None:
owner = "alan-turing-institute"
repo = "rds-course"

rds_course_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md", ".ipynb"],
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(rds_course_loader.load_data(branch="develop"))
try:
rds_course_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md", ".ipynb"],
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(rds_course_loader.load_data(branch="develop"))
except HTTPError as e:
logging.error(f"Failed to load RDS course: {e}")

def _load_turing_way(self, gh_token: str) -> None:
"""
Expand All @@ -355,17 +371,23 @@ def _load_turing_way(self, gh_token: str) -> None:
owner = "the-turing-way"
repo = "the-turing-way"

turing_way_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
)
self.documents.extend(turing_way_loader.load_data(branch="main"))
try:
turing_way_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md"],
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(turing_way_loader.load_data(branch="main"))
except HTTPError as e:
logging.error(f"Failed to load The Turing Way: {e}")

def _load_hut23(self, gh_token: str) -> None:
"""
Expand All @@ -381,47 +403,54 @@ def _load_hut23(self, gh_token: str) -> None:
owner = "alan-turing-institute"
repo = "Hut23"

# load repo
hut23_repo_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md", ".ipynb"],
GithubRepositoryReader.FilterType.INCLUDE,
),
filter_directories=(
[
"JDs",
"development",
"newsletters",
"objectives",
"project-appraisal",
"rfc",
"team-meetings",
], # we can adjust these
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(hut23_repo_loader.load_data(branch="main"))
try:
# load repo
hut23_repo_loader = GithubRepositoryReader(
GithubClient(gh_token),
owner=owner,
repo=repo,
verbose=False,
concurrent_requests=1,
timeout=60,
retries=3,
filter_file_extensions=(
[".md", ".ipynb"],
GithubRepositoryReader.FilterType.INCLUDE,
),
filter_directories=(
[
"JDs",
"development",
"newsletters",
"objectives",
"project-appraisal",
"rfc",
"team-meetings",
], # we can adjust these
GithubRepositoryReader.FilterType.INCLUDE,
),
)
self.documents.extend(hut23_repo_loader.load_data(branch="main"))
except HTTPError as e:
logging.error(f"Failed to load Hut23 repo: {e}")

# load_issues
hut23_issues_loader = GitHubRepositoryIssuesReader(
GitHubIssuesClient(gh_token),
owner=owner,
repo=repo,
verbose=True,
)
try:
# load_issues
hut23_issues_loader = GitHubRepositoryIssuesReader(
GitHubIssuesClient(gh_token),
owner=owner,
repo=repo,
verbose=True,
)

issue_docs = hut23_issues_loader.load_data()
for doc in issue_docs:
doc.metadata["api_url"] = str(doc.metadata["url"])
doc.metadata["url"] = doc.metadata["source"]
self.documents.extend(issue_docs)

issue_docs = hut23_issues_loader.load_data()
for doc in issue_docs:
doc.metadata["api_url"] = str(doc.metadata["url"])
doc.metadata["url"] = doc.metadata["source"]
self.documents.extend(issue_docs)
except HTTPError as e:
logging.error(f"Failed to load Hut23 issues: {e}")

# load collaborators
# hut23_collaborators_loader = GitHubRepositoryCollaboratorsReader(
Expand Down

0 comments on commit e7e4c02

Please sign in to comment.