-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers.py
executable file
·139 lines (114 loc) · 4.67 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import datetime
import logging
import os
import re
import requests
import openai
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
# Folder names that to_transcript_location() and to_audio_location() join with
# a video ID to build cache file paths.
cached_transcripts_folder = "cached_transcripts"
cached_audio_folder = "cached_audio"
# The YouTube API key is read from the environment; importing this module
# raises AssertionError when the variable is unset.
# NOTE(review): assert statements are stripped when Python runs with -O, so
# this check disappears in optimized mode.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
assert YOUTUBE_API_KEY is not None
# Module-wide YouTube v3 API client, used by get_channel_id_from_username()
# and get_video_title().
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY, cache_discovery=False)
# Same fail-fast check for the OpenAI key, this time with an explicit message.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
assert OPENAI_API_KEY is not None, "OPENAI_API_KEY environment variable is not set."
openai.api_key = OPENAI_API_KEY
# NOTE(review): OpenAI() is constructed without an explicit api_key argument;
# presumably it picks the key up from the environment - confirm.
openai_client = openai.OpenAI()
def get_channel_id_locally(url):
    """
    Get the channel ID from the HTML of a channel page.

    Downloads ``url`` and reads the ``content`` attribute of the
    ``<meta itemprop="channelId">`` tag.

    Args:
        url: URL of the channel page to fetch.

    Returns:
        The channel ID, or None when the page has no such meta tag (or the
        tag has no ``content`` attribute).

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    meta_tag = soup.find("meta", itemprop="channelId")
    if meta_tag is None:
        # find() returns None when nothing matches; report it the same way
        # get_channel_id_from_username does instead of failing on a subscript.
        print("Channel not found")
        return None
    return meta_tag.get("content")
def get_channel_id_from_username(username):
    """
    Get the channel ID from a channel username. Uses Google API.

    Every channel ID in the response is printed; the first one is returned.

    Args:
        username: Value passed as ``forUsername`` to the channels list call.

    Returns:
        The ID of the first matching channel, or None (after printing
        "Channel not found") when the response contains no channels.
    """
    channels_response = youtube.channels().list(
        part="id",
        forUsername=username,
        maxResults=5
    ).execute()

    # Read "items" defensively: get_video_title in this module already treats
    # the key as optional, and a plain subscript would raise KeyError here.
    channels = channels_response.get("items") or []
    if not channels:
        print("Channel not found")
        return None

    for channel in channels:
        print(channel["id"])
    return channels[0]["id"]
def to_video_url(id: str) -> str:
    """Return the YouTube watch-page URL for the given video ID."""
    video_id = id
    return f"https://www.youtube.com/watch?v={video_id}"
def to_audio_location(id: str) -> str:
    """Return the path of the ``.mp3`` file for a video ID inside the audio cache folder."""
    audio_filename = id + ".mp3"
    return os.path.join(cached_audio_folder, audio_filename)
def to_transcript_location(id: str) -> str:
    """Return the path of the ``.txt`` file for a video ID inside the transcript cache folder."""
    transcript_filename = id + ".txt"
    return os.path.join(cached_transcripts_folder, transcript_filename)
# Matches watch, shorts and youtu.be links and captures the video ID.
# The "v" parameter must sit at a query-parameter boundary (right after "?"
# or after a "&"/";" separator) so that a parameter whose name merely ends in
# "v" (e.g. "dev=1") is not mistaken for the video ID.
_VIDEO_ID_PATTERN = re.compile(
    r'(?:https?://)?'  # Optional protocol
    r'(?:www\.|m\.)?'  # Optional www. or m.
    r'(?:youtube\.com/(?:watch[^?]*\?(?:.*?[&;])?v=|shorts/)|youtu\.be/)'  # watch or shorts or youtu.be
    r'([\w-]+)',  # The capturing group for VIDEO_ID
    re.IGNORECASE
)


def extract_video_id(youtube_link):
    """
    Extract the video ID from a YouTube link.

    Supports ``youtube.com/watch?...v=ID`` (with extra query parameters in
    any order), ``youtube.com/shorts/ID`` and ``youtu.be/ID``; the protocol
    and a ``www.``/``m.`` prefix are optional.

    Args:
        youtube_link: The link text to search.

    Returns:
        The video ID, or None (after printing an error) when the input is
        not a string or contains no recognizable YouTube link.
    """
    if not isinstance(youtube_link, str):
        print("Error: Invalid YouTube link")
        return None

    match = _VIDEO_ID_PATTERN.search(youtube_link)
    if match is None:
        print("Error: Invalid YouTube link")
        return None

    # The pattern has a single, non-empty capturing group.
    return match.group(1)
def get_video_title(video_id):
    """
    Look up a video's title through the YouTube API client.

    Args:
        video_id: ID passed as ``id`` to the videos list call.

    Returns:
        The title from the first item's snippet, or None after printing a
        diagnostic when the response has no 'items' key, the list is empty,
        the snippet has no 'title', or any exception is raised.
    """
    try:
        response = youtube.videos().list(part='snippet', id=video_id).execute()

        # Report each way the response can be unusable separately.
        if 'items' not in response:
            print("Error: 'items' field is missing from the API response.")
            return None

        items = response['items']
        if len(items) == 0:
            print("Error: No video found for the provided video ID. The 'items' list is empty.")
            return None

        title = items[0]['snippet'].get('title')
        if title is not None:
            return title

        print("Error: 'title' field is missing in the 'snippet' of the video data.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
def convert_date(date_str):
    """
    Convert a mm/dd/yyyy date string to the timestamp format used for the YouTube API.

    Returns None for an empty or missing input; otherwise the date is
    rendered as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    if date_str:
        parsed = datetime.datetime.strptime(date_str, "%m/%d/%Y")
        return parsed.strftime("%Y-%m-%dT%H:%M:%SZ")
    return None
# This is used for transcript text, where no markdown formatting is expected
def escape_all_markdown(text: str) -> str:
    """Return ``text`` with every special Markdown character prefixed by a backslash."""
    specials = ('*', '_', '`', '[', ']', '(', ')', '#', '+', '-', '!', '|', '$')
    # A single translation pass gives the same result as replacing each
    # character in turn, because the backslash is not itself in the list.
    escape_table = str.maketrans({symbol: '\\' + symbol for symbol in specials})
    return text.translate(escape_table)
# This is used for AI generated text, where some markdown (such as bullet
# points) is expected: '*', '#' and '-' are left alone, the other special
# characters are escaped.
def escape_unexpected_markdown(text: str) -> str:
    """Return ``text`` with the Markdown characters not expected in AI output backslash-escaped."""
    unexpected = {'_', '`', '[', ']', '(', ')', '+', '!', '|', '$'}
    return "".join(
        '\\' + symbol if symbol in unexpected else symbol
        for symbol in text
    )