transcripts.py
executable file · 88 lines (72 loc) · 3.28 KB
import logging
import math
import os
import time
import json
from typing import Optional, List

from redis_wrapper import value_cache as r  # QUEUE_NAME is defined locally below
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
QUEUE_NAME = "transcript_queue"
POLL_INTERVAL = 0.5       # seconds between polls
MAX_POLL_TIME = 60.0      # max seconds to wait for the transcript
MAX_JOBS_IN_FLIGHT = 5    # max queued or in_progress jobs allowed
def get_transcript(video_id: str, task_type: str):
    """
    Steps:
      1) Always check the "audio" transcript in cache first.
      2) Then check the "youtube" transcript in cache.
      3) If neither is found, enqueue a job (if not already queued) for the requested task_type.
      4) Poll for the result until found or time-out.
    """
    # --- 1) Check for audio transcript in Redis ---
    audio_key = f"transcript:audio:{video_id}"
    audio_data = r.get(audio_key)
    if audio_data:
        logging.info(f"[User] Found existing audio transcript in cache for video ID: {video_id}.")
        return audio_data

    # --- 2) If no audio transcript, check for youtube transcript in Redis ---
    youtube_key = f"transcript:youtube:{video_id}"
    youtube_data = r.get(youtube_key)
    if youtube_data:
        logging.info(f"[User] Found existing youtube transcript in cache for video ID: {video_id}.")
        return youtube_data

    jobs_in_queue = r.llen(QUEUE_NAME)
    if jobs_in_queue >= MAX_JOBS_IN_FLIGHT:
        logging.error(f"[User] Too many jobs in the queue ({jobs_in_queue}). Rejecting new job.")
        raise RuntimeError("Transcript queue is full. Please try again later.")

    # --- 3) If no transcript, see if job is already queued or in progress for the requested task_type ---
    status_key = f"transcript_status:{task_type}:{video_id}"
    status_val = r.get(status_key)
    if not status_val:
        # Mark job as queued
        r.set(status_key, "queued", ex=60 * 60)
        job_payload = {
            "video_id": video_id,
            "task_type": task_type  # "audio" or "youtube"
        }
        r.rpush(QUEUE_NAME, json.dumps(job_payload))
        logging.info(f"[User] Enqueued job for video ID: {video_id}, task type: {task_type}.")

    # --- 4) Poll for transcript, up to MAX_POLL_TIME seconds ---
    start_time = time.time()
    while True:
        # Even while polling, re-check both audio and youtube keys,
        # because worker might fall back automatically.
        audio_data = r.get(audio_key)
        if audio_data:
            logging.info(f"[User] Returning fallback audio transcript for video ID: {video_id}")
            return audio_data

        youtube_data = r.get(youtube_key)
        if youtube_data:
            logging.info(f"[User] Returning fallback youtube transcript for video ID: {video_id}")
            return youtube_data

        elapsed = time.time() - start_time
        if elapsed > MAX_POLL_TIME:
            logging.warning(f"[User] Timed out waiting for transcript for video ID: {video_id}")
            return None

        if (int(elapsed) % 5) == 0:
            logging.info(f"[User] Waiting for transcript... {int(elapsed)}s elapsed for video ID: {video_id}")

        time.sleep(POLL_INTERVAL)

    # If somehow we exit, we never found it
    return None
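For context, below is a minimal worker-side sketch of how jobs pushed onto transcript_queue could be consumed, assuming the same redis_wrapper.value_cache client and the key layout used above (transcript:{task_type}:{video_id} for results, transcript_status:{task_type}:{video_id} for job state). It is not part of this file; fetch_transcript() and the 24-hour result TTL are hypothetical placeholders for whatever transcription step and expiry policy the real worker uses.

# Worker-side sketch (not part of transcripts.py). Assumes the same redis_wrapper
# module used above; fetch_transcript() is a hypothetical placeholder.
import json
import logging
import time

from redis_wrapper import value_cache as r

QUEUE_NAME = "transcript_queue"


def fetch_transcript(video_id: str, task_type: str) -> str:
    """Hypothetical placeholder for the real audio/YouTube transcription step."""
    raise NotImplementedError


def worker_loop():
    while True:
        raw = r.lpop(QUEUE_NAME)  # jobs are enqueued with rpush, so lpop gives FIFO order
        if raw is None:
            time.sleep(0.5)       # queue is empty: back off briefly
            continue
        job = json.loads(raw)
        video_id, task_type = job["video_id"], job["task_type"]
        status_key = f"transcript_status:{task_type}:{video_id}"
        r.set(status_key, "in_progress", ex=60 * 60)
        try:
            transcript = fetch_transcript(video_id, task_type)
            # Writing this key is what unblocks the polling loop in get_transcript().
            r.set(f"transcript:{task_type}:{video_id}", transcript, ex=24 * 60 * 60)
            r.set(status_key, "done", ex=60 * 60)
        except Exception:
            logging.exception(f"[Worker] Transcription failed for video ID: {video_id}")
            r.set(status_key, "failed", ex=60 * 60)

A caller would then invoke get_transcript("abc123", "youtube") and either receive a cached or freshly produced transcript, or None once the 60-second polling window (MAX_POLL_TIME) expires.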