From 0091de761a8471d4f64cd09befe6159c71f709d5 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Tue, 7 Jan 2025 10:20:26 +0000 Subject: [PATCH 01/22] Add 'extension' field to metadata of scraped media --- packages/media-download/src/index.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/media-download/src/index.ts b/packages/media-download/src/index.ts index 0e103024..3d942b67 100644 --- a/packages/media-download/src/index.ts +++ b/packages/media-download/src/index.ts @@ -36,6 +36,9 @@ const uploadToS3 = async ( Bucket: bucket, Key: key, Body: fileStream, + Metadata: { + extension: metadata.extension, + }, }, }); From a2dcfd349cd7ac7ee7506643b7371f1268c29569 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 11:12:45 +0000 Subject: [PATCH 02/22] Add functionality to export original source media to google drive --- packages/api/src/export.ts | 111 ++++++++++ packages/api/src/index.ts | 152 ++++++++----- packages/api/src/services/googleDrive.ts | 114 ++++++++-- packages/backend-common/src/dynamodb.ts | 22 +- packages/backend-common/src/lambda.ts | 1 + packages/backend-common/src/s3.ts | 38 ++++ .../client/src/components/ExportButton.tsx | 206 +++++++++++------- packages/client/src/services/export.ts | 53 ++++- packages/common/src/types.ts | 26 ++- packages/media-download/src/index.ts | 3 +- 10 files changed, 564 insertions(+), 162 deletions(-) create mode 100644 packages/api/src/export.ts create mode 100644 packages/backend-common/src/lambda.ts diff --git a/packages/api/src/export.ts b/packages/api/src/export.ts new file mode 100644 index 00000000..0f0d1d23 --- /dev/null +++ b/packages/api/src/export.ts @@ -0,0 +1,111 @@ +import { + downloadObject, + getObjectSize, + getObjectText, + isS3Failure, + logger, + TranscriptionConfig, + TranscriptionDynamoItem, +} from '@guardian/transcription-service-backend-common'; +import { ZTokenResponse } from '@guardian/transcription-service-common'; +import { + uploadFileToGoogleDrive, + uploadToGoogleDocs, +} from './services/googleDrive'; +import { S3Client } from '@aws-sdk/client-s3'; +import { LAMBDA_MAX_EPHEMERAL_STORAGE_BYTES } from '@guardian/transcription-service-backend-common/src/lambda'; +import { docs_v1, drive_v3 } from 'googleapis'; +import Drive = drive_v3.Drive; +import Docs = docs_v1.Docs; + +export const exportMediaToDrive = async ( + config: TranscriptionConfig, + s3Client: S3Client, + item: TranscriptionDynamoItem, + oAuthTokenResponse: ZTokenResponse, + folderId: string, +): Promise<{ statusCode: number; fileId?: string; message?: string }> => { + const mediaSize = await getObjectSize( + s3Client, + config.app.sourceMediaBucket, + item.id, + ); + if (mediaSize && mediaSize > LAMBDA_MAX_EPHEMERAL_STORAGE_BYTES) { + const msg = `Media file too large to export to google drive. Please manually download the file and upload using the google drive UI`; + return { + statusCode: 400, + message: msg, + }; + } + const filePath = `/tmp/${item.id.split('/')[1]}`; + const { extension } = await downloadObject( + s3Client, + config.app.sourceMediaBucket, + item.id, + filePath, + ); + + const mimeType = 'application/octet-stream'; + + const id = await uploadFileToGoogleDrive( + `${item.originalFilename}.${extension || 'mp4'}`, + oAuthTokenResponse, + filePath, + mimeType, + folderId, + ); + return { + fileId: id, + statusCode: 200, + }; +}; + +export const exportTranscriptToDoc = async ( + config: TranscriptionConfig, + s3Client: S3Client, + item: TranscriptionDynamoItem, + format: 'srt' | 'text', + folderId: string, + drive: Drive, + docs: Docs, +): Promise<{ statusCode: number; message?: string; documentId?: string }> => { + const transcriptS3Key = item.transcriptKeys[format]; + const transcriptText = await getObjectText( + s3Client, + config.app.transcriptionOutputBucket, + transcriptS3Key, + ); + if (isS3Failure(transcriptText)) { + if (transcriptText.failureReason === 'NoSuchKey') { + const msg = `Failed to export transcript - file has expired. Please re-upload the file and try again.`; + return { + statusCode: 410, + message: msg, + }; + } + const msg = `Failed to fetch transcript. Please contact the digital investigations team for support`; + return { + statusCode: 500, + message: msg, + }; + } + const exportResult = await uploadToGoogleDocs( + drive, + docs, + folderId, + `${item.originalFilename} transcript${item.isTranslation ? ' (English translation)' : ''}`, + transcriptText.text, + ); + if (!exportResult) { + const msg = `Failed to create google document for item with id ${item.id}`; + logger.error(msg); + return { + statusCode: 500, + message: msg, + }; + } + return { + statusCode: 200, + documentId: exportResult, + }; +}; diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index ac8be361..3fd13308 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -13,11 +13,9 @@ import { getSQSClient, generateOutputSignedUrlAndSendMessage, isSqsFailure, - isS3Failure, getSignedDownloadUrl, getObjectMetadata, logger, - getObjectText, getS3Client, sendMessage, } from '@guardian/transcription-service-backend-common'; @@ -28,15 +26,17 @@ import { transcribeFileRequestBody, transcribeUrlRequestBody, MediaDownloadJob, + CreateFolderRequest, + ExportResponse, } from '@guardian/transcription-service-common'; import type { SignedUrlResponseBody } from '@guardian/transcription-service-common'; import { getDynamoClient, getTranscriptionItem, - TranscriptionDynamoItem, } from '@guardian/transcription-service-backend-common/src/dynamodb'; -import { createTranscriptDocument } from './services/googleDrive'; +import { createExportFolder, getDriveClients } from './services/googleDrive'; import { v4 as uuid4 } from 'uuid'; +import { exportMediaToDrive, exportTranscriptToDoc } from './export'; const runningOnAws = process.env['AWS_EXECUTION_ENV']; const emulateProductionLocally = @@ -55,6 +55,10 @@ const getApp = async () => { ); const s3Client = getS3Client(config.aws.region); + const dynamoClient = getDynamoClient( + config.aws.region, + config.aws.localstackEndpoint, + ); app.use(bodyParser.json({ limit: '40mb' })); app.use(passport.initialize()); @@ -190,79 +194,125 @@ const getApp = async () => { }), ]); - apiRouter.post('/export', [ + apiRouter.post('/export/create-folder', [ checkAuth, asyncHandler(async (req, res) => { - const exportRequest = TranscriptExportRequest.safeParse(req.body); - const dynamoClient = getDynamoClient( - config.aws.region, - config.aws.localstackEndpoint, + const createRequest = CreateFolderRequest.safeParse(req.body); + if (!createRequest.success) { + const msg = `Failed to parse create folder request ${createRequest.error.message}`; + logger.error(msg); + res.status(400).send(msg); + return; + } + const { item, errorMessage } = await getTranscriptionItem( + dynamoClient, + 'transcription-service-CODE', + createRequest.data.transcriptId, + ); + if (!item) { + res.status(500).send(errorMessage); + return; + } + const driveClients = await getDriveClients( + config, + createRequest.data.oAuthTokenResponse, ); + const folderId = await createExportFolder( + driveClients.drive, + item.originalFilename, + ); + if (!folderId) { + res.status(500).send('Failed to create folder'); + return; + } + res.send(folderId); + }), + ]); + + apiRouter.post('/export/export', [ + checkAuth, + asyncHandler(async (req, res) => { + const exportRequest = TranscriptExportRequest.safeParse(req.body); if (!exportRequest.success) { const msg = `Failed to parse export request ${exportRequest.error.message}`; logger.error(msg); res.status(400).send(msg); return; } - const item = await getTranscriptionItem( + const { item, errorMessage } = await getTranscriptionItem( dynamoClient, - config.app.tableName, + 'transcription-service-CODE', exportRequest.data.id, ); if (!item) { - const msg = `Failed to fetch item with id ${exportRequest.data.id} from database.`; - logger.error(msg); - res.status(500).send(msg); - return; - } - const parsedItem = TranscriptionDynamoItem.safeParse(item); - if (!parsedItem.success) { - const msg = `Failed to parse item ${exportRequest.data.id} from dynamodb. Error: ${parsedItem.error.message}`; - logger.error(msg); - res.status(500).send(msg); + res.status(500).send(errorMessage); return; } - if (parsedItem.data.userEmail !== req.user?.email) { + if (item.userEmail !== req.user?.email) { // users can only export their own transcripts. Return a 404 to avoid leaking information about other users' transcripts logger.warn( - `User ${req.user?.email} attempted to export transcript ${parsedItem.data.id} which does not belong to them.`, + `User ${req.user?.email} attempted to export transcript ${item.id} which does not belong to them.`, ); res.status(404).send(`Transcript not found`); return; } - - const transcriptS3Key = - parsedItem.data.transcriptKeys[exportRequest.data.transcriptFormat]; - const transcriptText = await getObjectText( - s3Client, - config.app.transcriptionOutputBucket, - transcriptS3Key, + const driveClients = await getDriveClients( + config, + exportRequest.data.oAuthTokenResponse, ); - if (isS3Failure(transcriptText)) { - if (transcriptText.failureReason === 'NoSuchKey') { - const msg = `Failed to export transcript - file has expired. Please re-upload the file and try again.`; - res.status(410).send(msg); + const exportResult: ExportResponse = { + textDocumentId: undefined, + srtDocumentId: undefined, + sourceMediaFileId: undefined, + }; + if (exportRequest.data.items.transcriptText) { + const textResult = await exportTranscriptToDoc( + config, + s3Client, + item, + 'text', + exportRequest.data.folderId, + driveClients.drive, + driveClients.docs, + ); + if (textResult.statusCode !== 200) { + res.status(textResult.statusCode).send(textResult.message); return; } - const msg = `Failed to fetch transcript. Please contact the digital investigations team for support`; - res.status(500).send(msg); - return; + exportResult.textDocumentId = textResult.documentId; } - const exportResult = await createTranscriptDocument( - config, - `${parsedItem.data.originalFilename} transcript${parsedItem.data.isTranslation ? ' (English translation)' : ''}`, - exportRequest.data.oAuthTokenResponse, - transcriptText.text, - ); - if (!exportResult) { - const msg = `Failed to create google document for item with id ${parsedItem.data.id}`; - logger.error(msg); - res.status(500).send(msg); - return; + if (exportRequest.data.items.transcriptSrt) { + const srtResult = await exportTranscriptToDoc( + config, + s3Client, + item, + 'srt', + exportRequest.data.folderId, + driveClients.drive, + driveClients.docs, + ); + if (srtResult.statusCode !== 200) { + res.status(srtResult.statusCode).send(srtResult.message); + return; + } + exportResult.srtDocumentId = srtResult.documentId; } - res.send({ - documentId: exportResult, - }); + if (exportRequest.data.items.sourceMedia) { + const mediaResult = await exportMediaToDrive( + config, + s3Client, + item, + exportRequest.data.oAuthTokenResponse, + exportRequest.data.folderId, + ); + if (mediaResult.statusCode !== 200) { + logger.error('Failed to export media to google drive'); + res.status(mediaResult.statusCode).send(mediaResult.message); + return; + } + exportResult.sourceMediaFileId = mediaResult.fileId; + } + res.send(JSON.stringify(exportResult)); return; }), ]); diff --git a/packages/api/src/services/googleDrive.ts b/packages/api/src/services/googleDrive.ts index 0edba46f..5952ed23 100644 --- a/packages/api/src/services/googleDrive.ts +++ b/packages/api/src/services/googleDrive.ts @@ -5,19 +5,25 @@ import { TranscriptionConfig, } from '@guardian/transcription-service-backend-common'; import { ZTokenResponse } from '@guardian/transcription-service-common'; +import * as fs from 'node:fs'; +import Drive = drive_v3.Drive; -export const getOrCreateTranscriptFolder = async ( +const ROOT_FOLDER_NAME = 'Guardian Transcribe Tool'; + +export const getOrCreateFolder = async ( drive: drive_v3.Drive, folderName: string, + parentId?: string, ) => { const fileMetadata = { name: folderName, mimeType: 'application/vnd.google-apps.folder', + parents: parentId ? [parentId] : [], }; try { // first see if there is already a folder matching folderName const existingFolders = await drive.files.list({ - q: `mimeType='${fileMetadata.mimeType}' and name ='${folderName}' and trashed=false`, + q: `mimeType='${fileMetadata.mimeType}' and name ='${folderName}' and trashed=false ${parentId ? `and '${parentId}' in parents` : ''}`, spaces: 'drive', }); // there could be multiple folders with this name, let's upload to the first one @@ -96,32 +102,106 @@ export const uploadToGoogleDocs = async ( return createResponse.data.id; }; -export const createTranscriptDocument = async ( +export const getDriveClients = async ( config: TranscriptionConfig, - fileName: string, oAuthTokenResponse: ZTokenResponse, - transcriptText: string, ) => { const oAuth2Client: OAuth2Client = new google.auth.OAuth2(config.auth); oAuth2Client.setCredentials(oAuthTokenResponse); const drive = google.drive({ version: 'v3', auth: oAuth2Client }); const docs = google.docs({ version: 'v1', auth: oAuth2Client }); - - const folderId = await getOrCreateTranscriptFolder( + return { drive, - 'Guardian Transcribe Tool', - ); + docs, + }; +}; + +export const createExportFolder = async (drive: Drive, name: string) => { + const rootFolderId = await getOrCreateFolder(drive, ROOT_FOLDER_NAME); + if (!rootFolderId) { + logger.error(`Failed to get or create root folder '${ROOT_FOLDER_NAME}'`); + return undefined; + } + const folderId = await getOrCreateFolder(drive, name, rootFolderId); if (!folderId) { - logger.error('Failed to get or create folder'); + logger.error(`Failed to get or create folder '${name}'`); return undefined; } - const docId = await uploadToGoogleDocs( - drive, - docs, - folderId, - fileName, - transcriptText, + return folderId; +}; + +export const uploadFileToGoogleDrive = async ( + fileName: string, + oAuthTokenResponse: ZTokenResponse, + filePath: string, + mimeType: string, + folderId: string, +) => { + const fileSize = fs.statSync(filePath).size; + + const startResumableSessionResponse = await fetch( + 'https://www.googleapis.com/upload/drive/v3/files?uploadType=resumable', + { + method: 'POST', + headers: { + 'X-Upload-Content-Length': `${fileSize}`, + 'X-Upload-Content-Type': mimeType, + 'Content-Type': 'application/json', + Authorization: `Bearer ${oAuthTokenResponse.access_token}`, + }, + body: JSON.stringify({ + name: fileName, + mimeType, + parents: [folderId], + }), + }, ); - return docId; + + const uploadUrl = startResumableSessionResponse.headers.get('location'); + + if (!uploadUrl) { + throw new Error('Failed to start resumable upload session'); + } + + //when changing this value consider the amount of memory allocated to the API lambda function + const CHUNK_SIZE = 128 * 1024 * 1024; // 128MB - + const fileStream = fs.createReadStream(filePath, { + highWaterMark: CHUNK_SIZE, + }); + + let offset = 0; + + for await (const chunk of fileStream) { + const chunkSize = chunk.length; + const range = `bytes ${offset}-${offset + chunkSize - 1}/${fileSize}`; + + logger.info( + `Uploading chunk: ${range} (Upload ${Math.floor((offset / fileSize) * 100)}% complete)`, + ); + + const response = await fetch(uploadUrl, { + method: 'PUT', + headers: { + 'Content-Range': range, + 'Content-Length': chunkSize, + }, + body: chunk, + }); + + if (response.ok) { + // Response status is 308 until the final chunk. Final response includes file metadata + return ((await response.json()) as { id: string }).id; + } + if (response.status !== 308) { + const text = await response.text(); + logger.error(`Received ${response.status} from google, error: ${text}`); + throw new Error( + `Failed to upload chunk: ${response.status} ${response.statusText}`, + ); + } + + offset += chunkSize; + } + throw new Error('Failed to upload file'); }; diff --git a/packages/backend-common/src/dynamodb.ts b/packages/backend-common/src/dynamodb.ts index 3a59f4e8..6c6dc9c6 100644 --- a/packages/backend-common/src/dynamodb.ts +++ b/packages/backend-common/src/dynamodb.ts @@ -62,7 +62,7 @@ export const writeTranscriptionItem = async ( } }; -export const getTranscriptionItem = async ( +export const getItem = async ( client: DynamoDBDocumentClient, tableName: string, itemId: string, @@ -81,3 +81,23 @@ export const getTranscriptionItem = async ( return undefined; } }; + +export const getTranscriptionItem = async ( + client: DynamoDBDocumentClient, + tableName: string, + itemId: string, +): Promise<{ item?: TranscriptionDynamoItem; errorMessage?: string }> => { + const item = await getItem(client, tableName, itemId); + if (!item) { + const msg = `Failed to fetch item with id ${itemId} from database.`; + logger.error(msg); + return { errorMessage: msg }; + } + const parsedItem = TranscriptionDynamoItem.safeParse(item); + if (!parsedItem.success) { + const msg = `Failed to parse item ${itemId} from dynamodb. Error: ${parsedItem.error.message}`; + logger.error(msg); + return { errorMessage: msg }; + } + return { item: parsedItem.data }; +}; diff --git a/packages/backend-common/src/lambda.ts b/packages/backend-common/src/lambda.ts new file mode 100644 index 00000000..931ea2bf --- /dev/null +++ b/packages/backend-common/src/lambda.ts @@ -0,0 +1 @@ +export const LAMBDA_MAX_EPHEMERAL_STORAGE_BYTES = 10240 * 1024 * 1024; diff --git a/packages/backend-common/src/s3.ts b/packages/backend-common/src/s3.ts index 895a16d6..a9bf8f6f 100644 --- a/packages/backend-common/src/s3.ts +++ b/packages/backend-common/src/s3.ts @@ -129,6 +129,28 @@ export const getObjectWithPresignedUrl = async ( return destinationPath; }; +export const downloadObject = async ( + client: S3Client, + bucket: string, + key: string, + destinationPath: string, +) => { + const data = await client.send( + new GetObjectCommand({ + Bucket: bucket, + Key: key, + }), + ); + if (!data.Body) { + throw new Error(`Failed to retrieve object ${key} from bucket ${bucket}`); + } + await downloadS3Data(data.Body as Readable, destinationPath, key); + return { + destinationPath, + extension: data.Metadata?.['extension'], + }; +}; + const downloadS3Data = async ( data: Readable, destinationPath: string, @@ -169,3 +191,19 @@ export const getObjectMetadata = async ( return; } }; + +export const mediaKey = (id: string) => `downloaded-media/${id}`; + +export const getObjectSize = async ( + client: S3Client, + bucket: string, + key: string, +) => { + const data = await client.send( + new HeadObjectCommand({ + Bucket: bucket, + Key: key, + }), + ); + return data.ContentLength; +}; diff --git a/packages/client/src/components/ExportButton.tsx b/packages/client/src/components/ExportButton.tsx index 2940d2b8..42e1ab18 100644 --- a/packages/client/src/components/ExportButton.tsx +++ b/packages/client/src/components/ExportButton.tsx @@ -1,34 +1,37 @@ import React, { useContext, useState } from 'react'; -import { - ExportResponse, - TranscriptFormat, -} from '@guardian/transcription-service-common'; +import { ExportResponse } from '@guardian/transcription-service-common'; import { AuthContext } from '@/app/template'; import Script from 'next/script'; import { useSearchParams } from 'next/navigation'; -import { exportTranscript } from '@/services/export'; import { - ArrowTopRightOnSquareIcon, - DocumentTextIcon, -} from '@heroicons/react/16/solid'; + createExportFolder, + exportTranscript, + getOAuthToken, +} from '@/services/export'; +import { ArrowTopRightOnSquareIcon } from '@heroicons/react/16/solid'; import { RequestStatus } from '@/types'; import { InfoMessage } from '@/components/InfoMessage'; -import { Alert, CustomFlowbiteTheme, Dropdown, Flowbite } from 'flowbite-react'; +import { + Alert, + Checkbox, + CustomFlowbiteTheme, + Flowbite, + Label, +} from 'flowbite-react'; const ExportButton = () => { const { token } = useContext(AuthContext); const searchParams = useSearchParams(); - const [docId, setDocId] = useState(); - const [loading, setLoading] = useState(false); + const [folderId, setFolderId] = useState(); + const [creatingFolder, setCreatingFolder] = useState(false); + const [exporting, setExporting] = useState(false); const [failureMessage, setFailureMessage] = useState(''); - const [transcriptFormat, setTranscriptFormat] = - useState(null); - const [transcriptFormatValid, setTranscriptFormatValid] = useState< - boolean | undefined - >(undefined); const [requestStatus, setRequestStatus] = useState( RequestStatus.Ready, ); + const [exportText, setExportText] = useState(true); + const [exportSrt, setExportSrt] = useState(false); + const [exportMedia, setExportMedia] = useState(false); // TODO: once we have some CSS/component library, tidy up this messy error handling if (!token) { return ( @@ -36,12 +39,6 @@ const ExportButton = () => { ); } - const transcriptFormatDescription: Record = { - srt: 'Srt (with time code)', - text: 'Text', - json: 'Json', - }; - const transcriptId = searchParams.get('transcriptId'); if (!transcriptId) { return ( @@ -60,7 +57,7 @@ const ExportButton = () => { /> ); } - if (loading) { + if (creatingFolder) { return ( { /> ); } - if (docId) { + if (folderId) { return ( - - - + + + ); } const exportHandler = async () => { - if (!transcriptFormat) { - setTranscriptFormatValid(false); - return; - } - setLoading(true); + setCreatingFolder(true); + setExporting(true); try { - const response = await exportTranscript( + const tokenResponse = await getOAuthToken(token); + const createFolderResponse = await createExportFolder( token, + tokenResponse, transcriptId, - transcriptFormat, ); - setLoading(false); - if (response && response.status !== 200) { - const text = await response.text(); + if (createFolderResponse.status !== 200) { + const text = await createFolderResponse.text(); setFailureMessage(text); setRequestStatus(RequestStatus.Failed); return; } - const json = await response.json(); + const folderId = await createFolderResponse.text(); + setCreatingFolder(false); + setFolderId(folderId); + const exportResponse = await exportTranscript( + token, + tokenResponse, + transcriptId, + { + transcriptText: exportText, + transcriptSrt: exportSrt, + sourceMedia: exportMedia, + }, + folderId, + ); + if (exportResponse.status !== 200) { + const text = await exportResponse.text(); + setFailureMessage(text); + setRequestStatus(RequestStatus.Failed); + return; + } + const json = await exportResponse.json(); const parsedResponse = ExportResponse.safeParse(json); if (!parsedResponse.success) { console.error('Failed to parse export response', parsedResponse.error); @@ -113,7 +140,7 @@ const ExportButton = () => { ); return; } - setDocId(parsedResponse.data.documentId); + setExporting(false); setRequestStatus(RequestStatus.Success); } catch (error) { console.error('Export failed', error); @@ -130,42 +157,63 @@ const ExportButton = () => { }, }; + const atLeastOneExport = () => exportText || exportSrt || exportMedia; + return ( <>
- - { - setTranscriptFormat(TranscriptFormat.TEXT); - setTranscriptFormatValid(true); - }} - > - {transcriptFormatDescription[TranscriptFormat.TEXT]} - - - { - setTranscriptFormat(TranscriptFormat.SRT); - setTranscriptFormatValid(true); - }} - > - {transcriptFormatDescription[TranscriptFormat.SRT]} - - +
+
+

+ Exported items will be saved in the same folder in google drive +

+ +
+ setExportText(!exportText)} + /> + +
+
+ setExportSrt(!exportSrt)} + /> + +
+
+
+ setExportMedia(!exportMedia)} + /> +
+
+ +
+ + Max 10GB, roughly 3 hours of video + +
+
+
+ - {transcriptFormatValid === false ? ( + {!atLeastOneExport() ? ( - A transcript format must be chosen! + Please select at least one item for export ) : null} @@ -173,11 +221,15 @@ const ExportButton = () => { ); diff --git a/packages/client/src/services/export.ts b/packages/client/src/services/export.ts index b7e5d7c4..07be9ecf 100644 --- a/packages/client/src/services/export.ts +++ b/packages/client/src/services/export.ts @@ -1,9 +1,11 @@ import { ClientConfig, + CreateFolderRequest, + ExportItems, TranscriptExportRequest, - TranscriptFormat, } from '@guardian/transcription-service-common'; import { authFetch } from '@/helpers'; +import TokenResponse = google.accounts.oauth2.TokenResponse; const getClientConfig = async (authToken: string): Promise => { const configResp = await authFetch('/api/client-config', authToken); @@ -46,11 +48,9 @@ const promiseInitTokenClient = ( }); }; -export const exportTranscript = async ( +export const getOAuthToken = async ( authToken: string, - transcriptId: string, - transcriptFormat: TranscriptFormat, -): Promise => { +): Promise => { const config = await getClientConfig(authToken); const driveFileScope = 'https://www.googleapis.com/auth/drive.file'; @@ -59,20 +59,57 @@ export const exportTranscript = async ( config.googleClientId, driveFileScope, ); + return tokenResponse; +}; + +export const createExportFolder = async ( + authToken: string, + tokenResponse: TokenResponse, + transcriptId: string, +) => { + const createFolderRequest: CreateFolderRequest = { + transcriptId: transcriptId, + // @ts-expect-error (return object from google isn't actually a TokenResponse, our zod type is more accurate) + oAuthTokenResponse: tokenResponse, + }; + + const createFolderResponse = await authFetch( + '/api/export/create-folder', + authToken, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(createFolderRequest), + }, + ); + + return createFolderResponse; +}; +export const exportTranscript = async ( + authToken: string, + tokenResponse: TokenResponse, + transcriptId: string, + items: ExportItems, + folderId: string, +): Promise => { const exportRequest: TranscriptExportRequest = { id: transcriptId, // @ts-expect-error (return object from google isn't actually a TokenResponse, our zod type is more accurate) oAuthTokenResponse: tokenResponse, - transcriptFormat, + items, + folderId, }; - const exportResponse = await authFetch('/api/export', authToken, { + // we don't await here so that the caller (JSX component) can carry on updating the UI + const exportPromise = authFetch('/api/export/export', authToken, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify(exportRequest), }); - return exportResponse; + return exportPromise; }; diff --git a/packages/common/src/types.ts b/packages/common/src/types.ts index 37781b23..6931b811 100644 --- a/packages/common/src/types.ts +++ b/packages/common/src/types.ts @@ -147,22 +147,34 @@ export const ZTokenResponse = z.object({ export type ZTokenResponse = z.infer; -export enum TranscriptFormat { - SRT = 'srt', - TEXT = 'text', - JSON = 'json', -} +export const ExportItems = z.object({ + transcriptText: z.boolean(), + transcriptSrt: z.boolean(), + sourceMedia: z.boolean(), +}); + +export type ExportItems = z.infer; export const TranscriptExportRequest = z.object({ id: z.string(), oAuthTokenResponse: ZTokenResponse, - transcriptFormat: z.nativeEnum(TranscriptFormat), + items: ExportItems, + folderId: z.string(), }); export type TranscriptExportRequest = z.infer; +export const CreateFolderRequest = z.object({ + transcriptId: z.string(), + oAuthTokenResponse: ZTokenResponse, +}); + +export type CreateFolderRequest = z.infer; + export const ExportResponse = z.object({ - documentId: z.string(), + textDocumentId: z.optional(z.string()), + srtDocumentId: z.optional(z.string()), + sourceMediaFileId: z.optional(z.string()), }); export type ExportResponse = z.infer; diff --git a/packages/media-download/src/index.ts b/packages/media-download/src/index.ts index 3d942b67..42c2f3f9 100644 --- a/packages/media-download/src/index.ts +++ b/packages/media-download/src/index.ts @@ -5,6 +5,7 @@ import { getSQSClient, isSqsFailure, logger, + mediaKey, sendMessage, TranscriptionConfig, } from '@guardian/transcription-service-backend-common'; @@ -28,7 +29,7 @@ const uploadToS3 = async ( id: string, ) => { const fileStream = createReadStream(`${metadata.mediaPath}`); - const key = `downloaded-media/${id}`; + const key = mediaKey(id); try { const upload = new Upload({ client: s3Client, From f083b134d99518cb10517b942783fc0e56f30c1a Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 11:30:06 +0000 Subject: [PATCH 03/22] Increase ephemeral storage of lambda to 10gb, memory to 512mb, to allow for downloading/uploading large files --- .../transcription-service.test.ts.snap | 17 ++++++++++++++++- packages/cdk/lib/transcription-service.ts | 3 +++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap b/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap index a5183898..a47a7a8c 100644 --- a/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap +++ b/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap @@ -1798,8 +1798,15 @@ service transcription-service-worker start", }, }, "Memory": 4096, + "MountPoints": [ + { + "ContainerPath": "/media-download", + "ReadOnly": false, + "SourceVolume": "media-download-volume", + }, + ], "Name": "media-download-task-TaskContainer", - "ReadonlyRootFilesystem": false, + "ReadonlyRootFilesystem": true, }, ], "Cpu": "2048", @@ -1850,6 +1857,11 @@ service transcription-service-worker start", "Arn", ], }, + "Volumes": [ + { + "Name": "media-download-volume", + }, + ], }, "Type": "AWS::ECS::TaskDefinition", }, @@ -2112,6 +2124,9 @@ service transcription-service-worker start", "STAGE": "TEST", }, }, + "EphemeralStorage": { + "Size": 10240, + }, "Handler": "index.api", "LoggingConfig": { "LogFormat": "JSON", diff --git a/packages/cdk/lib/transcription-service.ts b/packages/cdk/lib/transcription-service.ts index f4d2d0b0..2a7c3774 100644 --- a/packages/cdk/lib/transcription-service.ts +++ b/packages/cdk/lib/transcription-service.ts @@ -30,6 +30,7 @@ import { Duration, Fn, RemovalPolicy, + Size, Tags, } from 'aws-cdk-lib'; import { EndpointType } from 'aws-cdk-lib/aws-apigateway'; @@ -199,6 +200,8 @@ export class TranscriptionService extends GuStack { noMonitoring: true, }, app: `${APP_NAME}-api`, + ephemeralStorageSize: Size.mebibytes(10240), + memorySize: 512, api: { id: apiId, description: 'API for transcription service frontend', From 5c6b98c660e3bf4bb53f0a969ebfabb0d98ec163 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 15:22:12 +0000 Subject: [PATCH 04/22] Set originalfilename and extension metadata on files uploaded via the ui to s3 --- packages/api/src/index.ts | 10 ++++++++- packages/backend-common/src/s3.ts | 22 ++++++++++++++----- packages/client/src/components/UploadForm.tsx | 10 ++++++++- packages/common/src/types.ts | 6 +++++ 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index 3fd13308..f35c4b69 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -28,6 +28,7 @@ import { MediaDownloadJob, CreateFolderRequest, ExportResponse, + signedUrlRequestBody, } from '@guardian/transcription-service-common'; import type { SignedUrlResponseBody } from '@guardian/transcription-service-common'; import { @@ -317,9 +318,15 @@ const getApp = async () => { }), ]); - apiRouter.get('/signed-url', [ + apiRouter.post('/signed-url', [ checkAuth, asyncHandler(async (req, res) => { + const parsedRequest = signedUrlRequestBody.safeParse(req.body); + if (!parsedRequest.success) { + res.status(400).send('Invalid request'); + return; + } + const s3Key = uuid4(); const presignedS3Url = await getSignedUploadUrl( config.aws.region, @@ -328,6 +335,7 @@ const getApp = async () => { 60, true, s3Key, + parsedRequest.data.fileName, ); res.set('Cache-Control', 'no-cache'); diff --git a/packages/backend-common/src/s3.ts b/packages/backend-common/src/s3.ts index a9bf8f6f..4fbbe85c 100644 --- a/packages/backend-common/src/s3.ts +++ b/packages/backend-common/src/s3.ts @@ -31,19 +31,29 @@ export const getSignedUploadUrl = ( userEmail: string, expiresIn: number, useAccelerateEndpoint: boolean, - id?: string, -) => - getSignedUrlSdk( + id: string, + fileName?: string, +) => { + const metadata = { + 'user-email': userEmail, + }; + const metadataWithFilename = fileName + ? { + ...metadata, + originalFilename: fileName, + extension: path.extname(fileName), + } + : metadata; + return getSignedUrlSdk( getS3Client(region, useAccelerateEndpoint), new PutObjectCommand({ Bucket: bucket, Key: id, - Metadata: { - 'user-email': userEmail, - }, + Metadata: metadataWithFilename, }), { expiresIn }, // override default expiration time of 15 minutes ); +}; export const getSignedDownloadUrl = async ( region: string, diff --git a/packages/client/src/components/UploadForm.tsx b/packages/client/src/components/UploadForm.tsx index 44846070..74d6b6dc 100644 --- a/packages/client/src/components/UploadForm.tsx +++ b/packages/client/src/components/UploadForm.tsx @@ -56,7 +56,15 @@ const uploadFileAndTranscribe = async ( ) => { const blob = new Blob([file as BlobPart]); - const response = await authFetch(`/api/signed-url`, token); + const response = await authFetch(`/api/signed-url`, token, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + fileName: file.name, + }), + }); if (!response) { console.error('Failed to fetch signed url'); return false; diff --git a/packages/common/src/types.ts b/packages/common/src/types.ts index 6931b811..e0769c74 100644 --- a/packages/common/src/types.ts +++ b/packages/common/src/types.ts @@ -197,6 +197,12 @@ export type TranscribeFileRequestBody = z.infer< typeof transcribeFileRequestBody >; +export const signedUrlRequestBody = z.object({ + fileName: z.string(), +}); + +export type SignedUrlRequestBody = z.infer; + export const inputBucketObjectMetadata = z.object({ 'user-email': z.string(), }); From de469a0dffc6d1b7883e63725d61eb61a78ac809 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 15:41:40 +0000 Subject: [PATCH 05/22] Prevent double . in filename --- packages/backend-common/src/s3.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/backend-common/src/s3.ts b/packages/backend-common/src/s3.ts index 4fbbe85c..a398f988 100644 --- a/packages/backend-common/src/s3.ts +++ b/packages/backend-common/src/s3.ts @@ -41,7 +41,7 @@ export const getSignedUploadUrl = ( ? { ...metadata, originalFilename: fileName, - extension: path.extname(fileName), + extension: path.extname(fileName).replace('.', ''), } : metadata; return getSignedUrlSdk( From 9126bbe67102c3f7a8cab9997e04cf12f50a5e5c Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 15:46:37 +0000 Subject: [PATCH 06/22] Tidy up filename --- packages/api/src/export.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/api/src/export.ts b/packages/api/src/export.ts index 0f0d1d23..7e17490f 100644 --- a/packages/api/src/export.ts +++ b/packages/api/src/export.ts @@ -47,8 +47,15 @@ export const exportMediaToDrive = async ( const mimeType = 'application/octet-stream'; + // default to mp4 on the assumption that most media exported will be video + const extensionOrMp4 = extension || 'mp4'; + + const fileName = item.originalFilename.endsWith(`.${extensionOrMp4}`) + ? item.originalFilename + : `${item.originalFilename}.${extensionOrMp4 || 'mp4'}`; + const id = await uploadFileToGoogleDrive( - `${item.originalFilename}.${extension || 'mp4'}`, + fileName, oAuthTokenResponse, filePath, mimeType, @@ -93,7 +100,7 @@ export const exportTranscriptToDoc = async ( drive, docs, folderId, - `${item.originalFilename} transcript${item.isTranslation ? ' (English translation)' : ''}`, + `${item.originalFilename} transcript${format === 'srt' ? ' with timecodes' : ''} ${item.isTranslation ? ' (English translation)' : ''}`, transcriptText.text, ); if (!exportResult) { From 60ab235d4c03b1082485bb893285b129b1d2ce9a Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 16:39:18 +0000 Subject: [PATCH 07/22] Show links to individual files when export complete --- .../client/src/components/ExportButton.tsx | 65 ++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/packages/client/src/components/ExportButton.tsx b/packages/client/src/components/ExportButton.tsx index 42e1ab18..9f988ee9 100644 --- a/packages/client/src/components/ExportButton.tsx +++ b/packages/client/src/components/ExportButton.tsx @@ -17,8 +17,40 @@ import { CustomFlowbiteTheme, Flowbite, Label, + List, } from 'flowbite-react'; +const getDriveLink = (id: string, docType: 'document' | 'file') => { + return docType === 'document' + ? `https://docs.google.com/document/d/${id}` + : `https://drive.google.com/file/d/${id}`; +}; + +const makeFileLinks = ( + exportResponse: ExportResponse, +): { url: string; text: string }[] => { + const links = []; + if (exportResponse.textDocumentId) { + links.push({ + url: getDriveLink(exportResponse.textDocumentId, 'document'), + text: 'Transcript text', + }); + } + if (exportResponse.srtDocumentId) { + links.push({ + url: getDriveLink(exportResponse.srtDocumentId, 'document'), + text: 'Transcript text with timecodes (SRT)', + }); + } + if (exportResponse.sourceMediaFileId) { + links.push({ + url: getDriveLink(exportResponse.sourceMediaFileId, 'file'), + text: 'Original source media', + }); + } + return links; +}; + const ExportButton = () => { const { token } = useContext(AuthContext); const searchParams = useSearchParams(); @@ -32,6 +64,8 @@ const ExportButton = () => { const [exportText, setExportText] = useState(true); const [exportSrt, setExportSrt] = useState(false); const [exportMedia, setExportMedia] = useState(false); + const [exportResponse, setExportResponse] = useState({}); + // TODO: once we have some CSS/component library, tidy up this messy error handling if (!token) { return ( @@ -51,7 +85,7 @@ const ExportButton = () => { if (requestStatus === RequestStatus.Failed) { return ( @@ -70,13 +104,39 @@ const ExportButton = () => { if (folderId) { return ( <> - {exporting && ( + {exporting ? ( + ) : ( +
+ +
+ + {makeFileLinks(exportResponse).map( + ({ url, text }: { url: string; text: string }) => ( + + + {text} + + + ), + )} + +
+
)} { return; } setExporting(false); + setExportResponse(parsedResponse.data); setRequestStatus(RequestStatus.Success); } catch (error) { console.error('Export failed', error); From 56327a030683f585db3bb87c141b22dacb3db4a6 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 16:41:15 +0000 Subject: [PATCH 08/22] Rename ExportButton ExportForm --- packages/client/src/app/export/page.tsx | 4 ++-- .../src/components/{ExportButton.tsx => ExportForm.tsx} | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename packages/client/src/components/{ExportButton.tsx => ExportForm.tsx} (99%) diff --git a/packages/client/src/app/export/page.tsx b/packages/client/src/app/export/page.tsx index 7f9da70a..8c8c3213 100644 --- a/packages/client/src/app/export/page.tsx +++ b/packages/client/src/app/export/page.tsx @@ -1,11 +1,11 @@ 'use client'; import React from 'react'; -import ExportButton from '@/components/ExportButton'; +import ExportForm from '@/components/ExportForm'; const Export = () => { return (
- +
); }; diff --git a/packages/client/src/components/ExportButton.tsx b/packages/client/src/components/ExportForm.tsx similarity index 99% rename from packages/client/src/components/ExportButton.tsx rename to packages/client/src/components/ExportForm.tsx index 9f988ee9..2ee1cd12 100644 --- a/packages/client/src/components/ExportButton.tsx +++ b/packages/client/src/components/ExportForm.tsx @@ -51,7 +51,7 @@ const makeFileLinks = ( return links; }; -const ExportButton = () => { +const ExportForm = () => { const { token } = useContext(AuthContext); const searchParams = useSearchParams(); const [folderId, setFolderId] = useState(); @@ -296,4 +296,4 @@ const ExportButton = () => { ); }; -export default ExportButton; +export default ExportForm; From a5f4b4cb3d4b9c13ded24644a593f2d9dbaf2431 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 16:52:08 +0000 Subject: [PATCH 09/22] Fix dynamo table name --- packages/api/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index f35c4b69..9a9ca914 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -207,7 +207,7 @@ const getApp = async () => { } const { item, errorMessage } = await getTranscriptionItem( dynamoClient, - 'transcription-service-CODE', + config.app.tableName, createRequest.data.transcriptId, ); if (!item) { @@ -242,7 +242,7 @@ const getApp = async () => { } const { item, errorMessage } = await getTranscriptionItem( dynamoClient, - 'transcription-service-CODE', + config.app.tableName, exportRequest.data.id, ); if (!item) { From 7a78c1f7a352cde5b3790e0cd535e7b50f8a0510 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 17:08:16 +0000 Subject: [PATCH 10/22] Include date and time in folder name --- packages/api/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index 9a9ca914..d9475aaa 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -220,7 +220,7 @@ const getApp = async () => { ); const folderId = await createExportFolder( driveClients.drive, - item.originalFilename, + `${item.originalFilename} ${new Date().toISOString().slice(0, 16).replace('T', ' ')}`, ); if (!folderId) { res.status(500).send('Failed to create folder'); From 7144de275a6c87182256e5637e5cde4002ac4c3a Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Wed, 8 Jan 2025 18:34:44 +0000 Subject: [PATCH 11/22] Bump lambda timeout to 15 minutes --- .../cdk/lib/__snapshots__/transcription-service.test.ts.snap | 2 +- packages/cdk/lib/transcription-service.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap b/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap index a47a7a8c..b39483f4 100644 --- a/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap +++ b/packages/cdk/lib/__snapshots__/transcription-service.test.ts.snap @@ -2161,7 +2161,7 @@ service transcription-service-worker start", "Value": "TEST", }, ], - "Timeout": 30, + "Timeout": 900, }, "Type": "AWS::Lambda::Function", }, diff --git a/packages/cdk/lib/transcription-service.ts b/packages/cdk/lib/transcription-service.ts index 2a7c3774..728dab53 100644 --- a/packages/cdk/lib/transcription-service.ts +++ b/packages/cdk/lib/transcription-service.ts @@ -202,6 +202,7 @@ export class TranscriptionService extends GuStack { app: `${APP_NAME}-api`, ephemeralStorageSize: Size.mebibytes(10240), memorySize: 512, + timeout: Duration.seconds(900), api: { id: apiId, description: 'API for transcription service frontend', From b31667c549adc78eccca55f884296dbc2576fb67 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Thu, 9 Jan 2025 12:25:19 +0000 Subject: [PATCH 12/22] Refactor so that export returns immediately then client polls status endpoint --- packages/api/src/export.ts | 48 +++-- packages/api/src/index.ts | 135 +++++++------ packages/backend-common/src/dynamodb.ts | 2 + packages/client/src/components/ExportForm.tsx | 177 ++++++++++++------ packages/common/src/types.ts | 49 +++-- 5 files changed, 279 insertions(+), 132 deletions(-) diff --git a/packages/api/src/export.ts b/packages/api/src/export.ts index 7e17490f..a92a7cee 100644 --- a/packages/api/src/export.ts +++ b/packages/api/src/export.ts @@ -7,7 +7,13 @@ import { TranscriptionConfig, TranscriptionDynamoItem, } from '@guardian/transcription-service-backend-common'; -import { ZTokenResponse } from '@guardian/transcription-service-common'; +import { + ExportItems, + ExportStatus, + ExportStatuses, + ExportType, + ZTokenResponse, +} from '@guardian/transcription-service-common'; import { uploadFileToGoogleDrive, uploadToGoogleDocs, @@ -24,7 +30,7 @@ export const exportMediaToDrive = async ( item: TranscriptionDynamoItem, oAuthTokenResponse: ZTokenResponse, folderId: string, -): Promise<{ statusCode: number; fileId?: string; message?: string }> => { +): Promise => { const mediaSize = await getObjectSize( s3Client, config.app.sourceMediaBucket, @@ -33,7 +39,8 @@ export const exportMediaToDrive = async ( if (mediaSize && mediaSize > LAMBDA_MAX_EPHEMERAL_STORAGE_BYTES) { const msg = `Media file too large to export to google drive. Please manually download the file and upload using the google drive UI`; return { - statusCode: 400, + exportType: 'source-media', + status: 'failure', message: msg, }; } @@ -62,8 +69,9 @@ export const exportMediaToDrive = async ( folderId, ); return { - fileId: id, - statusCode: 200, + exportType: 'source-media', + id, + status: 'success', }; }; @@ -75,7 +83,7 @@ export const exportTranscriptToDoc = async ( folderId: string, drive: Drive, docs: Docs, -): Promise<{ statusCode: number; message?: string; documentId?: string }> => { +): Promise => { const transcriptS3Key = item.transcriptKeys[format]; const transcriptText = await getObjectText( s3Client, @@ -86,14 +94,16 @@ export const exportTranscriptToDoc = async ( if (transcriptText.failureReason === 'NoSuchKey') { const msg = `Failed to export transcript - file has expired. Please re-upload the file and try again.`; return { - statusCode: 410, + status: 'failure', message: msg, + exportType: format, }; } const msg = `Failed to fetch transcript. Please contact the digital investigations team for support`; return { - statusCode: 500, + status: 'failure', message: msg, + exportType: format, }; } const exportResult = await uploadToGoogleDocs( @@ -107,12 +117,28 @@ export const exportTranscriptToDoc = async ( const msg = `Failed to create google document for item with id ${item.id}`; logger.error(msg); return { - statusCode: 500, + status: 'failure', message: msg, + exportType: format, }; } return { - statusCode: 200, - documentId: exportResult, + status: 'success', + id: exportResult, + exportType: format, }; }; + +export const exportStatusInProgress = (items: ExportItems): ExportStatuses => { + return items.map((item: ExportType) => ({ + status: 'in-progress', + exportType: item, + })); +}; + +export const updateStatus = ( + status: ExportStatus, + statuses: ExportStatuses, +): ExportStatuses => { + return statuses.map((s) => (s.exportType === status.exportType ? status : s)); +}; diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts index d9475aaa..7a93429d 100644 --- a/packages/api/src/index.ts +++ b/packages/api/src/index.ts @@ -18,6 +18,7 @@ import { logger, getS3Client, sendMessage, + writeTranscriptionItem, } from '@guardian/transcription-service-backend-common'; import { ClientConfig, @@ -27,8 +28,10 @@ import { transcribeUrlRequestBody, MediaDownloadJob, CreateFolderRequest, - ExportResponse, signedUrlRequestBody, + ExportStatus, + ExportStatuses, + ExportType, } from '@guardian/transcription-service-common'; import type { SignedUrlResponseBody } from '@guardian/transcription-service-common'; import { @@ -37,7 +40,13 @@ import { } from '@guardian/transcription-service-backend-common/src/dynamodb'; import { createExportFolder, getDriveClients } from './services/googleDrive'; import { v4 as uuid4 } from 'uuid'; -import { exportMediaToDrive, exportTranscriptToDoc } from './export'; +import { + exportMediaToDrive, + exportStatusInProgress, + exportTranscriptToDoc, + updateStatus, +} from './export'; +import { DynamoDBDocumentClient } from '@aws-sdk/lib-dynamodb'; const runningOnAws = process.env['AWS_EXECUTION_ENV']; const emulateProductionLocally = @@ -56,7 +65,7 @@ const getApp = async () => { ); const s3Client = getS3Client(config.aws.region); - const dynamoClient = getDynamoClient( + const dynamoClient: DynamoDBDocumentClient = getDynamoClient( config.aws.region, config.aws.localstackEndpoint, ); @@ -195,6 +204,30 @@ const getApp = async () => { }), ]); + apiRouter.get('/export/status', [ + checkAuth, + asyncHandler(async (req, res) => { + const id = req.query.id as string; + if (!id) { + res + .status(400) + .send('You must provide the transcript id in the query string'); + return; + } + const { item, errorMessage } = await getTranscriptionItem( + dynamoClient, + config.app.tableName, + id, + ); + if (!item) { + res.status(500).send(errorMessage); + return; + } + res.send(JSON.stringify(item.exportStatus)); + return; + }), + ]); + apiRouter.post('/export/create-folder', [ checkAuth, asyncHandler(async (req, res) => { @@ -261,59 +294,51 @@ const getApp = async () => { config, exportRequest.data.oAuthTokenResponse, ); - const exportResult: ExportResponse = { - textDocumentId: undefined, - srtDocumentId: undefined, - sourceMediaFileId: undefined, - }; - if (exportRequest.data.items.transcriptText) { - const textResult = await exportTranscriptToDoc( - config, - s3Client, - item, - 'text', - exportRequest.data.folderId, - driveClients.drive, - driveClients.docs, - ); - if (textResult.statusCode !== 200) { - res.status(textResult.statusCode).send(textResult.message); - return; - } - exportResult.textDocumentId = textResult.documentId; - } - if (exportRequest.data.items.transcriptSrt) { - const srtResult = await exportTranscriptToDoc( - config, - s3Client, - item, - 'srt', - exportRequest.data.folderId, - driveClients.drive, - driveClients.docs, - ); - if (srtResult.statusCode !== 200) { - res.status(srtResult.statusCode).send(srtResult.message); - return; - } - exportResult.srtDocumentId = srtResult.documentId; - } - if (exportRequest.data.items.sourceMedia) { - const mediaResult = await exportMediaToDrive( - config, - s3Client, - item, - exportRequest.data.oAuthTokenResponse, - exportRequest.data.folderId, + let currentStatuses: ExportStatuses = exportStatusInProgress( + exportRequest.data.items, + ); + await writeTranscriptionItem(dynamoClient, config.app.tableName, { + ...item, + exportStatus: currentStatuses, + }); + const exportPromises: Promise[] = exportRequest.data.items + .map((exportType: ExportType) => { + if (exportType === 'text' || exportType === 'srt') { + return exportTranscriptToDoc( + config, + s3Client, + item, + exportType, + exportRequest.data.folderId, + driveClients.drive, + driveClients.docs, + ); + } else { + return exportMediaToDrive( + config, + s3Client, + item, + exportRequest.data.oAuthTokenResponse, + exportRequest.data.folderId, + ); + } + }) + .map((exportResult: Promise) => + exportResult.then(async (result: ExportStatus) => { + if (result.status === 'failure') { + logger.error(result.message); + } else { + logger.info(`Transcript ${result.exportType} export complete`); + } + currentStatuses = updateStatus(result, currentStatuses); + await writeTranscriptionItem(dynamoClient, config.app.tableName, { + ...item, + exportStatus: currentStatuses, + }); + }), ); - if (mediaResult.statusCode !== 200) { - logger.error('Failed to export media to google drive'); - res.status(mediaResult.statusCode).send(mediaResult.message); - return; - } - exportResult.sourceMediaFileId = mediaResult.fileId; - } - res.send(JSON.stringify(exportResult)); + res.send(JSON.stringify(currentStatuses)); + await Promise.all(exportPromises); return; }), ]); diff --git a/packages/backend-common/src/dynamodb.ts b/packages/backend-common/src/dynamodb.ts index 6c6dc9c6..3f665e5f 100644 --- a/packages/backend-common/src/dynamodb.ts +++ b/packages/backend-common/src/dynamodb.ts @@ -7,6 +7,7 @@ import { import { z } from 'zod'; import { logger } from '@guardian/transcription-service-backend-common'; +import { ExportStatus } from '@guardian/transcription-service-common'; export const getDynamoClient = ( region: string, @@ -39,6 +40,7 @@ export const TranscriptionDynamoItem = z.object({ userEmail: z.string(), completedAt: z.optional(z.string()), // dynamodb can't handle dates so we need to use an ISO date isTranslation: z.boolean(), + exportStatus: z.optional(z.array(ExportStatus)), }); export type TranscriptionDynamoItem = z.infer; diff --git a/packages/client/src/components/ExportForm.tsx b/packages/client/src/components/ExportForm.tsx index 2ee1cd12..b3b96785 100644 --- a/packages/client/src/components/ExportForm.tsx +++ b/packages/client/src/components/ExportForm.tsx @@ -1,5 +1,9 @@ import React, { useContext, useState } from 'react'; -import { ExportResponse } from '@guardian/transcription-service-common'; +import { + ExportStatus, + ExportStatuses, + ExportType, +} from '@guardian/transcription-service-common'; import { AuthContext } from '@/app/template'; import Script from 'next/script'; import { useSearchParams } from 'next/navigation'; @@ -19,36 +23,40 @@ import { Label, List, } from 'flowbite-react'; +import { authFetch } from '@/helpers'; -const getDriveLink = (id: string, docType: 'document' | 'file') => { - return docType === 'document' - ? `https://docs.google.com/document/d/${id}` - : `https://drive.google.com/file/d/${id}`; +const getDriveLink = (id: string, exportType: ExportType) => { + return exportType === 'source-media' + ? `https://drive.google.com/file/d/${id}` + : `https://docs.google.com/document/d/${id}`; }; -const makeFileLinks = ( - exportResponse: ExportResponse, -): { url: string; text: string }[] => { - const links = []; - if (exportResponse.textDocumentId) { - links.push({ - url: getDriveLink(exportResponse.textDocumentId, 'document'), - text: 'Transcript text', - }); - } - if (exportResponse.srtDocumentId) { - links.push({ - url: getDriveLink(exportResponse.srtDocumentId, 'document'), - text: 'Transcript text with timecodes (SRT)', - }); +const getExportTypeText = (exportType: ExportType) => { + switch (exportType) { + case 'source-media': + return 'Original source media'; + case 'text': + return 'Transcript text'; + case 'srt': + return 'Transcript text with timecodes (SRT)'; + default: + return 'Unknown export type'; } - if (exportResponse.sourceMediaFileId) { - links.push({ - url: getDriveLink(exportResponse.sourceMediaFileId, 'file'), - text: 'Original source media', - }); +}; + +const updateExportTypes = ( + type: ExportType, + value: boolean, + currentExportTypes: ExportType[], +) => { + if (value) { + if (!currentExportTypes.includes(type)) { + return [...currentExportTypes, type]; + } + return currentExportTypes; + } else { + return currentExportTypes.filter((currentType) => currentType !== type); } - return links; }; const ExportForm = () => { @@ -61,10 +69,10 @@ const ExportForm = () => { const [requestStatus, setRequestStatus] = useState( RequestStatus.Ready, ); - const [exportText, setExportText] = useState(true); - const [exportSrt, setExportSrt] = useState(false); - const [exportMedia, setExportMedia] = useState(false); - const [exportResponse, setExportResponse] = useState({}); + const [exportTypesRequested, setExportTypesRequested] = useState< + ExportType[] + >(['text']); + const [exportStatuses, setExportStatuses] = useState([]); // TODO: once we have some CSS/component library, tidy up this messy error handling if (!token) { @@ -119,21 +127,32 @@ const ExportForm = () => { />
- {makeFileLinks(exportResponse).map( - ({ url, text }: { url: string; text: string }) => ( - + {exportStatuses.map((status: ExportStatus) => ( + + {status.status === 'success' && ( - {text} + {getExportTypeText(status.exportType)} - - ), - )} + )} + {status.status === 'failure' && ( + + {getExportTypeText(status.exportType)} export failed + + )} + {status.status === 'in-progress' && ( + + {getExportTypeText(status.exportType)} export in + progress + + )} + + ))}
@@ -154,6 +173,38 @@ const ExportForm = () => { ); } + const updateStatuses = async () => { + const statusResponse = await authFetch( + `export/status?id=${transcriptId}`, + token, + ); + if (statusResponse.status === 200) { + const json = await statusResponse.json(); + const parsedResponse = ExportStatuses.safeParse(json); + if (!parsedResponse.success) { + console.error( + 'Failed to parse export status response', + parsedResponse.error, + ); + return; + } + const statuses = parsedResponse.data.map( + (status: ExportStatus) => status.status, + ); + if (statuses.includes('in-progress')) { + setTimeout(updateStatuses, 2000); + } else { + if (statuses.includes('failure')) { + setRequestStatus(RequestStatus.Failed); + setFailureMessage('One or more exports failed'); + return; + } + setRequestStatus(RequestStatus.Success); + } + setExportStatuses(parsedResponse.data); + } + }; + const exportHandler = async () => { setCreatingFolder(true); setExporting(true); @@ -177,11 +228,7 @@ const ExportForm = () => { token, tokenResponse, transcriptId, - { - transcriptText: exportText, - transcriptSrt: exportSrt, - sourceMedia: exportMedia, - }, + exportTypesRequested, folderId, ); if (exportResponse.status !== 200) { @@ -191,7 +238,7 @@ const ExportForm = () => { return; } const json = await exportResponse.json(); - const parsedResponse = ExportResponse.safeParse(json); + const parsedResponse = ExportStatuses.safeParse(json); if (!parsedResponse.success) { console.error('Failed to parse export response', parsedResponse.error); setRequestStatus(RequestStatus.Failed); @@ -201,8 +248,8 @@ const ExportForm = () => { return; } setExporting(false); - setExportResponse(parsedResponse.data); - setRequestStatus(RequestStatus.Success); + await updateStatuses(); + setExportStatuses(parsedResponse.data); } catch (error) { console.error('Export failed', error); setFailureMessage("'Authentication with Google failed'"); @@ -218,7 +265,7 @@ const ExportForm = () => { }, }; - const atLeastOneExport = () => exportText || exportSrt || exportMedia; + const atLeastOneExport = () => exportTypesRequested.length > 0; return ( <> @@ -238,16 +285,32 @@ const ExportForm = () => {
setExportText(!exportText)} + checked={exportTypesRequested.includes('text')} + onChange={(e) => + setExportTypesRequested( + updateExportTypes( + 'text', + e.target.checked, + exportTypesRequested, + ), + ) + } />
setExportSrt(!exportSrt)} + checked={exportTypesRequested.includes('srt')} + onChange={(e) => + setExportTypesRequested( + updateExportTypes( + 'srt', + e.target.checked, + exportTypesRequested, + ), + ) + } />