diff --git a/functions/src/events/scrapeEvents.ts b/functions/src/events/scrapeEvents.ts index cde11aa3f..7cc0b7ead 100644 --- a/functions/src/events/scrapeEvents.ts +++ b/functions/src/events/scrapeEvents.ts @@ -137,6 +137,76 @@ class SessionScraper extends EventScraper { } } +const submitTranscription = async ({ + EventId, + maybeVideoUrl +}: { + EventId: number + maybeVideoUrl: string +}) => { + const newToken = randomBytes(16).toString("hex") + + const transcript = await assembly.transcripts.submit({ + audio: + // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac", + maybeVideoUrl, + webhook_url: + // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription", + process.env.NODE_ENV === "development" + ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription" + : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription", + speaker_labels: true, + webhook_auth_header_name: "x-maple-webhook", + webhook_auth_header_value: newToken + }) + + await db + .collection("events") + .doc(`hearing-${String(EventId)}`) + .collection("private") + .doc("webhookAuth") + .set({ + videoAssemblyWebhookToken: sha256(newToken) + }) + + return transcript.id +} + +const getHearingVideoUrl = async (EventId: number) => { + const req = await fetch( + `https://malegislature.gov/Events/Hearings/Detail/${EventId}` + ) + const res = await req.text() + if (res) { + const dom = new JSDOM(res) + if (dom) { + const maybeVideoSource = + dom.window.document.querySelectorAll("video source") + if (maybeVideoSource.length && maybeVideoSource[0]) { + const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement + return firstVideoSource.src + } + } + } + return null +} + +const shouldScrapeVideo = async (EventId: number) => { + const eventInDb = await db + .collection("events") + .doc(`hearing-${String(EventId)}`) + .get() + const eventData = eventInDb.data() + + if (!eventData) { + return false + } + if (!eventData.videoFetchedAt) { + return withinCutoff(new Date(Hearing.check(eventData).startsAt.toDate())) + } + return false +} + class HearingScraper extends EventScraper { constructor() { super("every 60 minutes", 240) @@ -150,88 +220,33 @@ class HearingScraper extends EventScraper { async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) { const data = await api.getHearing(EventId) const content = HearingContent.check(data) - const eventInDb = await db - .collection("events") - .doc(`hearing-${String(EventId)}`) - .get() - const eventData = eventInDb.data() - const hearing = Hearing.check(eventData) - const shouldScrape = withinCutoff(hearing.startsAt.toDate()) - - let payload: Hearing = { + + if (await shouldScrapeVideo(EventId)) { + const maybeVideoUrl = await getHearingVideoUrl(EventId) + if (maybeVideoUrl) { + const transcriptId = await submitTranscription({ + maybeVideoUrl, + EventId + }) + + return { + id: `hearing-${EventId}`, + type: "hearing", + content, + ...this.timestamps(content), + videoURL: maybeVideoUrl, + videoFetchedAt: Timestamp.now(), + videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId + } as Hearing + } + } + + return { id: `hearing-${EventId}`, type: "hearing", content, ...this.timestamps(content) - } - if (hearing) { - payload = { - ...payload, - videoURL: hearing.videoURL, - videoFetchedAt: hearing.videoFetchedAt, - videoAssemblyId: hearing.videoAssemblyId - } - } - let maybeVideoURL = null - let transcript = null - - if (!hearing.videoFetchedAt && shouldScrape) { - const req = await fetch( - `https://malegislature.gov/Events/Hearings/Detail/${EventId}` - ) - const res = await req.text() - if (res) { - const dom = new JSDOM(res) - if (dom) { - const maybeVideoSource = - dom.window.document.querySelectorAll("video source") - if (maybeVideoSource.length && maybeVideoSource[0]) { - const newToken = randomBytes(16).toString("hex") - const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement - maybeVideoURL = firstVideoSource.src - - transcript = await assembly.transcripts.submit({ - webhook_url: - process.env.NODE_ENV === "development" - ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription" - : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription", - webhook_auth_header_name: "X-Maple-Webhook", - webhook_auth_header_value: newToken, - audio: firstVideoSource.src, - auto_highlights: true, - custom_topics: true, - entity_detection: true, - iab_categories: false, - format_text: true, - punctuate: true, - speaker_labels: true, - summarization: true, - summary_model: "informative", - summary_type: "bullets" - }) - - await db - .collection("events") - .doc(`hearing-${String(EventId)}`) - .collection("private") - .doc("webhookAuth") - .set({ - videoAssemblyWebhookToken: sha256(newToken) - }) - - payload = { - ...payload, - videoURL: maybeVideoURL, - videoFetchedAt: Timestamp.now(), - videoAssemblyId: transcript.id - } - } - } - } - } - - const event: Hearing = payload - return event + } as Hearing } } diff --git a/functions/src/webhooks/transcription.ts b/functions/src/webhooks/transcription.ts index c8181b2b3..ba64f66aa 100644 --- a/functions/src/webhooks/transcription.ts +++ b/functions/src/webhooks/transcription.ts @@ -1,6 +1,6 @@ import * as functions from "firebase-functions" import { AssemblyAI } from "assemblyai" -import { db } from "../firebase" +import { db, Timestamp } from "../firebase" import { sha256 } from "js-sha256" const assembly = new AssemblyAI({ @@ -8,50 +8,105 @@ const assembly = new AssemblyAI({ }) export const transcription = functions.https.onRequest(async (req, res) => { - if ( - req.headers["X-Maple-Webhook"] && - req.headers["webhook_auth_header_value"] - ) { + if (req.headers["x-maple-webhook"]) { if (req.body.status === "completed") { + // If we get a request with the right header and status, get the + // transcription from the assembly API. const transcript = await assembly.transcripts.get(req.body.transcript_id) if (transcript && transcript.webhook_auth) { - const maybeEventInDb = await db + // If there is a transcript and the transcript has an auth property, + // look for an event (aka Hearing) in the DB with a matching ID. + const maybeEventsInDb = await db .collection("events") - .where("videoAssemblyId", "==", transcript.id) + .where("videoTranscriptionId", "==", transcript.id) .get() - if (maybeEventInDb.docs.length) { - const authenticatedEventsInDb = maybeEventInDb.docs.filter( - async e => { - const hashedToken = sha256( - String(req.headers["webhook_auth_header_value"]) - ) - const tokenInDb = await db - .collection("events") - .doc(e.id) - .collection("private") - .doc("webhookAuth") - .get() - const tokenInDbData = tokenInDb.data() - if (tokenInDbData) { - return hashedToken === tokenInDbData.videoAssemblyWebhookToken - } - return false + if (maybeEventsInDb.docs.length) { + // If we have a match look for one that matches a hash of the token + // we gave Assembly. There should only be one of these but firestore + // gives us an array. If there is more than one member, something is + // wrong + const authenticatedEventIds = [] as string[] + const hashedToken = sha256(String(req.headers["x-maple-webhook"])) + + for (const index in maybeEventsInDb.docs) { + const doc = maybeEventsInDb.docs[index] + + const tokenDocInDb = await db + .collection("events") + .doc(doc.id) + .collection("private") + .doc("webhookAuth") + .get() + + const tokenDataInDb = tokenDocInDb.data()?.videoAssemblyWebhookToken + + if (hashedToken === tokenDataInDb) { + authenticatedEventIds.push(doc.id) } - ) - if (authenticatedEventsInDb) { + } + + // Log edge cases + if (maybeEventsInDb.docs.length === 0) { + console.log("No matching event in db.") + } + if (authenticatedEventIds.length === 0) { + console.log("No authenticated events in db.") + } + if (authenticatedEventIds.length > 1) { + console.log("More than one matching event in db.") + } + + if (authenticatedEventIds.length === 1) { + // If there is one authenticated event, pull out the parts we want to + // save and try to save them in the db. + const { id, text, audio_url, utterances } = transcript try { - await db + const transcriptionInDb = await db .collection("transcriptions") - .doc(transcript.id) - .set({ _timestamp: new Date(), ...transcript }) + .doc(id) - authenticatedEventsInDb.forEach(async d => { - await d.ref.update({ - ["webhook_auth_header_value"]: null - }) + await transcriptionInDb.set({ + id, + text, + createdAt: Timestamp.now(), + audio_url }) - console.log("transcript saved in db") + + // Put each `utterance` in a separate doc in an utterances + // collection. Previously had done the same for `words` but + // got worried about collection size and write times since + // `words` can be tens of thousands of members. + if (utterances) { + const writer = db.bulkWriter() + for (let utterance of utterances) { + const { speaker, confidence, start, end, text } = utterance + + writer.set( + db + .collection("transcriptions") + .doc(`${transcript.id}`) + .collection("utterances") + .doc(), + { speaker, confidence, start, end, text } + ) + } + + await writer.close() + } + + // Delete the hashed webhook auth token from our db now that + // we're done. + for (const index in authenticatedEventIds) { + await db + .collection("events") + .doc(authenticatedEventIds[index]) + .collection("private") + .doc("webhookAuth") + .set({ + videoAssemblyWebhookToken: null + }) + } } catch (error) { console.log(error) }