codeforboston · Mephistic · Apr 22, 2025 · Mar 26, 2025 · Mar 27, 2025 · Apr 5, 2025
@@ -137,6 +137,76 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
   }
 }
 
+const submitTranscription = async ({
+  EventId,
+  maybeVideoUrl
+}: {
+  EventId: number
+  maybeVideoUrl: string
+}) => {
+  const newToken = randomBytes(16).toString("hex")
+
+  const transcript = await assembly.transcripts.submit({
+    audio:
+      // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
+      maybeVideoUrl,
+    webhook_url:
+      // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
+      process.env.NODE_ENV === "development"
+        ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
+        : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription",
+    speaker_labels: true,
+    webhook_auth_header_name: "x-maple-webhook",
+    webhook_auth_header_value: newToken
+  })
+
+  await db
+    .collection("events")
+    .doc(`hearing-${String(EventId)}`)
+    .collection("private")
+    .doc("webhookAuth")
+    .set({
+      videoAssemblyWebhookToken: sha256(newToken)
+    })
+
+  return transcript.id
+}
+
+const getHearingVideoUrl = async (EventId: number) => {
+  const req = await fetch(
+    `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
+  )
+  const res = await req.text()
+  if (res) {
+    const dom = new JSDOM(res)
+    if (dom) {
+      const maybeVideoSource =
+        dom.window.document.querySelectorAll("video source")
+      if (maybeVideoSource.length && maybeVideoSource[0]) {
+        const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement
+        return firstVideoSource.src
+      }
+    }
+  }
+  return null
+}
+
+const shouldScrapeVideo = async (EventId: number) => {
+  const eventInDb = await db
+    .collection("events")
+    .doc(`hearing-${String(EventId)}`)
+    .get()
+  const eventData = eventInDb.data()
+
+  if (!eventData) {
+    return false
+  }
+  if (!eventData.videoFetchedAt) {
+    return withinCutoff(new Date(Hearing.check(eventData).startsAt.toDate()))
+  }
+  return false
+}
+
 class HearingScraper extends EventScraper<HearingListItem, Hearing> {
   constructor() {
     super("every 60 minutes", 240)
@@ -150,88 +220,33 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
   async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) {
     const data = await api.getHearing(EventId)
     const content = HearingContent.check(data)
-    const eventInDb = await db
-      .collection("events")
-      .doc(`hearing-${String(EventId)}`)
-      .get()
-    const eventData = eventInDb.data()
-    const hearing = Hearing.check(eventData)
-    const shouldScrape = withinCutoff(hearing.startsAt.toDate())
-
-    let payload: Hearing = {
+
+    if (await shouldScrapeVideo(EventId)) {
+      const maybeVideoUrl = await getHearingVideoUrl(EventId)
+      if (maybeVideoUrl) {
+        const transcriptId = await submitTranscription({
+          maybeVideoUrl,
+          EventId
+        })
+
+        return {
+          id: `hearing-${EventId}`,
+          type: "hearing",
+          content,
+          ...this.timestamps(content),
+          videoURL: maybeVideoUrl,
+          videoFetchedAt: Timestamp.now(),
+          videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
+        } as Hearing
+      }
+    }
+
+    return {
       id: `hearing-${EventId}`,
       type: "hearing",
       content,
       ...this.timestamps(content)
-    }
-    if (hearing) {
-      payload = {
-        ...payload,
-        videoURL: hearing.videoURL,
-        videoFetchedAt: hearing.videoFetchedAt,
-        videoAssemblyId: hearing.videoAssemblyId
-      }
-    }
-    let maybeVideoURL = null
-    let transcript = null
-
-    if (!hearing.videoFetchedAt && shouldScrape) {
-      const req = await fetch(
-        `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
-      )
-      const res = await req.text()
-      if (res) {
-        const dom = new JSDOM(res)
-        if (dom) {
-          const maybeVideoSource =
-            dom.window.document.querySelectorAll("video source")
-          if (maybeVideoSource.length && maybeVideoSource[0]) {
-            const newToken = randomBytes(16).toString("hex")
-            const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement
-            maybeVideoURL = firstVideoSource.src
-
-            transcript = await assembly.transcripts.submit({
-              webhook_url:
-                process.env.NODE_ENV === "development"
-                  ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
-                  : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription",
-              webhook_auth_header_name: "X-Maple-Webhook",
-              webhook_auth_header_value: newToken,
-              audio: firstVideoSource.src,
-              auto_highlights: true,
-              custom_topics: true,
-              entity_detection: true,
-              iab_categories: false,
-              format_text: true,
-              punctuate: true,
-              speaker_labels: true,
-              summarization: true,
-              summary_model: "informative",
-              summary_type: "bullets"
-            })
-
-            await db
-              .collection("events")
-              .doc(`hearing-${String(EventId)}`)
-              .collection("private")
-              .doc("webhookAuth")
-              .set({
-                videoAssemblyWebhookToken: sha256(newToken)
-              })
-
-            payload = {
-              ...payload,
-              videoURL: maybeVideoURL,
-              videoFetchedAt: Timestamp.now(),
-              videoAssemblyId: transcript.id
-            }
-          }
-        }
-      }
-    }
-
-    const event: Hearing = payload
-    return event
+    } as Hearing
   }
 }
 

@@ -1,57 +1,112 @@
 import * as functions from "firebase-functions"
 import { AssemblyAI } from "assemblyai"
-import { db } from "../firebase"
+import { db, Timestamp } from "../firebase"
 import { sha256 } from "js-sha256"
 
 const assembly = new AssemblyAI({
   apiKey: process.env.ASSEMBLY_API_KEY ? process.env.ASSEMBLY_API_KEY : ""
 })
 
 export const transcription = functions.https.onRequest(async (req, res) => {
-  if (
-    req.headers["X-Maple-Webhook"] &&
-    req.headers["webhook_auth_header_value"]
-  ) {
+  if (req.headers["x-maple-webhook"]) {
     if (req.body.status === "completed") {
+      // If we get a request with the right header and status, get the
+      // transcription from the assembly API.
       const transcript = await assembly.transcripts.get(req.body.transcript_id)
       if (transcript && transcript.webhook_auth) {
-        const maybeEventInDb = await db
+        // If there is a transcript and the transcript has an auth property,
+        // look for an event (aka Hearing) in the DB with a matching ID.
+        const maybeEventsInDb = await db
           .collection("events")
-          .where("videoAssemblyId", "==", transcript.id)
+          .where("videoTranscriptionId", "==", transcript.id)
           .get()
-        if (maybeEventInDb.docs.length) {
-          const authenticatedEventsInDb = maybeEventInDb.docs.filter(
-            async e => {
-              const hashedToken = sha256(
-                String(req.headers["webhook_auth_header_value"])
-              )
 
-              const tokenInDb = await db
-                .collection("events")
-                .doc(e.id)
-                .collection("private")
-                .doc("webhookAuth")
-                .get()
-              const tokenInDbData = tokenInDb.data()
-              if (tokenInDbData) {
-                return hashedToken === tokenInDbData.videoAssemblyWebhookToken
-              }
-              return false
+        if (maybeEventsInDb.docs.length) {
+          // If we have a match look for one that matches a hash of the token
+          // we gave Assembly. There should only be one of these but firestore
+          // gives us an array. If there is more than one member, something is
+          // wrong
+          const authenticatedEventIds = [] as string[]
+          const hashedToken = sha256(String(req.headers["x-maple-webhook"]))
+
+          for (const index in maybeEventsInDb.docs) {
+            const doc = maybeEventsInDb.docs[index]
+
+            const tokenDocInDb = await db
+              .collection("events")
+              .doc(doc.id)
+              .collection("private")
+              .doc("webhookAuth")
+              .get()
+
+            const tokenDataInDb = tokenDocInDb.data()?.videoAssemblyWebhookToken
+
+            if (hashedToken === tokenDataInDb) {
+              authenticatedEventIds.push(doc.id)
             }
-          )
-          if (authenticatedEventsInDb) {
+          }
+
+          // Log edge cases
+          if (maybeEventsInDb.docs.length === 0) {
+            console.log("No matching event in db.")
+          }
+          if (authenticatedEventIds.length === 0) {
+            console.log("No authenticated events in db.")
+          }
+          if (authenticatedEventIds.length > 1) {
+            console.log("More than one matching event in db.")
+          }
+
+          if (authenticatedEventIds.length === 1) {
+            // If there is one authenticated event, pull out the parts we want to
+            // save and try to save them in the db.
+            const { id, text, audio_url, utterances } = transcript
             try {
-              await db
+              const transcriptionInDb = await db
                 .collection("transcriptions")
-                .doc(transcript.id)
-                .set({ _timestamp: new Date(), ...transcript })
+                .doc(id)
 
-              authenticatedEventsInDb.forEach(async d => {
-                await d.ref.update({
-                  ["webhook_auth_header_value"]: null
-                })
+              await transcriptionInDb.set({
+                id,
+                text,
+                createdAt: Timestamp.now(),
+                audio_url
               })
-              console.log("transcript saved in db")
+
+              // Put each `utterance` in a separate doc in an utterances
+              // collection. Previously had done the same for `words` but
+              // got worried about collection size and write times since
+              // `words` can be tens of thousands of members.
+              if (utterances) {
+                const writer = db.bulkWriter()
+                for (let utterance of utterances) {
+                  const { speaker, confidence, start, end, text } = utterance
+
+                  writer.set(
+                    db
+                      .collection("transcriptions")
+                      .doc(`${transcript.id}`)
+                      .collection("utterances")
+                      .doc(),
+                    { speaker, confidence, start, end, text }
+                  )
+                }
+
+                await writer.close()
+              }
+
+              // Delete the hashed webhook auth token from our db now that
+              // we're done.
+              for (const index in authenticatedEventIds) {
+                await db
+                  .collection("events")
+                  .doc(authenticatedEventIds[index])
+                  .collection("private")
+                  .doc("webhookAuth")
+                  .set({
+                    videoAssemblyWebhookToken: null
+                  })
+              }
             } catch (error) {
               console.log(error)
             }