
Commit 2c2e2a2

Add embeddings-create and embeddings-load steps to pipeline
Why these changes are being introduced:

Now that we have AWS Batch pipelines that can generate embeddings, and TIM is prepared to load them into OpenSearch, the pipeline lambda needs to prepare commands both for running the AWS Batch job that creates embeddings and for the TIM command that then loads them. This dovetails with work in https://mitlibraries.atlassian.net/browse/USE-215, which proposes updates to the StepFunction: two new pipeline lambda invocations there will use the two new allowed 'next-step' values introduced in this commit.

How this addresses that need:

* Add "embeddings-create" and "embeddings-load" as valid steps in config
* Add SKIP_EMBEDDINGS_SOURCES config for sources that don't need embeddings (alma, gisogm)
* Add generate_embeddings_create_command(), which determines the compute env (cpu vs gpu-spot) based on a record count threshold
* Add generate_embeddings_load_command() for the TIM bulk-update-embeddings command
* Add handlers for both new steps in format_input.py
* Update handle_load() to flow into embeddings-create instead of end
* Add run_id and embeddings fields to ResultPayload
* Add unit tests for new functionality

Side effects of this change:

* The pipeline will now continue into the embeddings steps after load completes

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/USE-140
1 parent 737e8c9 commit 2c2e2a2
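To make the new handoff concrete, below is a rough sketch of the result payload a successful embeddings-create invocation now emits, assembled from the ResultPayload fields and command generator added in this commit. All values are illustrative, not from a real run:

# Sketch of ResultPayload contents after handle_embeddings_create; values hypothetical.
result = {
    "next_step": "embeddings-load",
    "run_date": "2025-01-01",
    "run_type": "daily",
    "run_id": "<run uuid>",
    "source": "testsource",
    "verbose": True,
    "embeddings": {
        "create": {
            "job_name": "create-embeddings-cpu-<uuid4>",
            "job_compute_env": "cpu",
            "command": [
                "--verbose",
                "create-embeddings",
                "--strategy=full_record",
                "--dataset-location=s3://<bucket>/dataset",
                "--run-id=<run uuid>",
            ],
        }
    },
}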

6 files changed, +280 −6 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Takes input JSON (usually from EventBridge although it can be passed to a manual
 
 #### Required
 
-- `next-step`: The next step of the pipeline to be performed, must be one of `["extract", "transform", "load"]`. Determines which task run commands will be generated as output from the format lambda.
+- `next-step`: The next step of the pipeline to be performed. Determines which task run commands will be generated as output from the format lambda.
 - `run-date`: Must be in one of the formats ["yyyy-mm-dd", "yyyy-mm-ddThh:mm:ssZ"]. The provided date is used in the input/output file naming scheme for all steps of the pipeline.
 - `run-type`: Must be one of `["full", "daily"]`. The provided run type is used in the input/output file naming scheme for all steps of the pipeline. It also determines logic for both the OAI-PMH harvest and load commands as follows:
   - `full`: Perform a full harvest of all records from the provided `oai-pmh-host`. During load, create a new OpenSearch index, load all records into it, and then promote the new index.
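For illustration, a minimal input event using one of the newly allowed `next-step` values could look like the sketch below. Field values are hypothetical; the field set follows the unit tests added in this commit:

# Hypothetical format-lambda input event; not an example from the README itself.
event = {
    "next-step": "embeddings-create",
    "run-date": "2025-01-01",
    "run-type": "daily",
    "source": "testsource",
    "run-id": "<run uuid>",
}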

lambdas/commands.py

Lines changed: 45 additions & 0 deletions
@@ -1,4 +1,5 @@
 import logging
+import uuid
 from typing import TYPE_CHECKING
 
 from lambdas import helpers
@@ -11,6 +12,8 @@
 
 CONFIG = Config()
 
+GPU_RECORD_COUNT_THRESHOLD = 500
+
 
 def generate_extract_command(input_payload: "InputPayload") -> dict:
     step = "extract"
@@ -148,3 +151,45 @@ def generate_load_commands(input_payload: "InputPayload") -> dict:
         }
 
     return {"failure": f"Unexpected run-type: '{input_payload.run_type}'"}
+
+
+def generate_embeddings_create_command(
+    input_payload: "InputPayload",
+    record_count: int,
+) -> dict:
+    """Generate AWS Batch job parameters for creating embeddings.
+
+    Determines compute environment based on record count:
+        - cpu (ECS Fargate) for < 500 records
+        - gpu-spot (EC2 Spot) for >= 500 records
+    """
+    job_compute_env = "gpu-spot" if record_count >= GPU_RECORD_COUNT_THRESHOLD else "cpu"
+
+    return {
+        "create": {
+            "job_name": f"create-embeddings-{job_compute_env}-{uuid.uuid4()}",
+            "job_compute_env": job_compute_env,
+            "command": [
+                "--verbose",
+                "create-embeddings",
+                "--strategy=full_record",
+                f"--dataset-location={CONFIG.s3_timdex_dataset_location}",
+                f"--run-id={input_payload.run_id}",
+            ],
+        }
+    }
+
+
+def generate_embeddings_load_command(input_payload: "InputPayload") -> dict:
+    """Generate TIM command to update documents with embeddings."""
+    return {
+        "load": {
+            "bulk-update-embeddings-command": [
+                "--verbose",
+                "bulk-update-embeddings",
+                f"--source={input_payload.source}",
+                f"--run-id={input_payload.run_id}",
+                CONFIG.s3_timdex_dataset_location,
+            ],
+        }
+    }
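As a usage sketch of the new generators (mirroring the unit tests in tests/test_commands.py below; the event values and run id are illustrative):

# Sketch: compute env selection by record count; mirrors the tests further down.
from lambdas import commands
from lambdas.format_input import InputPayload

event = {
    "next-step": "embeddings-create",
    "run-date": "2022-01-02",
    "run-type": "daily",
    "source": "testsource",
    "run-id": "00000000-0000-0000-0000-000000000000",  # hypothetical run id
}
input_payload = InputPayload.from_event(event)

# Below GPU_RECORD_COUNT_THRESHOLD (500) the Fargate "cpu" env is chosen;
# at or above the threshold, "gpu-spot" is chosen.
small = commands.generate_embeddings_create_command(input_payload, record_count=100)
large = commands.generate_embeddings_create_command(input_payload, record_count=500)
assert small["create"]["job_compute_env"] == "cpu"
assert large["create"]["job_compute_env"] == "gpu-spot"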

lambdas/config.py

Lines changed: 2 additions & 1 deletion
@@ -42,7 +42,8 @@ class Config:
     SOURCE_EXCLUSION_LISTS: ClassVar = {"libguides": "/config/libguides/exclusions.csv"}
     VALID_DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ")
     VALID_RUN_TYPES = ("full", "daily")
-    VALID_STEPS = ("extract", "transform", "load")
+    VALID_STEPS = ("extract", "transform", "load", "embeddings-create", "embeddings-load")
+    SKIP_EMBEDDINGS_SOURCES = ("alma", "gisogm")
 
     def __getattr__(self, name: str) -> Any:  # noqa: ANN401
         """Provide dot notation access to configurations and env vars on this class."""

lambdas/format_input.py

Lines changed: 121 additions & 3 deletions
@@ -1,24 +1,37 @@
+# ruff: noqa: S608
+
 import json
 import logging
 import uuid
 from dataclasses import asdict, dataclass
 from datetime import UTC, datetime
 from typing import Literal
 
+from timdex_dataset_api.dataset import TIMDEXDataset  # type: ignore[import-untyped]
+
 from lambdas import alma_prep, commands, errors, helpers
 from lambdas.config import Config, configure_logger
 
 logger = logging.getLogger(__name__)
 
 CONFIG = Config()
 
-type NextStep = Literal["extract", "transform", "load", "exit-ok", "exit-error", "end"]
+type NextStep = Literal[
+    "extract",
+    "transform",
+    "load",
+    "embeddings-create",
+    "embeddings-load",
+    "exit-ok",
+    "exit-error",
+    "end",
+]
 
 
 @dataclass
 class InputPayload:
     run_date: str
-    run_type: str
+    run_type: Literal["daily", "full"]
     source: str
     next_step: NextStep
     run_id: str
@@ -118,12 +131,14 @@ class ResultPayload:
     next_step: NextStep
     run_date: str
     run_type: str
+    run_id: str
     source: str
     verbose: bool = True
     harvester_type: str | None = None
     extract: dict | None = None
     transform: dict | None = None
     load: dict | None = None
+    embeddings: dict | None = None
     message: str | None = None
 
     @classmethod
@@ -132,6 +147,7 @@ def from_input_payload(cls, input_payload: "InputPayload") -> "ResultPayload":
             next_step=input_payload.next_step,
             run_date=input_payload.run_date,
             run_type=input_payload.run_type,
+            run_id=input_payload.run_id,
             source=input_payload.source,
             verbose=input_payload.verbose,
         )
@@ -154,6 +170,10 @@ def lambda_handler(event: dict, _context: dict) -> dict:
         result = handle_transform(input_payload, result)
     elif input_payload.next_step == "load":
         result = handle_load(input_payload, result)
+    elif input_payload.next_step == "embeddings-create":
+        result = handle_embeddings_create(input_payload, result)
+    elif input_payload.next_step == "embeddings-load":
+        result = handle_embeddings_load(input_payload, result)
     else:
         raise ValueError(f"'next-step' not supported: '{input_payload.next_step}'")
 
@@ -213,7 +233,7 @@ def handle_transform(input_payload: InputPayload, result: ResultPayload) -> ResultPayload:
 
 
 def handle_load(input_payload: InputPayload, result: ResultPayload) -> ResultPayload:
-    result.next_step = "end"
+    result.next_step = "embeddings-create"
     if not helpers.dataset_records_exist_for_run(input_payload.run_id):
         result.next_step = "exit-ok"
         message = (
@@ -225,3 +245,101 @@ def handle_load(input_payload: InputPayload, result: ResultPayload) -> ResultPayload:
         return result
     result.load = commands.generate_load_commands(input_payload)
     return result
+
+
+def handle_embeddings_create(
+    input_payload: InputPayload, result: ResultPayload
+) -> ResultPayload:
+    """Analyze ETL run and prepare parameters for AWS Batch job to create embeddings.
+
+    There are currently three compute environments we can create embeddings in:
+        - ECS Fargate - "cpu"
+        - EC2 - "gpu"
+        - EC2 Spot Instances - "gpu-spot"
+
+    This lambda handler is responsible for analyzing the size and shape of the ETL run,
+    and determining which AWS Batch compute environment is most appropriate.
+
+    We do not create embeddings for all sources. Those we skip are configured in
+    CONFIG.SKIP_EMBEDDINGS_SOURCES.
+
+    Additionally, at this time, we do not have a scenario or code path that would
+    utilize the "gpu" compute environment, only "gpu-spot". This is mostly because we
+    don't require an immediate turnaround for embeddings creation; when the job size
+    calls for a GPU, we have the luxury of waiting for a spot instance.
+    """
+    result.next_step = "embeddings-load"
+
+    if input_payload.source in CONFIG.SKIP_EMBEDDINGS_SOURCES:
+        result.next_step = "exit-ok"
+        result.message = (
+            f"Not currently creating embeddings for source '{input_payload.source}'"
+        )
+        return result
+
+    # retrieve records count for run
+    td = TIMDEXDataset(location=CONFIG.s3_timdex_dataset_location)
+    record_count = td.metadata.conn.query(f"""
+        select count(*)
+        from metadata.records
+        where run_id = '{input_payload.run_id}'
+        and action in ('index')
+    """).fetchone()[0]
+
+    # exit early if no records to create embeddings for
+    if record_count == 0:
+        result.next_step = "exit-ok"
+        result.message = f"No embeddable records found for run '{input_payload.run_id}'."
+        return result
+
+    job_compute_env = (
+        "gpu-spot" if record_count >= commands.GPU_RECORD_COUNT_THRESHOLD else "cpu"
+    )
+    logger.info(
+        f"ETL run '{input_payload.run_id}' had {record_count} records indexed, "
+        f"recommending '{job_compute_env}' compute env."
+    )
+
+    result.embeddings = commands.generate_embeddings_create_command(
+        input_payload, record_count
+    )
+    return result
+
+
+def handle_embeddings_load(
+    input_payload: InputPayload, result: ResultPayload
+) -> ResultPayload:
+    """Prepare TIM command to update documents in Opensearch with embeddings.
+
+    We do not create embeddings for all sources. Those we skip are configured in
+    CONFIG.SKIP_EMBEDDINGS_SOURCES.
+    """
+    result.next_step = "end"
+
+    if input_payload.source in CONFIG.SKIP_EMBEDDINGS_SOURCES:
+        result.next_step = "exit-ok"
+        result.message = (
+            f"Not currently indexing embeddings for source '{input_payload.source}'"
+        )
+        return result
+
+    # retrieve embeddings count for run
+    td = TIMDEXDataset(location=CONFIG.s3_timdex_dataset_location)
+    embeddings_count = td.metadata.conn.query(f"""
+        select count(*)
+        from data.current_run_embeddings
+        where run_id = '{input_payload.run_id}'
+    """).fetchone()[0]
+
+    # exit early if no embeddings to load
+    if embeddings_count == 0:
+        result.next_step = "exit-ok"
+        result.message = f"No embeddings found for run '{input_payload.run_id}'."
+        return result
+
+    logger.info(
+        f"Preparing TIM command to update {embeddings_count} documents with embeddings."
+    )
+
+    result.embeddings = commands.generate_embeddings_load_command(input_payload)
+    return result
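Note that the run-scoped counts above are issued as f-string SQL against the dataset's DuckDB metadata connection, which is why the file-level `# ruff: noqa: S608` suppression was added. Since `run_id` is pipeline-generated the injection risk is low, but for reference, here is a sketch of the same count as a parameterized query, assuming `td.metadata.conn` exposes the standard DuckDB `execute()` API:

# Sketch only; assumes td.metadata.conn is a DuckDB connection supporting execute()
# with "?" placeholders. Not part of this commit.
record_count = td.metadata.conn.execute(
    """
    select count(*)
    from metadata.records
    where run_id = ?
    and action in ('index')
    """,
    [input_payload.run_id],
).fetchone()[0]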

tests/test_commands.py

Lines changed: 57 additions & 0 deletions
@@ -338,3 +338,60 @@ def test_generate_load_commands_unhandled_run_type(run_id):
     }
     with pytest.raises(ValueError, match=r"Input 'run-type' value must be one of:"):
         InputPayload.from_event(event)
+
+
+def test_generate_embeddings_create_command_cpu(run_id):
+    """Record count below threshold uses cpu compute env."""
+    event = {
+        "next-step": "embeddings-create",
+        "run-date": "2022-01-02",
+        "run-type": "daily",
+        "source": "testsource",
+        "run-id": run_id,
+    }
+    input_payload = InputPayload.from_event(event)
+    result = commands.generate_embeddings_create_command(input_payload, record_count=100)
+
+    assert result["create"]["job_compute_env"] == "cpu"
+    assert "create-embeddings-cpu-" in result["create"]["job_name"]
+    assert f"--run-id={run_id}" in result["create"]["command"]
+
+
+def test_generate_embeddings_create_command_gpu_spot(run_id):
+    """Record count at/above threshold uses gpu-spot compute env."""
+    event = {
+        "next-step": "embeddings-create",
+        "run-date": "2022-01-02",
+        "run-type": "daily",
+        "source": "testsource",
+        "run-id": run_id,
+    }
+    input_payload = InputPayload.from_event(event)
+    result = commands.generate_embeddings_create_command(input_payload, record_count=500)
+
+    assert result["create"]["job_compute_env"] == "gpu-spot"
+    assert "create-embeddings-gpu-spot-" in result["create"]["job_name"]
+
+
+def test_generate_embeddings_load_command(run_id):
+    event = {
+        "next-step": "embeddings-load",
+        "run-date": "2022-01-02",
+        "run-type": "daily",
+        "source": "testsource",
+        "run-id": run_id,
+    }
+    input_payload = InputPayload.from_event(event)
+    result = commands.generate_embeddings_load_command(input_payload)
+
+    assert result == {
+        "load": {
+            "bulk-update-embeddings-command": [
+                "--verbose",
+                "bulk-update-embeddings",
+                "--source=testsource",
+                f"--run-id={run_id}",
+                "s3://test-timdex-bucket/dataset",
+            ],
+        }
+    }
