Skip to content

Commit 2a86215

Browse files
authored
Merge pull request #1 from genematx/setup
Validation Workflows
2 parents 96c7c90 + a5495a3 commit 2a86215

File tree

4 files changed

+250
-32
lines changed

4 files changed

+250
-32
lines changed

data_validation.py

Lines changed: 216 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,232 @@
1-
import os
2-
import time as ttime
1+
import time
32

43
from prefect import flow, task, get_run_logger
54
from prefect.blocks.system import Secret
5+
66
from tiled.client import from_profile
7+
from tiled.client.array import ArrayClient
8+
from tiled.client.dataframe import DataFrameClient
9+
from tiled.client.utils import handle_error
10+
from tiled.mimetypes import DEFAULT_ADAPTERS_BY_MIMETYPE as ADAPTERS_BY_MIMETYPE
11+
from tiled.utils import safe_json_dump
712

813
BEAMLINE_OR_ENDSTATION = "arpes"
914

1015

16+
class ValidationException(Exception):
    """Base class for data-validation failures.

    Carries an optional run ``uid`` so callers can tell which run failed.
    """

    def __init__(self, message, uid=None):
        super().__init__(message)
        # uid of the run this failure relates to, if known
        self.uid = uid


class ReadingValidationException(ValidationException):
    """Raised when reading the underlying data for a key fails."""


class RunValidationException(ValidationException):
    """Raised when the run itself is incomplete or malformed."""


class MetadataValidationException(ValidationException):
    """Raised when run metadata fails validation."""
30+
31+
32+
def validate(root_client, fix_errors=True, try_reading=True, raise_on_error=False, ignore_errors=None):
    """Validate the given BlueskyRun client for completeness and data integrity.

    Parameters
    ----------
    root_client : tiled.client.run.RunClient
        The Run client to validate.
    fix_errors : bool, optional
        Whether to attempt to fix structural errors found during validation.
        Default is True.
    try_reading : bool, optional
        Whether to attempt reading the data for external data keys.
        Default is True.
    raise_on_error : bool, optional
        Whether to raise an exception on the first validation error encountered.
        Default is False.
    ignore_errors : list of str, optional
        List of error messages to ignore during reading validation.
        Default is None (treated as an empty list).

    Returns
    -------
    bool
        True if validation passed without errors, False otherwise.
    """
    logger = get_run_logger()
    # Avoid a shared mutable default argument; normalize None to an empty list.
    ignore_errors = [] if ignore_errors is None else ignore_errors

    # Check if there's a Stop document in the run
    if "stop" not in root_client.metadata:
        logger.error("The Run is not complete: missing the Stop document")
        if raise_on_error:
            raise RunValidationException("Missing Stop document in the run")

    # Check all streams and data keys
    errored_keys, notes = [], []
    # Newer catalogs nest streams under a 'streams' node; older ones are flat.
    streams_node = root_client['streams'] if 'streams' in root_client.keys() else root_client
    for sname, stream in streams_node.items():
        for data_key in stream.base:
            if data_key == "internal":
                continue

            data_client = stream[data_key]
            # Only externally managed data sources need structural validation.
            if data_client.data_sources()[0].management != "external":
                continue

            # Validate data structure
            title = f"Validation of data key '{sname}/{data_key}'"
            try:
                _notes = validate_structure(data_client, fix_errors=fix_errors)
                notes.extend([title + ": " + note for note in _notes])
            except Exception as e:
                # Collapse multi-line error text into a single-line note.
                msg = f"{type(e).__name__}: " + str(e).replace("\n", " ").replace("\r", "").strip()
                msg = title + f" failed with error: {msg}"
                logger.warning(msg)
                notes.append(msg)

            # Validate reading of the data
            if try_reading:
                try:
                    validate_reading(data_client, ignore_errors=ignore_errors)
                except Exception as e:
                    errored_keys.append((sname, data_key, str(e)))
                    logger.error(f"Reading validation of '{sname}/{data_key}' failed with error: {e}")
                    if raise_on_error:
                        # Bare raise preserves the original traceback.
                        raise

            # Brief pause between keys to avoid hammering the server.
            time.sleep(0.1)

    if try_reading and (not errored_keys):
        logger.info("Reading validation completed successfully.")

    # Update the root metadata with validation notes
    if notes:
        existing_notes = root_client.metadata.get("notes", [])
        root_client.update_metadata(
            {"notes": existing_notes + notes},
            drop_revision=True,
        )

    return not errored_keys
114+
115+
116+
def validate_reading(data_client, ignore_errors=None):
    """Attempt to read a sample of the data behind a data key.

    For arrays, reads the first and last element; for tables, reads the whole
    table. Errors whose message contains any entry of ``ignore_errors`` are
    logged and suppressed; anything else is re-raised as a
    ReadingValidationException.

    Parameters
    ----------
    data_client : tiled client node (ArrayClient or DataFrameClient)
        The client for the data key being validated.
    ignore_errors : list of str, optional
        Substrings of error messages to ignore. Default is None (empty list).

    Raises
    ------
    ReadingValidationException
        If reading fails with a non-ignored error.
    """
    logger = get_run_logger()
    # Avoid a shared mutable default argument; normalize None to an empty list.
    ignore_errors = [] if ignore_errors is None else ignore_errors

    data_key = data_client.item['id']
    sname = data_client.item['attributes']['ancestors'][-1]  # stream name

    if isinstance(data_client, ArrayClient):
        ndim = len(data_client.shape)
        try:
            # Indexing with a tuple is equivalent to c[*(0,)*ndim] but also
            # works on Python < 3.11 (subscript unpacking is 3.11+ syntax).
            data_client[(0,) * ndim]   # try to read the first element
            data_client[(-1,) * ndim]  # try to read the last element
        except Exception as e:
            # e.args may be empty; fall back to str(e) to avoid IndexError.
            reason = str(e.args[0]) if e.args else str(e)
            if any(msg in reason for msg in ignore_errors):
                logger.info(f"Ignoring array reading error: {sname}/{data_key}: {reason}")
            else:
                raise ReadingValidationException(f"Array reading failed with error: {reason}") from e

    elif isinstance(data_client, DataFrameClient):
        try:
            data_client.read()  # try to read the entire table
        except Exception as e:
            reason = str(e.args[0]) if e.args else str(e)
            if any(msg in reason for msg in ignore_errors):
                logger.info(f"Ignoring table reading error: {sname}/{data_key}: {reason}")
            else:
                raise ReadingValidationException(f"Table reading failed with error: {reason}") from e

    else:
        logger.warning(f"Validation of '{data_key=}' is not supported with client of type {type(data_client)}.")
143+
144+
145+
def validate_structure(data_client, fix_errors=False) -> list[str]:
    """Compare the cataloged structure of an external data source against the
    structure its adapter actually reads from the data files.

    Shape, chunks, dtype, and dimension names are checked. With
    ``fix_errors=False`` the first mismatch raises ValueError; with
    ``fix_errors=True`` the cataloged structure is patched to match the
    adapter's view and the corrected data source is pushed back to the server.

    Parameters
    ----------
    data_client : tiled client node for an externally managed data key
    fix_errors : bool, optional
        Whether to fix mismatches in place instead of raising. Default False.

    Returns
    -------
    list[str]
        Human-readable notes describing each fix applied (empty if none).
    """
    logger = get_run_logger()

    # Only the first data source is validated — presumably each external key
    # has exactly one; TODO confirm against the catalog schema.
    data_source = data_client.data_sources()[0]
    uris = [asset.data_uri for asset in data_source.assets]
    structure = data_client.structure()
    notes = []

    # Initialize adapter from uris and determine the structure as read by the adapter
    adapter_class = ADAPTERS_BY_MIMETYPE[data_source.mimetype]
    true_structure = adapter_class.from_uris(*uris, **data_source.parameters).structure()
    true_data_type = true_structure.data_type
    true_shape = true_structure.shape
    true_chunks = true_structure.chunks

    # Validate structure components
    if structure.shape != true_shape:
        if not fix_errors:
            raise ValueError(f"Shape mismatch: {structure.shape} != {true_shape}")
        else:
            msg = f"Fixed shape mismatch: {structure.shape} -> {true_shape}"
            logger.warning(msg)
            structure.shape = true_shape
            notes.append(msg)

    if structure.chunks != true_chunks:
        if not fix_errors:
            raise ValueError(f"Chunk shape mismatch: {structure.chunks} != {true_chunks}")
        else:
            # Report only the leading chunk size along each dimension to keep
            # the note readable; the full chunking is still applied below.
            _true_chunk_shape = tuple(c[0] for c in true_chunks)
            _chunk_shape = tuple(c[0] for c in structure.chunks)
            msg = f"Fixed chunk shape mismatch: {_chunk_shape} -> {_true_chunk_shape}"
            logger.warning(msg)
            structure.chunks = true_chunks
            notes.append(msg)

    if structure.data_type != true_data_type:
        if not fix_errors:
            raise ValueError(f"dtype mismatch: {structure.data_type} != {true_data_type}")
        else:
            msg = f"Fixed dtype mismatch: {structure.data_type.to_numpy_dtype()} -> {true_data_type.to_numpy_dtype()}"  # noqa
            logger.warning(msg)
            structure.data_type = true_data_type
            notes.append(msg)

    # Dimension names are only checked for count, not for content.
    if structure.dims and (len(structure.dims) != len(true_shape)):
        if not fix_errors:
            raise ValueError(f"Number of dimension names mismatch for a {len(true_shape)}-dimensional array: {structure.dims}")  # noqa
        else:
            old_dims = structure.dims
            if len(old_dims) < len(true_shape):
                # Too few names: prepend "time" (presumably the event axis —
                # verify against the run model) and pad with generic dim names.
                structure.dims = ("time",) + old_dims + tuple(f"dim{i}" for i in range(len(old_dims)+1, len(true_shape)))
            else:
                # Too many names: truncate to the true rank.
                structure.dims = old_dims[: len(true_shape)]
            msg = f"Fixed dimension names: {old_dims} -> {structure.dims}"
            logger.warning(msg)
            notes.append(msg)

    # Update the data source structure if any fixes were applied
    if notes:
        data_source.structure = structure
        # PUT the corrected data source to the server's data_source endpoint,
        # derived from the node's metadata URI; handle_error raises on HTTP errors.
        handle_error(
            data_client.context.http_client.put(
                data_client.uri.replace("/api/v1/metadata/", "/api/v1/data_source/", 1),
                content=safe_json_dump({"data_source": data_source}),
            )
        ).json()

    return notes
214+
215+
11216
@task(retries=2, retry_delay_seconds=10)
def data_validation_task(uid, beamline_acronym=BEAMLINE_OR_ENDSTATION):
    """Validate a single run, identified by ``uid``, in the beamline's catalog.

    Loads the beamline-specific Tiled API key from a Prefect Secret block,
    connects to the "nsls2" Tiled profile, and runs ``validate`` on the run,
    logging the elapsed time.
    """
    logger = get_run_logger()

    # Authenticate against Tiled with the per-beamline API key.
    secret = Secret.load(f"tiled-{beamline_acronym}-api-key", _sync=True)
    tiled_client = from_profile("nsls2", api_key=secret.get())
    run_client = tiled_client[beamline_acronym]["migration"][uid]

    logger.info(f"Validating uid {uid}")
    t0 = time.monotonic()
    validate(run_client, fix_errors=True, try_reading=True, raise_on_error=False)
    elapsed_time = time.monotonic() - t0
    logger.info(f"{elapsed_time = }")
28228

29229

30230
@flow(log_prints=True)
def data_validation_flow(uid):
    """Prefect flow entry point: delegate validation of ``uid`` to the task."""
    data_validation_task(uid)

end_of_run_workflow.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from prefect import task, flow, get_run_logger
2-
from data_validation import data_validation
2+
from data_validation import data_validation_flow
33

44
@task
55
def log_completion():
@@ -9,5 +9,5 @@ def log_completion():
99
@flow(log_prints=True)
def end_of_run_workflow(stop_doc):
    """End-of-run hook: validate the run named in the Stop document, then log."""
    # The Stop document carries the uid of its Run Start document.
    data_validation_flow(stop_doc["run_start"])
    log_completion()

pixi.lock

Lines changed: 30 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pixi.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ platforms = ["linux-64"]
66
[dependencies]
77
prefect = "3.*"
88
python = "<3.13"
9-
tiled-client = ">=0.1.6"
9+
tiled-client = ">=0.2.1"
1010
prefect-docker = "*"
1111
databroker = "*"
12+
bluesky-tiled-plugins = "*"

0 commit comments

Comments
 (0)