#!/usr/bin/env python3
# Source: examples/core/example_workflow.py (docxology/MetaInformAnt, main branch)
"""Basic workflow orchestration example.

This example demonstrates METAINFORMANT's workflow utilities for config-based data processing and step-by-step execution.

Usage:
    python examples/core/example_workflow.py

Output:
    output/examples/core/workflow_results.json
"""

from __future__ import annotations

import time
from pathlib import Path
from typing import Any, Dict

from metainformant.core.io import io
from metainformant.core.utils import logging
from metainformant.core.execution import workflow


def main() -> None:
    """Demonstrate basic workflow orchestration.

    Walks through seven stages, printing progress to stdout:

    1. Write a sample workflow configuration to ``workflow_config.yaml``.
    2. Validate it with ``workflow.validate_config_file``; return early if
       validation fails.
    3. Generate a second config via ``workflow.create_sample_config``.
    4. Simulate each configured step (a short sleep plus canned results) and
       time it.
    5. Call ``workflow.download_and_process_data`` for two placeholder URLs,
       recording any exception as a result entry instead of aborting.
    6. Call ``workflow.run_config_based_workflow``, again recording failures
       rather than propagating them.
    7. Dump a summary of everything to ``workflow_results.json``.

    All files are written under ``output/examples/core`` relative to the
    current working directory.
    """
    # Setup output directory
    output_dir = Path("output/examples/core")
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=== METAINFORMANT Workflow Orchestration Example ===")

    # 1. Create sample configuration
    print("\n1. Creating sample workflow configuration...")

    workflow_config = {
        "workflow": {
            "name": "dna_sequence_analysis_demo",
            "description": "Demonstration of basic workflow orchestration",
            "version": "1.0",
        },
        "steps": {
            "load_data": {
                "description": "Load sequence data from file",
                "input_file": "sample_sequences.fasta",
                "expected_count": 5,
            },
            "validate_data": {
                "description": "Validate sequence integrity",
                "min_length": 10,
                "allowed_bases": ["A", "T", "C", "G"],
            },
            "analyze_sequences": {
                "description": "Calculate sequence statistics",
                "calculate_gc": True,
                "find_motifs": ["ATCG", "GATC"],
            },
            "generate_report": {
                "description": "Create analysis summary report",
                "output_format": "json",
                "include_plots": False,
            },
        },
        "output": {
            "directory": str(output_dir / "workflow_output"),
            "intermediate_files": True,
            "final_report": "analysis_report.json",
        },
    }

    config_file = output_dir / "workflow_config.yaml"
    # Imported locally so the YAML dependency is only needed when the demo runs.
    import yaml

    # Explicit encoding keeps the written file independent of the platform locale.
    with open(config_file, "w", encoding="utf-8") as f:
        yaml.dump(workflow_config, f)

    print(f"✓ Created workflow config: {config_file}")

    # 2. Validate configuration file
    print("\n2. Validating configuration file...")

    is_valid, validation_errors = workflow.validate_config_file(config_file)

    if is_valid:
        print("✓ Configuration file is valid")
    else:
        print(f"✗ Configuration validation failed: {validation_errors}")
        return

    # 3. Create sample config programmatically
    print("\n3. Creating sample config programmatically...")

    sample_config_file = output_dir / "sample_workflow_config.yaml"
    workflow.create_sample_config(sample_config_file, sample_type="basic")

    print(f"✓ Created sample config: {sample_config_file}")

    # 4. Simulate workflow execution
    print("\n4. Simulating workflow execution...")

    logger = logging.get_logger("workflow_demo")

    # Simulated processing time (seconds) per step; built once, not on every call.
    processing_times = {"load_data": 0.2, "validate_data": 0.1, "analyze_sequences": 0.5, "generate_report": 0.3}

    def simulate_workflow_step(step_name: str, step_config: Dict[str, Any]) -> Dict[str, Any]:
        """Simulate execution of a workflow step.

        Sleeps for the step's simulated duration and returns canned results
        extended with ``execution_time`` (seconds) and ``status``. Recognised
        steps get ``status == "success"``; any other step name keeps
        ``status == "unknown_step"``.
        """
        logger.info(f"Starting step: {step_name}")
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()

        time.sleep(processing_times.get(step_name, 0.1))

        # Generate mock results based on step
        if step_name == "load_data":
            result = {
                "sequences_loaded": step_config.get("expected_count", 5),
                "total_bases": 1250,
                "file_format": "FASTA",
            }
        elif step_name == "validate_data":
            result = {
                "sequences_validated": 5,
                "invalid_sequences": 0,
                "validation_checks": ["length", "bases", "format"],
            }
        elif step_name == "analyze_sequences":
            result = {
                "average_length": 250,
                "gc_content": 0.42,
                "motifs_found": {"ATCG": 3, "GATC": 2},
                "complexity_score": 0.85,
            }
        elif step_name == "generate_report":
            result = {
                "report_sections": ["summary", "statistics", "motifs"],
                "output_format": step_config.get("output_format", "json"),
                "file_size_kb": 15.7,
            }
        else:
            result = {"status": "unknown_step"}

        elapsed = time.perf_counter() - start_time
        result["execution_time"] = elapsed
        # setdefault keeps the "unknown_step" marker instead of overwriting
        # it with "success".
        result.setdefault("status", "success")

        logger.info(f"Step {step_name} completed in {elapsed:.2f} seconds")
        return result

    # Execute workflow steps
    step_results = {}
    total_workflow_time = 0.0

    for step_name, step_config in workflow_config["steps"].items():
        step_result = simulate_workflow_step(step_name, step_config)
        step_results[step_name] = step_result
        total_workflow_time += step_result["execution_time"]

    logger.info(f"Workflow completed in {total_workflow_time:.2f} seconds")

    # 5. Download and process data example
    print("\n5. Demonstrating download and process pattern...")

    def mock_data_processor(data: Any) -> Dict[str, Any]:
        """Mock data processor for demonstration.

        Returns the item count (``len`` for lists, otherwise 1), the name of
        the input's type, and a fixed placeholder timestamp.
        """
        return {
            "processed_items": len(data) if isinstance(data, list) else 1,
            "data_type": type(data).__name__,
            "processing_timestamp": "2024-12-26T10:00:00Z",
        }

    # Placeholder URLs for the demo.
    # NOTE(review): whether these trigger real network I/O depends on
    # workflow.download_and_process_data - confirm against its implementation.
    example_urls = ["https://example.com/sample_data.json", "https://example.com/sequences.fasta"]

    download_results = {}
    for url in example_urls:
        try:
            result = workflow.download_and_process_data(url, mock_data_processor, output_dir)
            download_results[url] = result
        except Exception as e:
            # Deliberate best-effort: the demo records the failure and moves on
            # so the remaining stages still run.
            download_results[url] = {"error": str(e), "status": "simulated"}

    print("✓ Demonstrated download and process pattern")

    # 6. Run config-based workflow
    print("\n6. Running config-based workflow...")

    try:
        # This demonstrates the full workflow execution pattern
        workflow_result = workflow.run_config_based_workflow(config_file, custom_param="demo_value")
        print("✓ Config-based workflow completed")
        config_workflow_result = workflow_result
    except Exception as e:
        # Deliberate best-effort: report the failure in the summary instead of aborting.
        print(f"Config-based workflow demo: {e}")
        config_workflow_result = {"error": str(e), "status": "demo_only"}

    # 7. Create comprehensive results summary
    print("\n7. Creating comprehensive results summary...")

    summary = {
        "workflow_orchestration_demo": {
            # Fixed placeholder timestamp, not the actual run time.
            "timestamp": "2024-12-26T10:00:00Z",
            "configuration": {
                "config_file_created": str(config_file.relative_to(output_dir)),
                "sample_config_created": str(sample_config_file.relative_to(output_dir)),
                "config_validation_passed": is_valid,
            },
            "workflow_execution": {
                "steps_executed": len(step_results),
                "total_execution_time": total_workflow_time,
                "step_results": step_results,
                "workflow_status": "completed",
            },
            "data_processing": {
                "download_operations": len(download_results),
                "download_results": download_results,
                "processing_pattern": "download_and_process_data",
            },
            "config_based_execution": {"attempted": True, "result": config_workflow_result},
            "features_demonstrated": [
                "Configuration file creation and validation",
                "Sample configuration generation",
                "Step-by-step workflow execution",
                "Progress tracking and timing",
                "Data download and processing patterns",
                "Config-based workflow orchestration",
                "Error handling and result aggregation",
            ],
            "workflow_patterns": {
                "step_execution": "Simulated realistic bioinformatics workflow steps",
                "data_flow": "Config -> Validation -> Processing -> Results",
                "error_handling": "Graceful handling of validation and execution errors",
                "result_aggregation": "Comprehensive summary of all workflow outputs",
            },
        }
    }

    results_file = output_dir / "workflow_results.json"
    io.dump_json(summary, results_file, indent=2)

    print(f"✓ Comprehensive results saved to: {results_file}")

    print("\n=== Workflow Orchestration Example Complete ===")
    print("This example demonstrated METAINFORMANT's workflow orchestration capabilities:")
    print("- Configuration management and validation")
    print("- Step-by-step workflow execution")
    print("- Progress tracking and result aggregation")
    print("- Data download and processing patterns")
    print("- Config-based workflow orchestration")

    print(f"\nAll outputs saved to: {output_dir}")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()