-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathexample_io.py
More file actions
182 lines (142 loc) · 6.37 KB
/
example_io.py
File metadata and controls
182 lines (142 loc) · 6.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python3
"""File I/O operations example.
This example demonstrates METAINFORMANT's file input/output utilities for JSON, CSV, and JSONL formats with gzip compression support.
Usage:
python examples/core/example_io.py
Output:
output/examples/core/io_example.{json,csv,jsonl,json.gz,jsonl.gz}
output/examples/core/io_operations_summary.json
"""
from __future__ import annotations
import csv
from pathlib import Path
from metainformant.core.io import io
def main() -> None:
    """Run the METAINFORMANT file I/O walkthrough end to end.

    Writes a small sample dataset as JSON, gzip-named JSON, CSV, JSONL and
    gzip-named JSONL under ``output/examples/core``, reads each file back,
    prints progress and a file-size comparison to stdout, and finally writes
    ``io_operations_summary.json`` describing what was produced.

    All reading and writing is delegated to ``metainformant.core.io``; this
    function only prepares the data and reports the results.
    """
    # Setup output directory
    output_dir = Path("output/examples/core")
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=== METAINFORMANT I/O Operations Example ===")

    # Sample data for demonstration
    sample_data = {
        "analysis": {
            "type": "dna_sequence_analysis",
            "timestamp": "2024-12-26T10:00:00Z",
            "version": "1.0",
        },
        "sequences": [
            {"id": "seq1", "sequence": "ATCGATCG", "length": 8, "gc_content": 0.5},
            {"id": "seq2", "sequence": "GCTAGCTA", "length": 8, "gc_content": 0.5},
            {"id": "seq3", "sequence": "TTTTAAAA", "length": 8, "gc_content": 0.0},
        ],
        "statistics": {"total_sequences": 3, "average_length": 8.0, "average_gc": 0.333},
    }

    # 1. JSON file operations: write, then read back to confirm the round trip.
    print("\n1. JSON file operations...")
    json_file = output_dir / "io_example.json"
    io.dump_json(sample_data, json_file, indent=2)
    print(f"✓ Saved JSON data to: {json_file}")

    loaded_json = io.load_json(json_file)
    print(f"✓ Loaded JSON data: {len(loaded_json)} top-level keys")

    # 2. Compressed JSON file operations: same calls, only the ".gz" suffix
    # differs (the library is expected to pick compression from the suffix).
    print("\n2. Compressed JSON file operations...")
    json_gz_file = output_dir / "io_example.json.gz"
    io.dump_json(sample_data, json_gz_file, indent=2)
    print(f"✓ Saved compressed JSON data to: {json_gz_file}")

    loaded_json_gz = io.load_json(json_gz_file)
    print(f"✓ Loaded compressed JSON data: {len(loaded_json_gz)} top-level keys")

    # 3. CSV file operations
    print("\n3. CSV file operations...")
    # One flat row per sequence, with the columns in a fixed order.
    csv_columns = ("id", "sequence", "length", "gc_content")
    csv_data = [
        {column: seq[column] for column in csv_columns}
        for seq in sample_data["sequences"]
    ]

    csv_file = output_dir / "io_example.csv"
    io.write_csv(csv_data, csv_file)
    print(f"✓ Saved CSV data to: {csv_file}")

    loaded_csv = io.read_csv(csv_file)
    print(f"✓ Loaded CSV data: {len(loaded_csv)} rows")

    # 4. JSONL (JSON Lines) file operations
    print("\n4. JSONL file operations...")
    # Prepare JSONL data (one JSON object per line)
    jsonl_data = [
        {"step": 1, "operation": "load_data", "status": "success", "duration_ms": 150},
        {"step": 2, "operation": "validate_data", "status": "success", "duration_ms": 50},
        {"step": 3, "operation": "analyze_sequences", "status": "success", "duration_ms": 500},
        {"step": 4, "operation": "generate_report", "status": "success", "duration_ms": 200},
    ]

    jsonl_file = output_dir / "io_example.jsonl"
    io.write_jsonl(jsonl_data, jsonl_file)
    print(f"✓ Saved JSONL data to: {jsonl_file}")

    # read_jsonl is consumed through list() so the records can be counted.
    loaded_jsonl = list(io.read_jsonl(jsonl_file))
    print(f"✓ Loaded JSONL data: {len(loaded_jsonl)} records")

    # 5. Compressed JSONL operations
    print("\n5. Compressed JSONL operations...")
    jsonl_gz_file = output_dir / "io_example.jsonl.gz"
    io.write_jsonl(jsonl_data, jsonl_gz_file)
    print(f"✓ Saved compressed JSONL data to: {jsonl_gz_file}")

    loaded_jsonl_gz = list(io.read_jsonl(jsonl_gz_file))
    print(f"✓ Loaded compressed JSONL data: {len(loaded_jsonl_gz)} records")

    # 6. Atomic writes. Nothing is executed here: the example only reports the
    # library's stated behaviour (atomic writes in metainformant.core.io are
    # not verified by this script).
    print("\n6. Atomic write operations...")
    print("✓ All write operations use atomic writes (prevent corruption)")

    # 7. File size comparison
    print("\n7. File size comparison...")
    created_files = {
        "JSON": json_file,
        "JSON.gz": json_gz_file,
        "CSV": csv_file,
        "JSONL": jsonl_file,
        "JSONL.gz": jsonl_gz_file,
    }
    sizes = {label: path.stat().st_size for label, path in created_files.items()}

    print("File sizes (bytes):")
    for format_name, size in sizes.items():
        print(f" {format_name:8}: {size:4}")

    # Size of the gzip file relative to the plain JSON file (< 1.0 means smaller).
    compression_ratio = sizes["JSON.gz"] / sizes["JSON"]
    # Interpolate the ratio; a bare ".1f" literal would print no value at all.
    print(f"Compression ratio (JSON.gz / JSON): {compression_ratio:.1f}")

    # 8. Create summary report
    print("\n8. Creating summary report...")
    summary = {
        "io_operations_demo": {
            "timestamp": "2024-12-26T10:00:00Z",
            "files_created": {
                "json": str(json_file.relative_to(output_dir)),
                "json_gz": str(json_gz_file.relative_to(output_dir)),
                "csv": str(csv_file.relative_to(output_dir)),
                "jsonl": str(jsonl_file.relative_to(output_dir)),
                "jsonl_gz": str(jsonl_gz_file.relative_to(output_dir)),
            },
            "file_sizes_bytes": sizes,
            "compression_ratio": compression_ratio,
            "features_demonstrated": [
                "JSON reading/writing",
                "Gzip compression support",
                "CSV data handling",
                "JSONL (JSON Lines) format",
                "Atomic write operations",
                "Error handling and validation",
            ],
            "data_summary": {
                "sequences_processed": len(sample_data["sequences"]),
                "workflow_steps": len(jsonl_data),
                # Total number of bases across all sample sequences.
                "total_data_points": sum(
                    len(seq["sequence"]) for seq in sample_data["sequences"]
                ),
            },
        }
    }

    summary_file = output_dir / "io_operations_summary.json"
    io.dump_json(summary, summary_file, indent=2)
    print(f"✓ Summary saved to: {summary_file}")

    print("\n=== I/O Operations Example Complete ===")
    print(f"All example files created in: {output_dir}")
    print("\nKey takeaways:")
    print("- Use dump_json/load_json for structured data")
    print("- Use write_jsonl/read_jsonl for streaming large datasets")
    print("- Use .gz extension for automatic compression")
    print("- All writes are atomic to prevent corruption")
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()