Skip to content

Commit 570aad1

Browse files
committed
Addressed review comments and added memory profiling module
Signed-off-by: Rishin Raj <[email protected]>
1 parent 0465734 commit 570aad1

File tree

8 files changed

+1283
-54
lines changed

8 files changed

+1283
-54
lines changed

QEfficient/base/modeling_qeff.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,12 @@ def _export(
209209

210210
finally:
211211
shutil.rmtree(tmp_onnx_dir, ignore_errors=True)
212+
# Clear external data from memory and cache after all transforms and saving
213+
# Make sure model exists before trying to clean it up
214+
if 'model' in locals():
215+
OnnxTransform._cleanup_external_data_and_cache(model)
216+
OnnxTransform._cleanup_memory()
217+
logger.info("Cleanup complete.")
212218

213219
self.onnx_path = onnx_path
214220
return onnx_path

QEfficient/base/onnx_transforms.py

Lines changed: 101 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import numpy as np
1313
from onnx import ModelProto, external_data_helper, numpy_helper
1414

15-
from QEfficient.utils.constants import ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL
15+
from QEfficient.utils.constants import ONNX_TRANSFORM_MEMORY_CLEANUP_INTERVAL
1616

1717
logger = logging.getLogger(__name__)
1818

@@ -22,6 +22,8 @@ class OnnxTransform:
2222
OnnxTransform is the base class for graph modifications on exported onnx.
2323
"""
2424

25+
_external_data_loaded_cache = {} # Dict[int, bool]
26+
2527
def __init__(self):
2628
raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.")
2729

@@ -45,12 +47,54 @@ def _check_external_data_loaded(cls, model: ModelProto) -> bool:
4547
:param model: The ONNX model to check
4648
:returns: True if external data is already loaded, False otherwise
4749
"""
50+
# Use object ID as key instead of the object itself
51+
model_id = id(model)
52+
# Return cached result if available
53+
if model_id in cls._external_data_loaded_cache:
54+
return cls._external_data_loaded_cache[model_id]
55+
56+
# Load the model if not already loaded
4857
for tensor in external_data_helper._get_all_tensors(model):
4958
# Check if tensor has external data but no raw data loaded
5059
if len(tensor.external_data) > 0 and not tensor.HasField("raw_data"):
60+
cls._external_data_loaded_cache[model_id] = False
5161
return False
62+
63+
cls._external_data_loaded_cache[model_id] = True
5264
return True
5365

66+
@classmethod
def _load_external_data(cls, model: ModelProto, onnx_base_dir: Optional[str] = None):
    """
    Load all external tensor data of the model in a single pass, unless
    `_check_external_data_loaded` reports that it is already loaded.

    :param model: The ONNX model whose external data should be loaded
    :param onnx_base_dir: Base directory to load tensors
    """
    if cls._check_external_data_loaded(model):
        logger.info("External data already loaded (or cached). Skipping bulk load.")
        return

    logger.info("External data not loaded. Performing bulk load.")
    external_data_helper.load_external_data_for_model(model, onnx_base_dir)
    # Record the result under id(model) so later checks hit the cache
    cls._external_data_loaded_cache[id(model)] = True
79+
80+
81+
@classmethod
def _cleanup_external_data_and_cache(cls, model: ModelProto):
    """
    Clear `raw_data` from every tensor of the model that has it, then remove
    the model's entry from the external-data cache.

    :param model: The ONNX model to clean up
    """
    for tensor_proto in external_data_helper._get_all_tensors(model):
        if not tensor_proto.HasField("raw_data"):
            continue
        tensor_proto.ClearField("raw_data")

    # The cache is keyed by id(model); dropping a missing key is a no-op
    cls._external_data_loaded_cache.pop(id(model), None)

    logger.info("External data and cache cleaned up.")
97+
5498
@classmethod
5599
def _cleanup_memory(cls):
56100
"""
@@ -69,36 +113,42 @@ def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwar
69113
"""
70114
:param onnx_base_dir: Base directory to load tensors
71115
"""
72-
finfo = np.finfo(np.float16)
73-
fp16_max = finfo.max
74-
fp16_min = finfo.min
75-
transformed = False
116+
try:
117+
# --- FIX: Ensure external data is loaded efficiently BEFORE processing ---
118+
cls._load_external_data(model, onnx_base_dir)
76119

77-
processed_count = 0
78-
for tensor in external_data_helper._get_all_tensors(model):
79-
nptensor = numpy_helper.to_array(tensor, onnx_base_dir)
80-
if nptensor.dtype == np.float32 and (np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)):
81-
neg_inf_mask = np.isinf(nptensor) & (nptensor < 0)
82-
clipped_tensor = np.clip(nptensor, fp16_min, fp16_max)
120+
finfo = np.finfo(np.float16)
121+
fp16_max = finfo.max
122+
fp16_min = finfo.min
123+
transformed = False
124+
125+
processed_count = 0
126+
for tensor in external_data_helper._get_all_tensors(model):
127+
nptensor = numpy_helper.to_array(tensor) # Removed onnx_base_dir as data is already loaded
128+
if nptensor.dtype == np.float32 and (np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)):
129+
neg_inf_mask = np.isinf(nptensor) & (nptensor < 0)
130+
clipped_tensor = np.clip(nptensor, fp16_min, fp16_max)
83131

84-
# Restore -inf values
85-
if neg_inf_mask.any():
86-
clipped_tensor = np.where(neg_inf_mask, np.float32("-inf"), clipped_tensor)
132+
# Restore -inf values
133+
if neg_inf_mask.any():
134+
clipped_tensor = np.where(neg_inf_mask, np.float32("-inf"), clipped_tensor)
87135

88-
new_tensor = numpy_helper.from_array(clipped_tensor, tensor.name)
89-
tensor.CopyFrom(new_tensor)
90-
transformed = True
136+
new_tensor = numpy_helper.from_array(clipped_tensor, tensor.name)
137+
tensor.CopyFrom(new_tensor)
138+
transformed = True
91139

92-
del neg_inf_mask, clipped_tensor, new_tensor
140+
del neg_inf_mask, clipped_tensor, new_tensor
93141

94-
del nptensor
95-
processed_count += 1
142+
del nptensor
143+
processed_count += 1
96144

97-
if processed_count % ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL == 0:
98-
cls._cleanup_memory()
145+
if processed_count % ONNX_TRANSFORM_MEMORY_CLEANUP_INTERVAL == 0:
146+
cls._cleanup_memory()
99147

100-
cls._cleanup_memory()
101-
return model, transformed
148+
return model, transformed
149+
finally:
150+
# Ensure cleanup happens even if an exception occurs
151+
cls._cleanup_memory()
102152

103153

104154
class SplitTensorsTransform(OnnxTransform):
@@ -123,32 +173,30 @@ def apply(
123173
:param file_chunk_size: Chunk size to split external files into.
124174
:param size_threshold: Only tensors greater than this threshold (in bytes) will be saved externally.
125175
"""
126-
file_num = 0
127-
current_file_size = 0
128-
transformed = False
129-
130-
# Check if external data is already loaded to avoid redundant loading
131-
external_data_already_loaded = cls._check_external_data_loaded(model)
132-
133-
if not external_data_already_loaded:
134-
external_data_helper.load_external_data_for_model(model, onnx_base_dir)
135-
else:
136-
logger.info("External data already loaded, skipping redundant load operation")
137-
138-
processed_count = 0
139-
for tensor in external_data_helper._get_all_tensors(model):
140-
if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold):
141-
transformed = True
142-
current_file_size += tsize
143-
if current_file_size > file_chunk_size:
144-
file_num += 1
145-
current_file_size = tsize
146-
external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data")
147-
148-
processed_count += 1
149-
if processed_count % ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL == 0:
150-
cls._cleanup_memory()
151-
152-
cls._cleanup_memory()
153-
154-
return model, transformed
176+
try:
177+
file_num = 0
178+
current_file_size = 0
179+
transformed = False
180+
181+
# --- Adjustment: The initial check and load will now use the new bulk loader ---
182+
# This will either use the cache (if FP16ClipTransform loaded it) or perform the bulk load itself.
183+
cls._load_external_data(model, onnx_base_dir)
184+
185+
processed_count = 0
186+
for tensor in external_data_helper._get_all_tensors(model):
187+
if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold):
188+
transformed = True
189+
current_file_size += tsize
190+
if current_file_size > file_chunk_size:
191+
file_num += 1
192+
current_file_size = tsize
193+
external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data")
194+
195+
processed_count += 1
196+
if processed_count % ONNX_TRANSFORM_MEMORY_CLEANUP_INTERVAL == 0:
197+
cls._cleanup_memory()
198+
199+
return model, transformed
200+
finally:
201+
# Ensure cleanup happens even if an exception occurs
202+
cls._cleanup_memory()

QEfficient/utils/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def get_models_dir():
7070
ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99
7171
ONNX_EXPORT_OPSET = 13
7272

73-
ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL = 100
73+
ONNX_TRANSFORM_MEMORY_CLEANUP_INTERVAL = 100
7474

7575
COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]
7676

scripts/memory_profiling/README.md

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# QEfficient Memory Profiling
2+
3+
A memory profiling solution for QEfficient workflows with manual operation marking.
4+
5+
6+
7+
## Quick Start
8+
9+
```python
10+
from scripts.memory_profiling import QEffMemoryProfiler
11+
from QEfficient import QEFFAutoModelForCausalLM
12+
13+
# Initialize profiler
14+
profiler = QEffMemoryProfiler(verbose=True)
15+
profiler.start_monitoring()
16+
17+
# Your QEfficient workflow
18+
model = QEFFAutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
19+
model.export()
20+
model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16)
21+
output = model.generate(prompts=["Hello world"])
22+
23+
# Generate report and visualization
24+
profiler.stop_monitoring()
25+
print(profiler.get_memory_report())
26+
profiler.generate_memory_graph("profile.png")
27+
```
28+
29+
## Configuration
30+
31+
### Basic Configuration
32+
33+
```python
34+
profiler = QEffMemoryProfiler(
35+
sampling_interval=0.1, # Sample every 100ms
36+
output_file="my_profile.png", # Custom output file
37+
verbose=True, # Enable detailed logging
38+
enable_cpu_monitoring=True, # Monitor CPU usage
39+
enable_disk_monitoring=True, # Monitor disk I/O
40+
)
41+
```
42+
43+
### Manual Operation Marking
44+
45+
```python
46+
profiler = QEffMemoryProfiler()
47+
profiler.start_monitoring()
48+
49+
# Manual operation marking
50+
profiler.mark_operation("Custom Operation 1")
51+
# ... your code ...
52+
53+
profiler.mark_operation("Custom Operation 2")
54+
# ... more code ...
55+
56+
profiler.stop_monitoring()
57+
```
58+
59+
## API Reference
60+
61+
### QEffMemoryProfiler
62+
63+
#### Constructor Parameters
64+
65+
| Parameter | Type | Default | Description |
66+
|-----------|------|---------|-------------|
67+
| `sampling_interval` | `float` | `0.05` | Time between samples (seconds) |
68+
| `output_file` | `str` | `"qeff_memory_profile.png"` | Output file path |
69+
| `verbose` | `bool` | `False` | Enable verbose logging |
70+
| `enable_cpu_monitoring` | `bool` | `True` | Monitor CPU usage |
71+
| `enable_disk_monitoring` | `bool` | `True` | Monitor disk I/O |
72+
73+
#### Methods
74+
75+
- **`start_monitoring()`**: Start background monitoring
76+
- **`stop_monitoring()`**: Stop monitoring and mark completion
77+
- **`mark_operation(name: str)`**: Manually mark the start of an operation
78+
- **`get_memory_report() -> str`**: Generate comprehensive text report
79+
- **`generate_memory_graph(filename: str)`**: Create visualization
80+
- **`stop_and_save(filename: str) -> str`**: Convenience method to stop and save
81+
82+
#### Properties
83+
84+
- **`peak_rss`**: Peak RSS memory usage (MB)
85+
- **`peak_operation`**: Operation during peak memory
86+
- **`samples`**: List of collected profiling samples
87+
- **`operations`**: List of marked operations with timestamps
88+
89+
## Operation Types
90+
91+
The profiler supports marking these common QEfficient operations:
92+
93+
- **Model Loading**: `from_pretrained`, `AutoModel`, `AutoTokenizer`
94+
- **Export**: `model.export()`, ONNX transforms, PyTorch transforms
95+
- **Compilation**: `model.compile()`, QNN compilation
96+
- **Generation**: `model.generate()`, inference execution
97+
- **Cleanup**: Memory cleanup, garbage collection
98+
99+
## Output
100+
101+
### Console Report
102+
```
103+
QEFFICIENT PERFORMANCE MONITORING REPORT
104+
============================================================
105+
Peak Memory Usage:
106+
• RSS (Physical): 18.7 GB at 14:23:45
107+
• Peak during: Compilation
108+
109+
Memory Statistics:
110+
• Current RSS: 16.2 GB (Delta: +15.8 GB)
111+
• Duration: 185.3 seconds
112+
• Operations: 4
113+
114+
QEfficient Operations Timeline:
115+
1. 0.0s - Model Loading (25.2s) [+8.2 GB]
116+
2. 25.2s - Export (15.4s) [+2.1 GB]
117+
3. 40.6s - Compilation (120.8s) [+6.3 GB] <- Peak
118+
4. 161.4s - Generation (18.7s) [+1.2 GB]
119+
```
120+
121+
### Visualization
122+
123+
The profiler generates a comprehensive 4-panel visualization:
124+
125+
1. **Memory Timeline**: RSS usage with colored operation phases
126+
2. **CPU Usage**: CPU utilization with performance zones
127+
3. **Disk I/O**: Read/write activity per operation phase
128+
4. **Phase Duration**: Timing analysis with duration labels
129+
130+
#### Sample Output
131+
132+
![Sample Memory Profile](memory_profile_llama3.2.png)
133+
134+
*Example memory profiling output showing the QEfficient workflow phases (model loading, ONNX transforms, compilation, and generation) with detailed memory, CPU, and disk I/O metrics.*
135+
136+
## Advanced Usage
137+
138+
139+
### Accessing Raw Data
140+
141+
```python
142+
# Get synchronized data arrays
143+
data = profiler.get_synchronized_data()
144+
timestamps = data['timestamps']
145+
memory_usage = data['rss_memory']
146+
cpu_usage = data['cpu_usage']
147+
148+
# Access individual samples
149+
for sample in profiler.samples:
150+
print(f"Time: {sample.timestamp}, RSS: {sample.rss_mb} MB")
151+
```
152+
153+
## Integration Examples
154+
155+
### With Existing QEfficient Scripts
156+
157+
```python
158+
# Add to existing QEfficient workflow
159+
profiler = QEffMemoryProfiler(output_file="workflow_profile.png")
160+
profiler.start_monitoring()
161+
162+
# Existing QEfficient code unchanged
163+
model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
164+
# ... rest of workflow ...
165+
166+
# Add at end
167+
report = profiler.stop_and_save()
168+
print(report)
169+
```
170+
171+
172+
## Compatibility
173+
174+
- **Python**: 3.7+
175+
- **Dependencies**: `psutil`, `matplotlib`, `numpy`

0 commit comments

Comments
 (0)