@@ -96,7 +96,8 @@ void setup_input_tensors(
     std::vector<at::Tensor> inputs,
     c10::intrusive_ptr<TRTEngine> compiled_engine,
     bool cudagraphs_enabled,
-    bool need_cudagraphs_record) {
+    bool need_cudagraphs_record,
+    bool shape_changed) {
   // this is a buffer to store shape tensor input addresses throughout the runtime scope
   std::list<std::vector<int64_t>> inputShapeTensorValues;
   std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
@@ -117,7 +118,7 @@ void setup_input_tensors(
     auto shape = core::util::toVec(dims);
     LOG_DEBUG("Input Name: " << name << " Shape: " << dims);

-    if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
+    if (compiled_engine->isShapeInferenceIO[name]) {
       // Shape tensor inputs are casted to int64 explicitly.
       // Refer to
       // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
@@ -145,10 +146,10 @@ void setup_input_tensors(
         // Create a new persistent input buffer
         compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
       }
-
-      TORCHTRT_CHECK(
-          compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
-
+      if (shape_changed) {
+        TORCHTRT_CHECK(
+            compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
+      }
       if (cudagraphs_enabled) {
         // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
         compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
@@ -217,7 +218,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     compiled_engine->cudagraph.reset();
   }

-  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+  std::vector<at::Tensor> outputs;

   // Intialize inputs and outputs to be available throughout the succeeding scopes
   { // Input Setup
@@ -226,10 +227,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       input_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
     }
-
-    setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
+    setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, shape_changed);
     // Check if input shapes can be inferred.
-    int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
+    int32_t const io_size{compiled_engine->io_size};
     std::vector<char const*> names(io_size);
     int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data());
     TORCHTRT_CHECK(
@@ -240,6 +240,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   }

   { // Output Setup
+    bool new_outputs = false;
     std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
     if (compiled_engine->profile_execution) {
       output_profiler_guard =
@@ -248,64 +249,60 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     if (can_use_pre_allocated_outputs) {
       outputs = compiled_engine->pre_allocated_outputs;
     } else {
-      outputs = create_output_tensors(compiled_engine);
+      if (compiled_engine->allocated_outputs.size() == 0 or compiled_engine->requires_new_output_tensor or
+          shape_changed) {
+        compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
+        new_outputs = true;
+      }
+      outputs = compiled_engine->allocated_outputs;
     }

-    for (auto output_indices : compiled_engine->out_binding_map) {
-      auto pyt_idx = output_indices.second;
-      std::string name = compiled_engine->out_binding_names[pyt_idx];
-      if (need_cudagraphs_record) {
-        // If we are recording the cuda graph then we need to update the persistent output buffer
-        compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-      }
+    if (new_outputs) {
+      for (auto output_indices : compiled_engine->out_binding_map) {
+        auto pyt_idx = output_indices.second;
+        std::string name = compiled_engine->out_binding_names[pyt_idx];
+        if (need_cudagraphs_record) {
+          // If we are recording the cuda graph then we need to update the persistent output buffer
+          compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
+        }

-      if (cudagraphs_enabled) {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(
-                name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
-      } else {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
+        if (cudagraphs_enabled) {
+          TORCHTRT_CHECK(
+              compiled_engine->exec_ctx->setTensorAddress(
+                  name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
+              "Error while setting the output tensor address");
+        } else {
+          TORCHTRT_CHECK(
+              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
+              "Error while setting the output tensor address");
+        }
       }
     }
   }

   auto current_device_id = -1;
   if (inputs.size() > 0) {
     current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-  } else if (outputs.size() > 0) {
-    current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
-  }
-
-  compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-  if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-    // Create a new stream if the engine stream is the default stream
-    compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
+    if (current_device_id != compiled_engine->current_device_id) {
+      compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+    }
   }

   { // Engine Execution (execute on engine stream)
-    c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

     std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
     if (compiled_engine->profile_execution) {
       enqueue_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
     }

-    // Block engine stream until results are available on caller stream
-    at::cuda::CUDAEvent caller_exec_complete;
-    caller_exec_complete.record(compiled_engine->caller_stream);
-    caller_exec_complete.block(compiled_engine->engine_stream);
-
     if (!cudagraphs_enabled) {
       // Direct execution uses the caller buffers directly
-      compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+      compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
     } else {
       if (need_cudagraphs_record) {
         // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
-        c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
+        c10::cuda::CUDAStream recording_stream = compiled_engine->stream;
         compiled_engine->cudagraph.capture_begin();
         compiled_engine->exec_ctx->enqueueV3(recording_stream);
         compiled_engine->cudagraph.capture_end();
@@ -325,11 +322,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
   }

-  // Block caller stream until engine execution is complete
-  at::cuda::CUDAEvent trt_exec_complete;
-  trt_exec_complete.record(compiled_engine->engine_stream);
-  trt_exec_complete.block(compiled_engine->caller_stream);
-
   if (cudagraphs_enabled) {
     // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
     for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
@@ -354,7 +346,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
     }

-    setup_input_tensors(inputs, compiled_engine, false, false);
+    setup_input_tensors(inputs, compiled_engine, false, false, true);
     // Check if input shapes can be inferred.
     int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
     std::vector<char const*> names(io_size);
@@ -378,40 +370,24 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     auto current_device_id = -1;
     if (inputs.size() > 0) {
       current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    } else {
-      current_device_id = at::cuda::current_device();
-    }
-
-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-      // Create a new stream if the engine stream is the default stream
-      compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
+      if (current_device_id != compiled_engine->current_device_id) {
+        compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+      }
     }

     { // Engine Execution (execute on engine stream)
-      c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

       std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
       if (compiled_engine->profile_execution) {
         enqueue_profiler_guard =
             std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
       }

-      // Block engine stream until results are available on caller stream
-      at::cuda::CUDAEvent caller_exec_complete;
-      caller_exec_complete.record(compiled_engine->caller_stream);
-      caller_exec_complete.block(compiled_engine->engine_stream);
-
       // Direct execution uses the caller buffers directly
-      compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+      compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);

     } // End engine exeuction (resets to caller stream)

-    // Block caller stream until engine execution is complete
-    at::cuda::CUDAEvent trt_exec_complete;
-    trt_exec_complete.record(compiled_engine->engine_stream);
-    trt_exec_complete.block(compiled_engine->caller_stream);
-
     std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
     if (compiled_engine->profile_execution) {
       output_profiler_guard =
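
Note on the `shape_changed` flag threaded through these hunks: `setInputShape` and output tensor (re)allocation are now gated so they only run when the input shapes actually differ from the previous call. The caller-side logic that computes `shape_changed` is not part of the hunks shown here; below is a minimal illustrative sketch of how such a check could look. All names in it (`cached_input_shapes`, `inputs_shape_changed`) are hypothetical and are not taken from this PR.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical caller-side helper: compare this call's input shapes with the
// shapes cached from the previous call and report whether anything changed.
static bool inputs_shape_changed(
    std::vector<std::vector<int64_t>>& cached_input_shapes, // persists across calls
    const std::vector<std::vector<int64_t>>& current_shapes) {
  if (cached_input_shapes != current_shapes) {
    cached_input_shapes = current_shapes; // remember the new shapes
    return true; // shapes differ -> setInputShape / output reallocation needed
  }
  return false; // shapes unchanged -> skip the redundant shape setup
}
```

When the shapes match the previous call, the redundant TensorRT shape setup and output tensor reallocation can be skipped, which is the intent of the gating introduced in this diff.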