@@ -41,11 +41,11 @@ Intel® Extension for PyTorch* shares most of the features for CPU and GPU.
these optimizations will be landed in PyTorch master through PRs that are
being submitted and reviewed. Auto Mixed Precision (AMP) with both BFloat16
and Float16 has been enabled for Intel discrete GPUs.
- - **Graph Optimization: ** To optimize performance further with torchscript,
- Intel® Extension for PyTorch* supports fusion of frequently used operator
- patterns, like Conv2D+ReLU, Linear+ReLU, etc. The benefit of the fusions are
- delivered to users in a transparent fashion. Detailed fusion patterns
- supported can be found `here <https://github.com/intel/intel-extension-for-pytorch >`_.
+ - **Graph Optimization:** To optimize performance further, Intel® Extension for
+ PyTorch* supports fusion of frequently used operator patterns, like Conv2D+ReLU,
+ Linear+ReLU, etc. The benefits of these fusions are delivered to users in a
+ transparent fashion. Details of the supported fusion patterns can be found
+ `here <https://github.com/intel/intel-extension-for-pytorch>`_.
The graph optimization will be up-streamed to PyTorch with the introduction
of oneDNN Graph API.
- **Operator Optimization:** Intel® Extension for PyTorch* also optimizes
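A minimal sketch of how the AMP and graph-optimization features described above are used from Python, assuming Intel® Extension for PyTorch* is installed, follows the same ``ipex.optimize`` plus TorchScript pattern as the examples later in this document:

.. code:: python3

   import torch
   import torchvision.models as models

   import intel_extension_for_pytorch as ipex

   model = models.resnet50(pretrained=True)
   model.eval()
   data = torch.rand(1, 3, 224, 224)

   # ipex.optimize applies the operator optimizations; dtype=torch.bfloat16
   # prepares the model for BFloat16 Auto Mixed Precision.
   model = ipex.optimize(model, dtype=torch.bfloat16)

   with torch.no_grad():
       with torch.cpu.amp.autocast():
           # Tracing and freezing the model lets the fused operator patterns
           # (e.g. Conv2D+ReLU) be applied transparently at inference time.
           model = torch.jit.trace(model, data)
           model = torch.jit.freeze(model)
           model(data)

On Intel discrete GPUs the same flow applies after moving ``model`` and ``data`` to the ``"xpu"`` device and using ``torch.xpu.amp.autocast`` instead, as in the GPU examples below.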
@@ -186,8 +186,8 @@ BFloat16
'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.pth')

- Inference - Imperative Mode
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Inference
+ ~~~~~~~~~

Float32
^^^^^^^
@@ -234,67 +234,6 @@ BFloat16
with torch.cpu.amp.autocast():
model(data)

- Inference - TorchScript Mode
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- TorchScript mode makes graph optimization possible, hence improves
- performance for some topologies. Intel® Extension for PyTorch* enables most
- commonly used operator pattern fusion, and users can get the performance
- benefit without additional code changes.
-
- Float32
- ^^^^^^^
-
- .. code :: python3
-
- import torch
- import torchvision.models as models
-
- model = models.resnet50(pretrained=True)
- model.eval()
- data = torch.rand(1, 3, 224, 224)
-
- #################### code changes ####################
- import intel_extension_for_pytorch as ipex
- model = ipex.optimize(model)
- ######################################################
-
- with torch.no_grad():
- d = torch.rand(1, 3, 224, 224)
- model = torch.jit.trace(model, d)
- model = torch.jit.freeze(model)
-
- model(data)
-
- BFloat16
- ^^^^^^^^
-
- .. code :: python3
-
- import torch
- from transformers import BertModel
-
- model = BertModel.from_pretrained(args.model_name)
- model.eval()
-
- vocab_size = model.config.vocab_size
- batch_size = 1
- seq_length = 512
- data = torch.randint(vocab_size, size=[batch_size, seq_length])
-
- #################### code changes ####################
- import intel_extension_for_pytorch as ipex
- model = ipex.optimize(model, dtype=torch.bfloat16)
- ######################################################
-
- with torch.no_grad():
- with torch.cpu.amp.autocast():
- d = torch.randint(vocab_size, size=[batch_size, seq_length])
- model = torch.jit.trace(model, (d,), check_trace=False, strict=False)
- model = torch.jit.freeze(model)
-
- model(data)
-

Examples -- GPU
---------------
@@ -420,8 +359,8 @@ BFloat16
'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.pth')

- Inference - Imperative Mode
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Inference
+ ~~~~~~~~~

Float32
^^^^^^^
@@ -510,121 +449,6 @@ Float16
################################# code changes ######################################
model(data)

- Inference - TorchScript Mode
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- TorchScript mode makes graph optimization possible, hence improves
- performance for some topologies. Intel® Extension for PyTorch* enables most
- commonly used operator pattern fusion, and users can get the performance
- benefit without additional code changes.
-
- Float32
- ^^^^^^^
-
- .. code :: python3
-
- import torch
- from transformers import BertModel
- ############# code changes ###############
- import intel_extension_for_pytorch as ipex
- ############# code changes ###############
-
- model = BertModel.from_pretrained(args.model_name)
- model.eval()
-
- vocab_size = model.config.vocab_size
- batch_size = 1
- seq_length = 512
- data = torch.randint(vocab_size, size=[batch_size, seq_length])
-
- #################### code changes ################
- model = model.to("xpu")
- data = data.to("xpu")
- model = ipex.optimize(model, dtype=torch.float32)
- #################### code changes ################
-
- with torch.no_grad():
- d = torch.randint(vocab_size, size=[batch_size, seq_length])
- ##### code changes #####
- d = d.to("xpu")
- ##### code changes #####
- model = torch.jit.trace(model, (d,), check_trace=False, strict=False)
- model = torch.jit.freeze(model)
-
- model(data)
-
- BFloat16
- ^^^^^^^^
-
- .. code :: python3
-
- import torch
- from transformers import BertModel
- ############# code changes ###############
- import intel_extension_for_pytorch as ipex
- ############# code changes ###############
-
- model = BertModel.from_pretrained(args.model_name)
- model.eval()
-
- vocab_size = model.config.vocab_size
- batch_size = 1
- seq_length = 512
- data = torch.randint(vocab_size, size=[batch_size, seq_length])
-
- #################### code changes #################
- model = model.to("xpu")
- data = data.to("xpu")
- model = ipex.optimize(model, dtype=torch.bfloat16)
- #################### code changes #################
-
- with torch.no_grad():
- d = torch.randint(vocab_size, size=[batch_size, seq_length])
- ################################# code changes ######################################
- d = d.to("xpu")
- with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=False):
- ################################# code changes ######################################
- model = torch.jit.trace(model, (d,), check_trace=False, strict=False)
- model = torch.jit.freeze(model)
-
- model(data)
-
- Float16
- ^^^^^^^
-
- .. code :: python3
-
- import torch
- from transformers import BertModel
- ############# code changes ###############
- import intel_extension_for_pytorch as ipex
- ############# code changes ###############
-
- model = BertModel.from_pretrained(args.model_name)
- model.eval()
-
- vocab_size = model.config.vocab_size
- batch_size = 1
- seq_length = 512
- data = torch.randint(vocab_size, size=[batch_size, seq_length])
-
- #################### code changes ################
- model = model.to("xpu")
- data = data.to("xpu")
- model = ipex.optimize(model, dtype=torch.float16)
- #################### code changes ################
-
- with torch.no_grad():
- d = torch.randint(vocab_size, size=[batch_size, seq_length])
- ################################# code changes ######################################
- d = d.to("xpu")
- with torch.xpu.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=False):
- ################################# code changes ######################################
- model = torch.jit.trace(model, (d,), check_trace=False, strict=False)
- model = torch.jit.freeze(model)
-
- model(data)
-

C++ (CPU only)
~~~~~~~~~~~~~~
@@ -657,10 +481,11 @@ once C++ dynamic library of Intel® Extension for PyTorch* is linked.
}
std::vector<torch::jit::IValue> inputs;
// make sure input data are converted to channels last format
- inputs.push_back(torch::ones ({1, 3, 224, 224}).to(c10::MemoryFormat::ChannelsLast) );
+ inputs.push_back(torch::rand({1, 3, 224, 224}));

at::Tensor output = module.forward(inputs).toTensor();
-
+ std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << std::endl;
+ std::cout << "Execution finished" << std::endl;
return 0;
}
@@ -676,7 +501,7 @@ once C++ dynamic library of Intel® Extension for PyTorch* is linked.
add_executable(example-app example-app.cpp)
target_link_libraries(example-app "${TORCH_LIBRARIES}")

- set_property(TARGET example-app PROPERTY CXX_STANDARD 14 )
+ set_property(TARGET example-app PROPERTY CXX_STANDARD 17)
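For context, a complete ``CMakeLists.txt`` consistent with the fragment above might look like the following sketch; the ``find_package`` calls and the minimum CMake version are assumptions, with the ``IPEX`` package name inferred from the ``-- Found IPEX`` line in the configuration output further below::

   # Minimal sketch; the version requirement below is illustrative only.
   cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
   project(example-app)

   # find_package(Torch) provides ${TORCH_LIBRARIES}; the IPEX package name is
   # an assumption inferred from the "-- Found IPEX" configuration output.
   find_package(Torch REQUIRED)
   find_package(IPEX REQUIRED)

   add_executable(example-app example-app.cpp)
   target_link_libraries(example-app "${TORCH_LIBRARIES}")

   set_property(TARGET example-app PROPERTY CXX_STANDARD 17)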
**Command for compilation**
@@ -691,31 +516,20 @@ into the binary. This can be verified with the Linux command `ldd`.
::

$ cmake -DCMAKE_PREFIX_PATH=/workspace/libtorch ..
- -- The C compiler identification is GNU 9.3.0
- -- The CXX compiler identification is GNU 9.3.0
- -- Check for working C compiler: /usr/bin/cc
- -- Check for working C compiler: /usr/bin/cc -- works
+ -- The C compiler identification is GNU XX.X.X
+ -- The CXX compiler identification is GNU XX.X.X
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
+ -- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
- -- Check for working CXX compiler: /usr/bin/c++
- -- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
+ -- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
- -- Looking for pthread.h
- -- Looking for pthread.h - found
- -- Performing Test CMAKE_HAVE_LIBC_PTHREAD
- -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
- -- Looking for pthread_create in pthreads
- -- Looking for pthread_create in pthreads - not found
- -- Looking for pthread_create in pthread
- -- Looking for pthread_create in pthread - found
- -- Found Threads: TRUE
-- Found Torch: /workspace/libtorch/lib/libtorch.so
- -- Found INTEL_EXT_PT_CPU: TRUE
+ -- Found IPEX: /workspace/libtorch/lib/libintel-ext-pt-cpu.so
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/build
@@ -726,18 +540,6 @@ into the binary. This can be verified with the Linux command `ldd`.
libc10.so => /workspace/libtorch/lib/libc10.so (0x00007f3cf985a000)
libintel-ext-pt-cpu.so => /workspace/libtorch/lib/libintel-ext-pt-cpu.so (0x00007f3cf70fc000)
libtorch_cpu.so => /workspace/libtorch/lib/libtorch_cpu.so (0x00007f3ce16ac000)
- ...
- libdnnl_graph.so.0 => /workspace/libtorch/lib/libdnnl_graph.so.0 (0x00007f3cde954000)
- ...
-
- Model Zoo (CPU only)
- --------------------
-
- Use cases that had already been optimized by Intel engineers are available at
- `Model Zoo for Intel® Architecture <https://github.com/IntelAI/models/ >`_ (with
- the branch name in format of `pytorch-r<version>-models `). Many PyTorch use
- cases for benchmarking are also available on the GitHub page. You can get
- performance benefits out-of-the-box by simply running scripts in the Model Zoo.

Tutorials
---------