diff --git a/CMakeLists.txt b/CMakeLists.txt index 8cee416f8..b20caec8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,15 +31,15 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INSTALL_RPATH $ORIGIN) # Need the torch package -set(Torch_COMP_VERION "${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR}") -find_package(Torch ${Torch_COMP_VERION} REQUIRED) +set(Torch_COMP_VERSION "${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR}") +find_package(Torch ${Torch_COMP_VERSION} REQUIRED) if(NOT EXISTS ${TORCH_INSTALL_PREFIX}) message(FATAL_ERROR "Can NOT find torch install path at ${TORCH_INSTALL_PREFIX}!") endif() -if(NOT ${Torch_COMP_VERION} VERSION_EQUAL "${Torch_VERSION_MAJOR}.${Torch_VERSION_MINOR}") - message(FATAL_ERROR "Not compatible Torch version ${Torch_VERSION} at ${TORCH_INSTALL_PREFIX}!\nTorch ${Torch_COMP_VERION} is needed!") +if(NOT ${Torch_COMP_VERSION} VERSION_EQUAL "${Torch_VERSION_MAJOR}.${Torch_VERSION_MINOR}") + message(FATAL_ERROR "Not compatible Torch version ${Torch_VERSION} at ${TORCH_INSTALL_PREFIX}!\nTorch ${Torch_COMP_VERSION} is needed!") endif() include(${IPEX_ROOT_DIR}/cmake/Options.cmake) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 23418d006..c3374bf53 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -75,7 +75,7 @@ In case you want to reinstall, make sure that you uninstall Intel® Extension fo ### Tips and Debugging -* A prerequisite to installing Intel® Extension for PyTorch\* is CMake. We recommend installing it with [Homebrew](https://brew.sh/) with `brew install cmake` if you are developing on MacOS or Linux system. +* A prerequisite to installing Intel® Extension for PyTorch\* is CMake. We recommend installing it with [Homebrew](https://brew.sh/) with `brew install cmake` if you are developing on macOS or Linux system. * Our `setup.py` requires Python >= 3.6 * If you run into errors when running `python setup.py develop`, here are some debugging steps: 1. 
Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. diff --git a/cmake/cppsdk/gen_self_extract.sh.in b/cmake/cppsdk/gen_self_extract.sh.in index 1a279087c..ca9a064ea 100755 --- a/cmake/cppsdk/gen_self_extract.sh.in +++ b/cmake/cppsdk/gen_self_extract.sh.in @@ -32,5 +32,5 @@ if [ $? -gt 0 ]; then exit 23 fi -echo "Successfully generate self-extacting package at ${LIBIPEX_INSTALL_SCRIPT}" +echo "Successfully generate self-extracting package at ${LIBIPEX_INSTALL_SCRIPT}" exit diff --git a/cmake/cppsdk/libintel-ext-pt.installer.sh.in b/cmake/cppsdk/libintel-ext-pt.installer.sh.in index 066007f1b..35cc4d1f7 100644 --- a/cmake/cppsdk/libintel-ext-pt.installer.sh.in +++ b/cmake/cppsdk/libintel-ext-pt.installer.sh.in @@ -119,7 +119,7 @@ if [[ ${COMMAND} == "install" ]]; then echo "f|${comp}" >> ${LIBTORCH_PATH}/${LOGFILE} done - echo "Installation successed!" + echo "Installation succeeded!" # LIBIPEX Uninstallation elif [[ ${COMMAND} == "uninstall" ]]; then @@ -144,7 +144,7 @@ elif [[ ${COMMAND} == "uninstall" ]]; then rm -f ${LIBTORCH_PATH}/${LOGFILE} fi - echo "Uninstallation successed!" + echo "Uninstallation succeeded!" fi exit diff --git a/cmake/cpu/IsaCodegen.cmake b/cmake/cpu/IsaCodegen.cmake index 8ab710002..b7c45831e 100644 --- a/cmake/cpu/IsaCodegen.cmake +++ b/cmake/cpu/IsaCodegen.cmake @@ -141,7 +141,7 @@ endif(CXX_AVX2_FOUND) list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES) math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1") -# The sources list might get reordered later based on the capabilites. +# The sources list might get reordered later based on the capabilities. 
# See NOTE [ Linking AVX and non-AVX files ] foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) foreach(IMPL ${cpu_kernel_cpp_in}) diff --git a/csrc/cpu/aten/AveragePool.cpp b/csrc/cpu/aten/AveragePool.cpp index fcb85120b..164b43973 100644 --- a/csrc/cpu/aten/AveragePool.cpp +++ b/csrc/cpu/aten/AveragePool.cpp @@ -308,7 +308,7 @@ at::Tensor avg_pool3d_out_cpu( } else { TORCH_CHECK( false, - "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); + "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); } TORCH_CHECK( @@ -459,7 +459,7 @@ at::Tensor avg_pool3d_backward_out_cpu( } else { TORCH_CHECK( false, - "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); + "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); } TORCH_CHECK( diff --git a/csrc/cpu/aten/Conv.cpp b/csrc/cpu/aten/Conv.cpp index fcf39d600..16874f670 100644 --- a/csrc/cpu/aten/Conv.cpp +++ b/csrc/cpu/aten/Conv.cpp @@ -104,7 +104,7 @@ at::Tensor convolution_kernel( at::MemoryFormat memory_format) { // Base convolution kernel, this base kernel will not change input's format, // so make sure you has make process the input's format before call this - // function, the output wil has same format with input. + // function, the output will have the same format as the input. // TODO: the input will be actively converted to channels last format // after the 5-D tensor supports channels last format. 
TORCH_CHECK( diff --git a/csrc/cpu/aten/ConvTranspose.cpp b/csrc/cpu/aten/ConvTranspose.cpp index 799d04f21..6e1c3db47 100644 --- a/csrc/cpu/aten/ConvTranspose.cpp +++ b/csrc/cpu/aten/ConvTranspose.cpp @@ -36,7 +36,7 @@ std::vector conv_input_size( static inline std::vector padding_r( at::IntArrayRef padding, at::IntArrayRef output_padding) { - // ConvTranpose padding adjustment + // ConvTranspose padding adjustment // // PyTorch uses padding/output_padding: // osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) diff --git a/csrc/cpu/aten/DistributedMergedEmb.cpp b/csrc/cpu/aten/DistributedMergedEmb.cpp index f7f90035f..d432175b2 100644 --- a/csrc/cpu/aten/DistributedMergedEmb.cpp +++ b/csrc/cpu/aten/DistributedMergedEmb.cpp @@ -68,7 +68,7 @@ IPEX_DEFINE_DISPATCH(mergedemb_distribute_backward_merge_adagrad_update_stub); * distributed-merged-embedding-foward-lookup * 1. mergedemb_distribute_backward_local_cpu will finish the backward with * local grad (shape of [local BS * num_table * emb_dim]), the output grad will - * be organzied by 3 TensorList: val Tensors, idx Tensors, ofs Tensors. The + * be organized by 3 TensorList: val Tensors, idx Tensors, ofs Tensors. The * number of the Tensors in 1 TensorList equal to world size. val[i], idx[i], * ofs[i] is the tensors will be transfer to rank i by sparse all to all. It * contains the grads for those indices on rank i. diff --git a/csrc/cpu/aten/EmbeddingBag.cpp b/csrc/cpu/aten/EmbeddingBag.cpp index 1e93545ec..daf2b7caa 100644 --- a/csrc/cpu/aten/EmbeddingBag.cpp +++ b/csrc/cpu/aten/EmbeddingBag.cpp @@ -148,8 +148,8 @@ at::Tensor embedding_bag( } // namespace torch_ipex /* -A namespace wrapper to keep API compatiable to callers. -And also compatiable to new dyndisp. +A namespace wrapper to keep API compatible to callers. +And also compatible to new dyndisp. 
*/ namespace torch_ipex { diff --git a/csrc/cpu/aten/FlashAttention.cpp b/csrc/cpu/aten/FlashAttention.cpp index 14c2c4286..479576e01 100644 --- a/csrc/cpu/aten/FlashAttention.cpp +++ b/csrc/cpu/aten/FlashAttention.cpp @@ -20,7 +20,7 @@ bool use_ipex_flash_attention( } /* - *Caculate the flash attention SDPA with attention mask. + *Calculate the flash attention SDPA with attention mask. */ std::tuple flash_attention_forward_cpu( const at::Tensor& query, diff --git a/csrc/cpu/aten/LayerNorm.cpp b/csrc/cpu/aten/LayerNorm.cpp index 936d10cae..f8a4755c9 100644 --- a/csrc/cpu/aten/LayerNorm.cpp +++ b/csrc/cpu/aten/LayerNorm.cpp @@ -25,7 +25,7 @@ std::tuple layer_norm_impl( double eps) { TORCH_CHECK( gamma.scalar_type() == at::kFloat && beta.scalar_type() == at::kFloat, - "gamma adn beta's data type should be float"); + "gamma and beta's data type should be float"); ideep::tensor x = itensor_view_from_dense(X); const ideep::tensor scale = itensor_view_from_dense(gamma); const ideep::tensor shift = itensor_view_from_dense(beta); @@ -148,7 +148,7 @@ at::Tensor layer_norm_forward( * Now, we only use oneDNN kernel when both weight and bias are provided. * ToDo: more scenarios to use oneDNN or remvoe this pass * when at::layer_norm performance is back compared to w/o - * mergeing https://github.com/pytorch/pytorch/pull/59987 + * merging https://github.com/pytorch/pytorch/pull/59987 * * @param input: the source tensor to layernorm * @param normalized_shape: input shape from an expected input of size diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp index 23de79e57..82d4e11e9 100644 --- a/csrc/cpu/aten/Linear.cpp +++ b/csrc/cpu/aten/Linear.cpp @@ -45,7 +45,7 @@ void linear_kernel_output( dim == 2 ? self_ : self_.reshape({-1, self.size(self.dim() - 1)}); const ideep::tensor mkldnn_input = itensor_view_from_dense(self_reshaped); // output.sizes() will return a reference for output's size which will not - // hold the underlaying storage. 
It will be released if output are dead + // hold the underlying storage. It will be released if output are dead // (output = output.reshape(output_size_reshaped)) output.sizes().vec() will // trigger a copy and can hold the sizes vector. auto output_size = output.sizes().vec(); diff --git a/csrc/cpu/aten/MaskedMultiHeadAttention.cpp b/csrc/cpu/aten/MaskedMultiHeadAttention.cpp index 1989e3bbe..77f77fec9 100644 --- a/csrc/cpu/aten/MaskedMultiHeadAttention.cpp +++ b/csrc/cpu/aten/MaskedMultiHeadAttention.cpp @@ -10,7 +10,7 @@ IPEX_DEFINE_DISPATCH(deepseekv2_mla_kernel_stub); IPEX_DEFINE_DISPATCH(prepare_4d_causal_attention_mask_kernel_stub); /* - *Caculate the masked multihead attention for decoder layer in decoder only + *Calculate the masked multihead attention for decoder layer in decoder only *model. *@param query *@param key diff --git a/csrc/cpu/aten/MergedEmbeddingBag.h b/csrc/cpu/aten/MergedEmbeddingBag.h index 1687ce21f..4318d87bd 100644 --- a/csrc/cpu/aten/MergedEmbeddingBag.h +++ b/csrc/cpu/aten/MergedEmbeddingBag.h @@ -50,7 +50,7 @@ class EMBROWFixLen { * EmbeddingRowCache with smaller memory usage. * * EmbeddingRowCache contains var length EmbRow hash map and Fixed length EmbRow - * with len=64, 128, 256 And handle different lenght inside EmbeddingRowCache + * with len=64, 128, 256 And handle different length inside EmbeddingRowCache * without expose len info to users. * * The robin_hood::unordered_map _cached_ptr is used because user @@ -61,7 +61,7 @@ class EMBROWFixLen { * We will allocate memory to hold emb row very frequently during Embedding * FW/BW, we wish to allocate the memory on stack by using temporal varalble * instead of allocating them in heap for performance consideration. So we use C - * array to hold fixed length and use std::vector to hold var lenght + * array to hold fixed length and use std::vector to hold var length * (std::vector will use memory on heap). 
* * How to use: diff --git a/csrc/cpu/aten/PagedAttention.cpp b/csrc/cpu/aten/PagedAttention.cpp index ee19109e6..4395cee81 100644 --- a/csrc/cpu/aten/PagedAttention.cpp +++ b/csrc/cpu/aten/PagedAttention.cpp @@ -11,7 +11,7 @@ IPEX_DEFINE_DISPATCH(reshape_and_cache_kernel_stub); IPEX_DEFINE_DISPATCH(flash_attn_var_len_kernel_stub); /* - *Caculate the masked multihead attention for decoder layer in decoder only + *Calculate the masked multihead attention for decoder layer in decoder only */ at::Tensor single_query_cached_kv_attention_forward_cpu( at::Tensor& out, // [num_seqs, num_heads, head_size] diff --git a/csrc/cpu/aten/Punica.cpp b/csrc/cpu/aten/Punica.cpp index 0966b2145..a7121bacf 100644 --- a/csrc/cpu/aten/Punica.cpp +++ b/csrc/cpu/aten/Punica.cpp @@ -17,9 +17,9 @@ at::Tensor punica_bgmv_shrink_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, const double scale) { - punica_bgmv_shrink_kernel_stub(kCPU, out, input, weights, indicies, scale); + punica_bgmv_shrink_kernel_stub(kCPU, out, input, weights, indices, scale); return out; } @@ -27,11 +27,11 @@ at::Tensor punica_sgmv_shrink_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, const double scale) { punica_sgmv_shrink_kernel_stub( - kCPU, out, input, weights, indicies, seq_lens, scale); + kCPU, out, input, weights, indices, seq_lens, scale); return out; } @@ -39,10 +39,10 @@ at::Tensor punica_bgmv_expand_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, bool add_inputs) { punica_bgmv_expand_kernel_stub( - kCPU, out, input, weights, indicies, add_inputs); + kCPU, out, input, weights, indices, add_inputs); return out; } @@ -50,11 +50,11 @@ at::Tensor punica_sgmv_expand_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, 
at::Tensor& seq_lens, bool add_inputs) { punica_sgmv_expand_kernel_stub( - kCPU, out, input, weights, indicies, seq_lens, add_inputs); + kCPU, out, input, weights, indices, seq_lens, add_inputs); return out; } @@ -62,7 +62,7 @@ at::Tensor punica_bgmv_expand_slice_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, int64_t slice_offset, int64_t slice_size, bool add_inputs) { @@ -71,7 +71,7 @@ at::Tensor punica_bgmv_expand_slice_forward_cpu( out, input, weights, - indicies, + indices, slice_offset, slice_size, add_inputs); @@ -82,7 +82,7 @@ at::Tensor punica_sgmv_expand_slice_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, int64_t slice_offset, int64_t slice_size, @@ -92,7 +92,7 @@ at::Tensor punica_sgmv_expand_slice_forward_cpu( out, input, weights, - indicies, + indices, seq_lens, slice_offset, slice_size, diff --git a/csrc/cpu/aten/Punica.h b/csrc/cpu/aten/Punica.h index 0bdaceabb..1fae98094 100644 --- a/csrc/cpu/aten/Punica.h +++ b/csrc/cpu/aten/Punica.h @@ -12,14 +12,14 @@ void punica_bgmv_shrink( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, const double scale); void punica_sgmv_shrink( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, const double scale); @@ -27,14 +27,14 @@ void punica_bgmv_expand( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, bool add_inputs); void punica_sgmv_expand( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, bool add_inputs); @@ -42,7 +42,7 @@ void punica_bgmv_expand_slice( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, int64_t slice_offset, int64_t slice_size, 
bool add_inputs); @@ -51,7 +51,7 @@ void punica_sgmv_expand_slice( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, int64_t slice_offset, int64_t slice_size, @@ -62,14 +62,14 @@ using punica_bgmv_shrink_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, const double scale); using punica_sgmv_shrink_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, const double scale); @@ -77,14 +77,14 @@ using punica_bgmv_expand_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, bool add_inputs); using punica_sgmv_expand_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, bool add_inputs); @@ -92,7 +92,7 @@ using punica_bgmv_expand_slice_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, int64_t slice_offset, int64_t slice_size, bool add_inputs); @@ -101,7 +101,7 @@ using punica_sgmv_expand_slice_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, int64_t slice_offset, int64_t slice_size, diff --git a/csrc/cpu/aten/RotaryPositionEmbedding.cpp b/csrc/cpu/aten/RotaryPositionEmbedding.cpp index 1046dd668..48f5cd018 100644 --- a/csrc/cpu/aten/RotaryPositionEmbedding.cpp +++ b/csrc/cpu/aten/RotaryPositionEmbedding.cpp @@ -1,5 +1,5 @@ -// The orginal python code can be found in +// The original python code can be found in // https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py // apply_rotary_pos_emb #include "RotaryPositionEmbedding.h" diff --git a/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp 
b/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp index 20b673c17..06bafe61b 100644 --- a/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp +++ b/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp @@ -26,23 +26,23 @@ inline int64_t _calc_element_offset( inline std::vector _adjust_strides( const at::Tensor& src, - std::vector& infered_size) { + std::vector& inferred_size) { // We does NOT support broadcasting last dim which mean last_dim = 1 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.stride(src.ndimension() - 1) == 1); auto original_shape = src.sizes(); auto original_stride = src.strides(); - auto offset = infered_size.size() - original_shape.size(); + auto offset = inferred_size.size() - original_shape.size(); std::vector adjusted_stride; if (offset > 0) - adjusted_stride.resize(infered_size.size(), 0); + adjusted_stride.resize(inferred_size.size(), 0); else - adjusted_stride.resize(infered_size.size()); + adjusted_stride.resize(inferred_size.size()); for (size_t i = 0; i < original_shape.size(); i++) { // see NOTE: [Computing output strides] - if (original_shape[i] == 1 && infered_size[offset + i] != 1) { + if (original_shape[i] == 1 && inferred_size[offset + i] != 1) { adjusted_stride[offset + i] = 0; } else { adjusted_stride[offset + i] = original_stride[i]; @@ -54,7 +54,7 @@ inline std::vector _adjust_strides( /** * @brief Fuse the div (div scalar or mul 1/scalar) add operator and softmax - * operator. softmax(alpah * a + b) + * operator. softmax(alpha * a + b) * * @attention * There are some assumptions for this operator. @@ -64,7 +64,7 @@ inline std::vector _adjust_strides( * - The input tensors are contiguous * - The number of the input tensor dimension should be >=2 * - Only the second input tensor is brodcastable - * - The datatype for inpusts(a,b) and output are same. + * - The datatype for inputs(a,b) and output are same. 
* * @param[in] a a contiguous tensor to be added * @param[in] b a tensor to be added while it should be broadcastable @@ -79,30 +79,30 @@ at::Tensor dil_div_add_softmax( scalar_t* b_data_base = b.data_ptr(); // Check if the tensor needs to be broadcasted - auto infered_size = a.sizes().vec(); - auto need_broadcast = (infered_size != b.sizes()); + auto inferred_size = a.sizes().vec(); + auto need_broadcast = (inferred_size != b.sizes()); if (need_broadcast) { - infered_size = at::infer_size(a.sizes(), b.sizes()); + inferred_size = at::infer_size(a.sizes(), b.sizes()); } at::Tensor output = at::empty_like(a); // Create an new tensor to store the output scalar_t* output_data_base = output.data_ptr(); // Calculate the strides for the input tensor - std::vector b_adjusted_strides = _adjust_strides(b, infered_size); + std::vector b_adjusted_strides = _adjust_strides(b, inferred_size); std::vector outer_size_per_dim; - int64_t dim_size = infered_size[infered_size.size() - 1]; + int64_t dim_size = inferred_size[inferred_size.size() - 1]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim_size != 1); int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. 
+ for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Record outer dimensions outer_size_per_dim.insert(outer_size_per_dim.begin(), outer_size); // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); @@ -138,7 +138,7 @@ at::Tensor dil_div_add_softmax( // val = sum(output_data) _dil_exp_reduce_sum_fusion_kernel( tmp_out_ptr, dim_size, tmp_out_ptr, val); - // Calculat the normalization [e^x / sum(e^x)]: + // Calculate the normalization [e^x / sum(e^x)]: // output_data = output_data / sum(output_data) _dil_normalization_kernel( tmp_out_ptr, val, dim_size, output_data_base + i * dim_size); @@ -170,27 +170,27 @@ at::Tensor& dil_add_softmax_(at::Tensor& a, const at::Tensor& b) { float* b_data_base = b.data_ptr(); // Check if the tensor needs to be broadcasted - auto infered_size = a.sizes().vec(); - auto need_broadcast = (infered_size != b.sizes()); + auto inferred_size = a.sizes().vec(); + auto need_broadcast = (inferred_size != b.sizes()); if (need_broadcast) { - infered_size = at::infer_size(a.sizes(), b.sizes()); + inferred_size = at::infer_size(a.sizes(), b.sizes()); } // Calculate the strides for the input tensor - std::vector b_adjusted_strides = _adjust_strides(b, infered_size); + std::vector b_adjusted_strides = _adjust_strides(b, inferred_size); std::vector outer_size_per_dim; - int64_t dim_size = infered_size[infered_size.size() - 1]; + int64_t dim_size = inferred_size[inferred_size.size() - 1]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim_size != 1); int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. 
+ for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Record outer dimensions outer_size_per_dim.insert(outer_size_per_dim.begin(), outer_size); // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); @@ -227,7 +227,7 @@ at::Tensor& dil_add_softmax_(at::Tensor& a, const at::Tensor& b) { dim_size, a_data_base + i * dim_size, val); - // Calculat the normalization [e^x / sum(e^x)]: + // Calculate the normalization [e^x / sum(e^x)]: // output_data = output_data / sum(output_data) _dil_normalization_kernel( diff --git a/csrc/cpu/aten/kernels/AddSwishKrnl.cpp b/csrc/cpu/aten/kernels/AddSwishKrnl.cpp index 4e0984793..36832b2c0 100644 --- a/csrc/cpu/aten/kernels/AddSwishKrnl.cpp +++ b/csrc/cpu/aten/kernels/AddSwishKrnl.cpp @@ -14,14 +14,14 @@ at::Tensor dil_add_swish(const at::Tensor& mm_output, const at::Tensor& bias) { scalar_t* mm_output_data_base = mm_output.data_ptr(); scalar_t* bias_data_base = bias.data_ptr(); - auto infered_size = mm_output.sizes().vec(); - int64_t dim_size = infered_size[infered_size.size() - 1]; + auto inferred_size = mm_output.sizes().vec(); + int64_t dim_size = inferred_size[inferred_size.size() - 1]; int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. 
+ for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); diff --git a/csrc/cpu/aten/kernels/CatKrnl.cpp b/csrc/cpu/aten/kernels/CatKrnl.cpp index 7c3a6f596..4edeccfdd 100644 --- a/csrc/cpu/aten/kernels/CatKrnl.cpp +++ b/csrc/cpu/aten/kernels/CatKrnl.cpp @@ -69,7 +69,7 @@ void cat_contig_firstdim_impl( // short input tensor list: parallel on dim_size (dim_size == ninputs * // input_dim_size). // - // note that prallel on ninputs may not have enough parallelism (e.g. + // note that parallel on ninputs may not have enough parallelism (e.g. // inputs == 2), also parallel on input_dim_size would trigger multiple // omp sessions, which has additional overhead. // @@ -340,14 +340,14 @@ void cpu_cat_contig_dispatch( int64_t dim_size = result.sizes()[dim]; int64_t outer_size = result.numel() / (dim_size * inner_size); - // Note on cat implementation choosen: + // Note on cat implementation chosen: // // In order to minimize overhead of meta info creation, pass down // `all_same_sizes_and_stride` to the kernel. `True` indicates all the input // tensors all have the same shape and stride. // // All kernels have a single omp loop (the non-contiguous path may have - // mutiple omp loops). All kernels trim grain_size in the parallel loop w.r.t. + // multiple omp loops). All kernels trim grain_size in the parallel loop w.r.t. // `at::internal::GRAIN_SIZE`. // // 1. 
`cat_contig_firstdim_impl`: used when outer_size == 1 (dim is the first diff --git a/csrc/cpu/aten/kernels/DecodeKrnl.cpp b/csrc/cpu/aten/kernels/DecodeKrnl.cpp index b7633174e..5ebf17de2 100644 --- a/csrc/cpu/aten/kernels/DecodeKrnl.cpp +++ b/csrc/cpu/aten/kernels/DecodeKrnl.cpp @@ -82,7 +82,7 @@ inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) { #define CHECK_LAST_DIM_CONTIGUOUS(x) \ TORCH_CHECK( \ x.strides()[x.strides().size() - 1] == 1, \ - #x "must be contiguous at last dimention") + #x "must be contiguous at last dimension") #define CHECK_INPUT(x) \ CHECK_CPU(x); \ @@ -1409,7 +1409,7 @@ void decode_attention_kernel_impl( s_prime += at::vec::reduce_all( [](Vec& x, Vec& y) { return x + y; }, s_delta, n_size); m_prime = m_i; - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta index_gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_id * stride_kv1, @@ -1560,7 +1560,7 @@ void decode_attention_opt_kernel_impl( s_prime += at::vec::reduce_all( [](Vec& x, Vec& y) { return x + y; }, s_delta, n_size); m_prime = m_i; - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_id * stride_kv1 + n * stride_kv0, @@ -1736,7 +1736,7 @@ void decode_attention_grouped_kernel_impl( n_size); m_prime[h] = m_i; } - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta index_gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_kv_id * stride_kv1, @@ -1914,7 +1914,7 @@ void decode_attention_grouped_opt_kernel_impl( n_size); m_prime[h] = m_i; } - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_kv_id * stride_kv2 + n * stride_kv0 + diff --git a/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp b/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp index 6cbd2066c..4eb73af2c 100644 --- 
a/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp +++ b/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp @@ -10,7 +10,7 @@ namespace { #if defined(CPU_CAPABILITY_AVX512) using namespace torch_ipex::cpu::kernel; /** - * @brief This function is caculating the loop unit offset for current loop idx + * @brief This function is calculating the loop unit offset for current loop idx * element, and the loop is for reading a tensor with its last dim as the loop * unit * @param[in] outer_loop_idx the loop idx @@ -37,7 +37,7 @@ inline int64_t _calc_element_offset( /** * @brief This function is adjusting the strides of src tensor based on the - * target infered_size, and make sure the strides can be used for element + * target inferred_size, and make sure the strides can be used for element * offset calculation of broadcastable reading. * For example: * [56, 1, 128 ,128] broadcasting reading for [56, 12, 128, 128] @@ -46,33 +46,33 @@ inline int64_t _calc_element_offset( * input: * src tensor size [56, 1, 128 ,128] * src tensor strides [16384, 16384, 128 ,1] - * infered_size (target tensor size) [56, 12, 128, 128] + * inferred_size (target tensor size) [56, 12, 128, 128] * output: * [16384, 0, 128, 1] * @param[in] src original tensor that needs to be adjusted - * @param[in] infered_size the target size to be broadcasted + * @param[in] inferred_size the target size to be broadcasted * @return adjusted strides * @endcode */ inline std::vector _adjust_strides( const at::Tensor& src, - std::vector& infered_size) { + std::vector& inferred_size) { // We does NOT support broadcasting last dim which mean last_dim = 1 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.stride(src.ndimension() - 1) == 1); auto original_shape = src.sizes(); auto original_stride = src.strides(); - auto offset = infered_size.size() - original_shape.size(); + auto offset = inferred_size.size() - original_shape.size(); std::vector adjusted_stride; if (offset > 0) - adjusted_stride.resize(infered_size.size(), 0); + 
adjusted_stride.resize(inferred_size.size(), 0); else - adjusted_stride.resize(infered_size.size()); + adjusted_stride.resize(inferred_size.size()); for (size_t i = 0; i < original_shape.size(); i++) { // see NOTE: [Computing output strides] - if (original_shape[i] == 1 && infered_size[offset + i] != 1) { + if (original_shape[i] == 1 && inferred_size[offset + i] != 1) { adjusted_stride[offset + i] = 0; } else { adjusted_stride[offset + i] = original_stride[i]; @@ -93,7 +93,7 @@ inline std::vector _adjust_strides( * - The number of the input tensor dimension should be >=2 * - The mask b has the same dimension as a, or it can be expand_as a with (bs * :: seq_length), i.e., 2D tensor expands from mid dims - * - The datatype for inpust a and output are same. + * - The datatype for input a and output are same. * * @param[in] a a contiguous tensor to do div and softmax * @param[in] b a mask tensor to be masked_fill into tensor a after div and @@ -110,36 +110,36 @@ at::Tensor dil_div_maskfill_softmax( scalar_t* a_data_base = a.data_ptr(); float* b_data_base = b.data_ptr(); - auto infered_size = a.sizes().vec(); + auto inferred_size = a.sizes().vec(); - auto need_broadcast = a.dim() == b.dim() && (infered_size != b.sizes()); + auto need_broadcast = a.dim() == b.dim() && (inferred_size != b.sizes()); auto need_expand_from_2d = b.dim() == 2; if (need_broadcast) { - infered_size = at::infer_size(a.sizes(), b.sizes()); + inferred_size = at::infer_size(a.sizes(), b.sizes()); } // Calculate the strides for the input tensor - std::vector b_adjusted_strides = _adjust_strides(b, infered_size); + std::vector b_adjusted_strides = _adjust_strides(b, inferred_size); // Create an new tensor to store the output at::Tensor output = at::empty_like(a); scalar_t* output_data_base = output.data_ptr(); std::vector outer_size_per_dim; - int64_t dim_size = infered_size[infered_size.size() - 1]; + int64_t dim_size = inferred_size[inferred_size.size() - 1]; 
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim_size != 1); int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. + for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Record outer dimensions outer_size_per_dim.insert(outer_size_per_dim.begin(), outer_size); // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } - auto mask_offset = outer_size / infered_size[0]; + auto mask_offset = outer_size / inferred_size[0]; int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); if (grain_size < 1) @@ -181,7 +181,7 @@ at::Tensor dil_div_maskfill_softmax( _dil_exp_reduce_sum_fusion_kernel( tmp_out_ptr, dim_size, tmp_out_ptr, val); - // Calculat the normalization [e^x / sum(e^x)]: + // Calculate the normalization [e^x / sum(e^x)]: // output_data = output_data / sum(output_data) _dil_normalization_kernel( diff --git a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp index 7cc0c4ec2..309ead6e6 100644 --- a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp @@ -390,7 +390,7 @@ inline void reshape_attn_mask_to_4d( } /* - *Caculate the flash attention SDPA. + *Calculate the flash attention SDPA. *@template scalar_t: q/k/v data type *@template q_split_size: q block size *@template kv_split_size: kv block size @@ -1411,7 +1411,7 @@ inline bool use_vnni( int64_t thresh_size = (dtype == at::kBFloat16) ? 64 : 16; bool need_pack = kvSize >= thresh_size && qSize >= thresh_size; // When the number of gemm is greater than the number of pack, - // the pack overhead can be overlaped. + // the pack overhead can be overlapped. 
if (need_pack) { double pack_size = batchSize * num_head * kvSize * headSize; double qs_per_thread = diff --git a/csrc/cpu/aten/kernels/GroupNormKrnl.cpp b/csrc/cpu/aten/kernels/GroupNormKrnl.cpp index a3b84e9be..e34856593 100644 --- a/csrc/cpu/aten/kernels/GroupNormKrnl.cpp +++ b/csrc/cpu/aten/kernels/GroupNormKrnl.cpp @@ -325,7 +325,7 @@ void GroupNormKernelImplChannelsLastInternal( const bool gamma_null = (gamma_data == nullptr); const bool beta_null = beta_data == nullptr; - // NB: About algorithm choosen: + // NB: About algorithm chosen: // // On channels last, GroupNorm has a input shape of {N, H, W, GD}, // Mean and rstd are collected per each n and g, which involves reduction @@ -462,7 +462,7 @@ void GroupNormKernelImplChannelsLastInternal( // // We could fuse step 3 and 4 into a single session but this way is better: // a. D might be too small for vectorization; - // b. Avoid duplicate caculation of scale/bias, each HxW plain share the + // b. Avoid duplicate calculation of scale/bias, each HxW plain share the // same scale/bias // for (const auto n : c10::irange(N)) { diff --git a/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp b/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp index 130b6b6e2..5a3df1947 100644 --- a/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp +++ b/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp @@ -156,7 +156,7 @@ static void index_select_gather_impl( outer_size, grain_size / (index_size * inner_size), [&](int64_t begin, int64_t end) { - // create the offset stencil for each row in outer dimenson, + // create the offset stencil for each row in outer dimension, // shared across {outer_size} std::unique_ptr index_buffer( new integer_t[index_size * inner_size]); @@ -213,7 +213,7 @@ void cpu_index_select_dispatch( check_indexarray_range(index_data, index_size, dim_size); - // Note on index_select implementation choosen: + // Note on index_select implementation chosen: // // 1. `index_select_gather_impl`: used when inner_size == 1 or 2. 
// inner_size == 1 indicates a gather across {index_size}, here diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index 4197e18e3..31da471e0 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -1118,7 +1118,7 @@ inline void copy_key_value( } /* - *The scale-dot product for indirect access kv chache and fuse + *The scale-dot product for indirect access kv cache and fuse *matmul+div+add+softmax to improve data reuse *@param query Query embeeding with the of [beam_size*batch, cur_len, head_num, *head_size] @@ -1128,7 +1128,7 @@ inline void copy_key_value( *head_size] *@param key_cache Cache past key embeeding with the of [max_len, *beam_size*batch, head_num, head_size] - *@param value_chache Cache past value embeeding with the of [max_len, + *@param value_cache Cache past value embeeding with the of [max_len, *beam_size*batch, head_num, head_size] *@param beam_idx Beam info for every token [max_len, beam_size*batch] *@param offset The length of decoded(past) token. 
@@ -1273,13 +1273,13 @@ scale_dot_product_for_indirect_access_kv_cache( auto k_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - k_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = head_group_start / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + head_group_start) * attn_w_strideH; for (auto ti = k_start; ti < k_start + block_size; ti++) { - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { for (auto bbi = 0; bbi < beam_size; bbi++) { @@ -1301,7 +1301,7 @@ scale_dot_product_for_indirect_access_kv_cache( true, kc_head_start); } - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto bi = bsi * beam_size; auto q_ptr_start = q_ptr + bi * qStrideB + head_group_start * qStrideH; @@ -1351,7 +1351,7 @@ scale_dot_product_for_indirect_access_kv_cache( auto beam = need_update_beam_idx && ti >= prompt_len ? 
new_beam_idx[bi][ti] : bsi * beam_size; - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { auto kc_head_start = k_cache_ptr + ti * kcStrideS + @@ -1366,7 +1366,7 @@ scale_dot_product_for_indirect_access_kv_cache( head_size, true, kc_head_start); - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto kc_head_start = k_cache_ptr + ti * kcStrideS + beam * kcStrideB + kv_hi * kcStrideH; reduce_head( @@ -1555,7 +1555,7 @@ scale_dot_product_for_indirect_access_kv_cache( auto v_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - v_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = hi / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + hi) * attn_w_strideH; @@ -1588,7 +1588,7 @@ scale_dot_product_for_indirect_access_kv_cache( flag_access_start); } } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token if (need_update_beam_idx && vi >= prompt_len) { for (auto bbi = 0; bbi < beam_size; bbi++) { auto bi = bsi * beam_size + bbi; @@ -1666,7 +1666,7 @@ scale_dot_product_for_indirect_access_kv_cache( auto beam = need_update_beam_idx && vi >= prompt_len ? 
new_beam_idx[bi][vi] : bsi * beam_size; - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && vi == offset) { auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + @@ -1684,7 +1684,7 @@ scale_dot_product_for_indirect_access_kv_cache( v_cache_head_start, flag_access_start); } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + beam * vcStrideB + kv_hi * vcStrideH; mul_attenion_weights_and_value_of_head( @@ -1880,13 +1880,13 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto k_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - k_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = head_group_start / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + head_group_start) * attn_w_strideH; for (auto ti = k_start; ti < k_start + block_size; ti++) { - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { for (auto bbi = 0; bbi < beam_size; bbi++) { @@ -1908,7 +1908,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( true, kc_head_start); } - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto bi = bsi * beam_size; auto q_ptr_start = q_ptr + bi * qStrideB + head_group_start * qStrideH; @@ -1958,7 +1958,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto beam = need_update_beam_idx && ti >= prompt_len ? 
new_beam_idx[bi][ti] : bsi * beam_size; - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { auto kc_head_start = k_cache_ptr + ti * kcStrideS + @@ -1973,7 +1973,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( head_size, true, kc_head_start); - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto kc_head_start = k_cache_ptr + ti * kcStrideS + beam * kcStrideB + kv_hi * kcStrideH; reduce_head_half( @@ -2081,12 +2081,12 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto v_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - v_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = hi / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + hi) * attn_w_strideH; for (auto vi = v_start; vi < v_start + block_size; vi++) { - // caculate the attention values for the current token + // calculate the attention values for the current token if (offset > 0 && vi == offset) { for (auto bbi = 0; bbi < beam_size; bbi++) { auto bi = bsi * beam_size + bbi; @@ -2115,7 +2115,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( flag_access_start); } } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token if (need_update_beam_idx && vi >= prompt_len) { for (auto bbi = 0; bbi < beam_size; bbi++) { auto bi = bsi * beam_size + bbi; @@ -2193,7 +2193,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto beam = need_update_beam_idx && vi >= prompt_len ? 
new_beam_idx[bi][vi] : bsi * beam_size; - // caculate the attention values for the current token + // calculate the attention values for the current token if (offset > 0 && vi == offset) { auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + bi * vcStrideB + kv_hi * vcStrideH; @@ -2210,7 +2210,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( v_cache_head_start, flag_access_start); } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + beam * vcStrideB + kv_hi * vcStrideH; mul_attenion_weights_and_value_of_head_half( @@ -2684,7 +2684,7 @@ masked_multihead_self_attention_kernel_impl( } beam_idx_access[max_positions][0] = cur_len; // record the prompt token len beam_idx_access[max_positions + 1][0] = - query.size(0); // record the promt bs info + query.size(0); // record the prompt bs info } else if (offset > 0 && offset + cur_len > cache_size) { auto new_cache_size = cache_size * 2; @@ -2726,7 +2726,7 @@ masked_multihead_self_attention_kernel_impl( offset, scale_attn, attention_mask_v); - // just a funcationality path,need to optimize + // just a functionality path, need to optimize auto tokens_outs = std::vector(cur_len); for (auto i = 0; i < cur_len; i++) { auto query_i = query.select(1, i).unsqueeze(1); @@ -2934,7 +2934,7 @@ deepseekv2_mla_kernel_impl( } beam_idx_access[max_positions][0] = cur_len; // record the prompt token len beam_idx_access[max_positions + 1][0] = - query.size(0); // record the promt bs info + query.size(0); // record the prompt bs info } else if (offset > 0 && offset + cur_len > cache_size) { auto new_cache_size = cache_size * 2; diff --git a/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp b/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp index 1ee50613f..37ac26406 100644 --- a/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp +++ b/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp @@ -603,10 +603,10 @@ std::vector 
merged_embeddingbag_forward_cpu_kernel_impl( /** * Read from embedding table, and write to world_size * num_chk * num_emb's *EmbeddingRowCache world_size dimension decide which ranks should this - *particial look up result sent to num_emb dimmension devide which emb table + *partial look up result sent to num_emb dimension divide which emb table *should this particial look up result belong to num_chk dimension is hard code *to 16 here for better parallel scope, list 3 parallel choices: - *(1) Only parallel on num_emb, this limite the thread nums == num_emb + *(1) Only parallel on num_emb, this limits the thread nums == num_emb *(2) Parallel on num_emb and gbatch. Total tasks = num_emb * gbatch *(3) Parallel on num_emb and num_chk. Total tasks = num_emb * num_chk * @@ -782,7 +782,7 @@ mergedemb_distribute_forward_local_kernel_impl( indices_ptr[i] = indices[i].data_ptr(); offsets_ptr[i] = offsets[i].data_ptr(); } - // read from weight and accumuate in emb cache + // read from weight and accumulate in emb cache int64_t num_chk = 16; std::vector> cache_with_num_chk( world_size * num_chk * num_emb); @@ -807,7 +807,7 @@ mergedemb_distribute_forward_local_kernel_impl( emb_dim, world_size); // read from emb cache and write to the buffer while will be - // comunicated with other ranks + // communicated with other ranks prepare_ccl_buffer( idx, val, @@ -884,7 +884,7 @@ void mergedemb_distribute_forward_merge_kernel_impl( val_ptr[i] = val[i].data_ptr(); ofs_ptr[i] = ofs[i].data_ptr(); } - // read from weight and accumuate in emb cache + // read from weight and accumulate in emb cache mergedemb_distribute_forward_merge( world_size, num_emb, diff --git a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp index a1be9ea21..c758a975b 100644 --- a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp @@ -1532,7 +1532,7 @@ void reshape_and_cache_kernel( * seqlen_q = 5 and seqlen_k = 2, the causal 
mask is: 0 0 0 0 0 0 1 0 1 1 If the * row of the mask is all zero, the output will be zero. * - * For the chuned prefill case, the data layout is as follow: + * For the chunked prefill case, the data layout is as follows: * * Definition of context_len, query_len, and seq_len. |---------- N-1 iteration --------| @@ -1556,9 +1556,9 @@ void flash_attn_varlen_kernel( at::Tensor& query, // [num_seqs, num_heads, head_size] at::Tensor& key_cache, // [num_blocks, num_heads, block_size, head_size] at::Tensor& value_cache, //[num_blocks, num_heads, block_size, head_size] - at::Tensor& cu_seqlens_q, // [batch_size+1] // the accumulted sequence + at::Tensor& cu_seqlens_q, // [batch_size+1] // the accumulated sequence // length of query - at::Tensor& cu_seqlens_k, // [batch_size+1] // the accumulted sequence + at::Tensor& cu_seqlens_k, // [batch_size+1] // the accumulated sequence // length of key(cached) int64_t max_seqlen_q, // max sequence length of query int64_t max_seqlens_k, // max sequence length of key and value(cached, @@ -2037,7 +2037,7 @@ void reshape_and_cache_cpu_kernel_impl( TORCH_CHECK( kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e5m2" || kv_cache_dtype == "auto", - "not supported kv_cahce_dtype"); + "not supported kv_cache_dtype"); RECORD_FUNCTION( "ipex::reshape_and_cache_cpu_kernel_impl", c10::ArrayRef({})); @@ -2095,7 +2095,7 @@ void flash_attn_varlen_cpu_kernel_impl( TORCH_CHECK( kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e5m2" || kv_cache_dtype == "auto", - "not supported kv_cahce_dtype"); + "not supported kv_cache_dtype"); RECORD_FUNCTION( "ipex::flash_attn_varlen_cpu_kernel_impl", c10::ArrayRef({})); diff --git a/csrc/cpu/aten/kernels/PunicaKrnl.cpp b/csrc/cpu/aten/kernels/PunicaKrnl.cpp index 93ce07c92..20471243a 100644 --- a/csrc/cpu/aten/kernels/PunicaKrnl.cpp +++ b/csrc/cpu/aten/kernels/PunicaKrnl.cpp @@ -32,7 +32,7 @@ namespace { template void _dot( - const T1* intput, + const T1* input, const T2* weight, T1* out, int64_t len, @@ -45,7 
+45,7 @@ void _dot( int64_t vec_size = 16; // 512/32 auto qk_sum_vec = _mm512_setzero_ps(); for (hsi = 0; hsi <= len - vec_size; hsi += vec_size) { - auto q_vec = _loadu(intput + hsi); + auto q_vec = _loadu(input + hsi); auto k_vec = _loadu(weight + hsi); qk_sum_vec = _mm512_fmadd_ps(q_vec, k_vec, qk_sum_vec); } @@ -68,7 +68,7 @@ void punica_bgmv_expand_slice_kernel( out, // [bs, output_size1] output_size1 >= slice_offset + slice_size at::Tensor& input, // [bs, max_rank] at::Tensor& weights, // [num_lora, hidden_size, max_rank] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] int64_t slice_offset, int64_t slice_size, bool add_inputs) { @@ -82,14 +82,14 @@ void punica_bgmv_expand_slice_kernel( TORCH_CHECK(slice_offset >= 0) TORCH_CHECK(slice_size == hidden_size) TORCH_CHECK(output_size1 >= slice_offset + slice_size); - TORCH_CHECK(batch_size == indicies.size(0)); + TORCH_CHECK(batch_size == indices.size(0)); TORCH_CHECK(batch_size == input.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -100,7 +100,7 @@ void punica_bgmv_expand_slice_kernel( for (int64_t h = 0; h < hidden_size; h++) { int64_t input_bs = limit ? 
0 : bs; int64_t weights_offset = - indicies_ptr[bs] * max_rank * hidden_size + h * max_rank; + indices_ptr[bs] * max_rank * hidden_size + h * max_rank; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + h + slice_offset; @@ -115,7 +115,7 @@ void punica_sgmv_expand_slice_kernel( out, // [bs, output_size1] output_size1 >= slice_offset + slice_size at::Tensor& input, // [bs, max_rank] at::Tensor& weights, // [num_lora, hidden_size, max_rank] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] int64_t slice_offset, int64_t slice_size, @@ -130,14 +130,14 @@ void punica_sgmv_expand_slice_kernel( TORCH_CHECK(slice_offset >= 0) TORCH_CHECK(slice_size == hidden_size) TORCH_CHECK(output_size1 >= slice_offset + slice_size); - TORCH_CHECK(seq_lens.size(0) == indicies.size(0)); + TORCH_CHECK(seq_lens.size(0) == indices.size(0)); TORCH_CHECK(batch_size == input.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -159,7 +159,7 @@ void punica_sgmv_expand_slice_kernel( int64_t bs = offset + s; int64_t input_bs = limit ? 
0 : bs; int64_t weights_offset = - indicies_ptr[seq_id] * max_rank * hidden_size + h * max_rank; + indices_ptr[seq_id] * max_rank * hidden_size + h * max_rank; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + h + slice_offset; @@ -175,7 +175,7 @@ void punica_bgmv_shrink_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] const double scale) { int64_t num_lora = weights.size(0); int64_t max_rank = weights.size(1); @@ -186,12 +186,12 @@ void punica_bgmv_shrink_kernel( TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); TORCH_CHECK(batch_size == input.size(0)); - TORCH_CHECK(batch_size == indicies.size(0)); + TORCH_CHECK(batch_size == indices.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -200,7 +200,7 @@ void punica_bgmv_shrink_kernel( for (int64_t bs = 0; bs < batch_size; bs++) { for (int64_t r = 0; r < max_rank; r++) { int64_t weights_offset = - indicies_ptr[bs] * max_rank * hidden_size + r * hidden_size; + indices_ptr[bs] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -215,7 +215,7 @@ void punica_sgmv_shrink_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& 
weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] const double scale) { int64_t num_lora = weights.size(0); @@ -227,12 +227,12 @@ void punica_sgmv_shrink_kernel( TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); TORCH_CHECK(batch_size == input.size(0)); - TORCH_CHECK(seq_lens.size(0) == indicies.size(0)); + TORCH_CHECK(seq_lens.size(0) == indices.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -252,7 +252,7 @@ void punica_sgmv_shrink_kernel( int64_t offset = seq_id == 0 ? 0 : offsets_ptr[seq_id - 1]; int64_t bs = offset + s; int64_t weights_offset = - indicies_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; + indices_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -268,7 +268,7 @@ void punica_bgmv_expand_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] bool add_inputs) { int64_t num_lora = weights.size(0); int64_t max_rank = weights.size(1); @@ -278,13 +278,13 @@ void punica_bgmv_expand_kernel( int64_t input_size1 = input.size(1); TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); - TORCH_CHECK(batch_size == indicies.size(0)); + TORCH_CHECK(batch_size == indices.size(0)); TORCH_CHECK(batch_size == 
input.size(0) || input.size(0) == 1); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -295,7 +295,7 @@ void punica_bgmv_expand_kernel( for (int64_t r = 0; r < max_rank; r++) { int64_t input_bs = limit ? 0 : bs; int64_t weights_offset = - indicies_ptr[bs] * max_rank * hidden_size + r * hidden_size; + indices_ptr[bs] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -310,7 +310,7 @@ void punica_sgmv_expand_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] bool add_inputs) { int64_t num_lora = weights.size(0); @@ -321,13 +321,13 @@ void punica_sgmv_expand_kernel( int64_t input_size1 = input.size(1); TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); - TORCH_CHECK(seq_lens.size(0) == indicies.size(0)); + TORCH_CHECK(seq_lens.size(0) == indices.size(0)); TORCH_CHECK(batch_size == input.size(0) || input.size(0) == 1); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -349,7 +349,7 @@ void 
punica_sgmv_expand_kernel( int64_t bs = offset + s; int64_t input_bs = limit ? 0 : bs; int64_t weights_offset = - indicies_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; + indices_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -364,7 +364,7 @@ void punica_bgmv_shrink_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] const double scale) { RECORD_FUNCTION( "ipex::punica_bgmv_shrink_kernel_impl", c10::ArrayRef({})); @@ -377,12 +377,12 @@ void punica_bgmv_shrink_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_bgmv_shrink_kernel( - out, input, weights, indicies, scale); + out, input, weights, indices, scale); } else if (out.scalar_type() == at::kHalf) { - punica_bgmv_shrink_kernel(out, input, weights, indicies, scale); + punica_bgmv_shrink_kernel(out, input, weights, indices, scale); } } @@ -390,7 +390,7 @@ void punica_sgmv_shrink_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] const double scale) { RECORD_FUNCTION( @@ -404,14 +404,14 @@ void punica_sgmv_shrink_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input 
must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); - TORCH_CHECK(seq_lens.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); + TORCH_CHECK(seq_lens.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_sgmv_shrink_kernel( - out, input, weights, indicies, seq_lens, scale); + out, input, weights, indices, seq_lens, scale); } else if (out.scalar_type() == at::kHalf) { punica_sgmv_shrink_kernel( - out, input, weights, indicies, seq_lens, scale); + out, input, weights, indices, seq_lens, scale); } } @@ -420,7 +420,7 @@ void punica_bgmv_expand_kernel_impl( at::Tensor& input, // [bs, input_size1] or [1, input_size1] input_size1 >= // hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] bool add_inputs) { RECORD_FUNCTION( "ipex::punica_bgmv_expand_kernel_impl", c10::ArrayRef({})); @@ -433,13 +433,13 @@ void punica_bgmv_expand_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_bgmv_expand_kernel( - out, input, weights, indicies, add_inputs); + out, input, weights, indices, add_inputs); } else if (out.scalar_type() == at::kHalf) { punica_bgmv_expand_kernel( - out, input, weights, indicies, add_inputs); + out, input, weights, indices, add_inputs); } } @@ -448,7 +448,7 @@ void punica_sgmv_expand_kernel_impl( at::Tensor& input, // [bs, input_size1] or [1, input_size1] input_size1 >= // hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] at::Tensor& seq_lens, // [bs] bool add_inputs) { 
RECORD_FUNCTION( @@ -462,14 +462,14 @@ void punica_sgmv_expand_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); - TORCH_CHECK(seq_lens.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); + TORCH_CHECK(seq_lens.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_sgmv_expand_kernel( - out, input, weights, indicies, seq_lens, add_inputs); + out, input, weights, indices, seq_lens, add_inputs); } else if (out.scalar_type() == at::kHalf) { punica_sgmv_expand_kernel( - out, input, weights, indicies, seq_lens, add_inputs); + out, input, weights, indices, seq_lens, add_inputs); } } @@ -477,7 +477,7 @@ void punica_bgmv_expand_slice_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] int64_t slice_offset, int64_t slice_size, bool add_inputs) { @@ -493,13 +493,13 @@ void punica_bgmv_expand_slice_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_bgmv_expand_slice_kernel( - out, input, weights, indicies, slice_offset, slice_size, add_inputs); + out, input, weights, indices, slice_offset, slice_size, add_inputs); } else if (out.scalar_type() == at::kHalf) { punica_bgmv_expand_slice_kernel( - out, input, weights, indicies, slice_offset, slice_size, add_inputs); + out, input, weights, indices, slice_offset, slice_size, add_inputs); } } @@ -507,7 +507,7 
@@ void punica_sgmv_expand_slice_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] int64_t slice_offset, int64_t slice_size, @@ -524,14 +524,14 @@ void punica_sgmv_expand_slice_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); - TORCH_CHECK(seq_lens.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); + TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_sgmv_expand_slice_kernel( out, input, weights, - indicies, + indices, seq_lens, slice_offset, slice_size, @@ -541,7 +541,7 @@ void punica_sgmv_expand_slice_kernel_impl( out, input, weights, - indicies, + indices, seq_lens, slice_offset, slice_size, diff --git a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp index ad3a19686..15c961d44 100644 --- a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp +++ b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp @@ -34,7 +34,7 @@ bool is_fused_qkv(at::Tensor& t_in, int64_t hidden_size) { * @param t_pos The tensor containing the positions. t_pos should be [B][S] * where B is the batch size and S is the sequence length. In some cases, there * is only one element which the past_kv_length.In this case, position id can - * construced by past_kv_length + current_position + * constructed by past_kv_length + current_position * @param N The number of heads. * @param H The head size. * @param offset The offset value. 
For GPT-J 6B/ChatGLM, cos/sin is applied to diff --git a/csrc/cpu/isa/cpu_feature.cpp b/csrc/cpu/isa/cpu_feature.cpp index 2d0ebdb9a..e6bed25f5 100644 --- a/csrc/cpu/isa/cpu_feature.cpp +++ b/csrc/cpu/isa/cpu_feature.cpp @@ -373,7 +373,7 @@ bool CPUFeature::_do_check_and_init_amx() { } bool CPUFeature::isa_level_amx() { - // check and init in a funtion, avoid to double init. + // check and init in a function, to avoid double init. static bool b_is_support = _do_check_and_init_amx(); return b_is_support; diff --git a/csrc/cpu/isa/cpu_feature.hpp b/csrc/cpu/isa/cpu_feature.hpp index 598ad9cdf..cb414eb29 100644 --- a/csrc/cpu/isa/cpu_feature.hpp +++ b/csrc/cpu/isa/cpu_feature.hpp @@ -140,7 +140,7 @@ class CPUFeature { public: /* - isa level referance to oneDNN. + isa level reference to oneDNN. ------------------------------------------------------------------------------------ The ISAs are partially ordered: SSE41 < AVX < AVX2, diff --git a/csrc/cpu/jit/README.md b/csrc/cpu/jit/README.md index 707049b21..0242d8a33 100644 --- a/csrc/cpu/jit/README.md +++ b/csrc/cpu/jit/README.md @@ -1,3 +1,3 @@ # PyTorch JIT pass for DNNL -This folder contains experimental passes that optimize PyTorch Graph to utilize full power of DNNL (or other optimizations). It chose to use PyTorch namespace for eazy migration into main repo in the future. Abstract graph manipulation part of JIT should completely independent of other modules in extension, which means no reference to any symbols in other files of the project. +This folder contains experimental passes that optimize PyTorch Graph to utilize full power of DNNL (or other optimizations). It chose to use PyTorch namespace for easy migration into main repo in the future. Abstract graph manipulation part of JIT should be completely independent of other modules in extension, which means no reference to any symbols in other files of the project. 
diff --git a/csrc/cpu/jit/codegen/onednn/kernel.cpp b/csrc/cpu/jit/codegen/onednn/kernel.cpp index 8709d87b8..d5a74c88c 100644 --- a/csrc/cpu/jit/codegen/onednn/kernel.cpp +++ b/csrc/cpu/jit/codegen/onednn/kernel.cpp @@ -75,18 +75,18 @@ ArgSpec LlgaKernel::getQuantizedSpec(ArgSpec spec, size_t offset) const { return spec; } -std::map LlgaKernel::initializeTensorIdToOccurence() const { - std::map tensorIdToOccurence; +std::map LlgaKernel::initializeTensorIdToOccurrence() const { + std::map tensorIdToOccurrence; for (auto& lt : partition_.get_input_ports()) { auto inputId = lt.get_id(); - std::map::iterator it(tensorIdToOccurence.find(inputId)); - if (it != tensorIdToOccurence.end()) { + std::map::iterator it(tensorIdToOccurrence.find(inputId)); + if (it != tensorIdToOccurrence.end()) { it->second++; } else { - tensorIdToOccurence[inputId] = 1; + tensorIdToOccurrence[inputId] = 1; } } - return tensorIdToOccurence; + return tensorIdToOccurrence; } ArgSpecs LlgaKernel::initializeInputSpecs(const TensorArgs& inputs) { @@ -100,22 +100,22 @@ ArgSpecs LlgaKernel::initializeInputSpecs(const TensorArgs& inputs) { } }); GRAPH_DEBUG("Initializing graph input logical tensors"); - // initializeTensorIdToOccurence can also be called just once for the first + // initializeTensorIdToOccurrence can also be called just once for the first // input shape - std::map tensorIdToOccurence = - initializeTensorIdToOccurence(); + std::map tensorIdToOccurrence = + initializeTensorIdToOccurrence(); for (size_t i = 0; i < nGraphInputs_; i++) { auto spec = ArgSpec(graph_->inputs()[i]).supplementTensorInfo(inputs[i]); - int64_t occurence = tensorIdToOccurence[spec.tid()]; - inputSpecs.insert(inputSpecs.end(), occurence, spec); + int64_t occurrence = tensorIdToOccurrence[spec.tid()]; + inputSpecs.insert(inputSpecs.end(), occurrence, spec); } std::call_once(constantSpecInitializedFlag_, [&]() { for (size_t i = 0; i < nGraphInputs_; i++) { auto spec = 
ArgSpec(graph_->inputs()[i]).supplementTensorInfo(inputs[i]); - int64_t occurence = tensorIdToOccurence[spec.tid()]; + int64_t occurrence = tensorIdToOccurrence[spec.tid()]; initializedInputIds_.insert(spec.tid()); - runArgsIdx_.insert(runArgsIdx_.end(), occurence, i); + runArgsIdx_.insert(runArgsIdx_.end(), occurrence, i); } for (auto& lt : partition_.get_input_ports()) { auto inputId = lt.get_id(); @@ -291,7 +291,7 @@ void LlgaKernel::prepareAndCacheRunArgs( // Currently, only weight will use quantize_per_channel, data will // always use quantize_per_tensor. We will only allocate buffer for data // (output of a LlgaPartition). If in the future, we need allocate - // buffer for qensor that is quantized per channel, need implemeted + // buffer for qtensor that is quantized per channel, we need to implement // as_strided_qtensorimpl for PER_CHANNEL QScheme. qtensor.as_strided_(spec.sizes(), spec.strides()); outputs.push_back(qtensor); diff --git a/csrc/cpu/jit/codegen/onednn/kernel.h b/csrc/cpu/jit/codegen/onednn/kernel.h index d11496a55..56e918888 100644 --- a/csrc/cpu/jit/codegen/onednn/kernel.h +++ b/csrc/cpu/jit/codegen/onednn/kernel.h @@ -91,7 +91,7 @@ class LlgaKernel { // create qtensor for output of public format ArgSpec getQuantizedSpec(ArgSpec spec, size_t offset) const; - std::map initializeTensorIdToOccurence() const; + std::map initializeTensorIdToOccurrence() const; // PyTorch copy constants inside the subgraph instead of referencing them. // Constants inputs to the partition are no longer in the graph->inputs(). diff --git a/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp b/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp index 00a0311a7..b49e13b02 100644 --- a/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp +++ b/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp @@ -18,7 +18,7 @@ class OpSplitter { bool analyzeNode(Node* node) { // If node->kind() matches the NodeKind, the node will be a candidate to be - // splitted. 
If the input to the current node matches with InputKind, will + // split. If the input to the current node matches with InputKind, will // split the node static std::unordered_map> NodeKindToInputKind{ {aten::to, {Symbol::aten("dequantize")}}, diff --git a/csrc/cpu/jit/codegen/onednn/prepare_dequant.h b/csrc/cpu/jit/codegen/onednn/prepare_dequant.h index 490181d09..2595bb8ff 100644 --- a/csrc/cpu/jit/codegen/onednn/prepare_dequant.h +++ b/csrc/cpu/jit/codegen/onednn/prepare_dequant.h @@ -38,7 +38,7 @@ void PrepareDequantForLLGA(std::shared_ptr& graph); // PyTorch dequant node receives qtensor as input, thus no quantization-related // info (scales, zp, etc.) on the IR, while LLGA needs those info on the -// dequantize node. We add a pass to retreive the quantization info from the +// dequantize node. We add a pass to retrieve the quantization info from the // quantize node just before the dequantize node and save them on the dequantize // node. void SaveDequantInformation(std::shared_ptr& graph); diff --git a/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h b/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h index 2d2887cf6..cdef1d621 100644 --- a/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h +++ b/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h @@ -25,7 +25,7 @@ struct ContextConvTranspose final { std::vector dilation_; std::vector input_size_; int64_t groups_; - // The originin weight != weight_packed_.get_dims() since there is a tranpose + // The origin weight != weight_packed_.get_dims() since there is a transpose // for weight, We directly store origin_weight_dims_ here to avoid compute it. 
std::vector origin_weight_dims_; bool weight_is_channels_last_; diff --git a/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h b/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h index 752c6d144..253c18550 100644 --- a/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h +++ b/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h @@ -116,7 +116,7 @@ at::Tensor& run( at::Tensor& accumu, const ideep::attr_t& attr); -// Runing backward for ConvTranspose by given grad_output, input and grad_masks. +// Running backward for ConvTranspose by given grad_output, input and grad_masks. // Will using the mkldnn_weight stored in the context std::tuple run_backward( ContextConvTranspose& context, diff --git a/csrc/cpu/jit/cpu/kernels/Einsum.cpp b/csrc/cpu/jit/cpu/kernels/Einsum.cpp index 7dc06547c..685d459b1 100644 --- a/csrc/cpu/jit/cpu/kernels/Einsum.cpp +++ b/csrc/cpu/jit/cpu/kernels/Einsum.cpp @@ -178,7 +178,7 @@ static Tensor sumproduct_pair( // we now work with the following permutations / shapes. // the pipeline is permute inputs -> reshape inputs -> batch matrix mul -> // reshape(view) output -> permute output output: "lro, lo, 1-for-summed-dims, - // ro" with orgiginal shape dimensions left: "lro, lo, summed" permuted with + // ro" with original shape dimensions left: "lro, lo, summed" permuted with // lpermutation and the three flattened right: "lro, summed, ro" permuted // with rpermutation and the three flattened then the permuted output is a // view of bmm(left, right) finally, opermutation reverts the permutation to @@ -192,7 +192,7 @@ static Tensor sumproduct_pair( for (auto& d : sum_dims_) { out_size.push_back(1); (void)(d); - }; // avoid warining about not using d + }; // avoid warning about not using d for (auto& d : ro) out_size.push_back(right.size(d)); std::vector lpermutation(lro); @@ -395,7 +395,7 @@ unsigned char einsum_index_to_label(uint8_t index) { *2) unsqueeze and permute the inputs/output to have same dims. The dim order * of all inputs and output is same. 
*\param equation: The subscripts for the Einstein summation. - *more detials about equation can found: + *more details about equation can be found: *https://pytorch.org/docs/stable/generated/torch.einsum.html *\param operands: The tensors to compute the Einstein summation of. *\return tuple& graph) { // TODO: Record original aten nodes, while convert aten linear-> ipex linear, // will ignore these aten linear (if they are fp32 dtype). For BF16 dtype, - // always use ipex linear. This is a temporay solution, for next PR to clean + // always use ipex linear. This is a temporary solution, for next PR to clean // up fusion pass, will further abstract this as a class method. auto aten_linear_recorder = ATenLinearRecorder(graph); // linear folding @@ -244,7 +244,7 @@ FusionBehavior getCurrentBehavior(size_t remaining_depth) { } } // should never get here - TORCH_WARN("Stratgy changed mid-invocation, NYI"); + TORCH_WARN("Strategy changed mid-invocation, NYI"); return FusionBehavior::STATIC; } diff --git a/csrc/cpu/jit/passes/concat_linear.cpp b/csrc/cpu/jit/passes/concat_linear.cpp index e5eee46cb..9cb9cc64f 100644 --- a/csrc/cpu/jit/passes/concat_linear.cpp +++ b/csrc/cpu/jit/passes/concat_linear.cpp @@ -113,7 +113,7 @@ class ConcatLinearLayers { TORCH_CHECK( (aten_linear.find(base_node) != aten_linear.end()) == (aten_linear.find(compatible_layers[i]) != aten_linear.end()), - "one of the layer is replaced by ipex linear while one of the other layer is original aten linear, it is ambiguity to know whether we shoudl create ipex linear or aten linear for concated linear") + "one of the layer is replaced by ipex linear while one of the other layer is original aten linear, it is ambiguous whether we should create ipex linear or aten linear for concated linear") } // Create concated aten linear if (aten_linear.find(base_node) != aten_linear.end()) { diff --git a/csrc/cpu/jit/passes/frozen_conv_folding.cpp b/csrc/cpu/jit/passes/frozen_conv_folding.cpp index 
8f004e87e..076b70c92 100644 --- a/csrc/cpu/jit/passes/frozen_conv_folding.cpp +++ b/csrc/cpu/jit/passes/frozen_conv_folding.cpp @@ -55,7 +55,7 @@ bool FoldFrozenConvBatchnorm(Block* b) { auto bn_rm_ivalue = bn->namedInput("running_mean"); auto bn_rv_ivalue = bn->namedInput("running_var"); // check running_mean and running_var has value, if they are - // None(track_running_stats=False), skiping the folding path. + // None(track_running_stats=False), skipping the folding path. if (bn_rm_ivalue->type() == NoneType::get() && bn_rv_ivalue->type() == NoneType::get()) { continue; diff --git a/csrc/cpu/jit/passes/frozen_linear_folding.cpp b/csrc/cpu/jit/passes/frozen_linear_folding.cpp index 69a47c7d5..0dcaae27c 100644 --- a/csrc/cpu/jit/passes/frozen_linear_folding.cpp +++ b/csrc/cpu/jit/passes/frozen_linear_folding.cpp @@ -105,7 +105,7 @@ bool FoldFrozenLinearBatchnorm(Block* b) { auto bn_rv_ivalue = bn->namedInput("running_var"); // check running_mean and running_var has value, if they are - // None(track_running_stats=False), skiping the folding path. + // None(track_running_stats=False), skipping the folding path. 
if (bn_rm_ivalue->type() == NoneType::get() && bn_rv_ivalue->type() == NoneType::get()) { continue; diff --git a/csrc/cpu/jit/passes/graph_rewrite.cpp b/csrc/cpu/jit/passes/graph_rewrite.cpp index c279daf6e..83e1e064c 100644 --- a/csrc/cpu/jit/passes/graph_rewrite.cpp +++ b/csrc/cpu/jit/passes/graph_rewrite.cpp @@ -14,7 +14,7 @@ using namespace torch::jit; // FuseShuffle is matching the channelshuffle pattern, where: // (1) the first view is [n, c, h, w] => [n, groups, c // groups, h, w] -// (2) the tranpose is for groups => [n, c // groups, grpups, h, w] +// (2) the transpose is for groups => [n, c // groups, groups, h, w] // (3) the output view shape should be the same as the input tensor shape void FuseShuffle(std::shared_ptr& graph) { // below is channelshuffle for staic view shape pattern @@ -98,7 +98,7 @@ void FuseShuffle(std::shared_ptr& graph) { trans_dim0_val < trans_dim1_val ? trans_dim0_val : trans_dim1_val; auto dim1_val = trans_dim0_val > trans_dim1_val ? trans_dim0_val : trans_dim1_val; - // If the tranpose if not for groups. ex. [n, c1, c2, h, w] => [n, c2, + // If the transpose is not for groups. ex. 
[n, c1, c2, h, w] => [n, c2, // c1, h, w] if ((dim1_val - dim0_val) != 1) { return false; @@ -140,7 +140,7 @@ void FuseShuffle(std::shared_ptr& graph) { for (int i = 0; i < flattern_shape_list.size(); i++) { if (flattern_shape_list[i] != inputTensor.sizes()[i].value()) { - // [n, c, h, w] => view [n, groups, c // groups, h, w] => tranpose + // [n, c, h, w] => view [n, groups, c // groups, h, w] => transpose // [n, c // groups, groups, h, w] // => view [n, -1, h, w] // or @@ -991,7 +991,7 @@ void replaceAddWithQAdd(std::shared_ptr& graph) { %qout = aten::quantize_per_tensor(%r, %o_scale, %o_zp, %o_dtype) return (%qout) )"; - // fliter the unsupported case + // filter the unsupported case auto fusion_filter = [](const Match& match, const std::unordered_map& vmap) { auto alpha = match.values_map.at(vmap.at("alpha")); @@ -1023,7 +1023,7 @@ void fuseBmmAdd(std::shared_ptr& graph) { graph(%input, %batch1, %batch2, %alpha): %res = ipex::bmm_add(%input, %batch1, %batch2, %alpha) return (%res))"; - // fliter the unsupported case + // filter the unsupported case auto fusion_filter = [](const Match& match, const std::unordered_map& vmap) { const auto& match_vmap = match.values_map; @@ -1108,7 +1108,7 @@ void FuseConcatBnRelu(std::shared_ptr& graph) { }; // Check if the dimension of the first tensor is either 4 or 5. // Check if the data type, the size of Channels, and the memory format are - // float, mutiples of 16, and ChannelsLast(3d), respectively. + // float, multiples of 16, and ChannelsLast(3d), respectively. if (!(tensor1->dim().value() == 4 || tensor1->dim().value() == 5) || !check_type_channelsize(*tensor1)) { return false; @@ -1298,7 +1298,7 @@ void FusePythonGELUWithAten(std::shared_ptr& graph) { SingleGeluTanh_v2.runOnGraph(graph, filter_v2); } -// This path will be removed after pytorch offical path is optimized well. +// This path will be removed after pytorch official path is optimized well. 
void replaceAtenMaxPool2dWithIpexMaxPool2d(std::shared_ptr& graph) { std::string max_pool2d = R"( graph(%a, %kernel_size:int[], %stride:int[], %padding:int[], %dilation:int[], %ceil_mode:bool): diff --git a/csrc/cpu/jit/passes/graph_rewrite.h b/csrc/cpu/jit/passes/graph_rewrite.h index 6ee12fd10..c352f9b32 100644 --- a/csrc/cpu/jit/passes/graph_rewrite.h +++ b/csrc/cpu/jit/passes/graph_rewrite.h @@ -15,7 +15,7 @@ void FuseShuffle(std::shared_ptr& graph); void PostScalarDivOrMul(std::shared_ptr& graph); void FuseMHAScoreCalc(std::shared_ptr& graph); void FuseLinearSwishCustomized(std::shared_ptr& graph); -// This path will be removed after pytorch offical path is optimized well. +// This path will be removed after pytorch official path is optimized well. void replaceAtenMaxPool2dWithIpexMaxPool2d( std::shared_ptr& graph); void fuseBmmAdd(std::shared_ptr& graph); diff --git a/csrc/cpu/jit/passes/graph_rewrite_helper.cpp b/csrc/cpu/jit/passes/graph_rewrite_helper.cpp index 8b7e34819..2ec8af99e 100644 --- a/csrc/cpu/jit/passes/graph_rewrite_helper.cpp +++ b/csrc/cpu/jit/passes/graph_rewrite_helper.cpp @@ -10,7 +10,7 @@ namespace graph_rewrite_helper { using namespace torch::jit; -// those code just copy from PyTorch offical and extend +// those code just copy from PyTorch official and extend // replaceConvolutionWithAtenConv to handle conv_transpose3d. Value* getValue( diff --git a/csrc/cpu/jit/passes/graph_rewrite_helper.h b/csrc/cpu/jit/passes/graph_rewrite_helper.h index 96763393d..ee0acdda5 100644 --- a/csrc/cpu/jit/passes/graph_rewrite_helper.h +++ b/csrc/cpu/jit/passes/graph_rewrite_helper.h @@ -10,7 +10,7 @@ namespace torch_ipex { namespace jit { namespace graph_rewrite_helper { -// those code just copy from PyTorch offical and extend +// those code just copy from PyTorch official and extend // replaceConvolutionWithAtenConv to handle conv_transpose3d. 
torch::jit::Value* getValue( diff --git a/csrc/cpu/runtime/CPUPool.cpp b/csrc/cpu/runtime/CPUPool.cpp index b9b3fc6b7..a72738035 100644 --- a/csrc/cpu/runtime/CPUPool.cpp +++ b/csrc/cpu/runtime/CPUPool.cpp @@ -100,7 +100,7 @@ std::vector init_process_available_cores() { if (is_runtime_ext_enabled()) { // When IOMP preloaded. // Step1: Get the main thread affinity information: - // 2 knowning external command may change it during process starts up: + // 2 known external commands may change it during process starts up: // * External Numactl. // * Preload IOMP with KMP_AFFINITY settings. // We need to save this information firstly and restore it later. @@ -299,7 +299,7 @@ void set_mask_affinity_from_cpu_pool(const CPUPool& cpu_pool) { omp_set_num_threads(threads_mask.size()); #pragma omp parallel num_threads(threads_mask.size()) { - // we will destory the mask inside the CPUPool deconstructor + // we will destroy the mask inside the CPUPool destructor int thread_id = omp_get_thread_num(); kmp_affinity_mask_t mask = threads_mask[thread_id]; kmp_set_affinity_ext(&mask); diff --git a/csrc/cpu/runtime/CPUPool.h b/csrc/cpu/runtime/CPUPool.h index ad501a27e..9c14d3ba4 100644 --- a/csrc/cpu/runtime/CPUPool.h +++ b/csrc/cpu/runtime/CPUPool.h @@ -43,9 +43,9 @@ class IPEX_API CPUPool { // Put deleted function into private. CPUPool() = delete; CPUPool(const CPUPool& source_cpu_pool) = - delete; // avoid potential risk of double destory masks. + delete; // avoid potential risk of double destroy masks. CPUPool& operator=(const CPUPool& source_cpu_pool) = - delete; // avoid potential risk of double destory masks. + delete; // avoid potential risk of double destroy masks. CPUPool& operator=(CPUPool&& source_cpu_pool) = delete; }; diff --git a/csrc/cpu/runtime/Task.h b/csrc/cpu/runtime/Task.h index 10af352e5..230290384 100644 --- a/csrc/cpu/runtime/Task.h +++ b/csrc/cpu/runtime/Task.h @@ -63,7 +63,7 @@ auto Task::operator()(Args&&...
args) // set the thread local status, such as the grad mode before execuating // the status at::GradMode::set_enabled(grad_mode); - // execuate the task + // execute the task (*task)(); }); } diff --git a/csrc/cpu/runtime/TaskExecutor.h b/csrc/cpu/runtime/TaskExecutor.h index 3ec0078b7..468177846 100644 --- a/csrc/cpu/runtime/TaskExecutor.h +++ b/csrc/cpu/runtime/TaskExecutor.h @@ -43,13 +43,13 @@ class IPEX_API TaskExecutor { // Put the deleted function in the private. TaskExecutor(const TaskExecutor& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. TaskExecutor(TaskExecutor&& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. TaskExecutor& operator=(const TaskExecutor& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. TaskExecutor& operator=(TaskExecutor&& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. 
}; } // namespace runtime diff --git a/csrc/cpu/tpp/ext_tpp.h b/csrc/cpu/tpp/ext_tpp.h index 958584363..1172bea29 100644 --- a/csrc/cpu/tpp/ext_tpp.h +++ b/csrc/cpu/tpp/ext_tpp.h @@ -33,7 +33,7 @@ class BrgemmExtTPP { auto dt_out = XsmmDtype(); if (dt_out == LIBXSMM_DATATYPE_F32 && c_trans == XformTPP::XFORM_N2V_TPP) { printf( - "Warning: reseting c_trans flag from N2V to None for FP32 output\n"); + "Warning: resetting c_trans flag from N2V to None for FP32 output\n"); c_trans = XformTPP::XFORM_NONE_TPP; } auto beta_ = beta; diff --git a/csrc/cpu/tpp/init.cpp b/csrc/cpu/tpp/init.cpp index d33b2df31..85d1ca7e0 100644 --- a/csrc/cpu/tpp/init.cpp +++ b/csrc/cpu/tpp/init.cpp @@ -72,7 +72,7 @@ void init_libxsmm() { auto max_threads = omp_get_max_threads(); PCL_ASSERT( max_threads <= MAX_THREADS, - "Maximun %d threads supported, %d threads being used, please compile with increased MAX_THREADS value\n", + "Maximum %d threads supported, %d threads being used, please compile with increased MAX_THREADS value\n", MAX_THREADS, max_threads); libxsmm_init(); diff --git a/csrc/cpu/tpp/par_loop_generator.cpp b/csrc/cpu/tpp/par_loop_generator.cpp index 52e4f0fcf..1a44f1733 100644 --- a/csrc/cpu/tpp/par_loop_generator.cpp +++ b/csrc/cpu/tpp/par_loop_generator.cpp @@ -52,7 +52,7 @@ typedef struct { int n_loops; loop_param_t* loop_params; int n_logical_loops; - char occurence_map[256]; + char occurrence_map[256]; int jit_loop_spec; int use_2d_par; int n_row_teams; @@ -350,7 +350,7 @@ void emit_loop_body(loop_code* i_code, char* body_func_name) { sizeof(tmp_buf), "%c%d", 'a' + i, - i_code->occurence_map['a' + i] - 1); + i_code->occurrence_map['a' + i] - 1); align_line(i_code); snprintf(tmp_buf, sizeof(tmp_buf), "idx[%d] = %s;\n", i, str_idx); add_buf_to_code(i_code, tmp_buf); @@ -607,7 +607,7 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { char term_func_name[64] = "term_func"; char spec_func_name[64] = "loop_rt_spec"; char loop_map[256]; - char 
occurence_map[256]; + char occurrence_map[256]; loop_code l_code; char* result_code; loop_param_t loop_params[256], cur_loop, loop_params_map[256]; @@ -699,12 +699,12 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { } /* Set up loop properties */ - std::fill_n(occurence_map, 256, 0); + std::fill_n(occurrence_map, 256, 0); for (i = 0; i < n_loops; i++) { int is_blocked = (loop_map[tolower(loop_nest_desc[i])] > 1) ? 1 : 0; int is_parallelizable = (tolower(loop_nest_desc[i]) != loop_nest_desc[i]) ? 1 : 0; - int occurence_id, is_blocked_outer; + int occurrence_id, is_blocked_outer; char idx_name[16]; char spec_array_name[512]; char start_var_name[512]; @@ -712,9 +712,9 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { char step_var_name[512]; int loop_abs_index = tolower(loop_nest_desc[i]) - 'a'; - occurence_id = occurence_map[tolower(loop_nest_desc[i])]; - is_blocked_outer = (occurence_id == 0) ? 1 : 0; - occurence_map[tolower(loop_nest_desc[i])]++; + occurrence_id = occurrence_map[tolower(loop_nest_desc[i])]; + is_blocked_outer = (occurrence_id == 0) ? 
1 : 0; + occurrence_map[tolower(loop_nest_desc[i])]++; snprintf(spec_array_name, sizeof(spec_array_name), "%s", spec_func_name); @@ -723,9 +723,9 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { sizeof(idx_name), "%c%d", tolower(loop_nest_desc[i]), - occurence_id); + occurrence_id); - if (occurence_id == 0) { + if (occurrence_id == 0) { if (loop_params_map[loop_abs_index].jit_start > 0) { snprintf( start_var_name, @@ -746,10 +746,10 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { sizeof(start_var_name), "%c%d", tolower(loop_nest_desc[i]), - occurence_id - 1); + occurrence_id - 1); } - if (occurence_id == 0) { + if (occurrence_id == 0) { if (loop_params_map[loop_abs_index].jit_end > 0) { snprintf( end_var_name, @@ -771,23 +771,23 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { sizeof(end_var_name), "%c%d + %d", tolower(loop_nest_desc[i]), - occurence_id - 1, - loop_params_map[loop_abs_index].block_size[occurence_id - 1]); + occurrence_id - 1, + loop_params_map[loop_abs_index].block_size[occurrence_id - 1]); } else { snprintf( end_var_name, sizeof(end_var_name), "%c%d + %s[%d].block_size[%d]", tolower(loop_nest_desc[i]), - occurence_id - 1, + occurrence_id - 1, spec_array_name, loop_abs_index, - occurence_id - 1); + occurrence_id - 1); } } if (is_blocked) { - if (occurence_id == loop_map[tolower(loop_nest_desc[i])] - 1) { + if (occurrence_id == loop_map[tolower(loop_nest_desc[i])] - 1) { if (loop_params_map[loop_abs_index].jit_step > 0) { snprintf( step_var_name, @@ -808,7 +808,7 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { step_var_name, sizeof(step_var_name), "%d", - loop_params_map[loop_abs_index].block_size[occurence_id]); + loop_params_map[loop_abs_index].block_size[occurrence_id]); } else { snprintf( step_var_name, @@ -816,7 +816,7 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { "%s[%d].block_size[%d]", spec_array_name, loop_abs_index, - 
occurence_id); + occurrence_id); } } } else { @@ -851,13 +851,13 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { /* Setup number of logical loops and the ocurence map */ n_logical_loops = 0; for (i = 0; i < 256; i++) { - if (occurence_map[i] > 0) { + if (occurrence_map[i] > 0) { n_logical_loops++; } } l_code.n_logical_loops = n_logical_loops; - memcpy(&l_code.occurence_map[0], occurence_map, 256); + memcpy(&l_code.occurrence_map[0], occurrence_map, 256); /* Emit function signature */ emit_func_signature( diff --git a/csrc/cpu/tpp/threaded_loops.h b/csrc/cpu/tpp/threaded_loops.h index 1c8b6830e..b8c7708e9 100644 --- a/csrc/cpu/tpp/threaded_loops.h +++ b/csrc/cpu/tpp/threaded_loops.h @@ -260,7 +260,7 @@ class ThreadedLoop { error: array initializer must be an initializer list So, now this->bounds is initialized by copy elements one by one This change leads to another problem: bounds is an array of LoopSpecs, - but LoopSpecs does not have a default consturctor. So, we added a + but LoopSpecs does not have a default constructor. So, we added a default constructor for LoopSpecs. */ ThreadedLoop(const LoopSpecs (&bounds)[N], std::string scheme = "") diff --git a/csrc/cpu/tpp/woq/dispatcher.h b/csrc/cpu/tpp/woq/dispatcher.h index 9cbdcf492..1cb22d23b 100644 --- a/csrc/cpu/tpp/woq/dispatcher.h +++ b/csrc/cpu/tpp/woq/dispatcher.h @@ -205,17 +205,17 @@ struct product_dispatcher_helper< template < typename... IntegralTypeProcessed, - typename... IntegeralTypeToProcess, + typename... IntegralTypeToProcess, typename... Dispatcher> struct product_dispatcher_helper< std::tuple, - std::tuple, + std::tuple, std::tuple> { template inline static void call( std::tuple dispatchers, std::tuple constants, - std::tuple integrals, + std::tuple integrals, const Lambda1& function, const Lambda2& fallback, Args... 
args) { @@ -252,20 +252,20 @@ template struct product_dispatcher; // dispatch to a carsian product of a list of integers to a lambda function -template +template struct product_dispatcher< - std::tuple, + std::tuple, std::tuple> { template inline static void call( - std::tuple integrals, + std::tuple integrals, const Lambda1& function, const Lambda2& fallback, Args... args) { static auto dispatchers = std::tuple{}; product_dispatcher_helper< std::tuple<>, - std::tuple, + std::tuple, std::tuple>:: call( dispatchers, diff --git a/csrc/cpu/tpp/xsmm_functors.h b/csrc/cpu/tpp/xsmm_functors.h index db68ed023..f85b8e741 100644 --- a/csrc/cpu/tpp/xsmm_functors.h +++ b/csrc/cpu/tpp/xsmm_functors.h @@ -1583,7 +1583,7 @@ class XformExtTPP { if (ignore_vnni_for_fp32 == false) { PCL_ASSERT( (xtype == XformTPP::XFORM_XPOSE_TPP || dtype != LIBXSMM_DATATYPE_F32), - "Only Transpose Xofrm supportd for FP32 datatype, specified %d\n", + "Only Transpose Xofrm supported for FP32 datatype, specified %d\n", (int)xtype); } const int BS = xsmm_get_vnni_block_size(dtype); diff --git a/csrc/cpu/utils/module_version.cpp b/csrc/cpu/utils/module_version.cpp index 7485aeee5..dcaa1c27a 100644 --- a/csrc/cpu/utils/module_version.cpp +++ b/csrc/cpu/utils/module_version.cpp @@ -21,7 +21,7 @@ void get_mkl_version() { void get_libxsmm_version() { #if 1 printf( - "Not avaliable yet, due to libxsmm CMake build not generate version info.\n"); + "Not available yet, due to libxsmm CMake build not generate version info.\n"); #else printf("libxsmm config version: %s\n", LIBXSMM_CONFIG_VERSION); printf("Config branch: %s\n", LIBXSMM_CONFIG_BRANCH); diff --git a/csrc/cpu/utils/robin_hood.h b/csrc/cpu/utils/robin_hood.h index 08fc09521..dfe0541da 100644 --- a/csrc/cpu/utils/robin_hood.h +++ b/csrc/cpu/utils/robin_hood.h @@ -112,7 +112,7 @@ static Counts& counts() { #error Unsupported bitness #endif -// endianess +// endianness #ifdef _MSC_VER #define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 #define 
ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 @@ -189,7 +189,7 @@ static Counts& counts() { #define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) #endif -// detect if native wchar_t type is availiable in MSVC +// detect if native wchar_t type is available in MSVC #ifdef _MSC_VER #ifdef _NATIVE_WCHAR_T_DEFINED #define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 @@ -200,7 +200,7 @@ static Counts& counts() { #define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 #endif -// detect if MSVC supports the pair(std::piecewise_construct_t,...) consructor +// detect if MSVC supports the pair(std::piecewise_construct_t,...) constructor // being constexpr #ifdef _MSC_VER #if _MSC_VER <= 1900 @@ -931,7 +931,7 @@ struct WrapKeyEqual : public T { // member, // or a DataNode with a pointer to std::pair. Which DataNode // representation to use depends on how fast the swap() operation is. -// Heuristically, this is automatically choosen based on sizeof(). there are +// Heuristically, this is automatically chosen based on sizeof(). there are // always 2^n Nodes. // // * info: Each Node in the map has a corresponding info byte, so there are 2^n @@ -1603,7 +1603,7 @@ class Table // Creates an empty hash map. Nothing is allocated yet, this happens at the // first insert. This tremendously speeds up ctor & dtor of a map that never - // receives an element. The penalty is payed at the first insert, and not + // receives an element. The penalty is paid at the first insert, and not // before. Lookup of this empty map works because everybody points to // DummyInfoByte::b. parameter bucket_count is dictated by the standard, but // we can ignore it. 
diff --git a/csrc/cpu/vec/unroll_helper.hpp b/csrc/cpu/vec/unroll_helper.hpp index fd09e9465..357045c46 100644 --- a/csrc/cpu/vec/unroll_helper.hpp +++ b/csrc/cpu/vec/unroll_helper.hpp @@ -4,8 +4,8 @@ #include #include "aten/utils/utils.h" -// This helper aims to provide a set of lambda function to manully unroll -// vectorized intrisics with compile_time_for +// This helper aims to provide a set of lambda function to manually unroll +// vectorized intrinsics with compile_time_for // https://github.com/intel/intel-extension-for-pytorch/blob/05aeaf4b675f15c68fcde5b575b4fd5151971129/csrc/cpu/aten/utils/utils.h#L68 // For example, // auto load_fp32 = [](auto i, __m512* in_vset, auto* basic_ptr) { diff --git a/csrc/cpu/vec/vec512/perf_kernel/add_swish.h b/csrc/cpu/vec/vec512/perf_kernel/add_swish.h index 848adbd80..b42b7e09b 100644 --- a/csrc/cpu/vec/vec512/perf_kernel/add_swish.h +++ b/csrc/cpu/vec/vec512/perf_kernel/add_swish.h @@ -27,7 +27,7 @@ inline void _dil_add_swish_fusion_kernel( int i = 0; // load tensor a & b - // assum the same size , no need to broadcast + // assume the same size, no need to broadcast for (; i <= size - 16; i += 16) { // a is first operand of add, b is bias vec_a = _loadu(a + i); @@ -38,7 +38,7 @@ inline void _dil_add_swish_fusion_kernel( vec_add_tmp = vec_a; // keep the intermediate result for later use in the mul - // caculate sigmoid e^x / (1 + e^x) + // calculate sigmoid e^x / (1 + e^x) vec_a = _dil_exp_kernel(vec_a); vec_addone_tmp = _mm512_add_ps(vec_a, vec_ps_1); vec_a = _mm512_div_ps(vec_a, vec_addone_tmp); @@ -59,7 +59,7 @@ inline void _dil_add_swish_fusion_kernel( vec_add_tmp = vec_a; // keep the intermediate result for later use in the second mul - // caculate sigmoid e^x / (1 + e^x) + // calculate sigmoid e^x / (1 + e^x) vec_a = _dil_exp_kernel(vec_a); vec_addone_tmp = _mm512_add_ps(vec_a, vec_ps_1); vec_a = _mm512_div_ps(vec_a, vec_addone_tmp); diff --git a/csrc/utils/CustomOperatorRegistration.h 
b/csrc/utils/CustomOperatorRegistration.h index bfc726c2a..04a2d5580 100644 --- a/csrc/utils/CustomOperatorRegistration.h +++ b/csrc/utils/CustomOperatorRegistration.h @@ -25,7 +25,7 @@ struct TypeSelector { extract_type(args...); } - at::ArrayRef retrive_types() { + at::ArrayRef retrieve_types() { return at::ArrayRef(container_.begin(), container_.end()); } @@ -101,7 +101,7 @@ The macro should be written in below way to register these two different *************************************************************************** IPEX_OP_REGISTER | IPEX_OP_REGISTER_TO_PLAIN This macro is used to register ops into torch_ipex library. Through this macro, -function schema and signature will automatically be infered from function +function schema and signature will automatically be inferred from function prototype. However, it is worth to note that this macro will not works on overload functions(see IPEX_OP_REGISTER_OVERLOAD). Here is some examples for register ipex operators: @@ -117,7 +117,7 @@ IPEX_LIBRARY_FRAGMENT() { IPEX_OP_REGISTER("mul_add", mul_add); } And if this op does not support oneDNN's block format memory layout for tensor. -It would be necessary for developer to register it specificly by adopting the +It would be necessary for developer to register it specifically by adopting the macro IPEX_OP_REGISTER_NEED_PLAIN. In this way, all the tensor passed to this operator will automatically convert to normal tensor layout when execution. diff --git a/docs/tutorials/api_doc.rst b/docs/tutorials/api_doc.rst index db0b664bb..140d00223 100644 --- a/docs/tutorials/api_doc.rst +++ b/docs/tutorials/api_doc.rst @@ -103,7 +103,7 @@ Quantization .. autofunction:: prepare .. autofunction:: convert -Prototype API, introduction is avaiable at `feature page <./features/int8_recipe_tuning_api.md>`_. +Prototype API, introduction is available at `feature page <./features/int8_recipe_tuning_api.md>`_. .. 
autofunction:: autotune diff --git a/docs/tutorials/contribution.md b/docs/tutorials/contribution.md index 94c7bed35..cd4f64294 100644 --- a/docs/tutorials/contribution.md +++ b/docs/tutorials/contribution.md @@ -68,7 +68,7 @@ If you want to reinstall, make sure that you uninstall Intel® Extension for PyT ### Tips and Debugging -* Cmake must be installed before installing Intel® Extension for PyTorch\*. If youre developing on MacOS or Linux, We recommend installing Cmake with [Homebrew](https://brew.sh/) with `brew install cmake`. +* Cmake must be installed before installing Intel® Extension for PyTorch\*. If you're developing on macOS or Linux, we recommend installing Cmake with [Homebrew](https://brew.sh/) with `brew install cmake`. * Our `setup.py` requires Python >= 3.6 * If you run into errors when running `python setup.py develop`, here are some debugging steps: 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. diff --git a/docs/tutorials/features/graph_optimization.md b/docs/tutorials/features/graph_optimization.md index 680aead3f..fbe0faeb9 100644 --- a/docs/tutorials/features/graph_optimization.md +++ b/docs/tutorials/features/graph_optimization.md @@ -127,9 +127,9 @@ Here listed all the currently supported int8 patterns in Intel® Extension for P ### Folding -Stock PyTorch provids constant propagation and BatchNormalization folding. These optimizations are automatically applied to the jit model by invoking `torch.jit.freeze`. Take the Resnet50 as an example: +Stock PyTorch provides constant propagation and BatchNormalization folding. These optimizations are automatically applied to the jit model by invoking `torch.jit.freeze`. 
Take the Resnet50 as an example: [//]: # (marker_feature_graph_optimization_folding) [//]: # (marker_feature_graph_optimization_folding) -If the model owner does not invoke the `torch.jit.freeze`, the `BatchNormalization` still exists on the graph. Otheriwse, the `BatchNormalization` will be folded on the graph to save the compuation and then improve the performance. Refer to the [Constant Folding Wikipedia page](https://en.wikipedia.org/wiki/Constant_folding) for more details. +If the model owner does not invoke the `torch.jit.freeze`, the `BatchNormalization` still exists on the graph. Otherwise, the `BatchNormalization` will be folded on the graph to save the computation and then improve the performance. Refer to the [Constant Folding Wikipedia page](https://en.wikipedia.org/wiki/Constant_folding) for more details. diff --git a/docs/tutorials/features/hypertune.md b/docs/tutorials/features/hypertune.md index e8bc828b8..cf9338757 100644 --- a/docs/tutorials/features/hypertune.md +++ b/docs/tutorials/features/hypertune.md @@ -71,7 +71,7 @@ hyperparams: launcher: hp: ['malloc'] ``` -`malloc` will be tuned using its default search space, `['tc', 'je', 'pt']`. All other launcher hyperparamters (`ncores_per_instance`, `ninstances`, `use_all_nodes`, `use_logical_cores`, `disable_numactl`, `disable_iomp`) will not be tuned and instead will use their default values. +`malloc` will be tuned using its default search space, `['tc', 'je', 'pt']`. All other launcher hyperparameters (`ncores_per_instance`, `ninstances`, `use_all_nodes`, `use_logical_cores`, `disable_numactl`, `disable_iomp`) will not be tuned and instead will use their default values. 
#### User defined search space diff --git a/docs/tutorials/features/int8_overview.md b/docs/tutorials/features/int8_overview.md index b0d279650..350a5bd88 100644 --- a/docs/tutorials/features/int8_overview.md +++ b/docs/tutorials/features/int8_overview.md @@ -31,12 +31,12 @@ qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_a weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric)) ``` -Note: we fully use PyTorch [observer methonds](https://pytorch.org/docs/stable/quantization-support.html#torch-quantization-observer), so you can use a different PyTorch obsever methond to define the [QConfig](https://pytorch.org/docs/1.11/generated/torch.quantization.qconfig.QConfig.html). For weight observer, we only support **torch.qint8** dtype now. +Note: we fully use PyTorch [observer methods](https://pytorch.org/docs/stable/quantization-support.html#torch-quantization-observer), so you can use a different PyTorch observer method to define the [QConfig](https://pytorch.org/docs/1.11/generated/torch.quantization.qconfig.QConfig.html). For weight observer, we only support **torch.qint8** dtype now. **Suggestion**: 1. For activation observer, if using **qscheme** as **torch.per_tensor_affine**, **torch.quint8** is preferred. If using **qscheme** as **torch.per_tensor_symmetric**, **torch.qint8** is preferred. For weight observer, setting **qscheme** to **torch.per_channel_symmetric** can get a better accuracy. -2. If your CPU device doesn't support VNNI, seting the observer's **reduce_range** to **True** can get a better accuracy, such as skylake. +2. If your CPU device doesn't support VNNI, setting the observer's **reduce_range** to **True** can get a better accuracy, such as skylake.
### Prepare Model and Do Calibration diff --git a/docs/tutorials/features/int8_recipe_tuning_api.md b/docs/tutorials/features/int8_recipe_tuning_api.md index 250237bbd..33b53cc51 100644 --- a/docs/tutorials/features/int8_recipe_tuning_api.md +++ b/docs/tutorials/features/int8_recipe_tuning_api.md @@ -1,7 +1,7 @@ INT8 Recipe Tuning API (Prototype) ===================================== -This [new API](../api_doc.html#ipex.quantization.autotune) `ipex.quantization.autotune` supports INT8 recipe tuning by using Intel® Neural Compressor as the backend in Intel® Extension for PyTorch\*. In general, we provid default recipe in Intel® Extension for PyTorch\*, and we still recommend users to try out the default recipe first without bothering tuning. If the default recipe doesn't bring about desired accuracy, users can use this API to tune for a more advanced receipe. +This [new API](../api_doc.html#ipex.quantization.autotune) `ipex.quantization.autotune` supports INT8 recipe tuning by using Intel® Neural Compressor as the backend in Intel® Extension for PyTorch\*. In general, we provide a default recipe in Intel® Extension for PyTorch\*, and we still recommend users to try out the default recipe first without bothering tuning. If the default recipe doesn't bring about desired accuracy, users can use this API to tune for a more advanced recipe. Users need to provide a fp32 model and some parameters required for tuning. The API will return a prepared model with tuned qconfig loaded.
diff --git a/docs/tutorials/features/isa_dynamic_dispatch.md b/docs/tutorials/features/isa_dynamic_dispatch.md index ac79974fa..ce5855894 100644 --- a/docs/tutorials/features/isa_dynamic_dispatch.md +++ b/docs/tutorials/features/isa_dynamic_dispatch.md @@ -50,7 +50,7 @@ At the runtime, **Dispatch Stub implementation** will check CPUIDs and OS status >#### **Dispatch Stub implementation:** `csrc/cpu/dyndisp/DispatchStub.cpp` and `csrc/cpu/dyndisp/DispatchStub.h` ### CodeGen Process -IPEX build system will generate code for each ISA level with specifiy complier parameters. The CodeGen script is located at `cmake/cpu/IsaCodegen.cmake`. +IPEX build system will generate code for each ISA level with specific compiler parameters. The CodeGen script is located at `cmake/cpu/IsaCodegen.cmake`. The CodeGen will copy each cpp files from **Kernel implementation**, and then add ISA level as new file suffix. @@ -376,7 +376,7 @@ Here are three ISA-related private APIs that can help debugging:: >**Note:** > >1. Max CPU supported ISA level only depends on CPU features. ->2. Max binary supported ISA level only depends on built complier version. +>2. Max binary supported ISA level only depends on built compiler version. >3. Current ISA level, it is the smaller of `max CPU ISA level` and `max binary ISA level`. ### Example: diff --git a/docs/tutorials/features/runtime_extension.md b/docs/tutorials/features/runtime_extension.md index 03f0e9f56..de451fbe4 100644 --- a/docs/tutorials/features/runtime_extension.md +++ b/docs/tutorials/features/runtime_extension.md @@ -120,7 +120,7 @@ Thus, `MultiStreamModule` may benefit performance for inference in throughput mo 2. The overhead of inputs' auto split and outputs' auto concat for each stream. 3. The overhead of pthread (stream async execution) wakes up and threads' synchronization after stream execution. -Here are some performance receipes that we recommend for better multi-stream performance.
+Here are some performance recipes that we recommend for better multi-stream performance. * When creating `MultiStreamModule` with `torch.nn.Module` as imperative path module, each stream inside `MultiStreamModule` suffers the GIL issue when doing inference together. This hurts end-to-end performance. We recommend creating `MultiStreamModule` with the `torch.jit.ScriptModule`. diff --git a/docs/tutorials/getting_started.md b/docs/tutorials/getting_started.md index 1bd750d52..8dbfb7cab 100644 --- a/docs/tutorials/getting_started.md +++ b/docs/tutorials/getting_started.md @@ -12,7 +12,7 @@ To start using the Intel® Extension for PyTorch\* in your code, you need to mak **Important:** It is highly recommended to `import intel_extension_for_pytorch` right after `import torch`, prior to importing other packages. -The example below demostrates how to use the Intel® Extension for PyTorch\* with TorchScript: +The example below demonstrates how to use the Intel® Extension for PyTorch\* with TorchScript: ```python import torch @@ -34,7 +34,7 @@ with torch.no_grad(), torch.cpu.amp.autocast(): ########################################## ``` -The example below demostrates how to use the Intel® Extension for PyTorch\* with TorchDynamo: +The example below demonstrates how to use the Intel® Extension for PyTorch\* with TorchDynamo: ```python import torch diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index 7e6f0b51a..1ec365d62 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -120,7 +120,7 @@ While Generative AI (GenAI) workloads and models are getting more and more popul Quantization with shorter data types benefits from its nature to improve memory IO throughputs and amount of computations on CPU. Moreover, shorter data types make it possible to keep more data in CPU cache, thus reducing memory access occurrences. Comparing to cache access, memory access is much more time costing. 
Specifically from computation perspective, AVX-512 Vector Neural Network Instructions (VNNI) instruction set shipped with the 2nd Generation Intel® Xeon® Scalable Processors and newer, as well as Intel® Advanced Matrix Extensions (Intel® AMX) instruction set shipped with the 4th Generation Intel® Xeon® Scalable Processors, provide instruction level accelerations to INT8 computations. -Except for the mixed-precision and INT8 native quantization solution, e.g., post-training static quantization and dynamic quantization in Pytorch, `SmoothQuant `_ and weight only quantization (both INT8 weight and INT4 weight are supported) are also enabled in Intel® Extension for PyTorch* to get beeter accuracy and performance compared with native solution. +Except for the mixed-precision and INT8 native quantization solution, e.g., post-training static quantization and dynamic quantization in Pytorch, `SmoothQuant `_ and weight only quantization (both INT8 weight and INT4 weight are supported) are also enabled in Intel® Extension for PyTorch* to get better accuracy and performance compared with native solution. Intel® Extension for PyTorch* speeds up INT8 computations by leveraging oneDNN and oneDNN graph as the backend. Intel® Extension for PyTorch* static quantization provides a default recipe to automatically decide which operators to quantize. Its backend oneDNN graph brings matrix-multiplication-based fusions for common seen operator patterns and other common fusions like quantization + data type casting. These fusions help achieve best computation cache locality and efficiency, and thus reduce INT8 quantization overhead significantly. 
diff --git a/docs/tutorials/performance_tuning/launch_script.md b/docs/tutorials/performance_tuning/launch_script.md index 61c5826f3..4c867941c 100644 --- a/docs/tutorials/performance_tuning/launch_script.md +++ b/docs/tutorials/performance_tuning/launch_script.md @@ -258,7 +258,7 @@ You can also specify the cores to be utilized using `--cores-list` argument. For ipexrun --ncores-per-instance 10 --cores-list "11-20" --log-dir ./logs resnet50.py ``` -Please notice that when specifying `--cores-list`, a correspondant `--ncores-per-instance` argument is required for instance number deduction. +Please notice that when specifying `--cores-list`, a corresponding `--ncores-per-instance` argument is required for instance number deduction. In this case the log directory should be like ``` diff --git a/docs/tutorials/performance_tuning/torchserve.md b/docs/tutorials/performance_tuning/torchserve.md index a5d8d694d..e8bd7aeb8 100644 --- a/docs/tutorials/performance_tuning/torchserve.md +++ b/docs/tutorials/performance_tuning/torchserve.md @@ -67,9 +67,9 @@ Below are some useful `cpu_launcher_args` to note. Italic values are default if Refer to [Launch Script Usage Guide](./launch_script.md) for a full list of tunable configuration of launcher. And refer to [Performance Tuning Guide](./tuning_guide.md) for more details. ### Launcher Core Pinning to Boost Performance of TorchServe Multi Worker Inference -When running [multi-worker inference](https://pytorch.org/serve/management_api.html#scale-workers) with Torchserve (Required torchserve>=0.6.1), launcher pin cores to workers to boost performance. Internally, launcher equally divides the number of cores by the number of workers such that each worker is pinned to assigned cores. Doing so avoids core overlap among workers which can signficantly boost performance for TorchServe multi-worker inference.
For example, assume running 4 workers on a machine with Intel(R) Xeon(R) Platinum 8180 CPU, 2 sockets, 28 cores per socket, 2 threads per core. Launcher will bind worker 0 to cores 0-13, worker 1 to cores 14-27, worker 2 to cores 28-41, and worker 3 to cores 42-55. +When running [multi-worker inference](https://pytorch.org/serve/management_api.html#scale-workers) with Torchserve (Required torchserve>=0.6.1), launcher pin cores to workers to boost performance. Internally, launcher equally divides the number of cores by the number of workers such that each worker is pinned to assigned cores. Doing so avoids core overlap among workers which can significantly boost performance for TorchServe multi-worker inference. For example, assume running 4 workers on a machine with Intel(R) Xeon(R) Platinum 8180 CPU, 2 sockets, 28 cores per socket, 2 threads per core. Launcher will bind worker 0 to cores 0-13, worker 1 to cores 14-27, worker 2 to cores 28-41, and worker 3 to cores 42-55. -CPU usage is shown below. 4 main worker threads were launched, each launching 14 threads affinitized to the assigned physical cores. +CPU usage is shown below. 4 main worker threads were launched, each launching 14 threads affinitized to the assigned physical cores. ![26](https://user-images.githubusercontent.com/93151422/170373651-fd8a0363-febf-4528-bbae-e1ddef119358.gif) @@ -78,7 +78,7 @@ Additionally when dynamically [scaling the number of workers](https://pytorch.or Continuing with the above example with 4 workers, assume killing workers 2 and 3. If cores were not re-distributed after the scale down, cores 28-55 would be left unutilized. Instead, launcher re-distributes cores 28-55 to workers 0 and 1 such that now worker 0 binds to cores 0-27 and worker 1 binds to cores 28-55.2 -CPU usage is shown below. 4 main worker threads were initially launched. 
Then after scaling down the number of workers from 4 to 2, 2 main worker threads were launched, each launching 28 threads affinitized to the assigned physical cores. +CPU usage is shown below. 4 main worker threads were initially launched. Then after scaling down the number of workers from 4 to 2, 2 main worker threads were launched, each launching 28 threads affinitized to the assigned physical cores. ![worker_scaling](https://user-images.githubusercontent.com/93151422/170374697-7497c2d5-4c17-421b-9993-1434d1f722f6.gif) 2. Serving is interrupted for few seconds while re-distributing cores to scaled workers. @@ -171,7 +171,7 @@ torch.jit.save(model, 'rn50_int8_jit.pt') ``` ### 2. Creating a Model Archive -Once the serialized file ( `.pt`) is created, it can be used with `torch-model-archiver` as ususal. +Once the serialized file ( `.pt`) is created, it can be used with `torch-model-archiver` as usual. Use the following command to package `rn50_int8_jit.pt` into `rn50_ipex_int8.mar`. ``` @@ -255,7 +255,7 @@ cpu_launcher_enable=true CPU usage is shown as below: ![launcher_core_pinning](https://user-images.githubusercontent.com/93151422/159063975-e7e8d4b0-e083-4733-bdb6-4d92bdc10556.gif) -4 main worker threads were launched, then each launched a num_physical_cores/num_workers number (14) of threads affinitized to the assigned physical cores. +4 main worker threads were launched, then each launched a num_physical_cores/num_workers number (14) of threads affinitized to the assigned physical cores.

 $ cat logs/model_log.log
diff --git a/docs/tutorials/performance_tuning/tuning_guide.md b/docs/tutorials/performance_tuning/tuning_guide.md
index 78d122f36..d45c25bea 100644
--- a/docs/tutorials/performance_tuning/tuning_guide.md
+++ b/docs/tutorials/performance_tuning/tuning_guide.md
@@ -253,7 +253,7 @@ Intel® Extension for PyTorch\* is using OneDNN backend for those most computing
 
 To achieve better performance, OneDNN backend is using its [primitive cache](https://oneapi-src.github.io/oneDNN/dev_guide_primitive_cache.html) to store those created primitives for different input shapes during warm-up stage (default primitive cache size is 1024, i.e., 1024 cached primitives). Therefore, when the total size of the primitives created by all the input shapes is within the default threshold, Intel® Extension for PyTorch\* could get fully computation performance from OneDNN kernels.
 
-Different input shapes usualy come from dynamic shapes of datasets. Dynamic shapes commonly exist in [MaskRCNN model](https://github.com/matterport/Mask_RCNN) (object detection), [Transformers](https://github.com/huggingface/transformers/) Wav2vec2 model (speech-recognition) and other speech/text-generation related Transformers models.
+Different input shapes usually come from dynamic shapes of datasets. Dynamic shapes commonly exist in [MaskRCNN model](https://github.com/matterport/Mask_RCNN) (object detection), [Transformers](https://github.com/huggingface/transformers/) Wav2vec2 model (speech-recognition) and other speech/text-generation related Transformers models.
 
 However, we might meet the fact that model would need to cache a large amount of various input shapes, which would even exceed the default primitive cache size. In such case, we recommend tuning the OneDNN primitive cache by setting `ONEDNN_PRIMITIVE_CACHE_CAPACITY` environment variable to get better performance (Note that it is at the cost of increased memory usage):
 
diff --git a/docs/tutorials/releases.md b/docs/tutorials/releases.md
index e6d2da69e..f4b2e6f6c 100644
--- a/docs/tutorials/releases.md
+++ b/docs/tutorials/releases.md
@@ -655,7 +655,7 @@ Highlights include:
   The support for dynamic shapes in Intel® Extension for PyTorch\* INT8 integration is still work in progress. When the input shapes are dynamic, for example inputs of variable image sizes in an object detection task or of variable sequence lengths in NLP tasks, the Intel® Extension for PyTorch\* INT8 path may slow down the model inference. In this case, use stock PyTorch INT8 functionality.
   **Note**: Using Runtime Extension feature if batch size cannot be divided by number of streams, because mini batch size on each stream are not equivalent, scripts run into this issues.
 - BF16 AMP(auto-mixed-precision) runs abnormally with the extension on the AVX2-only machine if the topology contains `Conv`, `Matmul`, `Linear`, and `BatchNormalization`
-- Runtime extension of MultiStreamModule doesn't support DLRM inference, since the input of DLRM (EmbeddingBag specifically) can't be simplely batch split.
+- Runtime extension of MultiStreamModule doesn't support DLRM inference, since the input of DLRM (EmbeddingBag specifically) can't be simply batch split.
 - Runtime extension of MultiStreamModule has poor performance of RNNT Inference comparing with native throughput mode. Only part of the RNNT models (joint_net specifically) can be jit traced into graph. However, in one batch inference, `joint_net` is invoked multi times. It increases the overhead of MultiStreamModule as input batch split, thread synchronization and output concat.
 - Incorrect Conv and Linear result if the number of OMP threads is changed at runtime
   The oneDNN memory layout depends on the number of OMP threads, which requires the caller to detect the changes for the # of OMP threads while this release has not implemented it yet.
@@ -788,7 +788,7 @@ libintel-ext-pt-cxx11-abi-1.11.0+cpu.run (13.5M)
 
 This release is meant to fix the following issues:
 - Resolve the issue that the PyTorch Tensor Expression(TE) did not work after importing the extension.
-- Wraps the BactchNorm(BN) as another operator to break the TE's BN-related fusions. Because the BatchNorm performance of PyTorch Tensor Expression can not achieve the same performance as PyTorch ATen BN.
+- Wraps the BatchNorm(BN) as another operator to break the TE's BN-related fusions. Because the BatchNorm performance of PyTorch Tensor Expression can not achieve the same performance as PyTorch ATen BN.
 - Update the [documentation](https://intel.github.io/intel-extension-for-pytorch/)
     - Fix the INT8 quantization example issue #205
     - Polish the installation guide
diff --git a/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py b/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py
index 7d741e495..c35ffbc40 100644
--- a/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py
+++ b/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py
@@ -107,13 +107,13 @@ def eval_func(model):
 
         return top1.avg.item()
 
-    print(".........runing autotuning step.........")
+    print(".........running autotuning step.........")
     tuned_model = ipex.quantization.autotune(
         model, val_loader, eval_func=eval_func, sampling_sizes=[300]
     )
     print(".........autotuning step done.........")
 
-    print(".........runing int8 inference.........")
+    print(".........running int8 inference.........")
     converted_model = ipex.quantization.convert(tuned_model)
     with torch.no_grad():
         for i, (images, target) in enumerate(val_loader):
diff --git a/examples/cpu/features/int8_recipe_tuning/int8_autotune.py b/examples/cpu/features/int8_recipe_tuning/int8_autotune.py
index b70155ca4..05b0a3d10 100644
--- a/examples/cpu/features/int8_recipe_tuning/int8_autotune.py
+++ b/examples/cpu/features/int8_recipe_tuning/int8_autotune.py
@@ -84,7 +84,7 @@ def train(dataloader, model, loss_fn, optimizer):
 
 epochs = 5
 for t in range(epochs):
-    print(f"Epoch {t+1}\n-------------------------------")
+    print(f"Epoch {t + 1}\n-------------------------------")
     train(train_dataloader, model, loss_fn, optimizer)
 print("Done!")
 
diff --git a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb
index 8c6c00860..72e04ecbd 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb
@@ -243,7 +243,7 @@
     "\n",
     "|exec type | Description |  \n",
     "|:-----|:----|  \n",
-    "|exec | Time for primitives exection. Better to spend most of time on primitives execution. |  \n",
+    "|exec | Time for primitives execution. Better to spend most of time on primitives execution. |  \n",
     "|create| Time for primitives creation. Primitives creation happens once. Better to spend less time on primitive creation. |  "
    ]
   },
@@ -274,7 +274,7 @@
     "### Step 6: Time breakdown for primitives type\n",
     "The primitives type includes convolution, reorder, sum, etc.  \n",
     "For this simple convolution net example, convolution and inner product primitives are expected to spend most of time.  \n",
-    "However, the exact time percentage of different primitivies may vary among different architectures.    \n",
+    "However, the exact time percentage of different primitives may vary among different architectures.    \n",
     "Users can easily identify top hotpots of primitives executions with this time breakdown.  "
    ]
   },
diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb
index c4bca3199..05efa7abd 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb
@@ -53,7 +53,7 @@
    "source": [
     "## Installation of required packages\n",
     "\n",
-    "Ensure the kernel is set to Pytorch-CPU before running the follwing code."
+    "Ensure the kernel is set to Pytorch-CPU before running the following code."
    ]
   },
   {
diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb
index 03020685e..747f4e5cc 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb
@@ -288,7 +288,7 @@
     "\n",
     "# Calculate speedup when using quantization\n",
     "speedup_from_fp32_static = fp32_inference_time / int8_inference_time_static\n",
-    "print(\"Staic INT8 %.2fX faster than FP32\" %speedup_from_fp32_static)\n",
+    "print(\"Static INT8 %.2fX faster than FP32\" %speedup_from_fp32_static)\n",
     "speedup_from_fp32_dynamic = fp32_inference_time / int8_inference_time_dynamic\n",
     "print(\"Dynamic INT8 %.2fX faster than FP32\" %speedup_from_fp32_dynamic)\n",
     "\n",
diff --git a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
index b0cec76a8..8a07c7a88 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
@@ -229,7 +229,7 @@
     "            width = 0.4)\n",
     "\n",
     "    plt.ylabel(\"Runtime (ms)\")\n",
-    "    plt.title(f\"Speedup acheived - {inference_time_stock/inference_time_optimized:.2f}x\")\n",
+    "    plt.title(f\"Speedup achieved - {inference_time_stock/inference_time_optimized:.2f}x\")\n",
     "    plt.show()\n",
     "    \n",
     "\n"
diff --git a/examples/cpu/inference/python/llm-modeling/run.py b/examples/cpu/inference/python/llm-modeling/run.py
index d8f73ee60..ba3d1ee43 100644
--- a/examples/cpu/inference/python/llm-modeling/run.py
+++ b/examples/cpu/inference/python/llm-modeling/run.py
@@ -94,7 +94,7 @@ def get_dummy_input(_model, return_dict=False):
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6B",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--dtype",
@@ -230,7 +230,7 @@ def trace_handler(prof):
 elif args.input_tokens in prompt_pool[model_type]:
     prompt = prompt_pool[model_type][args.input_tokens]
 else:
-    raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.")
+    raise SystemExit("[ERROR] Please use --prompt if want to use custom input.")
 
 input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
 print("---- Prompt size:", input_size)
diff --git a/examples/cpu/inference/python/models/LCM/README.md b/examples/cpu/inference/python/models/LCM/README.md
index 32bbcdffc..8e656adf6 100644
--- a/examples/cpu/inference/python/models/LCM/README.md
+++ b/examples/cpu/inference/python/models/LCM/README.md
@@ -50,7 +50,7 @@ bash download_dataset.sh
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md b/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md
index 31224047d..e107a80e6 100644
--- a/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md
+++ b/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md
@@ -74,7 +74,7 @@ export FINETUNED_MODEL=$(pwd)/bert_squad_model
     ```
     ./setup.sh
     ```
-4. Setup required environment paramaters
+4. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/README.md b/examples/cpu/inference/python/models/bert_large/training/cpu/README.md
index 2a757827e..96c20775d 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/README.md
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/README.md
@@ -145,7 +145,7 @@ you can use "SHARD_NUM" to control the shard files number. the default "SHARD_NU
   ```
   ./setup.sh
   ```
-4. Setup required environment paramaters
+4. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py b/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py
index 322e49b40..189a5310c 100755
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py
@@ -185,7 +185,7 @@ def whitespace_tokenize(text):
 
 
 class FullTokenizer(object):
-    """Runs end-to-end tokenziation."""
+    """Runs end-to-end tokenization."""
 
     def __init__(self, vocab_file, do_lower_case=True):
         self.vocab = load_vocab(vocab_file)
@@ -336,7 +336,7 @@ def _clean_text(self, text):
 
 
 class WordpieceTokenizer(object):
-    """Runs WordPiece tokenziation."""
+    """Runs WordPiece tokenization."""
 
     def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
         self.vocab = vocab
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py b/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py
index 7f9f05ac6..ede009df7 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py
@@ -85,7 +85,7 @@ def step(self, closure=None):
                 data = p.data
                 if grad.is_sparse:
                     raise RuntimeError(
-                        "Lamb does not support sparse gradients, consider SparseAdam instad."
+                        "Lamb does not support sparse gradients, consider SparseAdam instead."
                     )
 
                 state = self.state[p]
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py b/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py
index a45397f82..26df9d98b 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py
@@ -1158,8 +1158,8 @@ def main():
                 print(
                     f"Step {training_steps:5d}: loss: {gloss:6.3f} lm_acc: {lm_acc:.3f} \
                     seq_acc: {seq_acc:.3f} lbs: {args.train_batch_size} gbs: {total_batch_size} \
-                    DT: {(t1-t0)*1000.0:.1f} XT: {(t2-t1)*1000.0:.1f} FT: {(t3-t2)*1000.0:.1f} \
-                    BT: {(t4-t3)*1000.0:.1f} OT: {(t5-t4)*1000.0:.1f} TT: {(t5-t0)*1000.0:.1f}"
+                    DT: {(t1 - t0) * 1000.0:.1f} XT: {(t2 - t1) * 1000.0:.1f} FT: {(t3 - t2) * 1000.0:.1f} \
+                    BT: {(t4 - t3) * 1000.0:.1f} OT: {(t5 - t4) * 1000.0:.1f} TT: {(t5 - t0) * 1000.0:.1f}"
                 )
 
                 update_step = training_steps % args.gradient_accumulation_steps == 0
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py b/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py
index 6c6e9a628..e3058dae9 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py
@@ -71,7 +71,7 @@ def setup_seeds(master_seed, epochs, device):
     Generates seeds from one master_seed.
     Function returns (worker_seeds, shuffling_seeds), worker_seeds are later
     used to initialize per-worker random number generators (mostly for
-    dropouts), shuffling_seeds are for RNGs resposible for reshuffling the
+    dropouts), shuffling_seeds are for RNGs responsible for reshuffling the
     dataset before each epoch.
     Seeds are generated on worker with rank 0 and broadcasted to all other
     workers.
diff --git a/examples/cpu/inference/python/models/deepseek/README.md b/examples/cpu/inference/python/models/deepseek/README.md
index 6ed236bd8..31a9adb45 100644
--- a/examples/cpu/inference/python/models/deepseek/README.md
+++ b/examples/cpu/inference/python/models/deepseek/README.md
@@ -59,7 +59,7 @@ wget -O prompt.json https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.c
 ```
 
 ### Performance
-#### 1. Setup required environment paramaters
+#### 1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
@@ -74,7 +74,7 @@ wget -O prompt.json https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.c
 
 **Please avoid cross NUMA node memory access when setting SGLANG_CPU_OMP_THREADS_BIND.**
 
-`SGLANG_CPU_OMP_THREADS_BIND` specifies the CPU cores dedicated to the OpenMP threads. `--tp` sets the TP size. Below are the example of running without TP and with TP = 6. By changing `--tp` and `SGLANG_CPU_OMP_THREADS_BIND` accordingly, you could set TP size to other values and specifiy the core binding for each rank.
+`SGLANG_CPU_OMP_THREADS_BIND` specifies the CPU cores dedicated to the OpenMP threads. `--tp` sets the TP size. Below are the example of running without TP and with TP = 6. By changing `--tp` and `SGLANG_CPU_OMP_THREADS_BIND` accordingly, you could set TP size to other values and specify the core binding for each rank.
 
 
 ##### 2.1 Bench one batch
diff --git a/examples/cpu/inference/python/models/distilbert/README.md b/examples/cpu/inference/python/models/distilbert/README.md
index e09ac0a3f..ac8ac0147 100644
--- a/examples/cpu/inference/python/models/distilbert/README.md
+++ b/examples/cpu/inference/python/models/distilbert/README.md
@@ -48,13 +48,13 @@
   #by default they are downloaded in current path
   #note that you should do this after you prepared model (transformers repo)
 
-  (2) make following changes in the scirpts to run:
+  (2) make following changes in the scripts to run:
   delete: --task_name sst2  ==>  add: --train_file {path/to/data_file}/SST-2/train.csv --validation_file {path/to/data_file}/SST-2/dev.csv
 
   (3) export model path
   export FINETUNED_MODEL={path/to/model_file}/distilbert-base-uncased-finetuned-sst-2-english
 
-  (4) run scirpt with HF_DATASETS_OFFLINE=1 flag, like:
+  (4) run script with HF_DATASETS_OFFLINE=1 flag, like:
   HF_DATASETS_OFFLINE=1 bash run_multi_instance_throughput.sh fp32
 
   ```
@@ -90,7 +90,7 @@ export FINETUNED_MODEL=$(pwd)/distilbert-base-uncased-finetuned-sst-2-english
   ./setup.sh
   ```
 
-5. Setup required environment paramaters
+5. Setup required environment parameters
 
 # Custom mode
  Run in custom mode by export TEST_MODE="" and export BATCH_SIZE to set the batch_size, export CORES_PER_INSTANCE to set the number of cores per instance and export INSTANCES to set the number of instances.
diff --git a/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py b/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py
index 2c0284f4b..517702fd6 100755
--- a/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py
+++ b/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py
@@ -680,7 +680,7 @@ def compute_metrics(p: EvalPrediction):
     if training_args.do_eval:
         logger.info("*** Evaluate ***")
 
-        # Loop to handle MNLI double evaluation (matched, mis-matched)
+        # Loop to handle MNLI double evaluation (matched, mismatched)
         tasks = [data_args.task_name]
         eval_datasets = [eval_dataset]
         if data_args.task_name == "mnli":
@@ -718,7 +718,7 @@ def compute_metrics(p: EvalPrediction):
     if training_args.do_predict:
         logger.info("*** Predict ***")
 
-        # Loop to handle MNLI double evaluation (matched, mis-matched)
+        # Loop to handle MNLI double evaluation (matched, mismatched)
         tasks = [data_args.task_name]
         predict_datasets = [predict_dataset]
         if data_args.task_name == "mnli":
diff --git a/examples/cpu/inference/python/models/distilbert/scripts/trainer.py b/examples/cpu/inference/python/models/distilbert/scripts/trainer.py
index 9b5b64979..8cb609d6a 100755
--- a/examples/cpu/inference/python/models/distilbert/scripts/trainer.py
+++ b/examples/cpu/inference/python/models/distilbert/scripts/trainer.py
@@ -2063,7 +2063,7 @@ def _inner_training_loop(
                 (self.model_wrapped,) = release_memory(self.model_wrapped)
                 self.model_wrapped = self.model
 
-                # Check for DeepSpeed *after* the intial pass and modify the config
+                # Check for DeepSpeed *after* the initial pass and modify the config
                 if self.is_deepspeed_enabled:
                     # Temporarily unset `self.args.train_batch_size`
                     original_bs = self.args.per_device_train_batch_size
@@ -2748,7 +2748,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
                     # Checkpoint must have been saved with the old smp api.
                     if hasattr(self.args, "fp16") and self.args.fp16 is True:
                         logger.warning(
-                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported."
+                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported."
                         )
                     state_dict = torch.load(
                         weights_file,
@@ -3923,7 +3923,7 @@ def evaluate(
             A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
             dictionary also contains the epoch number which comes from the training state.
         """
-        # handle multipe eval datasets
+        # handle multiple eval datasets
         eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
         if isinstance(eval_dataset, dict):
             metrics = {}
@@ -4067,7 +4067,7 @@ def predict(
     def benchmark_evaluate(self, model, dataloader):
         steps_per_epoch = len(dataloader)
         total_steps = self.args.perf_run_iters + self.args.perf_begin_iter
-        test_epoches = int(total_steps / steps_per_epoch)
+        test_epochs = int(total_steps / steps_per_epoch)
         print(
             "Evaluating: Steps per Epoch {} total Steps {}".format(
                 steps_per_epoch, total_steps
@@ -4105,7 +4105,7 @@ def benchmark_evaluate(self, model, dataloader):
                     prof.step()
             prof.__exit__(None, None, None)
         with tqdm(total=total_steps, desc="Evaluating") as pbar:
-            for epoch in range(test_epoches + 1):
+            for epoch in range(test_epochs + 1):
                 for it, batch in enumerate(dataloader):
                     if "pixel_values" in batch:
                         if self.args.fp16_cpu:
diff --git a/examples/cpu/inference/python/models/distilbert/scripts/training_args.py b/examples/cpu/inference/python/models/distilbert/scripts/training_args.py
index 66e526096..45b6d4084 100644
--- a/examples/cpu/inference/python/models/distilbert/scripts/training_args.py
+++ b/examples/cpu/inference/python/models/distilbert/scripts/training_args.py
@@ -475,7 +475,7 @@ class TrainingArguments:
                      all-gathers.
                 - use_orig_params (`bool`, *optional*, defaults to `True`)
                     If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed
-                    frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please
+                    frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please
                     refer this
                     [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019
                 - sync_module_states (`bool`, *optional*, defaults to `True`)
@@ -524,8 +524,8 @@ class TrainingArguments:
                     all workers.
                 - use_seedable_sampler (`bool`, *optional*, defaults to `True`):
                     Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
-                    training results are fully reproducable using a different sampling technique. While seed-to-seed results
-                    may differ, on average the differences are neglible when using multiple different seeds to compare. Should
+                    training results are fully reproducible using a different sampling technique. While seed-to-seed results
+                    may differ, on average the differences are negligible when using multiple different seeds to compare. Should
                     also be ran with [`~utils.set_seed`] for the best results.
 
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
@@ -1281,7 +1281,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": (
-                "Config to be used with the internal Accelerator object initializtion. The value is either a "
+                "Config to be used with the internal Accelerator object initialization. The value is either a "
                 "accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`."
             )
         },
@@ -1570,7 +1570,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": "Activates neftune noise embeddings into the model. NEFTune"
-            " has been proven to drastically improve model performances for instrcution fine-tuning."
+            " has been proven to drastically improve model performances for instruction fine-tuning."
             " Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original "
             "code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
         },
@@ -1861,7 +1861,7 @@ def __post_init__(self):
                     torch.backends.cudnn.allow_tf32 = True
             else:
                 logger.warning(
-                    "The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here."
+                    "The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here."
                 )
         if self.framework == "pt" and is_torch_available() and self.tf32 is not None:
             if self.tf32:
@@ -2305,7 +2305,7 @@ def _setup_devices(self) -> "torch.device":
                 )
                 if device.type != "mps":
                     raise ValueError(
-                        "Either you do not have an MPS-enabled device on this machine or MacOS version is not 12.3+ "
+                        "Either you do not have an MPS-enabled device on this machine or macOS version is not 12.3+ "
                         "or current PyTorch install was not built with MPS enabled."
                     )
             if device.type == "mps":
diff --git a/examples/cpu/inference/python/models/dlrm/README.md b/examples/cpu/inference/python/models/dlrm/README.md
index 174fa55d6..83b487864 100644
--- a/examples/cpu/inference/python/models/dlrm/README.md
+++ b/examples/cpu/inference/python/models/dlrm/README.md
@@ -57,7 +57,7 @@ After you loading the raw dataset `day_*.gz` and unzip them to RAW_DIR.
 ```bash
 cd intel-extension-for-pytorch/examples/cpu/inference/python/models/dlrm/
 export MODEL_DIR=$(pwd)
-export RAW_DIR=
+export RAW_DIR=
 export TEMP_DIR=
 export PREPROCESSED_DIR=
 export MULTI_HOT_DIR=
@@ -81,14 +81,14 @@ https://github.com/mlcommons/inference/tree/master/recommendation/dlrm_v2/pytorc
     ./setup.sh
     ```
 
-5. Setup required environment paramaters
+5. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
 | **TEST_MODE** (THROUGHPUT, ACCURACY)              | `export TEST_MODE=THROUGHPUT`                  |
 | **DATASET_DIR**             |                               `export DATASET_DIR=`                                  |
 | **EVAL_BATCH**             |                               `export EVAL_BATCH=20000`                                  |
-| **WEIGHT_DIR** (ONLY FOR ACCURACY)     |                 `export WEIGHT_DIR=`        |
+| **WEIGHT_DIR** (ONLY FOR ACCURACY)     |                 `export WEIGHT_DIR=`        |
 | **PRECISION**    |                               `export PRECISION=int8 `                             |
 | **OUTPUT_DIR**    |                               `export OUTPUT_DIR=$PWD`                               |
 | **BATCH_SIZE** (optional) |                               `export BATCH_SIZE=`                                |
diff --git a/examples/cpu/inference/python/models/dlrm/run_model.sh b/examples/cpu/inference/python/models/dlrm/run_model.sh
index be2baa4aa..f75a94053 100644
--- a/examples/cpu/inference/python/models/dlrm/run_model.sh
+++ b/examples/cpu/inference/python/models/dlrm/run_model.sh
@@ -33,7 +33,7 @@ if [[ "${TEST_MODE}" == "THROUGHPUT" ]]; then
 elif [[ "${TEST_MODE}" == "ACCURACY" ]]; then
     echo "TEST_MODE set to ACCURACY"
     BATCH_SIZE=${BATCH_SIZE:-65536}
-    LOG_PREFIX=dlrm_inference_accuarcy_log
+    LOG_PREFIX=dlrm_inference_accuracy_log
     if [ -z "${DATASET_DIR}" ]; then
         echo "The required environment variable DATASET_DIR has not been set"
         exit 1
@@ -71,7 +71,7 @@ mkdir -p ${OUTPUT_DIR}
 TORCH_INDUCTOR=${TORCH_INDUCTOR:-"0"}
 AOT_INDUCTOR=${AOT_INDUCTOR:-"0"}
 # if the number of cores are not equal on different numa node
-# or for TORCHINDUCTOR=1 we will lanuch 2 process per numa
+# or for TORCH_INDUCTOR=1 we will launch 2 processes per numa
 ENABLE_2ND_PROCESS=${ENABLE_2ND_PROCESS:-"0"}
 MANUALLY_LAUNCH=${MANUALLY_LAUNCH:-"0"}
 if [[ "1" == ${TORCH_INDUCTOR} ]];then
diff --git a/examples/cpu/inference/python/models/gptj/README.md b/examples/cpu/inference/python/models/gptj/README.md
index 998b3b698..605918dd2 100644
--- a/examples/cpu/inference/python/models/gptj/README.md
+++ b/examples/cpu/inference/python/models/gptj/README.md
@@ -27,7 +27,7 @@ export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libtcmall
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py b/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py
index e85ad91fe..08df9ba81 100644
--- a/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py
+++ b/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py
@@ -393,7 +393,7 @@ def run_accuracy_lmeval(model, dataset):
         prompt = prompt_pool[model_type][args.input_tokens]
     else:
         raise SystemExit(
-            "[ERROR] No such input_tokens prompt in prompt.json, Plese use --prompt if want to use custom input."
+            "[ERROR] No such input_tokens prompt in prompt.json, Please use --prompt if want to use custom input."
         )
 
 input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
diff --git a/examples/cpu/inference/python/models/gptj/setup.sh b/examples/cpu/inference/python/models/gptj/setup.sh
index 1e94a6015..547298eee 100755
--- a/examples/cpu/inference/python/models/gptj/setup.sh
+++ b/examples/cpu/inference/python/models/gptj/setup.sh
@@ -24,7 +24,7 @@ cd transformers
 pip install -e ./
 cd ..
 
-# Get prompt.json for gneration inference
+# Get prompt.json for generation inference
 wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json
 
 export EVAL_SCRIPT="run_llm_inductor_greedy.py"
diff --git a/examples/cpu/inference/python/models/llama/README.md b/examples/cpu/inference/python/models/llama/README.md
index bc4e19966..ec2068148 100644
--- a/examples/cpu/inference/python/models/llama/README.md
+++ b/examples/cpu/inference/python/models/llama/README.md
@@ -59,7 +59,7 @@ wget -O prompt.json https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.c
 ```
 
 ### Performance
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
@@ -127,7 +127,7 @@ export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libtcmall
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py b/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py
index e85ad91fe..08df9ba81 100644
--- a/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py
+++ b/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py
@@ -393,7 +393,7 @@ def run_accuracy_lmeval(model, dataset):
         prompt = prompt_pool[model_type][args.input_tokens]
     else:
         raise SystemExit(
-            "[ERROR] No such input_tokens prompt in prompt.json, Plese use --prompt if want to use custom input."
+            "[ERROR] No such input_tokens prompt in prompt.json, Please use --prompt if want to use custom input."
         )
 
 input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
diff --git a/examples/cpu/inference/python/models/llama/setup.sh b/examples/cpu/inference/python/models/llama/setup.sh
index f6ddf9c3a..a0d80afb3 100755
--- a/examples/cpu/inference/python/models/llama/setup.sh
+++ b/examples/cpu/inference/python/models/llama/setup.sh
@@ -24,7 +24,7 @@ cd transformers
 pip install -e ./
 cd ..
 
-# Get prompt.json for gneration inference
+# Get prompt.json for generation inference
 wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json
 
 export EVAL_SCRIPT="run_llm_inductor_greedy.py"
diff --git a/examples/cpu/inference/python/models/resnet50/README.md b/examples/cpu/inference/python/models/resnet50/README.md
index cfa789f9d..e79fef997 100644
--- a/examples/cpu/inference/python/models/resnet50/README.md
+++ b/examples/cpu/inference/python/models/resnet50/README.md
@@ -45,7 +45,7 @@ imagenet
 The folder that contains the `val` directory should be set as the `DATASET_DIR` (for example: `export DATASET_DIR=/home//imagenet`).
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                                    |                **export command**                                 |
 |:------------------------------------------------:|:-----------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/resnet50/common/main.py b/examples/cpu/inference/python/models/resnet50/common/main.py
index cd86cf507..8ecabc618 100755
--- a/examples/cpu/inference/python/models/resnet50/common/main.py
+++ b/examples/cpu/inference/python/models/resnet50/common/main.py
@@ -512,10 +512,10 @@ def main_worker(gpu, ngpus_per_node, args):
     if args.dummy:
         assert args.evaluate, "please using real dataset if you want run training path"
     if not args.ipex and not args.inductor:
-        # for offical pytorch, int8 and jit path is not enabled.
+        # for official pytorch, int8 and jit path is not enabled.
         # for torch.compile(backend=inductor) INT8 quantization is been supported.
-        assert not args.int8, "int8 path is not enabled for offical pytorch"
-        assert not args.jit, "jit path is not enabled for offical pytorch"
+        assert not args.int8, "int8 path is not enabled for official pytorch"
+        assert not args.jit, "jit path is not enabled for official pytorch"
 
     if not args.dummy:
         # Data loading code
@@ -582,7 +582,7 @@ def main_worker(gpu, ngpus_per_node, args):
         if args.ipex:
             print("using ipex model to do inference\n")
         else:
-            print("using offical pytorch model to do inference\n")
+            print("using official pytorch model to do inference\n")
 
         # IPEX Path
         if args.ipex:
@@ -615,17 +615,17 @@ def main_worker(gpu, ngpus_per_node, args):
                     model = torch.jit.freeze(model.eval())
                     y = model(x)
                     y = model(x)
-                    print("running int8 evalation step\n")
+                    print("running int8 evaluation step\n")
             else:
                 if args.bf16:
                     model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)
-                    print("running bfloat16 evalation step\n")
+                    print("running bfloat16 evaluation step\n")
                 elif args.fp16:
                     model = ipex.optimize(model, dtype=torch.half, inplace=True)
-                    print("running float16 evalation step\n")
+                    print("running float16 evaluation step\n")
                 else:
                     model = ipex.optimize(model, dtype=torch.float32, inplace=True)
-                    print("running fp32 evalation step\n")
+                    print("running fp32 evaluation step\n")
                 if args.jit:
                     x = torch.randn(args.batch_size, 3, 224, 224).contiguous(
                         memory_format=torch.channels_last
@@ -1019,7 +1019,7 @@ def validate(val_loader, model, criterion, args):
         model.eval()
 
     if args.ipex and args.int8 and args.calibration:
-        print("runing int8 calibration step\n")
+        print("running int8 calibration step\n")
         import intel_extension_for_pytorch as ipex
         from torch.ao.quantization import (
             MinMaxObserver,
@@ -1048,7 +1048,7 @@ def validate(val_loader, model, criterion, args):
             print(".........calibration step done..........")
     else:
         if args.dummy:
-            # always running channle last for fp32, bf16, int8
+            # always running channel last for fp32, bf16, int8
             with torch.no_grad():
                 if args.weight_sharing:
                     threads = []
diff --git a/examples/cpu/inference/python/models/stable_diffusion/README.md b/examples/cpu/inference/python/models/stable_diffusion/README.md
index 70791bb5a..3c5352dd3 100644
--- a/examples/cpu/inference/python/models/stable_diffusion/README.md
+++ b/examples/cpu/inference/python/models/stable_diffusion/README.md
@@ -56,7 +56,7 @@ bash download_dataset.sh
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/vit/README.md b/examples/cpu/inference/python/models/vit/README.md
index f4af3e01d..e4cac8795 100644
--- a/examples/cpu/inference/python/models/vit/README.md
+++ b/examples/cpu/inference/python/models/vit/README.md
@@ -70,7 +70,7 @@ Vision Transformer inference best known configurations with PyTorch.
     ./setup.sh
     ```
 5. Prepare for downloading access
-    On https://huggingface.co/datasets/ILSVRC/imagenet-1k, login your account, and click the aggreement and then generating {your huggingface token}
+    On https://huggingface.co/datasets/ILSVRC/imagenet-1k, login your account, and click the agreement and then generating {your huggingface token}
 
     huggingface-cli login
     {your huggingface token}
@@ -80,7 +80,7 @@ Vision Transformer inference best known configurations with PyTorch.
    #Run "download_data.sh"
    ./download_data.sh
   ```
-7. Setup required environment paramaters
+7. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/vit/scripts/trainer.py b/examples/cpu/inference/python/models/vit/scripts/trainer.py
index c6d5442a9..e9855b7e1 100755
--- a/examples/cpu/inference/python/models/vit/scripts/trainer.py
+++ b/examples/cpu/inference/python/models/vit/scripts/trainer.py
@@ -2049,7 +2049,7 @@ def _inner_training_loop(
                 (self.model_wrapped,) = release_memory(self.model_wrapped)
                 self.model_wrapped = self.model
 
-                # Check for DeepSpeed *after* the intial pass and modify the config
+                # Check for DeepSpeed *after* the initial pass and modify the config
                 if self.is_deepspeed_enabled:
                     # Temporarily unset `self.args.train_batch_size`
                     original_bs = self.args.per_device_train_batch_size
@@ -2734,7 +2734,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
                     # Checkpoint must have been saved with the old smp api.
                     if hasattr(self.args, "fp16") and self.args.fp16 is True:
                         logger.warning(
-                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported."
+                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported."
                         )
                     state_dict = torch.load(
                         weights_file,
@@ -3909,7 +3909,7 @@ def evaluate(
             A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
             dictionary also contains the epoch number which comes from the training state.
         """
-        # handle multipe eval datasets
+        # handle multiple eval datasets
         eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
         if isinstance(eval_dataset, dict):
             metrics = {}
@@ -4053,7 +4053,7 @@ def predict(
     def benchmark_evaluate(self, model, dataloader):
         steps_per_epoch = len(dataloader)
         total_steps = self.args.perf_run_iters + self.args.perf_begin_iter
-        test_epoches = int(total_steps / steps_per_epoch)
+        test_epochs = int(total_steps / steps_per_epoch)
         print(
             "Evaluating: Steps per Epoch {} total Steps {}".format(
                 steps_per_epoch, total_steps
@@ -4064,7 +4064,7 @@ def benchmark_evaluate(self, model, dataloader):
         import time
 
         with tqdm(total=total_steps, desc="Evaluating") as pbar:
-            for epoch in range(test_epoches + 1):
+            for epoch in range(test_epochs + 1):
                 for it, batch in enumerate(dataloader):
                     if "pixel_values" in batch and self.args.benchmark:
                         if self.args.fp16_cpu:
diff --git a/examples/cpu/inference/python/models/vit/scripts/training_args.py b/examples/cpu/inference/python/models/vit/scripts/training_args.py
index 384bcf913..51eb22ff2 100644
--- a/examples/cpu/inference/python/models/vit/scripts/training_args.py
+++ b/examples/cpu/inference/python/models/vit/scripts/training_args.py
@@ -472,7 +472,7 @@ class TrainingArguments:
                      all-gathers.
                 - use_orig_params (`bool`, *optional*, defaults to `True`)
                     If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed
-                    frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please
+                    frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please
                     refer this
                     [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019
                 - sync_module_states (`bool`, *optional*, defaults to `True`)
@@ -521,8 +521,8 @@ class TrainingArguments:
                     all workers.
                 - use_seedable_sampler (`bool`, *optional*, defaults to `True`):
                     Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
-                    training results are fully reproducable using a different sampling technique. While seed-to-seed results
-                    may differ, on average the differences are neglible when using multiple different seeds to compare. Should
+                    training results are fully reproducible using a different sampling technique. While seed-to-seed results
+                    may differ, on average the differences are negligible when using multiple different seeds to compare. Should
                     also be ran with [`~utils.set_seed`] for the best results.
 
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
@@ -1297,7 +1297,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": (
-                "Config to be used with the internal Accelerator object initializtion. The value is either a "
+                "Config to be used with the internal Accelerator object initialization. The value is either a "
                 "accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`."
             )
         },
@@ -1582,7 +1582,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": "Activates neftune noise embeddings into the model. NEFTune has been proven "
-            "to drastically improve model performances for instrcution fine-tuning. Check out the "
+            "to drastically improve model performances for instruction fine-tuning. Check out the "
             "original paper here: https://arxiv.org/abs/2310.05914 and the original code "
             "here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
         },
@@ -2289,7 +2289,7 @@ def _setup_devices(self) -> "torch.device":
                 )
                 if device.type != "mps":
                     raise ValueError(
-                        "Either you do not have an MPS-enabled device on this machine or MacOS version is not 12.3+ "
+                        "Either you do not have an MPS-enabled device on this machine or macOS version is not 12.3+ "
                         "or current PyTorch install was not built with MPS enabled."
                     )
             if device.type == "mps":
diff --git a/examples/cpu/inference/python/models/yolov7/README.md b/examples/cpu/inference/python/models/yolov7/README.md
index 17c30cb6d..7a31038f6 100644
--- a/examples/cpu/inference/python/models/yolov7/README.md
+++ b/examples/cpu/inference/python/models/yolov7/README.md
@@ -57,7 +57,7 @@
     ./setup.sh
     ```
 
-3. Setup required environment paramaters
+3. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/yolov7/inference.py b/examples/cpu/inference/python/models/yolov7/inference.py
index 03a1d2eb6..68819dc2a 100644
--- a/examples/cpu/inference/python/models/yolov7/inference.py
+++ b/examples/cpu/inference/python/models/yolov7/inference.py
@@ -281,7 +281,7 @@ def test(
     model = model.to(memory_format=torch.channels_last)
 
     if evaluate:
-        print("using offical pytorch model to do inference\n")
+        print("using official pytorch model to do inference\n")
         x = torch.rand(batch_size, 3, imgsz, imgsz).contiguous(
             memory_format=torch.channels_last
         )
diff --git a/examples/cpu/llm/fine-tuning/finetune.py b/examples/cpu/llm/fine-tuning/finetune.py
index 6d806ef74..84468507d 100644
--- a/examples/cpu/llm/fine-tuning/finetune.py
+++ b/examples/cpu/llm/fine-tuning/finetune.py
@@ -1,5 +1,5 @@
 """
-This script is adapted from the following official alpaca-loca fine-tuning code with minimal code changes:
+This script is adapted from the following official alpaca-lora fine-tuning code with minimal code changes:
 https://github.com/tloen/alpaca-lora/blob/main/finetune.py
 """
 
diff --git a/examples/cpu/llm/inference/README.md b/examples/cpu/llm/inference/README.md
index 0eecbb7d2..6444195e3 100644
--- a/examples/cpu/llm/inference/README.md
+++ b/examples/cpu/llm/inference/README.md
@@ -493,7 +493,7 @@ deepspeed --bind_cores_to_rank run.py -m  --benchm
 - Notes
 
 (1) Since the hugeness of the model size as well as the cache based optimizations, it is recommended to use a server with 1.5TB
-or larger memory amount. The memory comsumption optimizations are in progress.
+or larger memory amount. The memory consumption optimizations are in progress.
 
 (2) Please add `--num_accelerators` and `--bind_core_list` arguments for `deepspeed` command based on your SNC configurations.
 For example, for a server having 2 sockets, 128 physical cores per socket with a total number of 6 sub-numa clusters,
diff --git a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
index 0c1aec6c4..88761fd1f 100644
--- a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
+++ b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
@@ -247,7 +247,7 @@ def get_low_precision_checkpoint(args, model_config):
 
 def maybe_set_tp_grain_size(quant_config, ds_init_inf_kwargs):
     tp_grain_size = 64
-    # Need to check if this attr is available. Old DeepSpeep does not have it.
+    # Need to check if this attr is available. Old DeepSpeed does not have it.
     assert "tp_grain_size" in dir(
         deepspeed.inference.config.DeepSpeedTPConfig()
     ), "Old DeepSpeed version detected. Please update to the recommended version."
@@ -1714,7 +1714,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
                 and DEFAULT_IMAGE_TOKEN not in prompts_input
             ):
                 """
-                Three senarios:
+                Three scenarios:
                 1. No image, and there for, no image token should be added.
                 2. image token is already specified in the context, so we don't need to add it.
                 3. image token is not specified in the context and there is image inputs, so we need to add it.
@@ -1857,7 +1857,7 @@ def _collate(x):
                         and DEFAULT_IMAGE_TOKEN not in context
                     ):
                         """
-                        Three senarios:
+                        Three scenarios:
                         1. No image, and there for, no image token should be added.
                         2. image token is already specified in the context, so we don't need to add it.
                         3. image token is not specified in the context and there is image inputs,
diff --git a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
index 30e04ff84..563b8f070 100644
--- a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
+++ b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
@@ -70,7 +70,7 @@ def str_to_kwargs(s):
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6b",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--vision-text-model",
@@ -619,7 +619,7 @@ def write_checkpoints_json():
     )
 
 tp_grain_size = 64
-# Need to check if this attr is available. Old DeepSpeep does not have it.
+# Need to check if this attr is available. Old DeepSpeed does not have it.
 assert "tp_grain_size" in dir(
     deepspeed.inference.config.DeepSpeedTPConfig()
 ), "Old DeepSpeed version detected. Please update to the recommended version."
@@ -871,7 +871,7 @@ def load_image(image_file):
                 ]
             prompt = current_prompt
         else:
-            raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.")
+            raise SystemExit("[ERROR] Please use --prompt if want to use custom input.")
 
         raw_image = load_image(args.image_url)
         raw_image = [raw_image] * test_bs
@@ -948,7 +948,7 @@ def download_and_open(url: str) -> Image.Image:
             else:
                 input_sentences.append(prompt_pool[model_type][args.input_tokens])
         else:
-            raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.")
+            raise SystemExit("[ERROR] Please use --prompt if want to use custom input.")
         if test_bs > len(input_sentences):
             # dynamically extend to support larger bs by repetition
             input_sentences *= math.ceil(test_bs / len(input_sentences))
@@ -1064,7 +1064,7 @@ def trace_handler(prof):
     generated, _ = generate()
     t_generate_span = time.time() - t_generate_start
     for i, o, _ in generated:
-        print_rank0(f"{'-'*60}\nin={i}\nout={o}\n")
+        print_rank0(f"{'-' * 60}\nin={i}\nout={o}\n")
 
 # benchmark it!
 else:
diff --git a/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb b/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb
index 1aef9adb7..44eab465a 100644
--- a/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb
+++ b/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb
@@ -126,7 +126,7 @@
     "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
     "model = model.eval()\n",
     "\n",
-    "# Customizeable hyperparamters\n",
+    "# Customizable hyperparameters\n",
     "batch_size = 1\n",
     "num_beams = 1\n",
     "generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=num_beams)"
@@ -342,7 +342,7 @@
    "metadata": {},
    "source": [
     "### Running ipex.llm in a Distributed Manner\n",
-    "Running ipex.llm in a distributed manner allows you to utlize all available cores more effectively. This is done using DeepSpeed. It is recommended to shard the model weight sizes for better memory usage when running with DeepSpeed. Sharding only needs to be done once. On subsequent runs, remove \"--shard-model\" and replace \"-m \\\" with \"-m \\\"."
+    "Running ipex.llm in a distributed manner allows you to utilize all available cores more effectively. This is done using DeepSpeed. It is recommended to shard the model weight sizes for better memory usage when running with DeepSpeed. Sharding only needs to be done once. On subsequent runs, remove \"--shard-model\" and replace \"-m \\\" with \"-m \\\"."
    ]
   },
   {
diff --git a/examples/cpu/llm/inference/single_instance/run_accuracy.py b/examples/cpu/llm/inference/single_instance/run_accuracy.py
index 9593268d1..698f91a89 100644
--- a/examples/cpu/llm/inference/single_instance/run_accuracy.py
+++ b/examples/cpu/llm/inference/single_instance/run_accuracy.py
@@ -1217,7 +1217,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
                 and DEFAULT_IMAGE_TOKEN not in prompts_input
             ):
                 """
-                Three senarios:
+                Three scenarios:
                 1. No image, and there for, no image token should be added.
                 2. image token is already specified in the context, so we don't need to add it.
                 3. image token is not specified in the context and there is image inputs, so we need to add it.
@@ -1361,7 +1361,7 @@ def _collate(x):
                         and DEFAULT_IMAGE_TOKEN not in context
                     ):
                         """
-                        Three senarios:
+                        Three scenarios:
                         1. No image, and there for, no image token should be added.
                         2. image token is already specified in the context, so we don't need to add it.
                         3. image token is not specified in the context and there is image inputs, so we need to add it.
diff --git a/examples/cpu/llm/inference/single_instance/run_generation.py b/examples/cpu/llm/inference/single_instance/run_generation.py
index d6c01a881..ab33bf5ff 100644
--- a/examples/cpu/llm/inference/single_instance/run_generation.py
+++ b/examples/cpu/llm/inference/single_instance/run_generation.py
@@ -45,7 +45,7 @@ def str_to_kwargs(s):
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6B",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--dtype",
@@ -474,7 +474,7 @@ def trace_handler(prof):
                     prompt = prompt_pool[model_type][args.input_tokens]
             else:
                 raise SystemExit(
-                    "[ERROR] Plese use --prompt if want to use custom input."
+                    "[ERROR] Please use --prompt if want to use custom input."
                 )
             if model_type == "mllama":
                 raw_image = load_image(args.image_url)
diff --git a/examples/cpu/llm/inference/single_instance/run_quantization.py b/examples/cpu/llm/inference/single_instance/run_quantization.py
index 52183bdee..c069c33af 100644
--- a/examples/cpu/llm/inference/single_instance/run_quantization.py
+++ b/examples/cpu/llm/inference/single_instance/run_quantization.py
@@ -1619,7 +1619,7 @@ def calib_func(prepared_model):
                     prompt = prompt_pool[model.name][args.input_tokens]
             else:
                 raise SystemExit(
-                    "[ERROR] Plese use --prompt if want to use custom input."
+                    "[ERROR] Please use --prompt if want to use custom input."
                 )
 
             if model.name == "mllama":
diff --git a/examples/cpu/llm/inference/utils/create_shard_model.py b/examples/cpu/llm/inference/utils/create_shard_model.py
index cef4f1e3d..338bd7c3e 100644
--- a/examples/cpu/llm/inference/utils/create_shard_model.py
+++ b/examples/cpu/llm/inference/utils/create_shard_model.py
@@ -17,7 +17,7 @@
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6B",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--save-path",
diff --git a/examples/cpu/llm/tools/env_setup.sh b/examples/cpu/llm/tools/env_setup.sh
index 743d66cec..b8dbef0b5 100644
--- a/examples/cpu/llm/tools/env_setup.sh
+++ b/examples/cpu/llm/tools/env_setup.sh
@@ -29,7 +29,7 @@ if [ ! -f ${WHEELFOLDER}/lm_eval*.whl ]; then
     (( MODE |= 0x02 ))
 fi
 
-# Check existance of required Linux commands
+# Check existence of required Linux commands
 for CMD in gcc g++; do
     command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" is required."; exit 1;)
 done
@@ -57,7 +57,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then
     # Enter IPEX parent dir
     cd ..
 
-    # Check existance of required Linux commands
+    # Check existence of required Linux commands
     for CMD in make git; do
         command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" is required."; exit 3;)
     done
diff --git a/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html b/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html
index 2dd4747bb..df0408eb3 100644
--- a/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html
+++ b/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html
@@ -14455,7 +14455,7 @@