diff --git a/CMakeLists.txt b/CMakeLists.txt index 8cee416f8..b20caec8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,15 +31,15 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INSTALL_RPATH $ORIGIN) # Need the torch package -set(Torch_COMP_VERION "${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR}") -find_package(Torch ${Torch_COMP_VERION} REQUIRED) +set(Torch_COMP_VERSION "${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR}") +find_package(Torch ${Torch_COMP_VERSION} REQUIRED) if(NOT EXISTS ${TORCH_INSTALL_PREFIX}) message(FATAL_ERROR "Can NOT find torch install path at ${TORCH_INSTALL_PREFIX}!") endif() -if(NOT ${Torch_COMP_VERION} VERSION_EQUAL "${Torch_VERSION_MAJOR}.${Torch_VERSION_MINOR}") - message(FATAL_ERROR "Not compatible Torch version ${Torch_VERSION} at ${TORCH_INSTALL_PREFIX}!\nTorch ${Torch_COMP_VERION} is needed!") +if(NOT ${Torch_COMP_VERSION} VERSION_EQUAL "${Torch_VERSION_MAJOR}.${Torch_VERSION_MINOR}") + message(FATAL_ERROR "Not compatible Torch version ${Torch_VERSION} at ${TORCH_INSTALL_PREFIX}!\nTorch ${Torch_COMP_VERSION} is needed!") endif() include(${IPEX_ROOT_DIR}/cmake/Options.cmake) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 23418d006..c3374bf53 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -75,7 +75,7 @@ In case you want to reinstall, make sure that you uninstall Intel® Extension fo ### Tips and Debugging -* A prerequisite to installing Intel® Extension for PyTorch\* is CMake. We recommend installing it with [Homebrew](https://brew.sh/) with `brew install cmake` if you are developing on MacOS or Linux system. +* A prerequisite to installing Intel® Extension for PyTorch\* is CMake. We recommend installing it with [Homebrew](https://brew.sh/) with `brew install cmake` if you are developing on macOS or Linux system. * Our `setup.py` requires Python >= 3.6 * If you run into errors when running `python setup.py develop`, here are some debugging steps: 1. 
Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. diff --git a/cmake/cppsdk/gen_self_extract.sh.in b/cmake/cppsdk/gen_self_extract.sh.in index 1a279087c..ca9a064ea 100755 --- a/cmake/cppsdk/gen_self_extract.sh.in +++ b/cmake/cppsdk/gen_self_extract.sh.in @@ -32,5 +32,5 @@ if [ $? -gt 0 ]; then exit 23 fi -echo "Successfully generate self-extacting package at ${LIBIPEX_INSTALL_SCRIPT}" +echo "Successfully generate self-extracting package at ${LIBIPEX_INSTALL_SCRIPT}" exit diff --git a/cmake/cppsdk/libintel-ext-pt.installer.sh.in b/cmake/cppsdk/libintel-ext-pt.installer.sh.in index 066007f1b..35cc4d1f7 100644 --- a/cmake/cppsdk/libintel-ext-pt.installer.sh.in +++ b/cmake/cppsdk/libintel-ext-pt.installer.sh.in @@ -119,7 +119,7 @@ if [[ ${COMMAND} == "install" ]]; then echo "f|${comp}" >> ${LIBTORCH_PATH}/${LOGFILE} done - echo "Installation successed!" + echo "Installation succeeded!" # LIBIPEX Uninstallation elif [[ ${COMMAND} == "uninstall" ]]; then @@ -144,7 +144,7 @@ elif [[ ${COMMAND} == "uninstall" ]]; then rm -f ${LIBTORCH_PATH}/${LOGFILE} fi - echo "Uninstallation successed!" + echo "Uninstallation succeeded!" fi exit diff --git a/cmake/cpu/IsaCodegen.cmake b/cmake/cpu/IsaCodegen.cmake index 8ab710002..b7c45831e 100644 --- a/cmake/cpu/IsaCodegen.cmake +++ b/cmake/cpu/IsaCodegen.cmake @@ -141,7 +141,7 @@ endif(CXX_AVX2_FOUND) list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES) math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1") -# The sources list might get reordered later based on the capabilites. +# The sources list might get reordered later based on the capabilities. 
# See NOTE [ Linking AVX and non-AVX files ] foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) foreach(IMPL ${cpu_kernel_cpp_in}) diff --git a/csrc/cpu/aten/AveragePool.cpp b/csrc/cpu/aten/AveragePool.cpp index fcb85120b..164b43973 100644 --- a/csrc/cpu/aten/AveragePool.cpp +++ b/csrc/cpu/aten/AveragePool.cpp @@ -308,7 +308,7 @@ at::Tensor avg_pool3d_out_cpu( } else { TORCH_CHECK( false, - "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); + "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); } TORCH_CHECK( @@ -459,7 +459,7 @@ at::Tensor avg_pool3d_backward_out_cpu( } else { TORCH_CHECK( false, - "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); + "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); } TORCH_CHECK( diff --git a/csrc/cpu/aten/Conv.cpp b/csrc/cpu/aten/Conv.cpp index fcf39d600..16874f670 100644 --- a/csrc/cpu/aten/Conv.cpp +++ b/csrc/cpu/aten/Conv.cpp @@ -104,7 +104,7 @@ at::Tensor convolution_kernel( at::MemoryFormat memory_format) { // Base convolution kernel, this base kernel will not change input's format, // so make sure you has make process the input's format before call this - // function, the output wil has same format with input. + // function, the output will have the same format as the input. // TODO: the input will be actively converted to channels last format // after the 5-D tensor supports channels last format. 
TORCH_CHECK( diff --git a/csrc/cpu/aten/ConvTranspose.cpp b/csrc/cpu/aten/ConvTranspose.cpp index 799d04f21..6e1c3db47 100644 --- a/csrc/cpu/aten/ConvTranspose.cpp +++ b/csrc/cpu/aten/ConvTranspose.cpp @@ -36,7 +36,7 @@ std::vector conv_input_size( static inline std::vector padding_r( at::IntArrayRef padding, at::IntArrayRef output_padding) { - // ConvTranpose padding adjustment + // ConvTranspose padding adjustment // // PyTorch uses padding/output_padding: // osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) diff --git a/csrc/cpu/aten/DistributedMergedEmb.cpp b/csrc/cpu/aten/DistributedMergedEmb.cpp index f7f90035f..d432175b2 100644 --- a/csrc/cpu/aten/DistributedMergedEmb.cpp +++ b/csrc/cpu/aten/DistributedMergedEmb.cpp @@ -68,7 +68,7 @@ IPEX_DEFINE_DISPATCH(mergedemb_distribute_backward_merge_adagrad_update_stub); * distributed-merged-embedding-foward-lookup * 1. mergedemb_distribute_backward_local_cpu will finish the backward with * local grad (shape of [local BS * num_table * emb_dim]), the output grad will - * be organzied by 3 TensorList: val Tensors, idx Tensors, ofs Tensors. The + * be organized by 3 TensorList: val Tensors, idx Tensors, ofs Tensors. The * number of the Tensors in 1 TensorList equal to world size. val[i], idx[i], * ofs[i] is the tensors will be transfer to rank i by sparse all to all. It * contains the grads for those indices on rank i. diff --git a/csrc/cpu/aten/EmbeddingBag.cpp b/csrc/cpu/aten/EmbeddingBag.cpp index 1e93545ec..daf2b7caa 100644 --- a/csrc/cpu/aten/EmbeddingBag.cpp +++ b/csrc/cpu/aten/EmbeddingBag.cpp @@ -148,8 +148,8 @@ at::Tensor embedding_bag( } // namespace torch_ipex /* -A namespace wrapper to keep API compatiable to callers. -And also compatiable to new dyndisp. +A namespace wrapper to keep API compatible to callers. +And also compatible to new dyndisp. 
*/ namespace torch_ipex { diff --git a/csrc/cpu/aten/FlashAttention.cpp b/csrc/cpu/aten/FlashAttention.cpp index 14c2c4286..479576e01 100644 --- a/csrc/cpu/aten/FlashAttention.cpp +++ b/csrc/cpu/aten/FlashAttention.cpp @@ -20,7 +20,7 @@ bool use_ipex_flash_attention( } /* - *Caculate the flash attention SDPA with attention mask. + *Calculate the flash attention SDPA with attention mask. */ std::tuple flash_attention_forward_cpu( const at::Tensor& query, diff --git a/csrc/cpu/aten/LayerNorm.cpp b/csrc/cpu/aten/LayerNorm.cpp index 936d10cae..f8a4755c9 100644 --- a/csrc/cpu/aten/LayerNorm.cpp +++ b/csrc/cpu/aten/LayerNorm.cpp @@ -25,7 +25,7 @@ std::tuple layer_norm_impl( double eps) { TORCH_CHECK( gamma.scalar_type() == at::kFloat && beta.scalar_type() == at::kFloat, - "gamma adn beta's data type should be float"); + "gamma and beta's data type should be float"); ideep::tensor x = itensor_view_from_dense(X); const ideep::tensor scale = itensor_view_from_dense(gamma); const ideep::tensor shift = itensor_view_from_dense(beta); @@ -148,7 +148,7 @@ at::Tensor layer_norm_forward( * Now, we only use oneDNN kernel when both weight and bias are provided. * ToDo: more scenarios to use oneDNN or remvoe this pass * when at::layer_norm performance is back compared to w/o - * mergeing https://github.com/pytorch/pytorch/pull/59987 + * merging https://github.com/pytorch/pytorch/pull/59987 * * @param input: the source tensor to layernorm * @param normalized_shape: input shape from an expected input of size diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp index 23de79e57..82d4e11e9 100644 --- a/csrc/cpu/aten/Linear.cpp +++ b/csrc/cpu/aten/Linear.cpp @@ -45,7 +45,7 @@ void linear_kernel_output( dim == 2 ? self_ : self_.reshape({-1, self.size(self.dim() - 1)}); const ideep::tensor mkldnn_input = itensor_view_from_dense(self_reshaped); // output.sizes() will return a reference for output's size which will not - // hold the underlaying storage. 
It will be released if output are dead + // hold the underlying storage. It will be released if output are dead // (output = output.reshape(output_size_reshaped)) output.sizes().vec() will // trigger a copy and can hold the sizes vector. auto output_size = output.sizes().vec(); diff --git a/csrc/cpu/aten/MaskedMultiHeadAttention.cpp b/csrc/cpu/aten/MaskedMultiHeadAttention.cpp index 1989e3bbe..77f77fec9 100644 --- a/csrc/cpu/aten/MaskedMultiHeadAttention.cpp +++ b/csrc/cpu/aten/MaskedMultiHeadAttention.cpp @@ -10,7 +10,7 @@ IPEX_DEFINE_DISPATCH(deepseekv2_mla_kernel_stub); IPEX_DEFINE_DISPATCH(prepare_4d_causal_attention_mask_kernel_stub); /* - *Caculate the masked multihead attention for decoder layer in decoder only + *Calculate the masked multihead attention for decoder layer in decoder only *model. *@param query *@param key diff --git a/csrc/cpu/aten/MergedEmbeddingBag.h b/csrc/cpu/aten/MergedEmbeddingBag.h index 1687ce21f..4318d87bd 100644 --- a/csrc/cpu/aten/MergedEmbeddingBag.h +++ b/csrc/cpu/aten/MergedEmbeddingBag.h @@ -50,7 +50,7 @@ class EMBROWFixLen { * EmbeddingRowCache with smaller memory usage. * * EmbeddingRowCache contains var length EmbRow hash map and Fixed length EmbRow - * with len=64, 128, 256 And handle different lenght inside EmbeddingRowCache + * with len=64, 128, 256 And handle different length inside EmbeddingRowCache * without expose len info to users. * * The robin_hood::unordered_map _cached_ptr is used because user @@ -61,7 +61,7 @@ class EMBROWFixLen { * We will allocate memory to hold emb row very frequently during Embedding * FW/BW, we wish to allocate the memory on stack by using temporal varalble * instead of allocating them in heap for performance consideration. So we use C - * array to hold fixed length and use std::vector to hold var lenght + * array to hold fixed length and use std::vector to hold var length * (std::vector will use memory on heap). 
* * How to use: diff --git a/csrc/cpu/aten/PagedAttention.cpp b/csrc/cpu/aten/PagedAttention.cpp index ee19109e6..4395cee81 100644 --- a/csrc/cpu/aten/PagedAttention.cpp +++ b/csrc/cpu/aten/PagedAttention.cpp @@ -11,7 +11,7 @@ IPEX_DEFINE_DISPATCH(reshape_and_cache_kernel_stub); IPEX_DEFINE_DISPATCH(flash_attn_var_len_kernel_stub); /* - *Caculate the masked multihead attention for decoder layer in decoder only + *Calculate the masked multihead attention for decoder layer in decoder only */ at::Tensor single_query_cached_kv_attention_forward_cpu( at::Tensor& out, // [num_seqs, num_heads, head_size] diff --git a/csrc/cpu/aten/Punica.cpp b/csrc/cpu/aten/Punica.cpp index 0966b2145..a7121bacf 100644 --- a/csrc/cpu/aten/Punica.cpp +++ b/csrc/cpu/aten/Punica.cpp @@ -17,9 +17,9 @@ at::Tensor punica_bgmv_shrink_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, const double scale) { - punica_bgmv_shrink_kernel_stub(kCPU, out, input, weights, indicies, scale); + punica_bgmv_shrink_kernel_stub(kCPU, out, input, weights, indices, scale); return out; } @@ -27,11 +27,11 @@ at::Tensor punica_sgmv_shrink_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, const double scale) { punica_sgmv_shrink_kernel_stub( - kCPU, out, input, weights, indicies, seq_lens, scale); + kCPU, out, input, weights, indices, seq_lens, scale); return out; } @@ -39,10 +39,10 @@ at::Tensor punica_bgmv_expand_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, bool add_inputs) { punica_bgmv_expand_kernel_stub( - kCPU, out, input, weights, indicies, add_inputs); + kCPU, out, input, weights, indices, add_inputs); return out; } @@ -50,11 +50,11 @@ at::Tensor punica_sgmv_expand_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, 
at::Tensor& seq_lens, bool add_inputs) { punica_sgmv_expand_kernel_stub( - kCPU, out, input, weights, indicies, seq_lens, add_inputs); + kCPU, out, input, weights, indices, seq_lens, add_inputs); return out; } @@ -62,7 +62,7 @@ at::Tensor punica_bgmv_expand_slice_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, int64_t slice_offset, int64_t slice_size, bool add_inputs) { @@ -71,7 +71,7 @@ at::Tensor punica_bgmv_expand_slice_forward_cpu( out, input, weights, - indicies, + indices, slice_offset, slice_size, add_inputs); @@ -82,7 +82,7 @@ at::Tensor punica_sgmv_expand_slice_forward_cpu( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, int64_t slice_offset, int64_t slice_size, @@ -92,7 +92,7 @@ at::Tensor punica_sgmv_expand_slice_forward_cpu( out, input, weights, - indicies, + indices, seq_lens, slice_offset, slice_size, diff --git a/csrc/cpu/aten/Punica.h b/csrc/cpu/aten/Punica.h index 0bdaceabb..1fae98094 100644 --- a/csrc/cpu/aten/Punica.h +++ b/csrc/cpu/aten/Punica.h @@ -12,14 +12,14 @@ void punica_bgmv_shrink( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, const double scale); void punica_sgmv_shrink( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, const double scale); @@ -27,14 +27,14 @@ void punica_bgmv_expand( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, bool add_inputs); void punica_sgmv_expand( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, bool add_inputs); @@ -42,7 +42,7 @@ void punica_bgmv_expand_slice( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, int64_t slice_offset, int64_t slice_size, 
bool add_inputs); @@ -51,7 +51,7 @@ void punica_sgmv_expand_slice( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, int64_t slice_offset, int64_t slice_size, @@ -62,14 +62,14 @@ using punica_bgmv_shrink_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, const double scale); using punica_sgmv_shrink_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, const double scale); @@ -77,14 +77,14 @@ using punica_bgmv_expand_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, bool add_inputs); using punica_sgmv_expand_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, bool add_inputs); @@ -92,7 +92,7 @@ using punica_bgmv_expand_slice_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, int64_t slice_offset, int64_t slice_size, bool add_inputs); @@ -101,7 +101,7 @@ using punica_sgmv_expand_slice_fn = void (*)( at::Tensor& out, at::Tensor& input, at::Tensor& weights, - at::Tensor& indicies, + at::Tensor& indices, at::Tensor& seq_lens, int64_t slice_offset, int64_t slice_size, diff --git a/csrc/cpu/aten/RotaryPositionEmbedding.cpp b/csrc/cpu/aten/RotaryPositionEmbedding.cpp index 1046dd668..48f5cd018 100644 --- a/csrc/cpu/aten/RotaryPositionEmbedding.cpp +++ b/csrc/cpu/aten/RotaryPositionEmbedding.cpp @@ -1,5 +1,5 @@ -// The orginal python code can be found in +// The original python code can be found in // https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py // apply_rotary_pos_emb #include "RotaryPositionEmbedding.h" diff --git a/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp 
b/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp index 20b673c17..06bafe61b 100644 --- a/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp +++ b/csrc/cpu/aten/kernels/AddSoftmaxKrnl.cpp @@ -26,23 +26,23 @@ inline int64_t _calc_element_offset( inline std::vector _adjust_strides( const at::Tensor& src, - std::vector& infered_size) { + std::vector& inferred_size) { // We does NOT support broadcasting last dim which mean last_dim = 1 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.stride(src.ndimension() - 1) == 1); auto original_shape = src.sizes(); auto original_stride = src.strides(); - auto offset = infered_size.size() - original_shape.size(); + auto offset = inferred_size.size() - original_shape.size(); std::vector adjusted_stride; if (offset > 0) - adjusted_stride.resize(infered_size.size(), 0); + adjusted_stride.resize(inferred_size.size(), 0); else - adjusted_stride.resize(infered_size.size()); + adjusted_stride.resize(inferred_size.size()); for (size_t i = 0; i < original_shape.size(); i++) { // see NOTE: [Computing output strides] - if (original_shape[i] == 1 && infered_size[offset + i] != 1) { + if (original_shape[i] == 1 && inferred_size[offset + i] != 1) { adjusted_stride[offset + i] = 0; } else { adjusted_stride[offset + i] = original_stride[i]; @@ -54,7 +54,7 @@ inline std::vector _adjust_strides( /** * @brief Fuse the div (div scalar or mul 1/scalar) add operator and softmax - * operator. softmax(alpah * a + b) + * operator. softmax(alpha * a + b) * * @attention * There are some assumptions for this operator. @@ -64,7 +64,7 @@ inline std::vector _adjust_strides( * - The input tensors are contiguous * - The number of the input tensor dimension should be >=2 * - Only the second input tensor is brodcastable - * - The datatype for inpusts(a,b) and output are same. + * - The datatype for inputs(a,b) and output are same. 
* * @param[in] a a contiguous tensor to be added * @param[in] b a tensor to be added while it should be broadcastable @@ -79,30 +79,30 @@ at::Tensor dil_div_add_softmax( scalar_t* b_data_base = b.data_ptr(); // Check if the tensor needs to be broadcasted - auto infered_size = a.sizes().vec(); - auto need_broadcast = (infered_size != b.sizes()); + auto inferred_size = a.sizes().vec(); + auto need_broadcast = (inferred_size != b.sizes()); if (need_broadcast) { - infered_size = at::infer_size(a.sizes(), b.sizes()); + inferred_size = at::infer_size(a.sizes(), b.sizes()); } at::Tensor output = at::empty_like(a); // Create an new tensor to store the output scalar_t* output_data_base = output.data_ptr(); // Calculate the strides for the input tensor - std::vector b_adjusted_strides = _adjust_strides(b, infered_size); + std::vector b_adjusted_strides = _adjust_strides(b, inferred_size); std::vector outer_size_per_dim; - int64_t dim_size = infered_size[infered_size.size() - 1]; + int64_t dim_size = inferred_size[inferred_size.size() - 1]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim_size != 1); int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. 
+ for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Record outer dimensions outer_size_per_dim.insert(outer_size_per_dim.begin(), outer_size); // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); @@ -138,7 +138,7 @@ at::Tensor dil_div_add_softmax( // val = sum(output_data) _dil_exp_reduce_sum_fusion_kernel( tmp_out_ptr, dim_size, tmp_out_ptr, val); - // Calculat the normalization [e^x / sum(e^x)]: + // Calculate the normalization [e^x / sum(e^x)]: // output_data = output_data / sum(output_data) _dil_normalization_kernel( tmp_out_ptr, val, dim_size, output_data_base + i * dim_size); @@ -170,27 +170,27 @@ at::Tensor& dil_add_softmax_(at::Tensor& a, const at::Tensor& b) { float* b_data_base = b.data_ptr(); // Check if the tensor needs to be broadcasted - auto infered_size = a.sizes().vec(); - auto need_broadcast = (infered_size != b.sizes()); + auto inferred_size = a.sizes().vec(); + auto need_broadcast = (inferred_size != b.sizes()); if (need_broadcast) { - infered_size = at::infer_size(a.sizes(), b.sizes()); + inferred_size = at::infer_size(a.sizes(), b.sizes()); } // Calculate the strides for the input tensor - std::vector b_adjusted_strides = _adjust_strides(b, infered_size); + std::vector b_adjusted_strides = _adjust_strides(b, inferred_size); std::vector outer_size_per_dim; - int64_t dim_size = infered_size[infered_size.size() - 1]; + int64_t dim_size = inferred_size[inferred_size.size() - 1]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim_size != 1); int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. 
+ for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Record outer dimensions outer_size_per_dim.insert(outer_size_per_dim.begin(), outer_size); // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); @@ -227,7 +227,7 @@ at::Tensor& dil_add_softmax_(at::Tensor& a, const at::Tensor& b) { dim_size, a_data_base + i * dim_size, val); - // Calculat the normalization [e^x / sum(e^x)]: + // Calculate the normalization [e^x / sum(e^x)]: // output_data = output_data / sum(output_data) _dil_normalization_kernel( diff --git a/csrc/cpu/aten/kernels/AddSwishKrnl.cpp b/csrc/cpu/aten/kernels/AddSwishKrnl.cpp index 4e0984793..36832b2c0 100644 --- a/csrc/cpu/aten/kernels/AddSwishKrnl.cpp +++ b/csrc/cpu/aten/kernels/AddSwishKrnl.cpp @@ -14,14 +14,14 @@ at::Tensor dil_add_swish(const at::Tensor& mm_output, const at::Tensor& bias) { scalar_t* mm_output_data_base = mm_output.data_ptr(); scalar_t* bias_data_base = bias.data_ptr(); - auto infered_size = mm_output.sizes().vec(); - int64_t dim_size = infered_size[infered_size.size() - 1]; + auto inferred_size = mm_output.sizes().vec(); + int64_t dim_size = inferred_size[inferred_size.size() - 1]; int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. 
+ for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); diff --git a/csrc/cpu/aten/kernels/CatKrnl.cpp b/csrc/cpu/aten/kernels/CatKrnl.cpp index 7c3a6f596..4edeccfdd 100644 --- a/csrc/cpu/aten/kernels/CatKrnl.cpp +++ b/csrc/cpu/aten/kernels/CatKrnl.cpp @@ -69,7 +69,7 @@ void cat_contig_firstdim_impl( // short input tensor list: parallel on dim_size (dim_size == ninputs * // input_dim_size). // - // note that prallel on ninputs may not have enough parallelism (e.g. + // note that parallel on ninputs may not have enough parallelism (e.g. // inputs == 2), also parallel on input_dim_size would trigger multiple // omp sessions, which has additional overhead. // @@ -340,14 +340,14 @@ void cpu_cat_contig_dispatch( int64_t dim_size = result.sizes()[dim]; int64_t outer_size = result.numel() / (dim_size * inner_size); - // Note on cat implementation choosen: + // Note on cat implementation chosen: // // In order to minimize overhead of meta info creation, pass down // `all_same_sizes_and_stride` to the kernel. `True` indicates all the input // tensors all have the same shape and stride. // // All kernels have a single omp loop (the non-contiguous path may have - // mutiple omp loops). All kernels trim grain_size in the parallel loop w.r.t. + // multiple omp loops). All kernels trim grain_size in the parallel loop w.r.t. // `at::internal::GRAIN_SIZE`. // // 1. 
`cat_contig_firstdim_impl`: used when outer_size == 1 (dim is the first diff --git a/csrc/cpu/aten/kernels/DecodeKrnl.cpp b/csrc/cpu/aten/kernels/DecodeKrnl.cpp index b7633174e..5ebf17de2 100644 --- a/csrc/cpu/aten/kernels/DecodeKrnl.cpp +++ b/csrc/cpu/aten/kernels/DecodeKrnl.cpp @@ -82,7 +82,7 @@ inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) { #define CHECK_LAST_DIM_CONTIGUOUS(x) \ TORCH_CHECK( \ x.strides()[x.strides().size() - 1] == 1, \ - #x "must be contiguous at last dimention") + #x "must be contiguous at last dimension") #define CHECK_INPUT(x) \ CHECK_CPU(x); \ @@ -1409,7 +1409,7 @@ void decode_attention_kernel_impl( s_prime += at::vec::reduce_all( [](Vec& x, Vec& y) { return x + y; }, s_delta, n_size); m_prime = m_i; - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta index_gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_id * stride_kv1, @@ -1560,7 +1560,7 @@ void decode_attention_opt_kernel_impl( s_prime += at::vec::reduce_all( [](Vec& x, Vec& y) { return x + y; }, s_delta, n_size); m_prime = m_i; - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_id * stride_kv1 + n * stride_kv0, @@ -1736,7 +1736,7 @@ void decode_attention_grouped_kernel_impl( n_size); m_prime[h] = m_i; } - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta index_gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_kv_id * stride_kv1, @@ -1914,7 +1914,7 @@ void decode_attention_grouped_opt_kernel_impl( n_size); m_prime[h] = m_i; } - // caculate V' <- s_delta @ V + V' * m_delta + // calculate V' <- s_delta @ V + V' * m_delta gemm_kernel_nn( /* A */ s_delta, /* B */ kv_cache + head_kv_id * stride_kv2 + n * stride_kv0 + diff --git a/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp b/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp index 6cbd2066c..4eb73af2c 100644 --- 
a/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp +++ b/csrc/cpu/aten/kernels/DivSoftmaxKrnl.cpp @@ -10,7 +10,7 @@ namespace { #if defined(CPU_CAPABILITY_AVX512) using namespace torch_ipex::cpu::kernel; /** - * @brief This function is caculating the loop unit offset for current loop idx + * @brief This function is calculating the loop unit offset for current loop idx * element, and the loop is for reading a tensor with its last dim as the loop * unit * @param[in] outer_loop_idx the loop idx @@ -37,7 +37,7 @@ inline int64_t _calc_element_offset( /** * @brief This function is adjusting the strides of src tensor based on the - * target infered_size, and make sure the strides can be used for element + * target inferred_size, and make sure the strides can be used for element * offset calculation of broadcastable reading. * For example: * [56, 1, 128 ,128] broadcasting reading for [56, 12, 128, 128] @@ -46,33 +46,33 @@ inline int64_t _calc_element_offset( * input: * src tensor size [56, 1, 128 ,128] * src tensor strides [16384, 16384, 128 ,1] - * infered_size (target tensor size) [56, 12, 128, 128] + * inferred_size (target tensor size) [56, 12, 128, 128] * output: * [16384, 0, 128, 1] * @param[in] src original tensor that needs to be adjusted - * @param[in] infered_size the target size to be broadcasted + * @param[in] inferred_size the target size to be broadcasted * @return adjusted strides * @endcode */ inline std::vector _adjust_strides( const at::Tensor& src, - std::vector& infered_size) { + std::vector& inferred_size) { // We does NOT support broadcasting last dim which mean last_dim = 1 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.stride(src.ndimension() - 1) == 1); auto original_shape = src.sizes(); auto original_stride = src.strides(); - auto offset = infered_size.size() - original_shape.size(); + auto offset = inferred_size.size() - original_shape.size(); std::vector adjusted_stride; if (offset > 0) - adjusted_stride.resize(infered_size.size(), 0); + 
adjusted_stride.resize(inferred_size.size(), 0); else - adjusted_stride.resize(infered_size.size()); + adjusted_stride.resize(inferred_size.size()); for (size_t i = 0; i < original_shape.size(); i++) { // see NOTE: [Computing output strides] - if (original_shape[i] == 1 && infered_size[offset + i] != 1) { + if (original_shape[i] == 1 && inferred_size[offset + i] != 1) { adjusted_stride[offset + i] = 0; } else { adjusted_stride[offset + i] = original_stride[i]; @@ -93,7 +93,7 @@ inline std::vector _adjust_strides( * - The number of the input tensor dimension should be >=2 * - The mask b has the same dimension as a, or it can be expand_as a with (bs * :: seq_length), i.e., 2D tensor expands from mid dims - * - The datatype for inpust a and output are same. + * - The datatype for input a and output are same. * * @param[in] a a contiguous tensor to do div and softmax * @param[in] b a mask tensor to be masked_fill into tensor a after div and @@ -110,36 +110,36 @@ at::Tensor dil_div_maskfill_softmax( scalar_t* a_data_base = a.data_ptr(); float* b_data_base = b.data_ptr(); - auto infered_size = a.sizes().vec(); + auto inferred_size = a.sizes().vec(); - auto need_broadcast = a.dim() == b.dim() && (infered_size != b.sizes()); + auto need_broadcast = a.dim() == b.dim() && (inferred_size != b.sizes()); auto need_expand_from_2d = b.dim() == 2; if (need_broadcast) { - infered_size = at::infer_size(a.sizes(), b.sizes()); + inferred_size = at::infer_size(a.sizes(), b.sizes()); } // Calculate the strides for the input tensor - std::vector b_adjusted_strides = _adjust_strides(b, infered_size); + std::vector b_adjusted_strides = _adjust_strides(b, inferred_size); // Create an new tensor to store the output at::Tensor output = at::empty_like(a); scalar_t* output_data_base = output.data_ptr(); std::vector outer_size_per_dim; - int64_t dim_size = infered_size[infered_size.size() - 1]; + int64_t dim_size = inferred_size[inferred_size.size() - 1]; 
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim_size != 1); int64_t outer_size = 1; // The last dim is the loop unit. We need to minus 2 to exclude the last dim. - // infered_size.size() - 2 is the -2th dimension. - for (int64_t i = infered_size.size() - 2; i >= 0; i--) { + // inferred_size.size() - 2 is the -2th dimension. + for (int64_t i = inferred_size.size() - 2; i >= 0; i--) { // Record outer dimensions outer_size_per_dim.insert(outer_size_per_dim.begin(), outer_size); // Calculate outer loop number; - outer_size *= infered_size[i]; + outer_size *= inferred_size[i]; } - auto mask_offset = outer_size / infered_size[0]; + auto mask_offset = outer_size / inferred_size[0]; int64_t grain_size = at::internal::GRAIN_SIZE / (16 * dim_size); if (grain_size < 1) @@ -181,7 +181,7 @@ at::Tensor dil_div_maskfill_softmax( _dil_exp_reduce_sum_fusion_kernel( tmp_out_ptr, dim_size, tmp_out_ptr, val); - // Calculat the normalization [e^x / sum(e^x)]: + // Calculate the normalization [e^x / sum(e^x)]: // output_data = output_data / sum(output_data) _dil_normalization_kernel( diff --git a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp index 7cc0c4ec2..309ead6e6 100644 --- a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp @@ -390,7 +390,7 @@ inline void reshape_attn_mask_to_4d( } /* - *Caculate the flash attention SDPA. + *Calculate the flash attention SDPA. *@template scalar_t: q/k/v data type *@template q_split_size: q block size *@template kv_split_size: kv block size @@ -1411,7 +1411,7 @@ inline bool use_vnni( int64_t thresh_size = (dtype == at::kBFloat16) ? 64 : 16; bool need_pack = kvSize >= thresh_size && qSize >= thresh_size; // When the number of gemm is greater than the number of pack, - // the pack overhead can be overlaped. + // the pack overhead can be overlapped. 
if (need_pack) { double pack_size = batchSize * num_head * kvSize * headSize; double qs_per_thread = diff --git a/csrc/cpu/aten/kernels/GroupNormKrnl.cpp b/csrc/cpu/aten/kernels/GroupNormKrnl.cpp index a3b84e9be..e34856593 100644 --- a/csrc/cpu/aten/kernels/GroupNormKrnl.cpp +++ b/csrc/cpu/aten/kernels/GroupNormKrnl.cpp @@ -325,7 +325,7 @@ void GroupNormKernelImplChannelsLastInternal( const bool gamma_null = (gamma_data == nullptr); const bool beta_null = beta_data == nullptr; - // NB: About algorithm choosen: + // NB: About algorithm chosen: // // On channels last, GroupNorm has a input shape of {N, H, W, GD}, // Mean and rstd are collected per each n and g, which involves reduction @@ -462,7 +462,7 @@ void GroupNormKernelImplChannelsLastInternal( // // We could fuse step 3 and 4 into a single session but this way is better: // a. D might be too small for vectorization; - // b. Avoid duplicate caculation of scale/bias, each HxW plain share the + // b. Avoid duplicate calculation of scale/bias, each HxW plain share the // same scale/bias // for (const auto n : c10::irange(N)) { diff --git a/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp b/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp index 130b6b6e2..5a3df1947 100644 --- a/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp +++ b/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp @@ -156,7 +156,7 @@ static void index_select_gather_impl( outer_size, grain_size / (index_size * inner_size), [&](int64_t begin, int64_t end) { - // create the offset stencil for each row in outer dimenson, + // create the offset stencil for each row in outer dimension, // shared across {outer_size} std::unique_ptr index_buffer( new integer_t[index_size * inner_size]); @@ -213,7 +213,7 @@ void cpu_index_select_dispatch( check_indexarray_range(index_data, index_size, dim_size); - // Note on index_select implementation choosen: + // Note on index_select implementation chosen: // // 1. `index_select_gather_impl`: used when inner_size == 1 or 2. 
// inner_size == 1 indicates a gather across {index_size}, here diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index 4197e18e3..31da471e0 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -1118,7 +1118,7 @@ inline void copy_key_value( } /* - *The scale-dot product for indirect access kv chache and fuse + *The scale-dot product for indirect access kv cache and fuse *matmul+div+add+softmax to improve data reuse *@param query Query embeeding with the of [beam_size*batch, cur_len, head_num, *head_size] @@ -1128,7 +1128,7 @@ inline void copy_key_value( *head_size] *@param key_cache Cache past key embeeding with the of [max_len, *beam_size*batch, head_num, head_size] - *@param value_chache Cache past value embeeding with the of [max_len, + *@param value_cache Cache past value embeeding with the of [max_len, *beam_size*batch, head_num, head_size] *@param beam_idx Beam info for every token [max_len, beam_size*batch] *@param offset The length of decoded(past) token. 
@@ -1273,13 +1273,13 @@ scale_dot_product_for_indirect_access_kv_cache( auto k_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - k_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = head_group_start / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + head_group_start) * attn_w_strideH; for (auto ti = k_start; ti < k_start + block_size; ti++) { - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { for (auto bbi = 0; bbi < beam_size; bbi++) { @@ -1301,7 +1301,7 @@ scale_dot_product_for_indirect_access_kv_cache( true, kc_head_start); } - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto bi = bsi * beam_size; auto q_ptr_start = q_ptr + bi * qStrideB + head_group_start * qStrideH; @@ -1351,7 +1351,7 @@ scale_dot_product_for_indirect_access_kv_cache( auto beam = need_update_beam_idx && ti >= prompt_len ? 
new_beam_idx[bi][ti] : bsi * beam_size; - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { auto kc_head_start = k_cache_ptr + ti * kcStrideS + @@ -1366,7 +1366,7 @@ scale_dot_product_for_indirect_access_kv_cache( head_size, true, kc_head_start); - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto kc_head_start = k_cache_ptr + ti * kcStrideS + beam * kcStrideB + kv_hi * kcStrideH; reduce_head( @@ -1555,7 +1555,7 @@ scale_dot_product_for_indirect_access_kv_cache( auto v_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - v_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = hi / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + hi) * attn_w_strideH; @@ -1588,7 +1588,7 @@ scale_dot_product_for_indirect_access_kv_cache( flag_access_start); } } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token if (need_update_beam_idx && vi >= prompt_len) { for (auto bbi = 0; bbi < beam_size; bbi++) { auto bi = bsi * beam_size + bbi; @@ -1666,7 +1666,7 @@ scale_dot_product_for_indirect_access_kv_cache( auto beam = need_update_beam_idx && vi >= prompt_len ? 
new_beam_idx[bi][vi] : bsi * beam_size; - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && vi == offset) { auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + @@ -1684,7 +1684,7 @@ scale_dot_product_for_indirect_access_kv_cache( v_cache_head_start, flag_access_start); } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + beam * vcStrideB + kv_hi * vcStrideH; mul_attenion_weights_and_value_of_head( @@ -1880,13 +1880,13 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto k_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - k_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = head_group_start / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + head_group_start) * attn_w_strideH; for (auto ti = k_start; ti < k_start + block_size; ti++) { - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { for (auto bbi = 0; bbi < beam_size; bbi++) { @@ -1908,7 +1908,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( true, kc_head_start); } - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto bi = bsi * beam_size; auto q_ptr_start = q_ptr + bi * qStrideB + head_group_start * qStrideH; @@ -1958,7 +1958,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto beam = need_update_beam_idx && ti >= prompt_len ? 
new_beam_idx[bi][ti] : bsi * beam_size; - // caculate the innerproduct for the current token and store the + // calculate the innerproduct for the current token and store the // key if (offset > 0 && ti == query_ti + offset) { auto kc_head_start = k_cache_ptr + ti * kcStrideS + @@ -1973,7 +1973,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( head_size, true, kc_head_start); - } else { // caculate the innerproduct for the past token + } else { // calculate the innerproduct for the past token auto kc_head_start = k_cache_ptr + ti * kcStrideS + beam * kcStrideB + kv_hi * kcStrideH; reduce_head_half( @@ -2081,12 +2081,12 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto v_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - v_start); auto query_ti = 0; - // maping the query head to key/value head to support MGA/MQA + // mapping the query head to key/value head to support MGA/MQA auto kv_hi = hi / group_size; if (chg_attn_w_layout) { auto attn_w_stride = (bsi * head_num + hi) * attn_w_strideH; for (auto vi = v_start; vi < v_start + block_size; vi++) { - // caculate the attention values for the current token + // calculate the attention values for the current token if (offset > 0 && vi == offset) { for (auto bbi = 0; bbi < beam_size; bbi++) { auto bi = bsi * beam_size + bbi; @@ -2115,7 +2115,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( flag_access_start); } } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token if (need_update_beam_idx && vi >= prompt_len) { for (auto bbi = 0; bbi < beam_size; bbi++) { auto bi = bsi * beam_size + bbi; @@ -2193,7 +2193,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto beam = need_update_beam_idx && vi >= prompt_len ? 
new_beam_idx[bi][vi] : bsi * beam_size; - // caculate the attention values for the current token + // calculate the attention values for the current token if (offset > 0 && vi == offset) { auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + bi * vcStrideB + kv_hi * vcStrideH; @@ -2210,7 +2210,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( v_cache_head_start, flag_access_start); } else { - // caculate the innerproduct for the past token + // calculate the innerproduct for the past token auto v_cache_head_start = v_cache_ptr + vi * vcStrideS + beam * vcStrideB + kv_hi * vcStrideH; mul_attenion_weights_and_value_of_head_half( @@ -2684,7 +2684,7 @@ masked_multihead_self_attention_kernel_impl( } beam_idx_access[max_positions][0] = cur_len; // record the prompt token len beam_idx_access[max_positions + 1][0] = - query.size(0); // record the promt bs info + query.size(0); // record the prompt bs info } else if (offset > 0 && offset + cur_len > cache_size) { auto new_cache_size = cache_size * 2; @@ -2726,7 +2726,7 @@ masked_multihead_self_attention_kernel_impl( offset, scale_attn, attention_mask_v); - // just a funcationality path,need to optimize + // just a functionality path, need to optimize auto tokens_outs = std::vector(cur_len); for (auto i = 0; i < cur_len; i++) { auto query_i = query.select(1, i).unsqueeze(1); @@ -2934,7 +2934,7 @@ deepseekv2_mla_kernel_impl( } beam_idx_access[max_positions][0] = cur_len; // record the prompt token len beam_idx_access[max_positions + 1][0] = - query.size(0); // record the promt bs info + query.size(0); // record the prompt bs info } else if (offset > 0 && offset + cur_len > cache_size) { auto new_cache_size = cache_size * 2; diff --git a/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp b/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp index 1ee50613f..37ac26406 100644 --- a/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp +++ b/csrc/cpu/aten/kernels/MergedEmbeddingBagKrnl.cpp @@ -603,10 +603,10 @@ std::vector 
merged_embeddingbag_forward_cpu_kernel_impl( /** * Read from embedding table, and write to world_size * num_chk * num_emb's *EmbeddingRowCache world_size dimension decide which ranks should this - *particial look up result sent to num_emb dimmension devide which emb table + *partial look up result sent to num_emb dimension divide which emb table *should this particial look up result belong to num_chk dimension is hard code *to 16 here for better parallel scope, list 3 parallel choices: - *(1) Only parallel on num_emb, this limite the thread nums == num_emb + *(1) Only parallel on num_emb, this limits the thread nums == num_emb *(2) Parallel on num_emb and gbatch. Total tasks = num_emb * gbatch *(3) Parallel on num_emb and num_chk. Total tasks = num_emb * num_chk * @@ -782,7 +782,7 @@ mergedemb_distribute_forward_local_kernel_impl( indices_ptr[i] = indices[i].data_ptr(); offsets_ptr[i] = offsets[i].data_ptr(); } - // read from weight and accumuate in emb cache + // read from weight and accumulate in emb cache int64_t num_chk = 16; std::vector> cache_with_num_chk( world_size * num_chk * num_emb); @@ -807,7 +807,7 @@ mergedemb_distribute_forward_local_kernel_impl( emb_dim, world_size); // read from emb cache and write to the buffer while will be - // comunicated with other ranks + // communicated with other ranks prepare_ccl_buffer( idx, val, @@ -884,7 +884,7 @@ void mergedemb_distribute_forward_merge_kernel_impl( val_ptr[i] = val[i].data_ptr(); ofs_ptr[i] = ofs[i].data_ptr(); } - // read from weight and accumuate in emb cache + // read from weight and accumulate in emb cache mergedemb_distribute_forward_merge( world_size, num_emb, diff --git a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp index a1be9ea21..c758a975b 100644 --- a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp @@ -1532,7 +1532,7 @@ void reshape_and_cache_kernel( * seqlen_q = 5 and seqlen_k = 2, the causal 
mask is: 0 0 0 0 0 0 1 0 1 1 If the * row of the mask is all zero, the output will be zero. * - * For the chuned prefill case, the data layout is as follow: + * For the chunked prefill case, the data layout is as follows: * * Definition of context_len, query_len, and seq_len. |---------- N-1 iteration --------| @@ -1556,9 +1556,9 @@ void flash_attn_varlen_kernel( at::Tensor& query, // [num_seqs, num_heads, head_size] at::Tensor& key_cache, // [num_blocks, num_heads, block_size, head_size] at::Tensor& value_cache, //[num_blocks, num_heads, block_size, head_size] - at::Tensor& cu_seqlens_q, // [batch_size+1] // the accumulted sequence + at::Tensor& cu_seqlens_q, // [batch_size+1] // the accumulated sequence // length of query - at::Tensor& cu_seqlens_k, // [batch_size+1] // the accumulted sequence + at::Tensor& cu_seqlens_k, // [batch_size+1] // the accumulated sequence // length of key(cached) int64_t max_seqlen_q, // max sequence length of query int64_t max_seqlens_k, // max sequence length of key and value(cached, @@ -2037,7 +2037,7 @@ void reshape_and_cache_cpu_kernel_impl( TORCH_CHECK( kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e5m2" || kv_cache_dtype == "auto", - "not supported kv_cahce_dtype"); + "not supported kv_cache_dtype"); RECORD_FUNCTION( "ipex::reshape_and_cache_cpu_kernel_impl", c10::ArrayRef({})); @@ -2095,7 +2095,7 @@ void flash_attn_varlen_cpu_kernel_impl( TORCH_CHECK( kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e5m2" || kv_cache_dtype == "auto", - "not supported kv_cahce_dtype"); + "not supported kv_cache_dtype"); RECORD_FUNCTION( "ipex::flash_attn_varlen_cpu_kernel_impl", c10::ArrayRef({})); diff --git a/csrc/cpu/aten/kernels/PunicaKrnl.cpp b/csrc/cpu/aten/kernels/PunicaKrnl.cpp index 93ce07c92..20471243a 100644 --- a/csrc/cpu/aten/kernels/PunicaKrnl.cpp +++ b/csrc/cpu/aten/kernels/PunicaKrnl.cpp @@ -32,7 +32,7 @@ namespace { template void _dot( - const T1* intput, + const T1* input, const T2* weight, T1* out, int64_t len, @@ -45,7 
+45,7 @@ void _dot( int64_t vec_size = 16; // 512/32 auto qk_sum_vec = _mm512_setzero_ps(); for (hsi = 0; hsi <= len - vec_size; hsi += vec_size) { - auto q_vec = _loadu(intput + hsi); + auto q_vec = _loadu(input + hsi); auto k_vec = _loadu(weight + hsi); qk_sum_vec = _mm512_fmadd_ps(q_vec, k_vec, qk_sum_vec); } @@ -68,7 +68,7 @@ void punica_bgmv_expand_slice_kernel( out, // [bs, output_size1] output_size1 >= slice_offset + slice_size at::Tensor& input, // [bs, max_rank] at::Tensor& weights, // [num_lora, hidden_size, max_rank] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] int64_t slice_offset, int64_t slice_size, bool add_inputs) { @@ -82,14 +82,14 @@ void punica_bgmv_expand_slice_kernel( TORCH_CHECK(slice_offset >= 0) TORCH_CHECK(slice_size == hidden_size) TORCH_CHECK(output_size1 >= slice_offset + slice_size); - TORCH_CHECK(batch_size == indicies.size(0)); + TORCH_CHECK(batch_size == indices.size(0)); TORCH_CHECK(batch_size == input.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -100,7 +100,7 @@ void punica_bgmv_expand_slice_kernel( for (int64_t h = 0; h < hidden_size; h++) { int64_t input_bs = limit ? 
0 : bs; int64_t weights_offset = - indicies_ptr[bs] * max_rank * hidden_size + h * max_rank; + indices_ptr[bs] * max_rank * hidden_size + h * max_rank; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + h + slice_offset; @@ -115,7 +115,7 @@ void punica_sgmv_expand_slice_kernel( out, // [bs, output_size1] output_size1 >= slice_offset + slice_size at::Tensor& input, // [bs, max_rank] at::Tensor& weights, // [num_lora, hidden_size, max_rank] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] int64_t slice_offset, int64_t slice_size, @@ -130,14 +130,14 @@ void punica_sgmv_expand_slice_kernel( TORCH_CHECK(slice_offset >= 0) TORCH_CHECK(slice_size == hidden_size) TORCH_CHECK(output_size1 >= slice_offset + slice_size); - TORCH_CHECK(seq_lens.size(0) == indicies.size(0)); + TORCH_CHECK(seq_lens.size(0) == indices.size(0)); TORCH_CHECK(batch_size == input.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -159,7 +159,7 @@ void punica_sgmv_expand_slice_kernel( int64_t bs = offset + s; int64_t input_bs = limit ? 
0 : bs; int64_t weights_offset = - indicies_ptr[seq_id] * max_rank * hidden_size + h * max_rank; + indices_ptr[seq_id] * max_rank * hidden_size + h * max_rank; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + h + slice_offset; @@ -175,7 +175,7 @@ void punica_bgmv_shrink_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] const double scale) { int64_t num_lora = weights.size(0); int64_t max_rank = weights.size(1); @@ -186,12 +186,12 @@ void punica_bgmv_shrink_kernel( TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); TORCH_CHECK(batch_size == input.size(0)); - TORCH_CHECK(batch_size == indicies.size(0)); + TORCH_CHECK(batch_size == indices.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -200,7 +200,7 @@ void punica_bgmv_shrink_kernel( for (int64_t bs = 0; bs < batch_size; bs++) { for (int64_t r = 0; r < max_rank; r++) { int64_t weights_offset = - indicies_ptr[bs] * max_rank * hidden_size + r * hidden_size; + indices_ptr[bs] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -215,7 +215,7 @@ void punica_sgmv_shrink_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& 
weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] const double scale) { int64_t num_lora = weights.size(0); @@ -227,12 +227,12 @@ void punica_sgmv_shrink_kernel( TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); TORCH_CHECK(batch_size == input.size(0)); - TORCH_CHECK(seq_lens.size(0) == indicies.size(0)); + TORCH_CHECK(seq_lens.size(0) == indices.size(0)); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -252,7 +252,7 @@ void punica_sgmv_shrink_kernel( int64_t offset = seq_id == 0 ? 0 : offsets_ptr[seq_id - 1]; int64_t bs = offset + s; int64_t weights_offset = - indicies_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; + indices_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -268,7 +268,7 @@ void punica_bgmv_expand_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] bool add_inputs) { int64_t num_lora = weights.size(0); int64_t max_rank = weights.size(1); @@ -278,13 +278,13 @@ void punica_bgmv_expand_kernel( int64_t input_size1 = input.size(1); TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); - TORCH_CHECK(batch_size == indicies.size(0)); + TORCH_CHECK(batch_size == indices.size(0)); TORCH_CHECK(batch_size == 
input.size(0) || input.size(0) == 1); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -295,7 +295,7 @@ void punica_bgmv_expand_kernel( for (int64_t r = 0; r < max_rank; r++) { int64_t input_bs = limit ? 0 : bs; int64_t weights_offset = - indicies_ptr[bs] * max_rank * hidden_size + r * hidden_size; + indices_ptr[bs] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -310,7 +310,7 @@ void punica_sgmv_expand_kernel( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] bool add_inputs) { int64_t num_lora = weights.size(0); @@ -321,13 +321,13 @@ void punica_sgmv_expand_kernel( int64_t input_size1 = input.size(1); TORCH_CHECK(input_size1 >= hidden_size); TORCH_CHECK(output_size1 >= max_rank); - TORCH_CHECK(seq_lens.size(0) == indicies.size(0)); + TORCH_CHECK(seq_lens.size(0) == indices.size(0)); TORCH_CHECK(batch_size == input.size(0) || input.size(0) == 1); TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(weights.is_contiguous()); - TORCH_CHECK(indicies.is_contiguous()); + TORCH_CHECK(indices.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - int64_t* indicies_ptr = indicies.data_ptr(); + int64_t* indices_ptr = indices.data_ptr(); T* out_ptr = out.data_ptr(); T* input_ptr = input.data_ptr(); T* weights_ptr = weights.data_ptr(); @@ -349,7 +349,7 @@ void 
punica_sgmv_expand_kernel( int64_t bs = offset + s; int64_t input_bs = limit ? 0 : bs; int64_t weights_offset = - indicies_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; + indices_ptr[seq_id] * max_rank * hidden_size + r * hidden_size; T* weight_start = weights_ptr + weights_offset; T* input_start = input_ptr + input_bs * input_size1; T* out_start = out_ptr + bs * output_size1 + r; @@ -364,7 +364,7 @@ void punica_bgmv_shrink_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] const double scale) { RECORD_FUNCTION( "ipex::punica_bgmv_shrink_kernel_impl", c10::ArrayRef({})); @@ -377,12 +377,12 @@ void punica_bgmv_shrink_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_bgmv_shrink_kernel( - out, input, weights, indicies, scale); + out, input, weights, indices, scale); } else if (out.scalar_type() == at::kHalf) { - punica_bgmv_shrink_kernel(out, input, weights, indicies, scale); + punica_bgmv_shrink_kernel(out, input, weights, indices, scale); } } @@ -390,7 +390,7 @@ void punica_sgmv_shrink_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] const double scale) { RECORD_FUNCTION( @@ -404,14 +404,14 @@ void punica_sgmv_shrink_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input 
must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); - TORCH_CHECK(seq_lens.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); + TORCH_CHECK(seq_lens.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_sgmv_shrink_kernel( - out, input, weights, indicies, seq_lens, scale); + out, input, weights, indices, seq_lens, scale); } else if (out.scalar_type() == at::kHalf) { punica_sgmv_shrink_kernel( - out, input, weights, indicies, seq_lens, scale); + out, input, weights, indices, seq_lens, scale); } } @@ -420,7 +420,7 @@ void punica_bgmv_expand_kernel_impl( at::Tensor& input, // [bs, input_size1] or [1, input_size1] input_size1 >= // hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] bool add_inputs) { RECORD_FUNCTION( "ipex::punica_bgmv_expand_kernel_impl", c10::ArrayRef({})); @@ -433,13 +433,13 @@ void punica_bgmv_expand_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_bgmv_expand_kernel( - out, input, weights, indicies, add_inputs); + out, input, weights, indices, add_inputs); } else if (out.scalar_type() == at::kHalf) { punica_bgmv_expand_kernel( - out, input, weights, indicies, add_inputs); + out, input, weights, indices, add_inputs); } } @@ -448,7 +448,7 @@ void punica_sgmv_expand_kernel_impl( at::Tensor& input, // [bs, input_size1] or [1, input_size1] input_size1 >= // hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] at::Tensor& seq_lens, // [bs] bool add_inputs) { 
RECORD_FUNCTION( @@ -462,14 +462,14 @@ void punica_sgmv_expand_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); - TORCH_CHECK(seq_lens.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); + TORCH_CHECK(seq_lens.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_sgmv_expand_kernel( - out, input, weights, indicies, seq_lens, add_inputs); + out, input, weights, indices, seq_lens, add_inputs); } else if (out.scalar_type() == at::kHalf) { punica_sgmv_expand_kernel( - out, input, weights, indicies, seq_lens, add_inputs); + out, input, weights, indices, seq_lens, add_inputs); } } @@ -477,7 +477,7 @@ void punica_bgmv_expand_slice_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [bs] + at::Tensor& indices, // [bs] int64_t slice_offset, int64_t slice_size, bool add_inputs) { @@ -493,13 +493,13 @@ void punica_bgmv_expand_slice_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_bgmv_expand_slice_kernel( - out, input, weights, indicies, slice_offset, slice_size, add_inputs); + out, input, weights, indices, slice_offset, slice_size, add_inputs); } else if (out.scalar_type() == at::kHalf) { punica_bgmv_expand_slice_kernel( - out, input, weights, indicies, slice_offset, slice_size, add_inputs); + out, input, weights, indices, slice_offset, slice_size, add_inputs); } } @@ -507,7 +507,7 
@@ void punica_sgmv_expand_slice_kernel_impl( at::Tensor& out, // [bs, output_size1] output_size1 >= max_rank at::Tensor& input, // [bs, input_size1] input_size1 >= hidden_size at::Tensor& weights, // [num_lora, max_rank, hidden_size] - at::Tensor& indicies, // [num_seq] + at::Tensor& indices, // [num_seq] at::Tensor& seq_lens, // [num_seq] int64_t slice_offset, int64_t slice_size, @@ -524,14 +524,14 @@ void punica_sgmv_expand_slice_kernel_impl( TORCH_CHECK(out.dim() == 2, "out must be 2D"); TORCH_CHECK(input.dim() == 2, "input must be 2D"); TORCH_CHECK(weights.dim() == 3, "weights must be 3D"); - TORCH_CHECK(indicies.dim() == 1, "indicies must be 1D"); - TORCH_CHECK(seq_lens.dim() == 1, "indicies must be 1D"); + TORCH_CHECK(indices.dim() == 1, "indices must be 1D"); + TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be 1D"); if (out.scalar_type() == at::kBFloat16) { punica_sgmv_expand_slice_kernel( out, input, weights, - indicies, + indices, seq_lens, slice_offset, slice_size, @@ -541,7 +541,7 @@ void punica_sgmv_expand_slice_kernel_impl( out, input, weights, - indicies, + indices, seq_lens, slice_offset, slice_size, diff --git a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp index ad3a19686..15c961d44 100644 --- a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp +++ b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp @@ -34,7 +34,7 @@ bool is_fused_qkv(at::Tensor& t_in, int64_t hidden_size) { * @param t_pos The tensor containing the positions. t_pos should be [B][S] * where B is the batch size and S is the sequence length. In some cases, there * is only one element which the past_kv_length.In this case, position id can - * construced by past_kv_length + current_position + * constructed by past_kv_length + current_position * @param N The number of heads. * @param H The head size. * @param offset The offset value. 
For GPT-J 6B/ChatGLM, cos/sin is applied to diff --git a/csrc/cpu/isa/cpu_feature.cpp b/csrc/cpu/isa/cpu_feature.cpp index 2d0ebdb9a..e6bed25f5 100644 --- a/csrc/cpu/isa/cpu_feature.cpp +++ b/csrc/cpu/isa/cpu_feature.cpp @@ -373,7 +373,7 @@ bool CPUFeature::_do_check_and_init_amx() { } bool CPUFeature::isa_level_amx() { - // check and init in a funtion, avoid to double init. + // check and init in a function, to avoid double init. static bool b_is_support = _do_check_and_init_amx(); return b_is_support; diff --git a/csrc/cpu/isa/cpu_feature.hpp b/csrc/cpu/isa/cpu_feature.hpp index 598ad9cdf..cb414eb29 100644 --- a/csrc/cpu/isa/cpu_feature.hpp +++ b/csrc/cpu/isa/cpu_feature.hpp @@ -140,7 +140,7 @@ class CPUFeature { public: /* - isa level referance to oneDNN. + isa level reference to oneDNN. ------------------------------------------------------------------------------------ The ISAs are partially ordered: SSE41 < AVX < AVX2, diff --git a/csrc/cpu/jit/README.md b/csrc/cpu/jit/README.md index 707049b21..0242d8a33 100644 --- a/csrc/cpu/jit/README.md +++ b/csrc/cpu/jit/README.md @@ -1,3 +1,3 @@ # PyTorch JIT pass for DNNL -This folder contains experimental passes that optimize PyTorch Graph to utilize full power of DNNL (or other optimizations). It chose to use PyTorch namespace for eazy migration into main repo in the future. Abstract graph manipulation part of JIT should completely independent of other modules in extension, which means no reference to any symbols in other files of the project. +This folder contains experimental passes that optimize PyTorch Graph to utilize full power of DNNL (or other optimizations). It chose to use PyTorch namespace for easy migration into main repo in the future. Abstract graph manipulation part of JIT should be completely independent of other modules in extension, which means no reference to any symbols in other files of the project. 
diff --git a/csrc/cpu/jit/codegen/onednn/kernel.cpp b/csrc/cpu/jit/codegen/onednn/kernel.cpp index 8709d87b8..d5a74c88c 100644 --- a/csrc/cpu/jit/codegen/onednn/kernel.cpp +++ b/csrc/cpu/jit/codegen/onednn/kernel.cpp @@ -75,18 +75,18 @@ ArgSpec LlgaKernel::getQuantizedSpec(ArgSpec spec, size_t offset) const { return spec; } -std::map LlgaKernel::initializeTensorIdToOccurence() const { - std::map tensorIdToOccurence; +std::map LlgaKernel::initializeTensorIdToOccurrence() const { + std::map tensorIdToOccurrence; for (auto& lt : partition_.get_input_ports()) { auto inputId = lt.get_id(); - std::map::iterator it(tensorIdToOccurence.find(inputId)); - if (it != tensorIdToOccurence.end()) { + std::map::iterator it(tensorIdToOccurrence.find(inputId)); + if (it != tensorIdToOccurrence.end()) { it->second++; } else { - tensorIdToOccurence[inputId] = 1; + tensorIdToOccurrence[inputId] = 1; } } - return tensorIdToOccurence; + return tensorIdToOccurrence; } ArgSpecs LlgaKernel::initializeInputSpecs(const TensorArgs& inputs) { @@ -100,22 +100,22 @@ ArgSpecs LlgaKernel::initializeInputSpecs(const TensorArgs& inputs) { } }); GRAPH_DEBUG("Initializing graph input logical tensors"); - // initializeTensorIdToOccurence can also be called just once for the first + // initializeTensorIdToOccurrence can also be called just once for the first // input shape - std::map tensorIdToOccurence = - initializeTensorIdToOccurence(); + std::map tensorIdToOccurrence = + initializeTensorIdToOccurrence(); for (size_t i = 0; i < nGraphInputs_; i++) { auto spec = ArgSpec(graph_->inputs()[i]).supplementTensorInfo(inputs[i]); - int64_t occurence = tensorIdToOccurence[spec.tid()]; - inputSpecs.insert(inputSpecs.end(), occurence, spec); + int64_t occurrence = tensorIdToOccurrence[spec.tid()]; + inputSpecs.insert(inputSpecs.end(), occurrence, spec); } std::call_once(constantSpecInitializedFlag_, [&]() { for (size_t i = 0; i < nGraphInputs_; i++) { auto spec = 
ArgSpec(graph_->inputs()[i]).supplementTensorInfo(inputs[i]); - int64_t occurence = tensorIdToOccurence[spec.tid()]; + int64_t occurrence = tensorIdToOccurrence[spec.tid()]; initializedInputIds_.insert(spec.tid()); - runArgsIdx_.insert(runArgsIdx_.end(), occurence, i); + runArgsIdx_.insert(runArgsIdx_.end(), occurrence, i); } for (auto& lt : partition_.get_input_ports()) { auto inputId = lt.get_id(); @@ -291,7 +291,7 @@ void LlgaKernel::prepareAndCacheRunArgs( // Currently, only weight will use quantize_per_channel, data will // always use quantize_per_tensor. We will only allocate buffer for data // (output of a LlgaPartition). If in the future, we need allocate - // buffer for qensor that is quantized per channel, need implemeted + // buffer for qtensor that is quantized per channel, we need to implement // as_strided_qtensorimpl for PER_CHANNEL QScheme. qtensor.as_strided_(spec.sizes(), spec.strides()); outputs.push_back(qtensor); diff --git a/csrc/cpu/jit/codegen/onednn/kernel.h b/csrc/cpu/jit/codegen/onednn/kernel.h index d11496a55..56e918888 100644 --- a/csrc/cpu/jit/codegen/onednn/kernel.h +++ b/csrc/cpu/jit/codegen/onednn/kernel.h @@ -91,7 +91,7 @@ class LlgaKernel { // create qtensor for output of public format ArgSpec getQuantizedSpec(ArgSpec spec, size_t offset) const; - std::map initializeTensorIdToOccurence() const; + std::map initializeTensorIdToOccurrence() const; // PyTorch copy constants inside the subgraph instead of referencing them. // Constants inputs to the partition are no longer in the graph->inputs(). diff --git a/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp b/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp index 00a0311a7..b49e13b02 100644 --- a/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp +++ b/csrc/cpu/jit/codegen/onednn/prepare_dequant.cpp @@ -18,7 +18,7 @@ class OpSplitter { bool analyzeNode(Node* node) { // If node->kind() matches the NodeKind, the node will be a candidate to be - // splitted. 
If the input to the current node matches with InputKind, will + // split. If the input to the current node matches with InputKind, will // split the node static std::unordered_map> NodeKindToInputKind{ {aten::to, {Symbol::aten("dequantize")}}, diff --git a/csrc/cpu/jit/codegen/onednn/prepare_dequant.h b/csrc/cpu/jit/codegen/onednn/prepare_dequant.h index 490181d09..2595bb8ff 100644 --- a/csrc/cpu/jit/codegen/onednn/prepare_dequant.h +++ b/csrc/cpu/jit/codegen/onednn/prepare_dequant.h @@ -38,7 +38,7 @@ void PrepareDequantForLLGA(std::shared_ptr& graph); // PyTorch dequant node receives qtensor as input, thus no quantization-related // info (scales, zp, etc.) on the IR, while LLGA needs those info on the -// dequantize node. We add a pass to retreive the quantization info from the +// dequantize node. We add a pass to retrieve the quantization info from the // quantize node just before the dequantize node and save them on the dequantize // node. void SaveDequantInformation(std::shared_ptr& graph); diff --git a/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h b/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h index 2d2887cf6..cdef1d621 100644 --- a/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h +++ b/csrc/cpu/jit/cpu/kernels/ContextConvTranspose.h @@ -25,7 +25,7 @@ struct ContextConvTranspose final { std::vector dilation_; std::vector input_size_; int64_t groups_; - // The originin weight != weight_packed_.get_dims() since there is a tranpose + // The origin weight != weight_packed_.get_dims() since there is a transpose // for weight, We directly store origin_weight_dims_ here to avoid compute it. 
std::vector origin_weight_dims_; bool weight_is_channels_last_; diff --git a/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h b/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h index 752c6d144..253c18550 100644 --- a/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h +++ b/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.h @@ -116,7 +116,7 @@ at::Tensor& run( at::Tensor& accumu, const ideep::attr_t& attr); -// Runing backward for ConvTranspose by given grad_output, input and grad_masks. +// Running backward for ConvTranspose by given grad_output, input and grad_masks. // Will using the mkldnn_weight stored in the context std::tuple run_backward( ContextConvTranspose& context, diff --git a/csrc/cpu/jit/cpu/kernels/Einsum.cpp b/csrc/cpu/jit/cpu/kernels/Einsum.cpp index 7dc06547c..685d459b1 100644 --- a/csrc/cpu/jit/cpu/kernels/Einsum.cpp +++ b/csrc/cpu/jit/cpu/kernels/Einsum.cpp @@ -178,7 +178,7 @@ static Tensor sumproduct_pair( // we now work with the following permutations / shapes. // the pipeline is permute inputs -> reshape inputs -> batch matrix mul -> // reshape(view) output -> permute output output: "lro, lo, 1-for-summed-dims, - // ro" with orgiginal shape dimensions left: "lro, lo, summed" permuted with + // ro" with original shape dimensions left: "lro, lo, summed" permuted with // lpermutation and the three flattened right: "lro, summed, ro" permuted // with rpermutation and the three flattened then the permuted output is a // view of bmm(left, right) finally, opermutation reverts the permutation to @@ -192,7 +192,7 @@ static Tensor sumproduct_pair( for (auto& d : sum_dims_) { out_size.push_back(1); (void)(d); - }; // avoid warining about not using d + }; // avoid warning about not using d for (auto& d : ro) out_size.push_back(right.size(d)); std::vector lpermutation(lro); @@ -395,7 +395,7 @@ unsigned char einsum_index_to_label(uint8_t index) { *2) unsqueeze and permute the inputs/output to have same dims. The dim order * of all inputs and output is same. 
*\param equation: The subscripts for the Einstein summation. - *more detials about equation can found: + *more details about equation can be found: *https://pytorch.org/docs/stable/generated/torch.einsum.html *\param operands: The tensors to compute the Einstein summation of. *\return tuple& graph) { // TODO: Record original aten nodes, while convert aten linear-> ipex linear, // will ignore these aten linear (if they are fp32 dtype). For BF16 dtype, - // always use ipex linear. This is a temporay solution, for next PR to clean + // always use ipex linear. This is a temporary solution, for next PR to clean // up fusion pass, will further abstract this as a class method. auto aten_linear_recorder = ATenLinearRecorder(graph); // linear folding @@ -244,7 +244,7 @@ FusionBehavior getCurrentBehavior(size_t remaining_depth) { } } // should never get here - TORCH_WARN("Stratgy changed mid-invocation, NYI"); + TORCH_WARN("Strategy changed mid-invocation, NYI"); return FusionBehavior::STATIC; } diff --git a/csrc/cpu/jit/passes/concat_linear.cpp b/csrc/cpu/jit/passes/concat_linear.cpp index e5eee46cb..9cb9cc64f 100644 --- a/csrc/cpu/jit/passes/concat_linear.cpp +++ b/csrc/cpu/jit/passes/concat_linear.cpp @@ -113,7 +113,7 @@ class ConcatLinearLayers { TORCH_CHECK( (aten_linear.find(base_node) != aten_linear.end()) == (aten_linear.find(compatible_layers[i]) != aten_linear.end()), - "one of the layer is replaced by ipex linear while one of the other layer is original aten linear, it is ambiguity to know whether we shoudl create ipex linear or aten linear for concated linear") + "one of the layer is replaced by ipex linear while one of the other layer is original aten linear, it is ambiguous whether we should create ipex linear or aten linear for concated linear") } // Create concated aten linear if (aten_linear.find(base_node) != aten_linear.end()) { diff --git a/csrc/cpu/jit/passes/frozen_conv_folding.cpp b/csrc/cpu/jit/passes/frozen_conv_folding.cpp index 
8f004e87e..076b70c92 100644 --- a/csrc/cpu/jit/passes/frozen_conv_folding.cpp +++ b/csrc/cpu/jit/passes/frozen_conv_folding.cpp @@ -55,7 +55,7 @@ bool FoldFrozenConvBatchnorm(Block* b) { auto bn_rm_ivalue = bn->namedInput("running_mean"); auto bn_rv_ivalue = bn->namedInput("running_var"); // check running_mean and running_var has value, if they are - // None(track_running_stats=False), skiping the folding path. + // None(track_running_stats=False), skipping the folding path. if (bn_rm_ivalue->type() == NoneType::get() && bn_rv_ivalue->type() == NoneType::get()) { continue; diff --git a/csrc/cpu/jit/passes/frozen_linear_folding.cpp b/csrc/cpu/jit/passes/frozen_linear_folding.cpp index 69a47c7d5..0dcaae27c 100644 --- a/csrc/cpu/jit/passes/frozen_linear_folding.cpp +++ b/csrc/cpu/jit/passes/frozen_linear_folding.cpp @@ -105,7 +105,7 @@ bool FoldFrozenLinearBatchnorm(Block* b) { auto bn_rv_ivalue = bn->namedInput("running_var"); // check running_mean and running_var has value, if they are - // None(track_running_stats=False), skiping the folding path. + // None(track_running_stats=False), skipping the folding path. 
if (bn_rm_ivalue->type() == NoneType::get() && bn_rv_ivalue->type() == NoneType::get()) { continue; diff --git a/csrc/cpu/jit/passes/graph_rewrite.cpp b/csrc/cpu/jit/passes/graph_rewrite.cpp index c279daf6e..83e1e064c 100644 --- a/csrc/cpu/jit/passes/graph_rewrite.cpp +++ b/csrc/cpu/jit/passes/graph_rewrite.cpp @@ -14,7 +14,7 @@ using namespace torch::jit; // FuseShuffle is matching the channelshuffle pattern, where: // (1) the first view is [n, c, h, w] => [n, groups, c // groups, h, w] -// (2) the tranpose is for groups => [n, c // groups, grpups, h, w] +// (2) the transpose is for groups => [n, c // groups, groups, h, w] // (3) the output view shape should be the same as the input tensor shape void FuseShuffle(std::shared_ptr& graph) { // below is channelshuffle for staic view shape pattern @@ -98,7 +98,7 @@ void FuseShuffle(std::shared_ptr& graph) { trans_dim0_val < trans_dim1_val ? trans_dim0_val : trans_dim1_val; auto dim1_val = trans_dim0_val > trans_dim1_val ? trans_dim0_val : trans_dim1_val; - // If the tranpose if not for groups. ex. [n, c1, c2, h, w] => [n, c2, + // If the transpose is not for groups. ex. 
[n, c1, c2, h, w] => [n, c2, // c1, h, w] if ((dim1_val - dim0_val) != 1) { return false; @@ -140,7 +140,7 @@ void FuseShuffle(std::shared_ptr& graph) { for (int i = 0; i < flattern_shape_list.size(); i++) { if (flattern_shape_list[i] != inputTensor.sizes()[i].value()) { - // [n, c, h, w] => view [n, groups, c // groups, h, w] => tranpose + // [n, c, h, w] => view [n, groups, c // groups, h, w] => transpose // [n, c // groups, groups, h, w] // => view [n, -1, h, w] // or @@ -991,7 +991,7 @@ void replaceAddWithQAdd(std::shared_ptr& graph) { %qout = aten::quantize_per_tensor(%r, %o_scale, %o_zp, %o_dtype) return (%qout) )"; - // fliter the unsupported case + // filter the unsupported case auto fusion_filter = [](const Match& match, const std::unordered_map& vmap) { auto alpha = match.values_map.at(vmap.at("alpha")); @@ -1023,7 +1023,7 @@ void fuseBmmAdd(std::shared_ptr& graph) { graph(%input, %batch1, %batch2, %alpha): %res = ipex::bmm_add(%input, %batch1, %batch2, %alpha) return (%res))"; - // fliter the unsupported case + // filter the unsupported case auto fusion_filter = [](const Match& match, const std::unordered_map& vmap) { const auto& match_vmap = match.values_map; @@ -1108,7 +1108,7 @@ void FuseConcatBnRelu(std::shared_ptr& graph) { }; // Check if the dimension of the first tensor is either 4 or 5. // Check if the data type, the size of Channels, and the memory format are - // float, mutiples of 16, and ChannelsLast(3d), respectively. + // float, multiples of 16, and ChannelsLast(3d), respectively. if (!(tensor1->dim().value() == 4 || tensor1->dim().value() == 5) || !check_type_channelsize(*tensor1)) { return false; @@ -1298,7 +1298,7 @@ void FusePythonGELUWithAten(std::shared_ptr& graph) { SingleGeluTanh_v2.runOnGraph(graph, filter_v2); } -// This path will be removed after pytorch offical path is optimized well. +// This path will be removed after pytorch official path is optimized well. 
void replaceAtenMaxPool2dWithIpexMaxPool2d(std::shared_ptr& graph) { std::string max_pool2d = R"( graph(%a, %kernel_size:int[], %stride:int[], %padding:int[], %dilation:int[], %ceil_mode:bool): diff --git a/csrc/cpu/jit/passes/graph_rewrite.h b/csrc/cpu/jit/passes/graph_rewrite.h index 6ee12fd10..c352f9b32 100644 --- a/csrc/cpu/jit/passes/graph_rewrite.h +++ b/csrc/cpu/jit/passes/graph_rewrite.h @@ -15,7 +15,7 @@ void FuseShuffle(std::shared_ptr& graph); void PostScalarDivOrMul(std::shared_ptr& graph); void FuseMHAScoreCalc(std::shared_ptr& graph); void FuseLinearSwishCustomized(std::shared_ptr& graph); -// This path will be removed after pytorch offical path is optimized well. +// This path will be removed after pytorch official path is optimized well. void replaceAtenMaxPool2dWithIpexMaxPool2d( std::shared_ptr& graph); void fuseBmmAdd(std::shared_ptr& graph); diff --git a/csrc/cpu/jit/passes/graph_rewrite_helper.cpp b/csrc/cpu/jit/passes/graph_rewrite_helper.cpp index 8b7e34819..2ec8af99e 100644 --- a/csrc/cpu/jit/passes/graph_rewrite_helper.cpp +++ b/csrc/cpu/jit/passes/graph_rewrite_helper.cpp @@ -10,7 +10,7 @@ namespace graph_rewrite_helper { using namespace torch::jit; -// those code just copy from PyTorch offical and extend +// those code just copy from PyTorch official and extend // replaceConvolutionWithAtenConv to handle conv_transpose3d. Value* getValue( diff --git a/csrc/cpu/jit/passes/graph_rewrite_helper.h b/csrc/cpu/jit/passes/graph_rewrite_helper.h index 96763393d..ee0acdda5 100644 --- a/csrc/cpu/jit/passes/graph_rewrite_helper.h +++ b/csrc/cpu/jit/passes/graph_rewrite_helper.h @@ -10,7 +10,7 @@ namespace torch_ipex { namespace jit { namespace graph_rewrite_helper { -// those code just copy from PyTorch offical and extend +// those code just copy from PyTorch official and extend // replaceConvolutionWithAtenConv to handle conv_transpose3d. 
torch::jit::Value* getValue( diff --git a/csrc/cpu/runtime/CPUPool.cpp b/csrc/cpu/runtime/CPUPool.cpp index b9b3fc6b7..a72738035 100644 --- a/csrc/cpu/runtime/CPUPool.cpp +++ b/csrc/cpu/runtime/CPUPool.cpp @@ -100,7 +100,7 @@ std::vector init_process_available_cores() { if (is_runtime_ext_enabled()) { // When IOMP preloaded. // Step1: Get the main thread affinity information: - // 2 knowning external command may change it during process starts up: + // 2 known external commands may change it during process starts up: // * External Numactl. // * Preload IOMP with KMP_AFFINITY settings. // We need to save this information firstly and restore it later. @@ -299,7 +299,7 @@ void set_mask_affinity_from_cpu_pool(const CPUPool& cpu_pool) { omp_set_num_threads(threads_mask.size()); #pragma omp parallel num_threads(threads_mask.size()) { - // we will destory the mask inside the CPUPool deconstructor + // we will destroy the mask inside the CPUPool destructor int thread_id = omp_get_thread_num(); kmp_affinity_mask_t mask = threads_mask[thread_id]; kmp_set_affinity_ext(&mask); diff --git a/csrc/cpu/runtime/CPUPool.h b/csrc/cpu/runtime/CPUPool.h index ad501a27e..9c14d3ba4 100644 --- a/csrc/cpu/runtime/CPUPool.h +++ b/csrc/cpu/runtime/CPUPool.h @@ -43,9 +43,9 @@ class IPEX_API CPUPool { // Put deleted function into private. CPUPool() = delete; CPUPool(const CPUPool& source_cpu_pool) = - delete; // avoid potential risk of double destory masks. + delete; // avoid potential risk of double destroy masks. CPUPool& operator=(const CPUPool& source_cpu_pool) = - delete; // avoid potential risk of double destory masks. + delete; // avoid potential risk of double destroy masks. CPUPool& operator=(CPUPool&& source_cpu_pool) = delete; }; diff --git a/csrc/cpu/runtime/Task.h b/csrc/cpu/runtime/Task.h index 10af352e5..230290384 100644 --- a/csrc/cpu/runtime/Task.h +++ b/csrc/cpu/runtime/Task.h @@ -63,7 +63,7 @@ auto Task::operator()(Args&&...
args) // set the thread local status, such as the grad mode before execuating // the status at::GradMode::set_enabled(grad_mode); - // execuate the task + // execute the task (*task)(); }); } diff --git a/csrc/cpu/runtime/TaskExecutor.h b/csrc/cpu/runtime/TaskExecutor.h index 3ec0078b7..468177846 100644 --- a/csrc/cpu/runtime/TaskExecutor.h +++ b/csrc/cpu/runtime/TaskExecutor.h @@ -43,13 +43,13 @@ class IPEX_API TaskExecutor { // Put the deleted function in the private. TaskExecutor(const TaskExecutor& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. TaskExecutor(TaskExecutor&& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. TaskExecutor& operator=(const TaskExecutor& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. TaskExecutor& operator=(TaskExecutor&& task_executor) = - delete; // Not support copy or move construtor. + delete; // Not support copy or move constructor. 
}; } // namespace runtime diff --git a/csrc/cpu/tpp/ext_tpp.h b/csrc/cpu/tpp/ext_tpp.h index 958584363..1172bea29 100644 --- a/csrc/cpu/tpp/ext_tpp.h +++ b/csrc/cpu/tpp/ext_tpp.h @@ -33,7 +33,7 @@ class BrgemmExtTPP { auto dt_out = XsmmDtype(); if (dt_out == LIBXSMM_DATATYPE_F32 && c_trans == XformTPP::XFORM_N2V_TPP) { printf( - "Warning: reseting c_trans flag from N2V to None for FP32 output\n"); + "Warning: resetting c_trans flag from N2V to None for FP32 output\n"); c_trans = XformTPP::XFORM_NONE_TPP; } auto beta_ = beta; diff --git a/csrc/cpu/tpp/init.cpp b/csrc/cpu/tpp/init.cpp index d33b2df31..85d1ca7e0 100644 --- a/csrc/cpu/tpp/init.cpp +++ b/csrc/cpu/tpp/init.cpp @@ -72,7 +72,7 @@ void init_libxsmm() { auto max_threads = omp_get_max_threads(); PCL_ASSERT( max_threads <= MAX_THREADS, - "Maximun %d threads supported, %d threads being used, please compile with increased MAX_THREADS value\n", + "Maximum %d threads supported, %d threads being used, please compile with increased MAX_THREADS value\n", MAX_THREADS, max_threads); libxsmm_init(); diff --git a/csrc/cpu/tpp/par_loop_generator.cpp b/csrc/cpu/tpp/par_loop_generator.cpp index 52e4f0fcf..1a44f1733 100644 --- a/csrc/cpu/tpp/par_loop_generator.cpp +++ b/csrc/cpu/tpp/par_loop_generator.cpp @@ -52,7 +52,7 @@ typedef struct { int n_loops; loop_param_t* loop_params; int n_logical_loops; - char occurence_map[256]; + char occurrence_map[256]; int jit_loop_spec; int use_2d_par; int n_row_teams; @@ -350,7 +350,7 @@ void emit_loop_body(loop_code* i_code, char* body_func_name) { sizeof(tmp_buf), "%c%d", 'a' + i, - i_code->occurence_map['a' + i] - 1); + i_code->occurrence_map['a' + i] - 1); align_line(i_code); snprintf(tmp_buf, sizeof(tmp_buf), "idx[%d] = %s;\n", i, str_idx); add_buf_to_code(i_code, tmp_buf); @@ -607,7 +607,7 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { char term_func_name[64] = "term_func"; char spec_func_name[64] = "loop_rt_spec"; char loop_map[256]; - char 
occurence_map[256]; + char occurrence_map[256]; loop_code l_code; char* result_code; loop_param_t loop_params[256], cur_loop, loop_params_map[256]; @@ -699,12 +699,12 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { } /* Set up loop properties */ - std::fill_n(occurence_map, 256, 0); + std::fill_n(occurrence_map, 256, 0); for (i = 0; i < n_loops; i++) { int is_blocked = (loop_map[tolower(loop_nest_desc[i])] > 1) ? 1 : 0; int is_parallelizable = (tolower(loop_nest_desc[i]) != loop_nest_desc[i]) ? 1 : 0; - int occurence_id, is_blocked_outer; + int occurrence_id, is_blocked_outer; char idx_name[16]; char spec_array_name[512]; char start_var_name[512]; @@ -712,9 +712,9 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { char step_var_name[512]; int loop_abs_index = tolower(loop_nest_desc[i]) - 'a'; - occurence_id = occurence_map[tolower(loop_nest_desc[i])]; - is_blocked_outer = (occurence_id == 0) ? 1 : 0; - occurence_map[tolower(loop_nest_desc[i])]++; + occurrence_id = occurrence_map[tolower(loop_nest_desc[i])]; + is_blocked_outer = (occurrence_id == 0) ? 
1 : 0; + occurrence_map[tolower(loop_nest_desc[i])]++; snprintf(spec_array_name, sizeof(spec_array_name), "%s", spec_func_name); @@ -723,9 +723,9 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { sizeof(idx_name), "%c%d", tolower(loop_nest_desc[i]), - occurence_id); + occurrence_id); - if (occurence_id == 0) { + if (occurrence_id == 0) { if (loop_params_map[loop_abs_index].jit_start > 0) { snprintf( start_var_name, @@ -746,10 +746,10 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { sizeof(start_var_name), "%c%d", tolower(loop_nest_desc[i]), - occurence_id - 1); + occurrence_id - 1); } - if (occurence_id == 0) { + if (occurrence_id == 0) { if (loop_params_map[loop_abs_index].jit_end > 0) { snprintf( end_var_name, @@ -771,23 +771,23 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { sizeof(end_var_name), "%c%d + %d", tolower(loop_nest_desc[i]), - occurence_id - 1, - loop_params_map[loop_abs_index].block_size[occurence_id - 1]); + occurrence_id - 1, + loop_params_map[loop_abs_index].block_size[occurrence_id - 1]); } else { snprintf( end_var_name, sizeof(end_var_name), "%c%d + %s[%d].block_size[%d]", tolower(loop_nest_desc[i]), - occurence_id - 1, + occurrence_id - 1, spec_array_name, loop_abs_index, - occurence_id - 1); + occurrence_id - 1); } } if (is_blocked) { - if (occurence_id == loop_map[tolower(loop_nest_desc[i])] - 1) { + if (occurrence_id == loop_map[tolower(loop_nest_desc[i])] - 1) { if (loop_params_map[loop_abs_index].jit_step > 0) { snprintf( step_var_name, @@ -808,7 +808,7 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { step_var_name, sizeof(step_var_name), "%d", - loop_params_map[loop_abs_index].block_size[occurence_id]); + loop_params_map[loop_abs_index].block_size[occurrence_id]); } else { snprintf( step_var_name, @@ -816,7 +816,7 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { "%s[%d].block_size[%d]", spec_array_name, loop_abs_index, - 
occurence_id); + occurrence_id); } } } else { @@ -851,13 +851,13 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { /* Setup number of logical loops and the ocurence map */ n_logical_loops = 0; for (i = 0; i < 256; i++) { - if (occurence_map[i] > 0) { + if (occurrence_map[i] > 0) { n_logical_loops++; } } l_code.n_logical_loops = n_logical_loops; - memcpy(&l_code.occurence_map[0], occurence_map, 256); + memcpy(&l_code.occurrence_map[0], occurrence_map, 256); /* Emit function signature */ emit_func_signature( diff --git a/csrc/cpu/tpp/threaded_loops.h b/csrc/cpu/tpp/threaded_loops.h index 1c8b6830e..b8c7708e9 100644 --- a/csrc/cpu/tpp/threaded_loops.h +++ b/csrc/cpu/tpp/threaded_loops.h @@ -260,7 +260,7 @@ class ThreadedLoop { error: array initializer must be an initializer list So, now this->bounds is initialized by copy elements one by one This change leads to another problem: bounds is an array of LoopSpecs, - but LoopSpecs does not have a default consturctor. So, we added a + but LoopSpecs does not have a default constructor. So, we added a default constructor for LoopSpecs. */ ThreadedLoop(const LoopSpecs (&bounds)[N], std::string scheme = "") diff --git a/csrc/cpu/tpp/woq/dispatcher.h b/csrc/cpu/tpp/woq/dispatcher.h index 9cbdcf492..1cb22d23b 100644 --- a/csrc/cpu/tpp/woq/dispatcher.h +++ b/csrc/cpu/tpp/woq/dispatcher.h @@ -205,17 +205,17 @@ struct product_dispatcher_helper< template < typename... IntegralTypeProcessed, - typename... IntegeralTypeToProcess, + typename... IntegralTypeToProcess, typename... Dispatcher> struct product_dispatcher_helper< std::tuple, - std::tuple, + std::tuple, std::tuple> { template inline static void call( std::tuple dispatchers, std::tuple constants, - std::tuple integrals, + std::tuple integrals, const Lambda1& function, const Lambda2& fallback, Args... 
args) { @@ -252,20 +252,20 @@ template struct product_dispatcher; // dispatch to a carsian product of a list of integers to a lambda function -template +template struct product_dispatcher< - std::tuple, + std::tuple, std::tuple> { template inline static void call( - std::tuple integrals, + std::tuple integrals, const Lambda1& function, const Lambda2& fallback, Args... args) { static auto dispatchers = std::tuple{}; product_dispatcher_helper< std::tuple<>, - std::tuple, + std::tuple, std::tuple>:: call( dispatchers, diff --git a/csrc/cpu/tpp/xsmm_functors.h b/csrc/cpu/tpp/xsmm_functors.h index db68ed023..f85b8e741 100644 --- a/csrc/cpu/tpp/xsmm_functors.h +++ b/csrc/cpu/tpp/xsmm_functors.h @@ -1583,7 +1583,7 @@ class XformExtTPP { if (ignore_vnni_for_fp32 == false) { PCL_ASSERT( (xtype == XformTPP::XFORM_XPOSE_TPP || dtype != LIBXSMM_DATATYPE_F32), - "Only Transpose Xofrm supportd for FP32 datatype, specified %d\n", + "Only Transpose Xofrm supported for FP32 datatype, specified %d\n", (int)xtype); } const int BS = xsmm_get_vnni_block_size(dtype); diff --git a/csrc/cpu/utils/module_version.cpp b/csrc/cpu/utils/module_version.cpp index 7485aeee5..dcaa1c27a 100644 --- a/csrc/cpu/utils/module_version.cpp +++ b/csrc/cpu/utils/module_version.cpp @@ -21,7 +21,7 @@ void get_mkl_version() { void get_libxsmm_version() { #if 1 printf( - "Not avaliable yet, due to libxsmm CMake build not generate version info.\n"); + "Not available yet, due to libxsmm CMake build not generate version info.\n"); #else printf("libxsmm config version: %s\n", LIBXSMM_CONFIG_VERSION); printf("Config branch: %s\n", LIBXSMM_CONFIG_BRANCH); diff --git a/csrc/cpu/utils/robin_hood.h b/csrc/cpu/utils/robin_hood.h index 08fc09521..dfe0541da 100644 --- a/csrc/cpu/utils/robin_hood.h +++ b/csrc/cpu/utils/robin_hood.h @@ -112,7 +112,7 @@ static Counts& counts() { #error Unsupported bitness #endif -// endianess +// endianness #ifdef _MSC_VER #define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 #define 
ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 @@ -189,7 +189,7 @@ static Counts& counts() { #define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) #endif -// detect if native wchar_t type is availiable in MSVC +// detect if native wchar_t type is available in MSVC #ifdef _MSC_VER #ifdef _NATIVE_WCHAR_T_DEFINED #define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 @@ -200,7 +200,7 @@ static Counts& counts() { #define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 #endif -// detect if MSVC supports the pair(std::piecewise_construct_t,...) consructor +// detect if MSVC supports the pair(std::piecewise_construct_t,...) constructor // being constexpr #ifdef _MSC_VER #if _MSC_VER <= 1900 @@ -931,7 +931,7 @@ struct WrapKeyEqual : public T { // member, // or a DataNode with a pointer to std::pair. Which DataNode // representation to use depends on how fast the swap() operation is. -// Heuristically, this is automatically choosen based on sizeof(). there are +// Heuristically, this is automatically chosen based on sizeof(). there are // always 2^n Nodes. // // * info: Each Node in the map has a corresponding info byte, so there are 2^n @@ -1603,7 +1603,7 @@ class Table // Creates an empty hash map. Nothing is allocated yet, this happens at the // first insert. This tremendously speeds up ctor & dtor of a map that never - // receives an element. The penalty is payed at the first insert, and not + // receives an element. The penalty is paid at the first insert, and not // before. Lookup of this empty map works because everybody points to // DummyInfoByte::b. parameter bucket_count is dictated by the standard, but // we can ignore it. 
diff --git a/csrc/cpu/vec/unroll_helper.hpp b/csrc/cpu/vec/unroll_helper.hpp index fd09e9465..357045c46 100644 --- a/csrc/cpu/vec/unroll_helper.hpp +++ b/csrc/cpu/vec/unroll_helper.hpp @@ -4,8 +4,8 @@ #include #include "aten/utils/utils.h" -// This helper aims to provide a set of lambda function to manully unroll -// vectorized intrisics with compile_time_for +// This helper aims to provide a set of lambda function to manually unroll +// vectorized intrinsics with compile_time_for // https://github.com/intel/intel-extension-for-pytorch/blob/05aeaf4b675f15c68fcde5b575b4fd5151971129/csrc/cpu/aten/utils/utils.h#L68 // For example, // auto load_fp32 = [](auto i, __m512* in_vset, auto* basic_ptr) { diff --git a/csrc/cpu/vec/vec512/perf_kernel/add_swish.h b/csrc/cpu/vec/vec512/perf_kernel/add_swish.h index 848adbd80..b42b7e09b 100644 --- a/csrc/cpu/vec/vec512/perf_kernel/add_swish.h +++ b/csrc/cpu/vec/vec512/perf_kernel/add_swish.h @@ -27,7 +27,7 @@ inline void _dil_add_swish_fusion_kernel( int i = 0; // load tensor a & b - // assum the same size , no need to broadcast + // assume the same size, no need to broadcast for (; i <= size - 16; i += 16) { // a is first operand of add, b is bias vec_a = _loadu(a + i); @@ -38,7 +38,7 @@ inline void _dil_add_swish_fusion_kernel( vec_add_tmp = vec_a; // keep the intermediate result for later use in the mul - // caculate sigmoid e^x / (1 + e^x) + // calculate sigmoid e^x / (1 + e^x) vec_a = _dil_exp_kernel(vec_a); vec_addone_tmp = _mm512_add_ps(vec_a, vec_ps_1); vec_a = _mm512_div_ps(vec_a, vec_addone_tmp); @@ -59,7 +59,7 @@ inline void _dil_add_swish_fusion_kernel( vec_add_tmp = vec_a; // keep the intermediate result for later use in the second mul - // caculate sigmoid e^x / (1 + e^x) + // calculate sigmoid e^x / (1 + e^x) vec_a = _dil_exp_kernel(vec_a); vec_addone_tmp = _mm512_add_ps(vec_a, vec_ps_1); vec_a = _mm512_div_ps(vec_a, vec_addone_tmp); diff --git a/csrc/utils/CustomOperatorRegistration.h 
b/csrc/utils/CustomOperatorRegistration.h index bfc726c2a..04a2d5580 100644 --- a/csrc/utils/CustomOperatorRegistration.h +++ b/csrc/utils/CustomOperatorRegistration.h @@ -25,7 +25,7 @@ struct TypeSelector { extract_type(args...); } - at::ArrayRef retrive_types() { + at::ArrayRef retrieve_types() { return at::ArrayRef(container_.begin(), container_.end()); } @@ -101,7 +101,7 @@ The macro should be written in below way to register these two different *************************************************************************** IPEX_OP_REGISTER | IPEX_OP_REGISTER_TO_PLAIN This macro is used to register ops into torch_ipex library. Through this macro, -function schema and signature will automatically be infered from function +function schema and signature will automatically be inferred from function prototype. However, it is worth to note that this macro will not works on overload functions(see IPEX_OP_REGISTER_OVERLOAD). Here is some examples for register ipex operators: @@ -117,7 +117,7 @@ IPEX_LIBRARY_FRAGMENT() { IPEX_OP_REGISTER("mul_add", mul_add); } And if this op does not support oneDNN's block format memory layout for tensor. -It would be necessary for developer to register it specificly by adopting the +It would be necessary for developer to register it specifically by adopting the macro IPEX_OP_REGISTER_NEED_PLAIN. In this way, all the tensor passed to this operator will automatically convert to normal tensor layout when execution. diff --git a/docs/tutorials/api_doc.rst b/docs/tutorials/api_doc.rst index db0b664bb..140d00223 100644 --- a/docs/tutorials/api_doc.rst +++ b/docs/tutorials/api_doc.rst @@ -103,7 +103,7 @@ Quantization .. autofunction:: prepare .. autofunction:: convert -Prototype API, introduction is avaiable at `feature page <./features/int8_recipe_tuning_api.md>`_. +Prototype API, introduction is available at `feature page <./features/int8_recipe_tuning_api.md>`_. .. 
autofunction:: autotune diff --git a/docs/tutorials/contribution.md b/docs/tutorials/contribution.md index 94c7bed35..cd4f64294 100644 --- a/docs/tutorials/contribution.md +++ b/docs/tutorials/contribution.md @@ -68,7 +68,7 @@ If you want to reinstall, make sure that you uninstall Intel® Extension for PyT ### Tips and Debugging -* Cmake must be installed before installing Intel® Extension for PyTorch\*. If youre developing on MacOS or Linux, We recommend installing Cmake with [Homebrew](https://brew.sh/) with `brew install cmake`. +* Cmake must be installed before installing Intel® Extension for PyTorch\*. If you're developing on macOS or Linux, we recommend installing Cmake with [Homebrew](https://brew.sh/) with `brew install cmake`. * Our `setup.py` requires Python >= 3.6 * If you run into errors when running `python setup.py develop`, here are some debugging steps: 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. diff --git a/docs/tutorials/features/graph_optimization.md b/docs/tutorials/features/graph_optimization.md index 680aead3f..fbe0faeb9 100644 --- a/docs/tutorials/features/graph_optimization.md +++ b/docs/tutorials/features/graph_optimization.md @@ -127,9 +127,9 @@ Here listed all the currently supported int8 patterns in Intel® Extension for P ### Folding -Stock PyTorch provids constant propagation and BatchNormalization folding. These optimizations are automatically applied to the jit model by invoking `torch.jit.freeze`. Take the Resnet50 as an example: +Stock PyTorch provides constant propagation and BatchNormalization folding. These optimizations are automatically applied to the jit model by invoking `torch.jit.freeze`. 
Take the Resnet50 as an example: [//]: # (marker_feature_graph_optimization_folding) [//]: # (marker_feature_graph_optimization_folding) -If the model owner does not invoke the `torch.jit.freeze`, the `BatchNormalization` still exists on the graph. Otheriwse, the `BatchNormalization` will be folded on the graph to save the compuation and then improve the performance. Refer to the [Constant Folding Wikipedia page](https://en.wikipedia.org/wiki/Constant_folding) for more details. +If the model owner does not invoke the `torch.jit.freeze`, the `BatchNormalization` still exists on the graph. Otherwise, the `BatchNormalization` will be folded on the graph to save the computation and then improve the performance. Refer to the [Constant Folding Wikipedia page](https://en.wikipedia.org/wiki/Constant_folding) for more details. diff --git a/docs/tutorials/features/hypertune.md b/docs/tutorials/features/hypertune.md index e8bc828b8..cf9338757 100644 --- a/docs/tutorials/features/hypertune.md +++ b/docs/tutorials/features/hypertune.md @@ -71,7 +71,7 @@ hyperparams: launcher: hp: ['malloc'] ``` -`malloc` will be tuned using its default search space, `['tc', 'je', 'pt']`. All other launcher hyperparamters (`ncores_per_instance`, `ninstances`, `use_all_nodes`, `use_logical_cores`, `disable_numactl`, `disable_iomp`) will not be tuned and instead will use their default values. +`malloc` will be tuned using its default search space, `['tc', 'je', 'pt']`. All other launcher hyperparameters (`ncores_per_instance`, `ninstances`, `use_all_nodes`, `use_logical_cores`, `disable_numactl`, `disable_iomp`) will not be tuned and instead will use their default values. 
#### User defined search space diff --git a/docs/tutorials/features/int8_overview.md b/docs/tutorials/features/int8_overview.md index b0d279650..350a5bd88 100644 --- a/docs/tutorials/features/int8_overview.md +++ b/docs/tutorials/features/int8_overview.md @@ -31,12 +31,12 @@ qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_a weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric)) ``` -Note: we fully use PyTorch [observer methonds](https://pytorch.org/docs/stable/quantization-support.html#torch-quantization-observer), so you can use a different PyTorch obsever methond to define the [QConfig](https://pytorch.org/docs/1.11/generated/torch.quantization.qconfig.QConfig.html). For weight observer, we only support **torch.qint8** dtype now. +Note: we fully use PyTorch [observer methods](https://pytorch.org/docs/stable/quantization-support.html#torch-quantization-observer), so you can use a different PyTorch observer method to define the [QConfig](https://pytorch.org/docs/1.11/generated/torch.quantization.qconfig.QConfig.html). For weight observer, we only support **torch.qint8** dtype now. **Suggestion**: 1. For activation observer, if using **qscheme** as **torch.per_tensor_affine**, **torch.quint8** is preferred. If using **qscheme** as **torch.per_tensor_symmetric**, **torch.qint8** is preferred. For weight observer, setting **qscheme** to **torch.per_channel_symmetric** can get a better accuracy. -2. If your CPU device doesn't support VNNI, seting the observer's **reduce_range** to **True** can get a better accuracy, such as skylake. +2. If your CPU device doesn't support VNNI, setting the observer's **reduce_range** to **True** can get a better accuracy, such as skylake.
### Prepare Model and Do Calibration diff --git a/docs/tutorials/features/int8_recipe_tuning_api.md b/docs/tutorials/features/int8_recipe_tuning_api.md index 250237bbd..33b53cc51 100644 --- a/docs/tutorials/features/int8_recipe_tuning_api.md +++ b/docs/tutorials/features/int8_recipe_tuning_api.md @@ -1,7 +1,7 @@ INT8 Recipe Tuning API (Prototype) ===================================== -This [new API](../api_doc.html#ipex.quantization.autotune) `ipex.quantization.autotune` supports INT8 recipe tuning by using Intel® Neural Compressor as the backend in Intel® Extension for PyTorch\*. In general, we provid default recipe in Intel® Extension for PyTorch\*, and we still recommend users to try out the default recipe first without bothering tuning. If the default recipe doesn't bring about desired accuracy, users can use this API to tune for a more advanced receipe. +This [new API](../api_doc.html#ipex.quantization.autotune) `ipex.quantization.autotune` supports INT8 recipe tuning by using Intel® Neural Compressor as the backend in Intel® Extension for PyTorch\*. In general, we provide a default recipe in Intel® Extension for PyTorch\*, and we still recommend users to try out the default recipe first without bothering tuning. If the default recipe doesn't bring about desired accuracy, users can use this API to tune for a more advanced recipe. Users need to provide a fp32 model and some parameters required for tuning. The API will return a prepared model with tuned qconfig loaded.
diff --git a/docs/tutorials/features/isa_dynamic_dispatch.md b/docs/tutorials/features/isa_dynamic_dispatch.md index ac79974fa..ce5855894 100644 --- a/docs/tutorials/features/isa_dynamic_dispatch.md +++ b/docs/tutorials/features/isa_dynamic_dispatch.md @@ -50,7 +50,7 @@ At the runtime, **Dispatch Stub implementation** will check CPUIDs and OS status >#### **Dispatch Stub implementation:** `csrc/cpu/dyndisp/DispatchStub.cpp` and `csrc/cpu/dyndisp/DispatchStub.h` ### CodeGen Process -IPEX build system will generate code for each ISA level with specifiy complier parameters. The CodeGen script is located at `cmake/cpu/IsaCodegen.cmake`. +IPEX build system will generate code for each ISA level with specific compiler parameters. The CodeGen script is located at `cmake/cpu/IsaCodegen.cmake`. The CodeGen will copy each cpp files from **Kernel implementation**, and then add ISA level as new file suffix. @@ -376,7 +376,7 @@ Here are three ISA-related private APIs that can help debugging:: >**Note:** > >1. Max CPU supported ISA level only depends on CPU features. ->2. Max binary supported ISA level only depends on built complier version. +>2. Max binary supported ISA level only depends on built compiler version. >3. Current ISA level, it is the smaller of `max CPU ISA level` and `max binary ISA level`. ### Example: diff --git a/docs/tutorials/features/runtime_extension.md b/docs/tutorials/features/runtime_extension.md index 03f0e9f56..de451fbe4 100644 --- a/docs/tutorials/features/runtime_extension.md +++ b/docs/tutorials/features/runtime_extension.md @@ -120,7 +120,7 @@ Thus, `MultiStreamModule` may benefit performance for inference in throughput mo 2. The overhead of inputs' auto split and outputs' auto concat for each stream. 3. The overhead of pthread (stream async execution) wakes up and threads' synchronization after stream execution. -Here are some performance receipes that we recommend for better multi-stream performance.
+Here are some performance recipes that we recommend for better multi-stream performance. * When creating `MultiStreamModule` with `torch.nn.Module` as imperative path module, each stream inside `MultiStreamModule` suffers the GIL issue when doing inference together. This hurts end-to-end performance. We recommend creating `MultiStreamModule` with the `torch.jit.ScriptModule`. diff --git a/docs/tutorials/getting_started.md b/docs/tutorials/getting_started.md index 1bd750d52..8dbfb7cab 100644 --- a/docs/tutorials/getting_started.md +++ b/docs/tutorials/getting_started.md @@ -12,7 +12,7 @@ To start using the Intel® Extension for PyTorch\* in your code, you need to mak **Important:** It is highly recommended to `import intel_extension_for_pytorch` right after `import torch`, prior to importing other packages. -The example below demostrates how to use the Intel® Extension for PyTorch\* with TorchScript: +The example below demonstrates how to use the Intel® Extension for PyTorch\* with TorchScript: ```python import torch @@ -34,7 +34,7 @@ with torch.no_grad(), torch.cpu.amp.autocast(): ########################################## ``` -The example below demostrates how to use the Intel® Extension for PyTorch\* with TorchDynamo: +The example below demonstrates how to use the Intel® Extension for PyTorch\* with TorchDynamo: ```python import torch diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index 7e6f0b51a..1ec365d62 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -120,7 +120,7 @@ While Generative AI (GenAI) workloads and models are getting more and more popul Quantization with shorter data types benefits from its nature to improve memory IO throughputs and amount of computations on CPU. Moreover, shorter data types make it possible to keep more data in CPU cache, thus reducing memory access occurrences. Comparing to cache access, memory access is much more time costing. 
Specifically from computation perspective, AVX-512 Vector Neural Network Instructions (VNNI) instruction set shipped with the 2nd Generation Intel® Xeon® Scalable Processors and newer, as well as Intel® Advanced Matrix Extensions (Intel® AMX) instruction set shipped with the 4th Generation Intel® Xeon® Scalable Processors, provide instruction level accelerations to INT8 computations. -Except for the mixed-precision and INT8 native quantization solution, e.g., post-training static quantization and dynamic quantization in Pytorch, `SmoothQuant `_ and weight only quantization (both INT8 weight and INT4 weight are supported) are also enabled in Intel® Extension for PyTorch* to get beeter accuracy and performance compared with native solution. +Except for the mixed-precision and INT8 native quantization solution, e.g., post-training static quantization and dynamic quantization in Pytorch, `SmoothQuant `_ and weight only quantization (both INT8 weight and INT4 weight are supported) are also enabled in Intel® Extension for PyTorch* to get better accuracy and performance compared with native solution. Intel® Extension for PyTorch* speeds up INT8 computations by leveraging oneDNN and oneDNN graph as the backend. Intel® Extension for PyTorch* static quantization provides a default recipe to automatically decide which operators to quantize. Its backend oneDNN graph brings matrix-multiplication-based fusions for common seen operator patterns and other common fusions like quantization + data type casting. These fusions help achieve best computation cache locality and efficiency, and thus reduce INT8 quantization overhead significantly. 
diff --git a/docs/tutorials/performance_tuning/launch_script.md b/docs/tutorials/performance_tuning/launch_script.md index 61c5826f3..4c867941c 100644 --- a/docs/tutorials/performance_tuning/launch_script.md +++ b/docs/tutorials/performance_tuning/launch_script.md @@ -258,7 +258,7 @@ You can also specify the cores to be utilized using `--cores-list` argument. For ipexrun --ncores-per-instance 10 --cores-list "11-20" --log-dir ./logs resnet50.py ``` -Please notice that when specifying `--cores-list`, a correspondant `--ncores-per-instance` argument is required for instance number deduction. +Please notice that when specifying `--cores-list`, a corresponding `--ncores-per-instance` argument is required for instance number deduction. In this case the log directory should be like ``` diff --git a/docs/tutorials/performance_tuning/torchserve.md b/docs/tutorials/performance_tuning/torchserve.md index a5d8d694d..e8bd7aeb8 100644 --- a/docs/tutorials/performance_tuning/torchserve.md +++ b/docs/tutorials/performance_tuning/torchserve.md @@ -67,9 +67,9 @@ Below are some useful `cpu_launcher_args` to note. Italic values are default if Refer to [Launch Script Usage Guide](./launch_script.md) for a full list of tunable configuration of launcher. And refer to [Performance Tuning Guide](./tuning_guide.md) for more details. ### Launcher Core Pinning to Boost Performance of TorchServe Multi Worker Inference -When running [multi-worker inference](https://pytorch.org/serve/management_api.html#scale-workers) with Torchserve (Required torchserve>=0.6.1), launcher pin cores to workers to boost performance. Internally, launcher equally divides the number of cores by the number of workers such that each worker is pinned to assigned cores. Doing so avoids core overlap among workers which can signficantly boost performance for TorchServe multi-worker inference.
For example, assume running 4 workers on a machine with Intel(R) Xeon(R) Platinum 8180 CPU, 2 sockets, 28 cores per socket, 2 threads per core. Launcher will bind worker 0 to cores 0-13, worker 1 to cores 14-27, worker 2 to cores 28-41, and worker 3 to cores 42-55. +When running [multi-worker inference](https://pytorch.org/serve/management_api.html#scale-workers) with Torchserve (Required torchserve>=0.6.1), launcher pin cores to workers to boost performance. Internally, launcher equally divides the number of cores by the number of workers such that each worker is pinned to assigned cores. Doing so avoids core overlap among workers which can significantly boost performance for TorchServe multi-worker inference. For example, assume running 4 workers on a machine with Intel(R) Xeon(R) Platinum 8180 CPU, 2 sockets, 28 cores per socket, 2 threads per core. Launcher will bind worker 0 to cores 0-13, worker 1 to cores 14-27, worker 2 to cores 28-41, and worker 3 to cores 42-55. -CPU usage is shown below. 4 main worker threads were launched, each launching 14 threads affinitized to the assigned physical cores. +CPU usage is shown below. 4 main worker threads were launched, each launching 14 threads affinitized to the assigned physical cores. ![26](https://user-images.githubusercontent.com/93151422/170373651-fd8a0363-febf-4528-bbae-e1ddef119358.gif) @@ -78,7 +78,7 @@ Additionally when dynamically [scaling the number of workers](https://pytorch.or Continuing with the above example with 4 workers, assume killing workers 2 and 3. If cores were not re-distributed after the scale down, cores 28-55 would be left unutilized. Instead, launcher re-distributes cores 28-55 to workers 0 and 1 such that now worker 0 binds to cores 0-27 and worker 1 binds to cores 28-55.2 -CPU usage is shown below. 4 main worker threads were initially launched. 
Then after scaling down the number of workers from 4 to 2, 2 main worker threads were launched, each launching 28 threads affinitized to the assigned physical cores. +CPU usage is shown below. 4 main worker threads were initially launched. Then after scaling down the number of workers from 4 to 2, 2 main worker threads were launched, each launching 28 threads affinitized to the assigned physical cores. ![worker_scaling](https://user-images.githubusercontent.com/93151422/170374697-7497c2d5-4c17-421b-9993-1434d1f722f6.gif) 2. Serving is interrupted for few seconds while re-distributing cores to scaled workers. @@ -171,7 +171,7 @@ torch.jit.save(model, 'rn50_int8_jit.pt') ``` ### 2. Creating a Model Archive -Once the serialized file ( `.pt`) is created, it can be used with `torch-model-archiver` as ususal. +Once the serialized file ( `.pt`) is created, it can be used with `torch-model-archiver` as usual. Use the following command to package `rn50_int8_jit.pt` into `rn50_ipex_int8.mar`. ``` @@ -255,7 +255,7 @@ cpu_launcher_enable=true CPU usage is shown as below: ![launcher_core_pinning](https://user-images.githubusercontent.com/93151422/159063975-e7e8d4b0-e083-4733-bdb6-4d92bdc10556.gif) -4 main worker threads were launched, then each launched a num_physical_cores/num_workers number (14) of threads affinitized to the assigned physical cores. +4 main worker threads were launched, then each launched a num_physical_cores/num_workers number (14) of threads affinitized to the assigned physical cores.

 $ cat logs/model_log.log
diff --git a/docs/tutorials/performance_tuning/tuning_guide.md b/docs/tutorials/performance_tuning/tuning_guide.md
index 78d122f36..d45c25bea 100644
--- a/docs/tutorials/performance_tuning/tuning_guide.md
+++ b/docs/tutorials/performance_tuning/tuning_guide.md
@@ -253,7 +253,7 @@ Intel® Extension for PyTorch\* is using OneDNN backend for those most computing
 
 To achieve better performance, OneDNN backend is using its [primitive cache](https://oneapi-src.github.io/oneDNN/dev_guide_primitive_cache.html) to store those created primitives for different input shapes during warm-up stage (default primitive cache size is 1024, i.e., 1024 cached primitives). Therefore, when the total size of the primitives created by all the input shapes is within the default threshold, Intel® Extension for PyTorch\* could get fully computation performance from OneDNN kernels.
 
-Different input shapes usualy come from dynamic shapes of datasets. Dynamic shapes commonly exist in [MaskRCNN model](https://github.com/matterport/Mask_RCNN) (object detection), [Transformers](https://github.com/huggingface/transformers/) Wav2vec2 model (speech-recognition) and other speech/text-generation related Transformers models.
+Different input shapes usually come from dynamic shapes of datasets. Dynamic shapes commonly exist in [MaskRCNN model](https://github.com/matterport/Mask_RCNN) (object detection), [Transformers](https://github.com/huggingface/transformers/) Wav2vec2 model (speech-recognition) and other speech/text-generation related Transformers models.
 
 However, we might meet the fact that model would need to cache a large amount of various input shapes, which would even exceed the default primitive cache size. In such case, we recommend tuning the OneDNN primitive cache by setting `ONEDNN_PRIMITIVE_CACHE_CAPACITY` environment variable to get better performance (Note that it is at the cost of increased memory usage):
 
diff --git a/docs/tutorials/releases.md b/docs/tutorials/releases.md
index e6d2da69e..f4b2e6f6c 100644
--- a/docs/tutorials/releases.md
+++ b/docs/tutorials/releases.md
@@ -655,7 +655,7 @@ Highlights include:
   The support for dynamic shapes in Intel® Extension for PyTorch\* INT8 integration is still work in progress. When the input shapes are dynamic, for example inputs of variable image sizes in an object detection task or of variable sequence lengths in NLP tasks, the Intel® Extension for PyTorch\* INT8 path may slow down the model inference. In this case, use stock PyTorch INT8 functionality.
   **Note**: Using Runtime Extension feature if batch size cannot be divided by number of streams, because mini batch size on each stream are not equivalent, scripts run into this issues.
 - BF16 AMP(auto-mixed-precision) runs abnormally with the extension on the AVX2-only machine if the topology contains `Conv`, `Matmul`, `Linear`, and `BatchNormalization`
-- Runtime extension of MultiStreamModule doesn't support DLRM inference, since the input of DLRM (EmbeddingBag specifically) can't be simplely batch split.
+- Runtime extension of MultiStreamModule doesn't support DLRM inference, since the input of DLRM (EmbeddingBag specifically) can't be simply batch split.
 - Runtime extension of MultiStreamModule has poor performance of RNNT Inference comparing with native throughput mode. Only part of the RNNT models (joint_net specifically) can be jit traced into graph. However, in one batch inference, `joint_net` is invoked multi times. It increases the overhead of MultiStreamModule as input batch split, thread synchronization and output concat.
 - Incorrect Conv and Linear result if the number of OMP threads is changed at runtime
   The oneDNN memory layout depends on the number of OMP threads, which requires the caller to detect the changes for the # of OMP threads while this release has not implemented it yet.
@@ -788,7 +788,7 @@ libintel-ext-pt-cxx11-abi-1.11.0+cpu.run (13.5M)
 
 This release is meant to fix the following issues:
 - Resolve the issue that the PyTorch Tensor Expression(TE) did not work after importing the extension.
-- Wraps the BactchNorm(BN) as another operator to break the TE's BN-related fusions. Because the BatchNorm performance of PyTorch Tensor Expression can not achieve the same performance as PyTorch ATen BN.
+- Wraps the BatchNorm(BN) as another operator to break the TE's BN-related fusions. Because the BatchNorm performance of PyTorch Tensor Expression can not achieve the same performance as PyTorch ATen BN.
 - Update the [documentation](https://intel.github.io/intel-extension-for-pytorch/)
     - Fix the INT8 quantization example issue #205
     - Polish the installation guide
diff --git a/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py b/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py
index 7d741e495..c35ffbc40 100644
--- a/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py
+++ b/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py
@@ -107,13 +107,13 @@ def eval_func(model):
 
         return top1.avg.item()
 
-    print(".........runing autotuning step.........")
+    print(".........running autotuning step.........")
     tuned_model = ipex.quantization.autotune(
         model, val_loader, eval_func=eval_func, sampling_sizes=[300]
     )
     print(".........autotuning step done.........")
 
-    print(".........runing int8 inference.........")
+    print(".........running int8 inference.........")
     converted_model = ipex.quantization.convert(tuned_model)
     with torch.no_grad():
         for i, (images, target) in enumerate(val_loader):
diff --git a/examples/cpu/features/int8_recipe_tuning/int8_autotune.py b/examples/cpu/features/int8_recipe_tuning/int8_autotune.py
index b70155ca4..05b0a3d10 100644
--- a/examples/cpu/features/int8_recipe_tuning/int8_autotune.py
+++ b/examples/cpu/features/int8_recipe_tuning/int8_autotune.py
@@ -84,7 +84,7 @@ def train(dataloader, model, loss_fn, optimizer):
 
 epochs = 5
 for t in range(epochs):
-    print(f"Epoch {t+1}\n-------------------------------")
+    print(f"Epoch {t + 1}\n-------------------------------")
     train(train_dataloader, model, loss_fn, optimizer)
 print("Done!")
 
diff --git a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb
index 8c6c00860..72e04ecbd 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb
@@ -243,7 +243,7 @@
     "\n",
     "|exec type | Description |  \n",
     "|:-----|:----|  \n",
-    "|exec | Time for primitives exection. Better to spend most of time on primitives execution. |  \n",
+    "|exec | Time for primitives execution. Better to spend most of time on primitives execution. |  \n",
     "|create| Time for primitives creation. Primitives creation happens once. Better to spend less time on primitive creation. |  "
    ]
   },
@@ -274,7 +274,7 @@
     "### Step 6: Time breakdown for primitives type\n",
     "The primitives type includes convolution, reorder, sum, etc.  \n",
     "For this simple convolution net example, convolution and inner product primitives are expected to spend most of time.  \n",
-    "However, the exact time percentage of different primitivies may vary among different architectures.    \n",
+    "However, the exact time percentage of different primitives may vary among different architectures.    \n",
     "Users can easily identify top hotpots of primitives executions with this time breakdown.  "
    ]
   },
diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb
index c4bca3199..05efa7abd 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb
@@ -53,7 +53,7 @@
    "source": [
     "## Installation of required packages\n",
     "\n",
-    "Ensure the kernel is set to Pytorch-CPU before running the follwing code."
+    "Ensure the kernel is set to Pytorch-CPU before running the following code."
    ]
   },
   {
diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb
index 03020685e..747f4e5cc 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb
@@ -288,7 +288,7 @@
     "\n",
     "# Calculate speedup when using quantization\n",
     "speedup_from_fp32_static = fp32_inference_time / int8_inference_time_static\n",
-    "print(\"Staic INT8 %.2fX faster than FP32\" %speedup_from_fp32_static)\n",
+    "print(\"Static INT8 %.2fX faster than FP32\" %speedup_from_fp32_static)\n",
     "speedup_from_fp32_dynamic = fp32_inference_time / int8_inference_time_dynamic\n",
     "print(\"Dynamic INT8 %.2fX faster than FP32\" %speedup_from_fp32_dynamic)\n",
     "\n",
diff --git a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
index b0cec76a8..8a07c7a88 100644
--- a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
+++ b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
@@ -229,7 +229,7 @@
     "            width = 0.4)\n",
     "\n",
     "    plt.ylabel(\"Runtime (ms)\")\n",
-    "    plt.title(f\"Speedup acheived - {inference_time_stock/inference_time_optimized:.2f}x\")\n",
+    "    plt.title(f\"Speedup achieved - {inference_time_stock/inference_time_optimized:.2f}x\")\n",
     "    plt.show()\n",
     "    \n",
     "\n"
diff --git a/examples/cpu/inference/python/llm-modeling/run.py b/examples/cpu/inference/python/llm-modeling/run.py
index d8f73ee60..ba3d1ee43 100644
--- a/examples/cpu/inference/python/llm-modeling/run.py
+++ b/examples/cpu/inference/python/llm-modeling/run.py
@@ -94,7 +94,7 @@ def get_dummy_input(_model, return_dict=False):
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6B",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--dtype",
@@ -230,7 +230,7 @@ def trace_handler(prof):
 elif args.input_tokens in prompt_pool[model_type]:
     prompt = prompt_pool[model_type][args.input_tokens]
 else:
-    raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.")
+    raise SystemExit("[ERROR] Please use --prompt if want to use custom input.")
 
 input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
 print("---- Prompt size:", input_size)
diff --git a/examples/cpu/inference/python/models/LCM/README.md b/examples/cpu/inference/python/models/LCM/README.md
index 32bbcdffc..8e656adf6 100644
--- a/examples/cpu/inference/python/models/LCM/README.md
+++ b/examples/cpu/inference/python/models/LCM/README.md
@@ -50,7 +50,7 @@ bash download_dataset.sh
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md b/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md
index 31224047d..e107a80e6 100644
--- a/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md
+++ b/examples/cpu/inference/python/models/bert_large/inference/cpu/README.md
@@ -74,7 +74,7 @@ export FINETUNED_MODEL=$(pwd)/bert_squad_model
     ```
     ./setup.sh
     ```
-4. Setup required environment paramaters
+4. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/README.md b/examples/cpu/inference/python/models/bert_large/training/cpu/README.md
index 2a757827e..96c20775d 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/README.md
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/README.md
@@ -145,7 +145,7 @@ you can use "SHARD_NUM" to control the shard files number. the default "SHARD_NU
   ```
   ./setup.sh
   ```
-4. Setup required environment paramaters
+4. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py b/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py
index 322e49b40..189a5310c 100755
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/input_preprocessing/tokenization_local.py
@@ -185,7 +185,7 @@ def whitespace_tokenize(text):
 
 
 class FullTokenizer(object):
-    """Runs end-to-end tokenziation."""
+    """Runs end-to-end tokenization."""
 
     def __init__(self, vocab_file, do_lower_case=True):
         self.vocab = load_vocab(vocab_file)
@@ -336,7 +336,7 @@ def _clean_text(self, text):
 
 
 class WordpieceTokenizer(object):
-    """Runs WordPiece tokenziation."""
+    """Runs WordPiece tokenization."""
 
     def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
         self.vocab = vocab
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py b/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py
index 7f9f05ac6..ede009df7 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/lamb.py
@@ -85,7 +85,7 @@ def step(self, closure=None):
                 data = p.data
                 if grad.is_sparse:
                     raise RuntimeError(
-                        "Lamb does not support sparse gradients, consider SparseAdam instad."
+                        "Lamb does not support sparse gradients, consider SparseAdam instead."
                     )
 
                 state = self.state[p]
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py b/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py
index a45397f82..26df9d98b 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py
@@ -1158,8 +1158,8 @@ def main():
                 print(
                     f"Step {training_steps:5d}: loss: {gloss:6.3f} lm_acc: {lm_acc:.3f} \
                     seq_acc: {seq_acc:.3f} lbs: {args.train_batch_size} gbs: {total_batch_size} \
-                    DT: {(t1-t0)*1000.0:.1f} XT: {(t2-t1)*1000.0:.1f} FT: {(t3-t2)*1000.0:.1f} \
-                    BT: {(t4-t3)*1000.0:.1f} OT: {(t5-t4)*1000.0:.1f} TT: {(t5-t0)*1000.0:.1f}"
+                    DT: {(t1 - t0) * 1000.0:.1f} XT: {(t2 - t1) * 1000.0:.1f} FT: {(t3 - t2) * 1000.0:.1f} \
+                    BT: {(t4 - t3) * 1000.0:.1f} OT: {(t5 - t4) * 1000.0:.1f} TT: {(t5 - t0) * 1000.0:.1f}"
                 )
 
                 update_step = training_steps % args.gradient_accumulation_steps == 0
diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py b/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py
index 6c6e9a628..e3058dae9 100644
--- a/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py
+++ b/examples/cpu/inference/python/models/bert_large/training/cpu/utils_local.py
@@ -71,7 +71,7 @@ def setup_seeds(master_seed, epochs, device):
     Generates seeds from one master_seed.
     Function returns (worker_seeds, shuffling_seeds), worker_seeds are later
     used to initialize per-worker random number generators (mostly for
-    dropouts), shuffling_seeds are for RNGs resposible for reshuffling the
+    dropouts), shuffling_seeds are for RNGs responsible for reshuffling the
     dataset before each epoch.
     Seeds are generated on worker with rank 0 and broadcasted to all other
     workers.
diff --git a/examples/cpu/inference/python/models/deepseek/README.md b/examples/cpu/inference/python/models/deepseek/README.md
index 6ed236bd8..31a9adb45 100644
--- a/examples/cpu/inference/python/models/deepseek/README.md
+++ b/examples/cpu/inference/python/models/deepseek/README.md
@@ -59,7 +59,7 @@ wget -O prompt.json https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.c
 ```
 
 ### Performance
-#### 1. Setup required environment paramaters
+#### 1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
@@ -74,7 +74,7 @@ wget -O prompt.json https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.c
 
 **Please avoid cross NUMA node memory access when setting SGLANG_CPU_OMP_THREADS_BIND.**
 
-`SGLANG_CPU_OMP_THREADS_BIND` specifies the CPU cores dedicated to the OpenMP threads. `--tp` sets the TP size. Below are the example of running without TP and with TP = 6. By changing `--tp` and `SGLANG_CPU_OMP_THREADS_BIND` accordingly, you could set TP size to other values and specifiy the core binding for each rank.
+`SGLANG_CPU_OMP_THREADS_BIND` specifies the CPU cores dedicated to the OpenMP threads. `--tp` sets the TP size. Below are the example of running without TP and with TP = 6. By changing `--tp` and `SGLANG_CPU_OMP_THREADS_BIND` accordingly, you could set TP size to other values and specify the core binding for each rank.
 
 
 ##### 2.1 Bench one batch
diff --git a/examples/cpu/inference/python/models/distilbert/README.md b/examples/cpu/inference/python/models/distilbert/README.md
index e09ac0a3f..ac8ac0147 100644
--- a/examples/cpu/inference/python/models/distilbert/README.md
+++ b/examples/cpu/inference/python/models/distilbert/README.md
@@ -48,13 +48,13 @@
   #by default they are downloaded in current path
   #note that you should do this after you prepared model (transformers repo)
 
-  (2) make following changes in the scirpts to run:
+  (2) make following changes in the scripts to run:
   delete: --task_name sst2  ==>  add: --train_file {path/to/data_file}/SST-2/train.csv --validation_file {path/to/data_file}/SST-2/dev.csv
 
   (3) export model path
   export FINETUNED_MODEL={path/to/model_file}/distilbert-base-uncased-finetuned-sst-2-english
 
-  (4) run scirpt with HF_DATASETS_OFFLINE=1 flag, like:
+  (4) run script with HF_DATASETS_OFFLINE=1 flag, like:
   HF_DATASETS_OFFLINE=1 bash run_multi_instance_throughput.sh fp32
 
   ```
@@ -90,7 +90,7 @@ export FINETUNED_MODEL=$(pwd)/distilbert-base-uncased-finetuned-sst-2-english
   ./setup.sh
   ```
 
-5. Setup required environment paramaters
+5. Setup required environment parameters
 
 # Custom mode
  Run in custom mode by export TEST_MODE="" and export BATCH_SIZE to set the batch_size, export CORES_PER_INSTANCE to set the number of cores per instance and export INSTANCES to set the number of instances.
diff --git a/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py b/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py
index 2c0284f4b..517702fd6 100755
--- a/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py
+++ b/examples/cpu/inference/python/models/distilbert/scripts/run_glue.py
@@ -680,7 +680,7 @@ def compute_metrics(p: EvalPrediction):
     if training_args.do_eval:
         logger.info("*** Evaluate ***")
 
-        # Loop to handle MNLI double evaluation (matched, mis-matched)
+        # Loop to handle MNLI double evaluation (matched, mismatched)
         tasks = [data_args.task_name]
         eval_datasets = [eval_dataset]
         if data_args.task_name == "mnli":
@@ -718,7 +718,7 @@ def compute_metrics(p: EvalPrediction):
     if training_args.do_predict:
         logger.info("*** Predict ***")
 
-        # Loop to handle MNLI double evaluation (matched, mis-matched)
+        # Loop to handle MNLI double evaluation (matched, mismatched)
         tasks = [data_args.task_name]
         predict_datasets = [predict_dataset]
         if data_args.task_name == "mnli":
diff --git a/examples/cpu/inference/python/models/distilbert/scripts/trainer.py b/examples/cpu/inference/python/models/distilbert/scripts/trainer.py
index 9b5b64979..8cb609d6a 100755
--- a/examples/cpu/inference/python/models/distilbert/scripts/trainer.py
+++ b/examples/cpu/inference/python/models/distilbert/scripts/trainer.py
@@ -2063,7 +2063,7 @@ def _inner_training_loop(
                 (self.model_wrapped,) = release_memory(self.model_wrapped)
                 self.model_wrapped = self.model
 
-                # Check for DeepSpeed *after* the intial pass and modify the config
+                # Check for DeepSpeed *after* the initial pass and modify the config
                 if self.is_deepspeed_enabled:
                     # Temporarily unset `self.args.train_batch_size`
                     original_bs = self.args.per_device_train_batch_size
@@ -2748,7 +2748,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
                     # Checkpoint must have been saved with the old smp api.
                     if hasattr(self.args, "fp16") and self.args.fp16 is True:
                         logger.warning(
-                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported."
+                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported."
                         )
                     state_dict = torch.load(
                         weights_file,
@@ -3923,7 +3923,7 @@ def evaluate(
             A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
             dictionary also contains the epoch number which comes from the training state.
         """
-        # handle multipe eval datasets
+        # handle multiple eval datasets
         eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
         if isinstance(eval_dataset, dict):
             metrics = {}
@@ -4067,7 +4067,7 @@ def predict(
     def benchmark_evaluate(self, model, dataloader):
         steps_per_epoch = len(dataloader)
         total_steps = self.args.perf_run_iters + self.args.perf_begin_iter
-        test_epoches = int(total_steps / steps_per_epoch)
+        test_epochs = int(total_steps / steps_per_epoch)
         print(
             "Evaluating: Steps per Epoch {} total Steps {}".format(
                 steps_per_epoch, total_steps
@@ -4105,7 +4105,7 @@ def benchmark_evaluate(self, model, dataloader):
                     prof.step()
             prof.__exit__(None, None, None)
         with tqdm(total=total_steps, desc="Evaluating") as pbar:
-            for epoch in range(test_epoches + 1):
+            for epoch in range(test_epochs + 1):
                 for it, batch in enumerate(dataloader):
                     if "pixel_values" in batch:
                         if self.args.fp16_cpu:
diff --git a/examples/cpu/inference/python/models/distilbert/scripts/training_args.py b/examples/cpu/inference/python/models/distilbert/scripts/training_args.py
index 66e526096..45b6d4084 100644
--- a/examples/cpu/inference/python/models/distilbert/scripts/training_args.py
+++ b/examples/cpu/inference/python/models/distilbert/scripts/training_args.py
@@ -475,7 +475,7 @@ class TrainingArguments:
                      all-gathers.
                 - use_orig_params (`bool`, *optional*, defaults to `True`)
                     If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed
-                    frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please
+                    frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please
                     refer this
                     [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019
                 - sync_module_states (`bool`, *optional*, defaults to `True`)
@@ -524,8 +524,8 @@ class TrainingArguments:
                     all workers.
                 - use_seedable_sampler (`bool`, *optional*, defaults to `True`):
                     Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
-                    training results are fully reproducable using a different sampling technique. While seed-to-seed results
-                    may differ, on average the differences are neglible when using multiple different seeds to compare. Should
+                    training results are fully reproducible using a different sampling technique. While seed-to-seed results
+                    may differ, on average the differences are negligible when using multiple different seeds to compare. Should
                     also be ran with [`~utils.set_seed`] for the best results.
 
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
@@ -1281,7 +1281,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": (
-                "Config to be used with the internal Accelerator object initializtion. The value is either a "
+                "Config to be used with the internal Accelerator object initialization. The value is either a "
                 "accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`."
             )
         },
@@ -1570,7 +1570,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": "Activates neftune noise embeddings into the model. NEFTune"
-            " has been proven to drastically improve model performances for instrcution fine-tuning."
+            " has been proven to drastically improve model performances for instruction fine-tuning."
             " Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original "
             "code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
         },
@@ -1861,7 +1861,7 @@ def __post_init__(self):
                     torch.backends.cudnn.allow_tf32 = True
             else:
                 logger.warning(
-                    "The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here."
+                    "The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here."
                 )
         if self.framework == "pt" and is_torch_available() and self.tf32 is not None:
             if self.tf32:
@@ -2305,7 +2305,7 @@ def _setup_devices(self) -> "torch.device":
                 )
                 if device.type != "mps":
                     raise ValueError(
-                        "Either you do not have an MPS-enabled device on this machine or MacOS version is not 12.3+ "
+                        "Either you do not have an MPS-enabled device on this machine or macOS version is not 12.3+ "
                         "or current PyTorch install was not built with MPS enabled."
                     )
             if device.type == "mps":
diff --git a/examples/cpu/inference/python/models/dlrm/README.md b/examples/cpu/inference/python/models/dlrm/README.md
index 174fa55d6..83b487864 100644
--- a/examples/cpu/inference/python/models/dlrm/README.md
+++ b/examples/cpu/inference/python/models/dlrm/README.md
@@ -57,7 +57,7 @@ After you loading the raw dataset `day_*.gz` and unzip them to RAW_DIR.
 ```bash
 cd intel-extension-for-pytorch/examples/cpu/inference/python/models/dlrm/
 export MODEL_DIR=$(pwd)
-export RAW_DIR=
+export RAW_DIR=
 export TEMP_DIR=
 export PREPROCESSED_DIR=
 export MULTI_HOT_DIR=
@@ -81,14 +81,14 @@ https://github.com/mlcommons/inference/tree/master/recommendation/dlrm_v2/pytorc
     ./setup.sh
     ```
 
-5. Setup required environment paramaters
+5. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
 | **TEST_MODE** (THROUGHPUT, ACCURACY)              | `export TEST_MODE=THROUGHPUT`                  |
 | **DATASET_DIR**             |                               `export DATASET_DIR=`                                  |
 | **EVAL_BATCH**             |                               `export EVAL_BATCH=20000`                                  |
-| **WEIGHT_DIR** (ONLY FOR ACCURACY)     |                 `export WEIGHT_DIR=`        |
+| **WEIGHT_DIR** (ONLY FOR ACCURACY)     |                 `export WEIGHT_DIR=`        |
 | **PRECISION**    |                               `export PRECISION=int8 `                             |
 | **OUTPUT_DIR**    |                               `export OUTPUT_DIR=$PWD`                               |
 | **BATCH_SIZE** (optional) |                               `export BATCH_SIZE=`                                |
diff --git a/examples/cpu/inference/python/models/dlrm/run_model.sh b/examples/cpu/inference/python/models/dlrm/run_model.sh
index be2baa4aa..f75a94053 100644
--- a/examples/cpu/inference/python/models/dlrm/run_model.sh
+++ b/examples/cpu/inference/python/models/dlrm/run_model.sh
@@ -33,7 +33,7 @@ if [[ "${TEST_MODE}" == "THROUGHPUT" ]]; then
 elif [[ "${TEST_MODE}" == "ACCURACY" ]]; then
     echo "TEST_MODE set to ACCURACY"
     BATCH_SIZE=${BATCH_SIZE:-65536}
-    LOG_PREFIX=dlrm_inference_accuarcy_log
+    LOG_PREFIX=dlrm_inference_accuracy_log
     if [ -z "${DATASET_DIR}" ]; then
         echo "The required environment variable DATASET_DIR has not been set"
         exit 1
@@ -71,7 +71,7 @@ mkdir -p ${OUTPUT_DIR}
 TORCH_INDUCTOR=${TORCH_INDUCTOR:-"0"}
 AOT_INDUCTOR=${AOT_INDUCTOR:-"0"}
 # if the number of cores are not equal on different numa node
-# or for TORCHINDUCTOR=1 we will lanuch 2 process per numa
+# or for TORCH_INDUCTOR=1 we will launch 2 processes per numa
 ENABLE_2ND_PROCESS=${ENABLE_2ND_PROCESS:-"0"}
 MANUALLY_LAUNCH=${MANUALLY_LAUNCH:-"0"}
 if [[ "1" == ${TORCH_INDUCTOR} ]];then
diff --git a/examples/cpu/inference/python/models/gptj/README.md b/examples/cpu/inference/python/models/gptj/README.md
index 998b3b698..605918dd2 100644
--- a/examples/cpu/inference/python/models/gptj/README.md
+++ b/examples/cpu/inference/python/models/gptj/README.md
@@ -27,7 +27,7 @@ export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libtcmall
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py b/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py
index e85ad91fe..08df9ba81 100644
--- a/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py
+++ b/examples/cpu/inference/python/models/gptj/run_llm_inductor_greedy.py
@@ -393,7 +393,7 @@ def run_accuracy_lmeval(model, dataset):
         prompt = prompt_pool[model_type][args.input_tokens]
     else:
         raise SystemExit(
-            "[ERROR] No such input_tokens prompt in prompt.json, Plese use --prompt if want to use custom input."
+            "[ERROR] No such input_tokens prompt in prompt.json, Please use --prompt if want to use custom input."
         )
 
 input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
diff --git a/examples/cpu/inference/python/models/gptj/setup.sh b/examples/cpu/inference/python/models/gptj/setup.sh
index 1e94a6015..547298eee 100755
--- a/examples/cpu/inference/python/models/gptj/setup.sh
+++ b/examples/cpu/inference/python/models/gptj/setup.sh
@@ -24,7 +24,7 @@ cd transformers
 pip install -e ./
 cd ..
 
-# Get prompt.json for gneration inference
+# Get prompt.json for generation inference
 wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json
 
 export EVAL_SCRIPT="run_llm_inductor_greedy.py"
diff --git a/examples/cpu/inference/python/models/llama/README.md b/examples/cpu/inference/python/models/llama/README.md
index bc4e19966..ec2068148 100644
--- a/examples/cpu/inference/python/models/llama/README.md
+++ b/examples/cpu/inference/python/models/llama/README.md
@@ -59,7 +59,7 @@ wget -O prompt.json https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.c
 ```
 
 ### Performance
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
@@ -127,7 +127,7 @@ export LD_PRELOAD=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib/libtcmall
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py b/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py
index e85ad91fe..08df9ba81 100644
--- a/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py
+++ b/examples/cpu/inference/python/models/llama/run_llm_inductor_greedy.py
@@ -393,7 +393,7 @@ def run_accuracy_lmeval(model, dataset):
         prompt = prompt_pool[model_type][args.input_tokens]
     else:
         raise SystemExit(
-            "[ERROR] No such input_tokens prompt in prompt.json, Plese use --prompt if want to use custom input."
+            "[ERROR] No such input_tokens prompt in prompt.json, Please use --prompt if want to use custom input."
         )
 
 input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
diff --git a/examples/cpu/inference/python/models/llama/setup.sh b/examples/cpu/inference/python/models/llama/setup.sh
index f6ddf9c3a..a0d80afb3 100755
--- a/examples/cpu/inference/python/models/llama/setup.sh
+++ b/examples/cpu/inference/python/models/llama/setup.sh
@@ -24,7 +24,7 @@ cd transformers
 pip install -e ./
 cd ..
 
-# Get prompt.json for gneration inference
+# Get prompt.json for generation inference
 wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json
 
 export EVAL_SCRIPT="run_llm_inductor_greedy.py"
diff --git a/examples/cpu/inference/python/models/resnet50/README.md b/examples/cpu/inference/python/models/resnet50/README.md
index cfa789f9d..e79fef997 100644
--- a/examples/cpu/inference/python/models/resnet50/README.md
+++ b/examples/cpu/inference/python/models/resnet50/README.md
@@ -45,7 +45,7 @@ imagenet
 The folder that contains the `val` directory should be set as the `DATASET_DIR` (for example: `export DATASET_DIR=/home//imagenet`).
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                                    |                **export command**                                 |
 |:------------------------------------------------:|:-----------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/resnet50/common/main.py b/examples/cpu/inference/python/models/resnet50/common/main.py
index cd86cf507..8ecabc618 100755
--- a/examples/cpu/inference/python/models/resnet50/common/main.py
+++ b/examples/cpu/inference/python/models/resnet50/common/main.py
@@ -512,10 +512,10 @@ def main_worker(gpu, ngpus_per_node, args):
     if args.dummy:
         assert args.evaluate, "please using real dataset if you want run training path"
     if not args.ipex and not args.inductor:
-        # for offical pytorch, int8 and jit path is not enabled.
+        # for official pytorch, int8 and jit path is not enabled.
         # for torch.compile(backend=inductor) INT8 quantization is been supported.
-        assert not args.int8, "int8 path is not enabled for offical pytorch"
-        assert not args.jit, "jit path is not enabled for offical pytorch"
+        assert not args.int8, "int8 path is not enabled for official pytorch"
+        assert not args.jit, "jit path is not enabled for official pytorch"
 
     if not args.dummy:
         # Data loading code
@@ -582,7 +582,7 @@ def main_worker(gpu, ngpus_per_node, args):
         if args.ipex:
             print("using ipex model to do inference\n")
         else:
-            print("using offical pytorch model to do inference\n")
+            print("using official pytorch model to do inference\n")
 
         # IPEX Path
         if args.ipex:
@@ -615,17 +615,17 @@ def main_worker(gpu, ngpus_per_node, args):
                     model = torch.jit.freeze(model.eval())
                     y = model(x)
                     y = model(x)
-                    print("running int8 evalation step\n")
+                    print("running int8 evaluation step\n")
             else:
                 if args.bf16:
                     model = ipex.optimize(model, dtype=torch.bfloat16, inplace=True)
-                    print("running bfloat16 evalation step\n")
+                    print("running bfloat16 evaluation step\n")
                 elif args.fp16:
                     model = ipex.optimize(model, dtype=torch.half, inplace=True)
-                    print("running float16 evalation step\n")
+                    print("running float16 evaluation step\n")
                 else:
                     model = ipex.optimize(model, dtype=torch.float32, inplace=True)
-                    print("running fp32 evalation step\n")
+                    print("running fp32 evaluation step\n")
                 if args.jit:
                     x = torch.randn(args.batch_size, 3, 224, 224).contiguous(
                         memory_format=torch.channels_last
@@ -1019,7 +1019,7 @@ def validate(val_loader, model, criterion, args):
         model.eval()
 
     if args.ipex and args.int8 and args.calibration:
-        print("runing int8 calibration step\n")
+        print("running int8 calibration step\n")
         import intel_extension_for_pytorch as ipex
         from torch.ao.quantization import (
             MinMaxObserver,
@@ -1048,7 +1048,7 @@ def validate(val_loader, model, criterion, args):
             print(".........calibration step done..........")
     else:
         if args.dummy:
-            # always running channle last for fp32, bf16, int8
+            # always running channel last for fp32, bf16, int8
             with torch.no_grad():
                 if args.weight_sharing:
                     threads = []
diff --git a/examples/cpu/inference/python/models/stable_diffusion/README.md b/examples/cpu/inference/python/models/stable_diffusion/README.md
index 70791bb5a..3c5352dd3 100644
--- a/examples/cpu/inference/python/models/stable_diffusion/README.md
+++ b/examples/cpu/inference/python/models/stable_diffusion/README.md
@@ -56,7 +56,7 @@ bash download_dataset.sh
 ```
 
 ### Inference
-1. Setup required environment paramaters
+1. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/vit/README.md b/examples/cpu/inference/python/models/vit/README.md
index f4af3e01d..e4cac8795 100644
--- a/examples/cpu/inference/python/models/vit/README.md
+++ b/examples/cpu/inference/python/models/vit/README.md
@@ -70,7 +70,7 @@ Vision Transformer inference best known configurations with PyTorch.
     ./setup.sh
     ```
 5. Prepare for downloading access
-    On https://huggingface.co/datasets/ILSVRC/imagenet-1k, login your account, and click the aggreement and then generating {your huggingface token}
+    On https://huggingface.co/datasets/ILSVRC/imagenet-1k, login your account, and click the agreement and then generating {your huggingface token}
 
     huggingface-cli login
     {your huggingface token}
@@ -80,7 +80,7 @@ Vision Transformer inference best known configurations with PyTorch.
    #Run "download_data.sh"
    ./download_data.sh
   ```
-7. Setup required environment paramaters
+7. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/vit/scripts/trainer.py b/examples/cpu/inference/python/models/vit/scripts/trainer.py
index c6d5442a9..e9855b7e1 100755
--- a/examples/cpu/inference/python/models/vit/scripts/trainer.py
+++ b/examples/cpu/inference/python/models/vit/scripts/trainer.py
@@ -2049,7 +2049,7 @@ def _inner_training_loop(
                 (self.model_wrapped,) = release_memory(self.model_wrapped)
                 self.model_wrapped = self.model
 
-                # Check for DeepSpeed *after* the intial pass and modify the config
+                # Check for DeepSpeed *after* the initial pass and modify the config
                 if self.is_deepspeed_enabled:
                     # Temporarily unset `self.args.train_batch_size`
                     original_bs = self.args.per_device_train_batch_size
@@ -2734,7 +2734,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
                     # Checkpoint must have been saved with the old smp api.
                     if hasattr(self.args, "fp16") and self.args.fp16 is True:
                         logger.warning(
-                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported."
+                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported."
                         )
                     state_dict = torch.load(
                         weights_file,
@@ -3909,7 +3909,7 @@ def evaluate(
             A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
             dictionary also contains the epoch number which comes from the training state.
         """
-        # handle multipe eval datasets
+        # handle multiple eval datasets
         eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
         if isinstance(eval_dataset, dict):
             metrics = {}
@@ -4053,7 +4053,7 @@ def predict(
     def benchmark_evaluate(self, model, dataloader):
         steps_per_epoch = len(dataloader)
         total_steps = self.args.perf_run_iters + self.args.perf_begin_iter
-        test_epoches = int(total_steps / steps_per_epoch)
+        test_epochs = int(total_steps / steps_per_epoch)
         print(
             "Evaluating: Steps per Epoch {} total Steps {}".format(
                 steps_per_epoch, total_steps
@@ -4064,7 +4064,7 @@ def benchmark_evaluate(self, model, dataloader):
         import time
 
         with tqdm(total=total_steps, desc="Evaluating") as pbar:
-            for epoch in range(test_epoches + 1):
+            for epoch in range(test_epochs + 1):
                 for it, batch in enumerate(dataloader):
                     if "pixel_values" in batch and self.args.benchmark:
                         if self.args.fp16_cpu:
diff --git a/examples/cpu/inference/python/models/vit/scripts/training_args.py b/examples/cpu/inference/python/models/vit/scripts/training_args.py
index 384bcf913..51eb22ff2 100644
--- a/examples/cpu/inference/python/models/vit/scripts/training_args.py
+++ b/examples/cpu/inference/python/models/vit/scripts/training_args.py
@@ -472,7 +472,7 @@ class TrainingArguments:
                      all-gathers.
                 - use_orig_params (`bool`, *optional*, defaults to `True`)
                     If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed
-                    frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please
+                    frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please
                     refer this
                     [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019
                 - sync_module_states (`bool`, *optional*, defaults to `True`)
@@ -521,8 +521,8 @@ class TrainingArguments:
                     all workers.
                 - use_seedable_sampler (`bool`, *optional*, defaults to `True`):
                     Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
-                    training results are fully reproducable using a different sampling technique. While seed-to-seed results
-                    may differ, on average the differences are neglible when using multiple different seeds to compare. Should
+                    training results are fully reproducible using a different sampling technique. While seed-to-seed results
+                    may differ, on average the differences are negligible when using multiple different seeds to compare. Should
                     also be ran with [`~utils.set_seed`] for the best results.
 
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
@@ -1297,7 +1297,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": (
-                "Config to be used with the internal Accelerator object initializtion. The value is either a "
+                "Config to be used with the internal Accelerator object initialization. The value is either a "
                 "accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`."
             )
         },
@@ -1582,7 +1582,7 @@ class TrainingArguments:
         default=None,
         metadata={
             "help": "Activates neftune noise embeddings into the model. NEFTune has been proven "
-            "to drastically improve model performances for instrcution fine-tuning. Check out the "
+            "to drastically improve model performances for instruction fine-tuning. Check out the "
             "original paper here: https://arxiv.org/abs/2310.05914 and the original code "
             "here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
         },
@@ -2289,7 +2289,7 @@ def _setup_devices(self) -> "torch.device":
                 )
                 if device.type != "mps":
                     raise ValueError(
-                        "Either you do not have an MPS-enabled device on this machine or MacOS version is not 12.3+ "
+                        "Either you do not have an MPS-enabled device on this machine or macOS version is not 12.3+ "
                         "or current PyTorch install was not built with MPS enabled."
                     )
             if device.type == "mps":
diff --git a/examples/cpu/inference/python/models/yolov7/README.md b/examples/cpu/inference/python/models/yolov7/README.md
index 17c30cb6d..7a31038f6 100644
--- a/examples/cpu/inference/python/models/yolov7/README.md
+++ b/examples/cpu/inference/python/models/yolov7/README.md
@@ -57,7 +57,7 @@
     ./setup.sh
     ```
 
-3. Setup required environment paramaters
+3. Setup required environment parameters
 
 | **Parameter**                |                                  **export command**                                  |
 |:---------------------------:|:------------------------------------------------------------------------------------:|
diff --git a/examples/cpu/inference/python/models/yolov7/inference.py b/examples/cpu/inference/python/models/yolov7/inference.py
index 03a1d2eb6..68819dc2a 100644
--- a/examples/cpu/inference/python/models/yolov7/inference.py
+++ b/examples/cpu/inference/python/models/yolov7/inference.py
@@ -281,7 +281,7 @@ def test(
     model = model.to(memory_format=torch.channels_last)
 
     if evaluate:
-        print("using offical pytorch model to do inference\n")
+        print("using official pytorch model to do inference\n")
         x = torch.rand(batch_size, 3, imgsz, imgsz).contiguous(
             memory_format=torch.channels_last
         )
diff --git a/examples/cpu/llm/fine-tuning/finetune.py b/examples/cpu/llm/fine-tuning/finetune.py
index 6d806ef74..84468507d 100644
--- a/examples/cpu/llm/fine-tuning/finetune.py
+++ b/examples/cpu/llm/fine-tuning/finetune.py
@@ -1,5 +1,5 @@
 """
-This script is adapted from the following official alpaca-loca fine-tuning code with minimal code changes:
+This script is adapted from the following official alpaca-lora fine-tuning code with minimal code changes:
 https://github.com/tloen/alpaca-lora/blob/main/finetune.py
 """
 
diff --git a/examples/cpu/llm/inference/README.md b/examples/cpu/llm/inference/README.md
index 0eecbb7d2..6444195e3 100644
--- a/examples/cpu/llm/inference/README.md
+++ b/examples/cpu/llm/inference/README.md
@@ -493,7 +493,7 @@ deepspeed --bind_cores_to_rank run.py -m  --benchm
 - Notes
 
 (1) Since the hugeness of the model size as well as the cache based optimizations, it is recommended to use a server with 1.5TB
-or larger memory amount. The memory comsumption optimizations are in progress.
+or larger memory amount. The memory consumption optimizations are in progress.
 
 (2) Please add `--num_accelerators` and `--bind_core_list` arguments for `deepspeed` command based on your SNC configurations.
 For example, for a server having 2 sockets, 128 physical cores per socket with a total number of 6 sub-numa clusters,
diff --git a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
index 0c1aec6c4..88761fd1f 100644
--- a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
+++ b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
@@ -247,7 +247,7 @@ def get_low_precision_checkpoint(args, model_config):
 
 def maybe_set_tp_grain_size(quant_config, ds_init_inf_kwargs):
     tp_grain_size = 64
-    # Need to check if this attr is available. Old DeepSpeep does not have it.
+    # Need to check if this attr is available. Old DeepSpeed does not have it.
     assert "tp_grain_size" in dir(
         deepspeed.inference.config.DeepSpeedTPConfig()
     ), "Old DeepSpeed version detected. Please update to the recommended version."
@@ -1714,7 +1714,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
                 and DEFAULT_IMAGE_TOKEN not in prompts_input
             ):
                 """
-                Three senarios:
+                Three scenarios:
                 1. No image, and there for, no image token should be added.
                 2. image token is already specified in the context, so we don't need to add it.
                 3. image token is not specified in the context and there is image inputs, so we need to add it.
@@ -1857,7 +1857,7 @@ def _collate(x):
                         and DEFAULT_IMAGE_TOKEN not in context
                     ):
                         """
-                        Three senarios:
+                        Three scenarios:
                         1. No image, and there for, no image token should be added.
                         2. image token is already specified in the context, so we don't need to add it.
                         3. image token is not specified in the context and there is image inputs,
diff --git a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
index 30e04ff84..563b8f070 100644
--- a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
+++ b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
@@ -70,7 +70,7 @@ def str_to_kwargs(s):
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6b",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--vision-text-model",
@@ -619,7 +619,7 @@ def write_checkpoints_json():
     )
 
 tp_grain_size = 64
-# Need to check if this attr is available. Old DeepSpeep does not have it.
+# Need to check if this attr is available. Old DeepSpeed does not have it.
 assert "tp_grain_size" in dir(
     deepspeed.inference.config.DeepSpeedTPConfig()
 ), "Old DeepSpeed version detected. Please update to the recommended version."
@@ -871,7 +871,7 @@ def load_image(image_file):
                 ]
             prompt = current_prompt
         else:
-            raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.")
+            raise SystemExit("[ERROR] Please use --prompt if want to use custom input.")
 
         raw_image = load_image(args.image_url)
         raw_image = [raw_image] * test_bs
@@ -948,7 +948,7 @@ def download_and_open(url: str) -> Image.Image:
             else:
                 input_sentences.append(prompt_pool[model_type][args.input_tokens])
         else:
-            raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.")
+            raise SystemExit("[ERROR] Please use --prompt if want to use custom input.")
         if test_bs > len(input_sentences):
             # dynamically extend to support larger bs by repetition
             input_sentences *= math.ceil(test_bs / len(input_sentences))
@@ -1064,7 +1064,7 @@ def trace_handler(prof):
     generated, _ = generate()
     t_generate_span = time.time() - t_generate_start
     for i, o, _ in generated:
-        print_rank0(f"{'-'*60}\nin={i}\nout={o}\n")
+        print_rank0(f"{'-' * 60}\nin={i}\nout={o}\n")
 
 # benchmark it!
 else:
diff --git a/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb b/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb
index 1aef9adb7..44eab465a 100644
--- a/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb
+++ b/examples/cpu/llm/inference/ipex_llm_optimizations_inference_single_instance.ipynb
@@ -126,7 +126,7 @@
     "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
     "model = model.eval()\n",
     "\n",
-    "# Customizeable hyperparamters\n",
+    "# Customizable hyperparameters\n",
     "batch_size = 1\n",
     "num_beams = 1\n",
     "generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=num_beams)"
@@ -342,7 +342,7 @@
    "metadata": {},
    "source": [
     "### Running ipex.llm in a Distributed Manner\n",
-    "Running ipex.llm in a distributed manner allows you to utlize all available cores more effectively. This is done using DeepSpeed. It is recommended to shard the model weight sizes for better memory usage when running with DeepSpeed. Sharding only needs to be done once. On subsequent runs, remove \"--shard-model\" and replace \"-m \\\" with \"-m \\\"."
+    "Running ipex.llm in a distributed manner allows you to utilize all available cores more effectively. This is done using DeepSpeed. It is recommended to shard the model weight sizes for better memory usage when running with DeepSpeed. Sharding only needs to be done once. On subsequent runs, remove \"--shard-model\" and replace \"-m \\\" with \"-m \\\"."
    ]
   },
   {
diff --git a/examples/cpu/llm/inference/single_instance/run_accuracy.py b/examples/cpu/llm/inference/single_instance/run_accuracy.py
index 9593268d1..698f91a89 100644
--- a/examples/cpu/llm/inference/single_instance/run_accuracy.py
+++ b/examples/cpu/llm/inference/single_instance/run_accuracy.py
@@ -1217,7 +1217,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
                 and DEFAULT_IMAGE_TOKEN not in prompts_input
             ):
                 """
-                Three senarios:
+                Three scenarios:
                 1. No image, and there for, no image token should be added.
                 2. image token is already specified in the context, so we don't need to add it.
                 3. image token is not specified in the context and there is image inputs, so we need to add it.
@@ -1361,7 +1361,7 @@ def _collate(x):
                         and DEFAULT_IMAGE_TOKEN not in context
                     ):
                         """
-                        Three senarios:
+                        Three scenarios:
                         1. No image, and there for, no image token should be added.
                         2. image token is already specified in the context, so we don't need to add it.
                         3. image token is not specified in the context and there is image inputs, so we need to add it.
diff --git a/examples/cpu/llm/inference/single_instance/run_generation.py b/examples/cpu/llm/inference/single_instance/run_generation.py
index d6c01a881..ab33bf5ff 100644
--- a/examples/cpu/llm/inference/single_instance/run_generation.py
+++ b/examples/cpu/llm/inference/single_instance/run_generation.py
@@ -45,7 +45,7 @@ def str_to_kwargs(s):
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6B",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--dtype",
@@ -474,7 +474,7 @@ def trace_handler(prof):
                     prompt = prompt_pool[model_type][args.input_tokens]
             else:
                 raise SystemExit(
-                    "[ERROR] Plese use --prompt if want to use custom input."
+                    "[ERROR] Please use --prompt if want to use custom input."
                 )
             if model_type == "mllama":
                 raw_image = load_image(args.image_url)
diff --git a/examples/cpu/llm/inference/single_instance/run_quantization.py b/examples/cpu/llm/inference/single_instance/run_quantization.py
index 52183bdee..c069c33af 100644
--- a/examples/cpu/llm/inference/single_instance/run_quantization.py
+++ b/examples/cpu/llm/inference/single_instance/run_quantization.py
@@ -1619,7 +1619,7 @@ def calib_func(prepared_model):
                     prompt = prompt_pool[model.name][args.input_tokens]
             else:
                 raise SystemExit(
-                    "[ERROR] Plese use --prompt if want to use custom input."
+                    "[ERROR] Please use --prompt if want to use custom input."
                 )
 
             if model.name == "mllama":
diff --git a/examples/cpu/llm/inference/utils/create_shard_model.py b/examples/cpu/llm/inference/utils/create_shard_model.py
index cef4f1e3d..338bd7c3e 100644
--- a/examples/cpu/llm/inference/utils/create_shard_model.py
+++ b/examples/cpu/llm/inference/utils/create_shard_model.py
@@ -17,7 +17,7 @@
     "--model-id",
     type=str,
     default="EleutherAI/gpt-j-6B",
-    help="the huggingface mdoel id",
+    help="the huggingface model id",
 )
 parser.add_argument(
     "--save-path",
diff --git a/examples/cpu/llm/tools/env_setup.sh b/examples/cpu/llm/tools/env_setup.sh
index 743d66cec..b8dbef0b5 100644
--- a/examples/cpu/llm/tools/env_setup.sh
+++ b/examples/cpu/llm/tools/env_setup.sh
@@ -29,7 +29,7 @@ if [ ! -f ${WHEELFOLDER}/lm_eval*.whl ]; then
     (( MODE |= 0x02 ))
 fi
 
-# Check existance of required Linux commands
+# Check existence of required Linux commands
 for CMD in gcc g++; do
     command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" is required."; exit 1;)
 done
@@ -57,7 +57,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then
     # Enter IPEX parent dir
     cd ..
 
-    # Check existance of required Linux commands
+    # Check existence of required Linux commands
     for CMD in make git; do
         command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" is required."; exit 3;)
     done
diff --git a/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html b/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html
index 2dd4747bb..df0408eb3 100644
--- a/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html
+++ b/examples/cpu/usecase_spacenet5/20230303_consolvo_spacenet5_ipex.html
@@ -14455,7 +14455,7 @@