ARM-software
diff --git a/‎README.md‎
Lines changed: 11 additions & 0 deletions b/‎README.md‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎SConscript‎
Lines changed: 9 additions & 4 deletions b/‎SConscript‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎SConstruct‎
Lines changed: 8 additions & 11 deletions b/‎SConstruct‎
Lines changed: 8 additions & 11 deletions
diff --git a/‎arm_compute/core/CL/CLKernelLibrary.h‎
Lines changed: 5 additions & 0 deletions b/‎arm_compute/core/CL/CLKernelLibrary.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎arm_compute/core/CL/CLKernels.h‎
Lines changed: 7 additions & 2 deletions b/‎arm_compute/core/CL/CLKernels.h‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎arm_compute/core/CL/CLTypes.h‎
Lines changed: 20 additions & 1 deletion b/‎arm_compute/core/CL/CLTypes.h‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎arm_compute/core/CL/ICLKernel.h‎
Lines changed: 28 additions & 0 deletions b/‎arm_compute/core/CL/ICLKernel.h‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎arm_compute/core/CL/ICLTensor.h‎
Lines changed: 8 additions & 1 deletion b/‎arm_compute/core/CL/ICLTensor.h‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h‎
Lines changed: 55 additions & 0 deletions b/‎arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h‎
Lines changed: 55 additions & 0 deletions
@@ -12,17 +12,26 @@ Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
 
 News:
 
+- [Gian Marco's talk on Performance Analysis for Optimizing Embedded Deep Learning Inference Software](https://www.embedded-vision.com/platinum-members/arm/embedded-vision-training/videos/pages/may-2019-embedded-vision-summit)
 - [Gian Marco's talk on optimizing CNNs with Winograd algorithms at the EVS](https://www.embedded-vision.com/platinum-members/arm/embedded-vision-training/videos/pages/may-2018-embedded-vision-summit-iodice)
 - [Gian Marco's talk on using SGEMM and FFTs to Accelerate Deep Learning](https://www.embedded-vision.com/platinum-members/arm/embedded-vision-training/videos/pages/may-2016-embedded-vision-summit-iodice)
 
 Related projects:
 
 - [Arm NN SDK](https://github.com/arm-software/armnn)
+
+Tutorials:
+
 - [Tutorial: Cartoonifying Images on Raspberry Pi with the Compute Library](https://community.arm.com/graphics/b/blog/posts/cartoonifying-images-on-raspberry-pi-with-the-compute-library)
 - [Tutorial: Running AlexNet on Raspberry Pi with Compute Library](https://community.arm.com/processors/b/blog/posts/running-alexnet-on-raspberry-pi-with-compute-library)
 
+Blogs:
+
+- [Happy Birthday ACL!](https://community.arm.com/developer/tools-software/graphics/b/blog/posts/arm-compute-library-19-05-is-coming)
+
 Documentation available here:
 
+- [v19.08](https://arm-software.github.io/ComputeLibrary/v19.08/)
 - [v19.05](https://arm-software.github.io/ComputeLibrary/v19.05/)
 - [v19.02](https://arm-software.github.io/ComputeLibrary/v19.02/)
 - [v18.11](https://arm-software.github.io/ComputeLibrary/v18.11/index.xhtml)
@@ -41,6 +50,8 @@ Documentation available here:
 
 Binaries available here:
 
+- [v19.08-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08/arm_compute-v19.08-bin-linux.tar.gz)
+- [v19.08-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08/arm_compute-v19.08-bin-android.tar.gz)
 - [v19.05-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.05/arm_compute-v19.05-bin-linux.tar.gz)
 - [v19.05-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.05/arm_compute-v19.05-bin-android.tar.gz)
 - [v19.02-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.02/arm_compute-v19.02-bin-linux.tar.gz)
 
@@ -24,8 +24,8 @@ import os.path
 import re
 import subprocess
 
-VERSION = "v19.05"
-SONAME_VERSION="15.0.0"
+VERSION = "v19.08"
+SONAME_VERSION="16.0.0"
 
 Import('env')
 Import('vars')
@@ -164,6 +164,7 @@ core_files += Glob('src/core/CPP/kernels/*.cpp')
 core_files += Glob('src/core/utils/helpers/*.cpp')
 core_files += Glob('src/core/utils/io/*.cpp')
 core_files += Glob('src/core/utils/quantization/*.cpp')
+core_files += Glob('src/core/utils/misc/*.cpp')
 if env["logging"]:
     core_files += Glob('src/core/utils/logging/*.cpp')
 
@@ -187,6 +188,7 @@ if env['opencl']:
     core_files += Glob('src/core/CL/*.cpp')
     core_files += Glob('src/core/CL/kernels/*.cpp')
     core_files += Glob('src/core/CL/gemm/*.cpp')
+    core_files += Glob('src/core/CL/gemm/native/*.cpp')
     core_files += Glob('src/core/CL/gemm/reshaped/*.cpp')
     core_files += Glob('src/core/CL/gemm/reshaped_only_rhs/*.cpp')
 
@@ -204,10 +206,13 @@ if env['neon']:
 
     core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp')
 
-    # build winograd sources for either v7a / v8a
+    # build winograd/depthwise sources for either v7a / v8a
     core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp')
     core_files += Glob('src/core/NEON/kernels/convolution/winograd/*/*.cpp')
-    arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/winograd/","arm_compute/core/NEON/kernels/convolution/common/" , "arm_compute/core/NEON/kernels/assembly/"])
+    arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/common/",
+                                      "arm_compute/core/NEON/kernels/convolution/winograd/",
+                                      "arm_compute/core/NEON/kernels/convolution/depthwise/",
+                                      "arm_compute/core/NEON/kernels/assembly/"])
 
     graph_files += Glob('src/graph/backends/NEON/*.cpp')
 
 
@@ -145,7 +145,7 @@ if env['os'] == 'android' and ( 'clang++' not in cpp_compiler or 'clang' not in
 if 'clang++' in cpp_compiler:
     env.Append(CXXFLAGS = ['-Wno-format-nonliteral','-Wno-deprecated-increment-bool','-Wno-vla-extension','-Wno-mismatched-tags'])
 else:
-    env.Append(CXXFLAGS = ['-Wlogical-op','-Wnoexcept','-Wstrict-null-sentinel','-Wno-implicit-fallthrough'])
+    env.Append(CXXFLAGS = ['-Wlogical-op','-Wnoexcept','-Wstrict-null-sentinel', '-Wno-redundant-move'])
 
 if env['cppthreads']:
     env.Append(CPPDEFINES = [('ARM_COMPUTE_CPP_SCHEDULER', 1)])
@@ -185,18 +185,15 @@ elif env['arch'] == 'arm64-v8a':
         env.Append(CXXFLAGS = ['-no-integrated-as'])
 elif 'arm64-v8.2-a' in env['arch']:
     if env['arch'] == 'arm64-v8.2-a-sve':
-        if env['os'] != 'bare_metal':
-            print("Only bare metal SVE is supported at the moment")
-            Exit(1)
         env.Append(CXXFLAGS = ['-march=armv8.2-a+sve+fp16+dotprod'])
     else:
         env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
-        if env['os'] == 'linux':
-            prefix = "aarch64-linux-gnu-"
-        elif env['os'] == 'bare_metal':
-            prefix = "aarch64-elf-"
-        elif env['os'] == 'android':
-            prefix = "aarch64-linux-android-"
+    if env['os'] == 'linux':
+        prefix = "aarch64-linux-gnu-"
+    elif env['os'] == 'bare_metal':
+        prefix = "aarch64-elf-"
+    elif env['os'] == 'android':
+        prefix = "aarch64-linux-android-"
     env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8_2','NO_DOT_IN_TOOLCHAIN'])
     if 'clang++' in cpp_compiler:
         env.Append(CXXFLAGS = ['-no-integrated-as'])
@@ -282,7 +279,7 @@ if env['debug']:
     env.Append(CXXFLAGS = ['-O0','-g','-gdwarf-2'])
     env.Append(CPPDEFINES = ['ARM_COMPUTE_DEBUG_ENABLED'])
 else:
-    env.Append(CXXFLAGS = ['-O3','-ftree-vectorize'])
+    env.Append(CXXFLAGS = ['-O3'])
 
 if env['asserts']:
     env.Append(CPPDEFINES = ['ARM_COMPUTE_ASSERTS_ENABLED'])
 
@@ -297,6 +297,11 @@ class CLKernelLibrary
      * @return The content of CL_DEVICE_VERSION
      */
     std::string get_device_version();
+    /** Return the maximum number of compute units in the device
+     *
+     * @return The content of CL_DEVICE_MAX_COMPUTE_UNITS
+     */
+    cl_uint get_num_compute_units();
     /** Creates a kernel from the kernel library.
      *
      * @param[in] kernel_name       Kernel name.
 
@@ -28,6 +28,7 @@
 #include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
 #include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
@@ -51,6 +52,7 @@
 #include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h"
@@ -73,25 +75,26 @@
 #include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLFloorKernel.h"
 #include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/CL/kernels/CLGatherKernel.h"
 #include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
@@ -109,6 +112,7 @@
 #include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 #include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
 #include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
@@ -138,6 +142,7 @@
 #include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
 #include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_CL_TYPES_H__
 #define __ARM_COMPUTE_CL_TYPES_H__
 
+#include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/GPUTarget.h"
 
 #include <string>
@@ -53,5 +54,23 @@ struct CLDeviceOptions
     size_t      num_cores;   /**< Number of cores */
     size_t      cache_size;  /**< Cache size */
 };
+
+/** OpenCL quantization data: holds pointers to the scale and offset arrays (the struct only stores them) */
+struct CLQuantization
+{
+    /** Default Constructor: both array pointers are initialised to nullptr */
+    CLQuantization()
+        : scale(nullptr), offset(nullptr) {}
+    /** Constructor
+     *
+     * @param[in] scale  OpenCL scale array
+     * @param[in] offset OpenCL offset array
+     */
+    CLQuantization(const ICLFloatArray *scale, const ICLInt32Array *offset)
+        : scale(scale), offset(offset) {}
+
+    const ICLFloatArray *scale;  /**< Quantization scale array (nullptr when default-constructed) */
+    const ICLInt32Array *offset; /**< Quantization offset array (nullptr when default-constructed) */
+};
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CL_TYPES_H__ */
@@ -111,6 +111,20 @@ class ICLKernel : public IKernel
     {
         add_tensor_argument<1>(idx, tensor, window);
     }
+    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
+     *
+     * @param[in]     cond   Condition to check. When false, nothing is added and idx is left unchanged.
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        if(cond)
+        {
+            add_1D_tensor_argument(idx, tensor, window);
+        }
+    }
     /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -121,6 +135,20 @@ class ICLKernel : public IKernel
     {
         add_tensor_argument<2>(idx, tensor, window);
     }
+    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
+     *
+     * @param[in]     cond   Condition to check. When false, nothing is added and idx is left unchanged.
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        if(cond)
+        {
+            add_2D_tensor_argument(idx, tensor, window);
+        }
+    }
     /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,8 @@
 
 #include "arm_compute/core/ITensor.h"
 
+#include "arm_compute/core/CL/CLTypes.h"
+
 #include <cstdint>
 
 namespace cl
@@ -53,6 +55,11 @@ class ICLTensor : public ITensor
     /** Default virtual destructor. */
     virtual ~ICLTensor() = default;
 
+    /** Interface to be implemented by the child class to return the wrapped quantization info data
+     *
+     * @return A wrapped quantization info object.
+     */
+    virtual CLQuantization quantization() const = 0;
     /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.
      *
      * @return A reference to an OpenCL buffer containing the image's data.
 
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H__
+#define __ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H__
+
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Factory class that creates the CLGEMMNative kernel configuration object for a GPU target */
+class CLGEMMNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct the CLGEMMNative kernel configuration object according to the GPU architecture
+     *
+     * @param[in] arch GPU target
+     *
+     * @return Owning pointer to the CLGEMMNative kernel configuration (currently always the Bifrost implementation)
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget arch)
+    {
+        // Note: At the moment only the Bifrost configuration exists, so it is returned for every GPU target.
+        // Ideally there should be a dedicated path for each GPU architecture, selected using get_arch_from_target(arch)
+        return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationBifrost>(arch);
+    }
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H__ */