
Commit 7f31b0b

test: Add thread safety tests for CpuTranspose
Resolves: COMPMID-8391
Change-Id: Ie1f3319a6e6f56a5d324be5a88d1112fef4c39f9
Signed-off-by: Syed Wajahat Abbas Naqvi <[email protected]>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/15066
Tested-by: Arm Jenkins <[email protected]>
Benchmark: Arm Jenkins <[email protected]>
Reviewed-by: Dongsung Kim <[email protected]>
Comments-Addressed: Arm Jenkins <[email protected]>
Parent commit: 63f2a0d

File tree (3 files changed: 264 additions, 41 deletions)

tests/validation/fixtures/CpuTransposeFixture.h
tests/validation/reference/Permute.cpp
tests/validation/runtime/experimental/operators/CpuTranspose.cpp

tests/validation/fixtures/CpuTransposeFixture.h (156 additions, 36 deletions)

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Arm Limited.
+ * Copyright (c) 2024-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,78 +32,198 @@
 #include "tests/IAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/reference/Permute.h"
 
+#if !defined(BARE_METAL)
+#include <thread>
+#include <vector>
+#endif // !defined(BARE_METAL)
+
 namespace arm_compute
 {
 namespace test
 {
 namespace validation
 {
+namespace
+{
+constexpr int NUM_THREADS = 3;
+}// namespace
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class CpuTransposeValidationFixture : public framework::Fixture
+class CpuTransposeGenericFixture : public framework::Fixture
 {
 public:
-    void setup(TensorShape shape, DataType data_type)
+    void setup(TensorShape shape, DataType data_type, QuantizationInfo qinfo, TestType test_type = TestType::ConfigureOnceRunOnce)
     {
-        _target    = compute_target(shape, data_type);
-        _reference = compute_reference(shape, data_type);
+        if (std::is_same<TensorType, Tensor>::value && // Cpu
+            data_type == DataType::F16 && !CPUInfo::get().has_fp16())
+        {
+            return;
+        }
+        _test_type         = test_type;
+        _num_parallel_runs = (_test_type == TestType::ConfigureOnceRunMultiThreaded ? NUM_THREADS : 1);
+
+        compute_target(shape, data_type, qinfo);
+        compute_reference(shape, data_type, qinfo);
     }
 
protected:
     template <typename U>
     void fill(U &&tensor)
     {
-        library->fill_tensor_uniform(tensor, 0);
+        if(tensor.data_type() == DataType::F32)
+        {
+            std::uniform_real_distribution<float> distribution(-10.0f, 10.0f);
+            library->fill(tensor, distribution, 0);
+        }
+        else if(tensor.data_type() == DataType::F16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -10.0f, 10.0f };
+            library->fill(tensor, distribution, 0);
+        }
+        else if(!is_data_type_quantized(tensor.data_type()))
+        {
+            std::uniform_int_distribution<> distribution(0, 100);
+            library->fill(tensor, distribution, 0);
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, 0);
+        }
     }
 
-    TensorType compute_target(const TensorShape &shape, DataType data_type)
-    {
-        // Make rows the columns of the original shape
-        TensorShape output_shape{ shape[1], shape[0] };
+    void allocate_and_fill_tensors(TensorType *src, TensorType *dst){
+        for(int i = 0; i < _num_parallel_runs; ++i) {
 
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(output_shape, data_type);
+            ARM_COMPUTE_ASSERT(src[i].info()->is_resizable());
+            ARM_COMPUTE_ASSERT(dst[i].info()->is_resizable());
 
-        // Create and configure function
-        FunctionType trans_func;
-        trans_func.configure(src.info(), dst.info());
+            // Allocate tensors
+            src[i].allocator()->allocate();
+            dst[i].allocator()->allocate();
 
-        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!src[i].info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!dst[i].info()->is_resizable());
 
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
+            // Fill tensors
+            fill(AccessorType(src[i]));
+        }
+    }
 
-        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+    void compute_target(const TensorShape &shape, DataType data_type, QuantizationInfo qinfo)
+    {
+        // Create tensors
+        TensorType src[NUM_THREADS];
+        TensorType dst[NUM_THREADS];
+        TensorType *dst_ptrs[NUM_THREADS];
 
-        // Fill tensors
-        fill(AccessorType(src));
+        // Retain the shape but make rows the columns of the original shape
+        TensorShape output_shape = shape;
+        std::swap(output_shape[0], output_shape[1]);
 
-        // Compute function
-        ITensorPack run_pack{ { arm_compute::TensorType::ACL_SRC, &src }, { arm_compute::TensorType::ACL_DST, &dst } };
-        trans_func.run(run_pack);
+        for(int i = 0; i < _num_parallel_runs; ++i){
+            src[i]      = create_tensor<TensorType>(shape, data_type, 1, qinfo);
+            dst[i]      = create_tensor<TensorType>(output_shape, data_type, 1, qinfo);
+            dst_ptrs[i] = &dst[i];
+        }
 
-        return dst;
+        // Create and configure function
+        FunctionType trans_func;
+        trans_func.configure(src[0].info(), dst_ptrs[0]->info());
+
+        allocate_and_fill_tensors(src, dst);
+
+        if(_test_type == TestType::ConfigureOnceRunMultiThreaded)
+        {
+#ifndef BARE_METAL
+
+            ITensorPack run_pack[NUM_THREADS];
+            std::vector<std::thread> threads;
+
+            threads.reserve(_num_parallel_runs);
+            for(int i = 0; i < _num_parallel_runs; ++i)
+            {
+                // Compute function
+                run_pack[i] = { {arm_compute::TensorType::ACL_SRC, &src[i]},
+                                {arm_compute::TensorType::ACL_DST, dst_ptrs[i]}};
+
+                threads.emplace_back([&,i]
+                {
+                    trans_func.run(run_pack[i]);
+                    _target[i] = std::move(*(dst_ptrs[i]));
+                });
+            }
+            for(int i = 0; i < _num_parallel_runs; ++i)
+            {
+                threads[i].join();
+            }
+#endif // ifndef BARE_METAL
+        }
+        else
+        {
+            // Compute function
+            ITensorPack run_pack{{ arm_compute::TensorType::ACL_SRC, &src[0]},
+                                 {arm_compute::TensorType::ACL_DST, dst_ptrs[0]}};
+            trans_func.run(run_pack);
+            _target[0] = std::move(*(dst_ptrs[0]));
+        }
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type)
+    void compute_reference(const TensorShape &shape, DataType data_type, QuantizationInfo qinfo)
     {
         // Create reference
-        SimpleTensor<T> src{ shape, data_type };
+        SimpleTensor<T> src{shape, data_type, 1, qinfo};
+
+        for(int i = 0; i < _num_parallel_runs; ++i)
+        {
+            // Fill reference
+            fill(src);
+            _reference[i] = reference::permute<T>(src, PermutationVector(1U, 0U));
+        }
+    }
 
-        // Fill reference
-        fill(src);
+    TensorType      _target[NUM_THREADS];
+    SimpleTensor<T> _reference[NUM_THREADS];
+    TestType        _test_type{};
+    int             _num_parallel_runs{};
+};
 
-        return reference::permute<T>(src, PermutationVector(1U, 0U));
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CpuTransposeValidationFixture
+    : public CpuTransposeGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type)
+    {
+        CpuTransposeGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, QuantizationInfo());
     }
+};
 
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CpuTransposeThreadSafeValidationFixture
+    : public CpuTransposeGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type)
+    {
+        CpuTransposeGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, QuantizationInfo(),
+                                                                                     TestType::ConfigureOnceRunMultiThreaded);
+    }
 };
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CpuTransposeQuantizedThreadSafeValidationFixture
+    : public CpuTransposeGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo qinfo)
+    {
+        CpuTransposeGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, qinfo,
+                                                                                     TestType::ConfigureOnceRunMultiThreaded);
+    }
+};
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
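
The core scenario exercised by CpuTransposeGenericFixture is "configure once, run from multiple threads": a single experimental::op::CpuTranspose is configured against one tensor-info pair, and each thread then calls run() with its own ITensorPack. The minimal standalone sketch below shows that pattern outside the test framework; the include paths, the main() harness, and the 8x4 F32 shape are assumptions for illustration and are not part of the commit.

```cpp
// Minimal sketch of the "configure once, run multi-threaded" pattern tested above.
// Assumptions: header locations and the standalone main() harness; the operator,
// ITensorPack, and ACL_SRC/ACL_DST usage mirror the fixture code in the diff.
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuTranspose.h" // assumed path

#include <thread>
#include <vector>

int main()
{
    using namespace arm_compute;
    constexpr int num_threads = 3;

    // One tensor pair per thread; the operator itself is shared and configured once.
    Tensor src[num_threads];
    Tensor dst[num_threads];
    for (int i = 0; i < num_threads; ++i)
    {
        src[i].allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        dst[i].allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32));
    }

    experimental::op::CpuTranspose transpose;
    transpose.configure(src[0].info(), dst[0].info());

    for (int i = 0; i < num_threads; ++i)
    {
        src[i].allocator()->allocate();
        dst[i].allocator()->allocate();
    }

    // Each thread builds its own ITensorPack and calls run() on the shared operator.
    std::vector<std::thread> workers;
    for (int i = 0; i < num_threads; ++i)
    {
        workers.emplace_back([&, i] {
            ITensorPack pack{{TensorType::ACL_SRC, &src[i]}, {TensorType::ACL_DST, &dst[i]}};
            transpose.run(pack);
        });
    }
    for (auto &t : workers)
    {
        t.join();
    }
    return 0;
}
```

The fixture follows the same flow but keeps one output tensor and one reference result per run, so the test body can validate each thread's output independently.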

tests/validation/reference/Permute.cpp (2 additions, 1 deletion)

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019,2024 Arm Limited.
+ * Copyright (c) 2017-2019,2024-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,7 @@ template SimpleTensor<int8_t> permute(const SimpleTensor<int8_t> &src, Permuta
 template SimpleTensor<uint8_t> permute(const SimpleTensor<uint8_t> &src, PermutationVector perm);
 template SimpleTensor<int16_t> permute(const SimpleTensor<int16_t> &src, PermutationVector perm);
 template SimpleTensor<uint16_t> permute(const SimpleTensor<uint16_t> &src, PermutationVector perm);
+template SimpleTensor<int32_t> permute(const SimpleTensor<int32_t> &src, PermutationVector perm);
 template SimpleTensor<uint32_t> permute(const SimpleTensor<uint32_t> &src, PermutationVector perm);
 template SimpleTensor<float> permute(const SimpleTensor<float> &src, PermutationVector perm);
 template SimpleTensor<half> permute(const SimpleTensor<half> &src, PermutationVector perm);
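
The single added line is an explicit instantiation: reference::permute is defined in this .cpp file, so the new S32 thread-safety test would otherwise fail to link against permute<int32_t>. The generic sketch below (a hypothetical scale_by_two function, not ACL code) illustrates the same pattern under that assumption.

```cpp
// Hypothetical stand-in, not ACL code: a template defined in a .cpp file must be
// explicitly instantiated for every type used from other translation units.
#include <cstdint>
#include <vector>

// What the header would declare:
template <typename T>
std::vector<T> scale_by_two(const std::vector<T> &src);

// What the .cpp defines, plus one explicit instantiation per supported type.
template <typename T>
std::vector<T> scale_by_two(const std::vector<T> &src)
{
    std::vector<T> out(src.size());
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        out[i] = static_cast<T>(src[i] * 2);
    }
    return out;
}

template std::vector<int32_t> scale_by_two(const std::vector<int32_t> &src);
// Without the line above, a caller using int32_t in another .cpp gets a linker
// error, which is why permute<int32_t> gains its instantiation in this commit.
```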

tests/validation/runtime/experimental/operators/CpuTranspose.cpp (106 additions, 4 deletions)

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Arm Limited.
+ * Copyright (c) 2024-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,6 +40,12 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+using framework::dataset::make;
+
+} // namespace
+
 TEST_SUITE(NEON)
 TEST_SUITE(OPERATORS)
 
@@ -48,15 +54,111 @@ TEST_SUITE(CpuTranspose)
 template <typename T>
 using CpuTransposeFixture = CpuTransposeValidationFixture<Tensor, Accessor, experimental::op::CpuTranspose, T>;
 
+template <typename T>
+using CpuTransposeThreadSafeFixture =
+    CpuTransposeThreadSafeValidationFixture<Tensor, Accessor, experimental::op::CpuTranspose, T>;
+
+template <typename T>
+using CpuTransposeQuantizedThreadSafeFixture =
+    CpuTransposeQuantizedThreadSafeValidationFixture<Tensor, Accessor, experimental::op::CpuTranspose, T>;
+
 TEST_SUITE(U8)
 FIXTURE_DATA_TEST_CASE(SmokeTest, CpuTransposeFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(concat(datasets::Small1DShapes(), datasets::Small2DShapes()),
-                       framework::dataset::make("DataType", DataType::U8)))
+                       make("DataType", DataType::U8)))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    for(int i = 0; i < _num_parallel_runs; ++i)
+    {
+        validate(Accessor(_target[i]), _reference[i]);
+    }
 }
-TEST_SUITE_END() // U8
+TEST_SUITE_END() //U8
 
+#ifndef BARE_METAL
+TEST_SUITE(ThreadSafety)
+TEST_SUITE(Float)
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(ConfigureOnceUseFromDifferentThreads,
+                       CpuTransposeThreadSafeFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::Small2DShapes(), make("DataType", DataType::F32)))
+{
+    // Validate output
+    for(int i = 0; i < _num_parallel_runs; ++i)
+    {
+        validate(Accessor(_target[i]), _reference[i]);
+    }
+}
+TEST_SUITE_END() // F32
+#ifdef ARM_COMPUTE_ENABLE_FP16
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(ConfigureOnceUseFromDifferentThreads,
+                       CpuTransposeThreadSafeFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::Tiny4DShapes(), make("DataType", DataType::F16)))
+{
+    if (CPUInfo::get().has_fp16())
+    {
+        // Validate output
+        for(int i = 0; i < _num_parallel_runs; ++i)
+        {
+            validate(Accessor(_target[i]), _reference[i]);
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // F16
+#endif // ARM_COMPUTE_ENABLE_FP16
+TEST_SUITE_END() // Float
+TEST_SUITE(Integer)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(ConfigureOnceUseFromDifferentThreads,
+                       CpuTransposeThreadSafeFixture<int32_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::Tiny4DShapes(), make("DataType", DataType::S32)))
+{
+    // Validate output
+    for(int i = 0; i < _num_parallel_runs; ++i)
+    {
+        validate(Accessor(_target[i]), _reference[i]);
+    }
+}
+TEST_SUITE_END() // S32
+TEST_SUITE_END() // Integer
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(ConfigureOnceUseFromDifferentThreads,
+                       CpuTransposeQuantizedThreadSafeFixture<int8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::Tiny4DShapes(), make("DataType", DataType::QASYMM8_SIGNED), make("QuantizationInfoIn", {QuantizationInfo(0.5f, 0)})))
+{
+    // Validate output
+    for(int i = 0; i < _num_parallel_runs; ++i)
+    {
+        validate(Accessor(_target[i]), _reference[i]);
+    }
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(ConfigureOnceUseFromDifferentThreads,
+                       CpuTransposeQuantizedThreadSafeFixture<uint8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::Tiny4DShapes(), make("DataType", DataType::QASYMM8), make("QuantizationInfoIn", {QuantizationInfo(0.5f, 0)})))
+{
+    // Validate output
+    for(int i = 0; i < _num_parallel_runs; ++i)
+    {
+        validate(Accessor(_target[i]), _reference[i]);
+    }
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // ThreadSafety
+#endif // #ifndef BARE_METAL
 TEST_SUITE_END() // CpuTranspose
 
 TEST_SUITE_END() // OPERATORS
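
Each FIXTURE_DATA_TEST_CASE above expands to roughly: for every entry of the combined dataset, construct the fixture, call its setup(), then run the test body with the fixture's members (_target, _reference, _num_parallel_runs) in scope. The sketch below is a simplified, framework-free model of that flow using stand-in types; it is not the real ACL test framework.

```cpp
// Simplified model (not the ACL test framework): shows how the per-dataset-entry
// setup() plus the "validate every parallel run" body in the cases above fit together.
#include <array>
#include <cassert>
#include <cstdio>

struct MockThreadSafeFixture
{
    static constexpr int NUM_THREADS = 3;

    void setup(int shape_elems)
    {
        _num_parallel_runs = NUM_THREADS;
        for (int i = 0; i < _num_parallel_runs; ++i)
        {
            _target[i]    = shape_elems; // stand-in for one thread's operator output
            _reference[i] = shape_elems; // stand-in for the matching reference result
        }
    }

    std::array<int, NUM_THREADS> _target{};
    std::array<int, NUM_THREADS> _reference{};
    int                          _num_parallel_runs{};
};

int main()
{
    const int dataset[] = {16, 64, 256}; // stand-in for combine(Small2DShapes(), DataType)
    for (int shape_elems : dataset)
    {
        MockThreadSafeFixture fixture;
        fixture.setup(shape_elems);

        // Test body: validate the output of every parallel run, as in the cases above.
        for (int i = 0; i < fixture._num_parallel_runs; ++i)
        {
            assert(fixture._target[i] == fixture._reference[i]);
        }
    }
    std::puts("all mock cases passed");
    return 0;
}
```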
