Commit c5d85fe

Merge pull request bytedance#103 from bytedance/jzs/fix_micro_perf
[micro_perf] use less memory to avoid using L2 Cache; fix bugs
2 parents: 22e9b39 + 8f5ae7c

3 files changed: +27 −31

byte_micro_perf/backends/GPU/backend_gpu.py (+22 −5)

@@ -211,10 +211,17 @@ def build_tensor(self, input_shapes, dtype):
         bytes_per_cnt = dtype_size * element_num


-        # compute max avail tensors for compute
-        avail_bytes = (self.memory_limit - 4) * 1024**3
-        avail_cnts = avail_bytes // bytes_per_cnt
-        max_data_cnt = min(self.iterations, avail_cnts)
+        # avoid hitting the L2 cache: assume 256 MB for now
+        # bytes_per_cnt > 256 MB: a single buffer is enough
+        # bytes_per_cnt < 256 MB: malloc multiple buffers to exceed 256 MB, then use the first and last
+
+        assume_l2_cache_size = 256 * 1024**2
+        if bytes_per_cnt > self.memory_limit * 0.9 * 1024**3:
+            return [], 0, bytes_per_cnt
+        elif bytes_per_cnt > assume_l2_cache_size:
+            max_data_cnt = 1
+        else:
+            max_data_cnt = math.ceil(assume_l2_cache_size / bytes_per_cnt)

         # create input tensors for each op
         input_tensors_list = []
@@ -241,7 +248,17 @@ def build_tensor(self, input_shapes, dtype):
             self.op.process_inputs(*(input_tensor))
             for input_tensor in input_tensors_list
         ]
-        return input_tensors_list, max_data_cnt, bytes_per_cnt
+
+        if max_data_cnt > 2:
+            max_data_cnt = 2
+            new_tensor_list = []
+            new_tensor_list.append(input_tensors_list[0])
+            new_tensor_list.append(input_tensors_list[-1])
+        else:
+            new_tensor_list = input_tensors_list
+
+        return new_tensor_list, max_data_cnt, bytes_per_cnt
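Net effect of the two hunks: instead of filling most of device memory with input copies, build_tensor now allocates just enough distinct buffers to spill past an assumed 256 MB L2 cache, then keeps only the first and last of them, so at most two buffers stay live. A standalone sketch of the sizing rule follows; the function name and the 80 GB default are illustrative (the real code reads self.memory_limit), while the 0.9 headroom factor and the 256 MB assumption come from the diff above.

    import math

    def choose_buffer_count(bytes_per_cnt: int,
                            memory_limit_gb: int = 80,
                            l2_cache_bytes: int = 256 * 1024**2) -> int:
        # sketch of the rule above; names and defaults are illustrative
        if bytes_per_cnt > memory_limit_gb * 0.9 * 1024**3:
            return 0   # one buffer would not even fit: caller skips this shape
        if bytes_per_cnt > l2_cache_bytes:
            return 1   # a single buffer already exceeds the assumed L2 size
        # enough buffers that their combined footprint spills past L2
        return math.ceil(l2_cache_bytes / bytes_per_cnt)

    # e.g. 4 MB per input set -> 64 buffers are allocated, of which only
    # the first and last are kept, so live memory is ~8 MB rather than
    # the tens of GB the old (memory_limit - 4) sizing could reach
    print(choose_buffer_count(4 * 1024**2))   # 64

Keeping only the outermost pair presumably preserves the address spread of the full allocation while meeting the commit's stated goal of using less memory.
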
byte_micro_perf/backends/backend.py (+3 −20)

@@ -222,30 +222,17 @@ def perf(self, input_shapes: List[List[int]], dtype):

         if tensor_cnt > 0:
             try:
-                # random select input tensors
-                input_index_list = [
-                    random.randint(0, tensor_cnt - 1) for _ in range(self.iterations)
-                ]
-
                 # warmup
                 num_warm_up = 10
                 for _ in range(num_warm_up):
                     self._run_operation(self.op, tensor_list[0])

-
-                # ccl ops need barrier
-                if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
-                    self.barier()
-
                 # test perf
-                num_test_perf = 5
+                num_test_perf = 10
                 self.device_synchronize()
                 start_time = time.perf_counter_ns()
                 for i in range(num_test_perf):
-                    self._run_operation(
-                        self.op,
-                        tensor_list[input_index_list[i]]
-                    )
+                    self._run_operation(self.op, tensor_list[0])
                 self.device_synchronize()
                 end_time = time.perf_counter_ns()

@@ -257,7 +244,6 @@ def perf(self, input_shapes: List[List[int]], dtype):
                 else:
                     prefer_iterations = min(max(int(max_perf_seconds // op_duration), 10), self.iterations)

-
                 # ccl ops need barrier
                 if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
                     self.barier()
@@ -266,10 +252,7 @@ def perf(self, input_shapes: List[List[int]], dtype):
                 self.device_synchronize()
                 start_time = time.perf_counter_ns()
                 for i in range(prefer_iterations):
-                    self._run_operation(
-                        self.op,
-                        tensor_list[input_index_list[i]]
-                    )
+                    self._run_operation(self.op, tensor_list[i % tensor_cnt])
                 self.device_synchronize()
                 end_time = time.perf_counter_ns()
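After this change, perf() runs in three phases: a 10-call warmup on a single buffer, a calibration pass (now 10 timed calls instead of 5, also on buffer 0) to estimate the op's duration, and the measured run, which cycles round-robin through the prepared buffers via i % tensor_cnt instead of through a precomputed random index list. A condensed sketch of the pattern, where run_op, sync, and max_perf_seconds stand in for self._run_operation, self.device_synchronize, and the time budget used in the surrounding code:

    import time

    def measure_op(run_op, tensor_list, iterations, sync, max_perf_seconds=10.0):
        # warmup: 10 calls on the same buffer, as in the diff
        for _ in range(10):
            run_op(tensor_list[0])

        # calibration: estimate one op's duration from 10 timed calls
        sync()
        start = time.perf_counter_ns()
        for _ in range(10):
            run_op(tensor_list[0])
        sync()
        op_duration = (time.perf_counter_ns() - start) / 10 / 1e9

        # fit the iteration count to the time budget (mirrors the context lines)
        prefer_iterations = min(max(int(max_perf_seconds // op_duration), 10), iterations)

        # measured run: alternate buffers so consecutive calls touch
        # different memory (tensor_list holds at most two buffers here)
        sync()
        start = time.perf_counter_ns()
        for i in range(prefer_iterations):
            run_op(tensor_list[i % len(tensor_list)])
        sync()
        return (time.perf_counter_ns() - start) / prefer_iterations / 1e9

Note that the barrier before the measured run is kept (second hunk), so collective ops still synchronize ranks there; only the redundant barrier before the calibration pass was dropped.
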
byte_micro_perf/backends/module_store.py (+2 −6)

@@ -183,9 +183,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
         device_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device=xpu_device)
         return [host_tensor, device_tensor]

-    def forward(self, input_tensors):
-        host_tensor = input_tensors[0]
-        device_tensor = input_tensors[1]
+    def forward(self, host_tensor, device_tensor):
         device_tensor.copy_(host_tensor, non_blocking=True)
         return device_tensor

@@ -199,9 +197,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
         host_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device="cpu")
         return [device_tensor, host_tensor]

-    def forward(self, input_tensors):
-        device_tensor = input_tensors[0]
-        host_tensor= input_tensors[1]
+    def forward(self, device_tensor, host_tensor):
         host_tensor.copy_(device_tensor, non_blocking=True)
         return host_tensor
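Both forward() methods now take their tensors as separate positional arguments instead of a single list, matching how the backend unpacks prepared inputs (compare self.op.process_inputs(*(input_tensor)) in backend_gpu.py; _run_operation presumably unpacks the same way). A minimal sketch of the new calling convention, with an illustrative class name and shapes:

    import torch

    class Device2HostOp(torch.nn.Module):
        # after the change, tensors arrive unpacked rather than as one
        # list indexed inside forward()
        def forward(self, device_tensor, host_tensor):
            host_tensor.copy_(device_tensor, non_blocking=True)
            return host_tensor

    op = Device2HostOp()
    tensors = [torch.randn(1024, 1024),              # stands in for an xpu tensor
               torch.empty(1024, 1024, device="cpu")]
    result = op(*tensors)   # new style: op(*tensor_list)
    # old style would have been op(tensors), with forward() doing the indexing
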