Commit c5d85fe

Merge pull request bytedance#103 from bytedance/jzs/fix_micro_perf
[micro_perf] use less memory to avoid using L2 Cache; fix bugs
2 parents: 22e9b39 + 8f5ae7c

3 files changed: +27 −31

byte_micro_perf/backends/GPU/backend_gpu.py (+22 −5)

@@ -211,10 +211,17 @@ def build_tensor(self, input_shapes, dtype):
         bytes_per_cnt = dtype_size * element_num


-        # compute max avail tensors for compute
-        avail_bytes = (self.memory_limit - 4) * 1024**3
-        avail_cnts = avail_bytes // bytes_per_cnt
-        max_data_cnt = min(self.iterations, avail_cnts)
+        # avoid hitting the L2 cache: assume 256 MB for now
+        # bytes_per_cnt > 256 MB: a single buffer is enough
+        # bytes_per_cnt < 256 MB: malloc multiple buffers to exceed 256 MB, then use the first and last
+
+        assume_l2_cache_size = 256 * 1024**2
+        if bytes_per_cnt > self.memory_limit * 0.9 * 1024**3:
+            return [], 0, bytes_per_cnt
+        elif bytes_per_cnt > assume_l2_cache_size:
+            max_data_cnt = 1
+        else:
+            max_data_cnt = math.ceil(assume_l2_cache_size / bytes_per_cnt)

         # create input tensors for each op
         input_tensors_list = []
@@ -241,7 +248,17 @@ def build_tensor(self, input_shapes, dtype):
             self.op.process_inputs(*(input_tensor))
             for input_tensor in input_tensors_list
         ]
-        return input_tensors_list, max_data_cnt, bytes_per_cnt
+
+        if max_data_cnt > 2:
+            max_data_cnt = 2
+            new_tensor_list = []
+            new_tensor_list.append(input_tensors_list[0])
+            new_tensor_list.append(input_tensors_list[-1])
+        else:
+            new_tensor_list = input_tensors_list
+
+        return new_tensor_list, max_data_cnt, bytes_per_cnt
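Net effect of the two hunks: instead of filling most of device memory with input copies, build_tensor now allocates just enough distinct buffers to spill past an assumed 256 MB L2 cache, then keeps only the first and last of them, so at most two buffers stay live. A standalone sketch of the sizing rule follows; the function name and the 80 GB default are illustrative (the real code reads self.memory_limit), while the 0.9 headroom factor and the 256 MB assumption come from the diff above.

    import math

    def choose_buffer_count(bytes_per_cnt: int,
                            memory_limit_gb: int = 80,
                            l2_cache_bytes: int = 256 * 1024**2) -> int:
        # sketch of the rule above; names and defaults are illustrative
        if bytes_per_cnt > memory_limit_gb * 0.9 * 1024**3:
            return 0   # one buffer would not even fit: caller skips this shape
        if bytes_per_cnt > l2_cache_bytes:
            return 1   # a single buffer already exceeds the assumed L2 size
        # enough buffers that their combined footprint spills past L2
        return math.ceil(l2_cache_bytes / bytes_per_cnt)

    # e.g. 4 MB per input set -> 64 buffers are allocated, of which only
    # the first and last are kept, so live memory is ~8 MB rather than
    # the tens of GB the old (memory_limit - 4) sizing could reach
    print(choose_buffer_count(4 * 1024**2))   # 64

Keeping only the outermost pair presumably preserves the address spread of the full allocation while meeting the commit's stated goal of using less memory.
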
byte_micro_perf/backends/backend.py (+3 −20)

@@ -222,30 +222,17 @@ def perf(self, input_shapes: List[List[int]], dtype):

         if tensor_cnt > 0:
             try:
-                # random select input tensors
-                input_index_list = [
-                    random.randint(0, tensor_cnt - 1) for _ in range(self.iterations)
-                ]
-
                 # warmup
                 num_warm_up = 10
                 for _ in range(num_warm_up):
                     self._run_operation(self.op, tensor_list[0])

-
-                # ccl ops need barrier
-                if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
-                    self.barier()
-
                 # test perf
-                num_test_perf = 5
+                num_test_perf = 10
                 self.device_synchronize()
                 start_time = time.perf_counter_ns()
                 for i in range(num_test_perf):
-                    self._run_operation(
-                        self.op,
-                        tensor_list[input_index_list[i]]
-                    )
+                    self._run_operation(self.op, tensor_list[0])
                 self.device_synchronize()
                 end_time = time.perf_counter_ns()

@@ -257,7 +244,6 @@ def perf(self, input_shapes: List[List[int]], dtype):
                 else:
                     prefer_iterations = min(max(int(max_perf_seconds // op_duration), 10), self.iterations)

-
                 # ccl ops need barrier
                 if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
                     self.barier()
@@ -266,10 +252,7 @@ def perf(self, input_shapes: List[List[int]], dtype):
                 self.device_synchronize()
                 start_time = time.perf_counter_ns()
                 for i in range(prefer_iterations):
-                    self._run_operation(
-                        self.op,
-                        tensor_list[input_index_list[i]]
-                    )
+                    self._run_operation(self.op, tensor_list[i % tensor_cnt])
                 self.device_synchronize()
                 end_time = time.perf_counter_ns()
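After this change, perf() runs in three phases: a 10-call warmup on a single buffer, a calibration pass (now 10 timed calls instead of 5, also on buffer 0) to estimate the op's duration, and the measured run, which cycles round-robin through the prepared buffers via i % tensor_cnt instead of through a precomputed random index list. A condensed sketch of the pattern, where run_op, sync, and max_perf_seconds stand in for self._run_operation, self.device_synchronize, and the time budget used in the surrounding code:

    import time

    def measure_op(run_op, tensor_list, iterations, sync, max_perf_seconds=10.0):
        # warmup: 10 calls on the same buffer, as in the diff
        for _ in range(10):
            run_op(tensor_list[0])

        # calibration: estimate one op's duration from 10 timed calls
        sync()
        start = time.perf_counter_ns()
        for _ in range(10):
            run_op(tensor_list[0])
        sync()
        op_duration = (time.perf_counter_ns() - start) / 10 / 1e9

        # fit the iteration count to the time budget (mirrors the context lines)
        prefer_iterations = min(max(int(max_perf_seconds // op_duration), 10), iterations)

        # measured run: alternate buffers so consecutive calls touch
        # different memory (tensor_list holds at most two buffers here)
        sync()
        start = time.perf_counter_ns()
        for i in range(prefer_iterations):
            run_op(tensor_list[i % len(tensor_list)])
        sync()
        return (time.perf_counter_ns() - start) / prefer_iterations / 1e9

Note that the barrier before the measured run is kept (second hunk), so collective ops still synchronize ranks there; only the redundant barrier before the calibration pass was dropped.
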
byte_micro_perf/backends/module_store.py (+2 −6)

@@ -183,9 +183,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
         device_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device=xpu_device)
         return [host_tensor, device_tensor]

-    def forward(self, input_tensors):
-        host_tensor = input_tensors[0]
-        device_tensor = input_tensors[1]
+    def forward(self, host_tensor, device_tensor):
         device_tensor.copy_(host_tensor, non_blocking=True)
         return device_tensor

@@ -199,9 +197,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
         host_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device="cpu")
         return [device_tensor, host_tensor]

-    def forward(self, input_tensors):
-        device_tensor = input_tensors[0]
-        host_tensor= input_tensors[1]
+    def forward(self, device_tensor, host_tensor):
         host_tensor.copy_(device_tensor, non_blocking=True)
         return host_tensor
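Both forward() methods now take their tensors as separate positional arguments instead of a single list, matching how the backend unpacks prepared inputs (compare self.op.process_inputs(*(input_tensor)) in backend_gpu.py; _run_operation presumably unpacks the same way). A minimal sketch of the new calling convention, with an illustrative class name and shapes:

    import torch

    class Device2HostOp(torch.nn.Module):
        # after the change, tensors arrive unpacked rather than as one
        # list indexed inside forward()
        def forward(self, device_tensor, host_tensor):
            host_tensor.copy_(device_tensor, non_blocking=True)
            return host_tensor

    op = Device2HostOp()
    tensors = [torch.randn(1024, 1024),              # stands in for an xpu tensor
               torch.empty(1024, 1024, device="cpu")]
    result = op(*tensors)   # new style: op(*tensor_list)
    # old style would have been op(tensors), with forward() doing the indexing
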