⚡️ Speed up method LoRALayer.calc_size by 38%
#139
📄 38% (0.38x) speedup for `LoRALayer.calc_size` in `invokeai/backend/patches/layers/lora_layer.py`

⏱️ Runtime: 97.8 microseconds → 70.9 microseconds (best of 44 runs)

📝 Explanation and details
The optimization achieves a 37% speedup by inlining the tensor size calculation and eliminating function call overhead in the `calc_tensors_size` utility function.

Key optimization: the original code used a generator expression with `sum()` that called `calc_tensor_size()` for each tensor; the optimized version replaces this with a direct loop that inlines the tensor size calculation (an illustrative sketch follows).
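The report's original before/after snippets are not reproduced above, so the following is an illustrative reconstruction based on the description and the `calc_tensor_size` helper shown in the tests below, not the exact code from `lora_layer.py`:

```python
import torch


def calc_tensor_size(t: torch.Tensor) -> int:
    """Size of a single tensor in bytes."""
    return t.nelement() * t.element_size()


# Before (illustrative): a generator expression that calls the helper once per tensor.
def calc_tensors_size_original(tensors: list[torch.Tensor]) -> int:
    return sum(calc_tensor_size(t) for t in tensors)


# After (illustrative): a plain loop with the byte-size computation inlined,
# avoiding one Python function call per tensor.
def calc_tensors_size_optimized(tensors: list[torch.Tensor]) -> int:
    total = 0
    for t in tensors:
        total += t.nelement() * t.element_size()
    return total
```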
Why this is faster:
- Instead of calling `calc_tensor_size()` for each tensor (which involves Python function call stack operations), the calculation is performed directly inline.
- The explicit loop also avoids the generator expression and `sum()` overhead in Python.

Performance characteristics from tests: the generated regression tests below report speedups of roughly 30-46% across tensor shapes, dtypes, and None/empty combinations.
This optimization is especially valuable because `calc_size()` methods are likely called frequently during model loading, memory management, and optimization passes in ML frameworks like InvokeAI, where LoRA layers are commonly used for efficient model adaptation.

✅ Correctness verification report:
⚙️ Existing Unit Tests and Runtime
backend/patches/layers/test_lora_layer.py::test_lora_layer_calc_size

🌀 Generated Regression Tests and Runtime
from typing import Optional
# imports
import pytest
import torch
from invokeai.backend.patches.layers.lora_layer import LoRALayer
def calc_tensor_size(t: torch.Tensor) -> int:
"""Calculate the size of a tensor in bytes."""
return t.nelement() * t.element_size()
from invokeai.backend.patches.layers.lora_layer import LoRALayer
# ------------------------ UNIT TESTS ------------------------
# ----------- BASIC TEST CASES -----------
def test_calc_size_basic_all_float32():
"""Test with small float32 tensors for up, mid, down, and bias."""
up = torch.ones((2, 3), dtype=torch.float32)
mid = torch.ones((3, 4), dtype=torch.float32)
down = torch.ones((4, 2), dtype=torch.float32)
bias = torch.ones((2,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
# Each float32 element is 4 bytes
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 4.42μs -> 3.20μs (38.1% faster)
def test_calc_size_basic_none_mid():
"""Test with mid=None, up/down/bias present."""
up = torch.ones((2, 2), dtype=torch.float32)
mid = None
down = torch.ones((2, 2), dtype=torch.float32)
bias = torch.ones((2,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=0.5, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 3.78μs -> 2.72μs (39.0% faster)
def test_calc_size_basic_none_bias():
"""Test with bias=None, up/mid/down present."""
up = torch.ones((2, 2), dtype=torch.float32)
mid = torch.ones((2, 2), dtype=torch.float32)
down = torch.ones((2, 2), dtype=torch.float32)
bias = None
layer = LoRALayer(up, mid, down, alpha=2.0, bias=bias)
expected = (
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 3.52μs -> 2.48μs (41.9% faster)
def test_calc_size_basic_none_mid_and_bias():
"""Test with both mid and bias as None."""
up = torch.ones((2, 2), dtype=torch.float32)
mid = None
down = torch.ones((2, 2), dtype=torch.float32)
bias = None
layer = LoRALayer(up, mid, down, alpha=None, bias=bias)
expected = (
calc_tensor_size(up) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 3.50μs -> 2.42μs (44.7% faster)
def test_calc_size_basic_different_dtypes():
"""Test with tensors of different dtypes."""
up = torch.ones((2, 2), dtype=torch.float64) # 8 bytes per element
mid = torch.ones((2, 2), dtype=torch.int32) # 4 bytes per element
down = torch.ones((2, 2), dtype=torch.int8) # 1 byte per element
bias = torch.ones((2,), dtype=torch.float16) # 2 bytes per element
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 3.95μs -> 2.75μs (43.3% faster)
# ----------- EDGE TEST CASES -----------
def test_calc_size_edge_empty_tensors():
"""Test with tensors that have zero elements (empty shape)."""
up = torch.empty((0, 2), dtype=torch.float32)
mid = torch.empty((2, 0), dtype=torch.float32)
down = torch.empty((0, 0), dtype=torch.float32)
bias = torch.empty((0,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = 0
codeflash_output = layer.calc_size() # 4.53μs -> 3.33μs (35.8% faster)
def test_calc_size_edge_single_element_tensors():
"""Test with tensors containing a single element."""
up = torch.ones((1,), dtype=torch.float32)
mid = torch.ones((1,), dtype=torch.float32)
down = torch.ones((1,), dtype=torch.float32)
bias = torch.ones((1,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = 4 * 4 # 4 tensors, each with 1 float32 element (4 bytes)
codeflash_output = layer.calc_size()
def test_calc_size_edge_large_dtype_difference():
"""Test with tensors of maximum and minimum element sizes."""
up = torch.ones((2,), dtype=torch.int8) # 1 byte per element
mid = torch.ones((2,), dtype=torch.float64) # 8 bytes per element
down = torch.ones((2,), dtype=torch.float16) # 2 bytes per element
bias = torch.ones((2,), dtype=torch.int64) # 8 bytes per element
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size()
def test_calc_size_edge_bias_only():
"""Test with only bias present, up/mid/down are None or zero-size."""
up = torch.empty((0,), dtype=torch.float32)
mid = None
down = torch.empty((0,), dtype=torch.float32)
bias = torch.ones((3,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=0.0, bias=bias)
expected = calc_tensor_size(bias)
codeflash_output = layer.calc_size()
def test_calc_size_edge_all_none():
"""Test with all tensors as None (should raise TypeError on up/down)."""
with pytest.raises(TypeError):
# up/down cannot be None, LoRALayer expects torch.Tensor
LoRALayer(None, None, None, alpha=None, bias=None)
def test_calc_size_edge_bias_is_zero_tensor():
"""Test with bias as a tensor containing all zeros."""
up = torch.ones((2, 2), dtype=torch.float32)
mid = torch.ones((2, 2), dtype=torch.float32)
down = torch.ones((2, 2), dtype=torch.float32)
bias = torch.zeros((2,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 4.51μs -> 3.10μs (45.5% faster)
# ----------- LARGE SCALE TEST CASES -----------
def test_calc_size_large_scale_100x100_float32():
"""Test with large 100x100 float32 tensors, all present."""
up = torch.ones((100, 100), dtype=torch.float32)
mid = torch.ones((100, 100), dtype=torch.float32)
down = torch.ones((100, 100), dtype=torch.float32)
bias = torch.ones((100,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 4.01μs -> 3.08μs (30.0% faster)
def test_calc_size_large_scale_500x2_float64():
"""Test with large float64 tensors, ensuring <100MB total."""
up = torch.ones((500, 2), dtype=torch.float64) # 8000 bytes
mid = torch.ones((2, 500), dtype=torch.float64) # 8000 bytes
down = torch.ones((500, 2), dtype=torch.float64) # 8000 bytes
bias = torch.ones((1000,), dtype=torch.float64) # 8000 bytes
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 4.04μs -> 2.91μs (38.9% faster)
def test_calc_size_large_scale_sparse_mid():
"""Test with large up/down, mid=None, bias present."""
up = torch.ones((1000, 1), dtype=torch.float32)
mid = None
down = torch.ones((1, 1000), dtype=torch.float32)
bias = torch.ones((100,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 3.82μs -> 2.71μs (40.7% faster)
def test_calc_size_large_scale_bias_none():
"""Test with large up/mid/down, bias=None."""
up = torch.ones((500, 500), dtype=torch.float32)
mid = torch.ones((500, 500), dtype=torch.float32)
down = torch.ones((500, 500), dtype=torch.float32)
bias = None
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 4.96μs -> 3.41μs (45.4% faster)
def test_calc_size_large_scale_varied_shapes():
"""Test with up/mid/down/bias of different shapes and dtypes."""
up = torch.ones((100, 10), dtype=torch.float32)
mid = torch.ones((10, 100), dtype=torch.float16)
down = torch.ones((50, 20), dtype=torch.int32)
bias = torch.ones((200,), dtype=torch.float64)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = layer.calc_size() # 4.02μs -> 3.09μs (29.9% faster)
# ----------- DETERMINISM TEST -----------
def test_calc_size_determinism():
"""Test that repeated calls with the same tensors yield the same result."""
up = torch.ones((10, 10), dtype=torch.float32)
mid = torch.ones((10, 10), dtype=torch.float32)
down = torch.ones((10, 10), dtype=torch.float32)
bias = torch.ones((10,), dtype=torch.float32)
layer = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
codeflash_output = layer.calc_size(); result1 = codeflash_output # 4.65μs -> 3.44μs (35.0% faster)
codeflash_output = layer.calc_size(); result2 = codeflash_output # 2.30μs -> 1.69μs (36.4% faster)
# ----------- ERROR HANDLING TESTS -----------
def test_calc_size_error_non_tensor_up():
"""Test that passing a non-tensor for up raises TypeError."""
with pytest.raises(AttributeError):
LoRALayer("not_a_tensor", None, torch.ones((2,2)), alpha=1.0, bias=None)
def test_calc_size_error_non_tensor_down():
"""Test that passing a non-tensor for down raises TypeError."""
with pytest.raises(AttributeError):
LoRALayer(torch.ones((2,2)), None, "not_a_tensor", alpha=1.0, bias=None)
def test_calc_size_error_non_tensor_mid():
"""Test that passing a non-tensor for mid raises TypeError."""
with pytest.raises(AttributeError):
LoRALayer(torch.ones((2,2)), "not_a_tensor", torch.ones((2,2)), alpha=1.0, bias=None)
def test_calc_size_error_non_tensor_bias():
"""Test that passing a non-tensor for bias raises AttributeError (when not None)."""
up = torch.ones((2,2))
mid = torch.ones((2,2))
down = torch.ones((2,2))
with pytest.raises(AttributeError):
LoRALayer(up, mid, down, alpha=1.0, bias="not_a_tensor")
`codeflash_output` is used to check that the output of the original code is the same as that of the optimized code; a minimal illustration of this convention follows.
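The sketch below is an assumption about how such a check can be expressed, not part of the generated test files themselves; it reuses only the `LoRALayer` constructor signature and the byte-size formula that appear in the tests above.

```python
import torch
from invokeai.backend.patches.layers.lora_layer import LoRALayer

up = torch.ones((2, 3), dtype=torch.float32)
down = torch.ones((3, 2), dtype=torch.float32)
layer = LoRALayer(up, None, down, alpha=1.0, bias=None)

# Expected byte count, computed independently of calc_size().
expected = sum(t.nelement() * t.element_size() for t in (up, down))

codeflash_output = layer.calc_size()
assert codeflash_output == expected  # 2*3*4 + 3*2*4 = 48 bytes
```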
#------------------------------------------------
import pytest
import torch
from invokeai.backend.patches.layers.lora_layer import LoRALayer
# Function to test (LoRALayer.calc_size and dependencies)
def calc_tensor_size(t: torch.Tensor) -> int:
"""Calculate the size of a tensor in bytes."""
return t.nelement() * t.element_size()
from invokeai.backend.patches.layers.lora_layer import LoRALayer
# =======================
# Unit Test Suite for LoRALayer.calc_size
# =======================
# ----------- Basic Test Cases ------------
def test_basic_all_tensors_present():
"""Test with all tensors present and small sizes."""
up = torch.ones((2, 3), dtype=torch.float32)
mid = torch.ones((3, 3), dtype=torch.float32)
down = torch.ones((3, 2), dtype=torch.float32)
bias = torch.ones((2,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 3.96μs -> 2.99μs (32.8% faster)
def test_basic_mid_is_none():
"""Test with mid tensor as None."""
up = torch.ones((2, 3), dtype=torch.float32)
mid = None
down = torch.ones((3, 2), dtype=torch.float32)
bias = torch.ones((2,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 3.65μs -> 2.56μs (42.6% faster)
def test_basic_bias_is_none():
"""Test with bias tensor as None."""
up = torch.ones((2, 3), dtype=torch.float32)
mid = torch.ones((3, 3), dtype=torch.float32)
down = torch.ones((3, 2), dtype=torch.float32)
bias = None
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 3.64μs -> 2.53μs (44.2% faster)
def test_basic_all_none_except_up_down():
"""Test with mid and bias as None."""
up = torch.ones((2, 3), dtype=torch.float32)
mid = None
down = torch.ones((3, 2), dtype=torch.float32)
bias = None
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(up) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 3.35μs -> 2.47μs (35.9% faster)
# ----------- Edge Test Cases ------------
def test_edge_empty_tensors():
"""Test with tensors of zero elements (shape (0,))."""
up = torch.empty((0,), dtype=torch.float32)
mid = torch.empty((0,), dtype=torch.float32)
down = torch.empty((0,), dtype=torch.float32)
bias = torch.empty((0,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = 0
codeflash_output = lora.calc_size()
def test_edge_scalar_tensors():
"""Test with tensors of shape () (scalar)."""
up = torch.tensor(1.0, dtype=torch.float32)
mid = torch.tensor(2.0, dtype=torch.float32)
down = torch.tensor(3.0, dtype=torch.float32)
bias = torch.tensor(4.0, dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size()
def test_edge_different_dtypes():
"""Test with tensors of different dtypes."""
up = torch.ones((2, 3), dtype=torch.float64)
mid = torch.ones((3, 3), dtype=torch.int32)
down = torch.ones((3, 2), dtype=torch.float16)
bias = torch.ones((2,), dtype=torch.uint8)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 4.42μs -> 3.13μs (41.3% faster)
def test_edge_non_contiguous_tensor():
"""Test with non-contiguous tensors (e.g., transposed)."""
up = torch.ones((2, 3), dtype=torch.float32).t() # Now shape (3,2), non-contiguous
mid = torch.ones((3, 3), dtype=torch.float32)
down = torch.ones((3, 2), dtype=torch.float32)
bias = torch.ones((2,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 3.96μs -> 2.91μs (36.3% faster)
def test_edge_large_tensor_but_under_limit():
"""Test with a single large tensor, but total size < 100MB."""
# float32 = 4 bytes, so 25_000_000 elements = 100MB
# We'll use 10_000_000 elements for safety
up = torch.ones((10_000_000,), dtype=torch.float32)
mid = None
down = None
bias = None
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = calc_tensor_size(up)
codeflash_output = lora.calc_size()
def test_edge_up_down_shape_mismatch():
"""Test that are_ranks_equal is set correctly for shape mismatch, but calc_size is unaffected."""
up = torch.ones((2, 4), dtype=torch.float32)
mid = torch.ones((4, 4), dtype=torch.float32)
down = torch.ones((3, 2), dtype=torch.float32) # up.shape[1] != down.shape[0]
bias = torch.ones((2,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 4.32μs -> 3.16μs (36.6% faster)
def test_edge_up_down_shape_equal():
"""Test that are_ranks_equal is True for matching shapes."""
up = torch.ones((2, 3), dtype=torch.float32)
mid = torch.ones((3, 3), dtype=torch.float32)
down = torch.ones((3, 2), dtype=torch.float32) # up.shape[1] == down.shape[0]
bias = torch.ones((2,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 4.09μs -> 2.80μs (45.7% faster)
# ----------- Large Scale Test Cases ------------
def test_large_scale_many_elements():
"""Test with large tensors, total size under 100MB."""
# float32 = 4 bytes, so 5_000_000 elements = 20MB
up = torch.ones((5_000_000,), dtype=torch.float32)
mid = torch.ones((5_000_000,), dtype=torch.float32)
down = torch.ones((5_000_000,), dtype=torch.float32)
bias = torch.ones((5_000_000,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size()
def test_large_scale_many_small_tensors():
"""Test with many small tensors (simulate batch)."""
# Each tensor is small, but there are many
tensors = [torch.ones((10,), dtype=torch.float32) for _ in range(999)]
# We'll sum up their sizes for expected_size
up = tensors[0]
mid = tensors[1]
down = tensors[2]
bias = tensors[3]
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size()
def test_large_scale_high_dimensional_tensors():
"""Test with high-dimensional tensors, but total size < 100MB."""
up = torch.ones((5, 5, 5, 5, 5, 5), dtype=torch.float32) # 5^6 = 15625 elements
mid = torch.ones((5, 5, 5, 5, 5, 5), dtype=torch.float32)
down = torch.ones((5, 5, 5, 5, 5, 5), dtype=torch.float32)
bias = torch.ones((5, 5, 5, 5, 5, 5), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(bias) +
calc_tensor_size(up) +
calc_tensor_size(mid) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size() # 4.30μs -> 3.29μs (30.7% faster)
def test_large_scale_mixed_none_and_large():
"""Test with some tensors None and some large."""
up = torch.ones((10_000_000,), dtype=torch.float32)
mid = None
down = torch.ones((10_000_000,), dtype=torch.float32)
bias = None
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
expected_size = (
calc_tensor_size(up) +
calc_tensor_size(down)
)
codeflash_output = lora.calc_size()
# ----------- Determinism Test ------------
def test_determinism_repeatability():
"""Test that repeated calls to calc_size give the same result."""
up = torch.ones((100,), dtype=torch.float32)
mid = torch.ones((100,), dtype=torch.float32)
down = torch.ones((100,), dtype=torch.float32)
bias = torch.ones((100,), dtype=torch.float32)
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
sizes = [lora.calc_size() for _ in range(10)]
# ----------- Type Robustness Test ------------
def test_type_error_on_non_tensor():
"""Test that passing a non-tensor raises an AttributeError (since .nelement() would fail)."""
up = "not a tensor"
mid = None
down = torch.ones((2, 2), dtype=torch.float32)
bias = None
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
with pytest.raises(AttributeError):
lora.calc_size()
def test_type_error_on_list_instead_of_tensor():
"""Test that passing a list instead of tensor raises an AttributeError."""
up = [1, 2, 3]
mid = None
down = torch.ones((2, 2), dtype=torch.float32)
bias = None
lora = LoRALayer(up, mid, down, alpha=1.0, bias=bias)
with pytest.raises(AttributeError):
lora.calc_size()
# ----------- LoRALayerBase Coverage ------------
To edit these changes, `git checkout codeflash/optimize-LoRALayer.calc_size-mhvkodo7` and push.