111 commits
9ec13f9
Add classifier training support
runwangdl Mar 17, 2025
f1a0491
Fix L3 DMA and Maxpool Bugs
runwangdl Mar 3, 2025
29baf2c
WIP Static Memory Allocation of IOs
Victor-Jung Mar 3, 2025
25be229
Temporary fix broken float softmax
Victor-Jung Mar 4, 2025
da56cbe
Fix lifetime of aliased input buffers
Victor-Jung Mar 4, 2025
721f747
Fix output buffer lifetime
Victor-Jung Mar 4, 2025
78685e5
Linting
Victor-Jung Mar 4, 2025
02b5435
WIP fix output buffer lifetime
Victor-Jung Mar 4, 2025
a2d67a0
Change RQHardswish dim due to compiler bug
Victor-Jung Mar 4, 2025
bdd92de
Fix typo
Victor-Jung Mar 4, 2025
20b1f8b
Fix duplicated IO in memory allocation visualization
Victor-Jung Mar 4, 2025
c708069
Fix the Constant Tensor offset to not take into account IO since they…
Victor-Jung Mar 4, 2025
b6e2448
Add new attribute to Variable and Transient buffer to annotate if the…
Victor-Jung Mar 7, 2025
7e96f18
Adapt calculateLifetime to use buffer I/O annotation
Victor-Jung Mar 7, 2025
b923520
Fix typo
Victor-Jung Mar 7, 2025
f4cb9e0
Remove IO buffer name and refactor var name
Victor-Jung Mar 13, 2025
435cc9d
Linting
Victor-Jung Mar 13, 2025
731f39f
Test the correctness of the memory map after memory allocation
Victor-Jung Mar 17, 2025
dd1370c
Allocate memory arena first
Victor-Jung Mar 17, 2025
8bfdb13
correct DMA length of copy assertion
runwangdl Mar 18, 2025
f01eb7f
Align memory allocation test
Victor-Jung Mar 18, 2025
031dc79
delete redundant shell scripts
runwangdl Mar 19, 2025
58e18da
Merge branch 'devel' into PULPCCTL3_16_16_64
runwangdl Mar 19, 2025
ac2d879
Update node with multioutput to single output
runwangdl Mar 19, 2025
6a7198b
add softmaxcrossentropygrad tiling
runwangdl Mar 19, 2025
360aef7
Add softmaxcrossentropylossgrad tiling
runwangdl Mar 20, 2025
bc48582
Merge branch 'PULPCCTL3_16_16_64' into GEMM_training_tiled
runwangdl Mar 20, 2025
b6542ba
Fix CI issue
runwangdl Mar 20, 2025
fe208d0
Fix CI bugs
runwangdl Mar 20, 2025
4a21359
update CI
runwangdl Mar 20, 2025
a0dcb6d
Improve memory alloc visualization
Victor-Jung Mar 20, 2025
91f12f0
Add and pass test for CCT gemmtraining 1_16_16_8 to 128
runwangdl Mar 20, 2025
d1e1ebf
update CI with 8-128 dim CCT last gemm training test
runwangdl Mar 20, 2025
86a2e99
Add SGD support for PULP Open
runwangdl Mar 20, 2025
bdacd2f
Update CCT training test with sgd
runwangdl Mar 20, 2025
b5421cc
Multi-level profiling + Linting
Victor-Jung Mar 21, 2025
99035f0
Update Changelog
runwangdl Mar 23, 2025
62e87d3
Merge branch 'devel' into GEMM_training_tiled
runwangdl Mar 23, 2025
15ea3ec
Solved issues caused by merging conflicts
runwangdl Mar 23, 2025
a644fdf
Solved Review Comments
runwangdl Mar 28, 2025
643e160
Resolving conflicts
runwangdl Mar 28, 2025
80a9518
Re-resolve the conflict
runwangdl Mar 28, 2025
501775d
Solving CI issues
runwangdl Mar 28, 2025
65a56b7
fix linting errors
runwangdl Mar 28, 2025
03c3f4a
gelu sigmoid approximation
runwangdl Mar 24, 2025
7e141fd
gelu parallel + unroll
runwangdl Mar 24, 2025
c3ee783
Float Matmul Parallel on M
runwangdl Mar 24, 2025
47d8c19
Softmax Parallel and Softmax Op Support
runwangdl Mar 24, 2025
ccba380
conv parallel without im2col
runwangdl Mar 25, 2025
fafcedf
PULP Layernorm Parallel
runwangdl Mar 25, 2025
147e68f
Fixed CI issues
runwangdl Mar 28, 2025
6e07dc9
fixing linting
runwangdl Mar 28, 2025
8b2f685
Merge branch 'devel' into devel_CCT_Optim
runwangdl Apr 8, 2025
9c0b8f6
Enlarge CI floatconv tiling L1 size for 8 core and delete CCT 128 tes…
runwangdl Apr 8, 2025
4c36de2
matmul 1*4 unrolling
runwangdl Apr 24, 2025
28ec2ca
Add computeOp support for CCT necessary kernels
runwangdl Apr 24, 2025
bf1f8ae
Add openlibm expf
runwangdl Apr 13, 2025
deac9ce
add relu, mul, maxpool ops num
runwangdl May 4, 2025
3b12187
Optimize parallel for multiple kernels
runwangdl May 4, 2025
49da947
Merge branch 'devel' into devel_CCT_Optim
runwangdl May 4, 2025
47961b9
Merge branch 'devel' into devel_CCT_Optim
runwangdl May 6, 2025
8907532
Change ConvTileConstraint to only tile on outchannel
runwangdl May 6, 2025
133f9ae
Fix error in gelu
runwangdl May 6, 2025
f25127d
Fix Linting Issues
runwangdl May 6, 2025
6f3f585
Merge branch 'devel' into devel_CCT_Optim
runwangdl May 8, 2025
4ffea9b
Change CI tests
runwangdl May 8, 2025
81c3460
profiling string change to const static
runwangdl May 8, 2025
4af69de
Fix profiling dual loop issue
runwangdl May 8, 2025
e819626
Add RV32IMF Picolibc support for Siracusa platform
runwangdl May 8, 2025
fa0cc37
Build Docker for new gvsoc for testing
runwangdl May 8, 2025
ac56ca2
Gvsoc Small test
runwangdl May 8, 2025
fd6c99d
Add Redmule Platform, Engine, Tiler, and Deployer
runwangdl May 8, 2025
2862f29
Add rv32imf.txt to build docker
runwangdl May 8, 2025
9ef9cc2
Update GVSOC hash
runwangdl May 9, 2025
10de9f6
matmul delicate constraints for Redmule
runwangdl May 9, 2025
efab54c
Merge branch 'devel_CCT_Optim' into redmule_platform
runwangdl May 9, 2025
37670e6
conv with redmule
runwangdl May 9, 2025
08b7e23
Add CCT 32 test
runwangdl May 9, 2025
e42b3d6
xtensor gvsoc docker build
runwangdl May 9, 2025
823d847
add softmaxgrad tileconstraint
runwangdl May 10, 2025
212ff3c
LayernormGrad and CCT MLP Training Graph
runwangdl May 11, 2025
d7346a5
Merge branch 'devel' into exp/heterogeneous-memory-placement
runwangdl May 12, 2025
c51694b
Fix Layernormgrad
runwangdl May 12, 2025
3efa661
Add Gelugrad
runwangdl May 16, 2025
aee7651
Merge branch 'exp/heterogeneous-memory-placement' into AttentionTraining
runwangdl May 16, 2025
b40cbd7
GEMM with Redmule
runwangdl May 18, 2025
203f095
Efficient GEMM
runwangdl May 18, 2025
7835c5a
reducesum tileconstraint
runwangdl Jun 9, 2025
21294bb
temporarily deactivate transposesplit, otherwise kq training failed
runwangdl Jun 9, 2025
90689e2
merge devel
runwangdl Jun 13, 2025
5c3f287
gemm no bias + input in name issue for codegenerate
runwangdl Jun 19, 2025
3271c3a
Parallelization and Optimization of CCT Inference and Training Kernel…
runwangdl Jun 12, 2025
7f99f2c
Adaptation for Merging Devel
runwangdl Jun 20, 2025
2b46d2d
AttentionTraining Support
runwangdl Nov 16, 2025
bc3f951
clean unwanted deeplotest
runwangdl Nov 16, 2025
fe13842
Merge branch 'devel' into AttentionTraining
runwangdl Nov 16, 2025
d867f73
Remove Redmule Content from this branch
runwangdl Nov 16, 2025
a7d6903
Fix Bugs after merge
runwangdl Nov 16, 2025
0806442
Update CCT training testcases
runwangdl Nov 25, 2025
df6e698
Add CCT2 Training to CI
runwangdl Nov 25, 2025
c28300a
Fix CI errors
runwangdl Nov 25, 2025
06fa447
Remove redundant files
runwangdl Nov 25, 2025
3b2af3f
Decrease unnecessary changes compared with devel
runwangdl Nov 26, 2025
caa751a
Fix transposesplit samenaming issue & update GEMM no bias for tiling
runwangdl Nov 26, 2025
179262e
Fixing Linting
runwangdl Nov 26, 2025
0e16453
Remove redundant changes
runwangdl Nov 26, 2025
3f5a042
Fix linting again
runwangdl Nov 26, 2025
5502d04
Merge latest devel including TinyViT tiling support
runwangdl Nov 26, 2025
a292765
[AttentionTraining] Fixed PR Comments
runwangdl Dec 8, 2025
7bf6395
[AttentionTraining] Fix Linting
runwangdl Dec 8, 2025
00d542f
[AttentionTraining] Remove three input add binding
runwangdl Dec 8, 2025
16 changes: 7 additions & 9 deletions .github/workflows/ci-platform-siracusa-tiled.yml
@@ -135,9 +135,7 @@ jobs:
- name: "MLPerf/AnomalyDetection"
L1: [64000]
- name: "CCT/CCT_1_16_16_8"
L1: [2000, 64000]
- name: "testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8"
L1: [4000, 64000]
L1: [64000]
- name: "testFloatDemoTinyViT"
L1: [4000]
num-cores: [8]
@@ -168,9 +166,9 @@ jobs:
- name: "microLlama/microLlama1"
L1: [60000, 10000, 5000]
- name: "CCT/CCT_2_32_32_128"
L1: [64000, 128000]
- name: "testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128"
L1: [32000, 64000]
L1: [128000]
- name: "testTrainCCT/CCT2_FT2"
L1: [128000]
- name: "testFloatDemoTinyViT"
L1: [4000]
num-cores: [8]
@@ -208,9 +206,9 @@ jobs:
- name: "microLlama/microLlama8_parallel"
L1: [60000, 20000, 10000]
- name: "CCT/CCT_2_32_32_128"
L1: [64000, 128000]
- name: "testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128"
L1: [8000, 64000]
L1: [128000]
- name: "testTrainCCT/CCT2_FT2"
L1: [128000]
- name: "testFloatDemoTinyViT"
L1: [4000]
num-cores: [8]
1 change: 0 additions & 1 deletion .github/workflows/ci-platform-siracusa.yml
@@ -95,6 +95,5 @@ jobs:
MLPerf/AnomalyDetection
CCT/CCT_1_16_16_8
CCT/CCT_2_32_32_128_Opset20
testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8
testFloatDemoTinyViT
num-cores: 8
18 changes: 18 additions & 0 deletions Deeploy/Targets/Generic/Layers.py
@@ -58,6 +58,18 @@ def computeOps(self):
return mul1 + neg + exp + add + div + mul2


class GELUGradLayer(ONNXLayer):

def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):
size = self.mapper.parser.operatorRepresentation['size']
ops_per_element = 9
gelu_grad_ops = size * ops_per_element
return gelu_grad_ops


class iHardswishLayer(ONNXLayer):

def __init__(self, maps: List[NodeMapper]):
@@ -438,6 +450,12 @@ def computeOps(self):
return compAverage + compNormalize + compSqr + compSum + compSqrt + compDiv


class LayerNormGradLayer(ONNXLayer):

def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)


class TransposeLayer(ONNXLayer):

def __init__(self, maps: List[NodeMapper]):
57 changes: 57 additions & 0 deletions Deeploy/Targets/Generic/Parsers.py
@@ -770,6 +770,33 @@ def parseNodeCtxt(self,
return ctxt, True


class GELUGradParser(NodeParser):

def __init__(self):
super().__init__()

def parseNode(self, node: gs.Node) -> bool:

ret = all([len(node.inputs) == 2, len(node.outputs) == 1])
return ret

def parseNodeCtxt(self,
ctxt: NetworkContext,
node: gs.Node,
channels_first: bool = True) -> Tuple[NetworkContext, bool]:

upstream_grad = ctxt.lookup(node.inputs[0].name)
gelu_input = ctxt.lookup(node.inputs[1].name)
gelu_grad = ctxt.lookup(node.outputs[0].name)

self.operatorRepresentation['grad_in'] = upstream_grad.name
self.operatorRepresentation['data_in'] = gelu_input.name
self.operatorRepresentation['grad_out'] = gelu_grad.name
self.operatorRepresentation['size'] = np.prod(upstream_grad.shape)

return ctxt, True
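
For reference, a minimal, hypothetical sketch of the node shape this parser expects — the names and shapes below are illustrative only, not taken from the PR's test graphs:

```python
import numpy as np
import onnx_graphsurgeon as gs

# Hypothetical GeluGrad node matching the checks in parseNode/parseNodeCtxt:
# inputs[0] = upstream gradient (dL/dy), inputs[1] = original GELU input (x),
# outputs[0] = gradient w.r.t. x.
grad_in = gs.Variable("gelu_out_grad", dtype=np.float32, shape=(1, 16, 64))
data_in = gs.Variable("gelu_in", dtype=np.float32, shape=(1, 16, 64))
grad_out = gs.Variable("gelu_in_grad", dtype=np.float32, shape=(1, 16, 64))

node = gs.Node(op="GeluGrad", name="gelu_grad_0",
               inputs=[grad_in, data_in], outputs=[grad_out])

# parseNode accepts the node (2 inputs, 1 output); parseNodeCtxt then records
# 'grad_in', 'data_in', 'grad_out' and 'size' = np.prod((1, 16, 64)) = 1024.
```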


class RQSiGELUParser(GELUParser):

def __init__(self):
@@ -1647,6 +1674,36 @@ def parseNodeCtxt(self,
return ctxt, True


class LayerNormGradParser(iLayerNormParser):

def parseNode(self, node: gs.Node) -> (bool):

ret = all(['epsilon' in node.attrs, len(node.inputs) == 4, len(node.outputs) == 1])

if ret:
self.operatorRepresentation['epsilon'] = node.attrs['epsilon']

return ret

def parseNodeCtxt(self,
ctxt: NetworkContext,
node: gs.Node,
channels_first: bool = True) -> Tuple[NetworkContext, bool]:

inputs = ['grad_in', 'data_in', 'weight', 'bias']
outputs = ['grad_out']

for idx, inputNode in enumerate(node.inputs):
self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
for idx, outputNode in enumerate(node.outputs):
self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name

self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1]

return ctxt, True
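
Likewise, a hypothetical sketch of the `LayerNormalizationGrad` node this parser accepts; the input order (upstream grad, original input, weight, bias) and the `epsilon` attribute are what `parseNodeCtxt` assumes, while the concrete names and shapes here are made up:

```python
import numpy as np
import onnx_graphsurgeon as gs

shape = (1, 16, 64)
grad_in = gs.Variable("ln_out_grad", dtype=np.float32, shape=shape)
data_in = gs.Variable("ln_in", dtype=np.float32, shape=shape)
weight = gs.Variable("ln_weight", dtype=np.float32, shape=(64,))
bias = gs.Variable("ln_bias", dtype=np.float32, shape=(64,))
grad_out = gs.Variable("ln_in_grad", dtype=np.float32, shape=shape)

node = gs.Node(op="LayerNormalizationGrad", name="ln_grad_0",
               attrs={"epsilon": 1e-5},
               inputs=[grad_in, data_in, weight, bias], outputs=[grad_out])

# parseNode requires 'epsilon', 4 inputs and 1 output; parseNodeCtxt then sets
# 'size' = 1 * 16 * 64 = 1024 and 'lastDimLength' = 64.
```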


class MatMulParser(NodeParser):

def __init__(self, noBiasHoisting = True):
@@ -676,8 +676,8 @@ def _split_transposes_fun(graph: gs.Graph, match: Match, name: str):
inputNode.outputs = [postSplitOutput]

for node in originalNode.outputs.copy():
nodeName = node.name + f"_transpose_in"
varName = node.name + f"_transpose_in_var"
nodeName = f"{t1.name}_{node.name}_transpose_in"
varName = f"{t1.name}_{node.name}_transpose_in_var"
newOutput = gs.Variable(name = varName, dtype = np.float32, shape = t1.outputs[0].shape)

transposeNode = gs.Node(name = nodeName,
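
The change above derives the generated transpose's name from the matched transpose (`t1`) as well as the downstream tensor, rather than from the downstream tensor alone. A tiny, purely illustrative sketch of why the old scheme could produce duplicate node names when the same downstream name appears in more than one match:

```python
# Illustrative only: name construction before and after the fix.
def old_name(consumer: str) -> str:
    return consumer + "_transpose_in"

def new_name(transpose: str, consumer: str) -> str:
    return f"{transpose}_{consumer}_transpose_in"

# Two matches targeting the same downstream name used to collide:
print(old_name("MatMul_3"), old_name("MatMul_3"))                  # same name twice
# Prefixing with the matched transpose's name keeps them distinct:
print(new_name("Transpose_1", "MatMul_3"), new_name("Transpose_2", "MatMul_3"))
```
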
12 changes: 12 additions & 0 deletions Deeploy/Targets/PULPOpen/Bindings.py
@@ -415,10 +415,22 @@
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate,
ForkTransformer)

PULPLayernormGradBinding = NodeBinding(
LayerNormChecker(
[PointerClass(float32_t),
PointerClass(float32_t),
PointerClass(float32_t),
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceGradTemplate,
ForkTransformer)

PULPFloatGELUBinding = NodeBinding(
GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatGELUTemplate.referenceTemplate, ForkTransformer)

PULPFloatGELUGradBinding = NodeBinding(
GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatGELUTemplate.referenceGradTemplate, ForkTransformer)

PULPGatherBindings = [
NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]),
GatherTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes
42 changes: 24 additions & 18 deletions Deeploy/Targets/PULPOpen/Platform.py
@@ -13,17 +13,18 @@
from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \
BasicRQIntegerDivBinding
from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \
LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \
ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \
SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \
TransposeLayer, iHardswishLayer, iRMSNormLayer
from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \
GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \
ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \
RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \
SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer
from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \
GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, \
QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, \
RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \
SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \
TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser
GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool2DParser, \
MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \
RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, \
SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, \
SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, \
iSoftmaxParser
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \
MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \
@@ -37,14 +38,15 @@
from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate
from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \
PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \
PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, \
PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, \
PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \
PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \
PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, \
PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, \
PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, \
PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \
PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \
PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \
PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \
PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \
PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, \
PULPRQAddTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, \
PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, \
PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \
PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \
PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \
PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings
from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \
@@ -54,6 +56,7 @@
AddMapper = NodeMapper(AddParser(), PULPAddTilingReadyBindings)
FlattenMapper = NodeMapper(FlattenParser(), PULPFlattenTilingReadyBindings)
GELUMapper = NodeMapper(GELUParser(), PULPFPGELUTilingReadyBindings)
GELUGradMapper = NodeMapper(GELUGradParser(), PULPFPGELUGradTilingReadyBindings)
GatherMapper = NodeMapper(GatherParser(), PULPGatherTilingReadyBindings)
MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings)
Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings)
@@ -83,6 +86,7 @@
TallGEMMMapper = NodeMapper(PULPTallGEMMParser(), PULPRQSTallGEMMTilingReadyBindings)
MaxPool2DMapper = NodeMapper(MaxPool2DParser(), PULPMaxPool2DTilingReadyBindings)
LayerNormMapper = NodeMapper(LayerNormParser(), PULPLayernormTilingReadyBindings)
LayerNormGradMapper = NodeMapper(LayerNormGradParser(), PULPLayernormGradTilingReadyBindings)
ReluMapper = NodeMapper(ReluParser(), PULPReluTilingReadyBindings)
SoftmaxMapper = NodeMapper(SoftmaxParser(), PULPSoftmaxTilingReadyBindings)
SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), PULPSoftmaxGradTilingReadyBindings)
@@ -111,7 +115,9 @@
'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]),
'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]),
'Gelu': GELULayer([GELUMapper]),
'GeluGrad': GELUGradLayer([GELUGradMapper]),
'LayerNormalization': LayerNormLayer([LayerNormMapper]),
'LayerNormalizationGrad': LayerNormGradLayer([LayerNormGradMapper]),
'MaxPool': MaxPoolLayer([MaxPool2DMapper]),
'RequantizediGELU': RQSiGELULayer([RQGELU_int8_Mapper]),
'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]),
10 changes: 10 additions & 0 deletions Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py
@@ -7,4 +7,14 @@
referenceTemplate = NodeTemplate("""
// GELU (Name: ${nodeName}, Op: ${nodeOp})
PULP_GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size});
""")

referenceGradTemplate = NodeTemplate("""
// GELU Parallel (Name: ${nodeName}, Op: ${nodeOp})
int8_t ${nodeName}_core_id = pi_core_id();
int8_t ${nodeName}_log2Core = log2(NUM_CORES);
int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0);
int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size});
int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size});
GELU_fp${data_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}_sigmoid_grad_chunk(${grad_in}, ${data_in}, ${grad_out}, ${nodeName}_chunk_start, ${nodeName}_chunk_stop);
""")
46 changes: 42 additions & 4 deletions Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
@@ -2,16 +2,42 @@
#
# SPDX-License-Identifier: Apache-2.0

from Deeploy.DeeployTypes import NodeTemplate
from typing import Dict, List, Tuple

referenceTemplate = NodeTemplate("""
from Deeploy.AbstractDataTypes import float32_tPtr
from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class PULPFloatGEMMTemplate(NodeTemplate):

def __init__(self, templateStr):
super().__init__(templateStr)

def alignToContext(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:

if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None:
# No bias case - set C to NULL and provide a default type
operatorRepresentation['C'] = None
operatorRepresentation['C_type'] = float32_tPtr # Default to fp32 type
operatorRepresentation['C_batched'] = False

return ctxt, operatorRepresentation, []


referenceTemplate = PULPFloatGEMMTemplate("""
// GEMM (Name: ${nodeName}, Op: ${nodeOp})
${A_type.typeName} ref_${data_out}_${A} = ${A};
${B_type.typeName} ref_${data_out}_${B} = ${B};
% if C is not None:
${C_type.typeName} ref_${data_out}_${C} = ${C};
% else:
${C_type.typeName} ref_${data_out}_C = NULL;
% endif
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

for(uint32_t i=0; i<${batch}; i++){
% if C is not None:
PULP_Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
ref_${data_out}_${A},
ref_${data_out}_${B},
@@ -23,7 +49,19 @@
${transA},
${transB}
);

% else:
PULP_Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
ref_${data_out}_${A},
ref_${data_out}_${B},
NULL,
ref_${data_out}_${data_out},
${M},
${N},
${O},
${transA},
${transB}
);
% endif
% if A_batched:
ref_${data_out}_${A} += ${M} * ${N};
% endif
@@ -32,7 +70,7 @@
ref_${data_out}_${B} += ${N} * ${O};
% endif

% if C_batched:
% if C is not None and C_batched:
ref_${data_out}_${C} += ${M} * ${O};
% endif

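
To make the no-bias path concrete, a hedged NumPy sketch of what one batch iteration of the `PULP_Gemm_fp32_*` call is expected to compute; the transpose convention and the NULL-bias behaviour are assumptions for illustration, the real semantics live in the C kernel:

```python
import numpy as np

def gemm_ref(A, B, C=None, transA=0, transB=0):
    """One batch step: out[M, O] = op(A)[M, N] @ op(B)[N, O] (+ C when present)."""
    opA = A.T if transA else A
    opB = B.T if transB else B
    out = opA @ opB
    if C is not None:          # the template passes NULL and skips the add entirely
        out = out + C
    return out

M, N, O = 16, 64, 32
A = np.random.rand(M, N).astype(np.float32)
B = np.random.rand(N, O).astype(np.float32)
out_no_bias = gemm_ref(A, B)                  # C omitted, as when ref_..._C is NULL
out_bias = gemm_ref(A, B, np.zeros((M, O), np.float32))
```
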
34 changes: 34 additions & 0 deletions Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py
@@ -15,4 +15,38 @@
${size},
${lastDimLength}
);
""")

referenceGradTemplate = NodeTemplate("""
// FloatLayernormGrad Parallel (Name: ${nodeName}, Op: ${nodeOp})

int8_t ${nodeName}_core_id = pi_core_id();
int8_t ${nodeName}_log2Core = log2(NUM_CORES);

int32_t ${nodeName}_seq_length = ${size} / ${lastDimLength};
int32_t ${nodeName}_chunk = (${nodeName}_seq_length >> ${nodeName}_log2Core) +
((${nodeName}_seq_length & (NUM_CORES-1)) != 0);
int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${nodeName}_seq_length);
int32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${nodeName}_seq_length);

int32_t ${nodeName}_elem_start = ${nodeName}_start * ${lastDimLength};
int32_t ${nodeName}_elem_end = ${nodeName}_end * ${lastDimLength};
int32_t ${nodeName}_elem_count = ${nodeName}_elem_end - ${nodeName}_elem_start;

const float${grad_in_type.referencedType.typeWidth}_t* ${nodeName}_grad_in_ptr = ${grad_in} + ${nodeName}_elem_start;
const float${data_in_type.referencedType.typeWidth}_t* ${nodeName}_data_in_ptr = ${data_in} + ${nodeName}_elem_start;
float${grad_out_type.referencedType.typeWidth}_t* ${nodeName}_grad_out_ptr = ${grad_out} + ${nodeName}_elem_start;

if (${nodeName}_elem_count > 0) {
LayernormGrad_fp${grad_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}(
${nodeName}_grad_in_ptr, // Upstream gradient (dy)
${nodeName}_data_in_ptr, // Original input (x)
${nodeName}_grad_out_ptr, // Output gradient (dx)
${weight}, // Input Scale parameter
${bias}, // Input Bias parameter
${epsilon}, // Epsilon for numerical stability
${nodeName}_elem_count, // Number of elements to process
${lastDimLength} // Size of the feature dimension
);
}
""")