Skip to content

Commit fc49eeb

Browse files
authored
Merge pull request #3143 from byuccl/fix_mult
Generalize Parmys Mult_Split to Allow for Multipliers Whose Input Widths are not Equal
2 parents b8c96bc + 26a6387 commit fc49eeb

File tree

6 files changed

+165
-57
lines changed

6 files changed

+165
-57
lines changed

parmys/parmys-plugin/core/multiplier.cc

Lines changed: 153 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -937,7 +937,7 @@ void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b)
937937
*-----------------------------------------------------------------------*/
938938
void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *netlist)
939939
{
940-
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addbig;
940+
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addsmall2, *addbig;
941941
int size;
942942

943943
/* Check for a legitimate split */
@@ -976,50 +976,151 @@ void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *
976976
init_split_multiplier(node, a1b0, a0, a1, 0, b0, a1b1, a0b0);
977977
mult_list = insert_in_vptr_list(mult_list, a1b0);
978978

979-
/* New node for the initial add */
980-
addsmall = allocate_nnode(node->loc);
981-
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
982-
strcpy(addsmall->name, node->name);
983-
strcat(addsmall->name, "-add0");
984-
// this addition will have a carry out in the worst case, add to input pins and connect then to gnd
985-
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);
986-
987-
/* New node for the BIG add */
988-
addbig = allocate_nnode(node->loc);
989-
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
990-
strcpy(addbig->name, node->name);
991-
strcat(addbig->name, "-add1");
992-
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);
993-
994-
// connect inputs to port a of addsmall
995-
for (int i = 0; i < a1b0->num_output_pins; i++)
996-
connect_nodes(a1b0, i, addsmall, i);
997-
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
998-
// connect inputs to port b of addsmall
999-
for (int i = 0; i < a0b1->num_output_pins; i++)
1000-
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
1001-
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);
1002-
1003-
// connect inputs to port a of addbig
1004-
size = addsmall->num_output_pins;
1005-
for (int i = 0; i < size; i++)
1006-
connect_nodes(addsmall, i, addbig, i);
1007-
1008-
// connect inputs to port b of addbig
1009-
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
1010-
connect_nodes(a0b0, i, addbig, i - b0 + size);
1011-
size = size + a0b0->output_port_sizes[0] - b0;
1012-
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
1013-
connect_nodes(a1b1, i, addbig, i + size);
1014-
1015-
// remap the multiplier outputs coming directly from a0b0
1016-
for (int i = 0; i < b0; i++) {
1017-
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
1018-
}
979+
// using the balanced addition method only works if a0 and b0 are the same size
980+
// (i.e. if the input ports on the hardware multiplier are equal)
981+
if (b0 == a0) {
982+
/* New node for the initial add */
983+
addsmall = allocate_nnode(node->loc);
984+
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
985+
strcpy(addsmall->name, node->name);
986+
strcat(addsmall->name, "-add0");
987+
// this addition will have a carry out in the worst case, so add one extra input pin to each port and connect them to gnd
988+
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);
989+
990+
// connect inputs to port a of addsmall
991+
for (int i = 0; i < a1b0->num_output_pins; i++)
992+
connect_nodes(a1b0, i, addsmall, i);
993+
994+
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
995+
// connect inputs to port b of addsmall
996+
for (int i = 0; i < a0b1->num_output_pins; i++)
997+
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
998+
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);
999+
1000+
/* New node for the BIG add */
1001+
addbig = allocate_nnode(node->loc);
1002+
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
1003+
strcpy(addbig->name, node->name);
1004+
strcat(addbig->name, "-add1");
1005+
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);
1006+
1007+
// connect inputs to port a of addbig
1008+
size = addsmall->num_output_pins;
1009+
for (int i = 0; i < size; i++)
1010+
connect_nodes(addsmall, i, addbig, i);
1011+
1012+
// connect inputs to port b of addbig
1013+
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
1014+
connect_nodes(a0b0, i, addbig, i - b0 + size);
1015+
size = size + a0b0->output_port_sizes[0] - b0;
1016+
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
1017+
connect_nodes(a1b1, i, addbig, i + size);
1018+
1019+
// remap the multiplier outputs coming directly from a0b0
1020+
for (int i = 0; i < b0; i++) {
1021+
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
1022+
}
1023+
1024+
// remap the multiplier outputs coming from addbig
1025+
for (int i = 0; i < addbig->num_output_pins; i++) {
1026+
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
1027+
}
1028+
} else {
1029+
/* Expounding upon the description for the method in this function.
1030+
if we have two numbers A and B and we have a hardware multiplier of size a0xb0,
1031+
we can split them into two parts:
1032+
A = (A1 << a0) + A0
1033+
B = (B1 << b0) + B0
1034+
where A1 and B1 are the high bits of A and B, and A0 and B0 are the low bits.
1035+
Note that len(A0) = a0 and len(B0) = b0 by definition.
1036+
The multiplication of A and B can be expressed as:
1037+
A * B = ((A1 << a0) + A0) * ((B1 << b0) + B0)
1038+
= {(A1 * B1) << (a0 + b0)} + {((A1 * B0) << a0) + ((A0 * B1) << b0)} + {A0 * B0}
1039+
we split the additions up like so:
1040+
addsmall = ((A1 * B0) << a0) + ((A0 * B1) << b0) // can have carry
1041+
addsmall2 = ((A1 * B1) << (a0 + b0)) + (A0 * B0) // Will not have carry
1042+
addbig = addsmall + addsmall2
1043+
This is the same high/low split that the Karatsuba algorithm starts from, but all four partial products are kept (Karatsuba would reduce them to three).
1044+
*/
1045+
/////////////// Addsmall /////////////////////
1046+
addsmall = allocate_nnode(node->loc);
1047+
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
1048+
strcpy(addsmall->name, node->name);
1049+
strcat(addsmall->name, "-add0");
1050+
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + a0 + 1, a0b1->num_output_pins + b0 + 1);
1051+
1052+
// The first a0 pins of addsmall input connecting to a1b0 are connected to zero
1053+
for (int i = 0; i < a0; i++) {
1054+
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i);
1055+
}
1056+
1057+
// connect inputs to port a of addsmall
1058+
for (int i = 0; i < a1b0->num_output_pins; i++) {
1059+
connect_nodes(a1b0, i, addsmall, i + a0);
1060+
}
1061+
1062+
// add zero pin for carry
1063+
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins + a0);
1064+
1065+
// The first b0 pins of addsmall input connecting to a0b1 are connected to zero
1066+
for (int i = 0; i < b0; i++) {
1067+
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i + addsmall->input_port_sizes[0]);
1068+
}
1069+
1070+
// connect inputs to port b of addsmall
1071+
for (int i = 0; i < a0b1->num_output_pins; i++) {
1072+
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0] + b0);
1073+
}
1074+
1075+
// add zero pin for carry
1076+
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0] + b0);
1077+
1078+
/////////////// Addsmall2 /////////////////////
1079+
addsmall2 = allocate_nnode(node->loc);
1080+
addsmall2->name = (char *)vtr::malloc(strlen(node->name) + 6);
1081+
strcpy(addsmall2->name, node->name);
1082+
strcat(addsmall2->name, "-add1");
1083+
init_multiplier_adder(addsmall2, a1b1, a1b1->num_output_pins + a0 + b0, a0b0->num_output_pins);
10191084

1020-
// remap the multiplier outputs coming from addbig
1021-
for (int i = 0; i < addbig->num_output_pins; i++) {
1022-
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
1085+
// connect the first a0 + b0 input pins of addsmall2 (port a) to zero
1086+
for (int i = 0; i < a0 + b0; i++) {
1087+
add_input_pin_to_node(addsmall2, get_zero_pin(netlist), i);
1088+
}
1089+
1090+
// connect inputs to port a of addsmall2
1091+
for (int i = 0; i < a1b1->num_output_pins; i++) {
1092+
connect_nodes(a1b1, i, addsmall2, i + a0 + b0);
1093+
}
1094+
1095+
// connect inputs to port b of addsmall2
1096+
for (int i = 0; i < a0b0->output_port_sizes[0]; i++) {
1097+
connect_nodes(a0b0, i, addsmall2, i + addsmall2->input_port_sizes[0]);
1098+
}
1099+
1100+
/////////////// Addbig /////////////////////
1101+
addbig = allocate_nnode(node->loc);
1102+
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
1103+
strcpy(addbig->name, node->name);
1104+
strcat(addbig->name, "-add2");
1105+
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, addsmall2->num_output_pins);
1106+
// Here the final addition can have a carry out in the worst case; however,
1107+
// our final product will always only be the length of the longest input port, so
1108+
// the final add's carry will always drop out.
1109+
1110+
// connect inputs to port a of addbig
1111+
for (int i = 0; i < addsmall->num_output_pins; i++) {
1112+
connect_nodes(addsmall, i, addbig, i);
1113+
}
1114+
1115+
// connect inputs to port b of addbig
1116+
for (int i = 0; i < addsmall2->num_output_pins; i++) {
1117+
connect_nodes(addsmall2, i, addbig, i + addbig->input_port_sizes[0]);
1118+
}
1119+
1120+
// remap all of the multiplier outputs to come from addbig
1121+
for (int i = 0; i < addbig->num_output_pins; i++) {
1122+
remap_pin_to_new_node(node->output_pins[i], addbig, i);
1123+
}
10231124
}
10241125

10251126
// CLEAN UP
@@ -1060,7 +1161,6 @@ void split_multiplier_a(nnode_t *node, int a0, int a1, int b)
10601161
strcat(a0b->name, "-0");
10611162
init_split_multiplier(node, a0b, 0, a0, 0, b, nullptr, nullptr);
10621163
mult_list = insert_in_vptr_list(mult_list, a0b);
1063-
10641164
/* New node for a1b multiply */
10651165
a1b = allocate_nnode(node->loc);
10661166
a1b->name = (char *)vtr::malloc(strlen(node->name) + 3);
@@ -1184,7 +1284,6 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)
11841284

11851285
oassert(node->type == MULTIPLY);
11861286
oassert(hard_multipliers != NULL);
1187-
11881287
sizea = node->input_port_sizes[0];
11891288
sizeb = node->input_port_sizes[1];
11901289
sizeout = node->output_port_sizes[0];
@@ -1199,6 +1298,13 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)
11991298
}
12001299
diffa = ina - sizea;
12011300
diffb = inb - sizeb;
1301+
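// the multiplier's second input is wider than inb, which can happen when the hard block's ports are unequal (e.g. mul_size > 18 && mul_size < 25): swap the hard block port sizes and recompute the padding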
1302+
if (diffb < 0) {
1303+
std::swap(ina, inb);
1304+
diffa = ina - sizea;
1305+
diffb = inb - sizeb;
1306+
}
1307+
12021308
diffout = hard_multipliers->outputs->size - sizeout;
12031309

12041310
if (configuration.split_hard_multiplier == 1) {
@@ -1281,11 +1387,9 @@ void iterate_multipliers(netlist_t *netlist)
12811387
int mula, mulb;
12821388
int a0, a1, b0, b1;
12831389
nnode_t *node;
1284-
12851390
/* Can only perform the optimisation if hard multipliers exist! */
12861391
if (hard_multipliers == NULL)
12871392
return;
1288-
12891393
sizea = hard_multipliers->inputs->size;
12901394
sizeb = hard_multipliers->inputs->next->size;
12911395
if (sizea < sizeb) {
@@ -1313,7 +1417,6 @@ void iterate_multipliers(netlist_t *netlist)
13131417
sizea = sizeb;
13141418
sizeb = swap;
13151419
}
1316-
13171420
/* Do I need to split the multiplier on both inputs? */
13181421
if ((mula > sizea) && (mulb > sizeb)) {
13191422
a0 = sizea;
@@ -1890,4 +1993,4 @@ void free_multipliers()
18901993

18911994
hard_multipliers->instances = NULL;
18921995
}
1893-
}
1996+
}

parmys/parmys-plugin/netlist/netlist_utils.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,7 @@ void remap_pin_to_new_net(npin_t *pin, nnet_t *new_net)
485485
*-----------------------------------------------------------------------*/
486486
void remap_pin_to_new_node(npin_t *pin, nnode_t *new_node, int pin_idx)
487487
{
488+
oassert(pin != NULL);
488489
if (pin->type == INPUT) {
489490
/* clean out the entry in the old net */
490491
pin->node->input_pins[pin->pin_node_idx] = NULL;

vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/vtr_xilinx_qor/config/config.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ circuits_dir=benchmarks/verilog
1212
arch_list_add=7series_BRAM_DSP_carry.xml
1313

1414
# Add circuits to list to sweep
15+
circuit_list_add=mcml.v
1516
circuit_list_add=LU32PEEng.v
1617
circuit_list_add=LU8PEEng.v
1718
circuit_list_add=bgm.v

0 commit comments

Comments
 (0)