@@ -85,11 +85,13 @@ def _iaca_ast_to_c(self, ast, opts={}):
85
85
iaca_ast , last , nest , loop_count = coffee .utils .insert_iaca (ast , iakify )
86
86
if not last :
87
87
iakify += 1
88
- ast_handler = ASTKernel (iaca_ast , self ._include_dirs )
89
- ast_handler .plan_cpu (opts )
90
- self ._applied_blas = ast_handler .blas
91
- self ._applied_ap = ast_handler .ap
92
- iaca_kernels .append ([ast_handler .gencode (), nest , loop_count ])
88
+ # ast_handler = ASTKernel(iaca_ast, self._include_dirs)
89
+ # ast_handler.plan_cpu(opts)
90
+ # self._applied_blas = ast_handler.blas
91
+ # self._applied_ap = ast_handler.ap
92
+ # iaca_kernels.append([ast_handler.gencode(), nest, loop_count])
93
+ # from IPython import embed; embed()
94
+ iaca_kernels .append ([iaca_ast .gencode (), nest , loop_count ])
93
95
return iaca_kernels
94
96
95
97
@@ -777,18 +779,25 @@ def sum_nested_flop_values(self, iaca_kernels, val_pos):
777
779
for ind , ic in enumerate (reversed (iaca_kernels )):
778
780
if ic [1 ] == prev_nest :
779
781
i_flops += (ic [2 ] if ic [3 ] else 1 ) * ic [val_pos ]
782
+ # print "same nest: ", i_flops
780
783
elif ic [1 ] < prev_nest :
781
784
iaca_block_flops = 0
782
785
for ic_prev in iaca_kernels [len (iaca_kernels ) - ind :]:
783
786
if prev_nest == ic_prev [1 ]:
784
787
iaca_block_flops += ic_prev [val_pos ]
785
- else :
786
- break
787
- i_flops = (ic [2 ] if ic [3 ] else 1 ) * (i_flops + ic [val_pos ] - iaca_block_flops )
788
+ # else:
789
+ # break
790
+ curr_flops = (ic [2 ] if ic [3 ] else 1 ) * (i_flops + ic [val_pos ] - iaca_block_flops )
791
+ if ic [6 ] == 1 :
792
+ i_flops = min (curr_flops , (ic [2 ] if ic [3 ] else 1 ) * ic [val_pos ])
793
+ else :
794
+ i_flops = curr_flops
795
+ # print "prev nest greater: ", i_flops
788
796
else :
789
797
# Not tested yet. Might never apply to FFC kernels.
790
798
total_flops += i_flops
791
799
i_flops = (ic [2 ] if ic [3 ] else 1 ) * ic [val_pos ]
800
+ # print "prev nest smaller: ", i_flops
792
801
prev_nest = ic [1 ]
793
802
total_flops += i_flops
794
803
return total_flops
@@ -807,32 +816,37 @@ def sum_nested_cycle_values(self, iaca_kernels, val_pos):
807
816
if ic [1 ] == prev_nest :
808
817
# Add the cycles or flops and multiply by loop counter if the loop is not unrolled
809
818
i_flops += (ic [2 ] if ic [3 ] else 1 ) * ic [val_pos ]
819
+ # print "same nest: ", i_flops
810
820
# if the current loop contains the previous one (has a lower nest number).
811
821
elif ic [1 ] < prev_nest :
812
822
# if there is only one jump instruction and the loop is not unrolled (i.e. inner loops unrolled but current not unrolled)
813
823
if ic [6 ] == 1 and ic [3 ]:
814
824
# loop counter times the number of cycles
815
825
i_flops = ic [2 ] * ic [val_pos ]
826
+ # print "greater nest 1: ", i_flops
816
827
else :
817
828
# sum up the values for the static cycle counts for the loops contained in the current loop
818
829
# regardless of their unrolled/not unrolled status
819
830
iaca_block_flops = 0
820
831
for ic_prev in iaca_kernels [len (iaca_kernels ) - ind :]:
821
832
if prev_nest == ic_prev [1 ]:
822
833
iaca_block_flops += ic_prev [val_pos ]
823
- else :
824
- break
834
+ # else:
835
+ # break
825
836
# if there are fewer cycles to be done in the current loop, then return that count
826
837
if iaca_block_flops > ic [val_pos ]:
827
838
i_flops = (ic [2 ] if ic [3 ] else 1 ) * ic [val_pos ]
839
+ # print "greater nest 2: ", i_flops
828
840
else :
829
841
# iflops contains the flop count for the inner loops, add to that the flops generated by the rest of the code
830
842
# in the current loop
831
843
i_flops = (ic [2 ] if ic [3 ] else 1 ) * (i_flops + ic [val_pos ] - iaca_block_flops )
844
+ # print "greater nest 3: ", i_flops
832
845
else :
833
846
# Not tested yet. Might never apply to FFC kernels.
834
847
total_flops += i_flops
835
848
i_flops = (ic [2 ] if ic [3 ] else 1 ) * ic [val_pos ]
849
+ # print "smaller nest: ", i_flops
836
850
prev_nest = ic [1 ]
837
851
total_flops += i_flops
838
852
return total_flops
@@ -985,6 +999,7 @@ def compile(self, argtypes=None, restype=None):
985
999
wrapper_code ['iaca_end' ] = ""
986
1000
for ind in range (1 , len (iaca_kernels )):
987
1001
iaca_kernels [ind ][0 ] = self .get_c_code (iaca_kernels [ind ][0 ], wrapper_code )
1002
+ for ind in range (1 , len (iaca_cycle_kernels )):
988
1003
iaca_cycle_kernels [ind ][0 ] = self .get_c_code (iaca_cycle_kernels [ind ][0 ], wrapper_code )
989
1004
iaca_path = path_to_iaca_file + region_name + "_" + self ._kernel ._md5 + ".txt"
990
1005
self .build_loop_nest_reports (iaca_kernels , wrapper_code , iaca_path , compilation , extension , cppargs , ldargs , argtypes , restype , compiler )
0 commit comments