2222#include  " mlir/Interfaces/DataLayoutInterfaces.h" 
2323#include  " mlir/Pass/PassManager.h" 
2424
25+ #ifdef  GC_ENABLE_GPU_PROFILE
26+ #include  " PtiGpuUtils.h" 
27+ #include  " pti/pti_view.h" 
28+ std::map<std::pair<pti_view_external_kind, uint64_t >, std::vector<uint32_t >>
29+     external_corr_map;
30+ std::map<uint32_t , std::string> runtime_enq_2_gpu_kernel_name_map;
31+ std::map<uint32_t , std::string> runtime_enq_2_gpu_mem_op_name_map;
32+ 
33+ class  GPUKernelTracer  {
34+ public: 
35+   GPUKernelTracer () {
36+     gcLogD (" Enable Profiling."  );
37+     ptiViewSetCallbacks (
38+         [](auto  **buf, auto  *buf_size) {
39+           *buf_size = sizeof (pti_view_record_kernel) * 100 ;
40+           void  *ptr = ::operator  new (*buf_size);
41+           ptr = std::align (8 , sizeof (unsigned  char ), ptr, *buf_size);
42+           *buf = reinterpret_cast <unsigned  char  *>(ptr);
43+           if  (!*buf) {
44+             std::abort ();
45+           }
46+           return ;
47+         },
48+         [](auto  *buf, auto  buf_size, auto  valid_buf_size) {
49+           if  (!buf_size || !valid_buf_size || !buf_size) {
50+             std::cerr << " Received empty buffer"   << ' \n '  ;
51+             if  (valid_buf_size) {
52+               ::operator  delete (buf);
53+             }
54+             return ;
55+           }
56+           pti_view_record_base *ptr = nullptr ;
57+           while  (true ) {
58+             auto  buf_status = ptiViewGetNextRecord (buf, valid_buf_size, &ptr);
59+             if  (buf_status == pti_result::PTI_STATUS_END_OF_BUFFER) {
60+               std::cout << " Reached End of buffer"   << ' \n '  ;
61+               break ;
62+             }
63+             if  (buf_status != pti_result::PTI_SUCCESS) {
64+               std::cerr << " Found Error Parsing Records from PTI"   << ' \n '  ;
65+               break ;
66+             }
67+             switch  (ptr->_view_kind ) {
68+             case  pti_view_kind::PTI_VIEW_INVALID: {
69+               std::cout << " Found Invalid Record"   << ' \n '  ;
70+               break ;
71+             }
72+             case  pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_COPY: {
73+               std::cout << " ---------------------------------------------------" 
74+                            " -----------------------------" 
75+                         << ' \n '  ;
76+               pti_view_record_memory_copy *rec =
77+                   reinterpret_cast <pti_view_record_memory_copy *>(ptr);
78+               runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id ] =
79+                   rec->_name ;
80+               std::cout << " Found Memory Record"   << ' \n '  ;
81+               samples_utils::dump_record (rec);
82+               std::cout << " ---------------------------------------------------" 
83+                            " -----------------------------" 
84+                         << ' \n '  ;
85+               break ;
86+             }
87+             case  pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_FILL: {
88+               std::cout << " ---------------------------------------------------" 
89+                            " -----------------------------" 
90+                         << ' \n '  ;
91+               pti_view_record_memory_fill *rec =
92+                   reinterpret_cast <pti_view_record_memory_fill *>(ptr);
93+               runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id ] =
94+                   rec->_name ;
95+               std::cout << " Found Memory Record"   << ' \n '  ;
96+               samples_utils::dump_record (rec);
97+               std::cout << " ---------------------------------------------------" 
98+                            " -----------------------------" 
99+                         << ' \n '  ;
100+               break ;
101+             }
102+             case  pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
103+               std::cout << " ---------------------------------------------------" 
104+                            " -----------------------------" 
105+                         << ' \n '  ;
106+               pti_view_record_kernel *rec =
107+                   reinterpret_cast <pti_view_record_kernel *>(ptr);
108+               runtime_enq_2_gpu_kernel_name_map[rec->_correlation_id ] =
109+                   rec->_name ;
110+               std::cout << " Found Kernel Record"   << ' \n '  ;
111+               samples_utils::dump_record (rec);
112+ 
113+               std::cout << " ---------------------------------------------------" 
114+                            " -----------------------------" 
115+                         << ' \n '  ;
116+               if  (samples_utils::isMonotonic (
117+                       {rec->_sycl_task_begin_timestamp ,
118+                        rec->_sycl_enqk_begin_timestamp , rec->_append_timestamp ,
119+                        rec->_submit_timestamp , rec->_start_timestamp ,
120+                        rec->_end_timestamp })) {
121+                 std::cout << " ------------>     All Monotonic"   << std::endl;
122+               } else  {
123+                 std::cerr
124+                     << " ------------>     Something wrong: NOT All monotonic" 
125+                     << std::endl;
126+               };
127+               if  (rec->_sycl_task_begin_timestamp  == 0 ) {
128+                 std::cerr << " ------------>     Something wrong: Sycl Task " 
129+                              " Begin Time is 0" 
130+                           << std::endl;
131+               }
132+               if  (rec->_sycl_enqk_begin_timestamp  == 0 ) {
133+                 std::cerr << " ------------>     Something wrong: Sycl Enq " 
134+                              " Launch Kernel Time is 0" 
135+                           << std::endl;
136+               }
137+ 
138+               break ;
139+             }
140+             case  pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
141+               std::cout << " ---------------------------------------------------" 
142+                            " -----------------------------" 
143+                         << ' \n '  ;
144+               pti_view_record_external_correlation *rec =
145+                   reinterpret_cast <pti_view_record_external_correlation *>(ptr);
146+ 
147+               external_corr_map[std::pair{rec->_external_kind ,
148+                                           rec->_external_id }]
149+                   .push_back (rec->_correlation_id );
150+               samples_utils::dump_record (rec);
151+               break ;
152+             }
153+             case  pti_view_kind::PTI_VIEW_OPENCL_CALLS: {
154+               std::cout << " ---------------------------------------------------" 
155+                            " -----------------------------" 
156+                         << ' \n '  ;
157+               pti_view_record_oclcalls *rec =
158+                   reinterpret_cast <pti_view_record_oclcalls *>(ptr);
159+               samples_utils::dump_record (rec);
160+               break ;
161+             }
162+             default : {
163+               std::cerr << " This shouldn't happen"   << ' \n '  ;
164+               break ;
165+             }
166+             }
167+           }
168+           ::operator  delete (buf);
169+         });
170+     ptiViewSetOclProfiling ();
171+ 
172+     ptiViewEnable (PTI_VIEW_DEVICE_GPU_KERNEL);
173+     ptiViewEnable (PTI_VIEW_DEVICE_GPU_MEM_COPY);
174+     ptiViewEnable (PTI_VIEW_DEVICE_GPU_MEM_FILL);
175+     ptiViewEnable (PTI_VIEW_OPENCL_CALLS);
176+     ptiViewEnable (PTI_VIEW_EXTERNAL_CORRELATION);
177+   }
178+ 
179+   ~GPUKernelTracer () {
180+     gcLogD (" Profiling is finished."  );
181+     ptiViewDisable (PTI_VIEW_DEVICE_GPU_KERNEL);
182+     ptiViewDisable (PTI_VIEW_DEVICE_GPU_MEM_COPY);
183+     ptiViewDisable (PTI_VIEW_DEVICE_GPU_MEM_FILL);
184+     ptiViewEnable (PTI_VIEW_OPENCL_CALLS);
185+     ptiViewDisable (PTI_VIEW_EXTERNAL_CORRELATION);
186+     ptiFlushAllViews ();
187+   }
188+ };
189+ 
190+ /* 
191+ Create an RAII tracer with a static life cycle to trace all device kernel 
192+ execution during the program. When the tracer's constructor is called, the 
193+ EnableProfiling will also be called, registering some metric collection 
194+ call-back function into the opencl function call. When the tracer is destroyed, 
195+ the DisableProfiling is also called which will statistic the collected metric 
196+ during the tracer lifetime and print the result. The concrete implementation of 
197+ EnableProfiling and DisableProfiling could refer to 
198+ https://github.com/intel/pti-gpu/blob/master/tools/onetrace/tool.cc. 
199+ */ 
200+ static  GPUKernelTracer tracer;
201+ 
202+ #endif 
203+ 
25204namespace  mlir ::gc::gpu {
26205
27206#define  makeClErrPref (code ) " OpenCL error "  , code, " : " 
@@ -128,10 +307,9 @@ struct Kernel {
128307
129308  explicit  Kernel (cl_program program, cl_kernel kernel, const  size_t  *gridSize,
130309                  const  size_t  *blockSize, size_t  argNum, const  size_t  *argSize)
131-       : program(program),
132-         kernel(kernel), globalSize{gridSize[0 ] * blockSize[0 ],
133-                                    gridSize[1 ] * blockSize[1 ],
134-                                    gridSize[2 ] * blockSize[2 ]},
310+       : program(program), kernel(kernel),
311+         globalSize{gridSize[0 ] * blockSize[0 ], gridSize[1 ] * blockSize[1 ],
312+                    gridSize[2 ] * blockSize[2 ]},
135313        localSize{blockSize[0 ], blockSize[1 ], blockSize[2 ]},
136314        argSize (argSize, argSize + argNum) {
137315#ifndef  NDEBUG
@@ -1014,4 +1192,4 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
10141192  return  cache.emplace (OclDevCtxPair (ext.device , ext.context ), ptr)
10151193      .first ->second ;
10161194}
1017- } //  namespace mlir::gc::gpu
1195+ } //  namespace mlir::gc::gpu
0 commit comments