/******************************************************************************
Copyright 2013 Royal Caliber LLC. (http://www.royal-caliber.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
******************************************************************************/
#ifndef GPUGAS_KERNELS_CUH__
#define GPUGAS_KERNELS_CUH__
//from moderngpu library
#include "device/ctaloadbalance.cuh"
//Device code for GASEngineGPU
//Ideally these kernels would have been static private member functions, but
//nvcc (CUDA 5.0) does not accept __global__ or __device__ code inside classes.
namespace GPUGASKernels
{
//assumes blocks are 1-D
//allows for 2D (and 3D) grids, since a 1D grid is capped at 65535 blocks on
//pre-Kepler devices and doesn't get us very far
template<typename Int>
__device__ Int globalThreadId()
{
  return threadIdx.x + blockDim.x * (blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y);
}
//set an array to a range of values
template<typename Int>
__global__ void kRange(Int start, Int end, Int *out)
{
  Int tid = globalThreadId<Int>();
  if( tid < end - start )
    out[tid] = tid + start;
}
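
//Example launch of kRange (a sketch, not part of the original file; n and
//d_out are hypothetical): a 2D grid covers inputs larger than 65535 blocks
//
//  const int NT = 256;
//  int nBlocks = (int)((n + NT - 1) / NT);
//  dim3 grid(std::min(nBlocks, 65535), (nBlocks + 65534) / 65535);
//  kRange<int><<<grid, NT>>>(0, n, d_out);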

//This version does the gatherMap only and generates keys for subsequent
//use with thrust reduce_by_key
template<typename Program, typename Int, int NT, int indirectedGather>
__global__ void kGatherMap(Int nActiveVertices
  , const Int *activeVertices
  , const int numBlocks
  , Int nTotalEdges
  , const Int *edgeCountScan
  , const Int *mergePathPartitions
  , const Int *srcOffsets
  , const Int *srcs
  , const typename Program::VertexData* vertexData
  , const typename Program::EdgeData* edgeData
  , const Int* edgeIndexCSC
  , Int *dsts
  , typename Program::GatherResult* output)
{
  //boilerplate from MGPU. VT will be 1 in this kernel until it is fully
  //rewritten, so we are not bothering with LaunchBox
  const int VT = 1;

  //both uses of shared memory overlap in the same storage
  union Shared
  {
    Int indices[NT * (VT + 1)];
    Int dstVerts[NT * VT];
  };
  __shared__ Shared shared; //so poetic!

  Int block = blockIdx.x + blockIdx.y * gridDim.x;
  Int bTid = threadIdx.x; //tid within block
  if (block >= numBlocks)
    return;

  int4 range = mgpu::CTALoadBalance<NT, VT>(nTotalEdges, edgeCountScan
    , nActiveVertices, block, bTid, mergePathPartitions
    , shared.indices, true);

  //global index into output
  Int gTid = bTid + range.x;

  //get the count of edges this block will do
  int edgeCount = range.y - range.x;

  //get the number of dst vertices this block will do
  //int nDsts = range.w - range.z;

  int iActive[VT];
  mgpu::DeviceSharedToReg<NT, VT>(shared.indices, bTid, iActive);

  //each thread that is responsible for an edge should now apply Program::gatherMap
  if( bTid < edgeCount )
  {
    //get the rank of this edge within its dst vertex's incoming edges:
    //CTALoadBalance leaves the scanned edge counts in shared.indices
    //starting at offset edgeCount
    int iEdge;
    iEdge = gTid - shared.indices[edgeCount + iActive[0] - range.z];

    typename Program::GatherResult result;
    Int dstVerts[VT];

    //should we use an mgpu function for this indirected load?
    Int dst = dstVerts[0] = activeVertices[iActive[0]];

    //check if we have a vertex with no incoming edges
    //this is the matching kludge for faking the count to be 1
    Int soff = srcOffsets[dst];
    Int nEdges = srcOffsets[dst + 1] - soff;
    if( nEdges )
    {
      iEdge += soff;
      Int src = srcs[ iEdge ];
      if( indirectedGather )
        result = Program::gatherMap(vertexData + dst, vertexData + src, edgeData + edgeIndexCSC[iEdge]);
      else
        result = Program::gatherMap(vertexData + dst, vertexData + src, edgeData + iEdge);
    }
    else
      result = Program::gatherZero;

    //write out a key and a result.
    //Next we will be adding a block-wide or at least a warp-wide reduction here.
    dsts[gTid] = dstVerts[0];
    output[gTid] = result;
  }
}
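
//The keys written to dsts group results per destination vertex, so the
//engine can combine them with thrust::reduce_by_key. A minimal host-side
//sketch (not part of the original file; d_uniqueDsts, d_reduced and
//ReduceFunctor are hypothetical, assuming Program supplies an associative
//reduction over GatherResult):
//
//  thrust::reduce_by_key(thrust::device_pointer_cast(dsts)
//    , thrust::device_pointer_cast(dsts + nTotalEdges)
//    , thrust::device_pointer_cast(output)
//    , thrust::device_pointer_cast(d_uniqueDsts)
//    , thrust::device_pointer_cast(d_reduced)
//    , thrust::equal_to<Int>()
//    , ReduceFunctor<Program>());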

//Run Program::apply and store which vertices are to activate their neighbors.
//We can either return a compact list or a set of flags; to be decided.
template<typename Program, typename Int>
__global__ void kApply(Int nActiveVertices
  , const Int *activeVertices
  , const typename Program::GatherResult *gatherResults
  , typename Program::VertexData *vertexData
  , Int *retFlags)
{
  Int tid = globalThreadId<Int>();
  if( tid >= nActiveVertices )
    return;

  int vid = activeVertices[tid];
  retFlags[tid] = Program::apply(vertexData ? vertexData + vid : 0, gatherResults[tid]);
}
} //end namespace GPUGASKernels
#endif