diff --git a/nvmolkit/clustering.py b/nvmolkit/clustering.py index 538eb31..1d85ae3 100644 --- a/nvmolkit/clustering.py +++ b/nvmolkit/clustering.py @@ -139,6 +139,8 @@ def fused_butina( n_start = x.shape[0] device = x.device indices = torch.arange(n_start, dtype=torch.int32, device=device) + # CPU mirror of indices avoids a D2H sync to record each centroid. + indices_host = list(range(n_start)) cluster_count = torch.zeros(2, dtype=torch.int32, device=device) cluster_count[1] = n_start - 1 cluster_indices = torch.zeros(n_start, dtype=torch.int32, device=device) @@ -149,33 +151,55 @@ def fused_butina( threshold = float(1 - cutoff) y = x first_run = True - while cluster_count[0].item() <= cluster_count[1].item() and x.shape[0] > 0: + # cc[0] = next cluster start index, cc[1] = last valid index. + # Initialized to [0, n_start-1] matching cluster_count, so the initial + # while condition is satisfied for n_start > 0 without a D2H sync. + cc = [0, n_start - 1] + while cc[0] <= cc[1] and x.shape[0] > 0: update_neighbor_counts(x, y, neigh, threshold, subtract=not first_run, metric=metric) first_run = False - max_val = neigh.max().item() + # Batch max and last-argmax into one D2H transfer (sync 1 of 2). + neigh_flipped = neigh.flip(0) + batch_ma = torch.stack( + [neigh.max().to(torch.int64), neigh_flipped.argmax().to(torch.int64)] + ).tolist() + max_val = int(batch_ma[0]) if max_val == 0: break - id_max = neigh.shape[0] - 1 - neigh.flip(0).contiguous().argmax().item() - centroids.append(indices[id_max].item()) + id_max = neigh.shape[0] - 1 - int(batch_ma[1]) + centroids.append(indices_host[id_max]) # CPU mirror, no sync extract_cluster_and_singletons( x, id_max, is_free, neigh, cluster_count, cluster_indices, threshold, indices, metric=metric ) - cluster_sizes.append(cluster_count[0].item()) - x, y = x[is_free, :].contiguous(), x[~is_free, :].contiguous() + + # Batch cluster_count and is_free into one D2H transfer (sync 2 of 2). + # combined[:2] = cluster_count, combined[2:] = is_free as int32. + combined = torch.cat([cluster_count, is_free.to(torch.int32)]).tolist() + cc = [int(combined[0]), int(combined[1])] + is_free_host = [bool(v) for v in combined[2:]] + + cluster_sizes.append(cc[0]) + + # is_free is already updated in-place on GPU by extract_cluster_and_singletons; + # use it directly to avoid a H2D→CPU→H2D roundtrip. + # is_free_host (from the combined download above) is still needed for indices_host. + y = x[~is_free, :].contiguous() + x = x[is_free, :].contiguous() indices = indices[is_free].contiguous() neigh = neigh[is_free].contiguous() is_free = torch.ones(x.shape[0], dtype=torch.bool, device=x.device) + indices_host = [idx for idx, keep in zip(indices_host, is_free_host) if keep] cluster_indices_cpu = cluster_indices.cpu() + indices_cpu = cluster_indices_cpu.numpy() for i in range(n_start - cluster_sizes[-1]): item = cluster_sizes[-1] cluster_sizes.append(cluster_sizes[-1] + 1) - centroids.append(cluster_indices_cpu[item].item()) + centroids.append(int(indices_cpu[item])) clusters = [] - indices_cpu = cluster_indices.cpu().numpy() for i in range(len(cluster_sizes) - 1): start_idx = cluster_sizes[i] end_idx = cluster_sizes[i + 1] diff --git a/src/conformer_rmsd.cu b/src/conformer_rmsd.cu index bb10445..d1bb6ed 100644 --- a/src/conformer_rmsd.cu +++ b/src/conformer_rmsd.cu @@ -99,15 +99,29 @@ __device__ __forceinline__ double det3x3(const double* H) { // --------------------------------------------------------------------------- constexpr int kRmsdBlockSize = 128; -using RmsdBlockReduceT = cub::BlockReduce; +constexpr int kRmsdWarps = kRmsdBlockSize / 32; // 4 warps per block + +using RmsdWarpReduce = cub::WarpReduce; __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coordI, const double* __restrict__ coordJ, const int numAtoms, const bool prealigned, double* outRmsd) { - const int tid = threadIdx.x; - __shared__ RmsdBlockReduceT::TempStorage reduceTmp; + const int tid = threadIdx.x; + const int warpId = tid / 32; + const int laneId = tid % 32; + + // Shared buffers for warp→block reduction. + // warpReduceTemp[w]: CUB WarpReduce scratch for warp w (empty for full 32-thread warps). + // warpBuf[w][f]: field f's warp-partial sum for warp w. + // sCent[6]: broadcast centroid buffer written by thread 0 after sync 1. + // prealigned path uses only warpBuf[w][0]. + // Alignment path phase-1 uses fields 0-5 (centroid sums); + // phase-2 uses fields 0-10 (Sp, Sq, H[0..8]). + __shared__ typename RmsdWarpReduce::TempStorage warpReduceTemp[kRmsdWarps]; + __shared__ double warpBuf[kRmsdWarps][11]; + __shared__ double sCent[6]; if (prealigned) { // ---- Simple RMSD without alignment (no centering, matches RDKit behavior) ---- @@ -118,16 +132,22 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord const double dz = coordI[a * 3 + 2] - coordJ[a * 3 + 2]; sumSqDiff += dx * dx + dy * dy + dz * dz; } - const double total = RmsdBlockReduceT(reduceTmp).Sum(sumSqDiff); - if (tid == 0) + sumSqDiff = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumSqDiff); + if (laneId == 0) + warpBuf[warpId][0] = sumSqDiff; + __syncthreads(); + if (tid == 0) { + double total = 0.0; + for (int w = 0; w < kRmsdWarps; ++w) + total += warpBuf[w][0]; *outRmsd = sqrt(total / static_cast(numAtoms)); + } return; } - // ---- Kabsch alignment path: compute centroids ---- - __shared__ double sCentI[3]; - __shared__ double sCentJ[3]; - + // ---- Kabsch alignment path ---- + // Phase 1: accumulate centroid sums, warp-reduce into warpBuf (sync 1), + // thread 0 computes and broadcasts centroids via sCent (sync 2). double sumIx = 0.0, sumIy = 0.0, sumIz = 0.0; double sumJx = 0.0, sumJy = 0.0, sumJz = 0.0; for (int a = tid; a < numAtoms; a += kRmsdBlockSize) { @@ -138,43 +158,49 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord sumJy += coordJ[a * 3 + 1]; sumJz += coordJ[a * 3 + 2]; } + sumIx = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumIx); + sumIy = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumIy); + sumIz = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumIz); + sumJx = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumJx); + sumJy = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumJy); + sumJz = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumJz); + + if (laneId == 0) { + warpBuf[warpId][0] = sumIx; + warpBuf[warpId][1] = sumIy; + warpBuf[warpId][2] = sumIz; + warpBuf[warpId][3] = sumJx; + warpBuf[warpId][4] = sumJy; + warpBuf[warpId][5] = sumJz; + } + __syncthreads(); // sync 1: warpBuf[*][0..5] visible; thread 0 reduces below. - // Reduce centroid components; write results to shared memory for broadcast. - // Each __syncthreads() both allows TempStorage reuse and makes the previous - // shared-memory write visible to all threads before the next reduction. const double invN = 1.0 / static_cast(numAtoms); - sumIx = RmsdBlockReduceT(reduceTmp).Sum(sumIx); - __syncthreads(); - if (tid == 0) - sCentI[0] = sumIx * invN; - sumIy = RmsdBlockReduceT(reduceTmp).Sum(sumIy); - __syncthreads(); - if (tid == 0) - sCentI[1] = sumIy * invN; - sumIz = RmsdBlockReduceT(reduceTmp).Sum(sumIz); - __syncthreads(); - if (tid == 0) - sCentI[2] = sumIz * invN; - sumJx = RmsdBlockReduceT(reduceTmp).Sum(sumJx); - __syncthreads(); - if (tid == 0) - sCentJ[0] = sumJx * invN; - sumJy = RmsdBlockReduceT(reduceTmp).Sum(sumJy); - __syncthreads(); - if (tid == 0) - sCentJ[1] = sumJy * invN; - sumJz = RmsdBlockReduceT(reduceTmp).Sum(sumJz); - __syncthreads(); - if (tid == 0) - sCentJ[2] = sumJz * invN; - __syncthreads(); // broadcast sCentJ[2] and ensure all centroid writes are visible - - const double cIx = sCentI[0], cIy = sCentI[1], cIz = sCentI[2]; - const double cJx = sCentJ[0], cJy = sCentJ[1], cJz = sCentJ[2]; - - // ---- Compute Sp, Sq, and cross-covariance H (Kabsch alignment) ---- + if (tid == 0) { + double sIx = 0.0, sIy = 0.0, sIz = 0.0, sJx = 0.0, sJy = 0.0, sJz = 0.0; + for (int w = 0; w < kRmsdWarps; ++w) { + sIx += warpBuf[w][0]; + sIy += warpBuf[w][1]; + sIz += warpBuf[w][2]; + sJx += warpBuf[w][3]; + sJy += warpBuf[w][4]; + sJz += warpBuf[w][5]; + } + sCent[0] = sIx * invN; + sCent[1] = sIy * invN; + sCent[2] = sIz * invN; + sCent[3] = sJx * invN; + sCent[4] = sJy * invN; + sCent[5] = sJz * invN; + } + __syncthreads(); // sync 2: sCent visible to all threads. + + const double cIx = sCent[0], cIy = sCent[1], cIz = sCent[2]; + const double cJx = sCent[3], cJy = sCent[4], cJz = sCent[5]; + + // Phase 2: accumulate Sp, Sq, and H = P^T Q (11 values), warp-reduce into + // warpBuf (sync 3), thread 0 sums and computes RMSD. // Sp = sum ||pi - centI||^2, Sq = sum ||qj - centJ||^2 - // H[r][c] = sum (pi[r] - centI[r]) * (qj[c] - centJ[c]) double localSp = 0.0, localSq = 0.0; double localH[9] = {0.0}; @@ -189,7 +215,6 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord localSp += px * px + py * py + pz * pz; localSq += qx * qx + qy * qy + qz * qz; - // H = P^T Q (sum of outer products) localH[0] += px * qx; localH[1] += px * qy; localH[2] += px * qz; @@ -201,35 +226,27 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord localH[8] += pz * qz; } - // Reduce all 11 values into thread 0. Results in non-zero threads are - // undefined and unused; only thread 0 performs the RMSD computation below. - // __syncthreads() between calls allows TempStorage reuse. - localSp = RmsdBlockReduceT(reduceTmp).Sum(localSp); - __syncthreads(); - localSq = RmsdBlockReduceT(reduceTmp).Sum(localSq); - __syncthreads(); - localH[0] = RmsdBlockReduceT(reduceTmp).Sum(localH[0]); - __syncthreads(); - localH[1] = RmsdBlockReduceT(reduceTmp).Sum(localH[1]); - __syncthreads(); - localH[2] = RmsdBlockReduceT(reduceTmp).Sum(localH[2]); - __syncthreads(); - localH[3] = RmsdBlockReduceT(reduceTmp).Sum(localH[3]); - __syncthreads(); - localH[4] = RmsdBlockReduceT(reduceTmp).Sum(localH[4]); - __syncthreads(); - localH[5] = RmsdBlockReduceT(reduceTmp).Sum(localH[5]); - __syncthreads(); - localH[6] = RmsdBlockReduceT(reduceTmp).Sum(localH[6]); - __syncthreads(); - localH[7] = RmsdBlockReduceT(reduceTmp).Sum(localH[7]); - __syncthreads(); - localH[8] = RmsdBlockReduceT(reduceTmp).Sum(localH[8]); - // No final sync: only thread 0 reads localSp, localSq, localH below. - - // ---- Thread 0: compute RMSD from Sp, Sq, singular values of H ---- + localSp = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(localSp); + localSq = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(localSq); + for (int i = 0; i < 9; ++i) + localH[i] = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(localH[i]); + + if (laneId == 0) { + warpBuf[warpId][0] = localSp; + warpBuf[warpId][1] = localSq; + for (int i = 0; i < 9; ++i) + warpBuf[warpId][2 + i] = localH[i]; + } + __syncthreads(); // sync 3: warpBuf[*][0..10] visible; thread 0 computes RMSD. + if (tid == 0) { - const double* H = localH; + double Sp = 0.0, Sq = 0.0, H[9] = {0.0}; + for (int w = 0; w < kRmsdWarps; ++w) { + Sp += warpBuf[w][0]; + Sq += warpBuf[w][1]; + for (int i = 0; i < 9; ++i) + H[i] += warpBuf[w][2 + i]; + } // G = H^T H (3x3 symmetric positive semi-definite) const double g00 = H[0] * H[0] + H[3] * H[3] + H[6] * H[6]; @@ -253,7 +270,7 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord s2 = -s2; // RMSD^2 = (Sp + Sq - 2*(s0 + s1 + s2)) / N - const double rmsdSq = fmax((localSp + localSq - 2.0 * (s0 + s1 + s2)) * invN, 0.0); + const double rmsdSq = fmax((Sp + Sq - 2.0 * (s0 + s1 + s2)) * invN, 0.0); *outRmsd = sqrt(rmsdSq); } } diff --git a/src/minimizer/bfgs_minimize.cu b/src/minimizer/bfgs_minimize.cu index 020cc09..e240eec 100644 --- a/src/minimizer/bfgs_minimize.cu +++ b/src/minimizer/bfgs_minimize.cu @@ -908,6 +908,10 @@ __global__ void updateDGradKernel(const double gradTol, double blockMax = cub::BlockReduce(tempStorage).Reduce(localMax, cubMax()); if (idxWithinSystem == 0) { + // Matches RDKit's signed-energy convergence denominator in ForceField::minimize. + // Negative-energy geometries can clamp this to 1, artificially tightening the + // check — but fixing it unconditionally diverges from reference behavior. + // TODO: file upstream RDKit bug; gate fix on RDKit version once merged there. const double term = max(energies[sysIdx] * gradScales[sysIdx], 1.0); blockMax /= term; if (blockMax < gradTol) { diff --git a/src/similarity_kernels.cu b/src/similarity_kernels.cu index 4bffc72..7de1d65 100644 --- a/src/similarity_kernels.cu +++ b/src/similarity_kernels.cu @@ -65,6 +65,30 @@ bool supportsTensorOps(const int major, const int minor) { return false; } +// Per-device cache for tensor-op capability: 0=unknown, 1=unsupported, 2=supported. +// Indexed by device ordinal; supports up to kMaxDevices GPUs in a single process. +// Initialized to zero by C++ static-storage rules (all unknown at startup). +constexpr int kMaxDevices = 16; + +std::atomic g_tensorOpsCache[kMaxDevices]{}; + +//! Returns whether the current CUDA device supports BMMA tensor ops, +//! querying cudaGetDeviceProperties at most once per device per process. +bool isTensorOpsSupportedCached() { + int device; + cudaCheckError(cudaGetDevice(&device)); + if (device >= 0 && device < kMaxDevices && g_tensorOpsCache[device] != 0) { + return g_tensorOpsCache[device] == 2; + } + cudaDeviceProp props; + cudaCheckError(cudaGetDeviceProperties(&props, device)); + const bool result = supportsTensorOps(props.major, props.minor); + if (device >= 0 && device < kMaxDevices) { + g_tensorOpsCache[device] = result ? 2 : 1; + } + return result; +} + //! Cast a device vector to a span of a type smaller or equal to self. //! Does some basic checking of compatibility between types. template @@ -423,17 +447,10 @@ void launchCrossTanimotoSimilarity(const AsyncDeviceVector const size_t numBitsPerMolecule, AsyncDeviceVector& results, const size_t offset) { - int device; - cudaGetDevice(&device); - cudaCheckError(cudaGetLastError()); - - cudaDeviceProp deviceProp; - cudaCheckError(cudaGetDeviceProperties(&deviceProp, device)); - size_t m = bitsOne.size(); size_t n = bitsTwo.size(); - if (supportsTensorOps(deviceProp.major, deviceProp.minor)) { + if (isTensorOpsSupportedCached()) { constexpr unsigned int BLOCK_TILE_SIZE_X{64U}; constexpr unsigned int BLOCK_TILE_SIZE_Y{64U}; @@ -508,17 +525,10 @@ void launchCrossTanimotoSimilarity(const cuda::std::span bi const cuda::std::span results, const size_t offset, cudaStream_t stream) { - int device; - cudaGetDevice(&device); - cudaCheckError(cudaGetLastError()); - - cudaDeviceProp deviceProp; - cudaCheckError(cudaGetDeviceProperties(&deviceProp, device)); - size_t m = bitsOne.size() / numBitsPerMolecule; size_t n = bitsTwo.size() / numBitsPerMolecule; - if (supportsTensorOps(deviceProp.major, deviceProp.minor)) { + if (isTensorOpsSupportedCached()) { constexpr unsigned int BLOCK_TILE_SIZE_X{64U}; constexpr unsigned int BLOCK_TILE_SIZE_Y{64U}; @@ -645,17 +655,10 @@ void launchCrossCosineSimilarity(const AsyncDeviceVector& const size_t numBitsPerMolecule, AsyncDeviceVector& results, const size_t offset) { - int device; - cudaGetDevice(&device); - cudaCheckError(cudaGetLastError()); - - cudaDeviceProp deviceProp; - cudaCheckError(cudaGetDeviceProperties(&deviceProp, device)); - size_t m = bitsOne.size(); size_t n = bitsTwo.size(); - if (supportsTensorOps(deviceProp.major, deviceProp.minor)) { + if (isTensorOpsSupportedCached()) { constexpr unsigned int BLOCK_TILE_SIZE_X{64U}; constexpr unsigned int BLOCK_TILE_SIZE_Y{64U}; @@ -730,17 +733,10 @@ void launchCrossCosineSimilarity(const cuda::std::span bits const cuda::std::span results, const size_t offset, cudaStream_t stream) { - int device; - cudaGetDevice(&device); - cudaCheckError(cudaGetLastError()); - - cudaDeviceProp deviceProp; - cudaCheckError(cudaGetDeviceProperties(&deviceProp, device)); - size_t m = bitsOne.size() / numBitsPerMolecule; size_t n = bitsTwo.size() / numBitsPerMolecule; - if (supportsTensorOps(deviceProp.major, deviceProp.minor)) { + if (isTensorOpsSupportedCached()) { constexpr unsigned int BLOCK_TILE_SIZE_X{64U}; constexpr unsigned int BLOCK_TILE_SIZE_Y{64U}; diff --git a/src/symmetric_eigensolver.cu b/src/symmetric_eigensolver.cu index 2a69d9e..8aefd3c 100644 --- a/src/symmetric_eigensolver.cu +++ b/src/symmetric_eigensolver.cu @@ -198,6 +198,7 @@ void launchBatchEigensolverKernel(const int numEigs, double* eigenvaluesOut, double* eigenvectorsOut, uint8_t* converged, + curandState* states, const uint8_t* active, const int seed = 42) { const int numBlocks = numSystems; @@ -205,15 +206,13 @@ void launchBatchEigensolverKernel(const int numEigs, if (matrixDim > 256) { throw std::runtime_error("Matrix dimension is too large for the kernel"); } - // TODO: reuse. - AsyncDeviceVector states(numSystems * matrixDim); batchEigensolverKernel<<>>(numEigs, matrixDim, mutableBoundsMatrices, eigenvaluesOut, eigenvectorsOut, converged, - states.data(), + states, active, seed); cudaCheckError(cudaGetLastError()); @@ -234,7 +233,11 @@ class BatchedEigenSolver::Impl { converged_.resize(batch_size); cudaCheckError(cudaMemsetAsync(converged_.data(), 0, batch_size * sizeof(uint8_t))); - // Solve batched eigenvalue problem + const size_t statesNeeded = static_cast(batch_size) * static_cast(matrixDim); + if (statesNeeded > states_.size()) { + states_.resize(statesNeeded); + } + launchBatchEigensolverKernel(numEigs, batch_size, matrixDim, @@ -242,6 +245,7 @@ class BatchedEigenSolver::Impl { eigenvalues, eigenvectors, converged_.data(), + states_.data(), active, randomSeed); } @@ -249,7 +253,8 @@ class BatchedEigenSolver::Impl { const uint8_t* converged() const { return converged_.data(); } private: - AsyncDeviceVector converged_; + AsyncDeviceVector converged_; + AsyncDeviceVector states_; }; BatchedEigenSolver::BatchedEigenSolver() : pimpl_(std::make_unique()) {} diff --git a/src/tfd/tfd_kernels.cu b/src/tfd/tfd_kernels.cu index 28c3314..ef9337e 100644 --- a/src/tfd/tfd_kernels.cu +++ b/src/tfd/tfd_kernels.cu @@ -109,9 +109,10 @@ __global__ void tfdMatrixKernel(const int numMolecules, // Decode lower-triangular (i, j) from flat pairIdx // i*(i-1)/2 + j = pairIdx, i > j >= 0 // i = floor((1 + sqrt(1 + 8*pairIdx)) / 2) - int i = static_cast((1.0f + sqrtf(1.0f + 8.0f * pairIdx)) * 0.5f); - // Symmetric guards: float sqrtf can round either way when 8*pairIdx exceeds - // float32's 24-bit mantissa (pairIdx >~ 2M, i.e. >~2000 conformers). + // Use double precision: float32 loses bits when 8*pairIdx > 2^23 (~1M pairs, + // ~1415 conformers), the same threshold documented in conformer_rmsd.cu. + int i = static_cast((1.0 + sqrt(1.0 + 8.0 * static_cast(pairIdx))) * 0.5); + // Clamp guards for any residual rounding at boundary values. if (i * (i - 1) / 2 > pairIdx) i--; else if ((i + 1) * i / 2 <= pairIdx) @@ -135,15 +136,19 @@ __global__ void tfdMatrixKernel(const int numMolecules, deviation = circularDifference(dihedralAngles[aI + qLocalStart], dihedralAngles[aJ + qLocalStart]) / torsionMaxDevs[globalT]; } else if (type == 1) { // Ring - float avgI = 0.0f; - float avgJ = 0.0f; - for (int qq = 0; qq < numQ; ++qq) { - avgI += fabsf(dihedralAngles[aI + qLocalStart + qq] - 180.0f); - avgJ += fabsf(dihedralAngles[aJ + qLocalStart + qq] - 180.0f); + if (numQ == 0) { + deviation = 0.0f; + } else { + float avgI = 0.0f; + float avgJ = 0.0f; + for (int qq = 0; qq < numQ; ++qq) { + avgI += fabsf(dihedralAngles[aI + qLocalStart + qq] - 180.0f); + avgJ += fabsf(dihedralAngles[aJ + qLocalStart + qq] - 180.0f); + } + avgI /= numQ; + avgJ /= numQ; + deviation = fabsf(avgI - avgJ) / torsionMaxDevs[globalT]; } - avgI /= numQ; - avgJ /= numQ; - deviation = fabsf(avgI - avgJ) / torsionMaxDevs[globalT]; } else { // Symmetric float minDiff = 180.0f; for (int qi = 0; qi < numQ; ++qi) { diff --git a/src/triangle_smooth.cu b/src/triangle_smooth.cu index 226c0e3..700fd23 100644 --- a/src/triangle_smooth.cu +++ b/src/triangle_smooth.cu @@ -330,7 +330,7 @@ void DeviceBoundsMatrixBatch::copyToHost(std::vector<::DistGeom::BoundsMatPtr>& // Copy device data to host vector first std::vector hostData(totalElements_); data_.copyToHost(hostData); - cudaDeviceSynchronize(); + cudaCheckError(cudaStreamSynchronize(data_.stream())); // Distribute data to individual matrices size_t globalOffset = 0;