diff --git a/nvmolkit/clustering.py b/nvmolkit/clustering.py
index 538eb31..1d85ae3 100644
--- a/nvmolkit/clustering.py
+++ b/nvmolkit/clustering.py
@@ -139,6 +139,8 @@ def fused_butina(
         n_start = x.shape[0]
         device = x.device
         indices = torch.arange(n_start, dtype=torch.int32, device=device)
+        # CPU mirror of indices avoids a D2H sync to record each centroid.
+        indices_host = list(range(n_start))
         cluster_count = torch.zeros(2, dtype=torch.int32, device=device)
         cluster_count[1] = n_start - 1
         cluster_indices = torch.zeros(n_start, dtype=torch.int32, device=device)
@@ -149,33 +151,55 @@ def fused_butina(
         threshold = float(1 - cutoff)
         y = x
         first_run = True
-        while cluster_count[0].item() <= cluster_count[1].item() and x.shape[0] > 0:
+        # cc[0] = next cluster start index, cc[1] = last valid index.
+        # Initialized to [0, n_start-1] matching cluster_count, so the initial
+        # while condition is satisfied for n_start > 0 without a D2H sync.
+        cc = [0, n_start - 1]
+        while cc[0] <= cc[1] and x.shape[0] > 0:
             update_neighbor_counts(x, y, neigh, threshold, subtract=not first_run, metric=metric)
             first_run = False
 
-            max_val = neigh.max().item()
+            # Batch max and last-argmax into one D2H transfer (sync 1 of 2).
+            neigh_flipped = neigh.flip(0)
+            batch_ma = torch.stack(
+                [neigh.max().to(torch.int64), neigh_flipped.argmax().to(torch.int64)]
+            ).tolist()
+            max_val = int(batch_ma[0])
             if max_val == 0:
                 break
-            id_max = neigh.shape[0] - 1 - neigh.flip(0).contiguous().argmax().item()
-            centroids.append(indices[id_max].item())
+            id_max = neigh.shape[0] - 1 - int(batch_ma[1])
+            centroids.append(indices_host[id_max])  # CPU mirror, no sync
 
             extract_cluster_and_singletons(
                 x, id_max, is_free, neigh, cluster_count, cluster_indices, threshold, indices, metric=metric
             )
-            cluster_sizes.append(cluster_count[0].item())
-            x, y = x[is_free, :].contiguous(), x[~is_free, :].contiguous()
+
+            # Batch cluster_count and is_free into one D2H transfer (sync 2 of 2).
+            # combined[:2] = cluster_count, combined[2:] = is_free as int32.
+            combined = torch.cat([cluster_count, is_free.to(torch.int32)]).tolist()
+            cc = [int(combined[0]), int(combined[1])]
+            is_free_host = [bool(v) for v in combined[2:]]
+
+            cluster_sizes.append(cc[0])
+
+            # is_free is already updated in-place on GPU by extract_cluster_and_singletons;
+            # use it directly to avoid a H2D→CPU→H2D roundtrip.
+            # is_free_host (from the combined download above) is still needed for indices_host.
+            y = x[~is_free, :].contiguous()
+            x = x[is_free, :].contiguous()
             indices = indices[is_free].contiguous()
             neigh = neigh[is_free].contiguous()
             is_free = torch.ones(x.shape[0], dtype=torch.bool, device=x.device)
+            indices_host = [idx for idx, keep in zip(indices_host, is_free_host) if keep]
 
         cluster_indices_cpu = cluster_indices.cpu()
+        indices_cpu = cluster_indices_cpu.numpy()
         for i in range(n_start - cluster_sizes[-1]):
             item = cluster_sizes[-1]
             cluster_sizes.append(cluster_sizes[-1] + 1)
-            centroids.append(cluster_indices_cpu[item].item())
+            centroids.append(int(indices_cpu[item]))
 
         clusters = []
-        indices_cpu = cluster_indices.cpu().numpy()
         for i in range(len(cluster_sizes) - 1):
             start_idx = cluster_sizes[i]
             end_idx = cluster_sizes[i + 1]
diff --git a/src/conformer_rmsd.cu b/src/conformer_rmsd.cu
index bb10445..d1bb6ed 100644
--- a/src/conformer_rmsd.cu
+++ b/src/conformer_rmsd.cu
@@ -99,15 +99,29 @@ __device__ __forceinline__ double det3x3(const double* H) {
 // ---------------------------------------------------------------------------
 
 constexpr int kRmsdBlockSize = 128;
-using RmsdBlockReduceT       = cub::BlockReduce<double, kRmsdBlockSize>;
+constexpr int kRmsdWarps     = kRmsdBlockSize / 32;  // 4 warps per block
+
+using RmsdWarpReduce = cub::WarpReduce<double>;
 
 __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coordI,
                                                 const double* __restrict__ coordJ,
                                                 const int  numAtoms,
                                                 const bool prealigned,
                                                 double*    outRmsd) {
-  const int  tid = threadIdx.x;
-  __shared__ RmsdBlockReduceT::TempStorage reduceTmp;
+  const int tid    = threadIdx.x;
+  const int warpId = tid / 32;
+  const int laneId = tid % 32;
+
+  // Shared buffers for warp→block reduction.
+  //   warpReduceTemp[w]: CUB WarpReduce scratch for warp w (empty for full 32-thread warps).
+  //   warpBuf[w][f]:     field f's warp-partial sum for warp w.
+  //   sCent[6]:          broadcast centroid buffer written by thread 0 after sync 1.
+  // prealigned path uses only warpBuf[w][0].
+  // Alignment path phase-1 uses fields 0-5 (centroid sums);
+  //              phase-2 uses fields 0-10 (Sp, Sq, H[0..8]).
+  __shared__ typename RmsdWarpReduce::TempStorage warpReduceTemp[kRmsdWarps];
+  __shared__ double                               warpBuf[kRmsdWarps][11];
+  __shared__ double                               sCent[6];
 
   if (prealigned) {
     // ---- Simple RMSD without alignment (no centering, matches RDKit behavior) ----
@@ -118,16 +132,22 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord
       const double dz = coordI[a * 3 + 2] - coordJ[a * 3 + 2];
       sumSqDiff += dx * dx + dy * dy + dz * dz;
     }
-    const double total = RmsdBlockReduceT(reduceTmp).Sum(sumSqDiff);
-    if (tid == 0)
+    sumSqDiff = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumSqDiff);
+    if (laneId == 0)
+      warpBuf[warpId][0] = sumSqDiff;
+    __syncthreads();
+    if (tid == 0) {
+      double total = 0.0;
+      for (int w = 0; w < kRmsdWarps; ++w)
+        total += warpBuf[w][0];
       *outRmsd = sqrt(total / static_cast<double>(numAtoms));
+    }
     return;
   }
 
-  // ---- Kabsch alignment path: compute centroids ----
-  __shared__ double sCentI[3];
-  __shared__ double sCentJ[3];
-
+  // ---- Kabsch alignment path ----
+  // Phase 1: accumulate centroid sums, warp-reduce into warpBuf (sync 1),
+  //          thread 0 computes and broadcasts centroids via sCent (sync 2).
   double sumIx = 0.0, sumIy = 0.0, sumIz = 0.0;
   double sumJx = 0.0, sumJy = 0.0, sumJz = 0.0;
   for (int a = tid; a < numAtoms; a += kRmsdBlockSize) {
@@ -138,43 +158,49 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord
     sumJy += coordJ[a * 3 + 1];
     sumJz += coordJ[a * 3 + 2];
   }
+  sumIx = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumIx);
+  sumIy = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumIy);
+  sumIz = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumIz);
+  sumJx = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumJx);
+  sumJy = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumJy);
+  sumJz = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(sumJz);
+
+  if (laneId == 0) {
+    warpBuf[warpId][0] = sumIx;
+    warpBuf[warpId][1] = sumIy;
+    warpBuf[warpId][2] = sumIz;
+    warpBuf[warpId][3] = sumJx;
+    warpBuf[warpId][4] = sumJy;
+    warpBuf[warpId][5] = sumJz;
+  }
+  __syncthreads();  // sync 1: warpBuf[*][0..5] visible; thread 0 reduces below.
 
-  // Reduce centroid components; write results to shared memory for broadcast.
-  // Each __syncthreads() both allows TempStorage reuse and makes the previous
-  // shared-memory write visible to all threads before the next reduction.
   const double invN = 1.0 / static_cast<double>(numAtoms);
-  sumIx             = RmsdBlockReduceT(reduceTmp).Sum(sumIx);
-  __syncthreads();
-  if (tid == 0)
-    sCentI[0] = sumIx * invN;
-  sumIy = RmsdBlockReduceT(reduceTmp).Sum(sumIy);
-  __syncthreads();
-  if (tid == 0)
-    sCentI[1] = sumIy * invN;
-  sumIz = RmsdBlockReduceT(reduceTmp).Sum(sumIz);
-  __syncthreads();
-  if (tid == 0)
-    sCentI[2] = sumIz * invN;
-  sumJx = RmsdBlockReduceT(reduceTmp).Sum(sumJx);
-  __syncthreads();
-  if (tid == 0)
-    sCentJ[0] = sumJx * invN;
-  sumJy = RmsdBlockReduceT(reduceTmp).Sum(sumJy);
-  __syncthreads();
-  if (tid == 0)
-    sCentJ[1] = sumJy * invN;
-  sumJz = RmsdBlockReduceT(reduceTmp).Sum(sumJz);
-  __syncthreads();
-  if (tid == 0)
-    sCentJ[2] = sumJz * invN;
-  __syncthreads();  // broadcast sCentJ[2] and ensure all centroid writes are visible
-
-  const double cIx = sCentI[0], cIy = sCentI[1], cIz = sCentI[2];
-  const double cJx = sCentJ[0], cJy = sCentJ[1], cJz = sCentJ[2];
-
-  // ---- Compute Sp, Sq, and cross-covariance H (Kabsch alignment) ----
+  if (tid == 0) {
+    double sIx = 0.0, sIy = 0.0, sIz = 0.0, sJx = 0.0, sJy = 0.0, sJz = 0.0;
+    for (int w = 0; w < kRmsdWarps; ++w) {
+      sIx += warpBuf[w][0];
+      sIy += warpBuf[w][1];
+      sIz += warpBuf[w][2];
+      sJx += warpBuf[w][3];
+      sJy += warpBuf[w][4];
+      sJz += warpBuf[w][5];
+    }
+    sCent[0] = sIx * invN;
+    sCent[1] = sIy * invN;
+    sCent[2] = sIz * invN;
+    sCent[3] = sJx * invN;
+    sCent[4] = sJy * invN;
+    sCent[5] = sJz * invN;
+  }
+  __syncthreads();  // sync 2: sCent visible to all threads.
+
+  const double cIx = sCent[0], cIy = sCent[1], cIz = sCent[2];
+  const double cJx = sCent[3], cJy = sCent[4], cJz = sCent[5];
+
+  // Phase 2: accumulate Sp, Sq, and H = P^T Q (11 values), warp-reduce into
+  //          warpBuf (sync 3), thread 0 sums and computes RMSD.
   // Sp = sum ||pi - centI||^2,  Sq = sum ||qj - centJ||^2
-  // H[r][c] = sum (pi[r] - centI[r]) * (qj[c] - centJ[c])
   double localSp = 0.0, localSq = 0.0;
   double localH[9] = {0.0};
 
@@ -189,7 +215,6 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord
     localSp += px * px + py * py + pz * pz;
     localSq += qx * qx + qy * qy + qz * qz;
 
-    // H = P^T Q  (sum of outer products)
     localH[0] += px * qx;
     localH[1] += px * qy;
     localH[2] += px * qz;
@@ -201,35 +226,27 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord
     localH[8] += pz * qz;
   }
 
-  // Reduce all 11 values into thread 0.  Results in non-zero threads are
-  // undefined and unused; only thread 0 performs the RMSD computation below.
-  // __syncthreads() between calls allows TempStorage reuse.
-  localSp = RmsdBlockReduceT(reduceTmp).Sum(localSp);
-  __syncthreads();
-  localSq = RmsdBlockReduceT(reduceTmp).Sum(localSq);
-  __syncthreads();
-  localH[0] = RmsdBlockReduceT(reduceTmp).Sum(localH[0]);
-  __syncthreads();
-  localH[1] = RmsdBlockReduceT(reduceTmp).Sum(localH[1]);
-  __syncthreads();
-  localH[2] = RmsdBlockReduceT(reduceTmp).Sum(localH[2]);
-  __syncthreads();
-  localH[3] = RmsdBlockReduceT(reduceTmp).Sum(localH[3]);
-  __syncthreads();
-  localH[4] = RmsdBlockReduceT(reduceTmp).Sum(localH[4]);
-  __syncthreads();
-  localH[5] = RmsdBlockReduceT(reduceTmp).Sum(localH[5]);
-  __syncthreads();
-  localH[6] = RmsdBlockReduceT(reduceTmp).Sum(localH[6]);
-  __syncthreads();
-  localH[7] = RmsdBlockReduceT(reduceTmp).Sum(localH[7]);
-  __syncthreads();
-  localH[8] = RmsdBlockReduceT(reduceTmp).Sum(localH[8]);
-  // No final sync: only thread 0 reads localSp, localSq, localH below.
-
-  // ---- Thread 0: compute RMSD from Sp, Sq, singular values of H ----
+  localSp = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(localSp);
+  localSq = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(localSq);
+  for (int i = 0; i < 9; ++i)
+    localH[i] = RmsdWarpReduce(warpReduceTemp[warpId]).Sum(localH[i]);
+
+  if (laneId == 0) {
+    warpBuf[warpId][0] = localSp;
+    warpBuf[warpId][1] = localSq;
+    for (int i = 0; i < 9; ++i)
+      warpBuf[warpId][2 + i] = localH[i];
+  }
+  __syncthreads();  // sync 3: warpBuf[*][0..10] visible; thread 0 computes RMSD.
+
   if (tid == 0) {
-    const double* H = localH;
+    double Sp = 0.0, Sq = 0.0, H[9] = {0.0};
+    for (int w = 0; w < kRmsdWarps; ++w) {
+      Sp += warpBuf[w][0];
+      Sq += warpBuf[w][1];
+      for (int i = 0; i < 9; ++i)
+        H[i] += warpBuf[w][2 + i];
+    }
 
     // G = H^T H  (3x3 symmetric positive semi-definite)
     const double g00 = H[0] * H[0] + H[3] * H[3] + H[6] * H[6];
@@ -253,7 +270,7 @@ __device__ __forceinline__ void computePairRmsd(const double* __restrict__ coord
       s2 = -s2;
 
     // RMSD^2 = (Sp + Sq - 2*(s0 + s1 + s2)) / N
-    const double rmsdSq = fmax((localSp + localSq - 2.0 * (s0 + s1 + s2)) * invN, 0.0);
+    const double rmsdSq = fmax((Sp + Sq - 2.0 * (s0 + s1 + s2)) * invN, 0.0);
     *outRmsd            = sqrt(rmsdSq);
   }
 }
diff --git a/src/minimizer/bfgs_minimize.cu b/src/minimizer/bfgs_minimize.cu
index 020cc09..e240eec 100644
--- a/src/minimizer/bfgs_minimize.cu
+++ b/src/minimizer/bfgs_minimize.cu
@@ -908,6 +908,10 @@ __global__ void updateDGradKernel(const double  gradTol,
   double blockMax = cub::BlockReduce<double, 128>(tempStorage).Reduce(localMax, cubMax());
 
   if (idxWithinSystem == 0) {
+    // Matches RDKit's signed-energy convergence denominator in ForceField::minimize.
+    // Negative-energy geometries can clamp this to 1, artificially tightening the
+    // check — but fixing it unconditionally diverges from reference behavior.
+    // TODO: file upstream RDKit bug; gate fix on RDKit version once merged there.
     const double term = max(energies[sysIdx] * gradScales[sysIdx], 1.0);
     blockMax /= term;
     if (blockMax < gradTol) {
diff --git a/src/similarity_kernels.cu b/src/similarity_kernels.cu
index 4bffc72..7de1d65 100644
--- a/src/similarity_kernels.cu
+++ b/src/similarity_kernels.cu
@@ -65,6 +65,30 @@ bool supportsTensorOps(const int major, const int minor) {
   return false;
 }
 
+// Per-device cache for tensor-op capability: 0=unknown, 1=unsupported, 2=supported.
+// Indexed by device ordinal; supports up to kMaxDevices GPUs in a single process.
+// Initialized to zero by C++ static-storage rules (all unknown at startup).
+constexpr int kMaxDevices = 16;
+
+std::atomic<int8_t> g_tensorOpsCache[kMaxDevices]{};
+
+//! Returns whether the current CUDA device supports BMMA tensor ops,
+//! querying cudaGetDeviceProperties at most once per device per process.
+bool isTensorOpsSupportedCached() {
+  int device;
+  cudaCheckError(cudaGetDevice(&device));
+  if (device >= 0 && device < kMaxDevices && g_tensorOpsCache[device] != 0) {
+    return g_tensorOpsCache[device] == 2;
+  }
+  cudaDeviceProp props;
+  cudaCheckError(cudaGetDeviceProperties(&props, device));
+  const bool result = supportsTensorOps(props.major, props.minor);
+  if (device >= 0 && device < kMaxDevices) {
+    g_tensorOpsCache[device] = result ? 2 : 1;
+  }
+  return result;
+}
+
 //! Cast a device vector to a span of a type smaller or equal to self.
 //! Does some basic checking of compatibility between types.
 template <typename T, typename OtherT>
@@ -423,17 +447,10 @@ void launchCrossTanimotoSimilarity(const AsyncDeviceVector<internal::kBlockType>
                                    const size_t                                   numBitsPerMolecule,
                                    AsyncDeviceVector<double>&                     results,
                                    const size_t                                   offset) {
-  int device;
-  cudaGetDevice(&device);
-  cudaCheckError(cudaGetLastError());
-
-  cudaDeviceProp deviceProp;
-  cudaCheckError(cudaGetDeviceProperties(&deviceProp, device));
-
   size_t m = bitsOne.size();
   size_t n = bitsTwo.size();
 
-  if (supportsTensorOps(deviceProp.major, deviceProp.minor)) {
+  if (isTensorOpsSupportedCached()) {
     constexpr unsigned int BLOCK_TILE_SIZE_X{64U};
     constexpr unsigned int BLOCK_TILE_SIZE_Y{64U};
 
@@ -508,17 +525,10 @@ void launchCrossTanimotoSimilarity(const cuda::std::span<const std::uint32_t> bi
                                    const cuda::std::span<double>              results,
                                    const size_t                               offset,
                                    cudaStream_t                               stream) {
-  int device;
-  cudaGetDevice(&device);
-  cudaCheckError(cudaGetLastError());
-
-  cudaDeviceProp deviceProp;
-  cudaCheckError(cudaGetDeviceProperties(&deviceProp, device));
-
   size_t m = bitsOne.size() / numBitsPerMolecule;
   size_t n = bitsTwo.size() / numBitsPerMolecule;
 
-  if (supportsTensorOps(deviceProp.major, deviceProp.minor)) {
+  if (isTensorOpsSupportedCached()) {
     constexpr unsigned int BLOCK_TILE_SIZE_X{64U};
     constexpr unsigned int BLOCK_TILE_SIZE_Y{64U};
 
@@ -645,17 +655,10 @@ void launchCrossCosineSimilarity(const AsyncDeviceVector<internal::kBlockType>&
                                  const size_t                                   numBitsPerMolecule,
                                  AsyncDeviceVector<double>&                     results,
                                  const size_t                                   offset) {
-  int device;
-  cudaGetDevice(&device);
-  cudaCheckError(cudaGetLastError());
-
-  cudaDeviceProp deviceProp;
-  cudaCheckError(cudaGetDeviceProperties(&deviceProp, device));
-
   size_t m = bitsOne.size();
   size_t n = bitsTwo.size();
 
-  if (supportsTensorOps(deviceProp.major, deviceProp.minor)) {
+  if (isTensorOpsSupportedCached()) {
     constexpr unsigned int BLOCK_TILE_SIZE_X{64U};
     constexpr unsigned int BLOCK_TILE_SIZE_Y{64U};
 
@@ -730,17 +733,10 @@ void launchCrossCosineSimilarity(const cuda::std::span<const std::uint32_t> bits
                                  const cuda::std::span<double>              results,
                                  const size_t                               offset,
                                  cudaStream_t                               stream) {
-  int device;
-  cudaGetDevice(&device);
-  cudaCheckError(cudaGetLastError());
-
-  cudaDeviceProp deviceProp;
-  cudaCheckError(cudaGetDeviceProperties(&deviceProp, device));
-
   size_t m = bitsOne.size() / numBitsPerMolecule;
   size_t n = bitsTwo.size() / numBitsPerMolecule;
 
-  if (supportsTensorOps(deviceProp.major, deviceProp.minor)) {
+  if (isTensorOpsSupportedCached()) {
     constexpr unsigned int BLOCK_TILE_SIZE_X{64U};
     constexpr unsigned int BLOCK_TILE_SIZE_Y{64U};
 
diff --git a/src/symmetric_eigensolver.cu b/src/symmetric_eigensolver.cu
index 2a69d9e..8aefd3c 100644
--- a/src/symmetric_eigensolver.cu
+++ b/src/symmetric_eigensolver.cu
@@ -198,6 +198,7 @@ void launchBatchEigensolverKernel(const int      numEigs,
                                   double*        eigenvaluesOut,
                                   double*        eigenvectorsOut,
                                   uint8_t*       converged,
+                                  curandState*   states,
                                   const uint8_t* active,
                                   const int      seed = 42) {
   const int numBlocks          = numSystems;
@@ -205,15 +206,13 @@ void launchBatchEigensolverKernel(const int      numEigs,
   if (matrixDim > 256) {
     throw std::runtime_error("Matrix dimension is too large for the kernel");
   }
-  // TODO: reuse.
-  AsyncDeviceVector<curandState> states(numSystems * matrixDim);
   batchEigensolverKernel<<<numBlocks, numThreadsPerBlock>>>(numEigs,
                                                             matrixDim,
                                                             mutableBoundsMatrices,
                                                             eigenvaluesOut,
                                                             eigenvectorsOut,
                                                             converged,
-                                                            states.data(),
+                                                            states,
                                                             active,
                                                             seed);
   cudaCheckError(cudaGetLastError());
@@ -234,7 +233,11 @@ class BatchedEigenSolver::Impl {
     converged_.resize(batch_size);
     cudaCheckError(cudaMemsetAsync(converged_.data(), 0, batch_size * sizeof(uint8_t)));
 
-    // Solve batched eigenvalue problem
+    const size_t statesNeeded = static_cast<size_t>(batch_size) * static_cast<size_t>(matrixDim);
+    if (statesNeeded > states_.size()) {
+      states_.resize(statesNeeded);
+    }
+
     launchBatchEigensolverKernel(numEigs,
                                  batch_size,
                                  matrixDim,
@@ -242,6 +245,7 @@ class BatchedEigenSolver::Impl {
                                  eigenvalues,
                                  eigenvectors,
                                  converged_.data(),
+                                 states_.data(),
                                  active,
                                  randomSeed);
   }
@@ -249,7 +253,8 @@ class BatchedEigenSolver::Impl {
   const uint8_t* converged() const { return converged_.data(); }
 
  private:
-  AsyncDeviceVector<uint8_t> converged_;
+  AsyncDeviceVector<uint8_t>     converged_;
+  AsyncDeviceVector<curandState> states_;
 };
 
 BatchedEigenSolver::BatchedEigenSolver() : pimpl_(std::make_unique<Impl>()) {}
diff --git a/src/tfd/tfd_kernels.cu b/src/tfd/tfd_kernels.cu
index 28c3314..ef9337e 100644
--- a/src/tfd/tfd_kernels.cu
+++ b/src/tfd/tfd_kernels.cu
@@ -109,9 +109,10 @@ __global__ void tfdMatrixKernel(const int numMolecules,
     // Decode lower-triangular (i, j) from flat pairIdx
     // i*(i-1)/2 + j = pairIdx, i > j >= 0
     // i = floor((1 + sqrt(1 + 8*pairIdx)) / 2)
-    int i = static_cast<int>((1.0f + sqrtf(1.0f + 8.0f * pairIdx)) * 0.5f);
-    // Symmetric guards: float sqrtf can round either way when 8*pairIdx exceeds
-    // float32's 24-bit mantissa (pairIdx >~ 2M, i.e. >~2000 conformers).
+    // Use double precision: float32 loses bits when 8*pairIdx > 2^23 (~1M pairs,
+    // ~1415 conformers), the same threshold documented in conformer_rmsd.cu.
+    int i = static_cast<int>((1.0 + sqrt(1.0 + 8.0 * static_cast<double>(pairIdx))) * 0.5);
+    // Clamp guards for any residual rounding at boundary values.
     if (i * (i - 1) / 2 > pairIdx)
       i--;
     else if ((i + 1) * i / 2 <= pairIdx)
@@ -135,15 +136,19 @@ __global__ void tfdMatrixKernel(const int numMolecules,
         deviation = circularDifference(dihedralAngles[aI + qLocalStart], dihedralAngles[aJ + qLocalStart]) /
                     torsionMaxDevs[globalT];
       } else if (type == 1) {  // Ring
-        float avgI = 0.0f;
-        float avgJ = 0.0f;
-        for (int qq = 0; qq < numQ; ++qq) {
-          avgI += fabsf(dihedralAngles[aI + qLocalStart + qq] - 180.0f);
-          avgJ += fabsf(dihedralAngles[aJ + qLocalStart + qq] - 180.0f);
+        if (numQ == 0) {
+          deviation = 0.0f;
+        } else {
+          float avgI = 0.0f;
+          float avgJ = 0.0f;
+          for (int qq = 0; qq < numQ; ++qq) {
+            avgI += fabsf(dihedralAngles[aI + qLocalStart + qq] - 180.0f);
+            avgJ += fabsf(dihedralAngles[aJ + qLocalStart + qq] - 180.0f);
+          }
+          avgI /= numQ;
+          avgJ /= numQ;
+          deviation = fabsf(avgI - avgJ) / torsionMaxDevs[globalT];
         }
-        avgI /= numQ;
-        avgJ /= numQ;
-        deviation = fabsf(avgI - avgJ) / torsionMaxDevs[globalT];
       } else {  // Symmetric
         float minDiff = 180.0f;
         for (int qi = 0; qi < numQ; ++qi) {
diff --git a/src/triangle_smooth.cu b/src/triangle_smooth.cu
index 226c0e3..700fd23 100644
--- a/src/triangle_smooth.cu
+++ b/src/triangle_smooth.cu
@@ -330,7 +330,7 @@ void DeviceBoundsMatrixBatch::copyToHost(std::vector<::DistGeom::BoundsMatPtr>&
   // Copy device data to host vector first
   std::vector<double> hostData(totalElements_);
   data_.copyToHost(hostData);
-  cudaDeviceSynchronize();
+  cudaCheckError(cudaStreamSynchronize(data_.stream()));
 
   // Distribute data to individual matrices
   size_t globalOffset = 0;