made type definitions in cython match numpy types explicitly

HeikoSchuett · HeikoSchuett · commit 45998f6d7677 · 2024-07-03T00:38:34.000+02:00
diff --git a/setup.py b/setup.py
@@ -6,12 +6,14 @@
 from setuptools import setup, Extension
 import setuptools_scm  # noqa # pylint: disable=unused-import
 from Cython.Build import build_ext
+import numpy
 
 
 setup(
-    ext_modules = [
+    ext_modules=[
         Extension(
             "rsatoolbox.cengine.similarity",
-            ["src/rsatoolbox/cengine/similarity.pyx"])],
+            ["src/rsatoolbox/cengine/similarity.pyx"],
+            include_dirs=[numpy.get_include()])],
     cmdclass={'build_ext': build_ext}
 )
diff --git a/src/rsatoolbox/cengine/similarity.pyx b/src/rsatoolbox/cengine/similarity.pyx
@@ -6,15 +6,20 @@ from cython.view cimport array as cvarray
 from libc.math cimport log, sqrt, isnan, NAN
 from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
 cimport scipy.linalg.cython_blas as blas
+cimport numpy as cnp
 
+cnp.import_array()
+
+ctypedef cnp.int64_t int_t
+ctypedef cnp.float64_t float_t
 
 @cython.boundscheck(False)
 @cython.cdivision(True)
-cpdef double [:] calc(
-    double [:, :] data, long [:] desc,
-    long [:] cv_desc, int n,
-    int method_idx, double [:, :] noise=None,
-    double prior_lambda=1, double prior_weight=0.1,
+cpdef float_t [:] calc(
+    float_t [:, :] data, int_t [:] desc,
+    int_t [:] cv_desc, int n,
+    int method_idx, float_t [:, :] noise=None,
+    float_t prior_lambda=1, float_t prior_weight=0.1,
     int weighting=1, int crossval=0):
     # calculates an RDM from a double array of data with integer descriptors
     # There are no checks or saveguards in this function!
@@ -37,17 +42,17 @@ cpdef double [:] calc(
     #     0: each row has equal weight
     #     1: rows weighted by number of valid measurements
     cdef:
-        double [:] vec_i
-        double [:] vec_j
-        double weight, sim
-        double [:] weights
-        double [:] values
+        float_t [:] vec_i
+        float_t [:] vec_j
+        float_t weight, sim
+        float_t [:] weights
+        float_t [:] values
         int i, j, idx
         int n_rdm = (n * (n-1)) / 2
         int n_dim = data.shape[1]
-        double prior_lambda_l = prior_lambda * prior_weight
-        double prior_weight_l = 1 + prior_weight
-        double [:, :] log_data
+        float_t prior_lambda_l = prior_lambda * prior_weight
+        float_t prior_weight_l = 1 + prior_weight
+        float_t [:, :] log_data
     if (method_idx > 4) or (method_idx < 1):
         raise ValueError('dissimilarity method not recognized!')
     # precompute stuff for poisson KL
@@ -58,8 +63,8 @@ cpdef double [:] calc(
             for j in range(n_dim):
                 data[i, j] = (data[i, j] + prior_lambda_l) / prior_weight_l
                 log_data[i, j] = log(data[i, j])
-    weights = <double [:(n_rdm+n)]> PyMem_Malloc((n_rdm+n) * sizeof(double))
-    values = <double [:(n_rdm+n)]> PyMem_Malloc((n_rdm+n) * sizeof(double))
+    weights = <float_t [:(n_rdm+n)]> PyMem_Malloc((n_rdm+n) * sizeof(float_t))
+    values = <float_t [:(n_rdm+n)]> PyMem_Malloc((n_rdm+n) * sizeof(float_t))
     for idx in range(n_rdm + n):
         weights[idx] = 0
         values[idx] = 0
@@ -74,7 +79,7 @@ cpdef double [:] calc(
                     sim, weight = euclid(data[i], data[i], n_dim)
                 else:
                     sim = mahalanobis(data[i], data[i], n_dim, noise)
-                    weight = <double> n_dim
+                    weight = <float_t> n_dim
             elif method_idx == 4: # method in ['poisson', 'poisson_cv']:
                 sim, weight = poisson_cv(data[i], data[i], log_data[i], log_data[i], n_dim)
             idx = desc[i]
@@ -97,7 +102,7 @@ cpdef double [:] calc(
                         sim, weight = euclid(data[i], data[j], n_dim)
                     else:
                         sim = mahalanobis(data[i], data[j], n_dim, noise)
-                        weight = <double> n_dim
+                        weight = <float_t> n_dim
                 elif method_idx == 4: # method in ['poisson', 'poisson_cv']:
                     sim, weight = poisson_cv(data[i], data[j], log_data[i], log_data[j], n_dim)
                 if weight > 0:
@@ -124,25 +129,25 @@ cpdef double [:] calc(
 
 @cython.boundscheck(False)
 @cython.cdivision(True)
-cpdef (double, double) calc_one(
-    double [:, :] data_i, double [:, :] data_j,
-    long [:] cv_desc_i, long [:] cv_desc_j,
+cpdef (float_t, float_t) calc_one(
+    float_t [:, :] data_i, float_t [:, :] data_j,
+    int_t [:] cv_desc_i, int_t [:] cv_desc_j,
     int n_i, int n_j,
-    int method_idx, double [:, :] noise=None,
-    double prior_lambda=1, double prior_weight=0.1,
+    int method_idx, float_t [:, :] noise=None,
+    float_t prior_lambda=1, float_t prior_weight=0.1,
     int weighting=1):
     cdef:
         #double [:] values = np.zeros(n_i * n_j)
         #double [:] weights = np.zeros(n_i * n_j)
-        double [:] vec_i
-        double [:] vec_j
-        double weight, sim, weight_sum, value
+        float_t [:] vec_i
+        float_t [:] vec_j
+        float_t weight, sim, weight_sum, value
         int i, j
         int n_dim = data_i.shape[1]
-        double prior_lambda_l = prior_lambda * prior_weight
-        double prior_weight_l = 1 + prior_weight
-        double [:, :] log_data_i
-        double [:, :] log_data_j
+        float_t prior_lambda_l = prior_lambda * prior_weight
+        float_t prior_weight_l = 1 + prior_weight
+        float_t [:, :] log_data_i
+        float_t [:, :] log_data_j
     if (method_idx > 4) or (method_idx < 1):
         raise ValueError('dissimilarity method not recognized!')
     # precompute stuff for poisson KL
@@ -173,7 +178,7 @@ cpdef (double, double) calc_one(
                         sim, weight = euclid(data_i[i], data_j[j], n_dim)
                     else:
                         sim = mahalanobis(data_i[i], data_j[j], n_dim, noise)
-                        weight = <double> n_dim
+                        weight = <float_t> n_dim
                 elif method_idx == 4: # method in ['poisson', 'poisson_cv']:
                     sim, weight = poisson_cv(data_i[i], data_j[j], log_data_i[i], log_data_j[j], n_dim)
                 if weight > 0:
@@ -191,8 +196,8 @@ cpdef (double, double) calc_one(
 
 
 @cython.boundscheck(False)
-cpdef (double, double) similarity(double [:] vec_i, double [:] vec_j, int method_idx,
-                       int n_dim, double [:, :] noise):
+cpdef (float_t, float_t) similarity(float_t [:] vec_i, float_t [:] vec_j, int method_idx,
+                       int n_dim, float_t [:, :] noise):
     """
     double similarity(double [:] vec_i, double [:] vec_j, int method_idx,
                       int n_dim, double [:, :] noise=None)
@@ -203,8 +208,8 @@ cpdef (double, double) similarity(double [:] vec_i, double [:] vec_j, int method
 
     Mahalanobis distances require full measurement vectors at the moment!
     """
-    cdef double sim
-    cdef double weight
+    cdef float_t sim
+    cdef float_t weight
     if method_idx == 1: # method == 'euclidean':
         sim, weight = euclid(vec_i, vec_j, n_dim)
     elif method_idx == 2: # method == 'correlation':
@@ -214,15 +219,15 @@ cpdef (double, double) similarity(double [:] vec_i, double [:] vec_j, int method
             sim, weight = euclid(vec_i, vec_j, n_dim)
         else:
             sim = mahalanobis(vec_i, vec_j, n_dim, noise)
-            weight = <double> n_dim
+            weight = <float_t> n_dim
     return sim, weight
 
 
 @cython.boundscheck(False)
-cdef (double, double) euclid(double [:] vec_i, double [:] vec_j, int n_dim):
+cdef (float_t, float_t) euclid(float_t [:] vec_i, float_t [:] vec_j, int n_dim):
     cdef:
-        double sim = 0
-        double weight = 0
+        float_t sim = 0
+        float_t weight = 0
         int i
     for i in range(n_dim):
         if not isnan(vec_i[i]) and not isnan(vec_j[i]):
@@ -233,12 +238,12 @@ cdef (double, double) euclid(double [:] vec_i, double [:] vec_j, int n_dim):
 
 @cython.boundscheck(False)
 @cython.cdivision(True)
-cdef (double, double) poisson_cv(double [:] vec_i, double [:] vec_j,
-                                 double [:] log_vec_i, double [:] log_vec_j,
+cdef (float_t, float_t) poisson_cv(float_t [:] vec_i, float_t [:] vec_j,
+                                 float_t [:] log_vec_i, float_t [:] log_vec_j,
                                  int n_dim):
     cdef:
-        double sim = 0
-        double weight = 0
+        float_t sim = 0
+        float_t weight = 0
         int i
     for i in range(n_dim):
         if not isnan(vec_i[i]) and not isnan(vec_j[i]):
@@ -249,20 +254,20 @@ cdef (double, double) poisson_cv(double [:] vec_i, double [:] vec_j,
 
 
 @cython.boundscheck(False)
-cdef double mahalanobis(double [:] vec_i, double [:] vec_j, int n_dim,
-                        double [:, :] noise):
+cdef float_t mahalanobis(float_t [:] vec_i, float_t [:] vec_j, int n_dim,
+                        float_t [:, :] noise):
     cdef:
-        double *vec1
-        double *vec2
+        float_t *vec1
+        float_t *vec2
         int *finite
         int zero = 0
         int one = 1
-        double onef = 1.0
-        double zerof = 0.0
+        float_t onef = 1.0
+        float_t zerof = 0.0
         char trans = b'n'
-        double sim = 0.0
+        float_t sim = 0.0
         int i, j, k, l, n_finite
-        double [:, :] noise_small
+        float_t [:, :] noise_small
     finite = <int*> PyMem_Malloc(n_dim * sizeof(int))
     # use finite as a bool to choose the non-nan values
     n_finite = 0
@@ -272,11 +277,10 @@ cdef double mahalanobis(double [:] vec_i, double [:] vec_j, int n_dim,
             n_finite += 1
         else:
             finite[i] = 0
-    vec1 = <double*> PyMem_Malloc(n_finite * sizeof(double))
-    vec2 = <double*> PyMem_Malloc(n_finite * sizeof(double))
-    vec3 = <double*> PyMem_Malloc(n_finite * sizeof(double))
-    #noise_small = <double [:n_finite, :n_finite]> PyMem_Malloc(n_finite * n_finite * sizeof(double))
-    noise_small = cvarray(shape=(n_finite, n_finite), itemsize=sizeof(double), format="d")
+    vec1 = <float_t*> PyMem_Malloc(n_finite * sizeof(float_t))
+    vec2 = <float_t*> PyMem_Malloc(n_finite * sizeof(float_t))
+    vec3 = <float_t*> PyMem_Malloc(n_finite * sizeof(float_t))
+    noise_small = cvarray(shape=(n_finite, n_finite), itemsize=sizeof(float_t), format="d")
     k = 0
     for i in range(n_dim):
         if finite[i]:
@@ -300,15 +304,15 @@ cdef double mahalanobis(double [:] vec_i, double [:] vec_j, int n_dim,
 
 @cython.boundscheck(False)
 @cython.cdivision(True)
-cdef (double, double) correlation(double [:] vec_i, double [:] vec_j, int n_dim):
+cdef (float_t, float_t) correlation(float_t [:] vec_i, float_t [:] vec_j, int n_dim):
     cdef:
-        double si = 0.0
-        double sj = 0.0
-        double si2 = 0.0
-        double sj2 = 0.0
-        double sij = 0.0
-        double sim
-        double weight = 0
+        float_t si = 0.0
+        float_t sj = 0.0
+        float_t si2 = 0.0
+        float_t sj2 = 0.0
+        float_t sij = 0.0
+        float_t sim
+        float_t weight = 0
         int i
     for i in range(n_dim):
         if not isnan(vec_i[i]) and not isnan(vec_j[i]):
diff --git a/src/rsatoolbox/rdm/calc_unbalanced.py b/src/rsatoolbox/rdm/calc_unbalanced.py
@@ -91,14 +91,14 @@ def calc_rdm_unbalanced(dataset: SingleOrMultiDataset, method='euclidean',
             dataset.obs_descriptors[descriptor])
         # unique_cond = set(dataset.obs_descriptors[descriptor])
         if cv_descriptor is None:
-            cv_desc_int = np.arange(dataset.n_obs, dtype=int)
+            cv_desc_int = np.arange(dataset.n_obs, dtype=np.int64)
             crossval = 0
         else:
             _, indices = np.unique(
                 dataset.obs_descriptors[cv_descriptor],
                 return_inverse=True
             )
-            cv_desc_int = indices.astype(int)
+            cv_desc_int = indices.astype(np.int64)
             crossval = 1
         if method == 'euclidean':
             method_idx = 1
@@ -114,7 +114,7 @@ def calc_rdm_unbalanced(dataset: SingleOrMultiDataset, method='euclidean',
             weight_idx = 0
         else:
             weight_idx = 1
-        cond_indices_int = cond_indices.astype(int)
+        cond_indices_int = cond_indices.astype(np.int64)
         rdm = calc(
             ensure_double(dataset.measurements),
             cond_indices_int,