atrtibutes for git lfs + cleanup of requirements code

xinglab-ai · Jul 7, 2023 · bb3abed · bb3abed
1 parent a881355
commit bb3abed
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 119 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+data/TM_data.csv filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+*.egg-info
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -4,4 +4,5 @@ include README.md
 exclude genomap/genoNet.py
 exclude genomap/genoNetus.py
 recursive-exclude * __pycache__
-recursive-include data/readme.txt
+recursive-include data/readme.txt
+include data/TM_data.csv
diff --git a/genoMapDemo.ipynb b/genoMapDemo.ipynb
@@ -42,7 +42,7 @@
     "import scipy.io as sio\n",
     "import numpy as np\n",
     "import scipy.io as sio\n",
-    "import genomap.genoNet as gNet\n",
+    "import genomap.genoNet.genoNet as gNet\n",
     "import os\n",
     "import torch\n",
     "from torch.utils.data import DataLoader, Dataset\n",
@@ -56,7 +56,7 @@
     "from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer\n",
     "from pyclustering.cluster import cluster_visualizer\n",
     "\n",
-    "from genomap.genoNet import geneDataset, get_device, load_genoNet, predict, traingenoNet, rescale\n",
+    "from genomap.genoNet.genoNet import geneDataset, get_device, load_genoNet, predict, traingenoNet, rescale\n",
     "from genomap import construct_genomap\n",
     "\n",
     "# For reproducibility\n",
@@ -99,34 +99,32 @@
     "# Creation of genomaps\n",
     "# Selection of row and column number of the genomaps \n",
     "# To create square genomaps, the row and column numbers are set to be the same.\n",
-    "colNum=31\n",
-    "rowNum=31\n",
-    "n=rowNum*colNum # Total number of pixels in genomaps\n",
-    "\n",
-    "\n",
+    "colNum = 31\n",
+    "rowNum = 31\n",
+    "n = rowNum*colNum # Total number of pixels in genomaps\n",
     "\n",
     "# When the dataset has more genes than number of pixels in the desired genomap,\n",
     "# select the first n most variable genes\n",
-    "if n<data.shape[1]:\n",
+    "if n < data.shape[1]:\n",
     "    # create an instance of the VarianceThreshold class\n",
     "    selector = VarianceThreshold()\n",
     "    # fit the selector to the data and get the indices of the top n most variable features\n",
     "    var_threshold = selector.fit(data)\n",
     "    top_n_indices = var_threshold.get_support(indices=True)\n",
-    "    top_n_features=data.columns[top_n_indices[0:n]]\n",
-    "    data=data[top_n_features]\n",
+    "    top_n_features = data.columns[top_n_indices[0:n]]\n",
+    "    data = data[top_n_features]\n",
     "# Normalization of the data\n",
-    "dataNorm=scipy.stats.zscore(data,axis=0,ddof=1)\n",
+    "dataNorm = scipy.stats.zscore(data,axis=0,ddof=1)\n",
     "# Construction of genomaps\n",
-    "genoMaps=construct_genomap(dataNorm,rowNum,colNum,epsilon=0.0,num_iter=200)\n",
+    "genoMaps = construct_genomap(dataNorm,rowNum,colNum,epsilon=0.0,num_iter=200)\n",
     "\n",
     "# Visualization of the constructed genomaps:\n",
     "# The \"genoMaps\" array: All the constructed genomaps are saved in the array. This \n",
     "# array has four indices. The first one indexes the series of genomaps. \n",
     "# The second and third ones denote the row and column number in a genomap.\n",
     "# The fourth index is introduced to facillitate the caclulation using Pytorch or Tensorflow  \n",
     "# to visualize  the i-th genomap, set the first index variable to i (i=10 here) \n",
-    "findI=genoMaps[10,:,:,:]\n",
+    "findI = genoMaps[10,:,:,:]\n",
     "plt.figure(1)\n",
     "plt.imshow(findI, origin = 'lower',  extent = [0, 10, 0, 10], aspect = 1)\n",
     "plt.title('Genomap of a cell from TM dataset')"
@@ -225,9 +223,9 @@
     "gt_data = sio.loadmat('data/index_GRADCAM.mat')\n",
     "\n",
     "# Compute class activation maps of B-cells using GRAD-CAM\n",
-    "indxcell=gt_data['idxBcell']\n",
+    "indxcell = gt_data['idxBcell']\n",
     "# Create a numpy array to store the grad CAM results\n",
-    "grayscale_cam=np.zeros([indxcell.shape[0],colNum*rowNum])\n",
+    "grayscale_cam = np.zeros([indxcell.shape[0],colNum*rowNum])\n",
     "\n",
     "for i in range(indxcell.shape[0]):\n",
     "    # Looping over all B-cells to compute the class activation maps\n",
@@ -236,7 +234,7 @@
     "    pred_label = torch.argmax(net(grad_cam_img),axis=-1)\n",
     "    targets = [ClassifierOutputTarget(pred_label)]\n",
     "    A = cam(input_tensor=grad_cam_img, targets=targets)\n",
-    "    B=np.reshape(A.T,(1,colNum*rowNum))\n",
+    "    B = np.reshape(A.T,(1,colNum*rowNum))\n",
     "    grayscale_cam[i,:]=B\n",
     "\n",
     "# Plot an example activation map of a B-cell, which highlights the genes that are important \n",
@@ -247,114 +245,113 @@
     "\n",
     "# Compute the mean of gene activation values across all the B-cells\n",
     "# to find the average activity of all the genes in B-cells\n",
-    "graySmeanB=np.mean(grayscale_cam,axis=0) \n",
-    "\n",
+    "graySmeanB = np.mean(grayscale_cam,axis=0) \n",
     "\n",
     "# Repeat the same procedure for T cell, natural killer cells,\n",
     "# Monocytes, Pneumoctyes and Hematopoitic cells\n",
     "\n",
-    "indxcell=gt_data['idxTcell']\n",
-    "grayscale_cam=np.zeros([indxcell.shape[0],colNum*rowNum])\n",
+    "indxcell = gt_data['idxTcell']\n",
+    "grayscale_cam = np.zeros([indxcell.shape[0],colNum*rowNum])\n",
     "for i in range(indxcell.shape[0]):\n",
     "\n",
     "    grad_cam_img = torch.Tensor(dataMat_CNNtrain[i,:,:,:].transpose(2,0,1)\n",
     "                            .reshape((1,1,colNum,rowNum))).to(device)\n",
     "    pred_label = torch.argmax(net(grad_cam_img),axis=-1)\n",
     "    targets = [ClassifierOutputTarget(pred_label)]\n",
     "    A = cam(input_tensor=grad_cam_img, targets=targets)\n",
-    "    B=np.reshape(A.T,(1,colNum*rowNum))\n",
+    "    B = np.reshape(A.T,(1,colNum*rowNum))\n",
     "    grayscale_cam[i,:]=B\n",
     "\n",
-    "graySmeanT=np.mean(grayscale_cam,axis=0) # Activity of genes in T-cells\n",
+    "graySmeanT = np.mean(grayscale_cam,axis=0) # Activity of genes in T-cells\n",
     "\n",
-    "indxcell=gt_data['idxNK']\n",
-    "grayscale_cam=np.zeros([indxcell.shape[0],colNum*rowNum])\n",
+    "indxcell = gt_data['idxNK']\n",
+    "grayscale_cam = np.zeros([indxcell.shape[0],colNum*rowNum])\n",
     "for i in range(indxcell.shape[0]):\n",
     "\n",
     "    grad_cam_img = torch.Tensor(dataMat_CNNtrain[i,:,:,:].transpose(2,0,1)\n",
     "                            .reshape((1,1,colNum,rowNum))).to(device)\n",
     "    pred_label = torch.argmax(net(grad_cam_img),axis=-1)\n",
     "    targets = [ClassifierOutputTarget(pred_label)]\n",
     "    A = cam(input_tensor=grad_cam_img, targets=targets)\n",
-    "    B=np.reshape(A.T,(1,colNum*rowNum))\n",
+    "    B = np.reshape(A.T,(1,colNum*rowNum))\n",
     "    grayscale_cam[i,:]=B\n",
     "\n",
-    "graySmeanNK=np.mean(grayscale_cam,axis=0) # Activity of genes in natural killer cells\n",
+    "graySmeanNK = np.mean(grayscale_cam,axis=0) # Activity of genes in natural killer cells\n",
     "\n",
-    "indxcell=gt_data['idxMono']\n",
-    "grayscale_cam=np.zeros([indxcell.shape[0],colNum*rowNum])\n",
+    "indxcell = gt_data['idxMono']\n",
+    "grayscale_cam = np.zeros([indxcell.shape[0],colNum*rowNum])\n",
     "for i in range(indxcell.shape[0]):\n",
     "\n",
     "    grad_cam_img = torch.Tensor(dataMat_CNNtrain[i,:,:,:].transpose(2,0,1)\n",
     "                            .reshape((1,1,colNum,rowNum))).to(device)\n",
     "    pred_label = torch.argmax(net(grad_cam_img),axis=-1)\n",
     "    targets = [ClassifierOutputTarget(pred_label)]\n",
     "    A = cam(input_tensor=grad_cam_img, targets=targets)\n",
-    "    B=np.reshape(A.T,(1,colNum*rowNum))\n",
+    "    B = np.reshape(A.T,(1,colNum*rowNum))\n",
     "    grayscale_cam[i,:]=B\n",
     "\n",
-    "graySmeanMono=np.mean(grayscale_cam,axis=0) # Activity of genes in Monocytes\n",
+    "graySmeanMono = np.mean(grayscale_cam,axis=0) # Activity of genes in Monocytes\n",
     "\n",
     "\n",
-    "indxcell=gt_data['idxPne']\n",
-    "grayscale_cam=np.zeros([indxcell.shape[0],colNum*rowNum])\n",
+    "indxcell = gt_data['idxPne']\n",
+    "grayscale_cam = np.zeros([indxcell.shape[0],colNum*rowNum])\n",
     "for i in range(indxcell.shape[0]):\n",
     "\n",
     "    grad_cam_img = torch.Tensor(dataMat_CNNtrain[i,:,:,:].transpose(2,0,1)\n",
     "                            .reshape((1,1,colNum,rowNum))).to(device)\n",
     "    pred_label = torch.argmax(net(grad_cam_img),axis=-1)\n",
     "    targets = [ClassifierOutputTarget(pred_label)]\n",
     "    A = cam(input_tensor=grad_cam_img, targets=targets)\n",
-    "    B=np.reshape(A.T,(1,colNum*rowNum))\n",
+    "    B = np.reshape(A.T,(1,colNum*rowNum))\n",
     "    grayscale_cam[i,:]=B\n",
-    "grayscale_cam=np.nan_to_num(grayscale_cam)\n",
-    "graySmeanPne=np.mean(grayscale_cam,axis=0) # Activity of genes in Pneumoctyes\n",
+    "grayscale_cam = np.nan_to_num(grayscale_cam)\n",
+    "graySmeanPne = np.mean(grayscale_cam,axis=0) # Activity of genes in Pneumoctyes\n",
     "\n",
     "\n",
-    "indxcell=gt_data['idxHema']\n",
-    "grayscale_cam=np.zeros([indxcell.shape[0],colNum*rowNum])\n",
+    "indxcell = gt_data['idxHema']\n",
+    "grayscale_cam = np.zeros([indxcell.shape[0],colNum*rowNum])\n",
     "for i in range(indxcell.shape[0]):\n",
     "\n",
     "    grad_cam_img = torch.Tensor(dataMat_CNNtrain[i,:,:,:].transpose(2,0,1)\n",
     "                            .reshape((1,1,colNum,rowNum))).to(device)\n",
     "    pred_label = torch.argmax(net(grad_cam_img),axis=-1)\n",
     "    targets = [ClassifierOutputTarget(pred_label)]\n",
     "    A = cam(input_tensor=grad_cam_img, targets=targets)\n",
-    "    B=np.reshape(A.T,(1,colNum*rowNum))\n",
+    "    B = np.reshape(A.T,(1,colNum*rowNum))\n",
     "    grayscale_cam[i,:]=B\n",
     "\n",
-    "graySmeanHema=np.mean(grayscale_cam,axis=0) # Activity of genes in Hematopoitic cells\n",
+    "graySmeanHema = np.mean(grayscale_cam,axis=0) # Activity of genes in Hematopoitic cells\n",
     "\n",
     "\n",
     "# Stack all the average gene activity values in different types of cells for plotting\n",
-    "activityMatrix=np.vstack((graySmeanB,graySmeanHema,graySmeanMono,graySmeanNK,graySmeanT,graySmeanPne))\n",
+    "activityMatrix = np.vstack((graySmeanB,graySmeanHema,graySmeanMono,graySmeanNK,graySmeanT,graySmeanPne))\n",
     "\n",
     "# Load the gene names for plotting\n",
     "geneNames = pd.read_csv('data/gene_interest_ROI_ImXreXB.csv', header=None,\n",
     "                   delim_whitespace=False)\n",
     "\n",
     "\n",
     "# The following are the 10 most variable genes in sci-ATAC-seq dataset\n",
-    "genes=np.array(('MSC','RPL31','SERPINB7','ARHGEF4','DCAF8','RNF149','ACMSD','ESPNL','FAM135A','RGS13'))\n",
+    "genes = np.array(('MSC','RPL31','SERPINB7','ARHGEF4','DCAF8','RNF149','ACMSD','ESPNL','FAM135A','RGS13'))\n",
     "\n",
     "# Plot the activity of the 10 most variable genes in sci-ATAC-seq dataset\n",
-    "idxS=np.zeros(genes.shape[0])\n",
+    "idxS = np.zeros(genes.shape[0])\n",
     "for ii in range(genes.shape[0]):\n",
     "    \n",
-    "    idx=[geneNames==genes[ii]]\n",
-    "    idxN=np.squeeze((np.array(idx)))\n",
-    "    idxS[ii]=np.squeeze(np.asarray(np.where(idxN==True)))\n",
+    "    idx = [geneNames==genes[ii]]\n",
+    "    idxN = np.squeeze((np.array(idx)))\n",
+    "    idxS[ii] = np.squeeze(np.asarray(np.where(idxN==True)))\n",
     "\n",
-    "idxSs=idxS.astype(int)\n",
-    "actData=activityMatrix[:,idxSs]\n",
+    "idxSs = idxS.astype(int)\n",
+    "actData = activityMatrix[:,idxSs]\n",
     "\n",
     "# Rescale the activity in each cell type from 0 to 1 for plotting\n",
-    "actDataRe=actData\n",
+    "actDataRe = actData\n",
     "for ik in range(6):\n",
     "    actDataRe[ik,:]=rescale(actData[ik,:])\n",
     "\n",
     "plt.figure(figsize=(12,4))\n",
-    "ax=plt.stem(genes,actDataRe[0,:]) # to see the gene activity in other cells, please change this \n",
+    "ax = plt.stem(genes,actDataRe[0,:]) # to see the gene activity in other cells, please change this \n",
     "# index to a value from 1 to 5. Default is 0 (B-cell). 1 is for T-cell, 2 for natural killer cells, \n",
     "# 3 for Monocytes, 4 for Pneumoctyes, and 5 for Hematopoitic cells.\n",
     "plt.xlabel(\"Genes\")\n",
@@ -383,25 +380,25 @@
     "# In Seurat, the parameter 'nfeatures' (denoting  the number of features) is set\n",
     "# to 2000. The resulting Seurat output is loaded here\n",
     "data = sio.loadmat('data/outSeurat_pancORG.mat')\n",
-    "outSeurat=data['outSeurat']\n",
+    "outSeurat = data['outSeurat']\n",
     "\n",
     "# We now create a genomap for each cell in the data\n",
     "# We select the nearest square number to 2000, which is 1936\n",
     "# Thus the size of the genomap would be 44 by 44.\n",
     "# Next, let us select data with 1936 most variable features \n",
-    "numRow=44\n",
-    "numCol=44\n",
-    "varX=np.var(outSeurat,axis=0)\n",
-    "idxV=np.argsort(varX)\n",
-    "idxVX=np.flip(idxV)\n",
-    "outSeurat1936=outSeurat[:,idxVX[0:numRow*numCol]]\n",
+    "numRow = 44\n",
+    "numCol = 44\n",
+    "varX = np.var(outSeurat,axis=0)\n",
+    "idxV = np.argsort(varX)\n",
+    "idxVX = np.flip(idxV)\n",
+    "outSeurat1936 = outSeurat[:,idxVX[0:numRow*numCol]]\n",
     "\n",
     "# Construction of the genomaps for the pancreatic data from the five different technologies\n",
-    "genoMaps=construct_genomap(outSeurat1936,numRow,numCol,epsilon=0.0,num_iter=200)\n",
+    "genoMaps = construct_genomap(outSeurat1936,numRow,numCol,epsilon=0.0,num_iter=200)\n",
     "\n",
     "\n",
     "# Visualize a genomap (we show here the first one (i=0))\n",
-    "findI=genoMaps[0,:,:,:]\n",
+    "findI = genoMaps[0,:,:,:]\n",
     "plt.figure(4)\n",
     "plt.imshow(findI, origin = 'lower',  extent = [0, 10, 0, 10], aspect = 1)\n",
     "plt.title('Genomap of a cell from pancreatic dataset')\n",
@@ -495,12 +492,11 @@
     "clusters = xmeans_instance.get_clusters()\n",
     "centers = xmeans_instance.get_centers()\n",
     "\n",
-    "clusIndex=np.zeros(dataVec.shape[0]);\n",
+    "clusIndex = np.zeros(dataVec.shape[0]);\n",
     "for i in range(0,len(clusters)):\n",
-    "    a=(clusters[i])\n",
+    "    a = (clusters[i])\n",
     "    clusIndex[a]=i;\n",
     "    \n",
-    "\n",
     "# Set up the training and testing data \n",
     "dataMat_CNNtrain = genoMaps # For unsupervised genoNet, all the data are used for training\n",
     "dataMat_CNNtest = genoMaps\n",
@@ -530,17 +526,17 @@
     "gx=np.reshape(XTrain,(XTrain.shape[0],1,XTrain.shape[2],XTrain.shape[3]))\n",
     "device = get_device()\n",
     "t = torch.from_numpy(gx).to(device)\n",
-    "net=net.double()\n",
-    "dataAtFC=net.forwardX(t)\n",
-    "data=dataAtFC.cpu().detach().numpy()\n",
+    "net = net.double()\n",
+    "dataAtFC = net.forwardX(t)\n",
+    "data = dataAtFC.cpu().detach().numpy()\n",
     "\n",
     "# Run PHATE on the genoNet features\n",
     "phate_op = phate.PHATE(random_state=1)\n",
     "X_embedded = phate_op.fit_transform(data)\n",
     "\n",
     "# Plot embeddings\n",
     "plt.figure(11)\n",
-    "scatter=plt.scatter(X_embedded[:,0], X_embedded[:,1],s=2,c=GrTruth,cmap=\"jet\")\n",
+    "scatter = plt.scatter(X_embedded[:,0], X_embedded[:,1],s=2,c=GrTruth,cmap=\"jet\")\n",
     "plt.xlabel(\"PHATE1\")\n",
     "plt.ylabel(\"PHATE2\")\n",
     "plt.title('PHATE embedding of genoNet features')\n",
@@ -637,15 +633,15 @@
     "gx=np.reshape(XTrain,(XTrain.shape[0],1,XTrain.shape[2],XTrain.shape[3]))\n",
     "device = get_device()\n",
     "t = torch.from_numpy(gx).to(device)\n",
-    "net=net.double()\n",
-    "dataAtFC=net.forwardX(t)\n",
+    "net = net.double()\n",
+    "dataAtFC = net.forwardX(t)\n",
     "\n",
     "# Run t-SNE on the genoNet features\n",
-    "X=dataAtFC.cpu().detach().numpy()\n",
+    "X = dataAtFC.cpu().detach().numpy()\n",
     "X_embedded = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(X)\n",
     "\n",
     "plt.figure(13)\n",
-    "scatter=plt.scatter(X_embedded[:,0], X_embedded[:,1],s=2,c=GrTruth,cmap=\"jet\")\n",
+    "scatter = plt.scatter(X_embedded[:,0], X_embedded[:,1],s=2,c=GrTruth,cmap=\"jet\")\n",
     "plt.xlabel(\"tSNE1\")\n",
     "plt.ylabel(\"tSNE2\")\n",
     "plt.title('t-SNE embedding of genoNet features')\n",
@@ -657,7 +653,7 @@
     "X_embedded = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(dataVec)\n",
     "\n",
     "plt.figure(14)\n",
-    "scatter=plt.scatter(X_embedded[:,0], X_embedded[:,1],s=2,c=GrTruth,cmap=\"jet\")\n",
+    "scatter = plt.scatter(X_embedded[:,0], X_embedded[:,1],s=2,c=GrTruth,cmap=\"jet\")\n",
     "plt.xlabel(\"tSNE1\")\n",
     "plt.ylabel(\"tSNE2\")\n",
     "plt.title('t-SNE embedding of raw data')\n",
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		data/TM_data.csv filter=lfs diff=lfs merge=lfs -text