Skip to content

Commit

Permalink
feat: ajout base
Browse files Browse the repository at this point in the history
  • Loading branch information
arxaqapi committed May 12, 2022
1 parent 9ae9cd6 commit d6d8bc0
Show file tree
Hide file tree
Showing 12 changed files with 557 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
data/*
!data/README.md


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
# tumour-classification
Travail d'Étude et de Recherche sur le sous-typage de cancer avec des méthodes de deep learning



## Références
Le travail effectué utilise comme article de référence l'article suivant: [Deep learning for tumor classification in imaging mass spectrometry](https://doi.org/10.1093/bioinformatics/btx724).

3 changes: 3 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Data folder

Ce dossier contient les données de base (non publiques, car il s'agit de données médicales) ainsi que les jeux de données générés.
150 changes: 150 additions & 0 deletions src/AutoEncoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from keras.models import Model, Sequential
from keras.layers import (
Dense,
Conv2D,
MaxPooling2D,
UpSampling2D)
from keras.losses import mean_squared_error
from tensorflow.keras.optimizers import Adadelta


class AutoEncoder(Model):
    """Dense (fully-connected) autoencoder for 1-D mass spectra.

    Encoder: ``input_dim`` -> ``latent_dims[0..3]`` with ReLU activations.
    Decoder: mirrors back ``latent_dims[2..0]`` -> ``input_dim`` with
    sigmoid activations, so reconstructions lie in (0, 1) — presumably the
    input spectra are normalised to that range (TODO confirm).

    Parameters
    ----------
    input_dim : int
        Size of one flattened input spectrum (default 32320 m/z channels).
    latent_dims : list[int] | None
        Widths of the four encoder layers, largest first.  ``None``
        selects the default ``[30000, 15000, 7500, 1000]``.
    """

    def __init__(self, input_dim: int = 32320,
                 latent_dims: "list[int] | None" = None):
        super().__init__(name="AutoEncoder")

        # A mutable default list would be shared across instances; use a
        # None sentinel and build a fresh list per instance instead.
        if latent_dims is None:
            latent_dims = [30000, 15000, 7500, 1000]

        self.input_dim = input_dim
        self.latent_dims = latent_dims

        self.encoder = Sequential([
            Dense(latent_dims[0], activation='relu'),
            Dense(latent_dims[1], activation='relu'),
            Dense(latent_dims[2], activation='relu'),
            Dense(latent_dims[3], activation='relu'),
        ])
        self.decoder = Sequential([
            Dense(latent_dims[2], activation='sigmoid'),
            Dense(latent_dims[1], activation='sigmoid'),
            Dense(latent_dims[0], activation='sigmoid'),
            Dense(self.input_dim, activation='sigmoid'),
        ])

    def call(self, inputs):
        """Run a full encode/decode pass and return the reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded


# NOTE(review): this class is a byte-for-byte duplicate of the
# ``AutoEncoder`` defined immediately above; at import time this later
# definition shadows the earlier one.  One of the two copies should be
# deleted — keeping both here only to stay minimally invasive.
class AutoEncoder(Model):
    """Dense (fully-connected) autoencoder for 1-D mass spectra.

    Encoder: ``input_dim`` -> ``latent_dims[0..3]`` with ReLU activations.
    Decoder: mirrors back ``latent_dims[2..0]`` -> ``input_dim`` with
    sigmoid activations, so reconstructions lie in (0, 1) — presumably the
    input spectra are normalised to that range (TODO confirm).

    Parameters
    ----------
    input_dim : int
        Size of one flattened input spectrum (default 32320 m/z channels).
    latent_dims : list[int] | None
        Widths of the four encoder layers, largest first.  ``None``
        selects the default ``[30000, 15000, 7500, 1000]``.
    """

    def __init__(self, input_dim: int = 32320,
                 latent_dims: "list[int] | None" = None):
        super().__init__(name="AutoEncoder")

        # A mutable default list would be shared across instances; use a
        # None sentinel and build a fresh list per instance instead.
        if latent_dims is None:
            latent_dims = [30000, 15000, 7500, 1000]

        self.input_dim = input_dim
        self.latent_dims = latent_dims

        self.encoder = Sequential([
            Dense(latent_dims[0], activation='relu'),
            Dense(latent_dims[1], activation='relu'),
            Dense(latent_dims[2], activation='relu'),
            Dense(latent_dims[3], activation='relu'),
        ])
        self.decoder = Sequential([
            Dense(latent_dims[2], activation='sigmoid'),
            Dense(latent_dims[1], activation='sigmoid'),
            Dense(latent_dims[0], activation='sigmoid'),
            Dense(self.input_dim, activation='sigmoid'),
        ])

    def call(self, inputs):
        """Run a full encode/decode pass and return the reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

class ConvolutionalAutoEncoder(Model):
    """Convolutional autoencoder for spectra stored as (channels, 1, 1) maps.

    Each 1-D spectrum is treated as a tall, one-pixel-wide image so that
    ``Conv2D``/``MaxPooling2D`` operate along the m/z axis only (all pool
    and upsampling factors are (2, 1)).

    Parameters
    ----------
    input_dim : tuple[int, int, int]
        Shape of one input sample, default ``(32320, 1, 1)``.
        (The original annotation incorrectly said ``int``.)
    latent_dims : list[int] | None
        Filter counts of the three encoder blocks; ``None`` selects the
        default ``[32, 16, 8]``.  (Originally a mutable default list that
        was annotated ``int``.)
    """

    def __init__(self, input_dim: "tuple[int, int, int]" = (32320, 1, 1),
                 latent_dims: "list[int] | None" = None):
        super().__init__()

        # None sentinel avoids sharing one default list across instances.
        if latent_dims is None:
            latent_dims = [32, 16, 8]

        self.input_dim = input_dim
        self.latent_dims = latent_dims
        # Kernels span 12 m/z channels and 1 pixel in the dummy width axis.
        ksize = (12, 1)

        self.encoder = Sequential([
            # Block 1
            Conv2D(filters=latent_dims[0], kernel_size=ksize, padding='same', activation='relu'),
            MaxPooling2D(pool_size=(2, 1), padding='same'),
            # Block 2
            Conv2D(filters=latent_dims[1], kernel_size=ksize, padding='same', activation='relu'),
            MaxPooling2D(pool_size=(2, 1), padding='same'),
            # Block 3
            Conv2D(filters=latent_dims[2], kernel_size=ksize, padding='same', activation='relu'),
            MaxPooling2D(pool_size=(2, 1), padding='same'),

        ], name="Conv-encoder")

        self.decoder = Sequential([
            # Block 1
            UpSampling2D(size=(2, 1)),
            Conv2D(filters=latent_dims[2], kernel_size=ksize, padding='same', activation='relu'),
            # Block 2
            UpSampling2D(size=(2, 1)),
            Conv2D(filters=latent_dims[1], kernel_size=ksize, padding='same', activation='relu'),
            # Block 3 (was mislabelled "Block 2" in the original)
            UpSampling2D(size=(2, 1)),
            Conv2D(filters=latent_dims[0], kernel_size=ksize, padding='same', activation='relu'),
            # Final reconstruction layer; sigmoid keeps outputs in (0, 1)
            Conv2D(filters=1, kernel_size=ksize, padding='same', activation='sigmoid'),
        ], name="Conv-decoder")

    def call(self, x):
        """Encode then decode ``x``; returns the reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded



def train_ae(X_train, X_test, y_train, y_test):
    """Build and compile the dense ``AutoEncoder`` for 32320-channel spectra.

    ``y_train``/``y_test`` are accepted only to keep the signature parallel
    to ``train_convae``; an autoencoder is fitted on ``X`` against itself.
    The ``fit`` call is intentionally left commented out (as in the
    original — presumably disabled during experimentation); the compiled
    model is now returned so the caller can inspect or train it.

    Returns
    -------
    AutoEncoder
        The built and compiled (but untrained) model.
    """
    batch_size = 1024  # other sizes tried: 512, 256
    epochs = 100

    model = AutoEncoder()
    # Input is one flat spectrum of 32320 m/z channels per sample.
    model.build((None, 32320))
    model.summary()

    model.compile(
        optimizer=Adadelta(learning_rate=10.0),
        loss=mean_squared_error)

    # NOTE(review): if this fit is re-enabled, drop ``validation_split`` —
    # Keras ignores it whenever ``validation_data`` is also supplied.
    # history = model.fit(
    #     x=X_train,
    #     y=X_train,
    #     batch_size=batch_size,
    #     epochs=epochs,
    #     validation_split=0.2,
    #     validation_data=(X_test, X_test),
    #     shuffle=True)

    return model


def train_convae(X_train, X_test, y_train, y_test):
    """Train the ``ConvolutionalAutoEncoder`` on spectra reshaped to 4-D.

    ``y_train``/``y_test`` are unused: the model is trained to reconstruct
    its own input (X against X), with ``X_test`` as validation data.

    Returns
    -------
    keras.callbacks.History
        Training history from ``model.fit`` (was previously discarded).
    """
    # Conv2D expects rank-4 input: (batch, height=channels, width=1, depth=1).
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1))

    batch_size = 256
    epochs = 100

    model = ConvolutionalAutoEncoder()
    model.build((None, 32320, 1, 1))
    model.summary()

    model.compile(
        optimizer=Adadelta(learning_rate=1.0),
        loss=mean_squared_error)

    # ``validation_split=0.2`` was removed: Keras ignores the split
    # whenever an explicit ``validation_data`` tuple is supplied, so it
    # only suggested a behaviour that never happened.
    history = model.fit(
        x=X_train,
        y=X_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_test, X_test),
        shuffle=True)
    return history
212 changes: 212 additions & 0 deletions src/DatasetGeneration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import numpy as np
from pathlib import Path

import cv2
import pickle
from pyimzml.ImzMLParser import ImzMLParser
from sklearn.model_selection import train_test_split


class TissueSample():
    """Axis-aligned bounding box of one tissue core on the TMA slide.

    Coordinates are slide pixels: ``(x1, y1)`` is the minimum corner and
    ``(x2, y2)`` the maximum corner of the box.  The spectra of the core
    (``values``) and its class ``label`` are attached later via the
    setters; ``values`` does not exist until ``set_values`` is called.

    Parameters
    ----------
    x1, y1, x2, y2 : int
        Bounding-box corners.
    max_x, max_y : int
        Exclusive upper bounds used to sanity-check the top-left corner.
        Defaults (416, 311) are the known slide dimensions, previously
        hard-coded; parameterizing them keeps behaviour identical while
        allowing other slide sizes.
    """

    def __init__(self, x1, y1, x2, y2, max_x: int = 416, max_y: int = 311):
        # Only the minimum corner is range-checked, mirroring the original
        # behaviour (x2/y2 are deliberately left unvalidated).
        assert 0 <= x1 < max_x
        assert 0 <= y1 < max_y
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        # Empty string means "no label assigned yet".
        self.label = ''

    def set_values(self, array) -> None:
        """Attach the (n_points, n_channels) spectra array of this core."""
        self.values: np.ndarray = array

    def set_label(self, label: str) -> None:
        """Assign the core's class label (e.g. tissue type)."""
        self.label = label

    def __repr__(self) -> str:
        return f'TissueSample [{self.x1=} {self.y1=} {self.x2=} {self.y2=}]'



class TMADataset:
    """Wraps one imzML tissue-micro-array (TMA) acquisition.

    Loads the hard-coded imzML/ibd file pair, exposes the m/z axis,
    locates individual tissue cores via OpenCV contour detection, and
    builds a pickled train/test dataset of (spectrum, label) pairs.
    """

    def __init__(self) -> None:
        # Paths are hard-coded, relative to the src/ working directory.
        self.imzml_file = Path('../data/20200729_tma_ctrl_cc-chc_sans_normalisation.imzML')
        self.ibd_file = Path('../data/20200729_tma_ctrl_cc-chc_sans_normalisation.ibd')

        if not self.imzml_file.exists() or (not self.ibd_file.exists()):
            raise FileNotFoundError(f"ImzMl file ({self.imzml_file.name}) or .ibd file ({self.ibd_file.name}) not found")

        self.parser = ImzMLParser(self.imzml_file)

        # Only 2-D acquisitions (a single z slice) are supported.
        if self.parser.imzmldict['max count of pixels z'] != 1:
            raise ValueError(f'Expected a 2D array with z dimension = 1, instead got z = {self.parser.imzmldict["max count of pixels z"]}')
        # {'x': 416, 'y': 311}
        self.tma_size = {'x': self.parser.imzmldict['max count of pixels x'], 'y': self.parser.imzmldict['max count of pixels y']}
        # m/z axis taken from the first spectrum — assumes every pixel
        # shares the same axis (continuous-mode imzML); TODO confirm.
        self.mz_values = self.parser.getspectrum(0)[0]

        # Valid coordinates (sparse matrix: 255 marks a reachable pixel).
        # NOTE(review): passing True also writes index_coverage.png to the
        # working directory as a side effect of constructing the dataset.
        self.valid_coordinates = self.create_index_coverage_mat(True)

    def is_valid_coordinate(self, x: int, y: int) -> bool:
        """
        Return True if the 0-based pair (x, y) is a recorded pixel, i.e.
        a spectrum can be fetched for it (parser coordinates are 1-based).

        NOTE(review): linear scan over all coordinates on every call, so
        callers that test every pixel (see set_spectrums_of_tissue) pay a
        quadratic cost; a precomputed set of pairs would fix this.
        """
        assert x >= 0 and x < self.tma_size['x']
        assert y >= 0 and y < self.tma_size['y']
        for valid_x, valid_y, _ in self.parser.coordinates:

            assert valid_x - 1 >= 0 and valid_x - 1 < self.tma_size['x']
            assert valid_y - 1 >= 0 and valid_y - 1 < self.tma_size['y']

            if x == (valid_x - 1) and y == (valid_y - 1):
                return True
        return False

    def idx_of_mz_value(self, value: float):
        """
        Return the index of an exact m/z value in ``self.mz_values``,
        e.g. dataset.mz_values[2389], dataset.idx_of_mz_value(810.800048828125).
        Raises IndexError when the value is absent (empty np.where result)
        — exact float equality is required.
        """
        return np.where(self.mz_values == value)[0][0]

    def create_index_coverage_mat(self, create_image_file: bool = False):
        """
        Build the coverage matrix where reachable (recorded) pixels have
        value 255 and everything else 0; shape is (x, y) = (416, 311).
        This is how the indices of reachable values are recovered.
        Optionally writes it as index_coverage.png (transposed so the
        image is laid out row-major).
        """
        valid_coordinates = np.zeros((tuple(self.tma_size.values()))) # 416, 311
        for x, y, _ in self.parser.coordinates:
            valid_coordinates[x - 1, y - 1] = 255
        if create_image_file:
            cv2.imwrite('index_coverage.png', valid_coordinates.T)
        return valid_coordinates

    def bounding_boxes_of_tissue_samples(self):
        """
        Detect each tissue core in the coverage matrix and store one
        ``TissueSample`` per core in ``self.tissue_samples``.

        Pipeline: Canny edges -> external contours -> convex hulls; only
        hulls with more than 8 points are kept as cores.  Also writes a
        debug overlay (bounding_boxes.png) kept in ``self.test_image``.
        """
        # (416, 311)
        assert self.valid_coordinates.shape == (416, 311)
        gray_image = self.valid_coordinates.astype(dtype=np.uint8)
        edges = cv2.Canny(gray_image, 50, 200)
        contours, hierarchy = cv2.findContours(
            edges,
            cv2.RETR_EXTERNAL,
            cv2.CHAIN_APPROX_NONE)
        empty_im = np.zeros(self.valid_coordinates.shape + (3,))

        hulls = []
        for cnt in contours:
            hull = cv2.convexHull(cnt)
            hulls.append(hull)
            cv2.drawContours(empty_im, [hull], 0, (0, 255, 0), 1)

        self.tissue_samples = []
        for hull in hulls:
            coordinates = hull.reshape(hull.shape[0], hull.shape[2])
            if coordinates.shape[0] > 8:
                # NOTE: xy yx — axes are swapped here, presumably because
                # OpenCV contour points are (col, row) while our matrix is
                # indexed (x, y); the commented alternative kept for reference.
                y1, x1 = coordinates.min(axis=0)
                y2, x2 = coordinates.max(axis=0)
                # x1, y1 = coordinates.min(axis=0)
                # x2, y2 = coordinates.max(axis=0)
                self.tissue_samples.append(TissueSample(x1=x1,y1=y1,x2=x2,y2=y2))
                # NOTE: xy yx
                # cv2.rectangle(empty_im, (x1, y1), (x2, y2), (0, 0, 255), 1)
                cv2.rectangle(empty_im, (y1, x1), (y2, x2), (0, 0, 255), 1)
                # empty_im[x1+1:x2, y1+1:y2] = 127
                # point:
                cv2.rectangle(empty_im, (y1, x1), (y1, x1), (0, 255, 255), 1)
        cv2.imwrite('bounding_boxes.png', empty_im)
        self.test_image = empty_im

    def _get_idx_in_coords(self, x, y) -> int:
        """
        Return the index of the 0-based coordinate pair in the parser's
        coordinate list, used to fetch the matching spectrum.
        Returns -1 as a "not found" sentinel rather than raising —
        NOTE(review): a -1 passed to getspectrum would silently fetch the
        last spectrum.
        """
        assert x >= 0 and x < self.tma_size['x']
        assert y >= 0 and y < self.tma_size['y']
        for i, (x_, y_, _) in enumerate(self.parser.coordinates):
            if (x == (x_ - 1)) and (y == (y_ - 1)):
                return i
        return -1

    def set_spectrums_of_tissue(self, sample: TissueSample):
        """
        Collect the spectrum of every valid acquisition point inside the
        sample's bounding box, store them on the sample, and return them.

        NOTE(review): is_valid_coordinate and _get_idx_in_coords each do
        a full linear scan per pixel, making this loop quadratic overall.
        """
        intensities = []
        for x in range(sample.x1, sample.x2):
            for y in range(sample.y1, sample.y2):
                if self.is_valid_coordinate(x, y):
                    _, intensity_array = self.parser.getspectrum(
                        self._get_idx_in_coords(x , y))
                    intensities.append(intensity_array)
                    # self.test_image[x, y, :] = 255
        intensities = np.array(intensities)
        sample.set_values(intensities)
        # cv2.imwrite("hm.png", self.test_image)
        return sample.values

    def set_all_spectrums(self, overwrite: bool = True):
        """
        Fetch the spectra of every detected sample, caching the full
        sample list in ../data/temp.pkl (loaded instead of recomputed
        when the file exists).

        NOTE(review): ``overwrite`` is accepted but never used — the
        cache always wins when present.
        """
        # NOTE: too much data in cache
        print('Get spectrums of all samples')
        temp_path = Path('../data/temp.pkl')

        if temp_path.exists():
            with open(temp_path, 'rb') as temp_f:
                self.tissue_samples = pickle.load(temp_f)
        else:
            for sample in self.tissue_samples:
                self.set_spectrums_of_tissue(sample)
            with open(temp_path, 'wb') as temp_f:
                pickle.dump(self.tissue_samples, temp_f)

    def manually_assign_labels(self):
        """
        Work-in-progress helper to eyeball sample positions so labels can
        be assigned by hand: highlights samples with y2 >= 308 in the
        debug image and writes hm.png.

        NOTE(review): ends with exit(1), terminating the whole process —
        clearly a debugging aid, not production code.
        """
        # (416, 311)
        for sample in self.tissue_samples:
            # if sample.x1 > 5 and sample.x1 < 40 and sample.y1 > 250 and
            # print(sample)
            if sample.y2 >= 308:
                print(sample)
                self.test_image[sample.x1:sample.x2+1, sample.y1:sample.y2+1] = 255
        cv2.imwrite("hm.png", self.test_image)
        exit(1)

    def data_generator(self):
        """Yield (spectrum, label) pairs over every point of every sample."""
        for sample in self.tissue_samples:
            for point in sample.values:
                yield point, sample.label

    def get_train_test_data(self):
        """
        Return (X_train, X_test, y_train, y_test) as numpy arrays.

        Loads the pickled split when the dataset file exists; otherwise
        builds it from data_generator() with an 80/20 train_test_split
        and pickles the result for subsequent runs.
        """
        dataset_path = Path('../data/dataset_extracted_20200729_tma_ctrl_cc-chc_sans_normalisation.pkl')

        if dataset_path.exists():
            print('loading dataset')
            with open(dataset_path, 'rb') as data_pkl_f:
                X_train, X_test, y_train, y_test = pickle.load(data_pkl_f)
                return X_train, X_test, y_train, y_test

        else:
            print('creating dataset')
            X_ = []
            y_ = []
            for x, y in self.data_generator():
                X_.append(x)
                y_.append(y)
            # print(np.array(X_).shape)
            X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2)
            X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)
            with open(dataset_path, 'wb') as data_pkl_f:
                pickle.dump((X_train, X_test, y_train, y_test), data_pkl_f)
            return X_train, X_test, y_train, y_test


# def show_im(p, mz_value=2400):
# im = getionimage(p, mz_value)
# plt.imshow(im, cmap='viridis', interpolation ='nearest')
# plt.show()

Loading

0 comments on commit d6d8bc0

Please sign in to comment.