# train.py

import argparse
import numpy as np
import cv2
import os
from datetime import datetime

import torch.nn as nn
import torch
import torchmetrics
import torch.multiprocessing as mp
import torch.distributed as dist

from rsunet import RSUNet
from dataset import RatLiverDataset_Tiled_Train18, RatLiverDataset_Tiled_Test18  

# Change the dataset classes imported above for different pretraining/finetuning datasets as needed.


class DiceLoss(nn.Module):
    """Soft Dice loss computed from raw (pre-sigmoid) model outputs.

    The loss is ``1 - dice`` where
    ``dice = (2 * sum(p * t) + smooth) / (sum(p) + sum(t) + smooth)``
    and ``p = sigmoid(inputs)``, summed over every element of the batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, inputs, targets, smooth=1):
        """Return the scalar Dice loss for one batch.

        Args:
            inputs: raw model outputs (logits); a sigmoid is applied here.
            targets: ground-truth tensor with the same number of elements
                as ``inputs``.
            smooth: additive smoothing term used in both numerator and
                denominator; keeps the ratio defined when both sums are 0.

        Returns:
            A 0-dim tensor equal to ``1 - dice``.
        """
        # comment out if your model contains a sigmoid or equivalent activation layer
        probs = torch.sigmoid(inputs)

        # Flatten label and prediction tensors. reshape() (unlike view())
        # also accepts non-contiguous tensors, e.g. transposed/permuted ones.
        probs = probs.reshape(-1)
        targets = targets.reshape(-1)

        intersection = (probs * targets).sum()
        dice = (2. * intersection + smooth) / (probs.sum() + targets.sum() + smooth)

        return 1 - dice


class DiceCE(nn.Module):
    """Sum of a positively-weighted BCE-with-logits loss and a soft Dice loss.

    Both terms are computed from raw (pre-sigmoid) model outputs.
    """

    def __init__(self):
        super().__init__()

    def forward(self, inputs, targets, loss_weights, smooth=1):
        """Return ``BCE + (1 - dice)`` for one batch.

        Args:
            inputs: raw model outputs (logits).
            targets: ground-truth tensor with the same shape as ``inputs``.
            loss_weights: weight applied to the positive class in the BCE
                term.
            smooth: additive smoothing term of the Dice ratio.

        Returns:
            A 0-dim tensor holding the combined loss.
        """
        # pos_weight only scales the positive (target == 1) term of the BCE,
        # so the zeros this product yields at negative positions have no effect.
        pos_weights = targets * loss_weights
        ce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, pos_weight=pos_weights
        )

        probs = torch.sigmoid(inputs)

        # Flatten label and prediction tensors. reshape() (unlike view())
        # also accepts non-contiguous tensors, e.g. transposed/permuted ones.
        probs = probs.reshape(-1)
        flat_targets = targets.reshape(-1)

        intersection = (probs * flat_targets).sum()
        dice = (2. * intersection + smooth) / (probs.sum() + flat_targets.sum() + smooth)
        return ce + (1 - dice)


def _make_loss_fn(loss_type, loss_weights):
    """Return a ``loss_fn(pred, target)`` callable for the requested loss.

    Args:
        loss_type: "Dice" or "DiceCE".
        loss_weights: forwarded to DiceCE as ``loss_weights``; unused for "Dice".

    Raises:
        ValueError: if ``loss_type`` is not one of the supported names.
    """
    if loss_type == "Dice":
        dice = DiceLoss()
        return lambda pred, target: dice(pred, target)
    if loss_type == "DiceCE":
        dice_ce = DiceCE()
        return lambda pred, target: dice_ce(pred, target, loss_weights=loss_weights)
    raise ValueError("loss_type must be 'Dice' or 'DiceCE', got " + repr(loss_type))


def _load_pretrained_weights(model, model_loc):
    """Load the checkpoint at ``model_loc`` into ``model``.

    A full load is tried first. If the checkpoint does not match the model,
    the first dotted component is stripped from every checkpoint key and only
    the entries whose stripped key exists in the model are loaded.
    """
    print("loading model from path: ", model_loc)
    checkpoint = torch.load(str(model_loc))
    try:
        model.load_state_dict(checkpoint)  # entire model is pretrained and loaded
    except RuntimeError:
        # when models only share some layers, load only shared layers
        renamed = {".".join(str(key).split(".")[1:]): value
                   for key, value in checkpoint.items()}
        model_dict = model.state_dict()

        # filter out unnecessary keys
        shared = {k: v for k, v in renamed.items() if k in model_dict}

        print("loading and overwriting " + str(len(shared)) + " layers from pretrained model")
        model_dict.update(shared)  # overwrite entries in the existing state dict
        model.load_state_dict(model_dict)  # load the new state dict


def _augment_batch(sample, labels, rotations, contrast):
    """Apply the optional in-place augmentations to one batch.

    Rotation: every item is rotated by a random multiple of 90 degrees over
    dims 2 and 3 of the per-item tensor; the same rotation is applied to the
    label. Contrast: with 75% probability an item is rescaled to
    ``clamp(x * alpha + beta, -1, 1)``.
    """
    if rotations:
        for z0 in range(sample.size()[0]):
            rot_n = np.random.randint(low=0, high=4)
            sample[z0] = torch.rot90(sample[z0], rot_n, dims=[2, 3])
            labels[z0] = torch.rot90(labels[z0], rot_n, dims=[2, 3])

    if contrast:
        for z0 in range(sample.size()[0]):
            if np.random.randint(low=0, high=4) != 0:  # 25% chance to not aug
                alpha = torch.rand(1).cuda() + 0.5  # contrast from 0.5 to 1.5
                beta = torch.rand(1).cuda() - 0.5  # brightness from -0.5 to 0.5
                sample[z0] = torch.clamp(sample[z0] * alpha + beta, -1, 1)


def _save_slices(out_dir, batch_idx, pred_mask, labels, sample, z_indices=(0, 8, 17)):
    """Write prediction / ground-truth / input PNGs for the first batch item.

    Slices ``[0, 0, z, :, :]`` are written for every ``z`` in ``z_indices``,
    so the tensors are indexed as 5-D and need at least 18 entries along
    dim 2 for the default indices.
    """
    os.makedirs(out_dir, exist_ok=True)
    preds = np.float32(pred_mask.detach().cpu().numpy()) * 255
    gts = np.float32(labels.detach().cpu().numpy()) * 255
    # presumably the inputs are normalised to [-1, 1]; this maps them to [0, 255]
    inputs = np.float32(sample.detach().cpu().numpy()) * 127.5 + 127.5
    for z in z_indices:
        suffix = f"_i{batch_idx:03d}z_{z:03d}.png"
        cv2.imwrite(f"{out_dir}/prediction{suffix}", preds[0, 0, z, :, :])
        cv2.imwrite(f"{out_dir}/gt{suffix}", gts[0, 0, z, :, :])
        cv2.imwrite(f"{out_dir}/input{suffix}", inputs[0, 0, z, :, :])


def train(args):
    """Train an RSUNet on the tiled training set and periodically evaluate it.

    Every epoch the model is trained on the whole training loader; every 5th
    epoch it is evaluated on the test loader. Metrics are appended to lists
    that are saved as ``.npy`` files in the experiment folder after every
    epoch, together with the latest model weights. The best model (by test
    IoU) and the models at epochs 50, 75 and 100 are saved separately.

    Args:
        args: parsed command-line namespace. Fields read here: ``epochs``,
            ``save_images``, ``hotstart``, ``rotation_augs``,
            ``contrast_augs``, ``loss_weights``, ``exp_name``, ``model_loc``
            and ``loss_type``.

    Returns:
        An empty tuple (kept for backward compatibility).

    Raises:
        ValueError: if ``args.loss_type`` is neither "Dice" nor "DiceCE".
    """
    # update passed in args values:
    epochs = args.epochs
    save_images_b = args.save_images
    hotstart_b = args.hotstart
    rotations_b = args.rotation_augs
    contrast_augs_b = args.contrast_augs
    loss_weights = args.loss_weights

    # Fail fast on an unsupported loss type, before any data or model is loaded.
    loss_fn = _make_loss_fn(args.loss_type, loss_weights)

    # Data loading
    XY_train = RatLiverDataset_Tiled_Train18()
    XY_test = RatLiverDataset_Tiled_Test18()
    loader = torch.utils.data.DataLoader(XY_train, batch_size=10, shuffle=True, pin_memory=True)
    # Set loader2 to None to train without a test set.
    loader2 = torch.utils.data.DataLoader(XY_test, batch_size=10, shuffle=False, pin_memory=True)
    has_test_set = loader2 is not None

    # Images are dumped roughly three times per pass; max() avoids a modulo by
    # zero when a loader has fewer than three batches.
    train_save_every = max(1, len(loader) // 3)
    test_save_every = max(1, len(loader2) // 3) if has_test_set else 1

    # create folder to save metrics
    exp_name = args.exp_name
    os.makedirs(str(exp_name), exist_ok=True)

    net = RSUNet(out_ch=1)
    model = net.cuda()
    model.train()

    if args.model_loc is not None:
        _load_pretrained_weights(model, args.model_loc)

    # hotstart: begin with a higher learning rate, lowered again at epoch 50
    if hotstart_b:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

    # metric tracking
    run_losses = []
    test_losses = []
    run_TPRs = []
    test_TPRs = []
    run_PPVs = []
    test_PPVs = []
    run_accs = []
    test_accs = []
    run_IOUs = []
    test_IOUs = []

    # NOTE(review): newer torchmetrics releases appear to require a `task`
    # argument for these constructors -- confirm against the installed version.
    acc_metric = torchmetrics.Accuracy().cuda()
    ppv_metric = torchmetrics.Precision().cuda()
    tpr_metric = torchmetrics.Recall().cuda()
    iou_metric = torchmetrics.JaccardIndex(num_classes=2).cuda()

    acc_metric_test = torchmetrics.Accuracy().cuda()
    ppv_metric_test = torchmetrics.Precision().cuda()
    tpr_metric_test = torchmetrics.Recall().cuda()
    iou_metric_test = torchmetrics.JaccardIndex(num_classes=2).cuda()

    # acc is not as indicative of performance for highly imbalanced tasks,
    # so the best model is selected by IoU.
    best_iou = 0

    # begin training
    print("========== BEGIN TRAINING LOOP ==========")
    start = datetime.now()

    for e in range(epochs):
        print("device number: ", torch.cuda.current_device())
        print("----------------epoch:", e)

        if e == 50 and hotstart_b:
            for g in optimizer.param_groups:
                g['lr'] = 0.002

        for g in optimizer.param_groups:
            print("current lr:" + str(g['lr']))

        is_eval_epoch = e % 5 == 0
        save_train_images = save_images_b and e in (10, 20, 50, 100, 150)
        save_test_images = save_images_b and e > 0 and e % 50 == 0

        model.train()
        run_loss = 0

        for i, xy in enumerate(loader):
            sample = xy[0].cuda()
            labels = xy[1].cuda()

            _augment_batch(sample, labels, rotations_b, contrast_augs_b)

            # Optimizer step.
            optimizer.zero_grad()
            pred = model(sample)

            # Metrics and saved images only need values, not gradients.
            pred_mask = torch.sigmoid(pred.detach())
            labels_int = labels.int()

            batch_acc = acc_metric(pred_mask, labels_int)
            batch_ppv = ppv_metric(pred_mask, labels_int)
            batch_tpr = tpr_metric(pred_mask, labels_int)
            batch_iou = iou_metric(pred_mask, labels_int)

            loss = loss_fn(pred, labels.float())
            loss.backward()
            optimizer.step()

            # Accumulate a detached value so the autograd graph of every
            # batch is not kept alive until the end of the epoch.
            run_loss = run_loss + loss.detach()

            if i % 20 == 0:  # print metrics every 20 batch
                print("loss:", loss.item())
                print("accuracy:", str(batch_acc.item()))
                print("PPV:", str(batch_ppv.item()))
                print("TPR:", str(batch_tpr.item()))
                print("IoU:", str(batch_iou.item()))

            if save_train_images and i % train_save_every == 0:
                # saving images
                _save_slices(f"{exp_name}/epoch_{e}/{i}", i, pred_mask, labels, sample)

        if is_eval_epoch and has_test_set:
            # testing
            model.eval()
            with torch.no_grad():
                print("---------------")
                print("Starting testing for epoch ", e)

                test_loss = 0
                for i, xy in enumerate(loader2):
                    sample_test = xy[0].cuda()
                    labels_test = xy[1].cuda()

                    pred2 = model(sample_test)
                    pred_mask2 = torch.sigmoid(pred2)
                    labels_test_int = labels_test.int()

                    batch_acc_test = acc_metric_test(pred_mask2, labels_test_int)
                    batch_ppv_test = ppv_metric_test(pred_mask2, labels_test_int)
                    batch_tpr_test = tpr_metric_test(pred_mask2, labels_test_int)
                    batch_iou_test = iou_metric_test(pred_mask2, labels_test_int)

                    # The loss must be computed on the test prediction and
                    # test labels, not on the last training batch.
                    loss2 = loss_fn(pred2, labels_test.float())
                    test_loss = test_loss + loss2

                    if i % 20 == 0:
                        print("Test loss:", loss2.item())
                        print("accuracy:", str(batch_acc_test.item()))
                        print("PPV:", str(batch_ppv_test.item()))
                        print("TPR:", str(batch_tpr_test.item()))
                        print("IoU:", str(batch_iou_test.item()))

                    if save_test_images and i % test_save_every == 0:
                        # saving images
                        _save_slices(f"{exp_name}/test_epoch_{e}/{i}", i,
                                     pred_mask2, labels_test, sample_test)

        # epoch-level training metrics (accumulated over all training batches)
        epoch_run_loss = run_loss.detach().cpu().numpy()
        epoch_run_acc = acc_metric.compute().item()
        epoch_run_tpr = tpr_metric.compute().item()
        epoch_run_ppv = ppv_metric.compute().item()
        epoch_run_iou = iou_metric.compute().item()

        run_losses.append(epoch_run_loss)
        run_accs.append(epoch_run_acc)
        run_TPRs.append(epoch_run_tpr)
        run_PPVs.append(epoch_run_ppv)
        run_IOUs.append(epoch_run_iou)

        print("-----------run_loss-----------:", epoch_run_loss)
        print("-----------run_acc-----------: " + str(epoch_run_acc))
        print("-----------run_TPR-----------: " + str(epoch_run_tpr))
        print("-----------run_PPV-----------: " + str(epoch_run_ppv))
        print("-----------run_IoU-----------: " + str(epoch_run_iou))

        iou_metric.reset()
        acc_metric.reset()
        tpr_metric.reset()
        ppv_metric.reset()

        if is_eval_epoch and has_test_set:
            test_losses.append(test_loss.detach().cpu().numpy())
            test_accs.append(acc_metric_test.compute().item())
            test_TPRs.append(tpr_metric_test.compute().item())
            test_PPVs.append(ppv_metric_test.compute().item())
            test_IOUs.append(iou_metric_test.compute().item())

            print("-----------test_loss-----------:", test_losses[-1])
            print("-----------test_IoU-----------: " + str(test_IOUs[-1]))
            print("-----------test_acc-----------: " + str(test_accs[-1]))
            print("-----------test_TPR-----------: " + str(test_TPRs[-1]))
            print("-----------test_PPV-----------: " + str(test_PPVs[-1]))

            if e != 0:
                ckpt_path = (str(exp_name) + '/trained_model_epoch' + str(e)
                             + '_accR' + str(run_accs[-1])
                             + "_accT" + str(test_accs[-1])
                             + "_iouT" + str(test_IOUs[-1]) + '.pth')
                if test_IOUs[-1] > best_iou:
                    best_iou = test_IOUs[-1]
                    torch.save(model.state_dict(), ckpt_path)
                    print("Saved new best model, test acc:" + str(test_accs[-1]))
                    print("Test IoU:" + str(test_IOUs[-1]))
                    print("Test PPV:" + str(test_PPVs[-1]))
                if e in [50, 75, 100]:
                    print("Saving model at checkpoint 50, 75, 100 epochs")
                    torch.save(model.state_dict(), ckpt_path)

            iou_metric_test.reset()
            acc_metric_test.reset()
            tpr_metric_test.reset()
            ppv_metric_test.reset()

        elif is_eval_epoch:  # no testing set
            print("No testing data provided. No testing metrics calculated. Saving best model based on best IOU during training")
            if e != 0:
                ckpt_path = (str(exp_name) + '/trained_model_epoch' + str(e)
                             + '_accR' + str(run_accs[-1])
                             + "_iouR" + str(run_IOUs[-1]) + '.pth')
                if run_IOUs[-1] > best_iou:
                    best_iou = run_IOUs[-1]
                    torch.save(model.state_dict(), ckpt_path)
                    print("Saved new best model, run acc:" + str(run_accs[-1]))
                    print("Run IoU:" + str(run_IOUs[-1]))
                    print("Run PPV:" + str(run_PPVs[-1]))
                if e in [50, 75, 100]:
                    print("Saving model at checkpoint 50, 75, 100 epochs")
                    torch.save(model.state_dict(), ckpt_path)

        # Latest weights and all metric histories are rewritten every epoch.
        torch.save(model.state_dict(), str(exp_name) + '/trained_model_training.pth')

        np.save(str(exp_name) + "/run_losses", run_losses)
        np.save(str(exp_name) + "/test_losses", test_losses)
        np.save(str(exp_name) + "/run_accs", run_accs)
        np.save(str(exp_name) + "/test_accs", test_accs)
        np.save(str(exp_name) + "/run_TPRs", run_TPRs)
        np.save(str(exp_name) + "/test_TPRs", test_TPRs)
        np.save(str(exp_name) + "/run_PPVs", run_PPVs)
        np.save(str(exp_name) + "/test_PPVs", test_PPVs)
        np.save(str(exp_name) + "/run_IOUs", run_IOUs)
        np.save(str(exp_name) + "/test_IOUs", test_IOUs)

    print("Training complete in: " + str(datetime.now() - start))
    return ()

 
##########################################
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to train().

    parser = argparse.ArgumentParser()

    parser.add_argument('--epochs', default=5, type=int)
    parser.add_argument('-e', '--exp_name', help='name of the output folder')
    # NOTE(review): the next three options are parsed but not read by the
    # train() function in this file.
    parser.add_argument('--subsample_frac', default=1, type=float)
    parser.add_argument('--subsample_seed', default=27, type=int)
    parser.add_argument('--filter_threshold', default=-1, type=float)
    # train() has no usable fallback for a missing or misspelt loss type, so
    # reject it here instead of failing after the data and model are loaded.
    parser.add_argument('--loss_type', required=True, choices=['Dice', 'DiceCE'],
                        help='choose from Dice or DiceCE')

    parser.add_argument('--loss_weights', default=1, type=float, help='specify the weight of the positive class when calculating BCE loss. > 1 if positive class is in the minority')

    parser.add_argument('--model_loc', default=None, type=str, help='the path to the .pth file of the model you want to load')
    # NOTE(review): --freeze_encoder is parsed but not read by train() in this file.
    parser.add_argument('--freeze_encoder', default=False, action='store_true')
    parser.add_argument('--save_images', default=False, action='store_true')

    parser.add_argument('--hotstart', default=False, action='store_true')
    parser.add_argument('--rotation_augs', default=False, action='store_true')
    parser.add_argument('--contrast_augs', default=False, action='store_true')

    args = parser.parse_args()
    train(args)