diff --git a/.gitignore b/.gitignore
index bf6cca2..37733e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .DS_Store
 .vscode/
 */__pycache__/*
+*.pyc
diff --git a/README.md b/README.md
index 078f91a..6f052b6 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,18 @@
 The deep embeddings used in this work are generated using the End2End network proposed in:
 ```
 Krishnan, P., Dutta, K., Jawahar, C.V.: Word spotting and recognition using deep embedding. In: 2018 13th IAPR International Workshop on Document Analysis Systems (DAS). pp. 1–6 (April 2018). https://doi.org/10.1109/DAS.2018.70
 ```
-Word text and image's deep embeddings for testing this repository are provided in the ```embeddings``` folder.
+Deep embeddings of the word text and word images for testing this repository are provided in the ``embeddings`` folder. Text files containing information about the embeddings are required when running the code. They are in the following format:
-```1```
-```1```
-...
-Corresponding text files for testing this repository are provided in the ``gen_files`` folder.
+```
+1
+1
+...
+```
+Embeddings can be generated using [https://github.com/kris314/hwnet](https://github.com/kris314/hwnet).
+
+To make it easier to explore the code in this repository, sample text files and embeddings are provided in the ``gen_files`` and ``embeddings`` folders, respectively.
+
+The original dataset used in this work will be released by [CVIT](http://cvit.iiit.ac.in) soon.
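+
+The snippet below is a minimal sanity check of the sample text files and embeddings (a sketch only; it assumes the sample file names used as defaults in ``src/EmbedNet_train.py`` and the 20-predictions-per-word-image convention used throughout the code):
+```python
+import numpy as np
+
+# Sample embeddings provided in this repository
+image_embeds = np.load('embeddings/topk_preds_100featsImg.npy')
+topk_embeds = np.load('embeddings/topk_preds_100featsSynth.npy')
+
+# The second whitespace-separated field of each line is the word's text,
+# mirroring how src/EmbedNet_train.py reads this file
+with open('gen_files/image_embed_top_k_100.txt', 'r') as f:
+    words = [line.split()[1] for line in f]
+
+# One embedding per word image, and 20 top-prediction embeddings per image
+assert image_embeds.shape[0] == len(words)
+assert topk_embeds.shape[0] == 20 * len(words)
+```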

 ### Performing word recognition (using a pre-trained EmbedNet)
 Pre-trained EmbedNet models are saved in the ``models`` folder.
@@ -102,7 +108,47 @@ Other arguments for word recognition experiment are:
 - `k` total number of predictions to test on (max 20)

 ### Training EmbedNet
-TODO
+Prepare the text files and embeddings as described in [Generating/using deep embeddings](#generatingusing-deep-embeddings); refer to the files in the ``gen_files`` folder for examples of the text files. Once the embeddings are prepared, run the following command:
+```sh
+python src/EmbedNet_train.py --model_name provide_a_name_of_your_choice
+```
+For a quick demonstration, you can run:
+```sh
+python src/EmbedNet_train.py --model_name temp
+```
+This will train an EmbedNet for 1000 epochs and save the models in ``trained/EmbedNet_models``.
+
+Other arguments for `EmbedNet_train.py` are:
+```sh
+--base_dir
+--model_dir
+--train_percentage
+--epochs
+--lr
+--batch
+--model_name
+--margin
+--hidden_layers
+--gpu_id
+--image_embeds
+--topk_embeds
+--image_file
+--predictions_file
+```
+- `base_dir` path to the directory for saving models
+- `model_dir` name of the folder for saving trained models
+- `train_percentage` percentage of data to use for training
+- `epochs` number of epochs to train for
+- `lr` learning rate
+- `batch` batch size
+- `model_name` name under which to save the model
+- `margin` margin for the triplet loss
+- `hidden_layers` list of input sizes for the hidden layers
+- `gpu_id` which GPU to use
+- `image_embeds` path to the image embeddings
+- `topk_embeds` path to the TopK predictions' embeddings
+- `image_file` path to the file containing word image information
+- `predictions_file` path to the file containing the TopK predictions' information
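+
+As an illustration, several of these arguments can be combined in a single call (the values below are placeholders, not tuned settings):
+```sh
+python src/EmbedNet_train.py --model_name my_model --epochs 500 --lr 0.0001 --batch 64 --margin 0.5 --gpu_id 0
+```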

 License and Citation
 ---------------------
diff --git a/src/EmbedNet_dataprep.py b/src/EmbedNet_dataprep.py
deleted file mode 100644
index a7aa754..0000000
--- a/src/EmbedNet_dataprep.py
+++ /dev/null
@@ -1,152 +0,0 @@
-"""
-This code is used for preparing the Triplet dataset for EmbedNet
-"""
-# Standard imports
-import os
-import pdb
-import pickle
-import argparse
-
-# Third party imports
-import numpy as np
-from tqdm import tqdm
-import Levenshtein as lev
-
-parser = argparse.ArgumentParser(description='Data Preperation for deep word recognition')
-
-# Arguments for text and embeddings path
-parser.add_argument('--image_embeds', default='/ssd_scratch/cvit/sid/embeddings/topk_preds_1500featsImg.npy', help='Path to the image embeddings')
-parser.add_argument('--text_embeds', default='/ssd_scratch/cvit/sid/embeddings/topk_preds_1500featsSynth.npy', help='Path to the text embeddigns')
-parser.add_argument('--image_info', default='/ssd_scratch/cvit/sid/image_embed_top_k_1500.txt', help='Path to the file containing word image information')
-parser.add_argument('--text_info', default='/ssd_scratch/cvit/sid/top_preds_embeds_with_confidence_1500.txt', help='Path to the file containing text output information')
-
-# model path and name arguments
-parser.add_argument('--base_path', default='/ssd_scratch/cvit/sid/', help='Path to the base directory where the training and testing data is stored')
-parser.add_argument('--file_name', default='EmbedNet_data', help='Name of the data file')
-
-# Training and testing split flag
-parser.add_argument('--train_percent', default=0.8, type=float, help='Percent of train data')
-parser.add_argument('--semi_hard', default=False, action='store_true', help='If True semi-hard examples will also be included')
-parser.add_argument('--save', default=False, action='store_true', help='If true data will be saved in ssd_scratch')
-
-args = parser.parse_args()
-print(args)
-
-data_path = args.base_path + args.file_name
-
-print('[INFO] Loading embeddings and text files...')
-image_embeds = np.load(args.image_embeds)
-try:
-    topk_embeds = np.load(args.text_embeds)
-except Exception as e:
-    print('[INFO] Loading text embeddings in memmap mode...')
-    topk_embeds = np.memmap(args.text_embeds, dtype=np.float32, mode='r', shape=(2109500, 2048))
-
-with open(args.image_info, 'r') as image_file:
-    image_info = image_file.readlines()
-image_info = [item.split()[1] for item in image_info]
-
-with open(args.text_info, 'r') as text_file:
-    text_info = text_file.readlines()
-text_info = [item.split()[1] for item in text_info]
-
-# This piece is for handling text files with more data as compared to the numpy files
-# text_info = text_info[:topk_embeds.shape[0]]
-# image_info = image_info[:image_embeds.shape[0]]
-
-# # Getting count of number of words in training set
-split_count = int(args.train_percent * len(image_info))
-image_info = image_info[:split_count]
-text_info = text_info[:split_count*20]
-image_embeds = image_embeds[:split_count]
-topk_embeds = topk_embeds[:split_count*20]
-
-text_dict = dict()
-embeds_dict = dict()
-ko = 0
-k = 20
-"""Text Dictionary is in the form
-{'word':[([top_20_preds],[lev_dist]), (..., ...), ...], ...}
-Embedding dictionary is in the form
-{'word': {'image_embeds': [all image_embeds occurances], 'text_embeds': [[top_20_text_embeds], [top_20_text_embeds], ...]}, ...}
-"""
-for word in tqdm(image_info, desc='[INFO] Text Dict'):
-    if word not in text_dict.keys():
-        text_dict[word] = [(text_info[ko: k], [lev.distance(word, item) for item in text_info[ko: k]])]
-    else:
-        text_dict[word].append((text_info[ko: k], [lev.distance(word, item) for item in text_info[ko: k]]))
-    ko = k
-    k += 20
-
-ko = 0
-k = 20
-for count, image_embed in enumerate(tqdm(image_embeds, desc='[INFO] Embeds Dict')):
-    word = image_info[count]
-    if word not in embeds_dict.keys():
-        embeds_dict[word] = {'image_embeds': [image_embed], 'text_embeds': [topk_embeds[ko: k]]}
-    else:
-        embeds_dict[word]['image_embeds'].append(image_embed)
-        embeds_dict[word]['text_embeds'].append(topk_embeds[ko: k])
-    ko = k
-    k += 20
-
-final_list = list()
-for word in tqdm(text_dict.keys(), desc='[INFO] Data Prep'):
-    predictions = text_dict[word]
-    image_embeddings, text_embeddings = embeds_dict[word]['image_embeds'], np.array(embeds_dict[word]['text_embeds'])
-    for instance_count, single_instance in enumerate(predictions):
-        top20_preds, top20_edit_dist = single_instance[0], single_instance[1]
-        instance_text_embeds = text_embeddings[instance_count]
-        anchor = image_embeddings[instance_count]
-        positive = None
-        negative_list = list()
-        if args.semi_hard:
-            semi_negative_list = list()
-        for count, pred in enumerate(top20_preds):
-            if word == pred:
-                positive = instance_text_embeds[count]
-            else:
-                if not args.semi_hard:
-                    negative_list.append(instance_text_embeds[count])
-                if args.semi_hard:
-                    condition = True
-                    no_inf = 1000
-                    while condition and no_inf != 0:
-                        random_num = np.random.randint(low=1, high=len(topk_embeds))
-                        random_embedding = topk_embeds[random_num]
-                        if np.linalg.norm(anchor - random_embedding) > 0.4:
-                            condition = False
-                        no_inf -= 1
-                    semi_negative_list.append(random_embedding)
-        if args.semi_hard:
-            for semi_hard_neg_embed in semi_negative_list:
-                if positive is None:
-                    pass
-                else:
-                    final_list.append({'anchor': anchor, 'positive': positive, 'negative': np.array(semi_hard_neg_embed)})
-        else:
-            for negative_embeds in negative_list:
-                if positive is None:  # There are a few cases when the OCR even fails to predics in Top20 predicitons
-                    pass
-                else:
-                    final_list.append({'anchor': anchor, 'positive': positive, 'negative': np.array(negative_embeds)})
-
-def check(final_list):
-    positive_distance = list()
-    negative_distance = list()
-    for sample in tqdm(final_list, desc='[INFO] Checking'):
-        anchor = sample['anchor']
-        positive = sample['positive']
-        negative = sample['negative']
-        try:
-            positive_distance.append(np.linalg.norm(anchor - positive))
-            negative_distance.append(np.linalg.norm(anchor - negative))
-        except Exception as e:
-            print(e)
-            pdb.set_trace()
-    print('[INFO] Mean distance of anchors with positive pairs is {} Max {} Min {}.\n[INFO] Mean distance of anchor with negative pairs is {} Max {} Min {}.'.format(np.mean(positive_distance), np.max(positive_distance), np.min(positive_distance), np.mean(negative_distance), np.max(negative_distance), np.min(negative_distance)))
-
-check(final_list)
-if args.save:
-    pickle.dump(final_list, open(data_path, 'wb'))
-    print('[INFO] Total number of triples generated: {}\n[INFO] Pickle file saved at {}'.format(len(final_list), data_path))
diff --git a/src/EmbedNet.py b/src/EmbedNet_train.py
similarity index 67%
rename from src/EmbedNet.py
rename to src/EmbedNet_train.py
index 276f72b..4307930 100644
--- a/src/EmbedNet.py
+++ b/src/EmbedNet_train.py
@@ -15,18 +15,18 @@
 import torch.nn as nn
 from tqdm import tqdm
 from models import EmbedNet
-from online_triplets import Triplets
+from triplets import Triplets
 import torch.nn.functional as F
 import torch.utils.data as data
 from torch.utils.data import DataLoader
-from torch.utils.tensorboard import SummaryWriter

 torch.backends.cudnn.enabled = False
 torch.backends.cudnn.benchmark = False

 parser = argparse.ArgumentParser(description='Neural Networks for word recognition')

 # File paths and directory names
-parser.add_argument('--base_dir', default='/ssd_scratch/cvit/sid/', help='Path to the directory for saving models')
+parser.add_argument('--base_dir', default='trained', help='Path to the directory for saving models')
+parser.add_argument('--model_dir', default='EmbedNet_models', help='Name of the folder for saving trained models')

 # Various model hyperparameters
 parser.add_argument('--train_percentage', type=float, default=0.8, help='Percentage of data to use for training')
@@ -38,39 +38,35 @@
 parser.add_argument('--hidden_layers', nargs='+', type=int, default=[1024, 512, 256, 128], help='List of input size of the hidden layers')
 parser.add_argument('--gpu_id', default=0, type=int, help='Specify which GPU to use')

-parser.add_argument('--image_embeds', default='/ssd_scratch/cvit/sid/embeddings/image_embeds_top_k_allfeatsImg.npy', help='Path to the image embeddings')
-parser.add_argument('--text_embeds', default='/ssd_scratch/cvit/sid/embeddings/top_preds_embeds_all_featsSynth.dat', help='Path to the text embeddigns')
-parser.add_argument('--image_info', default='/ssd_scratch/cvit/sid/EmbedGenFiles/image_embed_top_k_all.txt', help='Path to the file containing word image information')
-parser.add_argument('--text_info', default='/ssd_scratch/cvit/sid/EmbedGenFiles/top_preds_embeds_all_with_confidence.txt', help='Path to the file containing text output information')
+parser.add_argument('--image_embeds', default='embeddings/topk_preds_100featsImg.npy', help='Path to the image embeddings')
+parser.add_argument('--topk_embeds', default='embeddings/topk_preds_100featsSynth.npy', help='Path to the text embeddings')
+parser.add_argument('--image_file', default='gen_files/image_embed_top_k_100.txt', help='Path to the file containing word image information')
+parser.add_argument('--predictions_file', default='gen_files/top_preds_embeds_100_with_conf.txt', help='Path to the file containing text output information')

 args = parser.parse_args()
 print(args)

 print('[INFO] Loading embeddings and text files...')
 image_embeds = np.load(args.image_embeds)
 try:
-    topk_embeds = np.load(args.text_embeds)
+    topk_embeds = np.load(args.topk_embeds)
 except Exception as e:
     print('[INFO] Loading text embeddings in memmap mode...')
-    topk_embeds = np.memmap(args.text_embeds, dtype=np.float32, mode='r', shape=(2109500, 2048))
+    topk_embeds = np.memmap(args.topk_embeds, dtype=np.float32, mode='r', shape=(2109500, 2048))

-with open(args.image_info, 'r') as image_file:
-    image_info = image_file.readlines()
-image_info = [item.split()[1] for item in image_info]
+with open(args.image_file, 'r') as image_file:
+    image_file = image_file.readlines()
+image_file = [item.split()[1] for item in image_file]

-with open(args.text_info, 'r') as text_file:
-    text_info = text_file.readlines()
-topk_info = [item.split()[1] for item in text_info]
+with open(args.predictions_file, 'r') as text_file:
+    predictions_file = text_file.readlines()
+topk_info = [item.split()[1] for item in predictions_file]

 assert args.model_name, "Provide a model name for proceeding"

 epochs = args.epochs
 lr = args.lr
-writer_path = 'logs/' + args.model_name
-writer = SummaryWriter(writer_path)
-model_dir = 'EmbedNet/EmbedNet_models'
-train_list_dir = 'EmbedNet'
-assert os.path.exists(os.path.join(args.base_dir, train_list_dir)), "Train data directory does not exists, create one using data_prep.py"
+model_dir = args.model_dir
 if not os.path.exists(os.path.join(args.base_dir, model_dir)):
-    os.mkdir(os.path.join(args.base_dir, model_dir))
+    os.makedirs(os.path.join(args.base_dir, model_dir))

 if torch.cuda.device_count() > 1:
     torch.cuda.set_device(args.gpu_id)
@@ -115,22 +111,7 @@ def get_dataloaders(train_list):
     return train_data_loader, val_data_loader


-def calculate_accuracy(model_path):
-    print("[INFO] Calculating current model's accuracy...")
-    temp_model_path = os.path.join(model_path, args.model_name + '_temp.pkl')
-    hidden_string = str(args.hidden_layers).replace(',', ' ').replace('[', '').replace(']', '')
-    try:
-        command = 'python parallel_word_rec_EmbedNet.py --use_model --hidden_layers {} --model_path {} --testing --test_split 0.75858 > {}.txt'.format(hidden_string, temp_model_path, args.model_name)
-    except Exception as e:
-        print(e)
-        pdb.set_trace()
-    os.system(command)
-    data = open('{}.txt'.format(args.model_name), 'r').readlines()
-    accuracy = data[0].split()[-1]
-    return float(accuracy)
-
-
-triplet = Triplets(topk_info, image_info, topk_embeds, image_embeds, args.train_percentage, args.margin, verbose=True)
+triplet = Triplets(topk_info, image_file, topk_embeds, image_embeds, args.train_percentage, args.margin, verbose=True)
 train_list = triplet.initial_list()
 train_data_loader, val_data_loader = get_dataloaders(train_list)
@@ -176,6 +157,8 @@ def calculate_accuracy(model_path):
             anchor = anchor.cuda().double()
             positive = positive.cuda().double()
             negative = negative.cuda().double()
+        else:
+            model = model.double()
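+            # CPU fallback: keep the model in double precision, mirroring the
+            # .double() casts applied on the GPU path above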
         model.zero_grad()
         anchor_ = model(anchor)
         positive_ = model(positive)
         negative_ = model(negative)
@@ -185,8 +168,6 @@
         tr_loss.backward()
         optimizer.step()
         train_loss_per_epoch += float(tr_loss)
-        writer.add_scalar('Train Loss/Batch', float(tr_loss), train_batch_count)
-    writer.add_scalar('Train Loss/Epoch', train_loss_per_epoch, epoch)
     for data_point in tqdm(val_data_loader, desc='[INFO] Validation'):
         validation_batch_count += 1
         anchor = data_point['anchor']
@@ -203,26 +184,15 @@
         negative_ = model(negative)
         val_loss = criterion(anchor_, positive_, negative_)
         val_loss_per_epoch += float(val_loss)
-        writer.add_scalar('Validation Loss/Batch', float(val_loss), validation_batch_count)
-    writer.add_scalar('Validation Loss/Epoch', val_loss_per_epoch, epoch)
-    # Saving model based on the current accuracy
-    save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer, temp=True)
-    updated_accuracy = calculate_accuracy(os.path.join(args.base_dir, model_dir))
-    writer.add_scalar('Word Accuracy/Epoch', updated_accuracy, epoch)
-    if updated_accuracy > accuracy:
-        accuracy = updated_accuracy
-        save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer, accuracy=True)
-        model_saved_epoch = epoch + 1
     if val_loss_per_epoch < base_valid:
         base_valid = val_loss_per_epoch
         save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer)
-    print('[INFO] Train Loss {}, validation loss {} accuracy {}.'.format(round(train_loss_per_epoch, 3), round(val_loss_per_epoch, 3), round(updated_accuracy, 3)))
+    print('[INFO] Train Loss {}, validation loss {}.'.format(round(train_loss_per_epoch, 3), round(val_loss_per_epoch, 3)))
     if (epoch + 1) - model_saved_epoch >= 5:
         print('[INFO] Updating the train and validation list...')
-        updated_list, new_hard_neg_number = triplet.EmbedNet_embeds(model, 128)
+        updated_list, new_hard_neg_number = triplet.embednet_embeds(model, 128)
         if new_hard_neg_number < old_hard_neg_number:
             save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer, hard=True)
             old_hard_neg_number = new_hard_neg_number
         train_data_loader, val_data_loader = get_dataloaders(updated_list)
         model_saved_epoch = epoch + 1
-    writer.add_scalars('Training Curves', {'Train Loss': train_loss_per_epoch, 'Validation Loss': val_loss_per_epoch}, epoch)
diff --git a/src/models.py b/src/models.py
index 6a72296..c33f0b6 100644
--- a/src/models.py
+++ b/src/models.py
@@ -48,8 +48,6 @@ def __init__(self, in_features, out_features, hidden_layers=[1024, 512, 256, 128]):
         self.hidden_layers = hidden_layers
         self.layers = nn.ModuleList()
         current_dim = self.in_features
-        # if self.hidden_layers[-1] != out_features:
-        #     print('[INFO] Last hidden layer output and final layer output is different.')
         for hidden_dim in self.hidden_layers:
             self.layers.append(nn.Linear(current_dim, hidden_dim))
             current_dim = hidden_dim
diff --git a/src/online_triplets.py b/src/triplets.py
similarity index 100%
rename from src/online_triplets.py
rename to src/triplets.py
diff --git a/src/word_rec_EmbedNet.py b/src/word_rec_EmbedNet.py
index c389a8e..b416ea5 100644
--- a/src/word_rec_EmbedNet.py
+++ b/src/word_rec_EmbedNet.py
@@ -158,16 +158,3 @@ def get_EmbedNet_embed(input_embedding):
 accuracyList = [round(item, 3) for item in accuracyList]
 print('[INFO] Top {} accuracies are: {}.'.format(len(accuracyList), accuracyList))
 print('[INFO] Number of words tested on {}.'.format(total))
-
-# Command using for generating final new results (02/12/20)
-# python3 src/word_rec_EmbedNet.py --image_embeds embeddings/topk_preds_100featsImg.npy --topk_embeds embeddings/topk_preds_100featsSynth.npy --predictions_file gen_files/top_preds_embeds_100_with_conf.txt --image_file gen_files/image_embed_top_k_100.txt --use_model --model_path /ssd_scratch/cvit/sid/WNet1AdamLR000001EXTOnGen1MarginNoConfidence240620.pkl --hidden_layers 1024 --test_split 1 --testing
-# Command updated on 03/12/20
-# python3 src/word_rec_EmbedNet.py --use_model --hidden_layers 1024
-# Command for running baseline model
-# python3 src/word_rec_EmbedNet.py
-# Command for running model using the confidence scores
-# python3 src/word_rec_EmbedNet.py --use_confidence
-# Command for running model using the EmbedNet
-# python3 src/word_rec_EmbedNet.py --use_confidence --use_model --hidden_layers 1024
-# Command for running model using EmbedNet and CAB
-# python3 src/word_rec_EmbedNet.py --use_confidence --use_model --hidden_layers 1024 --cab