diff --git a/.gitignore b/.gitignore
index bf6cca2..37733e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
.DS_Store
.vscode/
*/__pycache__/*
+*.pyc
diff --git a/README.md b/README.md
index 078f91a..6f052b6 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,18 @@ The deep embeddings used in this work are generated using the End2End network pr
```
Krishnan, P., Dutta, K., Jawahar, C.V.: Word spotting and recognition using deep embedding. In: 2018 13th IAPR International Workshop on Document Analysis Systems (DAS). pp. 1–6 (April 2018). https://doi.org/10.1109/DAS.2018.70
```
-Word text and image's deep embeddings for testing this repository are provided in the ```embeddings``` folder.
+Deep embeddings of the word text and images for testing this repository are provided in the ``embeddings`` folder.
Text files containing the information about the embeddings are required while running the code. They are in the format
-```1```
-```1```
-...
-Corresponding text files for testing this repository are provided in the ``gen_files`` folder.
+```
+1
+1
+...
+```
+One can use [https://github.com/kris314/hwnet](https://github.com/kris314/hwnet) to generate the embeddings.
+
+To make it easier to explore the code in this repository, sample text files and embeddings are provided in the ``gen_files`` and ``embeddings`` folders, respectively.
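+
+As a quick sanity check, the sample embeddings and their text files can be loaded as follows (a minimal sketch; it assumes each line of a text file carries the word in its second whitespace-separated column, as expected by the training script):
+```python
+import numpy as np
+
+# Load the sample image embeddings and the Top-K prediction embeddings
+image_embeds = np.load('embeddings/topk_preds_100featsImg.npy')
+topk_embeds = np.load('embeddings/topk_preds_100featsSynth.npy')
+
+# Keep the word from the second column of each line
+with open('gen_files/image_embed_top_k_100.txt', 'r') as f:
+    image_words = [line.split()[1] for line in f.readlines()]
+
+print(image_embeds.shape, topk_embeds.shape, len(image_words))
+```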
+
+The original dataset used in this work will be released by [CVIT](http://cvit.iiit.ac.in) soon.
### Performing word recognition (using a pre-trained EmbedNet)
Pre-trained EmbedNet models are saved in the ``models`` folder.
@@ -102,7 +108,47 @@ Other arguments for word recognition experiment are:
- `k` total number of predictions to test on (max 20)
### Training EmbedNet
-TODO
+Prepare the text files and embeddings as described in [Generating/using deep embeddings](#generatingusing-deep-embeddings). Refer to the files in the ``gen_files`` folder for examples of the text files. Once the embeddings are prepared, run the following command
+```sh
+python src/EmbedNet_train.py --model_name provide_a_name_of_your_choice
+```
+As a demonstration, you can run the following command
+```sh
+python src/EmbedNet_train.py --model_name temp
+```
+This will start training an EmbedNet for 1000 epochs and save the models in `trained/EmbedNet_models`.
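+
+Once training is complete, a saved checkpoint can be evaluated with the word recognition script described above; for example (the checkpoint name is illustrative, and ``--hidden_layers`` must match the architecture used during training):
+```sh
+python src/word_rec_EmbedNet.py --use_model --model_path trained/EmbedNet_models/temp.pkl --hidden_layers 1024 512 256 128
+```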
+
+Other arguments for `EmbedNet_train.py` are:
+```sh
+--base_dir
+--model_dir
+--train_percentage
+--epochs
+--lr
+--batch
+--model_name
+--margin
+--hidden_layers
+--gpu_id
+--image_embeds
+--topk_embeds
+--image_file
+--predictions_file
+```
+- `base_dir` path to the directory for saving models
+- `model_dir` name of the folder for saving trained models
+- `train_percentage` percentage of data to use for training
+- `epochs` number of epochs to train for
+- `lr` learning rate
+- `batch` batch size
+- `model_name` name under which to save the model
+- `margin` triplet loss margin
+- `hidden_layers` list of input sizes of the hidden layers
+- `gpu_id` ID of the GPU to use
+- `image_embeds` path to the image embeddings
+- `topk_embeds` path to the Top-K predictions' embeddings
+- `image_file` path to the file containing the word images' text information
+- `predictions_file` path to the file containing the Top-K predictions' text information
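+
+For example, a training run overriding some of these defaults might look like this (all values shown are illustrative):
+```sh
+python src/EmbedNet_train.py --model_name custom_run --epochs 500 --lr 0.0001 --batch 64 --margin 1.0 --hidden_layers 1024 512 256 128 --gpu_id 0
+```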
License and Citation
---------------------
diff --git a/src/EmbedNet_dataprep.py b/src/EmbedNet_dataprep.py
deleted file mode 100644
index a7aa754..0000000
--- a/src/EmbedNet_dataprep.py
+++ /dev/null
@@ -1,152 +0,0 @@
-"""
-This code is used for preparing the Triplet dataset for EmbedNet
-"""
-# Standard imports
-import os
-import pdb
-import pickle
-import argparse
-
-# Third party imports
-import numpy as np
-from tqdm import tqdm
-import Levenshtein as lev
-
-parser = argparse.ArgumentParser(description='Data Preperation for deep word recognition')
-
-# Arguments for text and embeddings path
-parser.add_argument('--image_embeds', default='/ssd_scratch/cvit/sid/embeddings/topk_preds_1500featsImg.npy', help='Path to the image embeddings')
-parser.add_argument('--text_embeds', default='/ssd_scratch/cvit/sid/embeddings/topk_preds_1500featsSynth.npy', help='Path to the text embeddigns')
-parser.add_argument('--image_info', default='/ssd_scratch/cvit/sid/image_embed_top_k_1500.txt', help='Path to the file containing word image information')
-parser.add_argument('--text_info', default='/ssd_scratch/cvit/sid/top_preds_embeds_with_confidence_1500.txt', help='Path to the file containing text output information')
-
-# model path and name arguments
-parser.add_argument('--base_path', default='/ssd_scratch/cvit/sid/', help='Path to the base directory where the training and testing data is stored')
-parser.add_argument('--file_name', default='EmbedNet_data', help='Name of the data file')
-
-# Training and testing split flag
-parser.add_argument('--train_percent', default=0.8, type=float, help='Percent of train data')
-parser.add_argument('--semi_hard', default=False, action='store_true', help='If True semi-hard examples will also be included')
-parser.add_argument('--save', default=False, action='store_true', help='If true data will be saved in ssd_scratch')
-
-args = parser.parse_args()
-print(args)
-
-data_path = args.base_path + args.file_name
-
-print('[INFO] Loading embeddings and text files...')
-image_embeds = np.load(args.image_embeds)
-try:
- topk_embeds = np.load(args.text_embeds)
-except Exception as e:
- print('[INFO] Loading text embeddings in memmap mode...')
- topk_embeds = np.memmap(args.text_embeds, dtype=np.float32, mode='r', shape=(2109500, 2048))
-
-with open(args.image_info, 'r') as image_file:
- image_info = image_file.readlines()
-image_info = [item.split()[1] for item in image_info]
-
-with open(args.text_info, 'r') as text_file:
- text_info = text_file.readlines()
-text_info = [item.split()[1] for item in text_info]
-
-# This piece is for handling text files with more data as compared to the numpy files
-# text_info = text_info[:topk_embeds.shape[0]]
-# image_info = image_info[:image_embeds.shape[0]]
-
-# # Getting count of number of words in training set
-split_count = int(args.train_percent * len(image_info))
-image_info = image_info[:split_count]
-text_info = text_info[:split_count*20]
-image_embeds = image_embeds[:split_count]
-topk_embeds = topk_embeds[:split_count*20]
-
-text_dict = dict()
-embeds_dict = dict()
-ko = 0
-k = 20
-"""Text Dictionary is in the form
-{'word':[([top_20_preds],[lev_dist]), (..., ...), ...], ...}
-Embedding dictionary is in the form
-{'word': {'image_embeds': [all image_embeds occurances], 'text_embeds': [[top_20_text_embeds], [top_20_text_embeds], ...]}, ...}
-"""
-for word in tqdm(image_info, desc='[INFO] Text Dict'):
- if word not in text_dict.keys():
- text_dict[word] = [(text_info[ko: k], [lev.distance(word, item) for item in text_info[ko: k]])]
- else:
- text_dict[word].append((text_info[ko: k], [lev.distance(word, item) for item in text_info[ko: k]]))
- ko = k
- k += 20
-
-ko = 0
-k = 20
-for count, image_embed in enumerate(tqdm(image_embeds, desc='[INFO] Embeds Dict')):
- word = image_info[count]
- if word not in embeds_dict.keys():
- embeds_dict[word] = {'image_embeds': [image_embed], 'text_embeds': [topk_embeds[ko: k]]}
- else:
- embeds_dict[word]['image_embeds'].append(image_embed)
- embeds_dict[word]['text_embeds'].append(topk_embeds[ko: k])
- ko = k
- k += 20
-
-final_list = list()
-for word in tqdm(text_dict.keys(), desc='[INFO] Data Prep'):
- predictions = text_dict[word]
- image_embeddings, text_embeddings = embeds_dict[word]['image_embeds'], np.array(embeds_dict[word]['text_embeds'])
- for instance_count, single_instance in enumerate(predictions):
- top20_preds, top20_edit_dist = single_instance[0], single_instance[1]
- instance_text_embeds = text_embeddings[instance_count]
- anchor = image_embeddings[instance_count]
- positive = None
- negative_list = list()
- if args.semi_hard:
- semi_negative_list = list()
- for count, pred in enumerate(top20_preds):
- if word == pred:
- positive = instance_text_embeds[count]
- else:
- if not args.semi_hard:
- negative_list.append(instance_text_embeds[count])
- if args.semi_hard:
- condition = True
- no_inf = 1000
- while condition and no_inf != 0:
- random_num = np.random.randint(low=1, high=len(topk_embeds))
- random_embedding = topk_embeds[random_num]
- if np.linalg.norm(anchor - random_embedding) > 0.4:
- condition = False
- no_inf -= 1
- semi_negative_list.append(random_embedding)
- if args.semi_hard:
- for semi_hard_neg_embed in semi_negative_list:
- if positive is None:
- pass
- else:
- final_list.append({'anchor': anchor, 'positive': positive, 'negative': np.array(semi_hard_neg_embed)})
- else:
- for negative_embeds in negative_list:
- if positive is None: # There are a few cases when the OCR even fails to predics in Top20 predicitons
- pass
- else:
- final_list.append({'anchor': anchor, 'positive': positive, 'negative': np.array(negative_embeds)})
-
-def check(final_list):
- positive_distance = list()
- negative_distance = list()
- for sample in tqdm(final_list, desc='[INFO] Checking'):
- anchor = sample['anchor']
- positive = sample['positive']
- negative = sample['negative']
- try:
- positive_distance.append(np.linalg.norm(anchor - positive))
- negative_distance.append(np.linalg.norm(anchor - negative))
- except Exception as e:
- print(e)
- pdb.set_trace()
- print('[INFO] Mean distance of anchors with positive pairs is {} Max {} Min {}.\n[INFO] Mean distance of anchor with negative pairs is {} Max {} Min {}.'.format(np.mean(positive_distance), np.max(positive_distance), np.min(positive_distance), np.mean(negative_distance), np.max(negative_distance), np.min(negative_distance)))
-
-check(final_list)
-if args.save:
- pickle.dump(final_list, open(data_path, 'wb'))
- print('[INFO] Total number of triples generated: {}\n[INFO] Pickle file saved at {}'.format(len(final_list), data_path))
diff --git a/src/EmbedNet.py b/src/EmbedNet_train.py
similarity index 67%
rename from src/EmbedNet.py
rename to src/EmbedNet_train.py
index 276f72b..4307930 100644
--- a/src/EmbedNet.py
+++ b/src/EmbedNet_train.py
@@ -15,18 +15,18 @@
import torch.nn as nn
from tqdm import tqdm
from models import EmbedNet
-from online_triplets import Triplets
+from triplets import Triplets
import torch.nn.functional as F
import torch.utils.data as data
from torch.utils.data import DataLoader
-from torch.utils.tensorboard import SummaryWriter
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
parser = argparse.ArgumentParser(description='Neural Networks for word recognition')
# File paths and directory names
-parser.add_argument('--base_dir', default='/ssd_scratch/cvit/sid/', help='Path to the directory for saving models')
+parser.add_argument('--base_dir', default='trained', help='Path to the directory for saving models')
+parser.add_argument('--model_dir', default='EmbedNet_models', help='Name of the folder for saving trained models')
# Various model hyperparameters
parser.add_argument('--train_percentage', type=float, default=0.8, help='Percentage of data to use for training')
@@ -38,39 +38,35 @@
parser.add_argument('--hidden_layers', nargs='+', type=int, default=[1024, 512, 256, 128], help='List of input size of the hidden layers')
parser.add_argument('--gpu_id', default=0, type=int, help='Specify which GPU to use')
-parser.add_argument('--image_embeds', default='/ssd_scratch/cvit/sid/embeddings/image_embeds_top_k_allfeatsImg.npy', help='Path to the image embeddings')
-parser.add_argument('--text_embeds', default='/ssd_scratch/cvit/sid/embeddings/top_preds_embeds_all_featsSynth.dat', help='Path to the text embeddigns')
-parser.add_argument('--image_info', default='/ssd_scratch/cvit/sid/EmbedGenFiles/image_embed_top_k_all.txt', help='Path to the file containing word image information')
-parser.add_argument('--text_info', default='/ssd_scratch/cvit/sid/EmbedGenFiles/top_preds_embeds_all_with_confidence.txt', help='Path to the file containing text output information')
+parser.add_argument('--image_embeds', default='embeddings/topk_preds_100featsImg.npy', help='Path to the image embeddings')
+parser.add_argument('--topk_embeds', default='embeddings/topk_preds_100featsSynth.npy', help='Path to the text embeddings')
+parser.add_argument('--image_file', default='gen_files/image_embed_top_k_100.txt', help='Path to the file containing word image information')
+parser.add_argument('--predictions_file', default='gen_files/top_preds_embeds_100_with_conf.txt', help='Path to the file containing text output information')
args = parser.parse_args()
print(args)
print('[INFO] Loading embeddings and text files...')
image_embeds = np.load(args.image_embeds)
try:
- topk_embeds = np.load(args.text_embeds)
+ topk_embeds = np.load(args.topk_embeds)
except Exception as e:
print('[INFO] Loading text embeddings in memmap mode...')
- topk_embeds = np.memmap(args.text_embeds, dtype=np.float32, mode='r', shape=(2109500, 2048))
+ topk_embeds = np.memmap(args.topk_embeds, dtype=np.float32, mode='r', shape=(2109500, 2048))
-with open(args.image_info, 'r') as image_file:
- image_info = image_file.readlines()
-image_info = [item.split()[1] for item in image_info]
+with open(args.image_file, 'r') as f:
+    image_words = f.readlines()
+image_words = [item.split()[1] for item in image_words]
-with open(args.text_info, 'r') as text_file:
- text_info = text_file.readlines()
-topk_info = [item.split()[1] for item in text_info]
+with open(args.predictions_file, 'r') as f:
+    predictions_info = f.readlines()
+topk_info = [item.split()[1] for item in predictions_info]
assert args.model_name, "Provide a model name for proceeding"
epochs = args.epochs
lr = args.lr
-writer_path = 'logs/' + args.model_name
-writer = SummaryWriter(writer_path)
-model_dir = 'EmbedNet/EmbedNet_models'
-train_list_dir = 'EmbedNet'
-assert os.path.exists(os.path.join(args.base_dir, train_list_dir)), "Train data directory does not exists, create one using data_prep.py"
+model_dir = args.model_dir
if not os.path.exists(os.path.join(args.base_dir, model_dir)):
- os.mkdir(os.path.join(args.base_dir, model_dir))
+ os.makedirs(os.path.join(args.base_dir, model_dir))
if torch.cuda.device_count() > 1:
torch.cuda.set_device(args.gpu_id)
@@ -115,22 +111,7 @@ def get_dataloaders(train_list):
return train_data_loader, val_data_loader
-def calculate_accuracy(model_path):
- print("[INFO] Calculating current model's accuracy...")
- temp_model_path = os.path.join(model_path, args.model_name + '_temp.pkl')
- hidden_string = str(args.hidden_layers).replace(',', ' ').replace('[', '').replace(']', '')
- try:
- command = 'python parallel_word_rec_EmbedNet.py --use_model --hidden_layers {} --model_path {} --testing --test_split 0.75858 > {}.txt'.format(hidden_string, temp_model_path, args.model_name)
- except Exception as e:
- print(e)
- pdb.set_trace()
- os.system(command)
- data = open('{}.txt'.format(args.model_name), 'r').readlines()
- accuracy = data[0].split()[-1]
- return float(accuracy)
-
-
-triplet = Triplets(topk_info, image_info, topk_embeds, image_embeds, args.train_percentage, args.margin, verbose=True)
+triplet = Triplets(topk_info, image_words, topk_embeds, image_embeds, args.train_percentage, args.margin, verbose=True)
train_list = triplet.initial_list()
train_data_loader, val_data_loader = get_dataloaders(train_list)
@@ -176,6 +157,8 @@ def calculate_accuracy(model_path):
anchor = anchor.cuda().double()
positive = positive.cuda().double()
negative = negative.cuda().double()
+        else:
+            model = model.double()  # CPU fallback: cast the model to double precision
model.zero_grad()
anchor_ = model(anchor)
positive_ = model(positive)
@@ -185,8 +168,6 @@ def calculate_accuracy(model_path):
tr_loss.backward()
optimizer.step()
train_loss_per_epoch += float(tr_loss)
- writer.add_scalar('Train Loss/Batch', float(tr_loss), train_batch_count)
- writer.add_scalar('Train Loss/Epoch', train_loss_per_epoch, epoch)
for data_point in tqdm(val_data_loader, desc='[INFO] Validation'):
validation_batch_count += 1
anchor = data_point['anchor']
@@ -203,26 +184,15 @@ def calculate_accuracy(model_path):
negative_ = model(negative)
val_loss = criterion(anchor_, positive_, negative_)
val_loss_per_epoch += float(val_loss)
- writer.add_scalar('Validation Loss/Batch', float0(val_loss), validation_batch_count)
- writer.add_scalar('Validation Loss/Epoch', val_loss_per_epoch, epoch)
- # Saving model based on the current accuracy
- save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer, temp=True)
- updated_accuracy = calculate_accuracy(os.path.join(args.base_dir, model_dir))
- writer.add_scalar('Word Accuracy/Epoch', updated_accuracy, epoch)
- if updated_accuracy > accuracy:
- accuracy = updated_accuracy
- save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer, accuracy=True)
- model_saved_epoch = epoch + 1
if val_loss_per_epoch < base_valid:
base_valid = val_loss_per_epoch
save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer)
- print('[INFO] Train Loss {}, validation loss {} accuracy {}.'.format(round(train_loss_per_epoch, 3), round(val_loss_per_epoch, 3), round(updated_accuracy, 3)))
+ print('[INFO] Train Loss {}, validation loss {}.'.format(round(train_loss_per_epoch, 3), round(val_loss_per_epoch, 3)))
if (epoch + 1) - model_saved_epoch >= 5:
print('[INFO] Updating the train and validation list...')
- updated_list, new_hard_neg_number = triplet.EmbedNet_embeds(model, 128)
+            updated_list, new_hard_neg_number = triplet.EmbedNet_embeds(model, 128)
if new_hard_neg_number < old_hard_neg_number:
save_checkpoint(os.path.join(args.base_dir, model_dir), epoch, model, optimizer, hard=True)
old_hard_neg_number = new_hard_neg_number
train_data_loader, val_data_loader = get_dataloaders(updated_list)
model_saved_epoch = epoch + 1
- writer.add_scalars('Training Curves', {'Train Loss': train_loss_per_epoch, 'Validation Loss': val_loss_per_epoch}, epoch)
diff --git a/src/models.py b/src/models.py
index 6a72296..c33f0b6 100644
--- a/src/models.py
+++ b/src/models.py
@@ -48,8 +48,6 @@ def __init__(self, in_features, out_features, hidden_layers=[1024, 512, 256, 128
self.hidden_layers = hidden_layers
self.layers = nn.ModuleList()
current_dim = self.in_features
- # if self.hidden_layers[-1] != out_features:
- # print('[INFO] Last hidden layer output and final layer output is different.')
for hidden_dim in self.hidden_layers:
self.layers.append(nn.Linear(current_dim, hidden_dim))
current_dim = hidden_dim
diff --git a/src/online_triplets.py b/src/triplets.py
similarity index 100%
rename from src/online_triplets.py
rename to src/triplets.py
diff --git a/src/word_rec_EmbedNet.py b/src/word_rec_EmbedNet.py
index c389a8e..b416ea5 100644
--- a/src/word_rec_EmbedNet.py
+++ b/src/word_rec_EmbedNet.py
@@ -158,16 +158,3 @@ def get_EmbedNet_embed(input_embedding):
accuracyList = [round(item, 3) for item in accuracyList]
print('[INFO] Top {} accuracies are: {}.'.format(len(accuracyList), accuracyList))
print('[INFO] Number of words tested on {}.'.format(total))
-
-# Command using for generating final new results (02/12/20)
-# python3 src/word_rec_EmbedNet.py --image_embeds embeddings/topk_preds_100featsImg.npy --topk_embeds embeddings/topk_preds_100featsSynth.npy --predictions_file gen_files/top_preds_embeds_100_with_conf.txt --image_file gen_files/image_embed_top_k_100.txt --use_model --model_path /ssd_scratch/cvit/sid/WNet1AdamLR000001EXTOnGen1MarginNoConfidence240620.pkl --hidden_layers 1024 --test_split 1 --testing
-# Command updated on 03/12/20
-# python3 src/word_rec_EmbedNet.py --use_model --hidden_layers 1024
-# Command for running baseline model
-# python3 src/word_rec_EmbedNet.py
-# Command for running model using the confidence scores
-# python3 src/word_rec_EmbedNet.py --use_confidence
-# Command for running model using the EmbedNet
-# python3 src/word_rec_EmbedNet.py --use_confidence --use_model --hidden_layers 1024
-# Command for running model using EmbedNet and CAB
-# python3 src/word_rec_EmbedNet.py --use_confidence --use_model --hidden_layers 1024 --cab