-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
97 lines (73 loc) · 2.64 KB
/
data_loader.py
File metadata and controls
97 lines (73 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from os.path import join
import pandas as pd
import spacy
from PIL import Image
from torch import tensor
from torch.utils.data import Dataset
from torchvision import transforms
from utils import MAGIC_MU, MAGIC_SIGMA
class DatasetLoader(Dataset):
    """
    Image-captioning dataset backed by a captions CSV and an image folder.

    The CSV must provide an ``image`` column (file name, joined onto
    ``img_path``) and a ``caption`` column. Each item is an
    ``(image_tensor, caption_indices, image_name)`` triple.

    The vocabulary starts with only the four special tokens; until
    ``build_vocab`` is called every caption word is encoded as ``<UNK>``.
    """

    def __init__(self, img_path, captions_file, normalise=False, img_size=299, nb_img=None):
        """
        Read the captions file and prepare the image transform.

        :param img_path: directory containing the image files
        :param captions_file: path of the CSV file with ``image`` and ``caption`` columns
        :param normalise: if True, standardise each channel with MAGIC_MU / MAGIC_SIGMA
        :param img_size: images are resized to ``(img_size, img_size)``
        :param nb_img: if given, only the first ``nb_img * 5`` rows are kept
        """
        super().__init__()
        self.img_path = img_path
        self.normalise = normalise

        self.df = pd.read_csv(captions_file)
        if nb_img is not None:
            # NOTE(review): the factor 5 presumably reflects five captions
            # per image in the CSV - confirm against the dataset.
            self.df = self.df[:int(nb_img) * 5]

        self.imgs = self.df["image"]
        self.captions = self.df["caption"]

        self.spacy_en = spacy.load("en_core_web_sm")
        self.word2idx = {"<START>": 0, "<END>": 1, "<UNK>": 2, "<PAD>": 3}
        self.idx2word = {i: w for w, i in self.word2idx.items()}

        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),  # scale image to [0,1]
        ])

    def __len__(self):
        """Return the number of (image, caption) rows."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """
        Return the ``idx``-th sample.

        :param idx: positional row index
        :return: ``(img, caption, img_name)`` where ``img`` is the transformed
            image tensor, ``caption`` is a 1-D tensor of word indices wrapped
            in ``<START>`` / ``<END>``, and ``img_name`` is the file name
        """
        # Positional access keeps indexing consistent with __len__.
        caption = self.captions.iloc[idx]
        img_name = self.imgs.iloc[idx]

        # Image: force three channels so greyscale / RGBA / palette files do
        # not break the per-channel normalisation below, and close the file
        # handle as soon as the pixel data has been read.
        with Image.open(join(self.img_path, img_name)) as raw:
            img = raw.convert("RGB")
        img = self.transform(img)
        if self.normalise:
            for c in range(3):
                img[c] -= MAGIC_MU[c]
                img[c] /= MAGIC_SIGMA[c]

        # Captions
        caption = tensor([self.word2idx["<START>"]] +
                         self.tokenise(caption) +
                         [self.word2idx["<END>"]])
        return img, caption, img_name

    def _split_words(self, sentence):
        """Return the lower-cased token strings of ``sentence``."""
        return [str(token).lower() for token in self.spacy_en.tokenizer(sentence)]

    def build_vocab(self, freq_threshold=2):
        """
        Build the vocabulary from the captions.

        A token receives the next free index the moment its running count
        reaches ``freq_threshold``, so indices follow the order in which
        tokens become frequent. Tokens already present in the vocabulary are
        never re-indexed, which makes repeated calls safe.

        :param freq_threshold: number of occurrences a token needs to enter
            the vocabulary
        """
        curr_idx = len(self.word2idx)
        frequencies = {}
        for sentence in self.captions:
            for tok in self._split_words(sentence):
                frequencies[tok] = frequencies.get(tok, 0) + 1
                # Skip known tokens: re-assigning would leave stale entries
                # in idx2word and shift indices on a second call.
                if frequencies[tok] == freq_threshold and tok not in self.word2idx:
                    self.word2idx[tok] = curr_idx
                    self.idx2word[curr_idx] = tok
                    curr_idx += 1

    def tokenise(self, sentence):
        """
        Tokenise sentence.

        :param sentence: sentence to tokenise
        :return: list of vocabulary indices, with ``<UNK>`` for unknown words
        """
        unk = self.word2idx["<UNK>"]
        return [self.word2idx.get(tok, unk) for tok in self._split_words(sentence)]