
Commit 3eb93b3 ("release")
Parent: aa8f1eb

28 files changed: +2937 / -0 lines

.gitignore (+4 lines)

@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# for this repo
+data/*
+result/*

assets/result.png (binary file added, 147 KB)

datasets/__init__.py (new file, +1 line)

from .loader import get_uea_loader, get_tabular_loader, get_img_loader
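The package __init__ re-exports the three loader factories, so a training script can dispatch on opt.problem without importing datasets.loader directly. A minimal, hypothetical dispatch sketch (the opt namespace of options such as problem, device and batch_size is assumed to come from the repo's option parser and is not part of this commit):

# Hypothetical dispatch sketch; `opt` is defined elsewhere in the repo (not in this diff).
import datasets

loader_by_problem = {
    'gas': datasets.get_tabular_loader,
    'miniboone': datasets.get_tabular_loader,
    'mnist': datasets.get_img_loader,
    'CharT': datasets.get_uea_loader,
}
train_loader, test_loader = loader_by_problem[opt.problem](opt)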

datasets/loader.py (new file, +143 lines)

import numpy as np
import torch

from torchvision import transforms
from prefetch_generator import BackgroundGenerator
import util

import torchvision.datasets as torch_data
from .time_series import uea as uea_data
from .tabular import maf as maf_data


def _gen_mini_dataset(dataset, dataset_ratio):
    # Randomly subsample a fraction (dataset_ratio) of the rows without replacement.
    n_dataset = dataset.shape[0]
    n_mini_dataset = int(dataset_ratio * n_dataset)
    s = torch.from_numpy(np.random.choice(
        np.arange(n_dataset, dtype=np.int64), n_mini_dataset, replace=False)
    )
    return dataset[s]


class DataLoaderX(torch.utils.data.DataLoader):
    # DataLoader whose iterator prefetches batches in a background thread.
    def __iter__(self):
        return BackgroundGenerator(super().__iter__())


class TabularLoader:
    # Minibatch iterator over a tabular dataset that is kept entirely on opt.device.
    def __init__(self, opt, data, batch_size=None, shuffle=True):

        self.data_size = data.shape[0]
        self.opt = opt
        self.device = opt.device

        self.data = data.to(opt.device)
        self.batch_size = opt.batch_size if batch_size is None else batch_size
        self.shuffle = shuffle

        self.input_dim = data.shape[-1]
        self.output_dim = [data.shape[-1]]

        loc = torch.zeros(data.shape[-1]).to(opt.device)
        covariance_matrix = torch.eye(data.shape[-1]).to(opt.device)  # TODO(Guan) scale down the cov?
        self.p_z0 = torch.distributions.MultivariateNormal(loc=loc, covariance_matrix=covariance_matrix)
        self._reset_idxs()
        self.data_size = len(self.idxs_by_batch_size)  # number of batches per epoch

    def _reset_idxs(self):
        idxs = torch.randperm(self.data.shape[0]) if self.shuffle else torch.arange(self.data.shape[0])
        self.idxs_by_batch_size = idxs.split(self.batch_size)
        self.batch_idx = 0

    def __len__(self):
        return self.data_size

    def __iter__(self):
        return self

    def __next__(self):
        if self.batch_idx >= len(self.idxs_by_batch_size):
            self._reset_idxs()
            raise StopIteration

        s = self.idxs_by_batch_size[self.batch_idx]
        self.batch_idx += 1
        x = self.data[s]
        logp_diff_t1 = torch.zeros(x.shape[0], 1, device=x.device)
        return (x, logp_diff_t1), self.p_z0


def get_uea_loader(opt):

    print(util.magenta("loading uea data..."))

    dataset_name = {
        'CharT': 'CharacterTrajectories',
        'ArtWR': 'ArticularyWordRecognition',
        'SpoAD': 'SpokenArabicDigits',
    }.get(opt.problem)

    missing_rate = 0.0
    device = opt.device
    intensity_data = True

    (times, train_dataloader, val_dataloader,
     test_dataloader, num_classes, input_channels) = uea_data.get_data(dataset_name, missing_rate, device,
                                                                       intensity=intensity_data,
                                                                       batch_size=opt.batch_size)

    # we'll return dataloader and store the rest in opt
    opt.times = times
    opt.output_dim = num_classes
    opt.input_dim = input_channels
    return train_dataloader, test_dataloader


def get_tabular_loader(opt, test_batch_size=1000):
    assert opt.problem in ['gas', 'miniboone']
    print(util.magenta("loading tabular data..."))

    data = maf_data.get_data(opt.problem)
    data.trn.x = torch.from_numpy(data.trn.x)
    data.val.x = torch.from_numpy(data.val.x)
    data.tst.x = torch.from_numpy(data.tst.x)

    if opt.dataset_ratio < 1.0:
        data.trn.x = _gen_mini_dataset(data.trn.x, opt.dataset_ratio)
        data.val.x = _gen_mini_dataset(data.val.x, opt.dataset_ratio)
        data.tst.x = _gen_mini_dataset(data.tst.x, opt.dataset_ratio)

    train_loader = TabularLoader(opt, data.trn.x, shuffle=True)
    val_loader = TabularLoader(opt, data.val.x, batch_size=test_batch_size, shuffle=False)
    test_loader = TabularLoader(opt, data.tst.x, batch_size=test_batch_size, shuffle=False)

    opt.input_dim = train_loader.input_dim
    opt.output_dim = train_loader.output_dim

    return train_loader, test_loader


def get_img_loader(opt, test_batch_size=1000):
    print(util.magenta("loading image data..."))

    dataset_builder, root, input_dim, output_dim = {
        'mnist':   [torch_data.MNIST,   'data/img/mnist',   [1, 28, 28], 10],
        'SVHN':    [torch_data.SVHN,    'data/img/svhn',    [3, 32, 32], 10],
        'cifar10': [torch_data.CIFAR10, 'data/img/cifar10', [3, 32, 32], 10],
    }.get(opt.problem)
    opt.input_dim = input_dim
    opt.output_dim = output_dim

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    feed_dict = dict(download=True, root=root, transform=transform)
    train_dataset = dataset_builder(**feed_dict) if opt.problem == 'SVHN' else dataset_builder(train=True, **feed_dict)
    test_dataset = dataset_builder(**feed_dict) if opt.problem == 'SVHN' else dataset_builder(train=False, **feed_dict)

    feed_dict = dict(num_workers=2, drop_last=True)
    train_loader = DataLoaderX(train_dataset, batch_size=opt.batch_size, shuffle=True, **feed_dict)
    test_loader = DataLoaderX(test_dataset, batch_size=test_batch_size, shuffle=False, **feed_dict)

    return train_loader, test_loader
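For orientation, here is a hedged sketch of how TabularLoader batches would be consumed in a training loop. Each iteration yields ((x, logp_diff_t1), p_z0): a batch of rows already on opt.device, an all-zeros initial log-density change, and the standard-normal base distribution. The model that turns these into a likelihood is not part of this commit and is only assumed below:

# Hypothetical usage sketch (not part of this commit).
# Assumes `opt` provides problem='gas', device, batch_size and dataset_ratio.
from datasets import get_tabular_loader

train_loader, test_loader = get_tabular_loader(opt)

for (x, logp_diff_t1), p_z0 in train_loader:        # one pass over the loader = one epoch
    # x:            [batch_size, input_dim] tensor on opt.device
    # logp_diff_t1: [batch_size, 1] zeros, the initial log-det accumulator
    # p_z0:         MultivariateNormal base distribution over input_dim dimensions
    ...  # run the flow and compute the log-likelihood with p_z0.log_prob(...)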

datasets/tabular/__init__.py

Whitespace-only changes.

datasets/tabular/gas.py (new file, +70 lines)

import pandas as pd
import numpy as np


class GAS:

    class Data:

        def __init__(self, data):

            self.x = data.astype(np.float32)
            self.N = self.x.shape[0]

    def __init__(self, path):

        file = path / 'ethylene_CO.pickle'
        trn, val, tst = load_data_and_clean_and_split(file)

        self.trn = self.Data(trn)
        self.val = self.Data(val)
        self.tst = self.Data(tst)

        self.n_dims = self.trn.x.shape[1]


def load_data(file):

    data = pd.read_pickle(file)
    # data = pd.read_pickle(file).sample(frac=0.25)
    # data.to_pickle(file)
    data.drop("Meth", axis=1, inplace=True)
    data.drop("Eth", axis=1, inplace=True)
    data.drop("Time", axis=1, inplace=True)
    return data


def get_correlation_numbers(data):
    C = data.corr()
    A = C > 0.98
    B = A.to_numpy().sum(axis=1)
    return B


def load_data_and_clean(file):

    data = load_data(file)
    B = get_correlation_numbers(data)

    while np.any(B > 1):
        col_to_remove = np.where(B > 1)[0][0]
        col_name = data.columns[col_to_remove]
        data.drop(col_name, axis=1, inplace=True)
        B = get_correlation_numbers(data)
    # print(data.corr())
    data = (data - data.mean()) / data.std()

    return data


def load_data_and_clean_and_split(file):

    data = load_data_and_clean(file).to_numpy()
    N_test = int(0.1 * data.shape[0])
    data_test = data[-N_test:]
    data_train = data[0:-N_test]
    N_validate = int(0.1 * data_train.shape[0])
    data_validate = data_train[-N_validate:]
    data_train = data_train[0:-N_validate]

    return data_train, data_validate, data_test
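The GAS preprocessing drops the Meth, Eth and Time columns, iteratively removes any column whose correlation with another column exceeds 0.98, standardizes the result, and splits off 10% of the rows for test and 10% of the remainder for validation. A hedged construction sketch, assuming the pickle has been extracted to data/maf/gas/ by maf.download():

# Hypothetical usage; the path mirrors the layout created by datasets.tabular.maf.download().
import pathlib
from datasets.tabular.gas import GAS

gas = GAS(pathlib.Path('data/maf/gas'))
print(gas.n_dims)                         # surviving feature count after decorrelation
print(gas.trn.N, gas.val.N, gas.tst.N)    # roughly an 81% / 9% / 10% row split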

datasets/tabular/maf.py (new file, +63 lines)

import collections as co
import numpy as np
import os
import pathlib
import sktime.utils.load_data
import torch
import urllib.request
import tarfile

from .gas import GAS
from .miniboone import MINIBOONE


here = pathlib.Path(__file__).resolve().parent.parent.parent

def download():
    base_base_loc = here / 'data'
    base_loc = base_base_loc / 'maf'
    loc = base_loc / 'maf.tar.gz'
    if os.path.exists(loc):
        return
    if not os.path.exists(base_base_loc):
        os.mkdir(base_base_loc)
    if not os.path.exists(base_loc):
        os.mkdir(base_loc)

    print('download from https://zenodo.org/record/1161203/files/data.tar.gz .....')
    urllib.request.urlretrieve('https://zenodo.org/record/1161203/files/data.tar.gz',
                               str(loc))

    def gas(tar):
        l = len("data/")
        for member in tar.getmembers():
            if member.path.startswith("data/gas"):
                member.path = member.path[l:]
                yield member

    def miniboone(tar):
        l = len("data/")
        for member in tar.getmembers():
            if member.path.startswith("data/miniboone"):
                member.path = member.path[l:]
                yield member

    with tarfile.open(loc, "r:gz") as tar:
        # tar.extractall(path=base_loc)  # <---- TODO(Guan) use this if you wish to extract all datasets.
        tar.extractall(path=base_loc, members=gas(tar))
        tar.extractall(path=base_loc, members=miniboone(tar))

def get_data(dataset_name):

    base_base_loc = here / 'data'
    base_loc = base_base_loc / 'maf'
    loc = base_loc / dataset_name

    if not os.path.exists(loc):
        download()

    return {
        'gas': GAS,
        'miniboone': MINIBOONE
    }.get(dataset_name)(loc)
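get_data resolves the dataset directory under data/maf/, triggers the Zenodo download and a partial extraction (only the gas and miniboone members of the archive) if that directory is missing, and then constructs the matching dataset class. A hedged sketch of the call path:

# Hypothetical usage; the first call downloads the MAF archive from Zenodo.
from datasets.tabular import maf

data = maf.get_data('miniboone')   # or 'gas'
print(data.n_dims)                                              # feature dimensionality
print(data.trn.x.shape, data.val.x.shape, data.tst.x.shape)     # float32 numpy splits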

datasets/tabular/miniboone.py (new file, +67 lines)

import numpy as np


class MINIBOONE:

    class Data:

        def __init__(self, data):

            self.x = data.astype(np.float32)
            self.N = self.x.shape[0]

    def __init__(self, path):

        file = path / 'data.npy'
        trn, val, tst = load_data_normalised(file)

        self.trn = self.Data(trn)
        self.val = self.Data(val)
        self.tst = self.Data(tst)

        self.n_dims = self.trn.x.shape[1]


def load_data(root_path):
    # NOTE: To remember how the pre-processing was done.
    # data = pd.read_csv(root_path, names=[str(x) for x in range(50)], delim_whitespace=True)
    # print data.head()
    # data = data.to_numpy()
    # # Remove some random outliers
    # indices = (data[:, 0] < -100)
    # data = data[~indices]
    #
    # i = 0
    # # Remove any features that have too many re-occurring real values.
    # features_to_remove = []
    # for feature in data.T:
    #     c = Counter(feature)
    #     max_count = np.array([v for k, v in sorted(c.iteritems())])[0]
    #     if max_count > 5:
    #         features_to_remove.append(i)
    #     i += 1
    # data = data[:, np.array([i for i in range(data.shape[1]) if i not in features_to_remove])]
    # np.save("~/data/miniboone/data.npy", data)

    data = np.load(root_path)
    N_test = int(0.1 * data.shape[0])
    data_test = data[-N_test:]
    data = data[0:-N_test]
    N_validate = int(0.1 * data.shape[0])
    data_validate = data[-N_validate:]
    data_train = data[0:-N_validate]

    return data_train, data_validate, data_test


def load_data_normalised(root_path):

    data_train, data_validate, data_test = load_data(root_path)
    data = np.vstack((data_train, data_validate))
    mu = data.mean(axis=0)
    s = data.std(axis=0)
    data_train = (data_train - mu) / s
    data_validate = (data_validate - mu) / s
    data_test = (data_test - mu) / s

    return data_train, data_validate, data_test
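Note that the normalization statistics (mu, s) are computed from the train and validation splits stacked together and then applied to all three splits, so the test set is standardized with statistics it did not contribute to. A hedged standalone sketch, assuming data/maf/miniboone/data.npy exists after maf.download():

# Hypothetical usage; the path mirrors the layout created by datasets.tabular.maf.download().
import pathlib
from datasets.tabular.miniboone import MINIBOONE

mb = MINIBOONE(pathlib.Path('data/maf/miniboone'))
print(mb.n_dims, mb.trn.N, mb.val.N, mb.tst.N)
print(abs(mb.trn.x.mean()))   # close to 0 after normalization with train+val statistics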

datasets/time_series/__init__.py

Whitespace-only changes.
