|
| 1 | +"""Extract tsfresh features from the Handwriting dataset.""" |
| 2 | + |
| 3 | +import pickle |
| 4 | +from pathlib import Path |
| 5 | + |
| 6 | +import kagglehub |
| 7 | +import numpy as np |
| 8 | +import pandas as pd |
| 9 | +from aeon.datasets import load_classification |
| 10 | +from PIL import Image |
| 11 | +from tsfresh import extract_features |
| 12 | + |
# Directory into which the sampled alphabet PNG images are written.
PATH = "/nas/pohan/datasets/Handwriting/"
# Directory holding the long-format CSV cache and the pickled tsfresh features.
PATH_SAVE = "/nas/pohan/datasets/Handwriting/embeddings/"
| 15 | + |
| 16 | + |
def _load_alphabet_images() -> tuple[pd.DataFrame, pd.Series]:
    """Load the Kaggle A-Z handwritten alphabets CSV.

    Returns:
        data: every column after the first (one row per image).
        labels: the first column of the CSV.
    """
    # Downloads (or reuses the local copy of) the latest dataset version.
    dataset_dir = kagglehub.dataset_download(
        "sachinpatel21/az-handwritten-alphabets-in-csv-format"
    )
    df = pd.read_csv(Path(dataset_dir, "A_Z Handwritten Data.csv"))
    return df.iloc[:, 1:], df.iloc[:, 0]


def load_handwriting() -> (
    tuple[np.ndarray, np.ndarray, dict[str, str], list[Path]]
):
    """Load the Handwriting dataset via aeon and pair each sample with an image.

    The train and test splits are merged. For every sample, one handwritten
    image of the matching letter is picked at random from the Kaggle A-Z
    dataset, written to ``PATH`` as a PNG, and its path is recorded.

    Returns:
        data: merged time series; expected shape (num_samples, 3, 152)
            (train is (150, 3, 152), test is (850, 3, 152)).
        labels: merged labels as strings, e.g. "1.0".
        num2alphabet: mapping from label string ("1.0" .. "26.0") to the
            upper-case letter ("A" .. "Z").
        alphabets_hand: one PNG file path per sample, in sample order.
    """
    train_x, train_y = load_classification("Handwriting", split="train")
    test_x, test_y = load_classification("Handwriting", split="test")
    # Merge train and test into a single set.
    x = np.concatenate([train_x, test_x], axis=0)
    y = np.concatenate([train_y, test_y], axis=0)
    num2alphabet = {f"{i + 1}.0": chr(65 + i) for i in range(26)}

    alphabets_x, alphabets_y = _load_alphabet_images()
    # The image labels are matched as 0..25 while the time-series labels are
    # 1..26, hence the +1 on the key. Only the first 100 images per letter
    # are kept as candidates.
    alphabets_img = {
        i + 1: alphabets_x[alphabets_y == i][:100] for i in range(26)
    }

    # Make sure the output directory exists before saving any image.
    out_dir = Path(PATH)
    out_dir.mkdir(parents=True, exist_ok=True)

    alphabets_hand = []
    for raw_label in y:
        label = int(raw_label.split(".")[0])  # "1.0" -> 1
        candidates = alphabets_img[label]
        random_idx = np.random.choice(candidates.shape[0])
        pixels = candidates.iloc[random_idx].to_numpy()
        pixels = pixels.reshape(28, 28).astype(np.uint8)
        # Samples that draw the same (label, index) pair share one file.
        path = out_dir / f"alphabet_{label}_{random_idx}.png"
        Image.fromarray(pixels, mode="L").save(path)
        alphabets_hand.append(path)
    return x, y, num2alphabet, alphabets_hand
| 78 | + |
| 79 | + |
def tsfresh_features() -> np.ndarray:
    """Extract tsfresh features from the Handwriting time series.

    The data is converted to the long format tsfresh expects: one row per
    (sample, time step), with an ``id`` column, a ``time`` column, and one
    value column per channel. That table is cached as a CSV in ``PATH_SAVE``.
    Feature columns containing NaN are dropped, and the resulting feature
    DataFrame is pickled to ``PATH_SAVE``.

    Returns:
        features: the extracted feature matrix, one row per sample id.
    """
    data = load_handwriting()[0]
    num_samples, num_channels, num_steps = data.shape

    save_dir = Path(PATH_SAVE)
    save_dir.mkdir(parents=True, exist_ok=True)
    path = save_dir / "Handwriting_tsfresh.csv"

    df = pd.read_csv(path) if path.exists() else None
    # Rebuild when there is no cache, or when the cache does not hold one row
    # per (sample, time step) -- e.g. a file written by an earlier version
    # that kept only a single row per sample.
    if df is None or len(df) != num_samples * num_steps:
        # Row order is sample-major: all time steps of sample 0, then
        # sample 1, and so on. This matches the row-major flattening of
        # data[:, channel, :] below.
        columns = {
            "id": np.repeat(np.arange(num_samples), num_steps),
            "time": np.tile(np.arange(num_steps), num_samples),
        }
        for channel in range(num_channels):
            columns[f"channel_{channel + 1}"] = data[:, channel, :].reshape(-1)
        df = pd.DataFrame(columns)
        print(df.head())
        print(df.tail())

        df.to_csv(path, index=False)

    ts_features = extract_features(df, column_id="id", column_sort="time")
    # Drop every feature that could not be computed for at least one sample.
    ts_features = ts_features.dropna(axis=1)
    print(ts_features.head())
    print("ts_features shape:", ts_features.shape)
    # NOTE(review): the doubled ".pkl.pkl" suffix looks unintentional, but it
    # is kept because other code may load the file under this exact name.
    with Path(PATH_SAVE, "Handwriting_emb_tsfresh.pkl.pkl").open("wb") as f:
        pickle.dump(ts_features, f)
    print("TSFresh features saved")
    return ts_features.to_numpy()
| 116 | + |
| 117 | + |
# Run the feature extraction only when this file is executed as a script.
if __name__ == "__main__":
    tsfresh_features()
0 commit comments