Submodule for datasets #41

Open · wants to merge 10 commits into main
6 changes: 4 additions & 2 deletions src/SoleData.jl
@@ -9,6 +9,8 @@ using DataFrames
using MultiData
using MultiData: AbstractDimensionalDataset

+using DataStructures: OrderedDict

const DF = DataFrames
const MD = MultiData

@@ -23,8 +25,8 @@ include("utils/minify.jl")

include("MLJ-utils.jl")

include("example-datasets.jl")

include("datasets/datasets.jl")
@reexport using .Datasets

export atoms

24 changes: 24 additions & 0 deletions src/datasets/datasets.jl
@@ -0,0 +1,24 @@
module Datasets

using ZipFile
using DataFrames
using CategoricalArrays

using DataStructures: OrderedDict

include("epilepsy-loader.jl")
export load_epilepsy

include("hugadb-loader.jl")
export load_hugadb

include("libras-loader.jl")
export load_libras

include("natops-loader.jl")
export load_NATOPS

include("example-datasets.jl")
export load_arff_dataset

end
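Since `SoleData.jl` now contains `@reexport using .Datasets`, the loaders become available at the top level of the package. A minimal sketch of what this buys the user (the local path is hypothetical, and the raw files must already be on disk):

```julia
using SoleData

# Both forms resolve to the same function, thanks to the @reexport:
X, y = SoleData.Datasets.load_epilepsy("data/Epilepsy")  # hypothetical path
X, y = load_epilepsy("data/Epilepsy")
```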
36 changes: 36 additions & 0 deletions src/datasets/epilepsy-loader.jl
@@ -0,0 +1,36 @@
"""
function load_epilepsy(
dirpath::S;
fileprefix::S="Epilepsy",
variablenames::Vector{S}=["x", "y", "z"]
) where {S<:AbstractString}

Loader for the `Epilepsy` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=Epilepsy).

# Arguments
- `dirpath::S`: the directory in which all the .arff files are stored.

# Keyword Arguments
- `fileprefix::S="Epilepsy"`: the prefix shared by both test and train parts of the dataset;
the default name for such files is Epilepsy_TEST.arff and Epilepsy_TRAIN.arff;
- `variablenames::Vector{S}=["x", "y", "z"]`: the names of the columns.
"""
function load_epilepsy(
dirpath::S;
fileprefix::S="Epilepsy",
variablenames::Vector{S}=["x", "y", "z"]
) where {S<:AbstractString}
(X_train, y_train), (X_test, y_test) =
(
read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF,
read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF,
)

X_train = SoleData.fix_dataframe(X_train, variablenames)
X_test = SoleData.fix_dataframe(X_test, variablenames)

y_train = categorical(y_train)
y_test = categorical(y_test)

vcat(X_train, X_test), vcat(y_train, y_test)
end
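A minimal usage sketch, assuming Epilepsy_TRAIN.arff and Epilepsy_TEST.arff sit in a hypothetical local directory:

```julia
using CategoricalArrays: levels

X, y = load_epilepsy("data/Epilepsy")  # hypothetical local path

size(X, 1) == length(y)  # one categorical label per instance
levels(y)                # the distinct class labels across train + test
```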
File renamed without changes.
162 changes: 162 additions & 0 deletions src/datasets/hugadb-loader.jl
@@ -0,0 +1,162 @@
const HUGADB_VARIABLENAMES = ["acc_rf_x","acc_rf_y","acc_rf_z",
"gyro_rf_x","gyro_rf_y","gyro_rf_z",
"acc_rs_x","acc_rs_y","acc_rs_z",
"gyro_rs_x","gyro_rs_y","gyro_rs_z",
"acc_rt_x","acc_rt_y","acc_rt_z",
"gyro_rt_x","gyro_rt_y","gyro_rt_z",
"acc_lf_x","acc_lf_y","acc_lf_z",
"gyro_lf_x","gyro_lf_y","gyro_lf_z",
"acc_ls_x","acc_ls_y","acc_ls_z",
"gyro_ls_x","gyro_ls_y","gyro_ls_z",
"acc_lt_x","acc_lt_y","acc_lt_z",
"gyro_lt_x","gyro_lt_y","gyro_lt_z",
"EMG_r","EMG_l","act",
]

# activity strings (labels) to ids as in the table at https://github.com/romanchereshnev/HuGaDB
_activity2id = x -> findfirst(activity -> x == activity, [
"walking", "running", "going_up", "going_down", "sitting", "sitting_down",
"standing_up", "standing", "bicycling", "elevator_up", "elevator_down",
"sitting_car"
])
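For instance, under the table above "walking" maps to 1, "going_up" to 3, and "sitting_car" to 12, while an unrecognized label yields `nothing`:

```julia
_activity2id("walking")      # 1
_activity2id("going_up")     # 3
_activity2id("sitting_car")  # 12
_activity2id("flying")       # nothing (not a HuGaDB activity)
```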

"""
function load_hugadb(dirpath::S, filename::S) where {S<:AbstractString}

Loader for a single instance of the `HuGaDB` dataset, available [here](https://github.com/romanchereshnev/HuGaDB).

# Arguments
- `dirpath::S`: the directory in which all the .txt files are stored;
- `filename::S`: the specific filename associated with the instance, such as
HuGaDB_v2_various_01_00.txt.

# Keyword Arguments
- `variablenames::Vector{S}=HUGADB_VARIABLENAMES`: the names of the columns (accelerometer,
gyroscope and EMG channels for each tracked body part, plus the activity id).
"""
function load_hugadb(
dirpath::S,
filename::S;
variablenames::Vector{S}=HUGADB_VARIABLENAMES
) where {S<:AbstractString}
filepath = joinpath(dirpath, filename)

# e.g. open("test/data/HuGaDB/HuGaDB_v2_various_01_00.txt", "r")
f = open(filepath, "r")

# get the activities recorded for the performer specified in `filename`
activities = split(readline(f), " ")[1:end-1]
activities[1] = activities[1][11:end] # remove the initial "#Activity\t"

activity_ids = [_activity2id(activity) for activity in activities]

# skip the #ActivityID row (integer ids are recomputed from the activity strings above)
readline(f)

# ignore #Date row
readline(f)

# skip the variable-names row, as the names are already provided in `variablenames`
readline(f)

_substr2float = x -> parse(Float64, x)
lines = [_substr2float.(split(line, "\t")) for line in eachline(f)]

close(f)

X = DataFrame([
# get the i-th element from each line, and concatenate them together
[[line[i] for line in lines]]
for i in 1:length(variablenames)
], variablenames)

# `variablenames` is returned to help the user, e.g., to expose the default
# column names when none were provided.
return X, (activities, activity_ids), variablenames
end
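A minimal usage sketch; the directory below is hypothetical, while the file name follows the HuGaDB naming scheme mentioned above:

```julia
X, (activities, activity_ids), varnames = load_hugadb(
    "test/data/HuGaDB", "HuGaDB_v2_various_01_00.txt")

size(X)          # (1, 39): one instance (performer recording), 39 channels
first(varnames)  # "acc_rf_x"
```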

# load multiple HuGaDB instances in one DataFrame
"""
function load_hugadb(
dirpath::S,
filenames::Vector{S};
kwargs...
) where {S<:AbstractString}

Loader for multiple instances of the `HuGaDB` dataset, each identified by a file
name (in the `filenames` vector) inside the directory `dirpath`.

!!! note
The main purpose of this dispatch is to let the caller select which instances to
load and which to discard, since some HuGaDB recordings are corrupted.
More info on [the official GitHub page](https://github.com/romanchereshnev/HuGaDB).

See also the dispatch of this method which only considers one filename.
"""
function load_hugadb(
dirpath::S,
filenames::Vector{S};
kwargs...
) where {S<:AbstractString}
# leverage the first instance to get, once and for all, the outputs shared by
# every file: the list of activities (as string/id pairs) and the column names.
X, (activity_strings, activity_ids), variablenames = load_hugadb(
dirpath, filenames[1]; kwargs...)

# return the concatenation of each DataFrame obtained by a `load_hugadb` call
return vcat([X, [
load_hugadb(dirpath, filename; kwargs...) |> first
for filename in filenames[2:end]
]...]...), (activity_strings, activity_ids), variablenames
end
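A sketch of loading several recordings at once, one row per file (the file names below are hypothetical examples of the HuGaDB scheme):

```julia
filenames = ["HuGaDB_v2_various_01_00.txt", "HuGaDB_v2_various_01_01.txt"]
X, (activities, activity_ids), varnames =
    load_hugadb("test/data/HuGaDB", filenames)

size(X, 1)  # 2: one instance per loaded file
```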

# for each instance, isolate the recorded segment dedicated to the movement `id`;
# if no such segment exists, discard the instance.
# The surviving tracks are trimmed to the same length.
"""
function filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)

Utility function for a `HuGaDB` dataset, here called `X`.

Consider the recordings of each performer (i.e., every instance) and isolate the data
related to one specific movement id.

# Arguments
- `X::DataFrame`: the HuGaDB dataset;
- `id`: the value (typically an integer or a string) matched against the contents of
`labelcolumn` to isolate one specific movement.

# Keyword Arguments
- `labelcolumn::Integer=39`: by default, movement ids are stored in this column.
"""
function filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)
nvariables = X |> size |> last

# pick only the instances for which an `id` type of movement is recorded
_X = [
let indices = findall(x -> x == id, X[instance, labelcolumn])
isempty(indices) ? nothing :
DataFrame([
[X[instance, variable][indices]]
for variable in 1:nvariables
], names(X))  # reuse the column names of `X`
end
for instance in 1:(X |> size |> first)
] |> x -> filter(!isnothing, x)

# concatenate all the picked instances into a single DataFrame
_Xfiltered = vcat(_X...)

# trim every recording to the same length across instances; since, within a
# given instance, every column has the same length, we arbitrarily compute
# the minimum length from the first column.
minimum_length = minimum(length.(_Xfiltered[:,1]))
for instance in 1:(_Xfiltered |> size |> first)
for variable in 1:nvariables
_Xfiltered[instance,variable] = _Xfiltered[instance,variable][1:minimum_length]
end
end

return _Xfiltered
end
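An end-to-end sketch combining the two functions; id 1 corresponds to "walking" in the activity table above, and the paths are hypothetical:

```julia
filenames = ["HuGaDB_v2_various_01_00.txt", "HuGaDB_v2_various_01_01.txt"]
X, _, _ = load_hugadb("test/data/HuGaDB", filenames)

# keep only the "walking" segments (id 1) of each performer; instances with
# no walking data are dropped, and the survivors are trimmed to the shortest
# remaining recording
X_walking = filter_hugadb(X, 1)
```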
61 changes: 61 additions & 0 deletions src/datasets/libras-loader.jl
@@ -0,0 +1,61 @@
# Unlike the other datasets, there is no need to specify a `variablenames` vector
# of strings here, since each variable is named "x_frame_1", "x_frame_2", ..., by default.
"""
function load_libras(
dirpath::S;
fileprefix::S="Libras"
) where {S<:AbstractString}

Loader for the `Libras` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=Libras).

# Arguments
- `dirpath::S`: the directory in which all the .arff files are stored.

# Keyword Arguments
- `fileprefix::S="Libras"`: the prefix shared by both test and train parts of the dataset;
the default name for such files is Libras_TEST.arff and Libras_TRAIN.arff;
"""
function load_libras(
dirpath::S;
fileprefix::S="Libras"
) where {S<:AbstractString}
_load_libras(dirpath, fileprefix)
end

function _load_libras(dirpath::String, fileprefix::String)
(X_train, y_train), (X_test, y_test) =
(
read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF,
read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF,
)

class_names = [
"curved_swing",
"horizontal_swing",
"vertical_swing",
"anti_clockwise_arc",
"clokcwise_arc",
"circle",
"horizontal_straight_line",
"vertical_straight_line",
"horizontal_zigzag",
"vertical_zigzag",
"horizontal_wavy",
"vertical_wavy",
"face_up_curve",
"face_down_curve",
"tremble"
]

# convert from .arff class codes to string
fix_class_names(y) = class_names[round(Int, parse(Float64, y))]

y_train = map(fix_class_names, y_train)
y_test = map(fix_class_names, y_test)

y_train = categorical(y_train)
y_test = categorical(y_test)

vcat(X_train, X_test), vcat(y_train, y_test)
end
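A minimal usage sketch (hypothetical local directory, default Libras_TRAIN.arff / Libras_TEST.arff file names):

```julia
using CategoricalArrays: levels

X, y = load_libras("data/Libras")
levels(y)  # the 15 movement class names
```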
76 changes: 76 additions & 0 deletions src/datasets/natops-loader.jl
@@ -0,0 +1,76 @@
const NATOPS_VARIABLENAMES = [
"X[Hand tip l]", "Y[Hand tip l]", "Z[Hand tip l]",
"X[Hand tip r]", "Y[Hand tip r]", "Z[Hand tip r]",
"X[Elbow l]", "Y[Elbow l]", "Z[Elbow l]",
"X[Elbow r]","Y[Elbow r]","Z[Elbow r]",
"X[Wrist l]", "Y[Wrist l]", "Z[Wrist l]",
"X[Wrist r]", "Y[Wrist r]", "Z[Wrist r]",
"X[Thumb l]", "Y[Thumb l]", "Z[Thumb l]",
"X[Thumb r]", "Y[Thumb r]", "Z[Thumb r]",
]

"""
function load_NATOPS(
dirpath::S;
fileprefix::S="NATOPS",
variablenames::Vector{S}=NATOPS_VARIABLENAMES
) where {S<:AbstractString}

Loader for the `NATOPS` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=NATOPS).

# Arguments
- `dirpath::S`: the directory in which all the .arff files are stored.

# Keyword Arguments
- `fileprefix::S="NATOPS"`: the prefix shared by both test and train parts of the dataset;
the default names for these files are NATOPS_TEST.arff and NATOPS_TRAIN.arff;
- `variablenames::Vector{S}=NATOPS_VARIABLENAMES`: the names of the columns.
"""
function load_NATOPS(
dirpath::S;
fileprefix::S="NATOPS",
variablenames::Vector{S}=NATOPS_VARIABLENAMES
) where {S<:AbstractString}
# A previous implementation of this loader was more forgiving, and tried to
# download NATOPS from the Internet when the local files could not be read:
# try
# _load_NATOPS(dirpath, fileprefix)
# catch error
# if error isa SystemError
# SoleData.load_arff_dataset("NATOPS")
# else
# rethrow(error)
# end
# end

_load_NATOPS(dirpath, fileprefix)
end

function _load_NATOPS(dirpath::String, fileprefix::String)
(X_train, y_train), (X_test, y_test) =
(
read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF,
read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF,
)

X_train = SoleData.fix_dataframe(X_train, variablenames)
X_test = SoleData.fix_dataframe(X_test, variablenames)

class_names = [
"I have command",
"All clear",
"Not clear",
"Spread wings",
"Fold wings",
"Lock wings",
]

fix_class_names(y) = class_names[round(Int, parse(Float64, y))]

y_train = map(fix_class_names, y_train)
y_test = map(fix_class_names, y_test)

y_train = categorical(y_train)
y_test = categorical(y_test)
vcat(X_train, X_test), vcat(y_train, y_test)
end
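A minimal usage sketch (hypothetical path, default NATOPS file prefix):

```julia
X, y = load_NATOPS("data/NATOPS")

size(X, 2)  # 24: the X/Y/Z coordinates of 8 tracked body parts
unique(y)   # the six gesture commands, e.g. "I have command", "All clear", ...
```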