diff --git a/src/SoleData.jl b/src/SoleData.jl
index fd9144b..38d77a6 100644
--- a/src/SoleData.jl
+++ b/src/SoleData.jl
@@ -9,6 +9,8 @@ using DataFrames
 using MultiData
 using MultiData: AbstractDimensionalDataset
 
+using DataStructures: OrderedDict
+
 const DF = DataFrames
 const MD = MultiData
 
@@ -23,8 +25,8 @@ include("utils/minify.jl")
 
 include("MLJ-utils.jl")
 
-include("example-datasets.jl")
-
+include("datasets/datasets.jl")
+@reexport using .Datasets
 
 export atoms
diff --git a/src/datasets/datasets.jl b/src/datasets/datasets.jl
new file mode 100644
index 0000000..e508858
--- /dev/null
+++ b/src/datasets/datasets.jl
@@ -0,0 +1,24 @@
+module Datasets
+
+using ZipFile
+using DataFrames
+using CategoricalArrays
+
+using DataStructures: OrderedDict
+
+include("epilepsy-loader.jl")
+export load_epilepsy
+
+include("hugadb-loader.jl")
+export load_hugadb
+
+include("libras-loader.jl")
+export load_libras
+
+include("natops-loader.jl")
+export load_NATOPS
+
+include("example-datasets.jl")
+export load_arff_dataset
+
+end
diff --git a/src/datasets/epilepsy-loader.jl b/src/datasets/epilepsy-loader.jl
new file mode 100644
index 0000000..bc19bb6
--- /dev/null
+++ b/src/datasets/epilepsy-loader.jl
@@ -0,0 +1,36 @@
+"""
+    function load_epilepsy(
+        dirpath::S;
+        fileprefix::S="Epilepsy",
+        variablenames::Vector{S}=["x", "y", "z"]
+    ) where {S<:AbstractString}
+
+Loader for the `Epilepsy` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=Epilepsy).
+
+# Arguments
+- `dirpath::S`: the directory in which all the .arff files are stored.
+
+# Keyword Arguments
+- `fileprefix::S="Epilepsy"`: the prefix shared by the test and train parts of the dataset;
+    by default, these files are named Epilepsy_TEST.arff and Epilepsy_TRAIN.arff;
+- `variablenames::Vector{S}=["x", "y", "z"]`: the names of the columns.
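+
+# Examples
+A minimal usage sketch (the local path below is illustrative; the train and test
+splits are returned concatenated):
+```julia
+X, y = load_epilepsy("data/Epilepsy")
+```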
+""" +function load_epilepsy( + dirpath::S; + fileprefix::S="Epilepsy", + variablenames::Vector{S}=["x", "y", "z"] +) where {S<:AbstractString} + (X_train, y_train), (X_test, y_test) = + ( + read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF, + read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF, + ) + + X_train = SoleData.fix_dataframe(X_train, variablenames) + X_test = SoleData.fix_dataframe(X_test, variablenames) + + y_train = categorical(y_train) + y_test = categorical(y_test) + + vcat(X_train, X_test), vcat(y_train, y_test) +end diff --git a/src/example-datasets.jl b/src/datasets/example-datasets.jl similarity index 100% rename from src/example-datasets.jl rename to src/datasets/example-datasets.jl diff --git a/src/datasets/hugadb-loader.jl b/src/datasets/hugadb-loader.jl new file mode 100644 index 0000000..48cd56f --- /dev/null +++ b/src/datasets/hugadb-loader.jl @@ -0,0 +1,162 @@ +HUGADB_VARIABLENAMES = ["acc_rf_x","acc_rf_y","acc_rf_z", + "gyro_rf_x","gyro_rf_y","gyro_rf_z", + "acc_rs_x","acc_rs_y","acc_rs_z", + "gyro_rs_x","gyro_rs_y","gyro_rs_z", + "acc_rt_x","acc_rt_y","acc_rt_z", + "gyro_rt_x","gyro_rt_y","gyro_rt_z", + "acc_lf_x","acc_lf_y","acc_lf_z", + "gyro_lf_x","gyro_lf_y","gyro_lf_z", + "acc_ls_x","acc_ls_y","acc_ls_z", + "gyro_ls_x","gyro_ls_y","gyro_ls_z", + "acc_lt_x","acc_lt_y","acc_lt_z", + "gyro_lt_x","gyro_lt_y","gyro_lt_z", + "EMG_r","EMG_l","act", +] + +# activity strings (labels) to ids as in the table at https://github.com/romanchereshnev/HuGaDB +_activity2id = x -> findfirst(activity -> x == activity, [ + "walking", "running", "going_up", "going_down", "sitting", "sitting_down", + "standing_up", "standing", "bicycling", "elevator_up", "elevator_down", + "sitting_car" +]) + +""" + function load_hugadb(dirpath::S, filename::S) where {S<:AbstractString} + +Loader for a single instance of `HuGaDB` dataset, available [here](https://github.com/romanchereshnev/HuGaDB). + +# Arguments +- `dirpath::S`: the directory in which all the .txt files are stored; +- `filename::S`: the specific filename associated with the instance, such as + HuGaDB_v2_various_01_00.txt. + +# Keyword Arguments +- `variablenames::Vector{S}=["x_accelometer_right_foot", "y_accelerometer_right_foot", ...]`: + the names of the columns. +""" +function load_hugadb( + dirpath::S, + filename::S; + variablenames::Vector{S}=HUGADB_VARIABLENAMES +) where {S<:AbstractString} + filepath = joinpath(dirpath, filename) + + # e.g. open("test/data/HuGaDB/HuGaDB_v2_various_01_00.txt", "r") + f = open(filepath, "r") + + # get the activities recorded for the performer specified in `filename` + activities = split(readline(f), " ")[1:end-1] + activities[1] = activities[1][11:end] # remove the initial "#Activity\t" + + + activity_ids = [_activity2id(activity) for activity in activities] + + # ignore #ActivityID array (we only keep the string version instead of integer IDs) + readline(f) + + # ignore #Date row + readline(f) + + # ignore the variable names, as we already explicited them in `variablenames` + readline(f) + + _substr2float = x -> parse(Float64, x) + lines = [_substr2float.(split(line, "\t")) for line in eachline(f)] + + close(f) + + X = DataFrame([ + # get the i-th element from each line, and concatenate them together + [[line[i] for line in lines]] + for i in 1:length(variablenames) + ], variablenames) + + # variablenames is returned to help the user, for example to let him know our default + # values if he did not provide any. 
+"""
+function load_hugadb(
+    dirpath::S,
+    filename::S;
+    variablenames::Vector{S}=HUGADB_VARIABLENAMES
+) where {S<:AbstractString}
+    filepath = joinpath(dirpath, filename)
+
+    # e.g. open("test/data/HuGaDB/HuGaDB_v2_various_01_00.txt", "r")
+    f = open(filepath, "r")
+
+    # get the activities recorded for the performer specified in `filename`
+    activities = split(readline(f), " ")[1:end-1]
+    activities[1] = activities[1][11:end] # remove the initial "#Activity\t"
+
+    activity_ids = [_activity2id(activity) for activity in activities]
+
+    # skip the #ActivityID row (we only keep the string labels, not the integer ids)
+    readline(f)
+
+    # skip the #Date row
+    readline(f)
+
+    # skip the variable names, as they are already listed in `variablenames`
+    readline(f)
+
+    _substr2float = x -> parse(Float64, x)
+    lines = [_substr2float.(split(line, "\t")) for line in eachline(f)]
+
+    close(f)
+
+    X = DataFrame([
+        # get the i-th element from each line, and concatenate them together
+        [[line[i] for line in lines]]
+        for i in 1:length(variablenames)
+    ], variablenames)
+
+    # `variablenames` is returned to help the user, e.g., to expose the default
+    # column names when none were provided.
+    return X, (activities, activity_ids), variablenames
+end
+
+# load multiple HuGaDB instances in one DataFrame
+"""
+    function load_hugadb(
+        dirpath::S,
+        filenames::Vector{S};
+        kwargs...
+    ) where {S<:AbstractString}
+
+Loader for multiple instances of the `HuGaDB` dataset, each identified by a file name
+(in the `filenames` vector) inside the directory `dirpath`.
+
+!!! note
+    The main purpose of this dispatch is to be picky about which instances to load and
+    which to discard, since some HuGaDB recordings are corrupted.
+    More info on [the official GitHub page](https://github.com/romanchereshnev/HuGaDB).
+
+See also the dispatch of this method which only considers one filename.
+"""
+function load_hugadb(
+    dirpath::S,
+    filenames::Vector{S};
+    kwargs...
+) where {S<:AbstractString}
+    # leverage the first instance to get, once and for all, the common outputs, such as
+    # the list of activities (as string/id pairs) and the names of each column.
+    X, (activity_strings, activity_ids), variablenames = load_hugadb(
+        dirpath, filenames[1]; kwargs...)
+
+    # return the concatenation of the DataFrames obtained by each `load_hugadb` call
+    return vcat([X, [
+        load_hugadb(dirpath, filename; kwargs...) |> first
+        for filename in filenames[2:end]
+    ]...]...), (activity_strings, activity_ids), variablenames
+end
+
+# in each instance, isolate the part of the recording dedicated to the movement `id`;
+# if no such part exists, discard the instance.
+# The surviving tracks are trimmed to the same length.
+"""
+    function filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)
+
+Utility function for the `HuGaDB` dataset, here called `X`.
+
+Consider the recordings of each performer (i.e., every instance) and isolate the data
+related to one specific movement id.
+
+# Arguments
+- `X::DataFrame`: the HuGaDB dataset;
+- `id`: any kind of data contained in `labelcolumn` (probably an integer or a string),
+    used to discriminate between different movements.
+
+# Keyword Arguments
+- `labelcolumn::Integer=39`: by default, movement ids are stored in this column.
+"""
+function filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)
+    nvariables = X |> size |> last
+
+    # pick only the instances for which an `id` type of movement is recorded
+    _X = [
+        let indices = findall(x -> x == id, X[instance, labelcolumn])
+            isempty(indices) ? nothing :
+            DataFrame([
+                [X[instance, variable][indices]]
+                for variable in 1:nvariables
+            ], names(X))
+        end
+        for instance in 1:(X |> size |> first)
+    ] |> x -> filter(!isnothing, x)
+
+    # concatenate all the picked instances in a unique DataFrame
+    _Xfiltered = vcat(_X...)
+
+    # trim every recording to the same length across instances;
+    # since, within an instance, each column has the same length,
+    # we arbitrarily compute the minimum length starting from the first column.
+    minimum_length = minimum(length.(_Xfiltered[:, 1]))
+    for instance in 1:(_Xfiltered |> size |> first)
+        for variable in 1:nvariables
+            _Xfiltered[instance, variable] = _Xfiltered[instance, variable][1:minimum_length]
+        end
+    end
+
+    return _Xfiltered
+end
diff --git a/src/datasets/libras-loader.jl b/src/datasets/libras-loader.jl
new file mode 100644
index 0000000..14fc62b
--- /dev/null
+++ b/src/datasets/libras-loader.jl
@@ -0,0 +1,61 @@
+# Differently from the other datasets, there is no need to specify a vector of variable
+# names here, since each variable is named "x_frame_1", "x_frame_2", ..., by default.
+"""
+    function load_libras(
+        dirpath::S;
+        fileprefix::S="Libras"
+    ) where {S<:AbstractString}
+
+Loader for the `Libras` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=Libras).
+
+# Arguments
+- `dirpath::S`: the directory in which all the .arff files are stored.
+
+# Keyword Arguments
+- `fileprefix::S="Libras"`: the prefix shared by the test and train parts of the dataset;
+    by default, these files are named Libras_TEST.arff and Libras_TRAIN.arff.
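+
+# Examples
+A minimal usage sketch (the local path below is illustrative):
+```julia
+X, y = load_libras("data/Libras")
+```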
+""" + function load_libras( + dirpath::S; + fileprefix::S="Epilepsy", + variablenames::Vector{S}=["x", "y", "z"] + ) where {S<:AbstractString} + +Loader for `Libras` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=Libras). + +# Arguments +- `dirpath::S`: the directory in which all the .arff files are stored. + +# Keyword Arguments +- `fileprefix::S="Libras"`: the prefix shared by both test and train parts of the dataset; + the default name for such files is Libras_TEST.arff and Libras_TRAIN.arff; +""" +function load_libras( + dirpath::S; + fileprefix::S="Libras" +) where {S<:AbstractString} + _load_libras(dirpath, fileprefix) +end + +function _load_libras(dirpath::String, fileprefix::String) + (X_train, y_train), (X_test, y_test) = + ( + read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF, + read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF, + ) + + class_names = [ + "curved_swing", + "horizontal_swing", + "vertical_swing", + "anti_clockwise_arc", + "clokcwise_arc", + "circle", + "horizontal_straight_line", + "vertical_straight_line", + "horizontal_zigzag", + "vertical_zigzag", + "horizontal_wavy", + "vertical_wavy", + "face_up_curve", + "face_down_curve", + "tremble" + ] + + # convert from .arff class codes to string + fix_class_names(y) = class_names[round(Int, parse(Float64, y))] + + y_train = map(fix_class_names, y_train) + y_test = map(fix_class_names, y_test) + + y_train = categorical(y_train) + y_test = categorical(y_test) + + vcat(X_train, X_test), vcat(y_train, y_test) +end diff --git a/src/datasets/natops-loader.jl b/src/datasets/natops-loader.jl new file mode 100644 index 0000000..e98ec5f --- /dev/null +++ b/src/datasets/natops-loader.jl @@ -0,0 +1,76 @@ +NATOPS_VARIABLENAMES = [ + "X[Hand tip l]", "Y[Hand tip l]", "Z[Hand tip l]", + "X[Hand tip r]", "Y[Hand tip r]", "Z[Hand tip r]", + "X[Elbow l]", "Y[Elbow l]", "Z[Elbow l]", + "X[Elbow r]","Y[Elbow r]","Z[Elbow r]", + "X[Wrist l]", "Y[Wrist l]", "Z[Wrist l]", + "X[Wrist r]", "Y[Wrist r]", "Z[Wrist r]", + "X[Thumb l]", "Y[Thumb l]", "Z[Thumb l]", + "X[Thumb r]", "Y[Thumb r]", "Z[Thumb r]", + ] + +""" + function load_NATOPS( + dirpath::S; + fileprefix::S="Epilepsy", + variablenames::Vector{S}=["x", "y", "z"] + ) where {S<:AbstractString} + +Loader for `Epilepsy` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=NATOPS). + +# Arguments +- `dirpath::S`: the directory in which all the .arff files are stored. + +# Keyword Arguments +- `fileprefix::S="Epilepsy"`: the prefix shared by both test and train parts of the dataset; + the default name for such files is NATOPS_TEST.arff and NATOPS_TRAIN.arff; +- `variablenames::Vector{S}=NATOPS_VARIABLENAMES`: the names of the columns. 
+""" +function load_NATOPS( + dirpath::S; + fileprefix::S="NATOPS", + variablenames::Vector{S}=NATOPS_VARIABLENAMES +) where {S<:AbstractString} + # A previous implementation of this loader was very kind with the user, and tried + # to download NATOPS by internet if an error occurred locally: + # try + # _load_NATOPS(dirpath, fileprefix) + # catch error + # if error isa SystemError + # SoleData.load_arff_dataset("NATOPS") + # else + # rethrow(error) + # end + # end + + _load_NATOPS(dirpath, fileprefix) +end + +function _load_NATOPS(dirpath::String, fileprefix::String) + (X_train, y_train), (X_test, y_test) = + ( + read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF, + read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF, + ) + + X_train = SoleData.fix_dataframe(X_train, variablenames) + X_test = SoleData.fix_dataframe(X_test, variablenames) + + class_names = [ + "I have command", + "All clear", + "Not clear", + "Spread wings", + "Fold wings", + "Lock wings", + ] + + fix_class_names(y) = class_names[round(Int, parse(Float64, y))] + + y_train = map(fix_class_names, y_train) + y_test = map(fix_class_names, y_test) + + y_train = categorical(y_train) + y_test = categorical(y_test) + vcat(X_train, X_test), vcat(y_train, y_test) +end