Submodule for datasets #41

Open · wants to merge 10 commits into main
6 changes: 4 additions & 2 deletions src/SoleData.jl
@@ -9,6 +9,8 @@ using DataFrames
using MultiData
using MultiData: AbstractDimensionalDataset

+using DataStructures: OrderedDict

const DF = DataFrames
const MD = MultiData

@@ -23,8 +25,8 @@ include("utils/minify.jl")

include("MLJ-utils.jl")

include("example-datasets.jl")

include("datasets/datasets.jl")
@reexport using .Datasets

export atoms

24 changes: 24 additions & 0 deletions src/datasets/datasets.jl
@@ -0,0 +1,24 @@
module Datasets

using ZipFile
using DataFrames
using CategoricalArrays

using DataStructures: OrderedDict

include("epilepsy-loader.jl")
export load_epilepsy

include("hugadb-loader.jl")
export load_hugadb

include("libras-loader.jl")
export load_libras

include("natops-loader.jl")
export load_NATOPS

include("example-datasets.jl")
export load_arff_dataset

end
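Since `SoleData.jl` now contains `@reexport using .Datasets`, the loaders become available at the top level of the package. A minimal sketch of what this buys the user (the local path is hypothetical, and the raw files must already be on disk):

```julia
using SoleData

# Both forms resolve to the same function, thanks to the @reexport:
X, y = SoleData.Datasets.load_epilepsy("data/Epilepsy")  # hypothetical path
X, y = load_epilepsy("data/Epilepsy")
```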
36 changes: 36 additions & 0 deletions src/datasets/epilepsy-loader.jl
@@ -0,0 +1,36 @@
"""
function load_epilepsy(
dirpath::S;
fileprefix::S="Epilepsy",
variablenames::Vector{S}=["x", "y", "z"]
) where {S<:AbstractString}

Loader for the `Epilepsy` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=Epilepsy).

# Arguments
- `dirpath::S`: the directory in which all the .arff files are stored.

# Keyword Arguments
- `fileprefix::S="Epilepsy"`: the prefix shared by both test and train parts of the dataset;
the default name for such files is Epilepsy_TEST.arff and Epilepsy_TRAIN.arff;
- `variablenames::Vector{S}=["x", "y", "z"]`: the names of the columns.
"""
function load_epilepsy(
dirpath::S;
fileprefix::S="Epilepsy",
variablenames::Vector{S}=["x", "y", "z"]
) where {S<:AbstractString}
(X_train, y_train), (X_test, y_test) =
(
read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF,
read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF,
)

X_train = SoleData.fix_dataframe(X_train, variablenames)
X_test = SoleData.fix_dataframe(X_test, variablenames)

y_train = categorical(y_train)
y_test = categorical(y_test)

vcat(X_train, X_test), vcat(y_train, y_test)
end
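A minimal usage sketch, assuming Epilepsy_TRAIN.arff and Epilepsy_TEST.arff sit in a hypothetical local directory:

```julia
using CategoricalArrays: levels

X, y = load_epilepsy("data/Epilepsy")  # hypothetical local path

size(X, 1) == length(y)  # one categorical label per instance
levels(y)                # the distinct class labels across train + test
```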
File renamed without changes.
162 changes: 162 additions & 0 deletions src/datasets/hugadb-loader.jl
@@ -0,0 +1,162 @@
const HUGADB_VARIABLENAMES = ["acc_rf_x","acc_rf_y","acc_rf_z",
"gyro_rf_x","gyro_rf_y","gyro_rf_z",
"acc_rs_x","acc_rs_y","acc_rs_z",
"gyro_rs_x","gyro_rs_y","gyro_rs_z",
"acc_rt_x","acc_rt_y","acc_rt_z",
"gyro_rt_x","gyro_rt_y","gyro_rt_z",
"acc_lf_x","acc_lf_y","acc_lf_z",
"gyro_lf_x","gyro_lf_y","gyro_lf_z",
"acc_ls_x","acc_ls_y","acc_ls_z",
"gyro_ls_x","gyro_ls_y","gyro_ls_z",
"acc_lt_x","acc_lt_y","acc_lt_z",
"gyro_lt_x","gyro_lt_y","gyro_lt_z",
"EMG_r","EMG_l","act",
]

# activity strings (labels) to ids as in the table at https://github.com/romanchereshnev/HuGaDB
_activity2id = x -> findfirst(activity -> x == activity, [
"walking", "running", "going_up", "going_down", "sitting", "sitting_down",
"standing_up", "standing", "bicycling", "elevator_up", "elevator_down",
"sitting_car"
])
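For instance, under the table above "walking" maps to 1, "going_up" to 3, and "sitting_car" to 12, while an unrecognized label yields `nothing`:

```julia
_activity2id("walking")      # 1
_activity2id("going_up")     # 3
_activity2id("sitting_car")  # 12
_activity2id("flying")       # nothing (not a HuGaDB activity)
```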

"""
function load_hugadb(dirpath::S, filename::S) where {S<:AbstractString}

Loader for a single instance of the `HuGaDB` dataset, available [here](https://github.com/romanchereshnev/HuGaDB).

# Arguments
- `dirpath::S`: the directory in which all the .txt files are stored;
- `filename::S`: the specific filename associated with the instance, such as
HuGaDB_v2_various_01_00.txt.

# Keyword Arguments
- `variablenames::Vector{S}=HUGADB_VARIABLENAMES`: the names of the columns (accelerometer,
gyroscope and EMG channels for each tracked body part, plus the activity id).
"""
function load_hugadb(
dirpath::S,
filename::S;
variablenames::Vector{S}=HUGADB_VARIABLENAMES
) where {S<:AbstractString}
filepath = joinpath(dirpath, filename)

# e.g. open("test/data/HuGaDB/HuGaDB_v2_various_01_00.txt", "r")
f = open(filepath, "r")

# get the activities recorded for the performer specified in `filename`
activities = split(readline(f), " ")[1:end-1]
activities[1] = activities[1][11:end] # remove the initial "#Activity\t"

activity_ids = [_activity2id(activity) for activity in activities]

# skip the #ActivityID row (integer ids are recomputed from the activity strings above)
readline(f)

# ignore #Date row
readline(f)

# skip the variable-names row, as the names are already provided in `variablenames`
readline(f)

_substr2float = x -> parse(Float64, x)
lines = [_substr2float.(split(line, "\t")) for line in eachline(f)]

close(f)

X = DataFrame([
# get the i-th element from each line, and concatenate them together
[[line[i] for line in lines]]
for i in 1:length(variablenames)
], variablenames)

# `variablenames` is returned to help the user, e.g., to expose the default
# column names when none were provided.
return X, (activities, activity_ids), variablenames
end
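A minimal usage sketch; the directory below is hypothetical, while the file name follows the HuGaDB naming scheme mentioned above:

```julia
X, (activities, activity_ids), varnames = load_hugadb(
    "test/data/HuGaDB", "HuGaDB_v2_various_01_00.txt")

size(X)          # (1, 39): one instance (performer recording), 39 channels
first(varnames)  # "acc_rf_x"
```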

# load multiple HuGaDB instances in one DataFrame
"""
function load_hugadb(
dirpath::S,
filenames::Vector{S};
kwargs...
) where {S<:AbstractString}

Loader for multiple instances of the `HuGaDB` dataset, each identified by a file
name (in the `filenames` vector) inside the directory `dirpath`.

!!! note
The main purpose of this dispatch is to let the caller select which instances to
load and which to discard, since some HuGaDB recordings are corrupted.
More info on [the official GitHub page](https://github.com/romanchereshnev/HuGaDB).

See also the dispatch of this method which only considers one filename.
"""
function load_hugadb(
dirpath::S,
filenames::Vector{S};
kwargs...
) where {S<:AbstractString}
# leverage the first instance to get, once and for all, the outputs shared by
# every file: the list of activities (as string/id pairs) and the column names.
X, (activity_strings, activity_ids), variablenames = load_hugadb(
dirpath, filenames[1]; kwargs...)

# return the concatenation of each DataFrame obtained by a `load_hugadb` call
return vcat([X, [
load_hugadb(dirpath, filename; kwargs...) |> first
for filename in filenames[2:end]
]...]...), (activity_strings, activity_ids), variablenames
end
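A sketch of loading several recordings at once, one row per file (the file names below are hypothetical examples of the HuGaDB scheme):

```julia
filenames = ["HuGaDB_v2_various_01_00.txt", "HuGaDB_v2_various_01_01.txt"]
X, (activities, activity_ids), varnames =
    load_hugadb("test/data/HuGaDB", filenames)

size(X, 1)  # 2: one instance per loaded file
```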

# for each instance, isolate the recorded segment dedicated to the movement `id`;
# if no such segment exists, discard the instance.
# The surviving tracks are trimmed to the same length.
"""
function filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)

Utility function for a `HuGaDB` dataset, here called `X`.

Consider the recordings of each performer (i.e., every instance) and isolate the data
related to one specific movement id.

# Arguments
- `X::DataFrame`: the HuGaDB dataset;
- `id`: the value (typically an integer or a string) matched against the contents of
`labelcolumn` to isolate one specific movement.

# Keyword Arguments
- `labelcolumn::Integer=39`: by default, movement ids are stored in this column.
"""
function filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)
nvariables = X |> size |> last

# pick only the instances for which an `id` type of movement is recorded
_X = [
let indices = findall(x -> x == id, X[instance, labelcolumn])
isempty(indices) ? nothing :
DataFrame([
[X[instance, variable][indices]]
for variable in 1:nvariables
], names(X))  # reuse the column names of `X`
end
for instance in 1:(X |> size |> first)
] |> x -> filter(!isnothing, x)

# concatenate all the picked instances into a single DataFrame
_Xfiltered = vcat(_X...)

# trim every recording to the same length across instances; since, within a
# given instance, every column has the same length, we arbitrarily compute
# the minimum length from the first column.
minimum_length = minimum(length.(_Xfiltered[:,1]))
for instance in 1:(_Xfiltered |> size |> first)
for variable in 1:nvariables
_Xfiltered[instance,variable] = _Xfiltered[instance,variable][1:minimum_length]
end
end

return _Xfiltered
end
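An end-to-end sketch combining the two functions; id 1 corresponds to "walking" in the activity table above, and the paths are hypothetical:

```julia
filenames = ["HuGaDB_v2_various_01_00.txt", "HuGaDB_v2_various_01_01.txt"]
X, _, _ = load_hugadb("test/data/HuGaDB", filenames)

# keep only the "walking" segments (id 1) of each performer; instances with
# no walking data are dropped, and the survivors are trimmed to the shortest
# remaining recording
X_walking = filter_hugadb(X, 1)
```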
61 changes: 61 additions & 0 deletions src/datasets/libras-loader.jl
@@ -0,0 +1,61 @@
# Unlike the other datasets, there is no need to specify a `variablenames` vector
# of strings here, since each variable is named "x_frame_1", "x_frame_2", ..., by default.
"""
function load_libras(
dirpath::S;
fileprefix::S="Libras"
) where {S<:AbstractString}

Loader for the `Libras` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=Libras).

# Arguments
- `dirpath::S`: the directory in which all the .arff files are stored.

# Keyword Arguments
- `fileprefix::S="Libras"`: the prefix shared by both test and train parts of the dataset;
the default name for such files is Libras_TEST.arff and Libras_TRAIN.arff;
"""
function load_libras(
dirpath::S;
fileprefix::S="Libras"
) where {S<:AbstractString}
_load_libras(dirpath, fileprefix)
end

function _load_libras(dirpath::String, fileprefix::String)
(X_train, y_train), (X_test, y_test) =
(
read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF,
read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF,
)

class_names = [
"curved_swing",
"horizontal_swing",
"vertical_swing",
"anti_clockwise_arc",
"clokcwise_arc",
"circle",
"horizontal_straight_line",
"vertical_straight_line",
"horizontal_zigzag",
"vertical_zigzag",
"horizontal_wavy",
"vertical_wavy",
"face_up_curve",
"face_down_curve",
"tremble"
]

# convert from .arff class codes to string
fix_class_names(y) = class_names[round(Int, parse(Float64, y))]

y_train = map(fix_class_names, y_train)
y_test = map(fix_class_names, y_test)

y_train = categorical(y_train)
y_test = categorical(y_test)

vcat(X_train, X_test), vcat(y_train, y_test)
end
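A minimal usage sketch (hypothetical local directory, default Libras_TRAIN.arff / Libras_TEST.arff file names):

```julia
using CategoricalArrays: levels

X, y = load_libras("data/Libras")
levels(y)  # the 15 movement class names
```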
76 changes: 76 additions & 0 deletions src/datasets/natops-loader.jl
@@ -0,0 +1,76 @@
const NATOPS_VARIABLENAMES = [
"X[Hand tip l]", "Y[Hand tip l]", "Z[Hand tip l]",
"X[Hand tip r]", "Y[Hand tip r]", "Z[Hand tip r]",
"X[Elbow l]", "Y[Elbow l]", "Z[Elbow l]",
"X[Elbow r]","Y[Elbow r]","Z[Elbow r]",
"X[Wrist l]", "Y[Wrist l]", "Z[Wrist l]",
"X[Wrist r]", "Y[Wrist r]", "Z[Wrist r]",
"X[Thumb l]", "Y[Thumb l]", "Z[Thumb l]",
"X[Thumb r]", "Y[Thumb r]", "Z[Thumb r]",
]

"""
function load_NATOPS(
dirpath::S;
fileprefix::S="NATOPS",
variablenames::Vector{S}=NATOPS_VARIABLENAMES
) where {S<:AbstractString}

Loader for the `NATOPS` dataset, available [here](https://timeseriesclassification.com/description.php?Dataset=NATOPS).

# Arguments
- `dirpath::S`: the directory in which all the .arff files are stored.

# Keyword Arguments
- `fileprefix::S="NATOPS"`: the prefix shared by both test and train parts of the dataset;
the default names for these files are NATOPS_TEST.arff and NATOPS_TRAIN.arff;
- `variablenames::Vector{S}=NATOPS_VARIABLENAMES`: the names of the columns.
"""
function load_NATOPS(
dirpath::S;
fileprefix::S="NATOPS",
variablenames::Vector{S}=NATOPS_VARIABLENAMES
) where {S<:AbstractString}
# A previous implementation of this loader was more forgiving, and tried to
# download NATOPS from the Internet when the local files could not be read:
# try
# _load_NATOPS(dirpath, fileprefix)
# catch error
# if error isa SystemError
# SoleData.load_arff_dataset("NATOPS")
# else
# rethrow(error)
# end
# end

_load_NATOPS(dirpath, fileprefix)
end

function _load_NATOPS(dirpath::String, fileprefix::String)
(X_train, y_train), (X_test, y_test) =
(
read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF,
read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF,
)

X_train = SoleData.fix_dataframe(X_train, variablenames)
X_test = SoleData.fix_dataframe(X_test, variablenames)

class_names = [
"I have command",
"All clear",
"Not clear",
"Spread wings",
"Fold wings",
"Lock wings",
]

fix_class_names(y) = class_names[round(Int, parse(Float64, y))]

y_train = map(fix_class_names, y_train)
y_test = map(fix_class_names, y_test)

y_train = categorical(y_train)
y_test = categorical(y_test)
vcat(X_train, X_test), vcat(y_train, y_test)
end
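A minimal usage sketch (hypothetical path, default NATOPS file prefix):

```julia
X, y = load_NATOPS("data/NATOPS")

size(X, 2)  # 24: the X/Y/Z coordinates of 8 tracked body parts
unique(y)   # the six gesture commands, e.g. "I have command", "All clear", ...
```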