diff --git a/Project.toml b/Project.toml
index b12110b..3c210e2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -22,10 +22,12 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Query = "1a8c2f83-1ff3-5112-b086-8aa67b057ba1"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"
 SoleLogics = "b002da8f-3cb3-4d91-bbe3-2953433912b5"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 ThreadSafeDicts = "4239201d-c60e-5e0a-9702-85d713665ba7"
@@ -35,7 +37,8 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 [compat]
 CategoricalArrays = "0.10, 1"
 DataFrames = "1"
-DataStructures = "0.18, 0.19"
+DataStructures = "0.18"
+Downloads = "1.6.0"
 Graphs = "1.12"
 HTTP = "1.10"
 IntervalSets = "0.7"
@@ -47,12 +50,14 @@ ProgressMeter = "1.10"
 Query = "1"
 Random = "1"
 Reexport = "1"
+SHA = "0.7.0"
 ScientificTypes = "3"
 SoleBase = "0.13"
 SoleLogics = "0.13.1"
 StatsBase = "0.30 - 0.34"
+TOML = "1.0.3"
 Tables = "1"
-ThreadSafeDicts = "0.1"
+Tar = "1.10.0"
 UniqueVectors = "1.2"
 ZipFile = "0.10"
 julia = "1"
diff --git a/loader.jl b/loader.jl
new file mode 100644
index 0000000..a1cfc61
--- /dev/null
+++ b/loader.jl
@@ -0,0 +1,95 @@
+# This file contains all the methods necessary to automatically generate a functioning
+# Artifacts.toml.
+#
+# For more information see the official documentation at:
+# https://pkgdocs.julialang.org/v1/artifacts/#The-Pkg.Artifacts-API
+# https://pkgdocs.julialang.org/v1/api/#Artifacts-Reference
+
+using Downloads
+using Pkg.Artifacts
+using SHA
+using TOML
+
+
+const ARTIFACTS_PATH = joinpath(@__DIR__, "Artifacts.toml")
+
+# INSERT YOUR ARTIFACT URLS HERE, e.g. "https://github.com/aclai-lab/Artifacts/raw/main/sole/binaries/minimizers/mitespresso.tar.gz"
+URLS = [
+    "https://github.com/aclai-lab/Artifacts/raw/main/sole/binaries/minimizers/mitespresso.tar.gz"
+]
+
+"""
+    fillartifacts(URLS::Vector{String})
+
+Apply [`fillartifacts(url::String)`](@ref) to each URL in `URLS`.
+"""
+function fillartifacts(URLS::Vector{String})
+    map(url -> fillartifacts(url), URLS)
+end
+
+"""
+    fillartifacts(url::String)
+
+Completely automate the insertion of a new resource into the Artifacts.toml file.
+
+After the execution of the following command, the Artifacts.toml is updated as below;
+note that the new entry is named using the lowercase version of the resource name
+provided.
+
+```julia
+julia> fillartifacts("https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/NATOPS.tar.gz")
+```
+
+```toml
+[natops]
+git-tree-sha1 = "87856b9b41a235ec57f36d1029d0467876660e6e"
+
+    [[natops.download]]
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/NATOPS.tar.gz"
+    sha256 = "2586be714b4112b874d050dd3f873e12156d1921e5ded9bd8f66bf5bf8d9c2d1"
+```
+"""
+function fillartifacts(url::String)
+    filename_with_extension = split(url, "/")[end]
+    filename_no_extension = split(filename_with_extension, ".")[1] |> lowercase
+
+    # first download the file to a temporary path, to compute its SHA256
+    temp_file = tempname()
+    Downloads.download(url, temp_file)
+    file_sha256 = bytes2hex(open(sha256, temp_file))
+
+    # now create the artifact
+    tree_sha1 = create_artifact(
+        tmp_dir -> cp(temp_file, joinpath(tmp_dir, filename_with_extension)))
+
+    # clean up the temporary file
+    rm(temp_file)
+
+    bind_artifact!(ARTIFACTS_PATH, filename_no_extension, tree_sha1; force=true)
+
+    # content of the ARTIFACTS_PATH
+    content = TOML.parsefile(ARTIFACTS_PATH)
+
+    open(ARTIFACTS_PATH, "w") do tomlfile
+        # e.g.: Dict{String, Any} with 1 entry:
+        #   "natops" => Dict{String, Any}(
+        #       "git-tree-sha1"=>"87856b9b41a235ec57f36d1029d0467876660e6e",
+        #       "download"=>Any[Dict{String, Any}("sha256"=>"2586be714b4…
+
+        if "download" in keys(content[filename_no_extension])
+            @warn "Entry $(filename_no_extension) already exists."
+        else
+            # we insert a vector of possible new entries; this method automatically
+            # infers the simplest one, but it would be possible to add many sources;
+            # to do so, we could iterate over some kwargs here.
+            new_entry = Dict{String,Any}()
+            content[filename_no_extension]["download"] = [new_entry]
+            new_entry["sha256"] = file_sha256   # use the hash of the original file
+            new_entry["url"] = url
+        end
+
+        redirect_stdout(tomlfile) do
+            TOML.print(content)
+        end
+    end
+end
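For reference, a minimal usage sketch of the script above (it assumes network access, and uses the `URLS` constant and `fillartifacts` defined in `loader.jl`):

```julia
# Populate Artifacts.toml with an entry for each archive listed in `URLS`;
# entries are keyed by the lowercase file name (e.g. "mitespresso").
include("loader.jl")
fillartifacts(URLS)
```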
diff --git a/src/SoleData.jl b/src/SoleData.jl
index 1af6cd4..a8e5483 100644
--- a/src/SoleData.jl
+++ b/src/SoleData.jl
@@ -9,6 +9,8 @@ using DataFrames
 using MultiData
 using MultiData: AbstractDimensionalDataset
 
+using DataStructures: OrderedDict
+
 const DF = DataFrames
 const MD = MultiData
 
@@ -26,8 +28,7 @@ include("utils/minify.jl")
 
 include("MLJ-utils.jl")
 
-include("example-datasets.jl")
-
+include("artifacts/artifacts.jl")
 
 export atoms
diff --git a/src/artifacts/Artifacts.toml b/src/artifacts/Artifacts.toml
new file mode 100644
index 0000000..d97f4ab
--- /dev/null
+++ b/src/artifacts/Artifacts.toml
@@ -0,0 +1,41 @@
+[abc]
+git-tree-sha1 = "2fde6d24ce7bffccc0a66a5447787bd95a057cca"
+
+    [[abc.download]]
+    sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/binaries/generic/abc.tar.gz"
+
+[libras]
+git-tree-sha1 = "5c13bb56bcf0d6866d26c80fde16b27e6ad2e75f"
+
+    [[libras.download]]
+    sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/libras.tar.gz"
+
+[natops]
+git-tree-sha1 = "3838ff4af3c2cf45a1cb6369a6138329f6362dcc"
+
+    [[natops.download]]
+    sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/natops.tar.gz"
+
+[mitespresso]
+git-tree-sha1 = "dbe79220f8352a25e5a79cdc3ede3c58abf9038f"
+
+    [[mitespresso.download]]
+    sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/binaries/minimizers/mitespresso.tar.gz"
+
+[epilepsy]
+git-tree-sha1 = "d4a858762ece9feb48b15831b93069f638ac610a"
+
+    [[epilepsy.download]]
+    sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/epilepsy.tar.gz"
+
+[hugadb]
+git-tree-sha1 = "71ea5aba1c2022f79cf805831f64d3e1a46d511a"
+
+    [[hugadb.download]]
+    sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/hugadb.tar.gz"
diff --git a/src/artifacts/artifacts.jl b/src/artifacts/artifacts.jl
new file mode 100644
index 0000000..7d3b057
--- /dev/null
+++ b/src/artifacts/artifacts.jl
@@ -0,0 +1,75 @@
+module Artifacts
+
+using CategoricalArrays
+using CodecZlib
+using DataFrames
+using Downloads
+using Pkg.Artifacts
+using SHA
+using Tar
+using TOML
+using ZipFile
+
+using DataStructures: OrderedDict
+
+# Global variables related to the package
+
+# Path to the Artifacts.toml configuration file
+const ARTIFACTS_PATH = joinpath(@__DIR__, "Artifacts.toml")
+
+# URLs from which to download the default artifacts of SoleData
+ARTIFACT_URLS = [
+    # binaries
+    "https://github.com/aclai-lab/Artifacts/raw/main/sole/binaries/generic/abc.tar.gz",
+    "https://github.com/aclai-lab/Artifacts/raw/main/sole/binaries/minimizers/mitespresso.tar.gz",
+
+    # datasets
+    "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/epilepsy.tar.gz",
+    "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/hugadb.tar.gz",
+    "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/libras.tar.gz",
+    "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/natops.tar.gz",
+]
+
+include("utils/artifact-utils.jl")
+
+export fillartifacts
+
+export AbstractLoader, AbstractLoaderDataset, AbstractLoaderBinary
+export name, path, url
+export load
+export extract_artifact
+
+export classes, variablenames
+
+# general loading logic, common to any AbstractLoader
+include("loaders/loaders.jl")
+
+
+# Binaries
+
+export ABCLoader
+include("loaders/abc-loader.jl")
+
+export MITESPRESSOLoader
+include("loaders/mitespresso-loader.jl")
+
+
+# Datasets
+
+export load_arff_dataset, parseARFF, fix_dataframe
+include("utils/dataset-utils.jl")
+
+export EpilepsyLoader
+include("loaders/epilepsy-loader.jl")
+
+export HuGaDBLoader
+include("loaders/hugadb-loader.jl")
+
+export LibrasLoader
+include("loaders/libras-loader.jl")
+
+export NatopsLoader
+include("loaders/natops-loader.jl")
+
+
+end
diff --git a/src/artifacts/loaders/abc-loader.jl b/src/artifacts/loaders/abc-loader.jl
new file mode 100644
index 0000000..cd58431
--- /dev/null
+++ b/src/artifacts/loaders/abc-loader.jl
@@ -0,0 +1,23 @@
+struct ABCLoader <: AbstractLoaderBinary
+    name::String    # Name of the artifact in Artifacts.toml
+    url::String     # Fallback download URL
+
+    # Internal constructor with default values
+    ABCLoader() = new(
+        "abc",
+        "https://github.com/berkeley-abc/abc/archive/refs/heads/master.tar.gz"
+    )
+end
+
+function load(al::ABCLoader)
+    artifact_path = ensure_artifact_installed(name(al), ARTIFACTS_PATH)
+
+    # Check whether the tar.gz file needs extraction
+    tarfile = joinpath(artifact_path, "$(name(al)).tar.gz")
+    if isfile(tarfile)
+        extracted_path = extract_artifact(artifact_path, name(al))
+        return joinpath(extracted_path, "$(name(al))")
+    else
+        return joinpath(artifact_path, "$(name(al))")
+    end
+end
diff --git a/src/artifacts/loaders/epilepsy-loader.jl b/src/artifacts/loaders/epilepsy-loader.jl
new file mode 100644
index 0000000..2146100
--- /dev/null
+++ b/src/artifacts/loaders/epilepsy-loader.jl
@@ -0,0 +1,42 @@
+struct EpilepsyLoader <: AbstractLoaderDataset
+    name::String
+    url::String
+
+    EpilepsyLoader() = new(
+        "epilepsy",
+        ""
+    )
+end
+
+"""
+    load(l::EpilepsyLoader)
+
+Load the Epilepsy dataset as specified by the [`EpilepsyLoader`](@ref).
+"""
+function load(l::EpilepsyLoader)
+    artifact_path = ensure_artifact_installed(name(l), ARTIFACTS_PATH)
+
+    dirpath = begin
+        tarfile = joinpath(artifact_path, "$(name(l)).tar.gz")
+        if isfile(tarfile)
+            extracted_path = extract_artifact(artifact_path, name(l))
+            joinpath(extracted_path, "$(name(l))")
+        else
+            joinpath(artifact_path, "$(name(l))")
+        end
+    end
+
+    (X_train, y_train), (X_test, y_test) =
+        (
+            read("$(dirpath)/epilepsy_TRAIN.arff", String) |> parseARFF,
+            read("$(dirpath)/epilepsy_TEST.arff", String) |> parseARFF,
+        )
+
+    X_train = fix_dataframe(X_train)
+    X_test = fix_dataframe(X_test)
+
+    y_train = categorical(y_train)
+    y_test = categorical(y_test)
+
+    return vcat(X_train, X_test), vcat(y_train, y_test)
+end
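A hypothetical session using the two kinds of loaders seen so far (a sketch; it assumes the `abc` and `epilepsy` artifacts are reachable):

```julia
using SoleData.Artifacts

# Binary artifact: download (if needed), extract, and return the path
# to the extracted `abc` directory.
abc_path = load(ABCLoader())

# Dataset artifact: return the data and the labels, with the train and
# test portions vertically concatenated.
X, y = load(EpilepsyLoader())
size(X, 1) == length(y)   # one label per instance
```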
diff --git a/src/artifacts/loaders/hugadb-loader.jl b/src/artifacts/loaders/hugadb-loader.jl
new file mode 100644
index 0000000..27f818d
--- /dev/null
+++ b/src/artifacts/loaders/hugadb-loader.jl
@@ -0,0 +1,220 @@
+struct HuGaDBLoader <: AbstractLoaderDataset
+    name::String
+    url::String
+
+    # non-corrupted files, suitable for performing an experiment on the Walking and
+    # Running classes; see https://github.com/romanchereshnev/HuGaDB
+    expfiles::Vector{String}
+
+    # column names
+    variablenames::Vector{String}
+
+    # lambda function to convert each activity name to a specific ID
+    activity2id::Function
+
+    HuGaDBLoader() = new(
+        "hugadb",
+        "",
+
+        # expfiles
+        [
+            "HuGaDB_v2_various_02_00.txt", "HuGaDB_v2_various_02_01.txt",
+            "HuGaDB_v2_various_02_02.txt", "HuGaDB_v2_various_02_03.txt",
+            "HuGaDB_v2_various_02_04.txt", "HuGaDB_v2_various_02_05.txt",
+            "HuGaDB_v2_various_02_06.txt", "HuGaDB_v2_various_06_00.txt",
+            "HuGaDB_v2_various_06_01.txt", "HuGaDB_v2_various_06_02.txt",
+            "HuGaDB_v2_various_06_03.txt", "HuGaDB_v2_various_06_04.txt",
+            "HuGaDB_v2_various_06_05.txt", "HuGaDB_v2_various_06_06.txt",
+            "HuGaDB_v2_various_06_07.txt", "HuGaDB_v2_various_06_08.txt",
+            "HuGaDB_v2_various_06_09.txt", "HuGaDB_v2_various_06_10.txt",
+            "HuGaDB_v2_various_06_11.txt", "HuGaDB_v2_various_06_12.txt",
+            "HuGaDB_v2_various_06_13.txt", "HuGaDB_v2_various_06_14.txt",
+            "HuGaDB_v2_various_06_15.txt", "HuGaDB_v2_various_06_16.txt",
+            "HuGaDB_v2_various_06_17.txt", "HuGaDB_v2_various_06_18.txt",
+            "HuGaDB_v2_various_06_19.txt", "HuGaDB_v2_various_06_20.txt",
+            "HuGaDB_v2_various_06_21.txt", "HuGaDB_v2_various_06_22.txt",
+            "HuGaDB_v2_various_06_23.txt", "HuGaDB_v2_various_06_24.txt",
+            "HuGaDB_v2_various_06_25.txt", "HuGaDB_v2_various_06_26.txt",
+            "HuGaDB_v2_various_06_27.txt"
+        ],
+
+        # variablenames
+        [
+            "acc_rf_x","acc_rf_y","acc_rf_z",
+            "gyro_rf_x","gyro_rf_y","gyro_rf_z",
+            "acc_rs_x","acc_rs_y","acc_rs_z",
+            "gyro_rs_x","gyro_rs_y","gyro_rs_z",
+            "acc_rt_x","acc_rt_y","acc_rt_z",
+            "gyro_rt_x","gyro_rt_y","gyro_rt_z",
+            "acc_lf_x","acc_lf_y","acc_lf_z",
+            "gyro_lf_x","gyro_lf_y","gyro_lf_z",
+            "acc_ls_x","acc_ls_y","acc_ls_z",
+            "gyro_ls_x","gyro_ls_y","gyro_ls_z",
+            "acc_lt_x","acc_lt_y","acc_lt_z",
+            "gyro_lt_x","gyro_lt_y","gyro_lt_z",
+            "EMG_r","EMG_l","act",
+        ],
+
+        # lambda function to assign a numerical ID to each activity/class
+        class -> findfirst(activity -> class == activity, [
+            "walking", "running", "going_up", "going_down", "sitting", "sitting_down",
+            "standing_up", "standing", "bicycling", "elevator_up", "elevator_down",
+            "sitting_car"
+        ])
+    )
+end
+
+"""
+    expfiles(l::HuGaDBLoader)
+
+Retrieve the specific files to be loaded.
+"""
+expfiles(l::HuGaDBLoader) = l.expfiles
+
+"""
+    variablenames(l::HuGaDBLoader) = l.variablenames
+
+Retrieve all the names of the columns of the HuGaDB dataset.
+"""
+variablenames(l::HuGaDBLoader) = l.variablenames
+
+"""
+    activity2id(l::HuGaDBLoader, class::String) = l.activity2id(class)
+
+Convert an activity name to its numerical ID.
+For example, convert "walking" to 1.
+"""
+activity2id(l::HuGaDBLoader, class::String) = l.activity2id(class)
+
+
+# load a single instance of the HuGaDB dataset;
+# driver code is load(::HuGaDBLoader)
+function _load_hugadb(
+    dirpath::S,
+    filename::S;
+    variablenames::Vector{S},
+    activity2id::Function
+) where {S<:AbstractString}
+    filepath = joinpath(dirpath, filename)
+
+    # e.g. open("test/data/HuGaDB/HuGaDB_v2_various_01_00.txt", "r")
+    f = open(filepath, "r")
+
+    # get the activities recorded for the performer specified in `filename`
+    activities = split(readline(f), " ")[1:end-1]
+    activities[1] = activities[1][11:end] # remove the initial "#Activity\t"
+
+    activity_ids = [activity2id(activity) for activity in activities]
+
+    # ignore the #ActivityID row (we only keep the string version of the integer IDs)
+    readline(f)
+
+    # ignore the #Date row
+    readline(f)
+
+    # ignore the variable names, as we already have them in `variablenames`
+    readline(f)
+
+    _substr2float = x -> parse(Float64, x)
+    lines = [_substr2float.(split(line, "\t")) for line in eachline(f)]
+
+    close(f)
+
+    X = DataFrame([
+            # get the i-th element from each line, and concatenate them together
+            [[line[i] for line in lines]]
+            for i in 1:length(variablenames)
+        ], variablenames)
+
+    # variablenames is returned to help the user, for example to let them know our
+    # default values if they did not provide any.
+    return X, (activities, activity_ids), variablenames
+end
+
+
+"""
+    load(l::HuGaDBLoader)
+
+Load the HuGaDB dataset, as specified by the [`HuGaDBLoader`](@ref);
+in particular, load the files returned by [`expfiles`](@ref).
+
+See also [`expfiles(l::HuGaDBLoader)`](@ref), [`filter_hugadb`](@ref).
+"""
+function load(l::HuGaDBLoader)
+    artifact_path = ensure_artifact_installed(name(l), ARTIFACTS_PATH)
+
+    dirpath = begin
+        tarfile = joinpath(artifact_path, "$(name(l)).tar.gz")
+        if isfile(tarfile)
+            extracted_path = extract_artifact(artifact_path, name(l))
+            joinpath(extracted_path, "$(name(l))")
+        else
+            joinpath(artifact_path, "$(name(l))")
+        end
+    end
+
+    # leverage the first instance to get, once and for all, the common outputs such as
+    # the list of activities (as string/id pairs) and the names of each column.
+    X, (activity_strings, activity_ids), _variablenames = _load_hugadb(
+        dirpath,
+        expfiles(l)[1];
+        variablenames=variablenames(l),
+        activity2id=l.activity2id
+    )
+
+    # return the concatenation of the DataFrames obtained by the `_load_hugadb` calls
+    return vcat(X, [
+        _load_hugadb(dirpath, filename;
+            variablenames=variablenames(l), activity2id=l.activity2id) |> first
+        for filename in expfiles(l)[2:end]
+    ]...), (activity_strings, activity_ids), _variablenames
+end
+
+
+# in each instance, isolate the recorded part dedicated to the `id` movement;
+# if no such part exists, discard the instance.
+# The surviving tracks are trimmed to have the same length.
+"""
+    filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)
+
+Utility function related to the `HuGaDB` dataset, here called `X`.
+
+Consider the recordings of each performer (i.e., every instance) and isolate the data
+related to one specific movement id.
+
+# Arguments
+- `X::DataFrame`: the HuGaDB dataset;
+- `id`: any kind of data contained in `labelcolumn` (probably an integer or a string),
+    to discriminate between different movements.
+
+# Keyword Arguments
+- `labelcolumn::Integer=39`: by default, movement ids are stored in this column.
+"""
+function filter_hugadb(X::DataFrame, id; labelcolumn::Integer=39)
+    nvariables = X |> size |> last
+
+    # pick only the instances for which an `id` type of movement is recorded
+    _X = [
+        let indices = findall(x -> x == id, X[instance, labelcolumn])
+            isempty(indices) ? nothing :
+                DataFrame([
+                    [X[instance, variable][indices]]
+                    for variable in 1:nvariables
+                ], names(X))
+        end
+        for instance in 1:(X |> size |> first)
+    ] |> x -> filter(!isnothing, x)
+
+    # concatenate all the picked instances in a unique DataFrame
+    _Xfiltered = vcat(_X...)
+
+    # we want to trim every recording to have the same length across instances;
+    # since, when selecting an instance, each column of the latter has the same length,
+    # we arbitrarily choose to compute the minimum length starting from the first column.
+    minimum_length = minimum(length.(_Xfiltered[:, 1]))
+    for instance in 1:(_Xfiltered |> size |> first)
+        for variable in 1:nvariables
+            _Xfiltered[instance, variable] = _Xfiltered[instance, variable][1:minimum_length]
+        end
+    end
+
+    return _Xfiltered
+end
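Putting `load` and `filter_hugadb` together, a hypothetical session (movement ids follow the `activity2id` mapping above, so `1` is "walking"):

```julia
l = HuGaDBLoader()
X, (activities, ids), vars = load(l)

# Keep, for each performer, only the samples recorded while walking (id 1),
# trimmed to a common length across instances.
Xwalk = filter_hugadb(X, 1)
```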
diff --git a/src/artifacts/loaders/libras-loader.jl b/src/artifacts/loaders/libras-loader.jl
new file mode 100644
index 0000000..689926e
--- /dev/null
+++ b/src/artifacts/loaders/libras-loader.jl
@@ -0,0 +1,74 @@
+struct LibrasLoader <: AbstractLoaderDataset
+    name::String
+    url::String
+
+    classes::Vector{String}
+
+    LibrasLoader() = new(
+        "libras",
+        "",
+
+        # class names
+        [
+            "curved_swing",
+            "horizontal_swing",
+            "vertical_swing",
+            "anti_clockwise_arc",
+            "clockwise_arc",
+            "circle",
+            "horizontal_straight_line",
+            "vertical_straight_line",
+            "horizontal_zigzag",
+            "vertical_zigzag",
+            "horizontal_wavy",
+            "vertical_wavy",
+            "face_up_curve",
+            "face_down_curve",
+            "tremble"
+        ]
+    )
+end
+
+"""
+    classes(l::LibrasLoader) = l.classes
+
+Retrieve the classes of the Libras dataset.
+"""
+classes(l::LibrasLoader) = l.classes
+
+
+"""
+    load(l::LibrasLoader)
+
+Load the Libras dataset as specified by the [`LibrasLoader`](@ref).
+"""
+function load(l::LibrasLoader)
+    artifact_path = ensure_artifact_installed(name(l), ARTIFACTS_PATH)
+
+    dirpath = begin
+        tarfile = joinpath(artifact_path, "$(name(l)).tar.gz")
+        if isfile(tarfile)
+            extracted_path = extract_artifact(artifact_path, name(l))
+            joinpath(extracted_path, "$(name(l))")
+        else
+            joinpath(artifact_path, "$(name(l))")
+        end
+    end
+
+    (X_train, y_train), (X_test, y_test) =
+        (
+            read("$(dirpath)/libras_TRAIN.arff", String) |> parseARFF,
+            read("$(dirpath)/libras_TEST.arff", String) |> parseARFF,
+        )
+
+    # convert from .arff class codes to strings
+    fix_class_names(y) = classes(l)[round(Int, parse(Float64, y))]
+
+    y_train = map(fix_class_names, y_train)
+    y_test = map(fix_class_names, y_test)
+
+    y_train = categorical(y_train)
+    y_test = categorical(y_test)
+
+    return vcat(X_train, X_test), vcat(y_train, y_test)
+end
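Since the ARFF files encode classes as (1-based) float codes, the conversion above maps them back to names; for example:

```julia
l = LibrasLoader()
fix_class_names(y) = classes(l)[round(Int, parse(Float64, y))]
fix_class_names("5.0")   # "clockwise_arc"
```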
diff --git a/src/artifacts/loaders/loaders.jl b/src/artifacts/loaders/loaders.jl
new file mode 100644
index 0000000..ceb44d4
--- /dev/null
+++ b/src/artifacts/loaders/loaders.jl
@@ -0,0 +1,141 @@
+"""
+Abstract type representing a generic configuration for loading an artifact resource.
+
+# Interface
+Every structure subtyping `AbstractLoader` must implement the following interface.
+- `name(obj::DummyConcreteLoader)::String`
+- `url(obj::DummyConcreteLoader)::String`
+- `path(obj::DummyConcreteLoader)::String`
+
+By default, the three methods above return the fields `.name`, `.url` and `.path`,
+respectively.
+
+See [`AbstractLoaderBinary`](@ref) and [`AbstractLoaderDataset`](@ref).
+"""
+abstract type AbstractLoader end
+
+"""
+    name(al::AbstractLoader) = al.name
+
+Return the identifier name of the artifact associated with `al`.
+"""
+name(al::AbstractLoader) = al.name
+
+"""
+    url(al::AbstractLoader) = al.url
+
+Return a fallback url that could be used to download the artifact identified by `al`.
+"""
+url(al::AbstractLoader) = al.url
+
+"""
+    path(al::AbstractLoader) = al.path
+
+Return the local path associated with the artifact identified by `al`.
+"""
+path(al::AbstractLoader) = al.path
+
+"""
+    abstract type AbstractLoaderBinary <: AbstractLoader end
+
+Specific [`AbstractLoader`](@ref) for binaries.
+"""
+abstract type AbstractLoaderBinary <: AbstractLoader end
+
+"""
+    abstract type AbstractLoaderDataset <: AbstractLoader end
+
+Specific [`AbstractLoader`](@ref) for datasets.
+"""
+abstract type AbstractLoaderDataset <: AbstractLoader end
+
+"""
+    Artifacts.load(::T) where {T}
+
+Implement this method to define the loading logic for your custom artifact.
+
+!!! warning
+    When implementing this method for an [`AbstractLoader`](@ref), be sure that the
+    [`name`](@ref) getter for that particular loader returns the same name as the
+    resource you want to load.
+
+See [`AbstractLoader`](@ref).
+
+See also the implementations of [`load(al::ABCLoader)`](@ref) and
+[`load(al::MITESPRESSOLoader)`](@ref).
+"""
+load(::T) where {T} = throw(ArgumentError("Invalid method for type $T"))
+
+
+# Extract a tar.gz file in the artifact directory (cross-platform);
+# see extract_artifact.
+function _extract_artifact(path::String, name::String; silent::Bool=true)
+    tarfile = joinpath(path, "$(name).tar.gz")
+
+    if !isfile(tarfile)
+        error("Artifact file $(tarfile) not found")
+    end
+
+    # Create a temporary directory for extraction
+    extract_dir = joinpath(path, "extracted")
+
+    # Remove the existing extraction directory if it exists
+    if isdir(extract_dir)
+        rm(extract_dir; recursive=true)
+    end
+
+    # Create the extraction directory
+    mkpath(extract_dir)
+
+    # Extract the tar.gz file using Julia's cross-platform libraries
+    try
+        open(tarfile, "r") do tar_gz
+            tar_stream = GzipDecompressorStream(tar_gz)
+            Tar.extract(tar_stream, extract_dir)
+        end
+        silent || println("Successfully extracted $(name).tar.gz to $(extract_dir)")
+
+        # Remove the original tar.gz file to save space (optional)
+        # rm(tarfile)
+
+    catch e
+        # Clean up on error
+        if isdir(extract_dir)
+            rm(extract_dir; recursive=true)
+        end
+        error("Failed to extract $(tarfile): $(e)")
+    end
+
+    return extract_dir
+end
+
+"""
+    extract_artifact(loader::AbstractLoader)
+    extract_artifact(path::String, name::String)
+
+Given an [`AbstractLoader`](@ref), extract its .tar.gz archive (if necessary).
+
+!!! warning
+    This method expects the resource to be saved as a .tar.gz archive.
+
+See [`AbstractLoader`](@ref).
+
+See also (the implementation of) [`load(al::ABCLoader)`](@ref) or
+[`load(al::MITESPRESSOLoader)`](@ref).
+"""
+function extract_artifact(loader::AbstractLoader)
+    extract_artifact(path(loader), name(loader))
+end
+function extract_artifact(path::String, name::String; silent::Bool=true)
+    extract_dir = joinpath(path, "extracted")
+
+    # If the extraction directory already exists and is not empty, assume the
+    # extraction has already been done
+    if isdir(extract_dir) && !isempty(readdir(extract_dir))
+        silent || println("Artifact $(name) already extracted at $(extract_dir)")
+        return extract_dir
+    else
+        # Otherwise, proceed with extraction
+        return _extract_artifact(path, name; silent=silent)
+    end
+end
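To illustrate the interface, here is a sketch of a loader for a fictional `foo` binary (all names are made up; `foo` would also need an entry in Artifacts.toml):

```julia
struct FooLoader <: AbstractLoaderBinary
    name::String
    url::String

    FooLoader() = new("foo", "https://example.com/foo.tar.gz")
end

function load(al::FooLoader)
    artifact_path = ensure_artifact_installed(name(al), ARTIFACTS_PATH)

    # extract on demand, mirroring ABCLoader and MITESPRESSOLoader
    tarfile = joinpath(artifact_path, "$(name(al)).tar.gz")
    if isfile(tarfile)
        return joinpath(extract_artifact(artifact_path, name(al)), name(al))
    else
        return joinpath(artifact_path, name(al))
    end
end
```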
diff --git a/src/artifacts/loaders/mitespresso-loader.jl b/src/artifacts/loaders/mitespresso-loader.jl
new file mode 100644
index 0000000..7937c5e
--- /dev/null
+++ b/src/artifacts/loaders/mitespresso-loader.jl
@@ -0,0 +1,26 @@
+"""
+Loading configuration for the MITESPRESSO minimizer.
+"""
+struct MITESPRESSOLoader <: AbstractLoaderBinary
+    name::String    # Name of the artifact in Artifacts.toml
+    url::String     # Fallback download URL
+
+    # Internal constructor with default values
+    MITESPRESSOLoader() = new(
+        "mitespresso",
+        "https://jackhack96.github.io/logic-synthesis/espresso.html"
+    )
+end
+
+function load(al::MITESPRESSOLoader)
+    artifact_path = ensure_artifact_installed(name(al), ARTIFACTS_PATH)
+
+    # Check whether the tar.gz file needs extraction
+    tarfile = joinpath(artifact_path, "$(name(al)).tar.gz")
+    if isfile(tarfile)
+        extracted_path = extract_artifact(artifact_path, name(al))
+        return joinpath(extracted_path, "$(name(al))")
+    else
+        return joinpath(artifact_path, "$(name(al))")
+    end
+end
diff --git a/src/artifacts/loaders/natops-loader.jl b/src/artifacts/loaders/natops-loader.jl
new file mode 100644
index 0000000..7408c2c
--- /dev/null
+++ b/src/artifacts/loaders/natops-loader.jl
@@ -0,0 +1,87 @@
+struct NatopsLoader <: AbstractLoaderDataset
+    name::String
+    url::String
+
+    variablenames::Vector{String}
+    classes::Vector{String}
+
+    NatopsLoader() = new(
+        "natops",
+        "",
+
+        # variablenames
+        [
+            "X[Hand tip l]", "Y[Hand tip l]", "Z[Hand tip l]",
+            "X[Hand tip r]", "Y[Hand tip r]", "Z[Hand tip r]",
+            "X[Elbow l]", "Y[Elbow l]", "Z[Elbow l]",
+            "X[Elbow r]", "Y[Elbow r]", "Z[Elbow r]",
+            "X[Wrist l]", "Y[Wrist l]", "Z[Wrist l]",
+            "X[Wrist r]", "Y[Wrist r]", "Z[Wrist r]",
+            "X[Thumb l]", "Y[Thumb l]", "Z[Thumb l]",
+            "X[Thumb r]", "Y[Thumb r]", "Z[Thumb r]",
+        ],
+
+        # classes
+        [
+            "I have command",
+            "All clear",
+            "Not clear",
+            "Spread wings",
+            "Fold wings",
+            "Lock wings",
+        ]
+    )
+end
+
+"""
+    variablenames(l::NatopsLoader) = l.variablenames
+
+Retrieve the variable names of the NATOPS dataset.
+"""
+variablenames(l::NatopsLoader) = l.variablenames
+
+"""
+    classes(l::NatopsLoader) = l.classes
+
+Retrieve the classes of the NATOPS dataset.
+"""
+classes(l::NatopsLoader) = l.classes
+
+"""
+    load(l::NatopsLoader)
+
+Load the NATOPS dataset as specified by the [`NatopsLoader`](@ref).
+"""
+function load(l::NatopsLoader)
+    artifact_path = ensure_artifact_installed(name(l), ARTIFACTS_PATH)
+
+    dirpath = begin
+        tarfile = joinpath(artifact_path, "$(name(l)).tar.gz")
+
+        if isfile(tarfile)
+            extracted_path = extract_artifact(artifact_path, name(l))
+            joinpath(extracted_path, "$(name(l))")
+        else
+            joinpath(artifact_path, "$(name(l))")
+        end
+    end
+
+    (X_train, y_train), (X_test, y_test) =
+        (
+            read("$(dirpath)/natops_TRAIN.arff", String) |> parseARFF,
+            read("$(dirpath)/natops_TEST.arff", String) |> parseARFF,
+        )
+
+    fix_class_names(y) = classes(l)[round(Int, parse(Float64, y))]
+
+    X_train = fix_dataframe(X_train, variablenames(l))
+    X_test = fix_dataframe(X_test, variablenames(l))
+
+    y_train = map(fix_class_names, y_train)
+    y_test = map(fix_class_names, y_test)
+
+    y_train = categorical(y_train)
+    y_test = categorical(y_test)
+
+    return vcat(X_train, X_test), vcat(y_train, y_test)
+end
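For instance (a sketch, assuming the `natops` artifact is reachable):

```julia
using CategoricalArrays
using SoleData.Artifacts

X, y = load(NatopsLoader())   # train and test portions concatenated
levels(y)                     # the six gesture classes listed above
```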
diff --git a/src/artifacts/utils/artifact-utils.jl b/src/artifacts/utils/artifact-utils.jl
new file mode 100644
index 0000000..f0285c0
--- /dev/null
+++ b/src/artifacts/utils/artifact-utils.jl
@@ -0,0 +1,89 @@
+# This file contains all the methods necessary to automatically generate a functioning
+# Artifacts.toml.
+#
+# For more information see the official documentation at:
+# https://pkgdocs.julialang.org/v1/artifacts/#The-Pkg.Artifacts-API
+# https://pkgdocs.julialang.org/v1/api/#Artifacts-Reference
+
+
+"""
+    fillartifacts()
+    fillartifacts(URLS::Vector{String})
+    fillartifacts(url::String)
+
+Completely automate the insertion of a new resource into the Artifacts.toml file.
+
+After the execution of the following command, the Artifacts.toml is updated as below;
+note that the new entry is named using the lowercase version of the resource name
+provided.
+
+```julia
+julia> fillartifacts("https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/NATOPS.tar.gz")
+```
+
+```toml
+[natops]
+git-tree-sha1 = "87856b9b41a235ec57f36d1029d0467876660e6e"
+
+    [[natops.download]]
+    url = "https://github.com/aclai-lab/Artifacts/raw/main/sole/datasets/NATOPS.tar.gz"
+    sha256 = "2586be714b4112b874d050dd3f873e12156d1921e5ded9bd8f66bf5bf8d9c2d1"
+```
+"""
+function fillartifacts()
+    fillartifacts(ARTIFACT_URLS)
+end
+function fillartifacts(URLS::Vector{String})
+    map(url -> fillartifacts(url), URLS)
+    return # to avoid returning a vector of nothing
+end
+function fillartifacts(url::String)
+    filename_with_extension = split(url, "/")[end]
+    filename_no_extension = split(filename_with_extension, ".")[1] |> lowercase
+
+    # see https://pkgdocs.julialang.org/v1/artifacts/#The-Pkg.Artifacts-API
+    # create_artifact expects a function F as its argument; the argument passed to F is
+    # a temporary directory in which we must place our files; Pkg.Artifacts will then
+    # move them to a specific directory in the .julia/artifacts folder.
+
+    # download the file to a temporary location to compute its SHA256;
+    # WARNING: the file is created here... and then copied from here with cp!
+    temp_file = tempname()
+    Downloads.download(url, temp_file)
+    file_sha256 = bytes2hex(open(sha256, temp_file))
+
+    # create the artifact
+    tree_sha1 = create_artifact(
+        tmp_dir -> cp(temp_file, joinpath(tmp_dir, filename_with_extension)))
+
+    # now we can clear the temporary file
+    rm(temp_file)
+
+    # and bind the artifact to let the user call the macro artifact"name"
+    bind_artifact!(ARTIFACTS_PATH, filename_no_extension, tree_sha1; force=true)
+
+    # proceed to update the Artifacts.toml
+    content = TOML.parsefile(ARTIFACTS_PATH)
+    open(ARTIFACTS_PATH, "w") do tomlfile
+        # e.g.: Dict{String, Any} with 1 entry:
+        #   "natops" => Dict{String, Any}(
+        #       "git-tree-sha1"=>"87856b9b41a235ec57f36d1029d0467876660e6e",
+        #       "download"=>Any[Dict{String, Any}("sha256"=>"2586be714b4…
+
+        if "download" in keys(content[filename_no_extension])
+            @warn "Entry $(filename_no_extension) already exists."
+        else
+            # we insert a vector of possible new entries; this method automatically
+            # infers the simplest one, but it would be possible to add many sources;
+            # to do so, we could iterate over some kwargs here.
+            new_entry = Dict{String,Any}()
+            content[filename_no_extension]["download"] = [new_entry]
+            new_entry["sha256"] = file_sha256   # use the hash of the original file
+            new_entry["url"] = url
+        end
+
+        redirect_stdout(tomlfile) do
+            TOML.print(content)
+        end
+    end
+end
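Once an entry is bound, consumers can resolve it lazily through the standard artifact machinery; for example (a sketch; the macro resolves against the Artifacts.toml next to the calling file):

```julia
using Pkg.Artifacts

# Downloaded on first use; the artifact directory contains the archive itself.
natops_dir = artifact"natops"
readdir(natops_dir)   # ["natops.tar.gz"]
```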
diff --git a/src/example-datasets.jl b/src/artifacts/utils/dataset-utils.jl
similarity index 96%
rename from src/example-datasets.jl
rename to src/artifacts/utils/dataset-utils.jl
index 3c2387e..1146819 100644
--- a/src/example-datasets.jl
+++ b/src/artifacts/utils/dataset-utils.jl
@@ -1,3 +1,7 @@
+# This file is almost deprecated, with an exception for the parseARFF function;
+# TODO: when the .Artifacts module is completed, forward all the calls to this
+# load_arff_dataset to the proper loader.
+
 using HTTP
 using ZipFile
 using DataFrames
@@ -163,11 +167,9 @@ const _ARFF_ESC = UInt8('\\')
 const _ARFF_MISSING = UInt8('?')
 const _ARFF_RELMARK = UInt8('\'')
 
-# function readARFF(path::String)
-#     open(path, "r") do io
-#         df = DataFrame()
-#         classes = String[]
-#         lines = readlines(io) ...
+"""
+Parse an ARFF-formatted string into a `DataFrame` and a vector of class labels.
+"""
 function parseARFF(arffstring::String)
     df = DataFrame()
     classes = String[]
@@ -230,6 +232,9 @@ function parseARFF(arffstring::String)
     return df[p, :], classes[p]
 end
 
+"""
+Check and normalize a `DataFrame` parsed from ARFF, optionally renaming its columns to `variable_names`.
+"""
 function fix_dataframe(df, variable_names = nothing)
     s = unique(size.(df[:,1]))
     @assert length(s) == 1 "$(s)"
diff --git a/test/arm-thesis.jl b/test/arm-thesis.jl
index 2b158a8..d96917f 100644
--- a/test/arm-thesis.jl
+++ b/test/arm-thesis.jl
@@ -17,8 +17,8 @@ function _load_NATOPS(
 )
     (X_train, y_train), (X_test, y_test) =
         (
-            read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> SoleData.parseARFF,
-            read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> SoleData.parseARFF,
+            read("$(dirpath)/$(fileprefix)_TEST.arff", String) |> parseARFF,
+            read("$(dirpath)/$(fileprefix)_TRAIN.arff", String) |> parseARFF,
         )
     variablenames = ["X[Hand tip l]", "Y[Hand tip l]", "Z[Hand tip l]", "X[Hand tip r]", "Y[Hand tip r]", "Z[Hand tip r]", "X[Elbow l]", "Y[Elbow l]", "Z[Elbow l]", "X[Elbow r]", "Y[Elbow r]", "Z[Elbow r]", "X[Wrist l]", "Y[Wrist l]", "Z[Wrist l]", "X[Wrist r]", "Y[Wrist r]", "Z[Wrist r]", "X[Thumb l]", "Y[Thumb l]", "Z[Thumb l]", "X[Thumb r]", "Y[Thumb r]", "Z[Thumb r]"]
diff --git a/test/artifacts.jl b/test/artifacts.jl
new file mode 100644
index 0000000..7457626
--- /dev/null
+++ b/test/artifacts.jl
@@ -0,0 +1,32 @@
+using SoleData.Artifacts
+
+# fill the Artifacts.toml file
+@test_nowarn fillartifacts()
+
+
+# Loader list
+abcloader = ABCLoader()
+mitloader = MITESPRESSOLoader()
+epilepsyloader = EpilepsyLoader()
+hugadbloader = HuGaDBLoader()
+librasloader = LibrasLoader()
+natopsloader = NatopsLoader()
+
+LOADERS = [
+    abcloader,
+    mitloader,
+    epilepsyloader,
+    hugadbloader,
+    librasloader,
+    natopsloader,
+]
+
+
+# Common logic
+for l in LOADERS
+    printstyled("Loading $(name(l))\n", color=:green)
+
+    # this should be enough to also test the specific getters of each loader since,
+    # if they exist, they are called by the loading logic.
+    @test_nowarn load(l)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 5bc00aa..2fcf0d8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -31,21 +31,23 @@ test_suites = [
     ("Logisets", [ "logisets.jl", ]),
     ("Propositional Logisets", [ "propositional-logisets.jl", ]),
     ("Memosets", [ "memosets.jl", ]),
-    # 
+    #
    ("Cube to Logiset", [ "cube2logiset.jl", ]),
    ("DataFrame to Logiset", [ "dataframe2logiset.jl", ]),
    ("MultiLogisets", [ "multilogisets.jl", ]),
-    # 
+    #
    ("Conditions", [ "range-scalar-condition.jl", ]),
    ("Alphabets", [ "scalar-alphabet.jl", "discretization.jl"]),
    ("Features", [ "patchedfeatures.jl"]),
-    # 
+    #
    ("MLJ", [ "MLJ.jl", ]),
    ("PLA", [ "pla.jl", ]),
    ("Minify", ["minify.jl"]),
    ("Parse", ["parse.jl"]),
    ("Example Datasets", [ "example-datasets.jl", ]),
    ("Variable Named Features", [ "var-features.jl", ]),
+    #
+    ("Artifacts", ["artifacts.jl"]),
    ("Simplification", [ "simplification.jl", ]),
 ]