|
| 1 | + |
| 2 | +using FillArrays |
| 3 | +using CategoricalArrays |
| 4 | + |
| 5 | +doc_supervised_ml = """ |
| 6 | + const CLabel = Union{String,Integer,CategoricalValue} |
| 7 | + const RLabel = AbstractFloat |
| 8 | + const Label = Union{CLabel,RLabel} |
| 9 | +
|
| 10 | +Types for supervised machine learning labels (classification and regression). |
| 11 | +""" |
| 12 | + |
| 13 | +"""$(doc_supervised_ml)""" |
| 14 | +const CLabel = Union{String,Integer,CategoricalValue} |
| 15 | +"""$(doc_supervised_ml)""" |
| 16 | +const RLabel = AbstractFloat |
| 17 | +"""$(doc_supervised_ml)""" |
| 18 | +const Label = Union{CLabel,RLabel} |
| 19 | + |
| 20 | +# Raw labels |
| 21 | +const _CLabel = Integer # (classification labels are internally represented as integers) |
| 22 | +const _Label = Union{_CLabel,RLabel} |
| 23 | + |
| 24 | +############################################################################################ |
| 25 | + |
# Convert a list of labels to categorical form
"""
    get_categorical_form(Y::AbstractVector)

Encode a label vector `Y` into categorical (integer-coded) form.

Return a tuple `(class_names, _Y)` where `class_names` collects the unique
values of `Y` in order of first appearance, and `_Y` is a `Vector{Int64}`
such that `class_names[_Y[i]] == Y[i]` for every `i`.
"""
Base.@propagate_inbounds @inline function get_categorical_form(Y::AbstractVector)
    class_names = unique(Y)

    # Map each class to its index in `class_names`.
    # (Note: a plain loop with `@simd` over Dict insertion, as previously used,
    # is meaningless — Dict writes cannot be vectorized.)
    dict = Dict{eltype(Y),Int64}(cn => i for (i, cn) in enumerate(class_names))

    # Integer-code every label through the lookup table.
    _Y = Int64[dict[y] for y in Y]

    return class_names, _Y
end
| 42 | + |
| 43 | +############################################################################################ |
| 44 | + |
| 45 | +""" |
| 46 | + bestguess( |
| 47 | + labels::AbstractVector{<:Label}, |
| 48 | + weights::Union{Nothing,AbstractVector} = nothing; |
| 49 | + suppress_parity_warning = false, |
| 50 | + ) |
| 51 | +
|
| 52 | +Return the best guess for a set of labels; that is, the label that best approximates the |
| 53 | +labels provided. For classification labels, this function returns the majority class; for |
| 54 | +regression labels, the average value. |
| 55 | +If no labels are provided, `nothing` is returned. |
| 56 | +The computation can be weighted. |
| 57 | +
|
| 58 | +See also |
| 59 | +[`CLabel`](@ref), |
| 60 | +[`RLabel`](@ref), |
| 61 | +[`Label`](@ref). |
| 62 | +""" |
| 63 | +function bestguess( |
| 64 | + labels::AbstractVector{<:Label}, |
| 65 | + weights::Union{Nothing,AbstractVector} = nothing; |
| 66 | + suppress_parity_warning = false, |
| 67 | +) end |
| 68 | + |
# Classification: (weighted) majority vote.
#
# Returns the class with the highest (weighted) vote count, or `nothing` for an
# empty label vector. Emits a warning on ties unless `suppress_parity_warning`.
function bestguess(
    labels::AbstractVector{<:CLabel},
    weights::Union{Nothing,AbstractVector} = nothing;
    suppress_parity_warning = false,
)
    # Empty input: no guess can be made.
    if isempty(labels)
        return nothing
    end

    counts = begin
        if isnothing(weights)
            countmap(labels)
        else
            # Input validation must not be an `@assert`: asserts may be compiled
            # out at higher optimization levels.
            length(labels) == length(weights) || throw(ArgumentError(
                "Cannot compute best guess with mismatching number of votes " *
                "$(length(labels)) and weights $(length(weights))."))
            countmap(labels, weights)
        end
    end

    # `argmax` on a Dict yields the key with the maximum value (vote count).
    best = argmax(counts)
    # Parity check: warn when more than one class attains the maximum count.
    if !suppress_parity_warning && count(==(counts[best]), values(counts)) > 1
        @warn "Parity encountered in bestguess! " *
            "counts ($(length(labels)) elements): $(counts), " *
            "argmax: $(best), " *
            "max: $(counts[best]) (sum = $(sum(values(counts))))"
    end
    return best
end
| 98 | + |
# Regression: (weighted) mean (or other central tendency measure?)
#
# Returns the (weighted) arithmetic mean of `labels`, or `nothing` for an empty
# label vector. `suppress_parity_warning` is accepted for signature consistency
# with the classification method, but is unused here.
function bestguess(
    labels::AbstractVector{<:RLabel},
    weights::Union{Nothing,AbstractVector} = nothing;
    suppress_parity_warning = false,
)
    # Empty input: no guess can be made.
    if isempty(labels)
        return nothing
    end

    if isnothing(weights)
        return StatsBase.mean(labels)
    else
        # Validate weights length explicitly (mirrors the classification method,
        # which previously checked while this method silently mis-broadcast).
        length(labels) == length(weights) || throw(ArgumentError(
            "Cannot compute best guess with mismatching number of labels " *
            "$(length(labels)) and weights $(length(weights))."))
        return sum(labels .* weights) / sum(weights)
    end
end
| 111 | + |
| 112 | +############################################################################################ |
| 113 | + |
# Default (unit) weights; `Ones` from FillArrays represents them lazily,
# without allocating an actual vector.
"""
    default_weights(n::Integer)::AbstractVector{<:Number}

Return a default weight vector of `n` values.
"""
default_weights(n::Integer) = Ones{Int64}(n)

# Convenience overload: derive the length from a label vector.
default_weights(Y::AbstractVector) = default_weights(length(Y))
| 124 | + |
# Class rebalancing weights (classification case)
"""
    balanced_weights(Y::AbstractVector{L}) where {L<:CLabel}

Return a class-rebalancing weight vector, given a label vector `Y`: each
instance is weighted inversely to its class frequency, so that every class
contributes equally to the total weight. The result is normalized to sum to 1.
If the classes are already balanced, unit weights are returned instead
(note: those sum to `length(Y)`, not 1 — TODO confirm this asymmetry is intended).
"""
function balanced_weights(Y::AbstractVector{L}) where {L<:CLabel}
    class_counts_dict = countmap(Y)
    # Fixed a syntax error here: `unique(values(class_counts)_dict)` did not parse.
    if length(unique(values(class_counts_dict))) == 1 # balanced case
        default_weights(length(Y))
    else
        # Assign weights in such a way that the dataset becomes balanced
        tot = sum(values(class_counts_dict))
        balanced_tot_per_class = tot / length(class_counts_dict)
        weights_map = Dict{L,Float64}(
            class => (balanced_tot_per_class / n_instances)
            for (class, n_instances) in class_counts_dict
        )
        W = [weights_map[y] for y in Y]
        # Normalize so the weights sum to 1.
        W ./ sum(W)
    end
end
| 145 | + |
# Weight-slicing helpers: extract the weight(s) for a subset of instances.
# The `Ones{Int64}` (default-weight) specializations avoid materializing or
# indexing anything; the generic fallbacks handle arbitrary weight vectors.
function slice_weights(::Ones{Int64}, inds::AbstractVector)
    return default_weights(length(inds))
end
function slice_weights(W::Any, inds::AbstractVector)
    # A view, not a copy: slicing must not allocate in hot paths.
    return @view W[inds]
end
slice_weights(::Ones{Int64}, i::Integer) = 1
slice_weights(W::Any, i::Integer) = W[i]
0 commit comments