From d0241cb0ffaa8a3eb1db5c039e52a7b2c4cddc74 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sun, 2 Feb 2025 12:11:33 +0100 Subject: [PATCH 1/4] stratobs --- src/obstransform.jl | 6 +-- src/resample.jl | 95 +++++++++++++++++++++++++++++++++++++++++++++ src/splitobs.jl | 50 ++++++++++++++++++------ src/utils.jl | 4 +- 4 files changed, 137 insertions(+), 18 deletions(-) diff --git a/src/obstransform.jl b/src/obstransform.jl index e5c6168..e2aefbb 100644 --- a/src/obstransform.jl +++ b/src/obstransform.jl @@ -208,11 +208,7 @@ accomplish that, which means that the return value is likely of a different type than `data`. Optionally, a random number generator `rng` can be passed as the -first argument. - -The optional parameter `rng` allows one to specify the -random number generator used for shuffling. This is useful when -reproducible results are desired. +first argument. For this function to work, the type of `data` must implement [`numobs`](@ref) and [`getobs`](@ref). diff --git a/src/resample.jl b/src/resample.jl index ce93f8e..97a4cbf 100644 --- a/src/resample.jl +++ b/src/resample.jl @@ -211,3 +211,98 @@ function undersample(rng::AbstractRNG, data::Tuple; kws...) d, c = undersample(rng, data[1:end-1], data[end]; kws...) return (d..., c) end + + +""" + stratifiedobs([rng], data, p; [shuffle = true]) -> Tuple + +Partition the dataset `data` into multiple disjoint subsets +with size proportional to the value(s) of `p`. +The observations are assignmed to a data subset using stratified sampling without replacement. + +If `p` is a float between 0 and 1, then the return value +will be a tuple with two subsests in which the +first element contains the fraction of observations specified by +`p` and the second element contains the rest. In the following +code the first subset `train` will contain around 70% of the +observations and the second subset `test` the rest. 
The key +difference to [`splitobs`](@ref) is that the class distribution +in `y` will actively be preserved in `train` and `test`. + +```julia +train_data, test_data = stratifiedobs(data, p = 0.7) +``` + +If `p` is a tuple of floats between 0 and 1, then additional subsets will be +created. In this example `train` will contain about 50% of the +observations, `val` will contain around 30%, and `test` the +remaining 20%. + +```julia +train_data, val_data, test_data = stratifiedobs(y, p = (0.5, 0.3)) +``` + +It is also possible to call `stratifiedobs` with multiple data +arguments as tuple, which all must have the same number of total +observations. Note that if `data` is a tuple, then it will be +assumed that the last element of the tuple contains the targets. + +```julia +(X_train, y_train), (X_test, y_test) = stratifiedobs((X, y), p = 0.7) +``` + +The optional parameter `shuffle` determines if the resulting data +subsets should be shuffled. If `false`, then the observations in +the subsets will be grouped together according to their labels. + +```julia +julia> y = ["a", "b", "b", "b", "b", "a"] # 2 imbalanced classes +6-element Array{String,1}: + "a" + "b" + "b" + "b" + "b" + "a" + +julia> train, test = stratifiedobs(y, p = 0.5, shuffle = false) +(String["b","b","a"],String["b","b","a"]) +``` + +The optional argument `rng` allows one to specify the +random number generator used for shuffling. + +For this function to work, the type of `data` must implement +[`numobs`](@ref) and [`getobs`](@ref). + +See also [`undersample`](@ref), [`oversample`](@ref), and [`splitobs`](@ref). 
+""" +function stratifiedobs(data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG) + stratifiedobs(identity, data, p, shuffle, convert(ObsDimension, obsdim), rng) +end + +function stratifiedobs(f, data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG) + stratifiedobs(f, data, p, shuffle, convert(ObsDimension, obsdim), rng) +end + +function stratifiedobs(data, p::AbstractFloat, args...) + stratifiedobs(identity, data, p, args...) +end + +function stratifiedobs(data, p::NTuple{N,AbstractFloat}, args...) where N + stratifiedobs(identity, data, p, args...) +end + +function stratifiedobs(rng, data, p::Union{NTuple,AbstractFloat}, stratified::AbstractVector) + # The given data is always shuffled to qualify as performing + # stratified sampling without replacement. + idxs_groups = group_indices(stratified) + idxs_splits = ntuple(i -> Int[], length(p)+1) + for (lbl, idxs) in idxs_groups + new_idxs_splits = splitobs(rng, idxs, at=p) + for i in 1:length(idxs_splits) + append!(idxs_splits[i], new_idxs_splits[i]) + end + end + return map(idx -> obsview(data, idx), idxs_splits) +end diff --git a/src/splitobs.jl b/src/splitobs.jl index 7061d43..1d3a9a2 100644 --- a/src/splitobs.jl +++ b/src/splitobs.jl @@ -21,9 +21,12 @@ _splitobs(n::Int, at::NTuple{N, <:Integer}) where {N} = _splitobs(n::Int, at ./ _splitobs(n::Int, at::Tuple{}) = (1:n,) + function _splitobs(n::Int, at::AbstractFloat) 0 <= at <= 1 || throw(ArgumentError("the parameter \"at\" must be in interval (0, 1)")) - n1 = clamp(round(Int, at*n), 0, n) + n1 = floor(Int, n * at) + delta = n*at - n1 + # TODO add random rounding (1:n1, n1+1:n) end @@ -38,21 +41,22 @@ function _splitobs(n::Int, at::NTuple{N,<:AbstractFloat}) where N end """ - splitobs([rng], data; at, shuffle=false) -> Tuple + splitobs([rng], data; at, shuffle=false, stratified=nothing) -> Tuple Partition the `data` into two or more subsets. 
-When `at` is a number between 0 and 1, this specifies the proportion in the first subset. - -When `at` is an integer, it specifies the number of observations in the first subset. - -When `at` is a tuple, entries specifies the number or proportion in each subset, except +The argument `at` specifies how to split the data: +- When `at` is a number between 0 and 1, this specifies the proportion in the first subset. +- When `at` is an integer, it specifies the number of observations in the first subset. +- When `at` is a tuple, its entries specify the number or proportion in each subset, except for the last which will contain the remaning observations. The number of returned subsets is `length(at)+1`. If `shuffle=true`, randomly permute the observations before splitting. A random number generator `rng` can be optionally passed as the first argument. +If `stratified` is not `nothing`, it should be an array of labels with the same length as the data. +The observations will be split in a way that the proportion of each label is preserved in each subset. Supports any datatype implementing [`numobs`](@ref). @@ -78,10 +82,34 @@ true """ splitobs(data; kws...) = splitobs(Random.default_rng(), data; kws...) 
-function splitobs(rng::AbstractRNG, data; at, shuffle::Bool=false) +function splitobs(rng::AbstractRNG, data; at, + shuffle::Bool=false, + stratified::Union{Nothing,AbstractVector}=nothing) + n = numobs(data) + at = _normalize_at(n, at) if shuffle - data = shuffleobs(rng, data) + perm = randperm(rng, n) + data = obsview(data, perm) # same as shuffleobs(rng, data), but make it explicit to keep perm end - n = numobs(data) - return map(idx -> obsview(data, idx), splitobs(n; at)) + if stratified !== nothing + @assert length(stratified) == n + if shuffle + stratified = stratified[perm] + end + idxs_groups = group_indices(stratified) + idxs_splits = ntuple(i -> Int[], length(at)+1) + for (lbl, idxs) in idxs_groups + new_idxs_splits = splitobs(idxs; at, shuffle=false) + for i in 1:length(idxs_splits) + append!(idxs_splits[i], new_idxs_splits[i]) + end + end + else + idxs_splits = splitobs(n; at) + end + return map(idxs -> obsview(data, idxs), idxs_splits) end + +_normalize_at(n, at::Integer) = at / n +_normalize_at(n, at::NTuple{N, <:Integer}) where N = at ./ n +_normalize_at(n, at) = at \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index b928135..82ca407 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -380,7 +380,7 @@ function batch(xs::Vector{<:NamedTuple}) all_keys = [sort(collect(keys(x))) for x in xs] ks = all_keys[1] @assert all(==(ks), all_keys) "Cannot batch named tuples with different keys" - NamedTuple(k => batch([x[k] for x in xs]) for k in ks) + return NamedTuple(k => batch([x[k] for x in xs]) for k in ks) end function batch(xs::Vector{<:Dict}) @@ -388,7 +388,7 @@ function batch(xs::Vector{<:Dict}) all_keys = [sort(collect(keys(x))) for x in xs] ks = all_keys[1] @assert all(==(ks), all_keys) "cannot batch dicts with different keys" - Dict(k => batch([x[k] for x in xs]) for k in ks) + return Dict(k => batch([x[k] for x in xs]) for k in ks) end """ From ca8e10a388eeab748cc500a12daaf106b17245aa Mon Sep 17 00:00:00 2001 From: Carlo 
Lucibello Date: Sun, 2 Feb 2025 12:12:41 +0100 Subject: [PATCH 2/4] cleanup --- src/resample.jl | 95 ------------------------------------------------- 1 file changed, 95 deletions(-) diff --git a/src/resample.jl b/src/resample.jl index 97a4cbf..ce93f8e 100644 --- a/src/resample.jl +++ b/src/resample.jl @@ -211,98 +211,3 @@ function undersample(rng::AbstractRNG, data::Tuple; kws...) d, c = undersample(rng, data[1:end-1], data[end]; kws...) return (d..., c) end - - -""" - stratifiedobs([rng], data, p; [shuffle = true]) -> Tuple - -Partition the dataset `data` into multiple disjoint subsets -with size proportional to the value(s) of `p`. -The observations are assignmed to a data subset using stratified sampling without replacement. - -If `p` is a float between 0 and 1, then the return value -will be a tuple with two subsests in which the -first element contains the fraction of observations specified by -`p` and the second element contains the rest. In the following -code the first subset `train` will contain around 70% of the -observations and the second subset `test` the rest. The key -difference to [`splitobs`](@ref) is that the class distribution -in `y` will actively be preserved in `train` and `test`. - -```julia -train_data, test_data = stratifiedobs(data, p = 0.7) -``` - -If `p` is a tuple of floats between 0 and 1, then additional subsets will be -created. In this example `train` will contain about 50% of the -observations, `val` will contain around 30%, and `test` the -remaining 20%. - -```julia -train_data, val_data, test_data = stratifiedobs(y, p = (0.5, 0.3)) -``` - -It is also possible to call `stratifiedobs` with multiple data -arguments as tuple, which all must have the same number of total -observations. Note that if `data` is a tuple, then it will be -assumed that the last element of the tuple contains the targets. 
- -```julia -(X_train, y_train), (X_test, y_test) = stratifiedobs((X, y), p = 0.7) -``` - -The optional parameter `shuffle` determines if the resulting data -subsets should be shuffled. If `false`, then the observations in -the subsets will be grouped together according to their labels. - -```julia -julia> y = ["a", "b", "b", "b", "b", "a"] # 2 imbalanced classes -6-element Array{String,1}: - "a" - "b" - "b" - "b" - "b" - "a" - -julia> train, test = stratifiedobs(y, p = 0.5, shuffle = false) -(String["b","b","a"],String["b","b","a"]) -``` - -The optional argument `rng` allows one to specify the -random number generator used for shuffling. - -For this function to work, the type of `data` must implement -[`numobs`](@ref) and [`getobs`](@ref). - -See also [`undersample`](@ref), [`oversample`](@ref), and [`splitobs`](@ref). -""" -function stratifiedobs(data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG) - stratifiedobs(identity, data, p, shuffle, convert(ObsDimension, obsdim), rng) -end - -function stratifiedobs(f, data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG) - stratifiedobs(f, data, p, shuffle, convert(ObsDimension, obsdim), rng) -end - -function stratifiedobs(data, p::AbstractFloat, args...) - stratifiedobs(identity, data, p, args...) -end - -function stratifiedobs(data, p::NTuple{N,AbstractFloat}, args...) where N - stratifiedobs(identity, data, p, args...) -end - -function stratifiedobs(rng, data, p::Union{NTuple,AbstractFloat}, stratified::AbstractVector) - # The given data is always shuffled to qualify as performing - # stratified sampling without replacement. 
- idxs_groups = group_indices(stratified) - idxs_splits = ntuple(i -> Int[], length(p)+1) - for (lbl, idxs) in idxs_groups - new_idxs_splits = splitobs(rng, idxs, at=p) - for i in 1:length(idxs_splits) - append!(idxs_splits[i], new_idxs_splits[i]) - end - end - return map(idx -> obsview(data, idx), idxs_splits) -end From 3aa4372def291942a06b471f181c49df9419f1ac Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sun, 2 Feb 2025 18:09:51 +0100 Subject: [PATCH 3/4] cleanup --- src/splitobs.jl | 13 +++++-------- test/splitobs.jl | 10 ++++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/splitobs.jl b/src/splitobs.jl index 1d3a9a2..e33dde7 100644 --- a/src/splitobs.jl +++ b/src/splitobs.jl @@ -2,7 +2,7 @@ splitobs(n::Int; at) -> Tuple Compute the indices for two or more disjoint subsets of -the range `1:n` with splits given by `at`. +the range `1:n` with split sizes determined by `at`. # Examples @@ -18,16 +18,12 @@ splitobs(n::Int; at) = _splitobs(n, at) _splitobs(n::Int, at::Integer) = _splitobs(n::Int, at / n) _splitobs(n::Int, at::NTuple{N, <:Integer}) where {N} = _splitobs(n::Int, at ./ n) - _splitobs(n::Int, at::Tuple{}) = (1:n,) - function _splitobs(n::Int, at::AbstractFloat) 0 <= at <= 1 || throw(ArgumentError("the parameter \"at\" must be in interval (0, 1)")) - n1 = floor(Int, n * at) - delta = n*at - n1 - # TODO add random rounding - (1:n1, n1+1:n) + n1 = round(Int, n * at) + return (1:n1, n1+1:n) end function _splitobs(n::Int, at::NTuple{N,<:AbstractFloat}) where N @@ -40,8 +36,9 @@ function _splitobs(n::Int, at::NTuple{N,<:AbstractFloat}) where N return (a, rest...) end + """ - splitobs([rng], data; at, shuffle=false, stratified=nothing) -> Tuple + splitobs([rng,] data; at, shuffle=false, stratified=nothing) -> Tuple Partition the `data` into two or more subsets. 
diff --git a/test/splitobs.jl b/test/splitobs.jl index f5ce335..a5238e4 100644 --- a/test/splitobs.jl +++ b/test/splitobs.jl @@ -90,3 +90,13 @@ end p2, _ = splitobs(rng, data, at=3, shuffle=true) @test p1 == p2 end + +@testset "stratified" begin + data = (a=zeros(Float32, 2, 10), b=[0,0,0,0,1,1,1,1,1,1]) + d1, d2 = splitobs(data, at=0.5, stratified=data.b) + @test d1.b == [0,0,1,1,1] + @test d2.b == [0,0,1,1,1] + d1, d2 = splitobs(data, at=0.25, stratified=data.b) + @test d1.b == [0,1,1] + @test d2.b == [0,0,0,1,1,1,1] +end From 7f4c1a4279420e471105cba5d9355ece3857f851 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sun, 2 Feb 2025 18:14:16 +0100 Subject: [PATCH 4/4] stratified --- src/splitobs.jl | 5 ++++- test/splitobs.jl | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/splitobs.jl b/src/splitobs.jl index e33dde7..9ea6e15 100644 --- a/src/splitobs.jl +++ b/src/splitobs.jl @@ -53,7 +53,7 @@ If `shuffle=true`, randomly permute the observations before splitting. A random number generator `rng` can be optionally passed as the first argument. If `stratified` is not `nothing`, it should be an array of labels with the same length as the data. -The observations will be split in a way that the proportion of each label is preserved in each subset. +The observations will be split in such a way that the proportion of each label is preserved in each subset. Supports any datatype implementing [`numobs`](@ref). @@ -75,6 +75,9 @@ julia> train, test = splitobs((reshape(1.0:100.0, 1, :), 101:200), at=0.7, shuff julia> vec(test[1]) .+ 100 == test[2] true + +julia> splitobs(1:10, at=0.5, stratified=[0,0,0,0,1,1,1,1,1,1]) # 2 zeros and 3 ones in each subset +([1, 2, 5, 6, 7], [3, 4, 8, 9, 10]) ``` """ splitobs(data; kws...) = splitobs(Random.default_rng(), data; kws...) 
diff --git a/test/splitobs.jl b/test/splitobs.jl index a5238e4..04a9e62 100644 --- a/test/splitobs.jl +++ b/test/splitobs.jl @@ -99,4 +99,11 @@ end d1, d2 = splitobs(data, at=0.25, stratified=data.b) @test d1.b == [0,1,1] @test d2.b == [0,0,0,1,1,1,1] + + d1, d2 = splitobs(data, at=0., stratified=data.b) + @test d1.b == [] + @test d2.b == [0,0,0,0,1,1,1,1,1,1] + d1, d2 = splitobs(data, at=1., stratified=data.b) + @test d1.b == [0,0,0,0,1,1,1,1,1,1] + @test d2.b == [] end