From d0241cb0ffaa8a3eb1db5c039e52a7b2c4cddc74 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sun, 2 Feb 2025 12:11:33 +0100
Subject: [PATCH 1/4] stratobs

---
 src/obstransform.jl |  6 +--
 src/resample.jl     | 95 +++++++++++++++++++++++++++++++++++++++++++++
 src/splitobs.jl     | 50 ++++++++++++++++++------
 src/utils.jl        |  4 +-
 4 files changed, 137 insertions(+), 18 deletions(-)

diff --git a/src/obstransform.jl b/src/obstransform.jl
index e5c6168..e2aefbb 100644
--- a/src/obstransform.jl
+++ b/src/obstransform.jl
@@ -208,11 +208,7 @@ accomplish that, which means that the return value is likely of a
 different type than `data`.
 
 Optionally, a random number generator `rng` can be passed as the
-first argument.
-
-The optional parameter `rng` allows one to specify the
-random number generator used for shuffling. This is useful when
-reproducible results are desired.
+first argument. 
 
 For this function to work, the type of `data` must implement
 [`numobs`](@ref) and [`getobs`](@ref). 
diff --git a/src/resample.jl b/src/resample.jl
index ce93f8e..97a4cbf 100644
--- a/src/resample.jl
+++ b/src/resample.jl
@@ -211,3 +211,98 @@ function undersample(rng::AbstractRNG, data::Tuple; kws...)
     d, c = undersample(rng, data[1:end-1], data[end]; kws...)
     return (d..., c)
 end
+
+
+"""
+    stratifiedobs([rng], data, p; [shuffle = true]) -> Tuple
+
+Partition the dataset `data` into multiple disjoint subsets 
+with size proportional to the value(s) of `p`. 
+The observations are assignmed to a data subset using stratified sampling without replacement. 
+
+If `p` is a float between 0 and 1, then the return value
+will be a tuple with two subsests in which the
+first element contains the fraction of observations specified by
+`p` and the second element contains the rest. In the following
+code the first subset `train` will contain around 70% of the
+observations and the second subset `test` the rest. The key
+difference to [`splitobs`](@ref) is that the class distribution
+in `y` will actively be preserved in `train` and `test`.
+
+```julia
+train_data, test_data = stratifiedobs(data, p = 0.7)
+```
+
+If `p` is a tuple of floats between 0 and 1, then additional subsets will be
+created. In this example `train` will contain about 50% of the
+observations, `val` will contain around 30%, and `test` the
+remaining 20%.
+
+```julia
+train_data, val_data, test_data = stratifiedobs(y, p = (0.5, 0.3))
+```
+
+It is also possible to call `stratifiedobs` with multiple data
+arguments as tuple, which all must have the same number of total
+observations. Note that if `data` is a tuple, then it will be
+assumed that the last element of the tuple contains the targets.
+
+```julia
+(X_train, y_train), (X_test, y_test) = stratifiedobs((X, y), p = 0.7)
+```
+
+The optional parameter `shuffle` determines if the resulting data
+subsets should be shuffled. If `false`, then the observations in
+the subsets will be grouped together according to their labels.
+
+```julia
+julia> y = ["a", "b", "b", "b", "b", "a"] # 2 imbalanced classes
+6-element Array{String,1}:
+ "a"
+ "b"
+ "b"
+ "b"
+ "b"
+ "a"
+
+julia> train, test = stratifiedobs(y, p = 0.5, shuffle = false)
+(String["b","b","a"],String["b","b","a"])
+```
+
+The optional argument `rng` allows one to specify the
+random number generator used for shuffling. 
+
+For this function to work, the type of `data` must implement
+[`numobs`](@ref) and [`getobs`](@ref). 
+
+See also [`undersample`](@ref), [`oversample`](@ref), and [`splitobs`](@ref).
+"""
+function stratifiedobs(data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG)
+    stratifiedobs(identity, data, p, shuffle, convert(ObsDimension, obsdim), rng)
+end
+
+function stratifiedobs(f, data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG)
+    stratifiedobs(f, data, p, shuffle, convert(ObsDimension, obsdim), rng)
+end
+
+function stratifiedobs(data, p::AbstractFloat, args...)
+    stratifiedobs(identity, data, p, args...)
+end
+
+function stratifiedobs(data, p::NTuple{N,AbstractFloat}, args...) where N
+    stratifiedobs(identity, data, p, args...)
+end
+
+function stratifiedobs(rng, data, p::Union{NTuple,AbstractFloat}, stratified::AbstractVector)
+    # The given data is always shuffled to qualify as performing
+    # stratified sampling without replacement.
+    idxs_groups = group_indices(stratified)
+    idxs_splits = ntuple(i -> Int[], length(p)+1)
+    for (lbl, idxs) in idxs_groups
+        new_idxs_splits = splitobs(rng, idxs, at=p)
+        for i in 1:length(idxs_splits)
+            append!(idxs_splits[i], new_idxs_splits[i])
+        end
+    end
+    return map(idx -> obsview(data, idx), idxs_splits)
+end
diff --git a/src/splitobs.jl b/src/splitobs.jl
index 7061d43..1d3a9a2 100644
--- a/src/splitobs.jl
+++ b/src/splitobs.jl
@@ -21,9 +21,12 @@ _splitobs(n::Int, at::NTuple{N, <:Integer}) where {N} = _splitobs(n::Int, at ./
 
 _splitobs(n::Int, at::Tuple{}) = (1:n,)
 
+
 function _splitobs(n::Int, at::AbstractFloat)
     0 <= at <= 1 || throw(ArgumentError("the parameter \"at\" must be in interval (0, 1)"))
-    n1 = clamp(round(Int, at*n), 0, n)
+    n1 = floor(Int, n * at)
+    delta = n*at - n1
+    # TODO add random rounding
     (1:n1, n1+1:n)
 end
 
@@ -38,21 +41,22 @@ function _splitobs(n::Int, at::NTuple{N,<:AbstractFloat}) where N
 end
 
 """
-    splitobs([rng], data; at, shuffle=false) -> Tuple
+    splitobs([rng], data; at, shuffle=false, stratified=nothing) -> Tuple
 
 Partition the `data` into two or more subsets.
 
-When `at` is a number between 0 and 1, this specifies the proportion in the first subset.
-
-When `at` is an integer, it specifies the number of observations in the first subset.
-
-When `at` is a tuple, entries specifies the number or proportion in each subset, except
+The argument `at` specifies how to split the data:
+- When `at` is a number between 0 and 1, this specifies the proportion in the first subset.
+- When `at` is an integer, it specifies the number of observations in the first subset.
+- When `at` is a tuple, its entries specify the number or proportion in each subset, except
 for the last which will contain the remaning observations. 
 The number of returned subsets is `length(at)+1`.
 
 If `shuffle=true`, randomly permute the observations before splitting.
 A random number generator `rng` can be optionally passed as the first argument.
 
+If `stratified` is not `nothing`, it should be an array of labels with the same length as the data.
+The observations will be split in a way that the proportion of each label is preserved in each subset.
 
 Supports any datatype implementing [`numobs`](@ref). 
 
@@ -78,10 +82,34 @@ true
 """
 splitobs(data; kws...) = splitobs(Random.default_rng(), data; kws...)
 
-function splitobs(rng::AbstractRNG, data; at, shuffle::Bool=false)
+function splitobs(rng::AbstractRNG, data; at,
+        shuffle::Bool=false,
+        stratified::Union{Nothing,AbstractVector}=nothing)
+    n = numobs(data)
+    at = _normalize_at(n, at) # integer counts -> fractions of n, so they can be reused per label group
     if shuffle
-        data = shuffleobs(rng, data)
+        perm = randperm(rng, n)
+        data = obsview(data, perm) # same as shuffleobs(rng, data), but make it explicit to keep perm
     end
-    n = numobs(data)
-    return map(idx -> obsview(data, idx), splitobs(n; at))
+    if stratified !== nothing
+        length(stratified) == n || throw(ArgumentError(
+            "`stratified` must hold one label per observation ($n), got $(length(stratified))"))
+        if shuffle
+            stratified = stratified[perm] # keep labels aligned with the permuted data
+        end
+        idxs_groups = group_indices(stratified)
+        idxs_splits = ntuple(_ -> Int[], length(at)+1)
+        # Split each label group with the same proportions, then merge the pieces per subset.
+        for (_, idxs) in idxs_groups
+            new_idxs_splits = splitobs(idxs; at, shuffle=false)
+            foreach(append!, idxs_splits, new_idxs_splits)
+        end
+    else
+        idxs_splits = splitobs(n; at)
+    end
+    return map(idxs -> obsview(data, idxs), idxs_splits)
 end
+
+_normalize_at(n, at::Integer) = at / n
+_normalize_at(n, at::NTuple{N, <:Integer}) where N = at ./ n
+_normalize_at(n, at) = at
\ No newline at end of file
diff --git a/src/utils.jl b/src/utils.jl
index b928135..82ca407 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -380,7 +380,7 @@ function batch(xs::Vector{<:NamedTuple})
     all_keys = [sort(collect(keys(x))) for x in xs]
     ks = all_keys[1]
     @assert all(==(ks), all_keys) "Cannot batch named tuples with different keys"
-    NamedTuple(k => batch([x[k] for x in xs]) for k in ks)
+    return NamedTuple(k => batch([x[k] for x in xs]) for k in ks)
 end
 
 function batch(xs::Vector{<:Dict})
@@ -388,7 +388,7 @@ function batch(xs::Vector{<:Dict})
     all_keys = [sort(collect(keys(x))) for x in xs]
     ks = all_keys[1]
     @assert all(==(ks), all_keys) "cannot batch dicts with different keys"
-    Dict(k => batch([x[k] for x in xs]) for k in ks)
+    return Dict(k => batch([x[k] for x in xs]) for k in ks)
 end
 
 """

From ca8e10a388eeab748cc500a12daaf106b17245aa Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sun, 2 Feb 2025 12:12:41 +0100
Subject: [PATCH 2/4] cleanup

---
 src/resample.jl | 95 -------------------------------------------------
 1 file changed, 95 deletions(-)

diff --git a/src/resample.jl b/src/resample.jl
index 97a4cbf..ce93f8e 100644
--- a/src/resample.jl
+++ b/src/resample.jl
@@ -211,98 +211,3 @@ function undersample(rng::AbstractRNG, data::Tuple; kws...)
     d, c = undersample(rng, data[1:end-1], data[end]; kws...)
     return (d..., c)
 end
-
-
-"""
-    stratifiedobs([rng], data, p; [shuffle = true]) -> Tuple
-
-Partition the dataset `data` into multiple disjoint subsets 
-with size proportional to the value(s) of `p`. 
-The observations are assignmed to a data subset using stratified sampling without replacement. 
-
-If `p` is a float between 0 and 1, then the return value
-will be a tuple with two subsests in which the
-first element contains the fraction of observations specified by
-`p` and the second element contains the rest. In the following
-code the first subset `train` will contain around 70% of the
-observations and the second subset `test` the rest. The key
-difference to [`splitobs`](@ref) is that the class distribution
-in `y` will actively be preserved in `train` and `test`.
-
-```julia
-train_data, test_data = stratifiedobs(data, p = 0.7)
-```
-
-If `p` is a tuple of floats between 0 and 1, then additional subsets will be
-created. In this example `train` will contain about 50% of the
-observations, `val` will contain around 30%, and `test` the
-remaining 20%.
-
-```julia
-train_data, val_data, test_data = stratifiedobs(y, p = (0.5, 0.3))
-```
-
-It is also possible to call `stratifiedobs` with multiple data
-arguments as tuple, which all must have the same number of total
-observations. Note that if `data` is a tuple, then it will be
-assumed that the last element of the tuple contains the targets.
-
-```julia
-(X_train, y_train), (X_test, y_test) = stratifiedobs((X, y), p = 0.7)
-```
-
-The optional parameter `shuffle` determines if the resulting data
-subsets should be shuffled. If `false`, then the observations in
-the subsets will be grouped together according to their labels.
-
-```julia
-julia> y = ["a", "b", "b", "b", "b", "a"] # 2 imbalanced classes
-6-element Array{String,1}:
- "a"
- "b"
- "b"
- "b"
- "b"
- "a"
-
-julia> train, test = stratifiedobs(y, p = 0.5, shuffle = false)
-(String["b","b","a"],String["b","b","a"])
-```
-
-The optional argument `rng` allows one to specify the
-random number generator used for shuffling. 
-
-For this function to work, the type of `data` must implement
-[`numobs`](@ref) and [`getobs`](@ref). 
-
-See also [`undersample`](@ref), [`oversample`](@ref), and [`splitobs`](@ref).
-"""
-function stratifiedobs(data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG)
-    stratifiedobs(identity, data, p, shuffle, convert(ObsDimension, obsdim), rng)
-end
-
-function stratifiedobs(f, data; p = 0.7, shuffle = true, obsdim = default_obsdim(data), rng = Random.GLOBAL_RNG)
-    stratifiedobs(f, data, p, shuffle, convert(ObsDimension, obsdim), rng)
-end
-
-function stratifiedobs(data, p::AbstractFloat, args...)
-    stratifiedobs(identity, data, p, args...)
-end
-
-function stratifiedobs(data, p::NTuple{N,AbstractFloat}, args...) where N
-    stratifiedobs(identity, data, p, args...)
-end
-
-function stratifiedobs(rng, data, p::Union{NTuple,AbstractFloat}, stratified::AbstractVector)
-    # The given data is always shuffled to qualify as performing
-    # stratified sampling without replacement.
-    idxs_groups = group_indices(stratified)
-    idxs_splits = ntuple(i -> Int[], length(p)+1)
-    for (lbl, idxs) in idxs_groups
-        new_idxs_splits = splitobs(rng, idxs, at=p)
-        for i in 1:length(idxs_splits)
-            append!(idxs_splits[i], new_idxs_splits[i])
-        end
-    end
-    return map(idx -> obsview(data, idx), idxs_splits)
-end

From 3aa4372def291942a06b471f181c49df9419f1ac Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sun, 2 Feb 2025 18:09:51 +0100
Subject: [PATCH 3/4] cleanup

---
 src/splitobs.jl  | 13 +++++--------
 test/splitobs.jl | 10 ++++++++++
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/splitobs.jl b/src/splitobs.jl
index 1d3a9a2..e33dde7 100644
--- a/src/splitobs.jl
+++ b/src/splitobs.jl
@@ -2,7 +2,7 @@
     splitobs(n::Int; at) -> Tuple
 
 Compute the indices for two or more disjoint subsets of
-the range `1:n` with splits given by `at`.
+the range `1:n` with split sizes determined by `at`.
 
 # Examples
 
@@ -18,16 +18,12 @@ splitobs(n::Int; at) = _splitobs(n, at)
 
 _splitobs(n::Int, at::Integer) = _splitobs(n::Int, at / n) 
 _splitobs(n::Int, at::NTuple{N, <:Integer}) where {N} = _splitobs(n::Int, at ./ n) 
-
 _splitobs(n::Int, at::Tuple{}) = (1:n,)
 
-
 function _splitobs(n::Int, at::AbstractFloat)
     0 <= at <= 1 || throw(ArgumentError("the parameter \"at\" must be in interval (0, 1)"))
-    n1 = floor(Int, n * at)
-    delta = n*at - n1
-    # TODO add random rounding
-    (1:n1, n1+1:n)
+    n1 = round(Int, n * at)
+    return (1:n1, n1+1:n)
 end
 
 function _splitobs(n::Int, at::NTuple{N,<:AbstractFloat}) where N
@@ -40,8 +36,9 @@ function _splitobs(n::Int, at::NTuple{N,<:AbstractFloat}) where N
     return (a, rest...)
 end
 
+
 """
-    splitobs([rng], data; at, shuffle=false, stratified=nothing) -> Tuple
+    splitobs([rng,] data; at, shuffle=false, stratified=nothing) -> Tuple
 
 Partition the `data` into two or more subsets.
 
diff --git a/test/splitobs.jl b/test/splitobs.jl
index f5ce335..a5238e4 100644
--- a/test/splitobs.jl
+++ b/test/splitobs.jl
@@ -90,3 +90,13 @@ end
     p2, _ = splitobs(rng, data, at=3, shuffle=true)
     @test p1 == p2
 end
+
+@testset "stratified" begin
+    data = (a=zeros(Float32, 2, 10), b=[0,0,0,0,1,1,1,1,1,1])
+    d1, d2 = splitobs(data, at=0.5, stratified=data.b)
+    @test d1.b == [0,0,1,1,1]
+    @test d2.b == [0,0,1,1,1]
+    d1, d2 = splitobs(data, at=0.25, stratified=data.b)
+    @test d1.b == [0,1,1]
+    @test d2.b == [0,0,0,1,1,1,1]
+end

From 7f4c1a4279420e471105cba5d9355ece3857f851 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sun, 2 Feb 2025 18:14:16 +0100
Subject: [PATCH 4/4] stratified

---
 src/splitobs.jl  | 5 ++++-
 test/splitobs.jl | 7 +++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/splitobs.jl b/src/splitobs.jl
index e33dde7..9ea6e15 100644
--- a/src/splitobs.jl
+++ b/src/splitobs.jl
@@ -53,7 +53,7 @@ If `shuffle=true`, randomly permute the observations before splitting.
 A random number generator `rng` can be optionally passed as the first argument.
 
 If `stratified` is not `nothing`, it should be an array of labels with the same length as the data.
-The observations will be split in a way that the proportion of each label is preserved in each subset.
+The observations will be split in such a way that the proportion of each label is preserved in each subset.
 
 Supports any datatype implementing [`numobs`](@ref). 
 
@@ -75,6 +75,9 @@ julia> train, test = splitobs((reshape(1.0:100.0, 1, :), 101:200), at=0.7, shuff
 
 julia> vec(test[1]) .+ 100 == test[2]
 true
+
+julia> splitobs(1:10, at=0.5, stratified=[0,0,0,0,1,1,1,1,1,1]) # 2 zeros and 3 ones in each subset
+([1, 2, 5, 6, 7], [3, 4, 8, 9, 10])
 ```
 """
 splitobs(data; kws...) = splitobs(Random.default_rng(), data; kws...)
diff --git a/test/splitobs.jl b/test/splitobs.jl
index a5238e4..04a9e62 100644
--- a/test/splitobs.jl
+++ b/test/splitobs.jl
@@ -99,4 +99,11 @@ end
     d1, d2 = splitobs(data, at=0.25, stratified=data.b)
     @test d1.b == [0,1,1]
     @test d2.b == [0,0,0,1,1,1,1]
+
+    d1, d2 = splitobs(data, at=0., stratified=data.b)
+    @test d1.b == []
+    @test d2.b == [0,0,0,0,1,1,1,1,1,1]
+    d1, d2 = splitobs(data, at=1., stratified=data.b)
+    @test d1.b == [0,0,0,0,1,1,1,1,1,1]
+    @test d2.b == []
 end