Skip to content

Commit 5f7484b

Browse files
committed
tmp
1 parent 7e7a73c commit 5f7484b

File tree

3 files changed

+17
-8
lines changed

3 files changed

+17
-8
lines changed

pandas/core/generic.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5816,7 +5816,7 @@ def sample(
58165816
Missing values in the weights column will be treated as zero.
58175817
Infinite values not allowed.
58185818
When ``replace=False``, weights for which ``(n * max(weights) / sum(weights)) > 1`` are not allowed,
5819-
in order to avoid biased results.
5819+
in order to avoid biased results. See the Notes below for more details.
58205820
random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
58215821
If int, array-like, or BitGenerator, seed for random number generator.
58225822
If np.random.RandomState or np.random.Generator, use as given.
@@ -5853,6 +5853,10 @@ def sample(
58535853
-----
58545854
If `frac` > 1, `replace` should be set to `True`.
58555855
5856+
When ``replace=False``, weights for which ``(n * max(weights) / sum(weights)) > 1`` are not allowed,
5857+
since that would cause results to be biased. E.g. sampling 2 items without replacement,
5858+
with weights [100, 1, 1] would yield each of the last two items in about 1/2 of cases, instead of 1/102.
5859+
58565860
Examples
58575861
--------
58585862
>>> df = pd.DataFrame(

pandas/core/sample.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,12 +150,12 @@ def sample(
150150
else:
151151
raise ValueError("Invalid weights: weights sum to zero")
152152

153-
if weights is not None:
154-
is_max_weight_dominating = size * weights.max() > 1
155-
if is_max_weight_dominating and not replace:
153+
assert weights is not None # for mypy
154+
if not replace and size * weights.max() > 1:
156155
raise ValueError(
157-
"Invalid weights: If `replace`=False, "
158-
"total unit probabilities have to be less than 1"
156+
"Weighted sampling cannot be achieved with replace=False. Either "
157+
"set replace=True or use smaller weights. See the docstring of "
158+
"sample for details."
159159
)
160160

161161
return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(

pandas/tests/frame/methods/test_sample.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,18 +139,23 @@ def test_sample_unit_probabilities_raises(self, obj):
139139
high_variance_weights = [1] * 10
140140
high_variance_weights[0] = 100
141141
msg = (
142-
"Invalid weights: If `replace`=False, "
143-
"total unit probabilities have to be less than 1"
142+
"Weighted sampling cannot be achieved with replace=False. Either "
143+
"set replace=True or use smaller weights. See the docstring of "
144+
"sample for details."
144145
)
145146
with pytest.raises(ValueError, match=msg):
146147
obj.sample(n=2, weights=high_variance_weights, replace=False)
147148

149+
def test_sample_unit_probabilities_edge_case_do_not_raise(self, obj):
150+
# GH#61516
148151
# edge case, n*max(weights)/sum(weights) == 1
149152
edge_variance_weights = [1] * 10
150153
edge_variance_weights[0] = 9
151154
# should not raise
152155
obj.sample(n=2, weights=edge_variance_weights, replace=False)
153156

157+
def test_sample_unit_normal_probabilities_do_not_raise(self, obj):
158+
# GH#61516
154159
low_variance_weights = [1] * 10
155160
low_variance_weights[0] = 8
156161
# should not raise

0 commit comments

Comments
 (0)