Skip to content

Commit fbb78f4

Browse files
Merge pull request #446 from apache/tdigest_pmf_and_cdf
implemented get_PMF() and get_CDF()
2 parents 8b86cf1 + e473c11 commit fbb78f4

File tree

4 files changed

+103
-10
lines changed

4 files changed

+103
-10
lines changed

common/include/quantiles_sorted_view_impl.hpp

+5-7
Original file line numberDiff line numberDiff line change
@@ -86,19 +86,17 @@ auto quantiles_sorted_view<T, C, A>::get_quantile(double rank, bool inclusive) c
8686
template<typename T, typename C, typename A>
8787
auto quantiles_sorted_view<T, C, A>::get_CDF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
8888
if (entries_.empty()) throw std::runtime_error("operation is undefined for an empty sketch");
89-
vector_double buckets(entries_.get_allocator());
90-
if (entries_.size() == 0) return buckets;
9189
check_split_points(split_points, size);
92-
buckets.reserve(size + 1);
93-
for (uint32_t i = 0; i < size; ++i) buckets.push_back(get_rank(split_points[i], inclusive));
94-
buckets.push_back(1);
95-
return buckets;
90+
vector_double ranks(entries_.get_allocator());
91+
ranks.reserve(size + 1);
92+
for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i], inclusive));
93+
ranks.push_back(1);
94+
return ranks;
9695
}
9796

9897
template<typename T, typename C, typename A>
9998
auto quantiles_sorted_view<T, C, A>::get_PMF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
10099
auto buckets = get_CDF(split_points, size, inclusive);
101-
if (buckets.size() == 0) return buckets;
102100
for (uint32_t i = size; i > 0; --i) {
103101
buckets[i] -= buckets[i - 1];
104102
}

tdigest/include/tdigest.hpp

+50
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ class tdigest {
8989
using vector_t = std::vector<T, Allocator>;
9090
using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
9191
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
92+
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
9293

9394
struct centroid_cmp {
9495
centroid_cmp() {}
@@ -142,20 +143,67 @@ class tdigest {
142143
*/
143144
uint64_t get_total_weight() const;
144145

146+
/**
147+
* Returns an instance of the allocator for this t-Digest.
148+
* @return allocator
149+
*/
150+
Allocator get_allocator() const;
151+
145152
/**
146153
* Compute approximate normalized rank of the given value.
154+
*
155+
* <p>If the sketch is empty this throws std::runtime_error.
156+
*
147157
* @param value to be ranked
148158
* @return normalized rank (from 0 to 1 inclusive)
149159
*/
150160
double get_rank(T value) const;
151161

152162
/**
153163
* Compute approximate quantile value corresponding to the given normalized rank
164+
*
165+
* <p>If the sketch is empty this throws std::runtime_error.
166+
*
154167
* @param rank normalized rank (from 0 to 1 inclusive)
155168
* @return quantile value corresponding to the given rank
156169
*/
157170
T get_quantile(double rank) const;
158171

172+
/**
173+
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
174+
* given a set of split points.
175+
*
176+
* <p>If the sketch is empty this throws std::runtime_error.
177+
*
178+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
179+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
180+
*
181+
* @param size the number of split points in the array
182+
*
183+
* @return an array of m+1 doubles each of which is an approximation
184+
* to the fraction of the input stream values (the mass) that fall into one of those intervals.
185+
*/
186+
vector_double get_PMF(const T* split_points, uint32_t size) const;
187+
188+
/**
189+
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
190+
* cumulative analog of the PMF, of the input stream given a set of split points.
191+
*
192+
* <p>If the sketch is empty this throws std::runtime_error.
193+
*
194+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
195+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
196+
*
197+
* @param size the number of split points in the array
198+
*
199+
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
200+
* of the input stream given the split_points. The value at array position j of the returned
201+
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
202+
* array. This can be viewed as array of ranks of the given split points plus one more value
203+
* that is always 1.
204+
*/
205+
vector_double get_CDF(const T* split_points, uint32_t size) const;
206+
159207
/**
160208
* @return parameter k (compression) that was used to configure this t-Digest
161209
*/
@@ -245,6 +293,8 @@ class tdigest {
245293
// for compatibility with format of the reference implementation
246294
static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
247295
static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
296+
297+
static inline void check_split_points(const T* values, uint32_t size);
248298
};
249299

250300
} /* namespace datasketches */

tdigest/include/tdigest_impl.hpp

+36
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const {
8585
return centroids_weight_ + buffer_.size();
8686
}
8787

88+
template<typename T, typename A>
89+
A tdigest<T, A>::get_allocator() const {
90+
return buffer_.get_allocator();
91+
}
92+
8893
template<typename T, typename A>
8994
double tdigest<T, A>::get_rank(T value) const {
9095
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
@@ -191,6 +196,25 @@ T tdigest<T, A>::get_quantile(double rank) const {
191196
return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
192197
}
193198

199+
template<typename T, typename A>
200+
auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
201+
auto buckets = get_CDF(split_points, size);
202+
for (uint32_t i = size; i > 0; --i) {
203+
buckets[i] -= buckets[i - 1];
204+
}
205+
return buckets;
206+
}
207+
208+
template<typename T, typename A>
209+
auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
210+
check_split_points(split_points, size);
211+
vector_double ranks(get_allocator());
212+
ranks.reserve(size + 1);
213+
for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i]));
214+
ranks.push_back(1);
215+
return ranks;
216+
}
217+
194218
template<typename T, typename A>
195219
uint16_t tdigest<T, A>::get_k() const {
196220
return k_;
@@ -591,6 +615,18 @@ buffer_(std::move(buffer))
591615
buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
592616
}
593617

618+
template<typename T, typename A>
619+
void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
620+
for (uint32_t i = 0; i < size ; i++) {
621+
if (std::isnan(values[i])) {
622+
throw std::invalid_argument("Values must not be NaN");
623+
}
624+
if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
625+
throw std::invalid_argument("Values must be unique and monotonically increasing");
626+
}
627+
}
628+
}
629+
594630
} /* namespace datasketches */
595631

596632
#endif // _TDIGEST_IMPL_HPP_

tdigest/test/tdigest_test.cpp

+12-3
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ TEST_CASE("empty", "[tdigest]") {
3535
REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
3636
REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
3737
REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
38+
const double split_points[1] {0};
39+
REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
40+
REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
3841
}
3942

4043
TEST_CASE("one value", "[tdigest]") {
@@ -56,9 +59,6 @@ TEST_CASE("many values", "[tdigest]") {
5659
const size_t n = 10000;
5760
tdigest_double td;
5861
for (size_t i = 0; i < n; ++i) td.update(i);
59-
// std::cout << td.to_string(true);
60-
// td.compress();
61-
// std::cout << td.to_string(true);
6262
REQUIRE_FALSE(td.is_empty());
6363
REQUIRE(td.get_total_weight() == n);
6464
REQUIRE(td.get_min_value() == 0);
@@ -73,6 +73,15 @@ TEST_CASE("many values", "[tdigest]") {
7373
REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
7474
REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
7575
REQUIRE(td.get_quantile(1) == n - 1);
76+
const double split_points[1] {n / 2};
77+
const auto pmf = td.get_PMF(split_points, 1);
78+
REQUIRE(pmf.size() == 2);
79+
REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
80+
REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
81+
const auto cdf = td.get_CDF(split_points, 1);
82+
REQUIRE(cdf.size() == 2);
83+
REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
84+
REQUIRE(cdf[1] == 1);
7685
}
7786

7887
TEST_CASE("rank - two values", "[tdigest]") {

0 commit comments

Comments
 (0)