#include <kll_sketch.hpp>

Classes
class	const_iterator

class	items_deleter

Public Types
using	value_type = T

using	comparator = C

using	allocator_type = A

using	vector_u32 = std::vector< uint32_t, typename std::allocator_traits< A >::template rebind_alloc< uint32_t > >

using	vector_double = typename quantiles_sorted_view< T, C, A >::vector_double

using	quantile_return_type = typename quantiles_sorted_view< T, C, A >::quantile_return_type

using	vector_bytes = std::vector< uint8_t, typename std::allocator_traits< A >::template rebind_alloc< uint8_t > >

Public Member Functions
	kll_sketch (uint16_t k=kll_constants::DEFAULT_K, const C &comparator=C(), const A &allocator=A())

	kll_sketch (const kll_sketch &other)

	kll_sketch (kll_sketch &&other) noexcept

kll_sketch &	operator= (const kll_sketch &other)

kll_sketch &	operator= (kll_sketch &&other)

template<typename TT , typename CC , typename AA >
	kll_sketch (const kll_sketch< TT, CC, AA > &other, const C &comparator=C(), const A &allocator=A())

template<typename FwdT >
void	update (FwdT &&item)

template<typename FwdSk >
void	merge (FwdSk &&other)

bool	is_empty () const

uint16_t	get_k () const

uint64_t	get_n () const

uint32_t	get_num_retained () const

bool	is_estimation_mode () const

T	get_min_item () const

T	get_max_item () const

C	get_comparator () const

A	get_allocator () const

quantile_return_type	get_quantile (double rank, bool inclusive=true) const

double	get_rank (const T &item, bool inclusive=true) const

vector_double	get_PMF (const T *split_points, uint32_t size, bool inclusive=true) const

vector_double	get_CDF (const T *split_points, uint32_t size, bool inclusive=true) const

double	get_normalized_rank_error (bool pmf) const

template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if< std::is_arithmetic< TT >::value, int >::type = 0>
size_t	get_serialized_size_bytes (const SerDe &sd=SerDe()) const

template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic< TT >::value, int >::type = 0>
size_t	get_serialized_size_bytes (const SerDe &sd=SerDe()) const

template<typename SerDe = serde<T>>
void	serialize (std::ostream &os, const SerDe &sd=SerDe()) const

template<typename SerDe = serde<T>>
vector_bytes	serialize (unsigned header_size_bytes=0, const SerDe &sd=SerDe()) const

string< A >	to_string (bool print_levels=false, bool print_items=false) const

const_iterator	begin () const

const_iterator	end () const

quantiles_sorted_view< T, C, A >	get_sorted_view () const

template<typename SerDe >
auto	serialize (unsigned header_size_bytes, const SerDe &sd) const -> vector_bytes

template<typename SerDe >
kll_sketch< T, C, A >	deserialize (std::istream &is, const SerDe &sd, const C &comparator, const A &allocator)

template<typename SerDe >
kll_sketch< T, C, A >	deserialize (const void *bytes, size_t size, const SerDe &sd, const C &comparator, const A &allocator)

Static Public Member Functions
template<typename TT = T, typename std::enable_if< std::is_arithmetic< TT >::value, int >::type = 0>
static size_t	get_max_serialized_size_bytes (uint16_t k, uint64_t n)

template<typename TT = T, typename std::enable_if<!std::is_arithmetic< TT >::value, int >::type = 0>
static size_t	get_max_serialized_size_bytes (uint16_t k, uint64_t n, size_t max_item_size_bytes)

template<typename SerDe = serde<T>>
static kll_sketch	deserialize (std::istream &is, const SerDe &sd=SerDe(), const C &comparator=C(), const A &allocator=A())

template<typename SerDe = serde<T>>
static kll_sketch	deserialize (const void *bytes, size_t size, const SerDe &sd=SerDe(), const C &comparator=C(), const A &allocator=A())

static double	get_normalized_rank_error (uint16_t k, bool pmf)

Detailed Description

template<typename T, typename C = std::less<T>, typename A = std::allocator<T>>
class datasketches::kll_sketch< T, C, A >

Implementation of a very compact quantiles sketch with lazy compaction scheme and nearly optimal accuracy per retained item. See Optimal Quantile Approximation in Streams.

This is a stochastic streaming sketch that enables near real-time analysis of the approximate distribution of items from a very large stream in a single pass, requiring only that the items are comparable. The analysis is obtained using get_quantile() function or the inverse functions get_rank(), get_PMF() (Probability Mass Function), and get_CDF() (Cumulative Distribution Function).

As of May 2020, this implementation produces serialized sketches which are binary-compatible with the equivalent Java implementation only when template parameter T = float (32-bit single precision values).

Given an input stream of N items, the natural rank of any specific item is defined as its index (1 to N) in inclusive mode or (0 to N-1) in exclusive mode in the hypothetical sorted stream of all N input items.

The normalized rank (rank) of any specific item is defined as its natural rank divided by N. Thus, the normalized rank is between zero and one. In the documentation for this sketch natural rank is never used so any reference to just rank should be interpreted to mean normalized rank.

This sketch is configured with a parameter k, which affects the size of the sketch and its estimation error.

The estimation error is commonly called epsilon (or eps) and is a fraction between zero and one. Larger values of k result in smaller values of epsilon. Epsilon is always with respect to the rank and cannot be applied to the corresponding items.

The relationship between the normalized rank and the corresponding items can be viewed as a two dimensional monotonic plot with the normalized rank on one axis and the corresponding items on the other axis. If the y-axis is specified as the item-axis and the x-axis as the normalized rank, then y = get_quantile(x) is a monotonically increasing function.

The function get_quantile(rank) translates ranks into corresponding quantiles. The functions get_rank(item), get_CDF(...) (Cumulative Distribution Function), and get_PMF(...) (Probability Mass Function) perform the opposite operation and translate items into ranks.

The getPMF(...) function has about 13 to 47% worse rank error (depending on k) than the other queries because the mass of each "bin" of the PMF has "double-sided" error from the upper and lower edges of the bin as a result of a subtraction, as the errors from the two edges can sometimes add.

The default k of 200 yields a "single-sided" epsilon of about 1.33% and a "double-sided" (PMF) epsilon of about 1.65%.

A get_quantile(rank) query has the following guarantees:

Let q = get_quantile(r) where r is the rank between zero and one.
The quantile q will be an item from the input stream.
Let trueRank be the true rank of q derived from the hypothetical sorted stream of all N items.
Let eps = get_normalized_rank_error(false).
Then r - eps ≤ trueRank ≤ r + eps with a confidence of 99%. Note that the error is on the rank, not the quantile.

A get_rank(item) query has the following guarantees:

Let r = get_rank(i) where i is an item between the min and max items of the input stream.
Let true_rank be the true rank of i derived from the hypothetical sorted stream of all N items.
Let eps = get_normalized_rank_error(false).
Then r - eps ≤ trueRank ≤ r + eps with a confidence of 99%.

A get_PMF() query has the following guarantees:

Let {r1, r2, ..., r(m+1)} = get_PMF(s1, s2, ..., sm) where s1, s2 are split points (items from the input domain) between the min and max items of the input stream.
Let mass_i = estimated mass between s_i and s_i+1.
Let trueMass be the true mass between the items of s_i, s_i+1 derived from the hypothetical sorted stream of all N items.
Let eps = get_normalized_rank_error(true).
then mass - eps ≤ trueMass ≤ mass + eps with a confidence of 99%.
r(m+1) includes the mass of all points larger than sm.

A get_CDF(...) query has the following guarantees;

Let {r1, r2, ..., r(m+1)} = get_CDF(s1, s2, ..., sm) where s1, s2, ... are split points (items from the input domain) between the min and max items of the input stream.
Let mass_i = r_i+1 - r_i.
Let trueMass be the true mass between the true ranks of s_i, s_i+1 derived from the hypothetical sorted stream of all N items.
Let eps = get_normalized_rank_error(true).
then mass - eps ≤ trueMass ≤ mass + eps with a confidence of 99%.
1 - r(m+1) includes the mass of all points larger than sm.

From the above, it might seem like we could make some estimates to bound the item returned from a call to get_quantile(). The sketch, however, does not let us derive error bounds or confidences around items. Because errors are independent, we can approximately bracket a value as shown below, but there are no error estimates available. Additionally, the interval may be quite large for certain distributions.

Let q = get_quantile(r), the estimated quantile of rank r.
Let eps = get_normalized_rank_error(false).
Let q_lo = estimated quantile of rank (r - eps).
Let q_hi = estimated quantile of rank (r + eps).
Then q_lo ≤ q ≤ q_hi, with 99% confidence.

author Kevin Lang author Alexander Saydakov author Lee Rhodes

Member Typedef Documentation

◆ quantile_return_type

template<typename T , typename C = std::less<T>, typename A = std::allocator<T>>

using datasketches::kll_sketch< T, C, A >::quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type

Quantile return type. This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)

Constructor & Destructor Documentation

◆ kll_sketch() [1/3]

template<typename T , typename C , typename A >

datasketches::kll_sketch< T, C, A >::kll_sketch	(	uint16_t	k = `kll_constants::DEFAULT_K`,
		const C &	comparator = `C()`,
		const A &	allocator = `A()`
	)

explicit

Constructor

Parameters

k	affects the size of the sketch and its estimation error
comparator	strict weak ordering function (see C++ named requirements: Compare)
allocator	used by this sketch to allocate memory

◆ kll_sketch() [2/3]

template<typename T , typename C , typename A >

datasketches::kll_sketch< T, C, A >::kll_sketch ( const kll_sketch< T, C, A > & other )

Copy constructor

Parameters

other sketch to be copied

◆ kll_sketch() [3/3]

template<typename T , typename C , typename A >

datasketches::kll_sketch< T, C, A >::kll_sketch ( kll_sketch< T, C, A > && other )

noexcept

Move constructor

Parameters

other sketch to be moved

Member Function Documentation

◆ begin()

template<typename T , typename C , typename A >

kll_sketch< T, C, A >::const_iterator datasketches::kll_sketch< T, C, A >::begin

Iterator pointing to the first item in the sketch. If the sketch is empty, the returned iterator must not be dereferenced or incremented.

Returns: iterator pointing to the first item in the sketch

◆ deserialize() [1/2]

template<typename T , typename C = std::less<T>, typename A = std::allocator<T>>

template<typename SerDe = serde<T>>

static kll_sketch datasketches::kll_sketch< T, C, A >::deserialize	(	const void *	bytes,
		size_t	size,
		const SerDe &	sd = `SerDe()`,
		const C &	comparator = `C()`,
		const A &	allocator = `A()`
	)

static

This method deserializes a sketch from a given array of bytes.

Parameters

bytes	pointer to the array of bytes
size	the size of the array
sd	instance of a SerDe
comparator	instance of a Comparator
allocator	instance of an Allocator

Returns: an instance of a sketch

◆ deserialize() [2/2]

template<typename T , typename C = std::less<T>, typename A = std::allocator<T>>

template<typename SerDe = serde<T>>

static kll_sketch datasketches::kll_sketch< T, C, A >::deserialize	(	std::istream &	is,
		const SerDe &	sd = `SerDe()`,
		const C &	comparator = `C()`,
		const A &	allocator = `A()`
	)

static

This method deserializes a sketch from a given stream.

Parameters

is	input stream
sd	instance of a SerDe
comparator	instance of a Comparator
allocator	instance of an Allocator

Returns: an instance of a sketch

◆ end()

template<typename T , typename C , typename A >

kll_sketch< T, C, A >::const_iterator datasketches::kll_sketch< T, C, A >::end

Iterator pointing to the past-the-end item in the sketch. The past-the-end item is the hypothetical item that would follow the last item. It does not point to any item, and must not be dereferenced or incremented.

Returns: iterator pointing to the past-the-end item in the sketch

◆ get_allocator()

template<typename T , typename C , typename A >

A datasketches::kll_sketch< T, C, A >::get_allocator

Returns an instance of the allocator for this sketch.

Returns: allocator

◆ get_CDF()

template<typename T , typename C , typename A >

auto datasketches::kll_sketch< T, C, A >::get_CDF	(	const T *	split_points,
		uint32_t	size,
		bool	inclusive = `true`
	)		const

Returns an approximation to the Cumulative Distribution Function (CDF), which is the cumulative analog of the PMF, of the input stream given a set of split points (items).

The resulting approximations have a probabilistic guarantee that can be obtained from the get_normalized_rank_error(false) function.

If the sketch is empty this throws std::runtime_error.

Parameters

split_points	an array of m unique, monotonically increasing items that divide the input domain into m+1 consecutive disjoint intervals.
size	the number of split points in the array
inclusive	if true the rank of an item includes its own weight, and therefore if the sketch contains items equal to a slit point, then in CDF such items are included into the interval to the left of split point. Otherwise they are included into the interval to the right of split point.

Returns: an array of m+1 doubles, which are a consecutive approximation to the CDF of the input stream given the split_points. The value at array position j of the returned CDF array is the sum of the returned values in positions 0 through j of the returned PMF array. This can be viewed as array of ranks of the given split points plus one more value that is always 1.

◆ get_comparator()

template<typename T , typename C , typename A >

C datasketches::kll_sketch< T, C, A >::get_comparator

Returns an instance of the comparator for this sketch.

Returns: comparator

◆ get_k()

template<typename T , typename C , typename A >

uint16_t datasketches::kll_sketch< T, C, A >::get_k

Returns configured parameter k

Returns: parameter k

◆ get_max_item()

template<typename T , typename C , typename A >

T datasketches::kll_sketch< T, C, A >::get_max_item

Returns the max item of the stream. If the sketch is empty this throws std::runtime_error.

Returns: the max item of the stream

◆ get_max_serialized_size_bytes() [1/2]

template<typename T , typename C , typename A >

template<typename TT , typename std::enable_if< std::is_arithmetic< TT >::value, int >::type >

size_t datasketches::kll_sketch< T, C, A >::get_max_serialized_size_bytes	(	uint16_t	k,
		uint64_t	n
	)

static

Returns upper bound on the serialized size of a sketch given a parameter k and stream length. The resulting size is an overestimate to make sure actual sketches don't exceed it. This method can be used if allocation of storage is necessary beforehand, but it is not optimal. This method is for arithmetic types (integral and floating point)

Parameters

k	parameter that controls size of the sketch and accuracy of estimates
n	stream length

Returns: upper bound on the serialized size

◆ get_max_serialized_size_bytes() [2/2]

template<typename T , typename C , typename A >

template<typename TT , typename std::enable_if<!std::is_arithmetic< TT >::value, int >::type >

size_t datasketches::kll_sketch< T, C, A >::get_max_serialized_size_bytes	(	uint16_t	k,
		uint64_t	n,
		size_t	max_item_size_bytes
	)

static

Returns upper bound on the serialized size of a sketch given a parameter k and stream length. The resulting size is an overestimate to make sure actual sketches don't exceed it. This method can be used if allocation of storage is necessary beforehand, but it is not optimal. This method is for all other non-arithmetic types, and it takes a max size of an item as input.

Parameters

k	parameter that controls size of the sketch and accuracy of estimates
n	stream length
max_item_size_bytes	maximum size of an item in bytes

Returns: upper bound on the serialized size

◆ get_min_item()

template<typename T , typename C , typename A >

T datasketches::kll_sketch< T, C, A >::get_min_item

Returns the min item of the stream. If the sketch is empty this throws std::runtime_error.

Returns: the min item of the stream

◆ get_n()

template<typename T , typename C , typename A >

uint64_t datasketches::kll_sketch< T, C, A >::get_n

Returns the length of the input stream.

Returns: stream length

◆ get_normalized_rank_error()

template<typename T , typename C , typename A >

double datasketches::kll_sketch< T, C, A >::get_normalized_rank_error ( bool pmf ) const

Gets the approximate rank error of this sketch normalized as a fraction between zero and one.

Parameters

pmf	if true, returns the "double-sided" normalized rank error for the get_PMF() function. Otherwise, it is the "single-sided" normalized rank error for all the other queries.

Returns: if pmf is true, returns the normalized rank error for the get_PMF() function. Otherwise, it is the "single-sided" normalized rank error for all the other queries.

◆ get_num_retained()

template<typename T , typename C , typename A >

uint32_t datasketches::kll_sketch< T, C, A >::get_num_retained

Returns the number of retained items (samples) in the sketch.

Returns: the number of retained items

◆ get_PMF()

template<typename T , typename C , typename A >

auto datasketches::kll_sketch< T, C, A >::get_PMF	(	const T *	split_points,
		uint32_t	size,
		bool	inclusive = `true`
	)		const

Returns an approximation to the Probability Mass Function (PMF) of the input stream given a set of split points (items).

The resulting approximations have a probabilistic guarantee that can be obtained from the get_normalized_rank_error(true) function.

If the sketch is empty this throws std::runtime_error.

Parameters

split_points	an array of m unique, monotonically increasing items that divide the input domain into m+1 consecutive disjoint intervals (bins).
size	the number of split points in the array
inclusive	if true the rank of an item includes its own weight, and therefore if the sketch contains items equal to a slit point, then in PMF such items are included into the interval to the left of split point. Otherwise they are included into the interval to the right of split point.

Returns: an array of m+1 doubles each of which is an approximation to the fraction of the input stream items (the mass) that fall into one of those intervals.

◆ get_quantile()

template<typename T , typename C , typename A >

auto datasketches::kll_sketch< T, C, A >::get_quantile	(	double	rank,
		bool	inclusive = `true`
	)		const

Returns an item from the sketch that is the best approximation to an item from the original stream with the given rank.

If the sketch is empty this throws std::runtime_error.

Parameters

rank	of an item in the hypothetical sorted stream.
inclusive	if true, the given rank is considered inclusive (includes weight of an item)

Returns: approximate quantile associated with the given rank

◆ get_rank()

template<typename T , typename C , typename A >

double datasketches::kll_sketch< T, C, A >::get_rank	(	const T &	item,
		bool	inclusive = `true`
	)		const

Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.

The resulting approximation has a probabilistic guarantee that can be obtained from the get_normalized_rank_error(false) function.

If the sketch is empty this throws std::runtime_error.

Parameters

item	to be ranked.
inclusive	if true the weight of the given item is included into the rank. Otherwise the rank equals the sum of the weights of all items that are less than the given item according to the comparator C.

Returns: an approximate rank of the given item

◆ get_serialized_size_bytes() [1/2]

template<typename T , typename C , typename A >

template<typename TT , typename SerDe , typename std::enable_if<!std::is_arithmetic< TT >::value, int >::type >

size_t datasketches::kll_sketch< T, C, A >::get_serialized_size_bytes ( const SerDe & sd = SerDe() ) const

Computes size needed to serialize the current state of the sketch. This version is for fixed-size arithmetic types (integral and floating point).

Parameters

sd	instance of a SerDe

Returns: size in bytes needed to serialize this sketch

◆ get_serialized_size_bytes() [2/2]

template<typename T , typename C = std::less<T>, typename A = std::allocator<T>>

template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic< TT >::value, int >::type = 0>

size_t datasketches::kll_sketch< T, C, A >::get_serialized_size_bytes ( const SerDe & sd = SerDe() ) const

Computes size needed to serialize the current state of the sketch. This version is for all other types and can be expensive since every item needs to be looked at.

Parameters

sd	instance of a SerDe

Returns: size in bytes needed to serialize this sketch

◆ get_sorted_view()

template<typename T , typename C , typename A >

quantiles_sorted_view< T, C, A > datasketches::kll_sketch< T, C, A >::get_sorted_view

Gets the sorted view of this sketch

Returns: the sorted view of this sketch

◆ is_empty()

template<typename T , typename C , typename A >

bool datasketches::kll_sketch< T, C, A >::is_empty

Returns true if this sketch is empty.

Returns: empty flag

◆ is_estimation_mode()

template<typename T , typename C , typename A >

bool datasketches::kll_sketch< T, C, A >::is_estimation_mode

Returns true if this sketch is in estimation mode.

Returns: estimation mode flag

◆ merge()

template<typename T , typename C , typename A >

template<typename FwdSk >

void datasketches::kll_sketch< T, C, A >::merge ( FwdSk && other )

Merges another sketch into this one.

Parameters

other sketch to merge into this one

◆ operator=() [1/2]

template<typename T , typename C , typename A >

kll_sketch< T, C, A > & datasketches::kll_sketch< T, C, A >::operator= ( const kll_sketch< T, C, A > & other )

Copy assignment

Parameters

other sketch to be copied

Returns: reference to this sketch

◆ operator=() [2/2]

template<typename T , typename C , typename A >

kll_sketch< T, C, A > & datasketches::kll_sketch< T, C, A >::operator= ( kll_sketch< T, C, A > && other )

Move assignment

Parameters

other sketch to be moved

Returns: reference to this sketch

◆ serialize() [1/2]

template<typename T , typename C , typename A >

template<typename SerDe >

void datasketches::kll_sketch< T, C, A >::serialize	(	std::ostream &	os,
		const SerDe &	sd = `SerDe()`
	)		const

This method serializes the sketch into a given stream in a binary form

Parameters

os	output stream
sd	instance of a SerDe

◆ serialize() [2/2]

template<typename T , typename C = std::less<T>, typename A = std::allocator<T>>

template<typename SerDe = serde<T>>

vector_bytes datasketches::kll_sketch< T, C, A >::serialize	(	unsigned	header_size_bytes = `0`,
		const SerDe &	sd = `SerDe()`
	)		const

This method serializes the sketch as a vector of bytes. An optional header can be reserved in front of the sketch. It is a blank space of a given size. This header is used in Datasketches PostgreSQL extension.

Parameters

header_size_bytes	space to reserve in front of the sketch
sd	instance of a SerDe

Returns: serialized sketch as a vector of bytes

◆ to_string()

template<typename T , typename C , typename A >

string< A > datasketches::kll_sketch< T, C, A >::to_string	(	bool	print_levels = `false`,
		bool	print_items = `false`
	)		const

Prints a summary of the sketch.

Parameters

print_levels	if true include information about levels
print_items	if true include sketch data

◆ update()

template<typename T , typename C , typename A >

template<typename FwdT >

void datasketches::kll_sketch< T, C, A >::update ( FwdT && item )

Updates this sketch with the given data item.

Parameters

item	from a stream of items

The documentation for this class was generated from the following files:

include/datasketches/kll_sketch.hpp
include/datasketches/kll_sketch_impl.hpp

Classes

Public Types

Public Member Functions

Static Public Member Functions

Detailed Description

template<typename T, typename C = std::less<T>, typename A = std::allocator<T>> class datasketches::kll_sketch< T, C, A >

Member Typedef Documentation

◆ quantile_return_type

Constructor & Destructor Documentation

◆ kll_sketch() [1/3]

◆ kll_sketch() [2/3]

◆ kll_sketch() [3/3]

Member Function Documentation

◆ begin()

◆ deserialize() [1/2]

◆ deserialize() [2/2]

◆ end()

◆ get_allocator()

◆ get_CDF()

◆ get_comparator()

◆ get_k()

◆ get_max_item()

◆ get_max_serialized_size_bytes() [1/2]

◆ get_max_serialized_size_bytes() [2/2]

◆ get_min_item()

◆ get_n()

◆ get_normalized_rank_error()

◆ get_num_retained()

◆ get_PMF()

◆ get_quantile()

◆ get_rank()

◆ get_serialized_size_bytes() [1/2]

◆ get_serialized_size_bytes() [2/2]

◆ get_sorted_view()

◆ is_empty()

◆ is_estimation_mode()

◆ merge()

◆ operator=() [1/2]

◆ operator=() [2/2]

◆ serialize() [1/2]

◆ serialize() [2/2]

◆ to_string()

◆ update()

template<typename T, typename C = std::less<T>, typename A = std::allocator<T>>
class datasketches::kll_sketch< T, C, A >