Examples

DDSketch Performance Comparison

Compare different mapping types for DDSketch:

from QuantileFlow import DDSketch
import time
import numpy as np

# Generate test data (seeded so repeated runs use the same sample)
np.random.seed(42)
data = np.random.normal(0, 1, 100000)

# Mapping types to benchmark, and the quantiles queried while timing
mappings = ['logarithmic', 'lin_interpol', 'cub_interpol']
query_quantiles = [0.1, 0.5, 0.9, 0.95, 0.99]
results = {}

for mapping in mappings:
    # Create a fresh sketch with this mapping
    sketch = DDSketch(relative_accuracy=0.01, mapping_type=mapping)

    # Measure insertion time.
    # time.perf_counter() is the high-resolution monotonic clock meant for
    # timing short intervals; time.time() is wall-clock time, which can be
    # coarse and can jump if the system clock is adjusted.
    start = time.perf_counter()
    for value in data:
        sketch.insert(value)
    insert_time = time.perf_counter() - start

    # Measure query time (only a handful of calls, so clock resolution matters)
    start = time.perf_counter()
    for q in query_quantiles:
        sketch.quantile(q)
    query_time = time.perf_counter() - start

    # Store timings together with two representative quantile estimates
    results[mapping] = {
        'insert_time': insert_time,
        'query_time': query_time,
        'p50': sketch.quantile(0.5),
        'p99': sketch.quantile(0.99),
    }

# Print comparison
print("Mapping Type Comparison:")
for mapping, metrics in results.items():
    print(f"\n{mapping}:")
    print(f"  Insert time: {metrics['insert_time']:.4f}s")
    print(f"  Query time: {metrics['query_time']:.4f}s")
    print(f"  p50: {metrics['p50']:.4f}")
    print(f"  p99: {metrics['p99']:.4f}")

Handling Outliers with Sparse Storage

For datasets with outliers that span a wide range, sparse storage can be more appropriate:

from QuantileFlow import DDSketch
from QuantileFlow.ddsketch import BucketManagementStrategy
import numpy as np

# Mostly standard-normal samples, plus a small batch of very large values
normal_data = np.random.normal(0, 1, 10000)
outliers = np.random.uniform(1000, 10000, 50)
mixed_data = np.concatenate([normal_data, outliers])

# Two sketches with the same accuracy target, differing only in the
# bucket-management strategy they are given
contiguous_sketch = DDSketch(relative_accuracy=0.01, bucket_strategy=BucketManagementStrategy.FIXED)
sparse_sketch = DDSketch(relative_accuracy=0.01, bucket_strategy=BucketManagementStrategy.UNLIMITED)

# Feed every value to both sketches (contiguous first, then sparse)
for value in mixed_data:
    for target in (contiguous_sketch, sparse_sketch):
        target.insert(value)

# Report the two estimates side by side for each quantile
quantiles = [0.5, 0.9, 0.99, 0.999]
print("Contiguous vs Sparse Storage with Outliers:")
for q in quantiles:
    c_val, s_val = contiguous_sketch.quantile(q), sparse_sketch.quantile(q)
    print(f"q={q}: Contiguous={c_val:.4f}, Sparse={s_val:.4f}")

MomentSketch for Summary Statistics

MomentSketch can provide additional statistics beyond quantiles:

from QuantileFlow import MomentSketch
import numpy as np

# Draw 10,000 normally distributed samples (loc=5, scale=2)
data = np.random.normal(5, 2, 10000)

# Build a sketch that tracks 20 moments and feed it the samples one by one
sketch = MomentSketch(num_moments=20)
for value in data:
    sketch.insert(value)

# Ask the sketch for its summary dictionary
stats = sketch.summary_statistics()

print("Summary Statistics:")
# The five-number summary shares one format; count and mean follow
for label, key in (
    ("Min", "min"),
    ("Q1", "q1"),
    ("Median", "median"),
    ("Q3", "q3"),
    ("Max", "max"),
):
    print(f"{label}: {stats[key]:.4f}")
print(f"Count: {stats['count']}")
print(f"Mean: {stats['mean']:.4f}")

# Exact reference values computed directly from the raw samples
actual_min, actual_max, actual_mean = np.min(data), np.max(data), np.mean(data)
print(f"\nActual Min: {actual_min:.4f}")
print(f"Actual Max: {actual_max:.4f}")
print(f"Actual Mean: {actual_mean:.4f}")

# Plot the distribution held by the sketch
fig = sketch.plot_distribution()

HDRHistogram for Wide-Range Data

HDRHistogram is particularly useful for data spanning multiple orders of magnitude:

from QuantileFlow import HDRHistogram
import numpy as np

# Four clusters of 1,000 samples each, centred on 1, 10, 100 and 1000,
# so the combined sample covers several orders of magnitude
clusters = ((1, 0.1), (10, 1), (100, 10), (1000, 100))
data = np.concatenate(
    [np.random.normal(center, spread, 1000) for center, spread in clusters]
)

# Histogram configured with 12 buckets and a trackable range of 0.1 to 10000.0
histogram = HDRHistogram(num_buckets=12, min_value=0.1, max_value=10000.0)

# Insert the samples one at a time
for value in data:
    histogram.insert(value)

# Ask the histogram for its summary dictionary
stats = histogram.summary_statistics()

print("Summary Statistics:")
# The five-number summary shares one format; the count is printed unformatted
for label, key in (
    ("Min", "min"),
    ("Q1", "q1"),
    ("Median", "median"),
    ("Q3", "q3"),
    ("Max", "max"),
):
    print(f"{label}: {stats[key]:.4f}")
print(f"Count: {stats['count']}")

# Exact reference values computed directly from the raw samples
actual_min, actual_max, actual_median = np.min(data), np.max(data), np.median(data)
print(f"\nActual Min: {actual_min:.4f}")
print(f"Actual Max: {actual_max:.4f}")
print(f"Actual Median: {actual_median:.4f}")

# Plot the distribution held by the histogram
fig = histogram.plot_distribution()

Comparing All Three Algorithms

In the code block below, we show how to compare the performance and accuracy of all three algorithms. Note that MomentSketch gives wildly inaccurate estimates because we’re sampling from a log-normal distribution (it works perfectly if you just change the distribution to something with a smaller tail, such as log); we hypothesize this occurs for two reasons:

(1) Moments (especially higher ones) are extremely sensitive to the long tail in lognormal distributions. Each moment becomes increasingly dominated by the extreme values.

(2) The maximum entropy optimization used to reconstruct the distribution from moments struggles to accurately capture both the bulk and tail of highly skewed distributions.

from QuantileFlow import DDSketch, MomentSketch, HDRHistogram
import numpy as np
import time

# Generate test data (seeded so repeated runs use the same sample)
np.random.seed(42)
data = np.random.lognormal(0, 2, 10000)  # Log-normal distribution

# Initialize all three algorithms
dd_sketch = DDSketch(relative_accuracy=0.01)
moment_sketch = MomentSketch(num_moments=20)
hdr_histogram = HDRHistogram(num_buckets=12)


def _time_insertion(sketch, values):
    """Insert every item of ``values`` into ``sketch``; return elapsed seconds.

    Uses time.perf_counter(), the high-resolution monotonic clock intended
    for measuring short intervals, rather than wall-clock time.time().
    """
    start = time.perf_counter()
    for value in values:
        sketch.insert(value)
    return time.perf_counter() - start


# Time insertion (same data, same order, one algorithm at a time)
dd_time = _time_insertion(dd_sketch, data)
moment_time = _time_insertion(moment_sketch, data)
hdr_time = _time_insertion(hdr_histogram, data)

# Get quantiles
quantiles = [0.5, 0.9, 0.99]
print("Performance Comparison:")
print(f"DDSketch insertion time: {dd_time:.4f}s")
print(f"MomentSketch insertion time: {moment_time:.4f}s")
print(f"HDRHistogram insertion time: {hdr_time:.4f}s")

print("\nQuantile Comparison:")
print(f"{'Quantile':>10} | {'DDSketch':>10} | {'MomentSketch':>12} | {'HDRHistogram':>12} | {'Actual':>10}")
print("-" * 70)

for q in quantiles:
    # Each sketch's estimate next to the exact quantile of the raw data
    dd_val = dd_sketch.quantile(q)
    moment_val = moment_sketch.quantile(q)
    hdr_val = hdr_histogram.quantile(q)
    actual_val = np.quantile(data, q)

    print(f"{q:10.2f} | {dd_val:10.4f} | {moment_val:12.4f} | {hdr_val:12.4f} | {actual_val:10.4f}")