Sketches · JSON Structure
Sketches Structure
JSON structure documentation for probabilistic data structures (sketches) used in streaming analytics and approximate query processing
Type:
Properties: 0
Data StructuresProbabilistic AlgorithmsStreaming AnalyticsApproximate Query ProcessingBig DataReal-Time Analytics
Sketches Structure is a JSON Structure definition published by Sketches. It conforms to the http://json-schema.org/draft-07/schema# meta-schema.
Meta-schema: http://json-schema.org/draft-07/schema#
JSON Structure
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Probabilistic Sketches Structure",
"description": "JSON structure documentation for probabilistic data structures (sketches) used in streaming analytics and approximate query processing",
"version": "1.0",
"resources": {
"HyperLogLog": {
"description": "Cardinality estimation sketch for counting distinct values with ~2% error using O(log log n) space",
"useCase": "COUNT DISTINCT queries on large datasets, unique visitor counting, distinct IP tracking",
"fields": {
"precision": { "type": "integer", "range": "4-18", "description": "Precision bits controlling accuracy vs memory tradeoff" },
"estimate": { "type": "number", "description": "Estimated cardinality (distinct count)" },
"errorRate": { "type": "number", "description": "Standard error rate = 1.04/sqrt(2^precision)" },
"serialized": { "type": "string", "description": "Base64-encoded serialized sketch for storage/transfer" }
},
"implementations": ["Redis HLL", "Apache DataSketches HLL", "Amazon Redshift HLL", "Google BigQuery APPROX_COUNT_DISTINCT"]
},
"CountMinSketch": {
"description": "Frequency estimation sketch that approximates the frequency of elements in a data stream",
"useCase": "Heavy hitter detection, network traffic analysis, trending topics, frequency queries",
"fields": {
"width": { "type": "integer", "description": "Number of columns (controls error rate = e/width)" },
"depth": { "type": "integer", "description": "Number of hash functions/rows (controls failure probability)" },
"errorRate": { "type": "number", "description": "Maximum overcount per query = epsilon × total_count" },
"confidence": { "type": "number", "description": "Probability that estimate is within error bound" }
},
"implementations": ["Redis CMS", "Apache DataSketches Frequency", "RedisBloom CM-SKETCH"]
},
"BloomFilter": {
"description": "Probabilistic membership testing structure that answers 'might be in set' or 'definitely not in set'",
"useCase": "Cache invalidation, URL deduplication, spell checkers, database query optimization",
"fields": {
"capacity": { "type": "integer", "description": "Expected number of elements to insert" },
"falsePositiveRate": { "type": "number", "description": "Target false positive probability (e.g., 0.01 = 1%)" },
"bitArraySize": { "type": "integer", "description": "Size of the bit array in bits" },
"hashFunctions": { "type": "integer", "description": "Number of hash functions used" },
"itemCount": { "type": "integer", "description": "Number of items currently in the filter" }
},
"implementations": ["Redis BF", "Apache DataSketches", "Guava BloomFilter", "RedisBloom BF"]
},
"TDigest": {
"description": "Quantile estimation sketch for computing percentiles, medians, and trimmed means over streaming data",
"useCase": "P95/P99 latency tracking, histogram approximation, percentile analytics, SLA monitoring",
"fields": {
"compression": { "type": "number", "description": "Compression factor controlling accuracy (higher = more accurate)" },
"quantile": { "type": "number", "range": "0-1", "description": "The quantile to estimate (e.g., 0.99 for P99)" },
"estimate": { "type": "number", "description": "Estimated value at the queried quantile" },
"count": { "type": "integer", "description": "Total count of items added to the sketch" }
},
"implementations": ["Elasticsearch T-Digest", "Apache Druid", "InfluxDB", "Prometheus histogram_quantile"]
},
"ThetaSketch": {
"description": "Set operations sketch from Apache DataSketches enabling union, intersection, and difference on estimated sets",
"useCase": "Audience overlap analysis, A/B group comparisons, multi-dimensional cardinality, funnel analysis",
"fields": {
"k": { "type": "integer", "description": "Nominal entries parameter controlling accuracy (power of 2)" },
"theta": { "type": "number", "description": "The sampling threshold (lower = smaller sketch)" },
"estimate": { "type": "number", "description": "Estimated cardinality of the set" },
"isEmpty": { "type": "boolean", "description": "Whether the sketch contains no data" }
},
"operations": ["union", "intersection", "aNotB (set difference)"],
"implementations": ["Apache DataSketches Theta", "Apache Druid Theta Sketch"]
}
},
"tradeoffs": {
"memoryVsAccuracy": "All sketches trade memory for accuracy. Higher memory configuration yields lower error rates.",
"exactVsApproximate": "Exact algorithms (e.g., exact COUNT DISTINCT) require memory proportional to data size. Sketches use fixed memory with bounded error.",
"updateVsQuery": "Sketches are optimized for high-throughput updates; some (like T-Digest) have O(log n) query time.",
"mergeable": "Most sketches are mergeable: two sketches of the same type can be combined into one, enabling distributed computation."
}
}