Sketches · JSON Structure

Sketches Structure

JSON structure documentation for probabilistic data structures (sketches) used in streaming analytics and approximate query processing

Type: Properties: 0
Data StructuresProbabilistic AlgorithmsStreaming AnalyticsApproximate Query ProcessingBig DataReal-Time Analytics

Sketches Structure is a JSON Structure definition published by Sketches. It conforms to the http://json-schema.org/draft-07/schema# meta-schema.

Meta-schema: http://json-schema.org/draft-07/schema#

JSON Structure

sketches-structure.json Raw ↑
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Probabilistic Sketches Structure",
  "description": "JSON structure documentation for probabilistic data structures (sketches) used in streaming analytics and approximate query processing",
  "version": "1.0",
  "resources": {
    "HyperLogLog": {
      "description": "Cardinality estimation sketch for counting distinct values with ~2% error using O(log log n) space",
      "useCase": "COUNT DISTINCT queries on large datasets, unique visitor counting, distinct IP tracking",
      "fields": {
        "precision": { "type": "integer", "range": "4-18", "description": "Precision bits controlling accuracy vs memory tradeoff" },
        "estimate": { "type": "number", "description": "Estimated cardinality (distinct count)" },
        "errorRate": { "type": "number", "description": "Standard error rate = 1.04/sqrt(2^precision)" },
        "serialized": { "type": "string", "description": "Base64-encoded serialized sketch for storage/transfer" }
      },
      "implementations": ["Redis HLL", "Apache DataSketches HLL", "Amazon Redshift HLL", "Google BigQuery APPROX_COUNT_DISTINCT"]
    },
    "CountMinSketch": {
      "description": "Frequency estimation sketch that approximates the frequency of elements in a data stream",
      "useCase": "Heavy hitter detection, network traffic analysis, trending topics, frequency queries",
      "fields": {
        "width": { "type": "integer", "description": "Number of columns (controls error rate = e/width)" },
        "depth": { "type": "integer", "description": "Number of hash functions/rows (controls failure probability)" },
        "errorRate": { "type": "number", "description": "Maximum overcount per query = epsilon × total_count" },
        "confidence": { "type": "number", "description": "Probability that estimate is within error bound" }
      },
      "implementations": ["Redis CMS", "Apache DataSketches Frequency", "RedisBloom CM-SKETCH"]
    },
    "BloomFilter": {
      "description": "Probabilistic membership testing structure that answers 'might be in set' or 'definitely not in set'",
      "useCase": "Cache invalidation, URL deduplication, spell checkers, database query optimization",
      "fields": {
        "capacity": { "type": "integer", "description": "Expected number of elements to insert" },
        "falsePositiveRate": { "type": "number", "description": "Target false positive probability (e.g., 0.01 = 1%)" },
        "bitArraySize": { "type": "integer", "description": "Size of the bit array in bits" },
        "hashFunctions": { "type": "integer", "description": "Number of hash functions used" },
        "itemCount": { "type": "integer", "description": "Number of items currently in the filter" }
      },
      "implementations": ["Redis BF", "Apache DataSketches", "Guava BloomFilter", "RedisBloom BF"]
    },
    "TDigest": {
      "description": "Quantile estimation sketch for computing percentiles, medians, and trimmed means over streaming data",
      "useCase": "P95/P99 latency tracking, histogram approximation, percentile analytics, SLA monitoring",
      "fields": {
        "compression": { "type": "number", "description": "Compression factor controlling accuracy (higher = more accurate)" },
        "quantile": { "type": "number", "range": "0-1", "description": "The quantile to estimate (e.g., 0.99 for P99)" },
        "estimate": { "type": "number", "description": "Estimated value at the queried quantile" },
        "count": { "type": "integer", "description": "Total count of items added to the sketch" }
      },
      "implementations": ["Elasticsearch T-Digest", "Apache Druid", "InfluxDB", "Prometheus histogram_quantile"]
    },
    "ThetaSketch": {
      "description": "Set operations sketch from Apache DataSketches enabling union, intersection, and difference on estimated sets",
      "useCase": "Audience overlap analysis, A/B group comparisons, multi-dimensional cardinality, funnel analysis",
      "fields": {
        "k": { "type": "integer", "description": "Nominal entries parameter controlling accuracy (power of 2)" },
        "theta": { "type": "number", "description": "The sampling threshold (lower = smaller sketch)" },
        "estimate": { "type": "number", "description": "Estimated cardinality of the set" },
        "isEmpty": { "type": "boolean", "description": "Whether the sketch contains no data" }
      },
      "operations": ["union", "intersection", "aNotB (set difference)"],
      "implementations": ["Apache DataSketches Theta", "Apache Druid Theta Sketch"]
    }
  },
  "tradeoffs": {
    "memoryVsAccuracy": "All sketches trade memory for accuracy. Higher memory configuration yields lower error rates.",
    "exactVsApproximate": "Exact algorithms (e.g., exact COUNT DISTINCT) require memory proportional to data size. Sketches use fixed memory with bounded error.",
    "updateVsQuery": "Sketches are optimized for high-throughput updates; some (like T-Digest) have O(log n) query time.",
    "mergeable": "Most sketches are mergeable: two sketches of the same type can be combined into one, enabling distributed computation."
  }
}