diff --git a/conf/defaults.yaml b/conf/defaults.yaml index 4368099725c..bc9a5979d4c 100644 --- a/conf/defaults.yaml +++ b/conf/defaults.yaml @@ -25,7 +25,7 @@ java.library.path: "/usr/local/lib:/opt/local/lib:/usr/lib:/usr/lib64" storm.local.dir: "storm-local" storm.log4j2.conf.dir: "log4j2" storm.zookeeper.servers: - - "localhost" + - "localhost" storm.zookeeper.port: 2181 storm.zookeeper.root: "/storm" storm.zookeeper.session.timeout: 20000 @@ -52,7 +52,7 @@ storm.nimbus.retry.intervalceiling.millis: 60000 storm.nimbus.zookeeper.acls.check: true storm.nimbus.zookeeper.acls.fixup: true -storm.auth.simple-white-list.users: [] +storm.auth.simple-white-list.users: [ ] storm.cluster.state.store: "org.apache.storm.cluster.ZKStateStorageFactory" storm.meta.serialization.delegate: "org.apache.storm.serialization.GzipThriftSerializationDelegate" storm.codedistributor.class: "org.apache.storm.codedistributor.LocalFileSystemCodeDistributor" @@ -62,7 +62,7 @@ storm.health.check.timeout.ms: 5000 storm.disable.symlinks: false ### nimbus.* configs are for the master -nimbus.seeds : ["localhost"] +nimbus.seeds: [ "localhost" ] nimbus.thrift.port: 6627 nimbus.thrift.threads: 64 nimbus.thrift.max_buffer_size: 1048576 @@ -163,10 +163,10 @@ storm.blobstore.acl.validation.enabled: false ### supervisor.* configs are for node supervisors # Define the amount of workers that can be run on this machine. Each worker is assigned a port to use for communication supervisor.slots.ports: - - 6700 - - 6701 - - 6702 - - 6703 + - 6700 + - 6701 + - 6702 + - 6703 supervisor.childopts: "-Xmx256m" supervisor.run.worker.as.user: false #how long supervisor will wait to ensure that a worker process is started @@ -184,8 +184,8 @@ supervisor.worker.heartbeats.max.timeout.secs: 600 #For topology configurable heartbeat timeout, maximum allowed heartbeat timeout. 
worker.max.timeout.secs: 600 supervisor.enable: true -supervisor.supervisors: [] -supervisor.supervisors.commands: [] +supervisor.supervisors: [ ] +supervisor.supervisors.commands: [ ] supervisor.memory.capacity.mb: 4096.0 #By convention 1 cpu core should be about 100, but this can be adjusted if needed # using 100 makes it simple to set the desired value to the capacity measurement @@ -278,6 +278,8 @@ topology.max.task.parallelism: null topology.max.spout.pending: null # ideally should be larger than topology.producer.batch.size. (esp. if topology.batch.flush.interval.millis=0) topology.state.synchronization.timeout.secs: 60 topology.stats.sample.rate: 0.05 +topology.stats.ewma.enable: false +topology.stats.ewma.smoothing.factor: 0.0625 topology.builtin.metrics.bucket.size.secs: 60 topology.fall.back.on.java.serialization: false topology.worker.childopts: null @@ -287,16 +289,16 @@ topology.worker.shared.thread.pool.size: 4 # Spout Wait Strategy - employed when there is no data to produce topology.spout.wait.strategy: "org.apache.storm.policy.WaitStrategyProgressive" -topology.spout.wait.park.microsec : 100 # park time for org.apache.storm.policy.WaitStrategyPark. Busy spins if set to 0. +topology.spout.wait.park.microsec: 100 # park time for org.apache.storm.policy.WaitStrategyPark. Busy spins if set to 0. 
topology.spout.wait.progressive.level1.count: 0 # number of iterations to spend in level 1 [no sleep] of WaitStrategyProgressive, before progressing to level 2 topology.spout.wait.progressive.level2.count: 0 # number of iterations to spend in level 2 [parkNanos(1)] of WaitStrategyProgressive, before progressing to level 3 topology.spout.wait.progressive.level3.sleep.millis: 1 # sleep duration for idling iterations in level 3 of WaitStrategyProgressive # Bolt Wait Strategy - employed when there is no data in its receive buffer to process -topology.bolt.wait.strategy : "org.apache.storm.policy.WaitStrategyProgressive" +topology.bolt.wait.strategy: "org.apache.storm.policy.WaitStrategyProgressive" -topology.bolt.wait.park.microsec : 100 # park time for org.apache.storm.policy.WaitStrategyPark. Busy spins if set to 0. +topology.bolt.wait.park.microsec: 100 # park time for org.apache.storm.policy.WaitStrategyPark. Busy spins if set to 0. topology.bolt.wait.progressive.level1.count: 1 # number of iterations to spend in level 1 [no sleep] of WaitStrategyProgressive, before progressing to level 2 topology.bolt.wait.progressive.level2.count: 1000 # number of iterations to spend in level 2 [parkNanos(1)] of WaitStrategyProgressive, before progressing to level 3 @@ -363,7 +365,7 @@ blacklist.scheduler.assume.supervisor.bad.based.on.bad.slot: true dev.zookeeper.path: "/tmp/dev-storm-zookeeper" -pacemaker.servers: [] +pacemaker.servers: [ ] pacemaker.port: 6699 pacemaker.base.threads: 10 pacemaker.max.threads: 50 @@ -371,12 +373,12 @@ pacemaker.client.max.threads: 2 pacemaker.thread.timeout: 10 pacemaker.childopts: "-Xmx1024m" pacemaker.auth.method: "NONE" -pacemaker.kerberos.users: [] +pacemaker.kerberos.users: [ ] pacemaker.thrift.message.size.max: 10485760 #default storm daemon metrics reporter plugins storm.daemon.metrics.reporter.plugins: - - "org.apache.storm.daemon.metrics.reporters.JmxPreparableReporter" + - 
"org.apache.storm.daemon.metrics.reporters.JmxPreparableReporter" storm.daemon.metrics.reporter.interval.secs: 10 storm.metricstore.class: "org.apache.storm.metricstore.rocksdb.RocksDbStore" @@ -399,8 +401,8 @@ storm.cgroup.inherit.cpuset.configs: false # Configs for CGroup support storm.cgroup.hierarchy.dir: "/cgroup/storm_resources" storm.cgroup.resources: - - "cpu" - - "memory" + - "cpu" + - "memory" storm.cgroup.hierarchy.name: "storm" storm.supervisor.cgroup.rootdir: "storm" storm.cgroup.cgexec.cmd: "/bin/cgexec" @@ -419,12 +421,12 @@ storm.worker.min.cpu.pcore.percent: 0.0 storm.topology.classpath.beginning.enabled: false worker.metrics: - "CGroupMemory": "org.apache.storm.metrics2.cgroup.CGroupMemoryUsage" - "CGroupMemoryLimit": "org.apache.storm.metrics2.cgroup.CGroupMemoryLimit" - "CGroupCpu": "org.apache.storm.metrics2.cgroup.CGroupCpu" - "CGroupCpuGuarantee": "org.apache.storm.metrics2.cgroup.CGroupCpuGuarantee" - "CGroupCpuGuaranteeByCfsQuota": "org.apache.storm.metrics2.cgroup.CGroupCpuGuaranteeByCfsQuota" - "CGroupCpuStat": "org.apache.storm.metrics2.cgroup.CGroupCpuStat" + "CGroupMemory": "org.apache.storm.metrics2.cgroup.CGroupMemoryUsage" + "CGroupMemoryLimit": "org.apache.storm.metrics2.cgroup.CGroupMemoryLimit" + "CGroupCpu": "org.apache.storm.metrics2.cgroup.CGroupCpu" + "CGroupCpuGuarantee": "org.apache.storm.metrics2.cgroup.CGroupCpuGuarantee" + "CGroupCpuGuaranteeByCfsQuota": "org.apache.storm.metrics2.cgroup.CGroupCpuGuaranteeByCfsQuota" + "CGroupCpuStat": "org.apache.storm.metrics2.cgroup.CGroupCpuStat" # The number of buckets for running statistics num.stat.buckets: 20 diff --git a/dev-tools/simulators/jitter_aware_scheduler_sim.py b/dev-tools/simulators/jitter_aware_scheduler_sim.py new file mode 100644 index 00000000000..8e0a1dd7a35 --- /dev/null +++ b/dev-tools/simulators/jitter_aware_scheduler_sim.py @@ -0,0 +1,545 @@ +""" +JCQueue Discrete Event Simulator +================================= +Compares two Apache Storm JCQueue
scheduling strategies: + + 1. FIFO pure — poll one tuple at a time, O(1), no reordering + 2. FIFO + batch — poll N tuples, sort by downstream EWMA jitter + (lowest-jitter task served first, mirrors + TaskJitterComparator), pay a flat reorder_cost + in ticks before draining the batch + +Event model +----------- + - Each producer fires ARRIVE events at Poisson-like intervals + (Uniform[1, arrive_rate]). + - The consumer fires CONSUME events; it is single-threaded and + processes consume_cost ticks per tuple. + - The event heap drives a proper discrete-event loop — arrival and + consumption advance on independent clocks, so queue depth and + back-pressure emerge naturally. + +Metrics +------- + Throughput — events consumed / total ticks (rolling window + overall) + Latency — clock-ticks from tuple birth to end-of-consume (P50/P95/P99/avg) + EWMA jitter — per child-task inter-delivery deviation from its EWMA prediction; + jitter_t = |actual_inter - ewma_before_update|, + global_jitter = EWMA(jitter_t) across all child tasks + +Usage +----- + python jcqueue_sim.py # defaults + python jcqueue_sim.py --events 5000 \\ + --batch-n 8 --reorder-cost 5 --alpha 0.1 + python jcqueue_sim.py --plot # requires matplotlib + python jcqueue_sim.py --sweep reorder-cost # sensitivity sweep +""" + +import argparse +import heapq +import random +import statistics +import sys +from dataclasses import dataclass +from typing import List, Optional + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +ARRIVE = 0 +CONSUME = 1 + + +# --------------------------------------------------------------------------- +# Result container +# --------------------------------------------------------------------------- + +@dataclass +class SimResult: + name: str + throughput: float # events / total ticks + avg_latency: float # mean ticks from birth to consume + p50_latency: float + 
p95_latency: float + p99_latency: float + avg_jitter: float # final global EWMA jitter + tp_windows: List[float] # throughput snapshot per window + jit_windows: List[float] # ewma jitter snapshot per window + lat_all: List[float] # raw per-event latencies (for CDF) + + +# --------------------------------------------------------------------------- +# Core simulator +# --------------------------------------------------------------------------- + +def simulate( + *, + n_producers: int = 4, + queue_size: int = 64, + arrive_rate: int = 3, # max ticks between successive arrivals per producer + consume_cost: int = 12, # ticks to process one tuple + use_batch: bool = False, + batch_n: int = 8, # reorder buffer size (JCQueue: REORDER_BUFFER_SIZE) + reorder_cost: int = 5, # overhead ticks for the batch sort + alpha: float = 0.10, # EWMA smoothing factor + n_events: int = 5000, # tuples to consume before stopping + n_tasks: int = 6, # distinct downstream task ids + task_skew: float = 0.40, # fraction of extra traffic sent to hot task-0 + window_size: int = 50, # events between metric snapshots + seed: Optional[int] = 42, + name: str = "sim", +) -> SimResult: + """ + Run one simulation and return a SimResult. + + The consumer is intentionally slower than the aggregate producer rate + (consume_cost > arrive_rate) so the queue builds pressure and latency + accumulates meaningfully. 
+ """ + rng = random.Random(seed) + + # Task weight distribution: task-0 is hot + weights = [1.0] * n_tasks + weights[0] += task_skew * (n_tasks - 1) + total_w = sum(weights) + cum_w = [sum(weights[: i + 1]) / total_w for i in range(n_tasks)] + + def pick_task() -> int: + r = rng.random() + for i, cw in enumerate(cum_w): + if r <= cw: + return i + return n_tasks - 1 + + # Event heap: (time, event_type, payload) + heap: list = [] + for pid in range(n_producers): + t = rng.randint(1, max(1, arrive_rate)) + heapq.heappush(heap, (t, ARRIVE, pid)) + + queue: List[dict] = [] # {born: int, task: int} + consumer_free: int = 0 + consumed: int = 0 + produced: int = 0 + produce_limit = int(n_events * 1.5) + + latencies: List[float] = [] + + # Per child-task delivery cadence. + # task_ewma[t] : EWMA of inter-delivery interval to child task t. + # Seeded on first delivery; no jitter on first sample. + # task_last_seen[t] : clock tick of the last tuple delivered to task t. + # jitter per sample = |actual_inter - ewma_before_update| (prediction error) + # global_jitter : EWMA of per-sample jitter across all child tasks. 
+ task_ewma: List[float] = [0.0] * n_tasks + task_last_seen: List[int] = [-1] * n_tasks + global_jitter: float = 0.0 + + # Windowed snapshot accumulators + tp_windows: List[float] = [] + jit_windows: List[float] = [] + win_consumed: int = 0 + win_start: int = 0 + + # Main event loop + while consumed < n_events: + if not heap: + break + + now, etype, payload = heapq.heappop(heap) + + if etype == ARRIVE: + pid = payload + if produced < produce_limit: + if len(queue) < queue_size: + queue.append({"born": now, "task": pick_task()}) + produced += 1 + + # Reschedule producer + nxt = now + max(1, rng.randint(1, arrive_rate)) + heapq.heappush(heap, (nxt, ARRIVE, pid)) + + # Wake consumer if idle + if consumer_free <= now and queue: + heapq.heappush(heap, (now, CONSUME, None)) + + elif etype == CONSUME: + if not queue or consumed >= n_events: + consumer_free = now + continue + + start = max(now, consumer_free) + + if use_batch: + take = min(batch_n, len(queue)) + batch = queue[:take] + del queue[:take] + if take > 1: + # Mirror TaskJitterComparator: sort by child-task EWMA inter-delivery + # interval ascending — the most frequent / stable downstream tasks + # are served first, smoothing their receive cadence. + batch.sort(key=lambda t: task_ewma[t["task"]]) + start += reorder_cost + else: + batch = [queue.pop(0)] + + t_end = start + for item in batch: + t_end += consume_cost + lat = t_end - item["born"] + latencies.append(float(lat)) + + # Update child-task delivery cadence and compute jitter. + # inter : actual gap between consecutive deliveries to this task. + # ewma_pred : what the EWMA predicted before seeing this sample. + # jitter : absolute prediction error |actual - predicted|. + # On first delivery the EWMA is seeded with the observed interval + # (no jitter recorded — there is no prior prediction to compare). 
+ tk = item["task"] + if task_last_seen[tk] >= 0: + inter = t_end - task_last_seen[tk] + ewma_pred = task_ewma[tk] # prediction BEFORE update + task_ewma[tk] = alpha * inter + (1.0 - alpha) * ewma_pred + jitter = abs(inter - ewma_pred) # deviation from prediction + global_jitter = alpha * jitter + (1.0 - alpha) * global_jitter + else: + # First delivery: seed EWMA, no jitter sample + task_ewma[tk] = float(consume_cost) + task_last_seen[tk] = t_end + + consumed += 1 + win_consumed += 1 + + if win_consumed >= window_size: + elapsed = t_end - win_start or 1 + tp_windows.append(win_consumed / elapsed) + jit_windows.append(global_jitter) + win_consumed = 0 + win_start = t_end + + if consumed >= n_events: + break + + consumer_free = t_end + + if queue and consumed < n_events: + heapq.heappush(heap, (consumer_free, CONSUME, None)) + + # Flush trailing window + if win_consumed > 0: + elapsed = consumer_free - win_start or 1 + tp_windows.append(win_consumed / elapsed) + jit_windows.append(global_jitter) + + total_time = consumer_free or 1 + + def percentile(data: List[float], p: float) -> float: + if not data: + return 0.0 + s = sorted(data) + return s[max(0, int(len(s) * p / 100) - 1)] + + return SimResult( + name=name, + throughput=consumed / total_time, + avg_latency=statistics.mean(latencies) if latencies else 0.0, + p50_latency=percentile(latencies, 50), + p95_latency=percentile(latencies, 95), + p99_latency=percentile(latencies, 99), + avg_jitter=global_jitter, + tp_windows=tp_windows, + jit_windows=jit_windows, + lat_all=latencies, + ) + + +# --------------------------------------------------------------------------- +# Text report +# --------------------------------------------------------------------------- + +def _bar(val: float, ref: float, width: int = 28) -> str: + if ref <= 0: + return "░" * width + filled = int(width * min(val / ref, 1.0)) + return "█" * filled + "░" * (width - filled) + + +def _delta(fifo_val: float, batch_val: float, lower_is_better: bool) 
-> str: + if fifo_val == 0: + return " n/a" + diff_pct = (batch_val - fifo_val) / fifo_val * 100 + sign = "+" if diff_pct >= 0 else "" + better = (diff_pct < 0) if lower_is_better else (diff_pct > 0) + tag = "✓ better" if better else "✗ worse " + return f"batch {sign}{diff_pct:5.1f}% ({tag})" + + +def print_report(fifo: SimResult, batch: SimResult, cfg: dict) -> None: + W = 74 + sep = "─" * W + + print(f"\n{'JCQueue Discrete Event Simulation':^{W}}") + print(sep) + print( + f" producers={cfg['n_producers']} queue_size={cfg['queue_size']}" + f" arrive_rate={cfg['arrive_rate']} consume_cost={cfg['consume_cost']}" + ) + print( + f" batch_n={cfg['batch_n']} reorder_cost={cfg['reorder_cost']}" + f" alpha={cfg['alpha']} skew={cfg['task_skew']:.0%}" + f" events={cfg['n_events']}" + ) + print(sep) + print(f" {'Metric':<28} {'FIFO pure':>10} {'Batch+reorder':>13} Delta") + print(sep) + + rows = [ + ("Throughput (evt/tick)", fifo.throughput, batch.throughput, False), + ("Avg latency (ticks)", fifo.avg_latency, batch.avg_latency, True), + ("P50 latency (ticks)", fifo.p50_latency, batch.p50_latency, True), + ("P95 latency (ticks)", fifo.p95_latency, batch.p95_latency, True), + ("P99 latency (ticks)", fifo.p99_latency, batch.p99_latency, True), + ("Avg EWMA jitter", fifo.avg_jitter, batch.avg_jitter, True), + ] + for label, fv, bv, lib in rows: + d = _delta(fv, bv, lib) + print(f" {label:<28} {fv:>10.4f} {bv:>13.4f} {d}") + + print(sep) + print("\n Latency bars (normalised to FIFO P99):\n") + ref = fifo.p99_latency or 1.0 + for label, fv, bv in [ + ("P50", fifo.p50_latency, batch.p50_latency), + ("P95", fifo.p95_latency, batch.p95_latency), + ("P99", fifo.p99_latency, batch.p99_latency), + ]: + print(f" FIFO {label} {_bar(fv, ref)} {fv:,.0f} ticks") + print(f" Batch {label} {_bar(bv, ref)} {bv:,.0f} ticks") + print() + + print(sep) + n = min(20, len(fifo.tp_windows), len(batch.tp_windows)) + print(f"\n Throughput windows (first {n}):\n") + print(f" {'Win':>4} {'FIFO':>10} 
{'Batch':>10} {'Delta%':>8}") + for i in range(n): + fv = fifo.tp_windows[i] + bv = batch.tp_windows[i] + d = (bv - fv) / fv * 100 if fv else 0.0 + sign = "+" if d >= 0 else "" + print(f" {i+1:>4} {fv:>10.5f} {bv:>10.5f} {sign}{d:>6.1f}%") + + print(f"\n{sep}\n") + + +# --------------------------------------------------------------------------- +# Sensitivity sweep +# --------------------------------------------------------------------------- + +_SWEEP_RANGES = { + "reorder-cost": [0, 2, 5, 10, 15, 20], + "batch-n": [2, 4, 8, 12, 16], + "alpha": [0.05, 0.10, 0.20, 0.30, 0.50], + "task-skew": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], + "consume-cost": [4, 8, 12, 16, 24], +} + + +def run_sweep(param: str, values: list, base_cfg: dict) -> None: + print(f"\n{'─'*74}") + print(f" Sensitivity sweep: {param}") + print(f"{'─'*74}") + print( + f" {'Value':>8} {'FIFO tp':>9} {'Batch tp':>9}" + f" {'FIFO lat':>9} {'Batch lat':>9}" + f" {'FIFO jit':>9} {'Batch jit':>9}" + ) + print(f"{'─'*74}") + for v in values: + cfg = dict(base_cfg) + cfg[param.replace("-", "_")] = v + fifo = simulate(**cfg, use_batch=False, name="fifo") + batch = simulate(**cfg, use_batch=True, name="batch") + print( + f" {str(v):>8} {fifo.throughput:>9.5f} {batch.throughput:>9.5f}" + f" {fifo.avg_latency:>9.1f} {batch.avg_latency:>9.1f}" + f" {fifo.avg_jitter:>9.3f} {batch.avg_jitter:>9.3f}" + ) + print() + + +# --------------------------------------------------------------------------- +# Matplotlib plots +# --------------------------------------------------------------------------- + +def plot_results(fifo: SimResult, batch: SimResult, cfg: dict) -> None: + try: + import matplotlib.pyplot as plt + import matplotlib.gridspec as gridspec + except ImportError: + print("matplotlib not installed: pip install matplotlib") + return + + BLUE = "#378ADD" + GREEN = "#1D9E75" + + fig = plt.figure(figsize=(14, 10)) + title = ( + f"JCQueue DES — N={cfg['batch_n']}, reorder_cost={cfg['reorder_cost']}, " + 
f"alpha={cfg['alpha']}, consume_cost={cfg['consume_cost']}, " + f"skew={cfg['task_skew']:.0%}, events={cfg['n_events']}" + ) + fig.suptitle(title, fontsize=11, fontweight="bold") + gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.42, wspace=0.32) + + n = min(len(fifo.tp_windows), len(batch.tp_windows)) + xs = list(range(1, n + 1)) + + # 1. Throughput over time + ax1 = fig.add_subplot(gs[0, 0]) + ax1.plot(xs, fifo.tp_windows[:n], label="FIFO pure", color=BLUE, lw=1.5) + ax1.plot(xs, batch.tp_windows[:n], label="Batch+reorder", color=GREEN, lw=1.5, ls="--") + ax1.set_title("Throughput (evt/tick)", fontsize=11) + ax1.set_xlabel("Window index") + ax1.set_ylabel("Events / tick") + ax1.legend(fontsize=9) + ax1.grid(alpha=0.3) + + # 2. EWMA Jitter over time + ax2 = fig.add_subplot(gs[0, 1]) + ax2.plot(xs, fifo.jit_windows[:n], label="FIFO pure", color=BLUE, lw=1.5) + ax2.plot(xs, batch.jit_windows[:n], label="Batch+reorder", color=GREEN, lw=1.5, ls="--") + ax2.set_title("EWMA Jitter over time", fontsize=11) + ax2.set_xlabel("Window index") + ax2.set_ylabel("Jitter (ticks)") + ax2.legend(fontsize=9) + ax2.grid(alpha=0.3) + + # 3. Latency percentile bars + ax3 = fig.add_subplot(gs[1, 0]) + pct_labels = ["P50", "P95", "P99"] + fifo_vals = [fifo.p50_latency, fifo.p95_latency, fifo.p99_latency] + batch_vals = [batch.p50_latency, batch.p95_latency, batch.p99_latency] + xp = range(len(pct_labels)) + w = 0.35 + ax3.bar([i - w/2 for i in xp], fifo_vals, width=w, label="FIFO pure", color=BLUE, alpha=0.85) + ax3.bar([i + w/2 for i in xp], batch_vals, width=w, label="Batch+reorder", color=GREEN, alpha=0.85) + ax3.set_xticks(list(xp)) + ax3.set_xticklabels(pct_labels) + ax3.set_title("Latency percentiles", fontsize=11) + ax3.set_ylabel("Ticks") + ax3.legend(fontsize=9) + ax3.grid(axis="y", alpha=0.3) + + # 4. 
Latency CDF + ax4 = fig.add_subplot(gs[1, 1]) + for res, label, c in [ + (fifo, "FIFO pure", BLUE), + (batch, "Batch+reorder", GREEN), + ]: + s = sorted(res.lat_all) + cdf = [i / len(s) for i in range(len(s))] + ax4.plot(s, cdf, label=label, color=c, lw=1.5) + for q in (0.50, 0.95, 0.99): + ax4.axhline(q, color="gray", lw=0.7, ls=":") + ax4.set_title("Latency CDF", fontsize=11) + ax4.set_xlabel("Latency (ticks)") + ax4.set_ylabel("Cumulative probability") + ax4.legend(fontsize=9) + ax4.grid(alpha=0.3) + + out = "jcqueue_sim.png" + plt.savefig(out, dpi=150, bbox_inches="tight") + print(f"Plot saved → {out}") + plt.show() + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="JCQueue DES: FIFO pure vs FIFO+batch reorder", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + p.add_argument("--producers", type=int, default=4, + help="Number of concurrent producer threads") + p.add_argument("--queue-size", type=int, default=64, + help="Bounded recvQueue capacity") + p.add_argument("--arrive-rate", type=int, default=3, + help="Max inter-arrival ticks per producer (Uniform[1,R])") + p.add_argument("--consume-cost", type=int, default=12, + help="Ticks to process one tuple (consumer speed)") + p.add_argument("--batch-n", type=int, default=8, + help="Reorder buffer size N") + p.add_argument("--reorder-cost", type=int, default=5, + help="Extra ticks overhead for sorting a batch") + p.add_argument("--alpha", type=float, default=0.10, + help="EWMA smoothing factor alpha in (0,1)") + p.add_argument("--events", type=int, default=5000, + help="Total tuples to consume") + p.add_argument("--tasks", type=int, default=6, + help="Number of distinct downstream task ids") + p.add_argument("--task-skew", type=float, default=0.40, + help="Extra traffic fraction to hot 
task-0 (0=uniform)") + p.add_argument("--window", type=int, default=50, + help="Events per metric snapshot window") + p.add_argument("--seed", type=int, default=42, + help="Random seed (0 = non-deterministic)") + p.add_argument("--plot", action="store_true", + help="Render matplotlib charts (pip install matplotlib)") + p.add_argument("--sweep", type=str, default=None, metavar="PARAM", + help=( + "Sensitivity sweep over PARAM. Choices: " + + ", ".join(_SWEEP_RANGES.keys()) + )) + return p + + +def main() -> None: + args = build_parser().parse_args() + seed = args.seed if args.seed != 0 else None + + base_cfg = dict( + n_producers = args.producers, + queue_size = args.queue_size, + arrive_rate = args.arrive_rate, + consume_cost = args.consume_cost, + batch_n = args.batch_n, + reorder_cost = args.reorder_cost, + alpha = args.alpha, + n_events = args.events, + n_tasks = args.tasks, + task_skew = args.task_skew, + window_size = args.window, + seed = seed, + ) + + if args.sweep: + param = args.sweep + if param not in _SWEEP_RANGES: + print(f"Unknown sweep param '{param}'. Choose from: {list(_SWEEP_RANGES)}") + sys.exit(1) + run_sweep(param, _SWEEP_RANGES[param], base_cfg) + return + + print(f"\nRunning FIFO pure ...", end=" ", flush=True) + fifo = simulate(**base_cfg, use_batch=False, name="FIFO pure") + print("done.") + + print(f"Running Batch+reorder (N={args.batch_n}, cost={args.reorder_cost}) ...", + end=" ", flush=True) + batch = simulate(**base_cfg, use_batch=True, name="Batch+reorder") + print("done.") + + print_report(fifo, batch, base_cfg) + + if args.plot: + plot_results(fifo, batch, base_cfg) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docs/Metrics.md b/docs/Metrics.md index 4a2db2728b7..fe13e9d80f5 100644 --- a/docs/Metrics.md +++ b/docs/Metrics.md @@ -180,15 +180,22 @@ Similar to the tuple counting metrics storm also collects average latency metric ##### `__complete-latency` -The complete latency is just for spouts. 
It is the average amount of time it took for `ack` or `fail` to be called for a tuple after it was emitted. If acking is disabled this metric is likely to be blank or 0 for all values, and should be ignored. +The complete latency is just for spouts. It is the average amount of time it took for `ack` or `fail` to be called for a +tuple after it was emitted. If acking is disabled this metric is likely to be blank or 0 for all values, and should be +ignored. ##### `__execute-latency` -This is just for bolts. It is the average amount of time that the bolt spent in the call to the `execute` method. The higher this gets, the lower the throughput of tuples per bolt instance. +This is just for bolts. It is the average amount of time that the bolt spent in the call to the `execute` method. The +higher this gets, the lower the throughput of tuples per bolt instance. ##### `__process-latency` -This is also just for bolts. It is the average amount of time between when `execute` was called to start processing a tuple, to when it was acked or failed by the bolt. If your bolt is a very simple bolt and the processing is synchronous then `__process-latency` and `__execute-latency` should be very close to one another, with process latency being slightly smaller. If you are doing a join or have asynchronous processing then it may take a while for a tuple to be acked so the process latency would be higher than the execute latency. +This is also just for bolts. It is the average amount of time between when `execute` was called to start processing a +tuple, to when it was acked or failed by the bolt. If your bolt is a very simple bolt and the processing is synchronous +then `__process-latency` and `__execute-latency` should be very close to one another, with process latency being +slightly smaller. If you are doing a join or have asynchronous processing then it may take a while for a tuple to be +acked so the process latency would be higher than the execute latency. 
##### `__skipped-max-spout-ms` @@ -207,6 +214,35 @@ This metric indicates the overflow count last time BP status was sent, with a mi This metric records how much time a spout was idle because the topology was deactivated. This is the total time in milliseconds, not the average amount of time and is not sub-sampled. +#### Tuple Jitter Metrics +To activate jitter-based metrics, `topology.stats.ewma.enable` must be set to `true`, which switches the system to a jitter estimation based on an Exponential Moving Average (EWMA). +In this model, jitter is dynamically updated by weighting new latency samples against historical data using a smoothing factor (`topology.stats.ewma.smoothing.factor`). +This parameter, which defaults to 0.0625 (equivalent to $1/16$ or a 4-bit right shift), determines the metric's reactivity: higher values make the jitter more sensitive to recent spikes, while lower values prioritize long-term stability. +Operators should be aware that enabling this feature triples the gauge count for every component-stream pair per task; this significant increase in metric cardinality can impact TSDB storage and costs, so backend capacity should be verified before deployment. + +##### `__complete-jitter` + +This metric is specific to spouts. It measures the variation (jitter) in the total completion time (end-to-end latency) +of tuples, calculated using the exponentially weighted moving average (EWMA) algorithm as defined in RFC 1889 §A.8 / RFC 3550 §A.8. +While `__complete-latency` indicates the average amount of time it took for a tuple to be fully processed by the +topology (from emission to the final ack), the jitter metric quantifies the consistency of that process. If acking is +disabled, this metric is likely to be blank or 0 and should be ignored. + +##### `__execute-jitter` + +This metric is specific to bolts. 
It measures the variation (jitter) in the time spent within the execute method, +calculated using the exponentially weighted moving average (EWMA) algorithm as defined in RFC 1889 §A.8 / RFC 3550 §A.8. +While `__execute-latency` provides the average time spent in the execute call, the jitter metric quantifies the +predictability of that execution time. It is a critical indicator of computational "smoothness". + +##### `__process-jitter` + +This metric is specific to bolts. It measures the variation (jitter) in the process latency, calculated using the +exponentially weighted moving average (EWMA) algorithm as defined in RFC 1889 §A.8 / RFC 3550 §A.8. +While `__process-latency` provides the average time a tuple spends being processed, the jitter metric quantifies the +stability of that processing time. It helps identify "noisy" execution environments where processing times fluctuate +significantly, even if the average remains within acceptable limits. + #### Error Reporting Metrics Storm also collects error reporting metrics for bolts and spouts. diff --git a/storm-client/src/jvm/org/apache/storm/Config.java b/storm-client/src/jvm/org/apache/storm/Config.java index e332b726b28..cb4b1e764b6 100644 --- a/storm-client/src/jvm/org/apache/storm/Config.java +++ b/storm-client/src/jvm/org/apache/storm/Config.java @@ -350,13 +350,13 @@ public class Config extends HashMap { *

comp-1 cannot exist on same worker as comp-2 or comp-3, and at most "2" comp-1 on same node

*

comp-2 and comp-4 cannot be on same worker (missing comp-1 is implied from comp-1 constraint)

* - *

- * { "comp-1": { "maxNodeCoLocationCnt": 2, "incompatibleComponents": ["comp-2", "comp-3" ] }, - * "comp-2": { "incompatibleComponents": [ "comp-4" ] } - * } - *

+ *

+ * { "comp-1": { "maxNodeCoLocationCnt": 2, "incompatibleComponents": ["comp-2", "comp-3" ] }, + * "comp-2": { "incompatibleComponents": [ "comp-4" ] } + * } + *

*/ - @IsExactlyOneOf(valueValidatorClasses = { ListOfListOfStringValidator.class, RasConstraintsTypeValidator.class }) + @IsExactlyOneOf(valueValidatorClasses = {ListOfListOfStringValidator.class, RasConstraintsTypeValidator.class}) public static final String TOPOLOGY_RAS_CONSTRAINTS = "topology.ras.constraints"; /** @@ -424,17 +424,17 @@ public class Config extends HashMap { * *

* 1. If not setting this variable or setting it as null, - * a. If RAS is not used: - * Nimbus will set it to {@link Config#TOPOLOGY_WORKERS}. - * b. If RAS is used: - * Nimbus will set it to (the estimate number of workers * {@link Config#TOPOLOGY_RAS_ACKER_EXECUTORS_PER_WORKER}). - * {@link Config#TOPOLOGY_RAS_ACKER_EXECUTORS_PER_WORKER} is default to be 1 if not set. + * a. If RAS is not used: + * Nimbus will set it to {@link Config#TOPOLOGY_WORKERS}. + * b. If RAS is used: + * Nimbus will set it to (the estimate number of workers * {@link Config#TOPOLOGY_RAS_ACKER_EXECUTORS_PER_WORKER}). + * {@link Config#TOPOLOGY_RAS_ACKER_EXECUTORS_PER_WORKER} is default to be 1 if not set. * 2. If this variable is set to 0, - * then Storm will immediately ack tuples as soon as they come off the spout, - * effectively disabling reliability. + * then Storm will immediately ack tuples as soon as they come off the spout, + * effectively disabling reliability. * 3. If this variable is set to a positive integer, - * Storm will not honor {@link Config#TOPOLOGY_RAS_ACKER_EXECUTORS_PER_WORKER} setting. - * Instead, nimbus will set it as (this variable / estimate num of workers). + * Storm will not honor {@link Config#TOPOLOGY_RAS_ACKER_EXECUTORS_PER_WORKER} setting. + * Instead, nimbus will set it as (this variable / estimate num of workers). *

*/ @IsInteger @@ -465,7 +465,7 @@ public class Config extends HashMap { *

Note that EventLoggerBolt takes care of all the implementations of IEventLogger, hence registering many * implementations (especially they're implemented as 'blocking' manner) would slow down overall topology. */ - @IsListEntryCustom(entryValidatorClasses = { EventLoggerRegistryValidator.class }) + @IsListEntryCustom(entryValidatorClasses = {EventLoggerRegistryValidator.class}) public static final String TOPOLOGY_EVENT_LOGGER_REGISTER = "topology.event.logger.register"; /** * How many executors to spawn for event logger. @@ -543,7 +543,7 @@ public class Config extends HashMap { * it's parallelism is configurable. */ - @IsListEntryCustom(entryValidatorClasses = { MetricRegistryValidator.class }) + @IsListEntryCustom(entryValidatorClasses = {MetricRegistryValidator.class}) public static final String TOPOLOGY_METRICS_CONSUMER_REGISTER = "topology.metrics.consumer.register"; /** * Enable tracking of network message byte counts per source-destination task. This is off by default as it creates tasks^2 metric @@ -596,6 +596,58 @@ public class Config extends HashMap { */ @IsPositiveNumber public static final String TOPOLOGY_STATS_SAMPLE_RATE = "topology.stats.sample.rate"; + /** + * Enabling jitter streaming calculation (RFC 1889 §A.8). + * + * @see RFC 1889 §A.8 + */ + @IsBoolean + public static final String TOPOLOGY_STATS_EWMA_ENABLE = "topology.stats.ewma.enable"; + /** + * The smoothing factor (alpha) used for exponential jitter calculation (RFC 1889 §A.8). The default value is set to 1/16. + * + * @see RFC 1889 §A.8 + */ + @CustomValidator(validatorClass = ConfigValidation.ZeroOneOpenIntervalValidator.class) + public static final String TOPOLOGY_STATS_EWMA_SMOOTHING_FACTOR = "topology.stats.ewma.smoothing.factor"; + /** + * Flag to enable or disable the feedback channel for upstream communication. + * When true, components can send unanchored tuples back to their source tasks. 
+ */ + @IsBoolean + public static final String TOPOLOGY_UPSTREAM_FEEDBACK_ENABLE = "topology.upstream.feedback.enable"; + /** + * The specific stream ID used for upstream feedback communication. + * Defaults to "__feedback" if not explicitly configured. + */ + @IsString + public static final String TOPOLOGY_UPSTREAM_FEEDBACK_STREAM_ID = "topology.upstream.feedback.stream"; + /** + * Configuration for the sampling rate of upstream feedback messages within the topology. + * + *

This ratio defines the probability with which a task will emit a feedback tuple + * (containing metrics such as EWMA jitter stats) back to its parent tasks. + * This mechanism allows parent tasks to receive performance signals from downstream + * components to facilitate adaptive flow control or load balancing.

+ * + *

Validation: Must be a double value within the open interval (0.0, 1.0). + * Values of 0.0 (disabled) or 1.0 (every tuple) are rejected by the + * {@link ConfigValidation.ZeroOneOpenIntervalValidator} to prevent improper + * configuration of the feedback loop.

+ * + *

Impact: + *

    + *
  <li>Higher values provide more precise, real-time performance data but increase + * network overhead and CPU usage on the control plane.</li>
   + *
  <li>Lower values minimize the "observer effect" on the topology's throughput + * while still providing statistical snapshots of health.</li>
   + *
+ *

+ * + * Defaults to 0.1 if not explicitly configured. + */ + @CustomValidator(validatorClass = ConfigValidation.ZeroOneOpenIntervalValidator.class) + public static final String TOPOLOGY_UPSTREAM_FEEDBACK_RATIO = "topology.upstream.feedback.ratio"; /** * The time period that builtin metrics data in bucketed into. */ @@ -754,6 +806,12 @@ public class Config extends HashMap { @IsPositiveNumber @NotNull public static final String TOPOLOGY_BACKPRESSURE_CHECK_MILLIS = "topology.backpressure.check.millis"; + /** + * Predict the backpressure consuming the jitter stats of downstream tasks. Produce deterministic flows. + * To produce the stats it is required to enable as well `topology.upstream.feedback.enable`. + */ + @IsBoolean + public static final String TOPOLOGY_BACKPRESSURE_PREDICTION_ENABLE = "topology.backpressure.prediction.enable"; /** * How often to send flush tuple to the executors for flushing out batched events. */ @@ -833,14 +891,14 @@ public class Config extends HashMap { * Topology central logging sensitivity to determine who has access to logs in central logging system. The possible values are: S0 - * Public (open to all users on grid) S1 - Restricted S2 - Confidential S3 - Secret (default.) */ - @IsString(acceptedValues = { "S0", "S1", "S2", "S3" }) + @IsString(acceptedValues = {"S0", "S1", "S2", "S3"}) public static final String TOPOLOGY_LOGGING_SENSITIVITY = "topology.logging.sensitivity"; /** * Log file the user can use to configure Log4j2. * Can be a resource in the jar (specified with classpath:/path/to/resource) or a file. * This configuration is applied in addition to the regular worker log4j2 configuration. 
* The configs are merged according to the rules here: - * https://logging.apache.org/log4j/2.x/manual/configuration.html#CompositeConfiguration + * https://logging.apache.org/log4j/2.x/manual/configuration.html#CompositeConfiguration */ @IsString public static final String TOPOLOGY_LOGGING_CONFIG_FILE = "topology.logging.config"; @@ -884,7 +942,8 @@ public class Config extends HashMap { * Alternatively set {@code storm.scheduler} to {@code org.apache.storm.scheduler.resource.ResourceAwareScheduler} * using {@link Config#TOPOLOGY_SCHEDULER_STRATEGY} set to * {@code org.apache.storm.scheduler.resource.strategies.scheduling.RoundRobinResourceAwareStrategy} - * */ + * + */ @IsInteger @IsPositiveNumber public static final String TOPOLOGY_ISOLATED_MACHINES = "topology.isolate.machines"; @@ -1434,22 +1493,34 @@ public class Config extends HashMap { @IsString public static final String STORM_ZOOKEEPER_TOPOLOGY_AUTH_SCHEME = "storm.zookeeper.topology.auth.scheme"; - /** Enable SSL/TLS for ZooKeeper client connection. */ + /** + * Enable SSL/TLS for ZooKeeper client connection. + */ @IsBoolean public static final String ZK_SSL_ENABLE = "storm.zookeeper.ssl.enable"; - /** Keystore location for ZooKeeper client connection over SSL. */ + /** + * Keystore location for ZooKeeper client connection over SSL. + */ @IsString public static final String STORM_ZOOKEEPER_SSL_KEYSTORE_PATH = "storm.zookeeper.ssl.keystore.path"; - /** Keystore password for ZooKeeper client connection over SSL. */ + /** + * Keystore password for ZooKeeper client connection over SSL. + */ @IsString public static final String STORM_ZOOKEEPER_SSL_KEYSTORE_PASSWORD = "storm.zookeeper.ssl.keystore.password"; - /** Truststore location for ZooKeeper client connection over SSL. */ + /** + * Truststore location for ZooKeeper client connection over SSL. 
+ */ @IsString public static final String STORM_ZOOKEEPER_SSL_TRUSTSTORE_PATH = "storm.zookeeper.ssl.truststore.path"; - /** Truststore password for ZooKeeper client connection over SSL. */ + /** + * Truststore password for ZooKeeper client connection over SSL. + */ @IsString public static final String STORM_ZOOKEEPER_SSL_TRUSTSTORE_PASSWORD = "storm.zookeeper.ssl.truststore.password"; - /** Enable or disable hostname verification.*/ + /** + * Enable or disable hostname verification. + */ @IsBoolean public static final String STORM_ZOOKEEPER_SSL_HOSTNAME_VERIFICATION = "storm.zookeeper.ssl.hostnameVerification"; /** @@ -1462,13 +1533,13 @@ public class Config extends HashMap { /** * Configure the topology metrics reporters to be used on workers. */ - @IsListEntryCustom(entryValidatorClasses = { MetricReportersValidator.class }) + @IsListEntryCustom(entryValidatorClasses = {MetricReportersValidator.class}) public static final String TOPOLOGY_METRICS_REPORTERS = "topology.metrics.reporters"; /** * A list of system metrics reporters that will get added to each topology. */ - @IsListEntryCustom(entryValidatorClasses = { MetricReportersValidator.class }) + @IsListEntryCustom(entryValidatorClasses = {MetricReportersValidator.class}) public static final String STORM_TOPOLOGY_METRICS_SYSTEM_REPORTERS = "storm.topology.metrics.system.reporters"; /** @@ -1476,7 +1547,7 @@ public class Config extends HashMap { * Use {@link Config#TOPOLOGY_METRICS_REPORTERS} instead. */ @Deprecated(forRemoval = true, since = "2.0.0") - @IsListEntryCustom(entryValidatorClasses = { MetricReportersValidator.class }) + @IsListEntryCustom(entryValidatorClasses = {MetricReportersValidator.class}) public static final String STORM_METRICS_REPORTERS = "storm.metrics.reporters"; /** @@ -1511,6 +1582,7 @@ public class Config extends HashMap { public static final String BLOBSTORE_HDFS_PRINCIPAL = "blobstore.hdfs.principal"; /** * keytab for nimbus/supervisor to use to access secure hdfs for the blobstore. 
+ * * @Deprecated Use {@link Config#STORM_HDFS_LOGIN_KEYTAB} instead. */ @Deprecated @@ -1753,7 +1825,7 @@ public class Config extends HashMap { */ @IsInteger public static final String STORM_MESSAGING_NETTY_CLIENT_WORKER_THREADS = - "storm.messaging.netty.client_worker_threads"; + "storm.messaging.netty.client_worker_threads"; /** * Netty based messaging: Enables TLS connections between workers. @@ -1808,7 +1880,7 @@ public class Config extends HashMap { */ @IsString public static final String STORM_MESSAGING_NETTY_TLS_CLIENT_TRUSTSTORE_PASSWORD = - "storm.messaging.netty.tls.client.truststore.password"; + "storm.messaging.netty.tls.client.truststore.password"; /** * Netty based messaging: Specifies the client keystore when TLS is enabled. @@ -1821,7 +1893,7 @@ public class Config extends HashMap { */ @IsString public static final String STORM_MESSAGING_NETTY_TLS_CLIENT_KEYSTORE_PASSWORD = - "storm.messaging.netty.tls.client.keystore.password"; + "storm.messaging.netty.tls.client.keystore.password"; /** * Netty based messaging: Specifies the protocols TLS is enabled. @@ -1830,7 +1902,7 @@ public class Config extends HashMap { public static final String STORM_MESSAGING_NETTY_TLS_SSL_PROTOCOLS = "storm.messaging.netty.tls.ssl.protocols"; /** - /** + * /** * Netty based messaging: The number of milliseconds that a Netty client will retry flushing messages that are already * buffered to be sent. */ @@ -1915,7 +1987,7 @@ public class Config extends HashMap { @IsPositiveNumber @IsInteger public static final String STORM_BLOBSTORE_DEPENDENCY_JAR_UPLOAD_CHUNK_SIZE_BYTES = - "storm.blobstore.dependency.jar.upload.chunk.size.bytes"; + "storm.blobstore.dependency.jar.upload.chunk.size.bytes"; /** * FQCN of a class that implements {@code ISubmitterHook} @see ISubmitterHook for details. */ @@ -1924,8 +1996,8 @@ public class Config extends HashMap { /** * Impersonation user ACL config entries. 
*/ - @IsMapEntryCustom(keyValidatorClasses = { ConfigValidation.StringValidator.class }, - valueValidatorClasses = { ConfigValidation.ImpersonationAclUserEntryValidator.class }) + @IsMapEntryCustom(keyValidatorClasses = {ConfigValidation.StringValidator.class}, + valueValidatorClasses = {ConfigValidation.ImpersonationAclUserEntryValidator.class}) public static final String NIMBUS_IMPERSONATION_ACL = "nimbus.impersonation.acl"; /** * A whitelist of the RAS scheduler strategies allowed by nimbus. Should be a list of fully-qualified class names or null to allow all. @@ -2430,6 +2502,7 @@ public static String getBlobstoreHDFSPrincipal(Map conf) throws UnknownHostExcep /** * Get the hostname substituted hdfs principal. + * * @param conf the storm Configuration * @return the principal * @throws UnknownHostException on UnknowHostException @@ -2458,6 +2531,7 @@ public static String getHdfsPrincipal(Map conf) throws UnknownHo /** * Get the hdfs keytab. + * * @param conf the storm Configuration * @return the keytab */ diff --git a/storm-client/src/jvm/org/apache/storm/daemon/StormCommon.java b/storm-client/src/jvm/org/apache/storm/daemon/StormCommon.java index b3cfd90d4d6..f9445bf2ef7 100644 --- a/storm-client/src/jvm/org/apache/storm/daemon/StormCommon.java +++ b/storm-client/src/jvm/org/apache/storm/daemon/StormCommon.java @@ -360,6 +360,19 @@ public static void addEventLogger(Map conf, StormTopology topolo topology.put_to_bolts(EVENTLOGGER_COMPONENT_ID, eventLoggerBolt); } + public static void addUpstreamFeedback(Map conf, StormTopology topology) { + Integer numExecutors = ObjectReader.getInt(conf.get(Config.TOPOLOGY_EVENTLOGGER_EXECUTORS), + ObjectReader.getInt(conf.get(Config.TOPOLOGY_WORKERS))); + if (numExecutors == null || numExecutors == 0) { + return; + } + String feedbackStreamId = ConfigUtils.upstreamFeedbackStreamId(conf); + for (Object component : allComponents(topology).values()) { + ComponentCommon common = getComponentCommon(component); + 
common.put_to_streams(feedbackStreamId, Thrift.outputFields(eventLoggerBoltFields())); + } + } + @SuppressWarnings("unchecked") public static Map metricsConsumerBoltSpecs(Map conf, StormTopology topology) { Map metricsConsumerBolts = new HashMap<>(); @@ -464,6 +477,10 @@ public static boolean hasEventLoggers(Map topoConf) { return eventLoggerNum == null || ObjectReader.getInt(eventLoggerNum) > 0; } + public static boolean hasUpstreamFeedback(Map topoConf) { + return ConfigUtils.upstreamFeedbackEnable(topoConf); + } + public static int numStartExecutors(Object component) throws InvalidTopologyException { ComponentCommon common = getComponentCommon(component); return Thrift.getParallelismHint(common); @@ -538,6 +555,9 @@ protected StormTopology systemTopologyImpl(Map topoConf, StormTo if (hasEventLoggers(topoConf)) { addEventLogger(topoConf, ret); } + if (hasUpstreamFeedback(topoConf)) { + addUpstreamFeedback(topoConf, ret); + } addMetricComponents(topoConf, ret); addSystemComponents(topoConf, ret); addMetricStreams(ret); diff --git a/storm-client/src/jvm/org/apache/storm/daemon/Task.java b/storm-client/src/jvm/org/apache/storm/daemon/Task.java index 45a1f2d7e3a..8020445229b 100644 --- a/storm-client/src/jvm/org/apache/storm/daemon/Task.java +++ b/storm-client/src/jvm/org/apache/storm/daemon/Task.java @@ -216,6 +216,27 @@ public void sendUnanchored(String stream, List values, ExecutorTransfer } } + /** + * Sends an unanchored feedback tuple directly to a specific task ID (typically upstream). + *

+ * This method bypasses standard stream grouping logic and routes the tuple + * exclusively to the provided {@code targetTaskId}. It is a non-blocking call: + * if the destination buffer is full, the tuple is added to the {@code pendingEmits} + * queue for later retry, preventing executor stalls. + *

+ * + * @param stream The ID of the stream to emit on (must be declared in the topology). + * @param values The data payload to be sent. + * @param targetTaskId The unique ID of the destination task (e.g., the sourceTaskId of an incoming tuple). + * @param transfer The {@link ExecutorTransfer} instance handling the physical data transfer. + * @param pendingEmits A queue used to store tuples that cannot be transferred immediately due to backpressure. + */ + public void sendUnanchoredFeedback(String stream, List values, int targetTaskId, ExecutorTransfer transfer, Queue pendingEmits) { + Tuple tuple = getTuple(stream, values); + AddressedTuple addressedTuple = new AddressedTuple(targetTaskId, tuple); + transfer.tryTransfer(addressedTuple, pendingEmits); + } + /** * Send sampled data to the eventlogger if the global or component level debug flag is set (via nimbus api). */ diff --git a/storm-client/src/jvm/org/apache/storm/executor/Executor.java b/storm-client/src/jvm/org/apache/storm/executor/Executor.java index edd77743fcd..d98a40b9226 100644 --- a/storm-client/src/jvm/org/apache/storm/executor/Executor.java +++ b/storm-client/src/jvm/org/apache/storm/executor/Executor.java @@ -23,6 +23,7 @@ import java.lang.reflect.Field; import java.net.UnknownHostException; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -30,7 +31,9 @@ import java.util.Objects; import java.util.Queue; import java.util.Random; +import java.util.Set; import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -131,6 +134,11 @@ public abstract class Executor implements Callable, JCQueue.Consumer { private final RateCounter reportedErrorCount; private final boolean enableV2MetricsDataPoints; private final Integer v2MetricsTickInterval; + protected 
final String upstreamFeedbackStreamId; + protected final boolean upstreamFeedbackEnabled; + protected final boolean predictiveBackpressureEnabled; + protected final Map>> childEwmaStats; + protected final Map> avgCache = new ConcurrentHashMap<>(); protected Executor(WorkerState workerData, List executorId, Map credentials, String type) { this.workerData = workerData; @@ -177,6 +185,8 @@ protected Executor(WorkerState workerData, List executorId, Map executorId, Map(); + this.predictiveBackpressureEnabled = ConfigUtils.backpressurePredictionEnable(topoConf); + if (this.predictiveBackpressureEnabled) { + this.receiveQueue.enablePredictiveBackpressure(this); + } + } else { + this.childEwmaStats = null; + this.predictiveBackpressureEnabled = false; + } } public static Executor mkExecutor(WorkerState workerState, List executorId, Map credentials) { @@ -364,6 +386,192 @@ public void metricsTick(Task task, TupleImpl tuple) { } } + /** + * Constructs a Storm {@link Values} object containing a snapshot of specific metrics + * to be sent as upstream feedback. + * + *

This method generates a {@link IMetricsConsumer.TaskInfo} header with a timestamp + * and a default interval of -1 (indicating an on-demand or non-standard tick), + * followed by a list of filtered DataPoints.

+ * + * @param taskId The ID of the task for which metrics are being collected. + * @param metrics A set of metric names (e.g., EWMA stats) to include in the feedback. + * @return A {@link Values} object containing [TaskInfo, List], + * compatible with the metrics stream schema. + */ + public Values buildUpstreamFeedbackTuple(int taskId, Set metrics) { + IMetricsConsumer.TaskInfo taskInfo = new IMetricsConsumer.TaskInfo( + hostname, workerTopologyContext.getThisWorkerPort(), + componentId, taskId, Time.currentTimeSecs(), -1); + return new Values(taskInfo, buildEwmaDataPoints(taskId, metrics)); + } + + /** + * Updates child task statistics by unwrapping the Storm Values object. + * + *

Extracts the {@link IMetricsConsumer.TaskInfo} and the collection of + * {@link IMetricsConsumer.DataPoint} objects from the provided tuple. Updates + * to the {@code childEwmaStats} map are thread-safe via {@link ConcurrentHashMap} + * atomic operations.

+ * + *

Data Mapping: + *

    + *
  <li>Index 0: {@link IMetricsConsumer.TaskInfo}</li>
   + *
  <li>Index 1: {@code Collection} of {@link IMetricsConsumer.DataPoint}</li>
   + *
+ *

+ * + * @param task The {@link Task} associated with this update. + * @param tuple The {@link TupleImpl} emitted by the upstream feedback builder. + */ + public void updateChildEwmaStats(Task task, TupleImpl tuple) { + if (!this.upstreamFeedbackEnabled || tuple == null) { + return; + } + + List values = tuple.getValues(); + if (values == null || values.size() < 2) { + LOG.warn("Feedback tuple for task {} has insufficient elements (size={})", + task.getTaskId(), values == null ? 0 : values.size()); + return; + } + + // Safe type check replaces unchecked cast and suppression + if (!(values.get(0) instanceof IMetricsConsumer.TaskInfo taskInfo)) { + LOG.warn("Unexpected type at index 0 in feedbackTuple for task {}: {}", + task.getTaskId(), values.get(0) == null ? "null" : values.get(0).getClass().getName()); + return; + } + + if (!(values.get(1) instanceof Collection rawDataPoints)) { + LOG.warn("Unexpected type at index 1 in feedbackTuple for task {}: {}", + task.getTaskId(), values.get(1) == null ? "null" : values.get(1).getClass().getName()); + return; + } + + int taskId = task.getTaskId(); + int childTaskId = taskInfo.srcTaskId; + + // Filter to only valid DataPoint instances, safely skipping any unexpected elements + List dataPoints = rawDataPoints.stream() + .filter(IMetricsConsumer.DataPoint.class::isInstance) + .map(IMetricsConsumer.DataPoint.class::cast) + .toList(); + + if (!dataPoints.isEmpty()) { + Map metricsMap = + getOrCreateMetricsMap(taskId, childTaskId); + + for (IMetricsConsumer.DataPoint dp : dataPoints) { + metricsMap.put(dp.name, dp); + } + + // Invalidate cached averages for this taskId since underlying data changed + avgCache.remove(taskId); + } + } + + /** + * Retrieves the collected metric statistics for all child tasks of a given parent task. + * + * @param taskId The ID of the parent task. + * @return A map of childTaskId to metric name to {@link IMetricsConsumer.DataPoint}. + * Returns an empty map if feedback is disabled or no data exists. 
+ */ + public Map> getChildEwmaStats(int taskId) { + if (!this.upstreamFeedbackEnabled) { + return Collections.emptyMap(); + } + return this.childEwmaStats.getOrDefault(taskId, Collections.emptyMap()); + } + + /** + * Calculates the average value for each metric across all child tasks + * associated with the given parent task. + * + *

Results are cached per {@code taskId} and recomputed only when the + * underlying data has changed (i.e. after a call to {@link #updateChildEwmaStats}).

+ * + * @param taskId The ID of the parent task. + * @return A map of metric name to computed average value. + * Returns an empty map if no stats are found or feedback is disabled. + */ + public Map getChildEwmaAvgStats(int taskId) { + Map> taskStats = this.getChildEwmaStats(taskId); + if (taskStats.isEmpty()) { + return Collections.emptyMap(); + } + return avgCache.computeIfAbsent(taskId, this::computeAvgStats); + } + + private Map computeAvgStats(int taskId) { + Map> taskStats = this.getChildEwmaStats(taskId); + + Map accumulators = new HashMap<>(); + + for (Map childMetrics : taskStats.values()) { + for (Map.Entry entry : childMetrics.entrySet()) { + if (entry.getValue().value instanceof Number n) { + accumulators.merge( + entry.getKey(), + new StatsAccumulator(n.doubleValue(), 1), + StatsAccumulator::combine + ); + } + } + } + + return accumulators.entrySet().stream() + .collect(Collectors.toMap( + Map.Entry::getKey, + e -> e.getValue().average() + )); + } + + private Map getOrCreateMetricsMap(int taskId, int childTaskId) { + return childEwmaStats + .computeIfAbsent(taskId, k -> new ConcurrentHashMap<>()) + .computeIfAbsent(childTaskId, k -> new ConcurrentHashMap<>()); + } + + // Functional pattern + private record StatsAccumulator(double sum, int count) { + StatsAccumulator combine(StatsAccumulator other) { + return new StatsAccumulator(this.sum + other.sum, this.count + other.count); + } + + double average() { + return count > 0 ? 
sum / count : 0.0; + } + } + + private List buildEwmaDataPoints(int taskId, Set metrics) { + if (metrics == null || metrics.isEmpty()) { + return Collections.emptyList(); + } + + List dataPoints = new ArrayList<>(metrics.size()); + Map allGauges = workerData.getMetricRegistry().getTaskGauges(taskId); + + if (allGauges == null || allGauges.isEmpty()) { + return dataPoints; + } + + for (String metricName : metrics) { + Gauge gauge = allGauges.get(metricName); + + if (gauge != null) { + Object v = (gauge instanceof PerReporterGauge) + ? ((PerReporterGauge) gauge).getValueForReporter(this) + : gauge.getValue(); + if (v instanceof Number) { + dataPoints.add(new IMetricsConsumer.DataPoint(metricName, v)); + } + } + } + return dataPoints; + } + // updates v1 metric dataPoints with v2 metric API data private void addV2Metrics(int taskId, List dataPoints, int interval) { if (!enableV2MetricsDataPoints) { diff --git a/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltExecutor.java b/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltExecutor.java index 273bab5e69f..9f099ded740 100644 --- a/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltExecutor.java +++ b/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltExecutor.java @@ -198,6 +198,13 @@ public void tupleActionFn(int taskId, TupleImpl tuple) throws Exception { outputCollector.flush(); } else if (Constants.METRICS_TICK_STREAM_ID.equals(streamId)) { metricsTick(idToTask.get(taskId - idToTaskBase), tuple); + } else if (this.upstreamFeedbackStreamId.equals(streamId)) { + if (!this.upstreamFeedbackEnabled) { + LOG.debug("Upstream feedback skipped."); + } else { + // update internal metrics + this.updateChildEwmaStats(idToTask.get(taskId - idToTaskBase), tuple); + } } else { IBolt boltObject = (IBolt) idToTask.get(taskId - idToTaskBase).getTaskObject(); boolean isSampled = sampler.getAsBoolean(); diff --git a/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltOutputCollectorImpl.java 
b/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltOutputCollectorImpl.java index 886a00c7f89..5fa89f6e3d5 100644 --- a/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltOutputCollectorImpl.java +++ b/storm-client/src/jvm/org/apache/storm/executor/bolt/BoltOutputCollectorImpl.java @@ -12,13 +12,18 @@ package org.apache.storm.executor.bolt; +import static org.apache.storm.metrics2.TaskMetrics.EWMA_METRICS_SET; + import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.function.Supplier; import org.apache.storm.daemon.Acker; +import org.apache.storm.daemon.StormCommon; import org.apache.storm.daemon.Task; import org.apache.storm.executor.ExecutorTransfer; import org.apache.storm.hooks.info.BoltAckInfo; @@ -29,6 +34,7 @@ import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.TupleImpl; import org.apache.storm.tuple.Values; +import org.apache.storm.utils.ConfigUtils; import org.apache.storm.utils.Time; import org.apache.storm.utils.Utils; import org.slf4j.Logger; @@ -43,6 +49,9 @@ public class BoltOutputCollectorImpl implements IOutputCollector { private final int taskId; private final Random random; private final boolean isEventLoggers; + private final boolean isUpstreamFeedback; + private final String upstreamFeedbackStreamId; + private Supplier upstreamFeedbackRate; private final ExecutorTransfer xsfer; private final boolean isDebug; private boolean ackingEnabled; @@ -57,6 +66,19 @@ public BoltOutputCollectorImpl(BoltExecutor executor, Task taskData, Random rand this.ackingEnabled = ackingEnabled; this.isDebug = isDebug; this.xsfer = executor.getExecutorTransfer(); + + // configure the upstream feedback if enabled. 
+ Map conf = executor.getTopoConf(); + if (StormCommon.hasUpstreamFeedback(conf)) { + this.isUpstreamFeedback = true; + this.upstreamFeedbackStreamId = ConfigUtils.upstreamFeedbackStreamId(conf); + double ratio = ConfigUtils.upstreamFeedbackRatio(conf); + this.upstreamFeedbackRate = () -> random.nextDouble() < ratio; + } else { + // explicitly declare + this.isUpstreamFeedback = false; + this.upstreamFeedbackStreamId = "__NOT_SET__"; + } } @Override @@ -93,6 +115,7 @@ private List boltEmit(String streamId, Collection anchors, List< MessageId msgId; if (ackingEnabled && anchors != null) { final Map anchorsToIds = new HashMap<>(); + final boolean sendUpstreamFeedback = isUpstreamFeedback && upstreamFeedbackRate.get(); for (Tuple a : anchors) { // perf critical path. would be nice to avoid iterator allocation here and below Set rootIds = a.getMessageId().getAnchorsToIds().keySet(); if (rootIds.size() > 0) { @@ -102,6 +125,11 @@ private List boltEmit(String streamId, Collection anchors, List< putXor(anchorsToIds, rootId, edgeId); } } + if (sendUpstreamFeedback) { + int parentTask = a.getSourceTask(); + task.sendUnanchoredFeedback(upstreamFeedbackStreamId, executor.buildUpstreamFeedbackTuple(taskId, EWMA_METRICS_SET), + parentTask, xsfer, executor.getPendingEmits()); + } } msgId = MessageId.makeId(anchorsToIds); } else { diff --git a/storm-client/src/jvm/org/apache/storm/executor/spout/SpoutExecutor.java b/storm-client/src/jvm/org/apache/storm/executor/spout/SpoutExecutor.java index 734fca2a2d6..880c22e7d41 100644 --- a/storm-client/src/jvm/org/apache/storm/executor/spout/SpoutExecutor.java +++ b/storm-client/src/jvm/org/apache/storm/executor/spout/SpoutExecutor.java @@ -321,6 +321,13 @@ public void tupleActionFn(int taskId, TupleImpl tuple) throws Exception { if (pendingForId != null) { pending.put(id, pendingForId); } + } else if (this.upstreamFeedbackStreamId.equals(streamId)) { + if (!this.upstreamFeedbackEnabled) { + LOG.debug("Upstream feedback skipped."); + } 
else { + // update internal metrics + this.updateChildEwmaStats(idToTask.get(taskId - idToTaskBase), tuple); + } } else { Long id = (Long) tuple.getValue(0); Long timeDeltaMs = (Long) tuple.getValue(1); diff --git a/storm-client/src/jvm/org/apache/storm/metrics2/EwmaGauge.java b/storm-client/src/jvm/org/apache/storm/metrics2/EwmaGauge.java new file mode 100644 index 00000000000..857e34215f0 --- /dev/null +++ b/storm-client/src/jvm/org/apache/storm/metrics2/EwmaGauge.java @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version + * 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + */ + +package org.apache.storm.metrics2; + +import static org.apache.storm.utils.ConfigUtils.RFC1889_ALPHA; + +import com.codahale.metrics.Gauge; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * Lock-free jitter estimator following RFC 1889 §A.8 / RFC 3550 §A.8. + * The jitter accumulator is stored as raw IEEE 754 bits in an AtomicLong + * so that CAS can be used without locks. + * Thread safety: addValue is lock-free; getValue is wait-free. 
+ */ +public class EwmaGauge implements Gauge { + + private static final long UNSEEDED = Long.MIN_VALUE; + private static final long ZERO_BITS = Double.doubleToLongBits(0.0); + + private final AtomicLong lastTransit = new AtomicLong(UNSEEDED); + private final AtomicLong jitterBits = new AtomicLong(ZERO_BITS); + private final double alpha; + + EwmaGauge(double alpha) { + if (alpha <= 0.0 || alpha >= 1.0 || Double.isNaN(alpha)) { + throw new IllegalArgumentException( + "alpha must be in (0, 1), got: " + alpha); + } + this.alpha = alpha; + } + + EwmaGauge() { + this(RFC1889_ALPHA); // 1.0 / 16.0 + } + + /** + * Update the jitter estimate. + * + * @param transitMs transit time for this tuple: {@code arrival - timestamp} + * Negative values are silently ignored. + */ + public void addValue(long transitMs) { + if (transitMs < 0) { + return; + } + // Seed on the very first packet: store transit, nothing to diff against yet. + if (lastTransit.compareAndSet(UNSEEDED, transitMs)) { + return; + } + long prev = lastTransit.getAndSet(transitMs); + // Safe from Math.abs(Long.MIN_VALUE) pathology: both transitMs and prev + // are >= 0 (enforced by the negative-guard), so their + // difference is in [-Long.MAX_VALUE, Long.MAX_VALUE]. + double d = Math.abs(transitMs - prev); + long currentBits; + long updatedBits; + do { + currentBits = jitterBits.get(); + double currentJitter = Double.longBitsToDouble(currentBits); + double updatedJitter = currentJitter + alpha * (d - currentJitter); + updatedBits = Double.doubleToLongBits(updatedJitter); + } while (!jitterBits.compareAndSet(currentBits, updatedBits)); + } + + /** + * Returns the current jitter estimate in timestamp units. 
+ */ + @Override + public Double getValue() { + return Double.longBitsToDouble(jitterBits.get()); + } +} diff --git a/storm-client/src/jvm/org/apache/storm/metrics2/TaskMetrics.java b/storm-client/src/jvm/org/apache/storm/metrics2/TaskMetrics.java index 0ac3a5e9492..9565467302c 100644 --- a/storm-client/src/jvm/org/apache/storm/metrics2/TaskMetrics.java +++ b/storm-client/src/jvm/org/apache/storm/metrics2/TaskMetrics.java @@ -12,10 +12,14 @@ package org.apache.storm.metrics2; -import com.codahale.metrics.Counter; +import com.codahale.metrics.Gauge; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.function.Supplier; + import org.apache.storm.task.WorkerTopologyContext; import org.apache.storm.utils.ConfigUtils; import org.apache.storm.utils.Utils; @@ -27,12 +31,24 @@ public class TaskMetrics { private static final String METRIC_NAME_TRANSFERRED = "__transfer-count"; private static final String METRIC_NAME_EXECUTED = "__execute-count"; private static final String METRIC_NAME_PROCESS_LATENCY = "__process-latency"; + public static final String METRIC_NAME_PROCESS_JITTER = "__process-jitter"; private static final String METRIC_NAME_COMPLETE_LATENCY = "__complete-latency"; + public static final String METRIC_NAME_COMPLETE_JITTER = "__complete-jitter"; private static final String METRIC_NAME_EXECUTE_LATENCY = "__execute-latency"; + public static final String METRIC_NAME_EXECUTE_JITTER = "__execute-jitter"; private static final String METRIC_NAME_CAPACITY = "__capacity"; + public static final Set EWMA_METRICS_SET = Set.of( + METRIC_NAME_PROCESS_JITTER, + METRIC_NAME_COMPLETE_JITTER, + METRIC_NAME_EXECUTE_JITTER + ); + private final ConcurrentMap rateCounters = new ConcurrentHashMap<>(); - private final ConcurrentMap gauges = new ConcurrentHashMap<>(); + private final ConcurrentMap> gauges = new ConcurrentHashMap<>(); + // Gauge supplier 
singleton factories + private final Supplier ewmaGaugeFactory; + private final Supplier rollingAverageGaugeFactory; private final String topologyId; private final String componentId; @@ -40,6 +56,7 @@ public class TaskMetrics { private final Integer workerPort; private final StormMetricRegistry metricRegistry; private final int samplingRate; + private final boolean ewmaEnable; public TaskMetrics(WorkerTopologyContext context, String componentId, Integer taskid, @@ -50,6 +67,10 @@ public TaskMetrics(WorkerTopologyContext context, String componentId, Integer ta this.taskId = taskid; this.workerPort = context.getThisWorkerPort(); this.samplingRate = ConfigUtils.samplingRate(topoConf); + double ewmaSmoothingFactor = ConfigUtils.ewmaSmoothingFactor(topoConf); + this.ewmaEnable = ConfigUtils.ewmaEnable(topoConf); + this.rollingAverageGaugeFactory = RollingAverageGauge::new; + this.ewmaGaugeFactory = () -> new EwmaGauge(ewmaSmoothingFactor); } public void setCapacity(double capacity) { @@ -67,6 +88,12 @@ public void spoutAckedTuple(String streamId, long latencyMs) { metricName = METRIC_NAME_COMPLETE_LATENCY + "-" + streamId; RollingAverageGauge gauge = this.getRollingAverageGauge(metricName, streamId); gauge.addValue(latencyMs); + + if (this.ewmaEnable) { + metricName = METRIC_NAME_COMPLETE_JITTER + "-" + streamId; + EwmaGauge ewmaGauge = this.getExponentialWeightedMovingAverageGauge(metricName, streamId); + ewmaGauge.addValue(latencyMs); + } } public void boltAckedTuple(String sourceComponentId, String sourceStreamId, long latencyMs) { @@ -78,6 +105,12 @@ public void boltAckedTuple(String sourceComponentId, String sourceStreamId, long metricName = METRIC_NAME_PROCESS_LATENCY + "-" + key; RollingAverageGauge gauge = this.getRollingAverageGauge(metricName, sourceStreamId); gauge.addValue(latencyMs); + + if (this.ewmaEnable) { + metricName = METRIC_NAME_PROCESS_JITTER + "-" + key; + EwmaGauge ewmaGauge = this.getExponentialWeightedMovingAverageGauge(metricName, 
sourceStreamId); + ewmaGauge.addValue(latencyMs); + } } public void spoutFailedTuple(String streamId) { @@ -117,6 +150,12 @@ public void boltExecuteTuple(String sourceComponentId, String sourceStreamId, lo metricName = METRIC_NAME_EXECUTE_LATENCY + "-" + key; RollingAverageGauge gauge = this.getRollingAverageGauge(metricName, sourceStreamId); gauge.addValue(latencyMs); + + if (this.ewmaEnable) { + metricName = METRIC_NAME_EXECUTE_JITTER + "-" + key; + EwmaGauge ewmaGauge = this.getExponentialWeightedMovingAverageGauge(metricName, sourceStreamId); + ewmaGauge.addValue(latencyMs); + } } private RateCounter getRateCounter(String metricName, String streamId) { @@ -135,18 +174,54 @@ private RateCounter getRateCounter(String metricName, String streamId) { } private RollingAverageGauge getRollingAverageGauge(String metricName, String streamId) { - RollingAverageGauge gauge = this.gauges.get(metricName); - if (gauge == null) { + return getOrCreateGauge(metricName, streamId, RollingAverageGauge.class, this.rollingAverageGaugeFactory); + } + + private EwmaGauge getExponentialWeightedMovingAverageGauge(String metricName, String streamId) { + return getOrCreateGauge(metricName, streamId, EwmaGauge.class, this.ewmaGaugeFactory); + } + + private > G getOrCreateGauge( + String metricName, + String streamId, + Class gaugeClass, + Supplier factory) { + + Object existing = this.gauges.get(metricName); + if (existing == null) { synchronized (this) { - gauge = this.gauges.get(metricName); - if (gauge == null) { - gauge = new RollingAverageGauge(); - metricRegistry.gauge(metricName, gauge, this.topologyId, this.componentId, - streamId, this.taskId, this.workerPort); - this.gauges.put(metricName, gauge); + existing = this.gauges.get(metricName); + if (existing == null) { + G created = factory.get(); + registerGauge(metricName, streamId, created); + this.gauges.put(metricName, created); + return created; } } } - return gauge; + + if (!gaugeClass.isInstance(existing)) { + throw new 
IllegalStateException( + "Metric '" + metricName + "' is registered as " + + existing.getClass().getName() + + " but expected " + gaugeClass.getName()); + } + + return gaugeClass.cast(existing); + } + + /* + * Safe cast: G is bounded by Gauge in the signature of getOrCreateGauge, + * so every instance of G is by definition a Gauge. + * The cast to raw Gauge is required because metricRegistry.gauge() does not + * accept Gauge the wildcard is not compatible with the type parameter T + * expected by the external API. Type-safety is guaranteed by the bound + * > declared at the call site. + */ + @SuppressWarnings({"unchecked", "rawtypes"}) + private void registerGauge(String metricName, String streamId, Gauge gauge) { + metricRegistry.gauge(metricName, (Gauge) gauge, this.topologyId, + this.componentId, streamId, this.taskId, this.workerPort); } + } diff --git a/storm-client/src/jvm/org/apache/storm/utils/ConfigUtils.java b/storm-client/src/jvm/org/apache/storm/utils/ConfigUtils.java index 9357cc562cf..f326f6ff4e6 100644 --- a/storm-client/src/jvm/org/apache/storm/utils/ConfigUtils.java +++ b/storm-client/src/jvm/org/apache/storm/utils/ConfigUtils.java @@ -39,6 +39,9 @@ public class ConfigUtils { public static final String FILE_SEPARATOR = File.separator; public static final String STORM_HOME = "storm.home"; public static final String RESOURCES_SUBDIR = "resources"; + public static final double RFC1889_ALPHA = 1.0 / 16.0; + public static final String UPSTREAM_FEEDBACK_STREAM_ID = "__feedback"; + public static final double UPSTREAM_FEEDBACK_RATIO = 0.1; private static final Set passwordConfigKeys = new HashSet<>(); @@ -175,6 +178,66 @@ public static int samplingRate(Map conf) { throw new IllegalArgumentException("Illegal topology.stats.sample.rate in conf: " + rate); } + public static double ewmaSmoothingFactor(Map conf) { + Object value = conf.get(Config.TOPOLOGY_STATS_EWMA_SMOOTHING_FACTOR); + if (value == null) { + return RFC1889_ALPHA; + } + double alpha = 
ObjectReader.getDouble(value); + if (alpha > 0.0 && alpha < 1.0) { + return alpha; + } + throw new IllegalArgumentException( + "Illegal " + Config.TOPOLOGY_STATS_EWMA_SMOOTHING_FACTOR + + " in conf: " + alpha + " must be in (0, 1)"); + } + + public static boolean ewmaEnable(Map conf) { + Object value = conf.get(Config.TOPOLOGY_STATS_EWMA_ENABLE); + if (value == null) { + return false; + } + return ObjectReader.getBoolean(value, false); + } + + public static boolean upstreamFeedbackEnable(Map conf) { + Object value = conf.get(Config.TOPOLOGY_UPSTREAM_FEEDBACK_ENABLE); + if (value == null) { + return false; + } + return ObjectReader.getBoolean(value, false); + } + + public static boolean backpressurePredictionEnable(Map conf) { + Object value = conf.get(Config.TOPOLOGY_BACKPRESSURE_PREDICTION_ENABLE); + if (value == null) { + return false; + } + return ObjectReader.getBoolean(value, false); + } + + public static String upstreamFeedbackStreamId(Map conf) { + Object value = conf.get(Config.TOPOLOGY_UPSTREAM_FEEDBACK_STREAM_ID); + if (value == null) { + return UPSTREAM_FEEDBACK_STREAM_ID; + } + return ObjectReader.getString(value); + } + + public static double upstreamFeedbackRatio(Map conf) { + Object value = conf.get(Config.TOPOLOGY_UPSTREAM_FEEDBACK_RATIO); + if (value == null) { + return UPSTREAM_FEEDBACK_RATIO; + } + double ratio = ObjectReader.getDouble(value); + if (ratio > 0.0 && ratio < 1.0) { + return ratio; + } + throw new IllegalArgumentException( + "Illegal " + Config.TOPOLOGY_UPSTREAM_FEEDBACK_RATIO + + " in conf: " + ratio + " must be in (0, 1)"); + } + public static BooleanSupplier mkStatsSampler(Map conf) { return evenSampler(samplingRate(conf)); } diff --git a/storm-client/src/jvm/org/apache/storm/utils/JCQueue.java b/storm-client/src/jvm/org/apache/storm/utils/JCQueue.java index 6aa668f566d..8d5da0629d6 100644 --- a/storm-client/src/jvm/org/apache/storm/utils/JCQueue.java +++ b/storm-client/src/jvm/org/apache/storm/utils/JCQueue.java @@ -20,18 +20,23 
@@ import java.io.Closeable; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import org.apache.storm.executor.Executor; import org.apache.storm.metrics2.StormMetricRegistry; import org.apache.storm.policy.IWaitStrategy; import org.apache.storm.shade.org.jctools.queues.MessagePassingQueue; import org.apache.storm.shade.org.jctools.queues.MpscArrayQueue; import org.apache.storm.shade.org.jctools.queues.MpscUnboundedArrayQueue; +import org.apache.storm.tuple.AddressedTuple; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @SuppressWarnings("checkstyle:AbbreviationAsWordInName") public class JCQueue implements Closeable { private static final Logger LOG = LoggerFactory.getLogger(JCQueue.class); + private static final int REORDER_BUFFER_SIZE = 4; + private final ExitCondition continueRunning = () -> true; private final List jcqMetrics = new ArrayList<>(); private final MpscArrayQueue recvQueue; @@ -43,6 +48,11 @@ public class JCQueue implements Closeable { private final ThreadLocal thdLocalBatcher = new ThreadLocal(); // ensure 1 instance per producer thd. private final IWaitStrategy backPressureWaitStrategy; private final String queueName; + private final AddressedTuple[] reorderingBuffer = new AddressedTuple[REORDER_BUFFER_SIZE]; + + // The TaskJitterComparator is not a mandatory field. It is required for the power of two adaptive task selection. 
+ private TaskJitterComparator taskJitterComparator; + private boolean enbalePredictiveBackpressure = taskJitterComparator != null; public JCQueue(String queueName, String metricNamePrefix, int size, int overflowLimit, int producerBatchSz, IWaitStrategy backPressureWaitStrategy, String topologyId, String componentId, List taskIds, @@ -66,6 +76,16 @@ public String getQueueName() { return queueName; } + public void enablePredictiveBackpressure(Executor executor) { + this.taskJitterComparator = new TaskJitterComparator(executor); + this.enbalePredictiveBackpressure = true; + } + + public void disablePredictiveBackpressure(){ + this.enbalePredictiveBackpressure = false; + this.taskJitterComparator = null; // gc + } + @Override public void close() { for (JCQueueMetrics jcQueueMetric : jcqMetrics) { @@ -106,12 +126,18 @@ public double getQueueLoad() { private int consumeImpl(Consumer consumer, ExitCondition exitCond) throws InterruptedException { int drainCount = 0; while (exitCond.keepRunning()) { - Object tuple = recvQueue.poll(); - if (tuple == null) { - break; + if (consumer instanceof Executor && this.taskJitterComparator != null) { + int drained = drainExecutorPriorityBatch(consumer); + if (drained == 0) break; + drainCount += drained; + } else { + Object tuple = recvQueue.poll(); + if (tuple == null) { + break; + } + consumer.accept(tuple); + ++drainCount; } - consumer.accept(tuple); - ++drainCount; } int overflowDrainCount = 0; @@ -128,6 +154,31 @@ private int consumeImpl(Consumer consumer, ExitCondition exitCond) throws Interr return total; } + private int drainExecutorPriorityBatch(Consumer consumer) { + int count = 0; + + // for higher value of reorderingBuffer.length better a heap (with a batch of 8 objects the array is the best implementation) + for (int i = 0; i < reorderingBuffer.length; i++) { + Object tuple = recvQueue.poll(); + if (tuple == null) break; + reorderingBuffer[count++] = (AddressedTuple) tuple; + } + + if (count == 0) return 0; + + if (count 
> 1) { + // Dual-Pivot Quicksort or Insertion Sort + Arrays.sort(reorderingBuffer, 0, count, this.taskJitterComparator); + } + + for (int i = 0; i < count; i++) { + consumer.accept(reorderingBuffer[i]); + reorderingBuffer[i] = null; // gc + } + + return count; + } + // Non Blocking. returns true/false indicating success/failure. Fails if full. private boolean tryPublishInternal(Object obj) { if (recvQueue.offer(obj)) { diff --git a/storm-client/src/jvm/org/apache/storm/utils/ObjectReader.java b/storm-client/src/jvm/org/apache/storm/utils/ObjectReader.java index 54445b5eb49..ac28be8cf30 100644 --- a/storm-client/src/jvm/org/apache/storm/utils/ObjectReader.java +++ b/storm-client/src/jvm/org/apache/storm/utils/ObjectReader.java @@ -128,6 +128,8 @@ public static Double getDouble(Object o, Double defaultValue) { } if (o instanceof Number) { return ((Number) o).doubleValue(); + } else if (o instanceof String) { + return Double.parseDouble((String) o); } else { throw new IllegalArgumentException("Don't know how to convert (" + o + ") to double"); } diff --git a/storm-client/src/jvm/org/apache/storm/utils/TaskJitterComparator.java b/storm-client/src/jvm/org/apache/storm/utils/TaskJitterComparator.java new file mode 100644 index 00000000000..cf30247d885 --- /dev/null +++ b/storm-client/src/jvm/org/apache/storm/utils/TaskJitterComparator.java @@ -0,0 +1,44 @@ +package org.apache.storm.utils; + +import static org.apache.storm.metrics2.TaskMetrics.METRIC_NAME_EXECUTE_JITTER; +import static org.apache.storm.metrics2.TaskMetrics.METRIC_NAME_PROCESS_JITTER; + +import java.util.Comparator; +import java.util.Map; +import org.apache.storm.executor.Executor; +import org.apache.storm.tuple.AddressedTuple; + +public class TaskJitterComparator implements Comparator { + private final Executor executor; + + public TaskJitterComparator(Executor executor) { + this.executor = executor; + } + @Override + public int compare(AddressedTuple t1, AddressedTuple t2) { + int taskId1 = t1.getDest(); 
+ int taskId2 = t2.getDest(); + + if (taskId1 == taskId2) { + return 0; + } + + Map task1Stats = executor.getChildEwmaAvgStats(taskId1); + Map task2Stats = executor.getChildEwmaAvgStats(taskId2); + + double processJitter1 = task1Stats.getOrDefault(METRIC_NAME_PROCESS_JITTER, Double.MAX_VALUE); + double processJitter2 = task2Stats.getOrDefault(METRIC_NAME_PROCESS_JITTER, Double.MAX_VALUE); + + // compare process jitter + if (processJitter1 < processJitter2) { + return -1; + } else if (processJitter1 > processJitter2) { + return 1; + } + + // fallback on execution jitter (it means that the network jitter is not affecting too much the whole process) + double execJitter1 = task1Stats.getOrDefault(METRIC_NAME_EXECUTE_JITTER, Double.MAX_VALUE); + double execJitter2 = task2Stats.getOrDefault(METRIC_NAME_EXECUTE_JITTER, Double.MAX_VALUE); + return Double.compare(execJitter1, execJitter2); + } +} \ No newline at end of file diff --git a/storm-client/src/jvm/org/apache/storm/validation/ConfigValidation.java b/storm-client/src/jvm/org/apache/storm/validation/ConfigValidation.java index 4175ee9f4a7..8b3fb830ddf 100644 --- a/storm-client/src/jvm/org/apache/storm/validation/ConfigValidation.java +++ b/storm-client/src/jvm/org/apache/storm/validation/ConfigValidation.java @@ -30,6 +30,7 @@ import java.util.stream.Collectors; import org.apache.storm.Config; +import org.apache.storm.utils.ObjectReader; import org.apache.storm.utils.Utils; import org.apache.storm.validation.ConfigValidationAnnotations.ValidatorParams; import org.slf4j.Logger; @@ -849,6 +850,23 @@ public void validateField(String name, Object o) { } } + public static class ZeroOneOpenIntervalValidator extends Validator { + @Override + public void validateField(String name, Object o) { + if (o == null) { + return; + } + // ObjectReader.getDouble(o) handles the type conversion and will throw an + // IllegalArgumentException if the value cannot be parsed as a number. 
+ double alpha = ObjectReader.getDouble(o); + if (alpha > 0.0 && alpha < 1.0) { + return; + } + throw new IllegalArgumentException( + "Field " + name + " must be a number in the open interval (0, 1), got: " + o); + } + } + public static class CustomIsExactlyOneOfValidators extends Validator { private Class[] subValidators; private List validatorClassNames; diff --git a/storm-client/test/jvm/org/apache/storm/TestConfigValidate.java b/storm-client/test/jvm/org/apache/storm/TestConfigValidate.java index 3c335308da5..31b9c483446 100644 --- a/storm-client/test/jvm/org/apache/storm/TestConfigValidate.java +++ b/storm-client/test/jvm/org/apache/storm/TestConfigValidate.java @@ -206,6 +206,34 @@ public void testTopologyStatsSampleRateIsFloat() { ConfigValidation.validateFields(conf); } + @Test + public void testTopologyStatsEwmaEnableIsBoolean() { + Map conf = new HashMap<>(); + // optional configuration + ConfigValidation.validateFields(conf); + conf.put(Config.TOPOLOGY_STATS_EWMA_ENABLE, true); + ConfigValidation.validateFields(conf); + conf.put(Config.TOPOLOGY_STATS_EWMA_ENABLE, false); + ConfigValidation.validateFields(conf); + } + + @Test + public void testTopologyStatsEwmaSmoothingFactorCustomValidator() { + Map conf = new HashMap<>(); + // optional configuration + ConfigValidation.validateFields(conf); + conf.put(Config.TOPOLOGY_STATS_EWMA_SMOOTHING_FACTOR, 0.1); + ConfigValidation.validateFields(conf); + conf.put(Config.TOPOLOGY_STATS_EWMA_SMOOTHING_FACTOR, "0.1"); + ConfigValidation.validateFields(conf); + conf.put(Config.TOPOLOGY_STATS_EWMA_SMOOTHING_FACTOR, 0.9); + ConfigValidation.validateFields(conf); + for (Object notAllowedValue : new Object[]{0.0, -0.1, 1.9, "1.9"}) { + conf.put(Config.TOPOLOGY_STATS_EWMA_SMOOTHING_FACTOR, notAllowedValue); + assertThrows(IllegalArgumentException.class, () -> ConfigValidation.validateFields(conf)); + } + } + @Test public void testWorkerChildoptsIsStringOrStringList() { Map conf = new HashMap<>(); diff --git 
a/storm-client/test/jvm/org/apache/storm/metrics2/EwmaGaugeTest.java b/storm-client/test/jvm/org/apache/storm/metrics2/EwmaGaugeTest.java new file mode 100644 index 00000000000..ed1b5009ceb --- /dev/null +++ b/storm-client/test/jvm/org/apache/storm/metrics2/EwmaGaugeTest.java @@ -0,0 +1,347 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version + * 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ */ + +package org.apache.storm.metrics2; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class EwmaGaugeTest { + + private static final double DELTA = 1e-9; + + @Nested + @DisplayName("Construction") + class ConstructionTest { + + @Test + @DisplayName("Default constructor uses RFC 1889 alpha (1/16)") + void defaultAlpha() { + EwmaGauge gauge = new EwmaGauge(); + gauge.addValue(0L); + gauge.addValue(16L); // D = 16 ; J = 0 + (16 - 0) * (1/16) = 1.0 + assertEquals(1.0, gauge.getValue(), DELTA); + } + + @Test + @DisplayName("Invalid alpha values throw IllegalArgumentException") + void invalidAlphaThrows() { + double[] invalidAlphas = { + 0.0, 1.0, -0.1, 1.1, + Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY + }; + for (double alpha : invalidAlphas) { + assertThrows(IllegalArgumentException.class, + () -> new EwmaGauge(alpha), + "Expected IllegalArgumentException for alpha=" + alpha); + } + } + + @Test + @DisplayName("Valid alpha boundary values are accepted") + void validAlphaAccepted() { + double[] validAlphas = {0.001, 0.0625, 0.5, 0.999}; + for (double alpha : validAlphas) { + assertNotNull(new EwmaGauge(alpha), + "Expected no exception for alpha=" + alpha); + } + } + } + + + @Nested + @DisplayName("Cold-start semantics") + class ColdStartTest { + + private EwmaGauge gauge; + + @BeforeEach + void setUp() { + gauge = new EwmaGauge(); + } + + @Test + @DisplayName("getValue() returns 0.0 
before any sample") + void noSamples() { + assertEquals(0.0, gauge.getValue(), DELTA); + } + + @Test + @DisplayName("getValue() returns 0.0 after exactly one sample (seed only)") + void oneSample() { + gauge.addValue(100L); + assertEquals(0.0, gauge.getValue(), DELTA); + } + + } + + @Nested + @DisplayName("EWMA formula RFC 1889 §A.8") + class FormulaTest { + + @Test + @DisplayName("Single update: J = 0 + (D - 0) * alpha") + void singleDeviation() { + EwmaGauge gauge = new EwmaGauge(0.5); + gauge.addValue(0L); + gauge.addValue(10L); + assertEquals(5.0, gauge.getValue(), DELTA); + } + + @Test + @DisplayName("Manual step-by-step verification against reference values") + void manualSteps() { + EwmaGauge gauge = new EwmaGauge(0.5); + + gauge.addValue(0L); // seed + + // Step 1: transit=10, prev=0, D=10, J = 0 + (10-0) * 0.5 = 5.0 + gauge.addValue(10L); + assertEquals(5.0, gauge.getValue(), DELTA, "Step 1"); + + // Step 2: transit=0, prev=10, D=10, J = 5.0 + (10-5.0) * 0.5 = 7.5 + gauge.addValue(0L); + assertEquals(7.5, gauge.getValue(), DELTA, "Step 2"); + + // Step 3: transit=10, prev=0, D=10, J = 7.5 + (10-7.5) * 0.5 = 8.75 + gauge.addValue(10L); + assertEquals(8.75, gauge.getValue(), DELTA, "Step 3"); + } + + @Test + @DisplayName("Zero deviation decays jitter toward zero") + void zeroDeviationDecays() { + EwmaGauge gauge = new EwmaGauge(0.5); + gauge.addValue(0L); // 0 + gauge.addValue(10L); // 0 + 5*alpha = 2.5 + double afterFirst = gauge.getValue(); + assertEquals(afterFirst, gauge.getValue(), DELTA); + + gauge.addValue(10L); // 2.5 - 2.5*alpha = 2.5 - 1.25 = 1.25 + assertEquals(afterFirst * 0.5, gauge.getValue(), DELTA); + } + + } + + + @Nested + @DisplayName("Negative value guard") + class NegativeValueTest { + + @Test + @DisplayName("Negative transit values are silently ignored before seed") + void negativeIgnoredBeforeSeed() { + EwmaGauge gauge = new EwmaGauge(); + gauge.addValue(-1L); + gauge.addValue(-100L); + assertEquals(0.0, gauge.getValue(), DELTA); + } + 
        @Test
        @DisplayName("Negative value after seed does not corrupt lastTransit")
        void negativeAfterSeedIgnored() {
            EwmaGauge gauge = new EwmaGauge(0.5);
            gauge.addValue(10L);  // seeds lastTransit with 10
            gauge.addValue(-5L);  // negative sample; expected value below implies it is not recorded as a transit
            gauge.addValue(20L);  // deviation |20 - 10| = 10, EWMA = 0.5 * 10 = 5
            assertEquals(5.0, gauge.getValue(), DELTA);
        }
    }


    @Nested
    @DisplayName("getValue() preserves EWMA across calls")
    class GetValueIdempotentTest {

        @Test
        @DisplayName("Repeated getValue() without new samples returns same estimate")
        void repeatedGetValueStable() {
            EwmaGauge gauge = new EwmaGauge(0.5);
            gauge.addValue(0L);
            gauge.addValue(10L);
            double first = gauge.getValue();

            // Reading the gauge must not reset or decay the estimate.
            assertEquals(first, gauge.getValue(), DELTA, "Second call");
            assertEquals(first, gauge.getValue(), DELTA, "Third call");
        }

        @Test
        @DisplayName("EWMA accumulates correctly across multiple reporting windows")
        void acrossReportingWindows() {
            EwmaGauge gauge = new EwmaGauge(0.5);
            gauge.addValue(0L);  // seed

            // Each window folds |sample - last| into the EWMA with factor 0.5:
            // 0.5*10 = 5.0, then 5.0/2 + 10/2 = 7.5, then 7.5/2 + 10/2 = 8.75.
            gauge.addValue(10L);
            assertEquals(5.0, gauge.getValue(), DELTA, "Window 1");

            gauge.addValue(0L);
            assertEquals(7.5, gauge.getValue(), DELTA, "Window 2");

            gauge.addValue(10L);
            assertEquals(8.75, gauge.getValue(), DELTA, "Window 3");
        }
    }

    @Nested
    @DisplayName("thread safe")
    class ConcurrencyTest {

        @Test
        @DisplayName("Concurrent addValue() calls do not corrupt state")
        void concurrentAddValue() throws InterruptedException {
            EwmaGauge gauge = new EwmaGauge();
            int threads = 8;
            int samplesPerThread = 10_000;
            // ready/start latches make all workers begin at the same instant
            // to maximize contention on the gauge's internal state.
            CountDownLatch ready = new CountDownLatch(threads);
            CountDownLatch start = new CountDownLatch(1);
            ExecutorService pool = Executors.newFixedThreadPool(threads);

            for (int t = 0; t < threads; t++) {
                final long base = t * 10L;
                pool.submit(() -> {
                    ready.countDown();
                    try {
                        start.await();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                    for (int i = 0; i < samplesPerThread; i++) {
                        gauge.addValue(base + (i % 10));
                    }
                });
            }

            ready.await();
            start.countDown();
            pool.shutdown();
            assertTrue(pool.awaitTermination(10, TimeUnit.SECONDS),
                "Executor did not terminate — possible deadlock");

            // Cannot assert an exact value under interleaving; only sanity-check it.
            double value = gauge.getValue();
            assertTrue(value >= 0.0, "Jitter must be non-negative, got: " + value);
            assertTrue(Double.isFinite(value), "Jitter must be finite, got: " + value);
        }

        @Test
        @DisplayName("Concurrent getValue() and addValue() do not deadlock")
        void concurrentGetAndAdd() throws Exception {
            EwmaGauge gauge = new EwmaGauge();
            ExecutorService pool = Executors.newFixedThreadPool(2);
            CountDownLatch done = new CountDownLatch(2);

            Future writer = pool.submit(() -> {
                for (int i = 0; i < 50_000; i++) {
                    gauge.addValue(i % 100);
                }
                done.countDown();
            });

            Future reader = pool.submit(() -> {
                for (int i = 0; i < 1_000; i++) {
                    double v = gauge.getValue();
                    assertTrue(v >= 0.0 && Double.isFinite(v),
                        "getValue() returned invalid result: " + v);
                }
                done.countDown();
            });

            assertTrue(done.await(10, TimeUnit.SECONDS),
                "Test did not complete within timeout possible deadlock");

            // get() re-throws any assertion error raised inside the reader task.
            writer.get();
            reader.get();
            pool.shutdown();
        }

        @Test
        @DisplayName("Only one thread seeds lastTransit all same value gives zero jitter")
        void seedRace() throws InterruptedException {
            EwmaGauge gauge = new EwmaGauge();
            int threads = 16;
            CountDownLatch ready = new CountDownLatch(threads);
            CountDownLatch start = new CountDownLatch(1);
            ExecutorService pool = Executors.newFixedThreadPool(threads);

            // Every thread submits the identical value; regardless of which thread
            // wins the seed race, all deviations are |42 - 42| = 0.
            for (int t = 0; t < threads; t++) {
                pool.submit(() -> {
                    ready.countDown();
                    try {
                        start.await();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                    gauge.addValue(42L);
                });
            }

            ready.await();
            start.countDown();
            pool.shutdown();
            assertTrue(pool.awaitTermination(5, TimeUnit.SECONDS),
                "Executor did not terminate possible deadlock");

            assertEquals(0.0, gauge.getValue(), DELTA);
        }
    }

    @Nested
    @DisplayName("Edge cases")
    class EdgeCaseTest {

        @Test
        @DisplayName("Long.MAX_VALUE transit does not overflow deviation")
        void maxLongTransit() {
            EwmaGauge gauge = new EwmaGauge(0.5);
            gauge.addValue(0L);
            gauge.addValue(Long.MAX_VALUE);  // deviation of Long.MAX_VALUE must stay finite
            double value = gauge.getValue();
            assertTrue(value > 0.0, "Jitter should be positive");
            assertTrue(Double.isFinite(value), "Jitter should be finite");
        }

        @Test
        @DisplayName("Zero transit time is valid and produces zero deviation")
        void zeroTransit() {
            EwmaGauge gauge = new EwmaGauge(0.5);
            gauge.addValue(0L);
            gauge.addValue(0L);
            assertEquals(0.0, gauge.getValue(), DELTA);
        }

        @Test
        @DisplayName("Large number of samples does not overflow LongAdder")
        void manySamples() {
            EwmaGauge gauge = new EwmaGauge();
            gauge.addValue(0L);
            // Alternating 0/10 samples: every deviation is exactly 10,
            // so the EWMA is bounded by (0, 10].
            for (int i = 1; i <= 100_000; i++) {
                gauge.addValue(i % 2 == 0 ? 0L : 10L);
            }
            double value = gauge.getValue();
            assertTrue(value > 0.0, "Jitter should be positive after many samples");
            assertTrue(value <= 10.0, "Jitter cannot exceed max deviation of 10");
            assertTrue(Double.isFinite(value), "Jitter must be finite");
        }
    }
}
diff --git a/storm-client/test/jvm/org/apache/storm/metrics2/TaskMetricsTest.java b/storm-client/test/jvm/org/apache/storm/metrics2/TaskMetricsTest.java
new file mode 100644
index 00000000000..c812994280c
--- /dev/null
+++ b/storm-client/test/jvm/org/apache/storm/metrics2/TaskMetricsTest.java
@@ -0,0 +1,485 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */

package org.apache.storm.metrics2;

import com.codahale.metrics.Gauge;
import org.apache.storm.task.WorkerTopologyContext;
import org.apache.storm.utils.ConfigUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.MockedStatic;
import org.mockito.junit.jupiter.MockitoExtension;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.ArgumentMatchers.contains;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.atLeastOnce;
import static org.mockito.Mockito.mockStatic;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

/**
 * Unit tests for {@link TaskMetrics}.
 *
 * <p>The {@link StormMetricRegistry} and {@link RateCounter} are mocked so each test verifies
 * only the metric names, tags, and registration counts that {@code TaskMetrics} produces —
 * no real metrics backend is involved. {@link ConfigUtils} statics are mocked per-test via
 * {@link MockedStatic} so the sampling rate and EWMA settings are deterministic.
 */
@ExtendWith(MockitoExtension.class)
class TaskMetricsTest {

    private static final String TOPOLOGY_ID = "test-topology-1";
    private static final String COMPONENT_ID = "test-bolt";
    private static final Integer TASK_ID = 42;
    private static final Integer WORKER_PORT = 6700;
    private static final String STREAM_ID = "default";
    private static final String SOURCE_COMP = "source-spout";
    private static final int SAMPLING_RATE = 1;
    private static final double EWMA_FACTOR = 0.3;

    @Mock private WorkerTopologyContext context;
    @Mock private StormMetricRegistry metricRegistry;
    @Mock private RateCounter rateCounter;

    // Was a raw Map; topology configs are Map<String, Object> throughout Storm.
    private Map<String, Object> topoConf;

    /**
     * Builds a {@link TaskMetrics} with {@link ConfigUtils} statics mocked for the duration of
     * construction only (the MockedStatic is closed before returning, so tests run with real
     * statics afterwards).
     *
     * @param ewmaEnabled value returned by the mocked {@code ConfigUtils.ewmaEnable}
     * @return a TaskMetrics wired to the mocked registry and context
     */
    private TaskMetrics buildTaskMetrics(boolean ewmaEnabled) {
        // Was a raw MockedStatic; parameterize to avoid unchecked warnings.
        try (MockedStatic<ConfigUtils> cfgUtils = mockStatic(ConfigUtils.class)) {
            cfgUtils.when(() -> ConfigUtils.samplingRate(topoConf)).thenReturn(SAMPLING_RATE);
            cfgUtils.when(() -> ConfigUtils.ewmaSmoothingFactor(topoConf)).thenReturn(EWMA_FACTOR);
            cfgUtils.when(() -> ConfigUtils.ewmaEnable(topoConf)).thenReturn(ewmaEnabled);

            return new TaskMetrics(context, COMPONENT_ID, TASK_ID, metricRegistry, topoConf);
        }
    }

    /** Stubs gauge registration on the mocked registry; extracted from each test to avoid repetition. */
    private void stubGaugeRegistration() {
        when(metricRegistry.gauge(anyString(), any(Gauge.class), anyString(),
            anyString(), anyString(), anyInt(), anyInt())).thenReturn(null);
    }

    @BeforeEach
    void setUp() {
        when(context.getStormId()).thenReturn(TOPOLOGY_ID);
        when(context.getThisWorkerPort()).thenReturn(WORKER_PORT);

        topoConf = new HashMap<>();

        when(metricRegistry.rateCounter(anyString(), anyString(), anyString(),
            anyInt(), anyInt(), anyString())).thenReturn(rateCounter);
    }

    @Test
    void spoutAckedTuple_incrementsAckCounter() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.spoutAckedTuple(STREAM_ID, 100L);

        verify(rateCounter).inc(SAMPLING_RATE);
    }

    @Test
    void spoutAckedTuple_registersCompleteLatencyGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.spoutAckedTuple(STREAM_ID, 200L);

        verify(metricRegistry, atLeastOnce()).gauge(
            contains("__complete-latency"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void spoutAckedTuple_withEwmaEnabled_registersJitterGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(true);

        tm.spoutAckedTuple(STREAM_ID, 150L);

        verify(metricRegistry, atLeastOnce()).gauge(
            contains("__complete-jitter"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void spoutAckedTuple_withEwmaDisabled_doesNotRegisterJitterGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.spoutAckedTuple(STREAM_ID, 150L);

        verify(metricRegistry, never()).gauge(
            contains("__complete-rfc1889a-jitter"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void boltAckedTuple_incrementsAckCounter() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltAckedTuple(SOURCE_COMP, STREAM_ID, 50L);

        verify(rateCounter).inc(SAMPLING_RATE);
    }

    @Test
    void boltAckedTuple_registersProcessLatencyGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltAckedTuple(SOURCE_COMP, STREAM_ID, 50L);

        verify(metricRegistry, atLeastOnce()).gauge(
            contains("__process-latency"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void boltAckedTuple_withEwmaEnabled_registersJitterGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(true);

        tm.boltAckedTuple(SOURCE_COMP, STREAM_ID, 75L);

        verify(metricRegistry, atLeastOnce()).gauge(
            contains("__process-jitter"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void boltAckedTuple_metricKeyIncludesSourceComponentAndStream() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltAckedTuple(SOURCE_COMP, STREAM_ID, 50L);

        // Bolt-side metric keys embed "sourceComponent:streamId".
        verify(metricRegistry).rateCounter(
            contains(SOURCE_COMP + ":" + STREAM_ID),
            anyString(), anyString(), anyInt(), anyInt(), anyString());
    }

    @Test
    void spoutFailedTuple_incrementsFailCounter() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.spoutFailedTuple(STREAM_ID);

        verify(rateCounter).inc(SAMPLING_RATE);
    }

    @Test
    void spoutFailedTuple_usesCorrectMetricName() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.spoutFailedTuple(STREAM_ID);

        verify(metricRegistry).rateCounter(
            eq("__fail-count-" + STREAM_ID),
            eq(TOPOLOGY_ID), eq(COMPONENT_ID), eq(TASK_ID), eq(WORKER_PORT), eq(STREAM_ID));
    }

    @Test
    void boltFailedTuple_incrementsFailCounter() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltFailedTuple(SOURCE_COMP, STREAM_ID);

        verify(rateCounter).inc(SAMPLING_RATE);
    }

    @Test
    void boltFailedTuple_metricKeyIncludesSourceComponentAndStream() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltFailedTuple(SOURCE_COMP, STREAM_ID);

        verify(metricRegistry).rateCounter(
            eq("__fail-count-" + SOURCE_COMP + ":" + STREAM_ID),
            anyString(), anyString(), anyInt(), anyInt(), anyString());
    }

    @Test
    void emittedTuple_incrementsEmitCounter() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.emittedTuple(STREAM_ID);

        verify(rateCounter).inc(SAMPLING_RATE);
    }

    @Test
    void emittedTuple_usesCorrectMetricName() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.emittedTuple(STREAM_ID);

        verify(metricRegistry).rateCounter(
            eq("__emit-count-" + STREAM_ID),
            eq(TOPOLOGY_ID), eq(COMPONENT_ID), eq(TASK_ID), eq(WORKER_PORT), eq(STREAM_ID));
    }

    @Test
    void transferredTuples_incrementsByAmountTimesSamplingRate() {
        TaskMetrics tm = buildTaskMetrics(false);
        int amount = 5;

        tm.transferredTuples(STREAM_ID, amount);

        verify(rateCounter).inc(amount * SAMPLING_RATE);
    }

    @Test
    void transferredTuples_usesCorrectMetricName() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.transferredTuples(STREAM_ID, 3);

        verify(metricRegistry).rateCounter(
            eq("__transfer-count-" + STREAM_ID),
            anyString(), anyString(), anyInt(), anyInt(), anyString());
    }

    @Test
    void boltExecuteTuple_incrementsExecuteCounter() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltExecuteTuple(SOURCE_COMP, STREAM_ID, 30L);

        verify(rateCounter).inc(SAMPLING_RATE);
    }

    @Test
    void boltExecuteTuple_registersExecuteLatencyGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltExecuteTuple(SOURCE_COMP, STREAM_ID, 30L);

        verify(metricRegistry, atLeastOnce()).gauge(
            contains("__execute-latency"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void boltExecuteTuple_withEwmaEnabled_registersJitterGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(true);

        tm.boltExecuteTuple(SOURCE_COMP, STREAM_ID, 30L);

        verify(metricRegistry, atLeastOnce()).gauge(
            contains("__execute-jitter"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void boltExecuteTuple_withEwmaDisabled_doesNotRegisterJitterGauge() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.boltExecuteTuple(SOURCE_COMP, STREAM_ID, 30L);

        verify(metricRegistry, never()).gauge(
            contains("__execute-rfc1889a-jitter"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void differentStreams_produceSeparateRateCounters() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.emittedTuple("stream-A");
        tm.emittedTuple("stream-B");

        verify(metricRegistry).rateCounter(
            eq("__emit-count-stream-A"),
            anyString(), anyString(), anyInt(), anyInt(), eq("stream-A"));
        verify(metricRegistry).rateCounter(
            eq("__emit-count-stream-B"),
            anyString(), anyString(), anyInt(), anyInt(), eq("stream-B"));
    }

    @Test
    void rateCounter_registeredOnlyOnceForSameMetricName() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.emittedTuple(STREAM_ID);
        tm.emittedTuple(STREAM_ID);
        tm.emittedTuple(STREAM_ID);

        // Repeated events on the same stream must reuse the cached counter.
        verify(metricRegistry, times(1)).rateCounter(
            eq("__emit-count-" + STREAM_ID),
            anyString(), anyString(), anyInt(), anyInt(), anyString());
    }

    @Test
    void gauge_registeredOnlyOnceForSameMetricName() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.spoutAckedTuple(STREAM_ID, 10L);
        tm.spoutAckedTuple(STREAM_ID, 20L);
        tm.spoutAckedTuple(STREAM_ID, 30L);

        verify(metricRegistry, times(1)).gauge(
            contains("__complete-latency"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void concurrentEmittedTuple_registersRateCounterExactlyOnce() throws InterruptedException {
        TaskMetrics tm = buildTaskMetrics(false);
        int threadCount = 20;
        CountDownLatch ready = new CountDownLatch(threadCount);
        CountDownLatch start = new CountDownLatch(1);
        CountDownLatch done = new CountDownLatch(threadCount);

        ExecutorService pool = Executors.newFixedThreadPool(threadCount);
        for (int i = 0; i < threadCount; i++) {
            pool.submit(() -> {
                ready.countDown();
                try {
                    start.await();
                } catch (InterruptedException e) {
                    // Was silently swallowed; restore the interrupt flag per convention.
                    Thread.currentThread().interrupt();
                }
                tm.emittedTuple(STREAM_ID);
                done.countDown();
            });
        }

        ready.await();
        start.countDown();
        assertTrue(done.await(5, TimeUnit.SECONDS));
        pool.shutdown();

        verify(metricRegistry, times(1)).rateCounter(
            eq("__emit-count-" + STREAM_ID),
            anyString(), anyString(), anyInt(), anyInt(), anyString());
    }

    @Test
    void concurrentSpoutAckedTuple_registersGaugeExactlyOnce() throws InterruptedException {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);
        int threadCount = 20;
        CountDownLatch ready = new CountDownLatch(threadCount);
        CountDownLatch start = new CountDownLatch(1);
        CountDownLatch done = new CountDownLatch(threadCount);

        ExecutorService pool = Executors.newFixedThreadPool(threadCount);
        for (int i = 0; i < threadCount; i++) {
            pool.submit(() -> {
                ready.countDown();
                try {
                    start.await();
                } catch (InterruptedException e) {
                    // Was silently swallowed; restore the interrupt flag per convention.
                    Thread.currentThread().interrupt();
                }
                tm.spoutAckedTuple(STREAM_ID, 100L);
                done.countDown();
            });
        }

        ready.await();
        start.countDown();
        assertTrue(done.await(5, TimeUnit.SECONDS));
        pool.shutdown();

        verify(metricRegistry, times(1)).gauge(
            contains("__complete-latency"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void getOrCreateGauge_sameTypeReusedWithoutThrowing() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);

        tm.spoutAckedTuple(STREAM_ID, 10L);
        tm.spoutAckedTuple(STREAM_ID, 20L);

        verify(metricRegistry, times(1)).gauge(
            contains("__complete-latency"), any(Gauge.class),
            anyString(), anyString(), anyString(), anyInt(), anyInt());
    }

    @Test
    void boltAckedTuple_metricNameFormat() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);
        String expectedKey = SOURCE_COMP + ":" + STREAM_ID;

        tm.boltAckedTuple(SOURCE_COMP, STREAM_ID, 10L);

        verify(metricRegistry).rateCounter(
            eq("__ack-count-" + expectedKey),
            eq(TOPOLOGY_ID), eq(COMPONENT_ID), eq(TASK_ID), eq(WORKER_PORT), eq(STREAM_ID));
    }

    @Test
    void boltExecuteTuple_metricNameFormat() {
        stubGaugeRegistration();
        TaskMetrics tm = buildTaskMetrics(false);
        String expectedKey = SOURCE_COMP + ":" + STREAM_ID;

        tm.boltExecuteTuple(SOURCE_COMP, STREAM_ID, 10L);

        verify(metricRegistry).rateCounter(
            eq("__execute-count-" + expectedKey),
            eq(TOPOLOGY_ID), eq(COMPONENT_ID), eq(TASK_ID), eq(WORKER_PORT), eq(STREAM_ID));
    }

    @Test
    void boltFailedTuple_metricNameFormat() {
        TaskMetrics tm = buildTaskMetrics(false);
        String expectedKey = SOURCE_COMP + ":" + STREAM_ID;

        tm.boltFailedTuple(SOURCE_COMP, STREAM_ID);

        verify(metricRegistry).rateCounter(
            eq("__fail-count-" + expectedKey),
            eq(TOPOLOGY_ID), eq(COMPONENT_ID), eq(TASK_ID), eq(WORKER_PORT), eq(STREAM_ID));
    }

    @Test
    void contextFields_propagatedCorrectlyToRegistry() {
        TaskMetrics tm = buildTaskMetrics(false);

        tm.emittedTuple(STREAM_ID);

        // Was raw ArgumentCaptor; parameterize to avoid unchecked warnings.
        ArgumentCaptor<String> topoCaptor = ArgumentCaptor.forClass(String.class);
        ArgumentCaptor<String> compCaptor = ArgumentCaptor.forClass(String.class);
        ArgumentCaptor<Integer> taskCaptor = ArgumentCaptor.forClass(Integer.class);
        ArgumentCaptor<Integer> portCaptor = ArgumentCaptor.forClass(Integer.class);

        verify(metricRegistry).rateCounter(
            anyString(),
            topoCaptor.capture(), compCaptor.capture(),
            taskCaptor.capture(), portCaptor.capture(),
            anyString());

        assertEquals(TOPOLOGY_ID, topoCaptor.getValue());
        assertEquals(COMPONENT_ID, compCaptor.getValue());
        assertEquals(TASK_ID, taskCaptor.getValue());
        assertEquals(WORKER_PORT, portCaptor.getValue());
    }
}