Every API needs rate limiting. Without it, a single client can hammer your server into the ground — whether by accident or on purpose. Most developers reach for a library, but rate limiters are small enough to build yourself and understand completely.

Let’s implement four algorithms, compare their tradeoffs, and wire them into a real API.

Fixed window

The simplest approach. Count requests in a time window (say, 60 seconds) and reset when the window expires. This version anchors each client’s window to their first request rather than to the wall clock.

function fixedWindow({ windowMs = 60_000, max = 100 } = {}) {
  const counters = new Map();

  return function check(key) {
    const now = Date.now();
    const entry = counters.get(key);

    if (!entry || now >= entry.resetAt) {
      counters.set(key, { count: 1, resetAt: now + windowMs });
      return { allowed: true, remaining: max - 1, resetAt: now + windowMs };
    }

    if (entry.count >= max) {
      return { allowed: false, remaining: 0, resetAt: entry.resetAt };
    }

    entry.count++;
    return { allowed: true, remaining: max - entry.count, resetAt: entry.resetAt };
  };
}

The problem: a burst at the window boundary. If a client sends 100 requests at the end of one window and another 100 right after the reset, they’ve made 200 requests in a couple of seconds, yet each batch falls in a different window.
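
Here’s a quick sketch that reproduces the burst against the fixedWindow limiter above. Stubbing Date.now is purely for illustration; a real client just times its requests:

const limit = fixedWindow({ windowMs: 60_000, max: 100 });
const realNow = Date.now;
let t = 0;
Date.now = () => t;

limit('client-a');                               // t=0:00 opens the window
t = 59_000;
for (let i = 0; i < 99; i++) limit('client-a');  // fills the first window

t = 61_000;                                      // the window expired at t=1:00
let allowed = 0;
for (let i = 0; i < 100; i++) {
  if (limit('client-a').allowed) allowed++;      // fresh window: all pass
}

Date.now = realNow;
console.log(allowed); // 100, i.e. 199 requests accepted within 2 seconds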

Use when: you need something simple and the burst problem is acceptable.

Sliding window log

Track the timestamp of every request. Count how many fall within the window.

function slidingWindowLog({ windowMs = 60_000, max = 100 } = {}) {
  const logs = new Map();

  return function check(key) {
    const now = Date.now();
    const cutoff = now - windowMs;

    let entries = logs.get(key) || [];
    // Remove expired entries
    entries = entries.filter(ts => ts > cutoff);

    if (entries.length >= max) {
      logs.set(key, entries);
      return { allowed: false, remaining: 0, resetAt: entries[0] + windowMs };
    }

    entries.push(now);
    logs.set(key, entries);
    // A slot frees up when the oldest logged request ages out
    return { allowed: true, remaining: max - entries.length, resetAt: entries[0] + windowMs };
  };
}

The problem: memory. Storing every timestamp means O(max) memory per client. At 1000 requests/minute across 10,000 clients, that’s 10 million timestamps: roughly 80 MB for the numbers alone (8 bytes each), before any array overhead. The per-request filter() pass costs CPU as well.

Use when: you need precise counting and have a low request limit.

Sliding window counter

The practical middle ground. Interpolate between two fixed windows to approximate a sliding window — without storing individual timestamps.

function slidingWindowCounter({ windowMs = 60_000, max = 100 } = {}) {
  const windows = new Map();

  return function check(key) {
    const now = Date.now();
    const currentWindow = Math.floor(now / windowMs);
    const windowStart = currentWindow * windowMs;
    const elapsed = (now - windowStart) / windowMs; // 0.0 to 1.0

    const entry = windows.get(key) || { prev: 0, curr: 0, window: currentWindow };

    // Advance window if needed
    if (entry.window < currentWindow) {
      entry.prev = entry.window === currentWindow - 1 ? entry.curr : 0;
      entry.curr = 0;
      entry.window = currentWindow;
    }

    // Weighted count: full current + proportional previous
    const weight = 1 - elapsed;
    const count = Math.floor(entry.prev * weight) + entry.curr;

    if (count >= max) {
      windows.set(key, entry);
      return { allowed: false, remaining: 0, resetAt: windowStart + windowMs };
    }

    entry.curr++;
    windows.set(key, entry);
    const newCount = Math.floor(entry.prev * weight) + entry.curr;
    return { allowed: true, remaining: max - newCount, resetAt: windowStart + windowMs };
  };
}

This stores only two numbers per client regardless of request volume. The interpolation isn’t perfectly accurate, but it’s close enough for rate limiting — and it eliminates the boundary burst problem.
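
To make the interpolation concrete, here’s the arithmetic at one point in time (the numbers are illustrative):

// 15 seconds into a 60-second window, 75% of the previous window
// still overlaps the sliding 60-second lookback.
const prev = 80;                           // requests in the previous window
const curr = 10;                           // requests so far in this window
const elapsed = 15_000 / 60_000;           // 0.25
const weight = 1 - elapsed;                // 0.75
const count = Math.floor(prev * weight) + curr; // 60 + 10 = 70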

Use when: you want sliding window accuracy with fixed window memory cost. This is the default in many production rate limiters.

Token bucket

A different mental model: each client has a bucket of tokens. Requests consume tokens. Tokens refill at a steady rate.

function tokenBucket({ capacity = 100, refillRate = 10, refillMs = 1000 } = {}) {
  const buckets = new Map();

  return function check(key) {
    const now = Date.now();
    let bucket = buckets.get(key);

    if (!bucket) {
      bucket = { tokens: capacity - 1, lastRefill: now };
      buckets.set(key, bucket);
      return { allowed: true, remaining: bucket.tokens };
    }

    // Refill in whole intervals based on elapsed time
    const elapsed = now - bucket.lastRefill;
    const intervals = Math.floor(elapsed / refillMs);
    if (intervals > 0) {
      bucket.tokens = Math.min(capacity, bucket.tokens + intervals * refillRate);
      // Advance lastRefill by whole intervals so fractional time isn't lost
      bucket.lastRefill += intervals * refillMs;
    }

    if (bucket.tokens < 1) {
      // The next token arrives one interval after the last refill
      return { allowed: false, remaining: 0, resetAt: bucket.lastRefill + refillMs };
    }

    bucket.tokens--;
    return { allowed: true, remaining: bucket.tokens };
  };
}

Token bucket allows bursts up to capacity, then enforces a steady rate of refillRate per interval. This is useful when you want to allow short bursts while enforcing a long-term average.
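
A short usage sketch (the timing is illustrative; in a real process refills happen as wall-clock time passes):

const limit = tokenBucket({ capacity: 100, refillRate: 10, refillMs: 1000 });

// A client can burn through the full burst immediately...
for (let i = 0; i < 100; i++) limit('client-a');  // all allowed
console.log(limit('client-a').allowed);           // false: bucket empty

// ...then settles into a long-term average of 10 requests/second
setTimeout(() => {
  console.log(limit('client-a').remaining);       // 9: one second refilled 10 tokens
}, 1000);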

Use when: you want to allow controlled bursts. Good for user-facing APIs where occasional spikes are normal.

Standard HTTP headers

Whichever algorithm you choose, respond with the conventional rate limit headers. The X-RateLimit-* names are a de facto standard; an IETF draft defines unprefixed RateLimit-* equivalents, but the X- forms remain the most widely deployed.

function setRateLimitHeaders(res, result, max) {
  res.setHeader('X-RateLimit-Limit', String(max));
  res.setHeader('X-RateLimit-Remaining', String(result.remaining));
  if (result.resetAt) {
    res.setHeader('X-RateLimit-Reset', String(Math.ceil(result.resetAt / 1000)));
  }
  // Retry-After only applies to rejected requests and needs a reset time
  if (!result.allowed && result.resetAt) {
    res.setHeader('Retry-After', String(
      Math.ceil((result.resetAt - Date.now()) / 1000)
    ));
  }
}

These headers tell clients:

  • X-RateLimit-Limit — the maximum requests allowed
  • X-RateLimit-Remaining — requests left in the current window
  • X-RateLimit-Reset — Unix timestamp when the window resets
  • Retry-After — seconds until the client should retry (only on 429)
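
On the client side, these headers are what make polite retries possible. A minimal sketch of a hypothetical fetchWithRetry helper that retries once (fetch is global in Node 18+):

async function fetchWithRetry(url) {
  const res = await fetch(url);
  if (res.status !== 429) return res;

  // Honor Retry-After, falling back to one second if it's missing
  const retryAfter = Number(res.headers.get('Retry-After')) || 1;
  await new Promise(resolve => setTimeout(resolve, retryAfter * 1000));
  return fetch(url);
}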

Middleware integration

Any of these limiters drops into Express-style middleware; plain Node.js http works too, as shown after the usage example:

function rateLimitMiddleware({ algorithm, max, keyFn }) {
  return (req, res, next) => {
    const key = keyFn ? keyFn(req) : req.ip;
    const result = algorithm(key);

    setRateLimitHeaders(res, result, max);

    if (!result.allowed) {
      // Fall back to one second if the limiter reports no reset time
      const retryAfter = result.resetAt
        ? Math.ceil((result.resetAt - Date.now()) / 1000)
        : 1;
      res.writeHead(429, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ error: 'Too many requests', retryAfter }));
      return;
    }

    next();
  };
}

// Usage
const limiter = slidingWindowCounter({ windowMs: 60_000, max: 60 });
app.use(rateLimitMiddleware({
  algorithm: limiter,
  max: 60,
  keyFn: (req) => req.headers['x-api-key'] || req.ip,
}));
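
The same middleware works without a framework; you just supply next yourself. A minimal sketch with Node’s built-in http module (plain http has no req.ip, hence the explicit keyFn):

const http = require('http');

const limited = rateLimitMiddleware({
  algorithm: slidingWindowCounter({ windowMs: 60_000, max: 60 }),
  max: 60,
  keyFn: (req) => req.socket.remoteAddress,
});

http.createServer((req, res) => {
  limited(req, res, () => {
    res.writeHead(200, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify({ ok: true }));
  });
}).listen(3000);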

Memory cleanup

All in-memory implementations leak memory if you don’t clean up expired entries:

function withCleanup(store, windowMs) {
  const cleanup = setInterval(() => {
    const now = Date.now();
    for (const [key, entry] of store) {
      // Each store keeps a different shape, so derive the last activity
      // time accordingly: sliding log (array of timestamps), fixed window
      // ({ resetAt }), token bucket ({ lastRefill }), sliding counter
      // ({ window }, a window index).
      let lastActivity = 0;
      if (Array.isArray(entry)) {
        lastActivity = entry[entry.length - 1] ?? 0;
      } else if (entry.resetAt) {
        lastActivity = entry.resetAt;
      } else if (entry.lastRefill) {
        lastActivity = entry.lastRefill;
      } else if (typeof entry.window === 'number') {
        lastActivity = (entry.window + 1) * windowMs;
      }
      if (now - lastActivity > windowMs * 2) {
        store.delete(key);
      }
    }
  }, windowMs * 2);

  // Don't prevent process exit
  if (cleanup.unref) cleanup.unref();
}

Run cleanup at 2x the window interval. More frequent is wasteful; less frequent means stale entries pile up.
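
One wrinkle: the factories above keep their Maps private, so withCleanup can’t reach them. A hypothetical wiring, assuming you adjust a factory to accept an injected store (the store parameter does not exist in the implementations above):

// Hypothetical: fixedWindow modified to accept an external Map, e.g.
// function fixedWindow({ windowMs = 60_000, max = 100, store = new Map() } = {})
const store = new Map();
const limiter = fixedWindow({ windowMs: 60_000, max: 100, store });
withCleanup(store, 60_000);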

Comparison

Algorithm         Memory              Accuracy                Burst handling
Fixed window      O(1) per client     Low (boundary burst)    Poor
Sliding log       O(max) per client   Exact                   Good
Sliding counter   O(1) per client     ~99.7% accurate         Good
Token bucket      O(1) per client     N/A (different model)   Controlled bursts

For most APIs: sliding window counter is the right default. Low memory, good accuracy, no boundary problem.

For APIs that need burst tolerance: token bucket.

For quick prototypes: fixed window — just know its limitations.

When to use Redis instead

In-memory rate limiting works for single-process servers. When you scale to multiple processes or machines, you need a shared store. Redis is the standard choice: INCR with EXPIRE gives you a fixed window in two commands. One subtlety: the TTL must be set only when the key is first created. If EXPIRE runs on every request, each request pushes the window forward, and a throttled client that keeps retrying never gets unblocked. The NX flag (Redis 7+) handles this; on older versions, set the expiry only when INCR returns 1, typically via a short Lua script, as sketched below.

MULTI
INCR rate:${key}
EXPIRE rate:${key} 60 NX
EXEC
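
For pre-7 Redis, here’s a hedged sketch of the same check as a Lua script, shown with the ioredis client (an assumption; any client with an eval method works):

const Redis = require('ioredis');
const redis = new Redis();

// INCR and set the TTL atomically; EXPIRE fires only on the first
// request of the window, so later requests can't extend it.
const script = `
  local count = redis.call('INCR', KEYS[1])
  if count == 1 then
    redis.call('EXPIRE', KEYS[1], ARGV[1])
  end
  return count
`;

async function check(key, max = 100, windowSec = 60) {
  const count = await redis.eval(script, 1, `rate:${key}`, windowSec);
  return { allowed: count <= max, remaining: Math.max(0, max - count) };
}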

But don’t reach for Redis by default. Most applications run on a single server, and in-memory rate limiting is simpler, faster, and has zero operational overhead.