Gaussian Blur
Separable Gaussian blur
Quick Start
import { createGaussianBlur } from './webgpu-market/gaussian-blur/gaussian-blur';
const blur = createGaussianBlur(device, { format: 'rgba8unorm' });
const output = device.createTexture({
size: [1024, 768],
format: 'rgba8unorm',
usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.TEXTURE_BINDING,
});
blur.apply(sourceTexture, output, { radius: 8, sigma: 4.0 });
// output now contains the blurred result
blur.destroy();

Source
// Separable Gaussian blur — one axis per pass.
//
// Each pass samples along a single direction ((1,0) horizontal or
// (0,1) vertical) using precomputed, normalized kernel weights.
// Hardware sampling provides bilinear filtering and clamp-to-edge
// behavior at the texture borders.
struct Uniforms {
  direction: vec2f,  // blur axis: (1,0) = horizontal, (0,1) = vertical
  texel_size: vec2f, // reciprocal of the texture dimensions
  radius: i32,       // number of taps on each side of the center
  _pad0: i32,
  _pad1: i32,
  _pad2: i32,
}
// Normalized kernel weights: weights[0] is the center tap, weights[i]
// applies symmetrically at offsets ±i. At most 33 entries are used
// (center + 32 per side; symmetry means one weight covers both sides).
@group(0) @binding(0) var<uniform> u: Uniforms;
@group(0) @binding(1) var<storage, read> weights: array<f32>;
@group(0) @binding(2) var source_tex: texture_2d<f32>;
@group(0) @binding(3) var source_sampler: sampler;
struct VertexOutput {
  @builtin(position) position: vec4f,
  @location(0) uv: vec2f,
}
// Single fullscreen triangle: three oversized vertices whose
// clip-space footprint covers the whole viewport.
@vertex
fn vs(@builtin(vertex_index) i: u32) -> VertexOutput {
  let x = f32((i << 1u) & 2u);
  let y = f32(i & 2u);
  var out: VertexOutput;
  out.position = vec4f(x * 2.0 - 1.0, y * 2.0 - 1.0, 0.0, 1.0);
  // Flip V so uv (0,0) is the top-left of the texture.
  out.uv = vec2f(x, 1.0 - y);
  return out;
}
@fragment
fn fs(in: VertexOutput) -> @location(0) vec4f {
  // Center tap (weight index 0).
  var acc = textureSample(source_tex, source_sampler, in.uv) * weights[0];
  // Mirrored taps on either side of the center share one weight each.
  let delta = u.direction * u.texel_size;
  for (var tap = 1i; tap <= u.radius; tap++) {
    let shift = delta * f32(tap);
    let w = weights[tap];
    acc += textureSample(source_tex, source_sampler, in.uv + shift) * w;
    acc += textureSample(source_tex, source_sampler, in.uv - shift) * w;
  }
  return acc;
}
// Gaussian Blur
// Two-pass separable Gaussian blur using render passes.
// Takes a source GPUTexture and writes the blurred result to a caller-provided target GPUTexture.
//
// Default WGSL loading uses a ?raw import (works with Vite, esbuild, Webpack).
// Alternative: load via fetch — see README.md for details.
import shaderSource from './gaussian-blur.wgsl?raw';
/** Creation-time options for {@link createGaussianBlur}. */
export interface GaussianBlurOptions {
  /** Texture format of the pipeline and internal intermediate texture; must match the source. Defaults to 'rgba8unorm'. */
  format?: GPUTextureFormat;
}
/** Per-call options for {@link GaussianBlur.apply}. */
export interface GaussianBlurApplyOptions {
  /** Blur radius in pixels, clamped to [0, 32]. Defaults to 8. 0 renders the source unblurred. */
  radius?: number;
  /** Gaussian standard deviation. Defaults to radius / 2. */
  sigma?: number;
}
/** Handle returned by {@link createGaussianBlur}. */
export interface GaussianBlur {
  /** Blurs `source` (needs TEXTURE_BINDING usage) and writes the result into `target` (needs RENDER_ATTACHMENT usage). */
  apply(source: GPUTexture, target: GPUTexture, options?: GaussianBlurApplyOptions): void;
  /** Releases internal GPU resources; never destroys caller-owned textures. */
  destroy(): void;
}
// Largest supported blur radius (taps per side); the kernel holds MAX_RADIUS + 1 weights.
const MAX_RADIUS = 32;
// Uniform buffer: vec2f direction, vec2f texel_size, i32 radius, 3x i32 pad = 32 bytes
const UNIFORM_SIZE = 32;
// Compute normalized Gaussian kernel weights for the given radius and sigma.
//
// Returns an array of (radius + 1) values: weights[0] is the center weight,
// weights[i] is the symmetric weight applied at offsets ±i. The weights are
// normalized so center + 2 * (each side weight) sums to 1.0, preserving
// overall brightness.
//
// `radius` is floored to an integer: with a fractional radius the loop bound
// `i < size` ran one extra iteration past the end of the typed array (the
// write was silently dropped) while still inflating `sum`, mis-normalizing
// the kernel. `sigma` is clamped away from zero to avoid division by zero.
function computeKernel(radius: number, sigma: number): Float32Array {
  const safeSigma = Math.max(sigma, 0.0001);
  const size = Math.floor(radius) + 1;
  const kernel = new Float32Array(size);
  let sum = 0;
  for (let i = 0; i < size; i++) {
    const w = Math.exp((-i * i) / (2 * safeSigma * safeSigma));
    kernel[i] = w;
    sum += i === 0 ? w : w * 2; // center counted once, sides counted twice
  }
  // Normalize so the full symmetric kernel sums to 1.
  for (let i = 0; i < size; i++) {
    kernel[i] /= sum;
  }
  return kernel;
}
/**
 * Create a two-pass separable Gaussian blur.
 *
 * Pass 1 renders `source` horizontally blurred into an internally managed
 * intermediate texture; pass 2 renders that intermediate vertically blurred
 * into the caller's `target`. Both passes share one pipeline and draw a
 * single fullscreen triangle.
 *
 * @param device  Device used to create all GPU resources.
 * @param options `format` must match the textures passed to `apply`
 *                (default 'rgba8unorm').
 * @returns A {@link GaussianBlur} exposing `apply` and `destroy`.
 */
export function createGaussianBlur(
  device: GPUDevice,
  options: GaussianBlurOptions = {}
): GaussianBlur {
  const format = options.format ?? 'rgba8unorm';
  const shaderModule = device.createShaderModule({ code: shaderSource });
  // Linear filtering gives hardware bilinear sampling; clamp-to-edge defines
  // behavior when kernel taps fall outside the image.
  const sampler = device.createSampler({
    magFilter: 'linear',
    minFilter: 'linear',
    addressModeU: 'clamp-to-edge',
    addressModeV: 'clamp-to-edge'
  });
  // One uniform buffer per pass so both passes can be encoded into a single
  // command buffer without the vertical write clobbering the horizontal one.
  const hUniformBuffer = device.createBuffer({
    size: UNIFORM_SIZE,
    usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST
  });
  const vUniformBuffer = device.createBuffer({
    size: UNIFORM_SIZE,
    usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST
  });
  // Kernel weights shared by both passes: MAX_RADIUS + 1 floats
  // (center tap + one weight per symmetric offset).
  const weightsBuffer = device.createBuffer({
    size: (MAX_RADIUS + 1) * 4,
    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST
  });
  const bindGroupLayout = device.createBindGroupLayout({
    entries: [
      {
        binding: 0,
        visibility: GPUShaderStage.FRAGMENT | GPUShaderStage.VERTEX,
        buffer: { type: 'uniform' }
      },
      { binding: 1, visibility: GPUShaderStage.FRAGMENT, buffer: { type: 'read-only-storage' } },
      { binding: 2, visibility: GPUShaderStage.FRAGMENT, texture: { sampleType: 'float' } },
      { binding: 3, visibility: GPUShaderStage.FRAGMENT, sampler: {} }
    ]
  });
  const pipelineLayout = device.createPipelineLayout({ bindGroupLayouts: [bindGroupLayout] });
  const pipeline = device.createRenderPipeline({
    layout: pipelineLayout,
    vertex: { module: shaderModule, entryPoint: 'vs' },
    fragment: {
      module: shaderModule,
      entryPoint: 'fs',
      targets: [{ format }]
    },
    primitive: { topology: 'triangle-list' }
  });
  // Hoisted uniform staging data to avoid per-frame allocations; both typed
  // arrays view the same bytes (floats at 0-3, radius as i32 at index 4).
  const uniformData = new ArrayBuffer(UNIFORM_SIZE);
  const uniformF32 = new Float32Array(uniformData);
  const uniformI32 = new Int32Array(uniformData);
  // Intermediate texture and the vertical bind group that samples it.
  // The bind group references only the intermediate's view (the target is
  // always a render attachment, never a binding), so it is invalidated
  // exactly when the intermediate is recreated — not per target.
  let intermediateTexture: GPUTexture | null = null;
  let verticalBindGroup: GPUBindGroup | null = null;
  let lastWidth = 0;
  let lastHeight = 0;
  // (Re)create the intermediate texture when the source dimensions change.
  function ensureIntermediate(width: number, height: number): void {
    if (width === lastWidth && height === lastHeight && intermediateTexture) return;
    intermediateTexture?.destroy();
    intermediateTexture = device.createTexture({
      size: [width, height],
      format,
      usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.TEXTURE_BINDING
    });
    lastWidth = width;
    lastHeight = height;
    // The cached bind group references the old intermediate's view.
    verticalBindGroup = null;
  }
  // Stage and upload one pass's uniforms (direction, texel size, radius).
  function writeUniforms(
    buffer: GPUBuffer,
    direction: [number, number],
    width: number,
    height: number,
    radius: number
  ): void {
    uniformF32[0] = direction[0];
    uniformF32[1] = direction[1];
    uniformF32[2] = 1.0 / width;
    uniformF32[3] = 1.0 / height;
    uniformI32[4] = radius;
    device.queue.writeBuffer(buffer, 0, uniformData);
  }
  // Bind group for a pass that samples the given texture using the
  // horizontal-pass uniforms.
  function createSourceBindGroup(sourceTexture: GPUTexture): GPUBindGroup {
    return device.createBindGroup({
      layout: bindGroupLayout,
      entries: [
        { binding: 0, resource: { buffer: hUniformBuffer } },
        { binding: 1, resource: { buffer: weightsBuffer } },
        { binding: 2, resource: sourceTexture.createView() },
        { binding: 3, resource: sampler }
      ]
    });
  }
  // Lazily (re)build the cached vertical bind group against the current
  // intermediate texture.
  function ensureVerticalBindGroup(): GPUBindGroup {
    if (!verticalBindGroup) {
      verticalBindGroup = device.createBindGroup({
        layout: bindGroupLayout,
        entries: [
          { binding: 0, resource: { buffer: vUniformBuffer } },
          { binding: 1, resource: { buffer: weightsBuffer } },
          { binding: 2, resource: intermediateTexture!.createView() },
          { binding: 3, resource: sampler }
        ]
      });
    }
    return verticalBindGroup;
  }
  /**
   * Blur `source` into `target`. `radius` is sanitized here: floored to an
   * integer (computeKernel and the shader's i32 loop both assume one),
   * clamped to [0, MAX_RADIUS], and replaced by the default for
   * NaN/Infinity — previously a fractional radius produced a
   * mis-normalized kernel (visible darkening).
   */
  function apply(
    source: GPUTexture,
    target: GPUTexture,
    applyOptions: GaussianBlurApplyOptions = {}
  ): void {
    const width = source.width;
    const height = source.height;
    const requested = applyOptions.radius ?? 8;
    const radius = Number.isFinite(requested)
      ? Math.min(Math.max(Math.floor(requested), 0), MAX_RADIUS)
      : 8;
    const sigma = applyOptions.sigma ?? radius / 2;
    if (radius === 0) {
      // No blur — single pass-through render using a weight of 1.0 at center.
      device.queue.writeBuffer(weightsBuffer, 0, new Float32Array([1.0]));
      writeUniforms(hUniformBuffer, [1, 0], width, height, 0);
      const bindGroup = createSourceBindGroup(source);
      const encoder = device.createCommandEncoder();
      const pass = encoder.beginRenderPass({
        colorAttachments: [
          {
            view: target.createView(),
            loadOp: 'clear',
            storeOp: 'store'
          }
        ]
      });
      pass.setPipeline(pipeline);
      pass.setBindGroup(0, bindGroup);
      pass.draw(3);
      pass.end();
      device.queue.submit([encoder.finish()]);
      return;
    }
    ensureIntermediate(width, height);
    // Upload kernel weights (writeBuffer copies immediately, so the same
    // staging data can be reused before submission).
    const kernel = computeKernel(radius, sigma);
    device.queue.writeBuffer(weightsBuffer, 0, kernel);
    const encoder = device.createCommandEncoder();
    // Pass 1: Horizontal blur (source → intermediate)
    writeUniforms(hUniformBuffer, [1, 0], width, height, radius);
    const hBindGroup = createSourceBindGroup(source);
    const hPass = encoder.beginRenderPass({
      colorAttachments: [
        {
          view: intermediateTexture!.createView(),
          loadOp: 'clear',
          storeOp: 'store'
        }
      ]
    });
    hPass.setPipeline(pipeline);
    hPass.setBindGroup(0, hBindGroup);
    hPass.draw(3);
    hPass.end();
    // Pass 2: Vertical blur (intermediate → target)
    writeUniforms(vUniformBuffer, [0, 1], width, height, radius);
    const vBindGroup = ensureVerticalBindGroup();
    const vPass = encoder.beginRenderPass({
      colorAttachments: [
        {
          view: target.createView(),
          loadOp: 'clear',
          storeOp: 'store'
        }
      ]
    });
    vPass.setPipeline(pipeline);
    vPass.setBindGroup(0, vBindGroup);
    vPass.draw(3);
    vPass.end();
    device.queue.submit([encoder.finish()]);
  }
  /** Release internal resources; caller-owned textures are untouched. */
  function destroy(): void {
    intermediateTexture?.destroy();
    intermediateTexture = null;
    verticalBindGroup = null;
    hUniformBuffer.destroy();
    vUniformBuffer.destroy();
    weightsBuffer.destroy();
  }
  return { apply, destroy };
}
Documentation
Gaussian Blur
Separable Gaussian blur using two render passes. Takes a source GPUTexture and writes the blurred result to a caller-provided target GPUTexture. Uses fragment shaders with hardware texture sampling for bilinear filtering and edge clamping.
API
createGaussianBlur(device, options?)
Returns a GaussianBlur instance.
| Option | Type | Default | Description |
|---|---|---|---|
| format | GPUTextureFormat | 'rgba8unorm' | Texture format (must match source) |
blur.apply(source, target, options?)
Blurs the source texture and writes the result to the target texture.
- source — GPUTexture to read from (must have TEXTURE_BINDING usage)
- target — GPUTexture to write to (must have RENDER_ATTACHMENT usage)
| Option | Type | Default | Description |
|---|---|---|---|
| radius | number | 8 | Blur radius in pixels (0–32) |
| sigma | number | radius / 2 | Gaussian standard deviation |
When radius is 0, the source is rendered to the target unchanged (a single pass-through render pass; no blur is applied).
blur.destroy()
Releases internal textures and buffers. Does not destroy source or target textures.
Further Reading
Resources on Gaussian blur, separable convolutions, and GPU image filtering.
Core Theory
Heckbert, "Filtering by Repeated Integration" (SIGGRAPH 1986) Foundational paper on efficient image filtering techniques, including the separability of Gaussian kernels that makes two-pass blur possible. https://dl.acm.org/doi/10.1145/15886.15921
Deriche, "Recursively Implementing the Gaussian and Its Derivatives" (1993) Introduces recursive (IIR) Gaussian filtering that achieves O(1) cost per pixel regardless of kernel size. A useful alternative for very large radii. https://inria.hal.science/inria-00074778/document
GPU Implementation
GPU Gems 3, Chapter 40: "Incremental Computation of the Gaussian" Practical GPU implementation techniques for Gaussian filtering, including incremental weight computation and shared memory optimizations. https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-40-incremental-computation-gaussian
Efficient Gaussian Blur with Linear Sampling (Rastergrid) Explains the technique of sampling between texels to halve the number of texture fetches in a Gaussian blur, leveraging hardware bilinear filtering. https://www.rastergrid.com/blog/2010/09/efficient-gaussian-blur-with-linear-sampling/
Intel, "An Investigation of Fast Real-Time GPU-Based Image Blur Algorithms" Comprehensive comparison of GPU blur techniques: box blur, Gaussian, Kawase, dual filtering. Benchmarks and trade-offs for each approach. https://www.intel.com/content/www/us/en/developer/articles/technical/an-investigation-of-fast-real-time-gpu-based-image-blur-algorithms.html
Post-Processing Applications
Kawase, "Frame Buffer Postprocessing Effects in DOUBLE-S.T.E.A.L" (GDC 2003) Introduces the Kawase blur — a multi-pass approach using progressively larger sample offsets that approximates Gaussian blur with fewer passes. Common in game engines. https://www.gdcvault.com/play/1022665/Frame-Buffer-Postprocessing-Effects-in
Jimenez, "Next Generation Post Processing in Call of Duty: Advanced Warfare" (SIGGRAPH 2014) State of the art in real-time post-processing, including bloom, depth of field, and motion blur. Shows how Gaussian blur fits into a modern rendering pipeline. https://www.iryoku.com/next-generation-post-processing-in-call-of-duty-advanced-warfare/
Bloom
- Karis, "Real Shading in Unreal Engine 4" (SIGGRAPH 2013) Describes Unreal's bloom pipeline: threshold bright pixels, progressively downsample with blur, then composite. The Gaussian blur module can serve as the blur step. https://blog.selfshadow.com/publications/s2013-shading-course/karis/s2013_pbs_epic_notes_v2.pdf
General References
Wikipedia, "Gaussian blur" Clear mathematical description of the Gaussian function, separability proof, and relationship to the normal distribution. https://en.wikipedia.org/wiki/Gaussian_blur
WebGPU Specification — Render Passes The official spec for render pass encoding, color attachments, load/store operations, and fragment shader outputs used by this module. https://www.w3.org/TR/webgpu/#render-passes