diff --git a/NAM/slimmable_wavenet.cpp b/NAM/slimmable_wavenet.cpp
index 2357032..5be26e2 100644
--- a/NAM/slimmable_wavenet.cpp
+++ b/NAM/slimmable_wavenet.cpp
@@ -3,6 +3,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <optional>
 #include <stdexcept>
 
 namespace nam
@@ -120,6 +121,12 @@ std::vector<float> extract_slimmed_weights(const std::vector<wavenet::LayerArray
   for (int arr = 0; arr < num_arrays; arr++)
   {
     const auto& p = original_params[arr];
+    if (p.head_kernel_size != 1)
+    {
+      throw std::runtime_error(
+        "SlimmableWavenet: head rechannel kernel_size must be 1 (slimming with head kernel_size > 1 is not "
+        "implemented)");
+    }
     validate_groups(p);
 
     const int full_ch = p.channels;
@@ -258,8 +265,9 @@ std::vector<wavenet::LayerArrayParams> modify_params_for_channels(
     int new_head_size = (i < num_arrays - 1) ? new_channels_per_array[i + 1] : p.head_size;
 
     modified.push_back(wavenet::LayerArrayParams(
-      new_input_size, p.condition_size, new_head_size, new_ch, new_bottleneck, std::vector<int>(p.kernel_sizes),
-      std::vector<int>(p.dilations), std::vector<activations::ActivationConfig>(p.activation_configs),
+      new_input_size, p.condition_size, new_head_size, p.head_kernel_size, new_ch, new_bottleneck,
+      std::vector<int>(p.kernel_sizes), std::vector<int>(p.dilations),
+      std::vector<activations::ActivationConfig>(p.activation_configs),
       std::vector<wavenet::GatingMode>(p.gating_modes), p.head_bias, p.groups_input, p.groups_input_mixin,
       p.layer1x1_params, p.head1x1_params, std::vector<activations::ActivationConfig>(p.secondary_activation_configs),
       p.conv_pre_film_params, p.conv_post_film_params, p.input_mixin_pre_film_params, p.input_mixin_post_film_params,
@@ -326,6 +334,9 @@ SlimmableWavenet::SlimmableWavenet(std::vector<wavenet::LayerArrayParams> origin
   if (!any_slimmable)
     throw std::runtime_error("SlimmableWavenet: at least one layer array must have allowed_channels");
 
+  if (with_head)
+    throw std::runtime_error("SlimmableWavenet: post-stack head is not supported");
+
   // Build with full channel counts as default (ratio=1.0)
   std::vector<int> full_channels(_original_params.size());
   for (size_t i = 0; i < _original_params.size(); i++)
@@ -360,8 +371,8 @@ void SlimmableWavenet::_rebuild_model(const std::vector<int>& target_channels)
     condition_dsp = get_dsp(_condition_dsp_json);
 
   double sampleRate = _current_sample_rate > 0 ? _current_sample_rate : GetExpectedSampleRate();
-  _active_model = std::make_unique<wavenet::WaveNet>(
-    _in_channels, *params_ptr, _head_scale, _with_head, std::move(weights), std::move(condition_dsp), sampleRate);
+  _active_model = std::make_unique<wavenet::WaveNet>(_in_channels, *params_ptr, _head_scale, _with_head, std::nullopt,
+                                                     std::move(weights), std::move(condition_dsp), sampleRate);
   _current_channels = target_channels;
 
   if (_current_buffer_size > 0)
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
index 8ddce1a..f45b9f4 100644
--- a/NAM/wavenet.cpp
+++ b/NAM/wavenet.cpp
@@ -12,6 +12,75 @@
 #include "slimmable_wavenet.h"
 #include "wavenet.h"
 
+// PostStackHead (WaveNet post-stack head) =====================================
+
+nam::wavenet::PostStackHead::PostStackHead(const WaveNetHeadParams& params)
+: _in_channels(params.in_channels)
+, _out_channels(params.out_channels)
+{
+  const size_t num_stages = params.kernel_sizes.size();
+  if (num_stages == 0)
+    throw std::runtime_error("PostStackHead: kernel_sizes must be non-empty");
+  int stage_in = params.in_channels; // input width of the stage currently being built
+  for (size_t stage = 0; stage < num_stages; stage++)
+  {
+    const int kernel = params.kernel_sizes[stage];
+    if (kernel < 1)
+      throw std::runtime_error("PostStackHead: kernel_sizes entries must be >= 1");
+    nam::activations::Activation::Ptr act = nam::activations::Activation::get_activation(params.activation_config);
+    if (act == nullptr)
+      throw std::runtime_error("PostStackHead: unsupported activation for post-stack head");
+    _activations.push_back(std::move(act));
+    // Every stage but the last emits params.channels; the last one emits params.out_channels.
+    const int stage_out = (stage + 1 < num_stages) ? params.channels : params.out_channels;
+    _convs.emplace_back();
+    _convs.back().set_size_(stage_in, stage_out, kernel, true, 1, 1);
+    stage_in = stage_out;
+  }
+}
+
+void nam::wavenet::PostStackHead::set_weights_(std::vector<float>::iterator& weights)
+{
+  for (auto& conv : _convs) // stage order; the same iterator is handed to every conv in turn
+    conv.set_weights_(weights);
+}
+
+void nam::wavenet::PostStackHead::SetMaxBufferSize(const int maxBufferSize)
+{
+  for (auto& conv : _convs) // every stage receives the same maximum buffer size
+    conv.SetMaxBufferSize(maxBufferSize);
+}
+
+long nam::wavenet::PostStackHead::receptive_field() const
+{
+  // Starts at 1; each conv in the stack then extends the field by (kernel_size - 1).
+  long total = 1;
+  for (const auto& conv : _convs)
+  {
+    total += static_cast<long>(conv.get_kernel_size()) - 1;
+  }
+  return total;
+}
+
+void nam::wavenet::PostStackHead::process(Eigen::MatrixXf& work, const int num_frames)
+{
+  // Runs one stage on `stage_input`: the stage's activation is applied in place to the first
+  // in_ch * num_frames values of the buffer, then the stage's conv consumes that buffer.
+  const auto run_stage = [this, num_frames](const size_t stage, auto& stage_input) {
+    const long in_ch = _convs[stage].get_in_channels();
+    _activations[stage]->apply(stage_input.data(), (long)(in_ch * num_frames));
+    _convs[stage].Process(stage_input, num_frames);
+  };
+  for (size_t stage = 0; stage < _convs.size(); stage++)
+  {
+    // Stage 0 consumes the caller's buffer; later stages consume the previous conv's output.
+    if (stage == 0)
+      run_stage(stage, work);
+    else
+      run_stage(stage, _convs[stage - 1].GetOutput());
+  }
+}
+
 // Layer ======================================================================
 
 void nam::wavenet::_Layer::SetMaxBufferSize(const int maxBufferSize)
@@ -306,7 +375,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
 nam::wavenet::_LayerArray::_LayerArray(const LayerArrayParams& params)
 : _rechannel(params.input_size, params.channels, false)
 , _head_rechannel(params.head1x1_params.active ? params.head1x1_params.out_channels : params.bottleneck,
-                  params.head_size, params.head_bias)
+                  params.head_size, params.head_kernel_size, params.head_bias ? 1 : 0, 1, 1)
 , _head_output_size(params.head1x1_params.active ? params.head1x1_params.out_channels : params.bottleneck)
 {
   const size_t num_layers = params.dilations.size();
@@ -345,6 +414,7 @@ long nam::wavenet::_LayerArray::get_receptive_field() const
   long result = 0;
   for (size_t i = 0; i < this->_layers.size(); i++)
     result += this->_layers[i].get_dilation() * (this->_layers[i].get_kernel_size() - 1);
+  result += (long)this->_head_rechannel.get_kernel_size() - 1;
   return result;
 }
 
@@ -431,8 +501,8 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs
     this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames);
 #endif
 
-  // Process head rechannel
-  _head_rechannel.process_(this->_head_inputs, num_frames);
+  // Process head rechannel (causal Conv1D)
+  _head_rechannel.Process(this->_head_inputs, num_frames);
 }
 
 
@@ -460,16 +530,27 @@ long nam::wavenet::_LayerArray::_get_channels() const
   return this->_layers.size() > 0 ? this->_layers[0].get_channels() : 0;
 }
 
+namespace
+{
+int wave_net_output_channels(const std::vector<nam::wavenet::LayerArrayParams>& layer_array_params,
+                             const bool with_head, const std::optional<nam::wavenet::WaveNetHeadParams>& head_params)
+{
+  // Called from WaveNet's base-class initializer, so the empty check must precede any use of back().
+  if (layer_array_params.empty())
+    throw std::runtime_error("WaveNet requires at least one layer array");
+  const bool head_sets_width = with_head && head_params.has_value();
+  return head_sets_width ? head_params->out_channels : layer_array_params.back().head_size;
+}
+} // namespace
+
 // WaveNet ====================================================================
 
 nam::wavenet::WaveNet::WaveNet(const int in_channels,
                                const std::vector<nam::wavenet::LayerArrayParams>& layer_array_params,
-                               const float head_scale, const bool with_head, std::vector<float> weights,
+                               const float head_scale, const bool with_head,
+                               std::optional<WaveNetHeadParams> head_params, std::vector<float> weights,
                                std::unique_ptr<DSP> condition_dsp, const double expected_sample_rate)
-: DSP(in_channels,
-      layer_array_params.empty() ? throw std::runtime_error("WaveNet requires at least one layer array")
-                                 : layer_array_params.back().head_size,
-      expected_sample_rate)
+: DSP(in_channels, wave_net_output_channels(layer_array_params, with_head, head_params), expected_sample_rate)
 , _condition_dsp(std::move(condition_dsp))
 , _head_scale(head_scale)
 {
@@ -484,10 +565,22 @@ nam::wavenet::WaveNet::WaveNet(const int in_channels,
       throw std::runtime_error(ss.str().c_str());
     }
   }
-  if (layer_array_params.empty())
-    throw std::runtime_error("WaveNet requires at least one layer array");
   if (with_head)
-    throw std::runtime_error("Head not implemented!");
+  {
+    if (!head_params.has_value())
+      throw std::runtime_error("WaveNet: with_head is true but head configuration is missing");
+    if (head_params->in_channels != layer_array_params.back().head_size)
+    {
+      std::stringstream ss;
+      ss << "WaveNet head in_channels (" << head_params->in_channels << ") must match last layer array head_size ("
+         << layer_array_params.back().head_size << ")";
+      throw std::runtime_error(ss.str());
+    }
+    this->_post_stack_head = std::make_unique<PostStackHead>(*head_params);
+  }
+  else if (head_params.has_value())
+    throw std::runtime_error("WaveNet: head configuration provided but with_head is false");
+
   for (size_t i = 0; i < layer_array_params.size(); i++)
   {
     // Quick assert that the condition_dsp will output compatibly with this layer array
@@ -518,6 +611,8 @@ nam::wavenet::WaveNet::WaveNet(const int in_channels,
   mPrewarmSamples = this->_condition_dsp != nullptr ? this->_condition_dsp->PrewarmSamples() : 1;
   for (size_t i = 0; i < this->_layer_arrays.size(); i++)
     mPrewarmSamples += this->_layer_arrays[i].get_receptive_field();
+  if (this->_post_stack_head != nullptr)
+    mPrewarmSamples += this->_post_stack_head->receptive_field() - 1;
 }
 
 void nam::wavenet::WaveNet::set_weights_(std::vector<float>& weights)
@@ -527,6 +622,8 @@ void nam::wavenet::WaveNet::set_weights_(std::vector<float>& weights)
   // so we don't need to set its weights here.
   for (size_t i = 0; i < this->_layer_arrays.size(); i++)
     this->_layer_arrays[i].set_weights_(it);
+  if (this->_post_stack_head != nullptr)
+    this->_post_stack_head->set_weights_(it);
   this->_head_scale = *(it++); // TODO `LayerArray.absorb_head_scale()`
   if (it != weights.end())
   {
@@ -579,6 +676,12 @@ void nam::wavenet::WaveNet::SetMaxBufferSize(const int maxBufferSize)
 
   for (size_t i = 0; i < this->_layer_arrays.size(); i++)
     this->_layer_arrays[i].SetMaxBufferSize(maxBufferSize);
+
+  if (this->_post_stack_head != nullptr)
+  {
+    this->_post_stack_head->SetMaxBufferSize(maxBufferSize);
+    this->_scaled_head_scratch.resize(this->_post_stack_head->in_channels(), maxBufferSize);
+  }
 }
 
 void nam::wavenet::WaveNet::_process_condition(const int num_frames)
@@ -656,9 +759,39 @@ void nam::wavenet::WaveNet::process(NAM_SAMPLE** input, NAM_SAMPLE** output, con
     }
   }
 
-  // (Head not implemented)
-
   auto& final_head_outputs = this->_layer_arrays.back().GetHeadOutputs();
+
+  if (this->_post_stack_head != nullptr)
+  {
+    assert(final_head_outputs.rows() == this->_post_stack_head->in_channels());
+    const int head_in = this->_post_stack_head->in_channels();
+    for (int ch = 0; ch < head_in; ch++)
+    {
+      for (int s = 0; s < num_frames; s++)
+        this->_scaled_head_scratch(ch, s) = this->_head_scale * final_head_outputs(ch, s);
+    }
+    this->_post_stack_head->process(this->_scaled_head_scratch, num_frames);
+    const Eigen::MatrixXf& head_out = this->_post_stack_head->get_last_output();
+    assert(head_out.rows() == out_channels);
+
+    if (out_channels == 1)
+    {
+      const float* __restrict__ src = head_out.data();
+      NAM_SAMPLE* __restrict__ dst = output[0];
+      for (int s = 0; s < num_frames; s++)
+        dst[s] = (NAM_SAMPLE)src[s];
+    }
+    else
+    {
+      for (int ch = 0; ch < out_channels; ch++)
+      {
+        for (int s = 0; s < num_frames; s++)
+          output[ch][s] = (NAM_SAMPLE)head_out(ch, s);
+      }
+    }
+    return;
+  }
+
   assert(final_head_outputs.rows() == out_channels);
 
   // Optimized output copy with head_scale multiplication
@@ -729,7 +862,41 @@ nam::wavenet::WaveNetConfig nam::wavenet::parse_config_json(const nlohmann::json
 
     const int input_size = layer_config["input_size"];
     const int condition_size = layer_config["condition_size"];
-    const int head_size = layer_config["head_size"];
+
+    int head_size = 0;
+    int head_kernel_size = 1;
+    bool head_bias = false;
+
+    // Prefer nested "head" (matches trainer export). Legacy .nam uses head_size + head_bias (implicit kernel 1).
+    if (layer_config.find("head") != layer_config.end() && !layer_config["head"].is_null())
+    {
+      const auto& head_json = layer_config["head"];
+      if (!head_json.is_object())
+      {
+        throw std::runtime_error("Layer array " + std::to_string(i) + ": 'head' must be a JSON object");
+      }
+      head_size = head_json.at("out_channels").get<int>();
+      head_kernel_size = head_json.at("kernel_size").get<int>();
+      head_bias = head_json.at("bias").get<bool>();
+    }
+    else if (layer_config.find("head_size") != layer_config.end())
+    {
+      head_size = layer_config["head_size"].get<int>();
+      head_kernel_size = 1;
+      head_bias = layer_config.at("head_bias").get<bool>();
+    }
+    else
+    {
+      throw std::runtime_error("Layer array " + std::to_string(i)
+                               + ": expected 'head' object with out_channels, kernel_size, and bias, "
+                                 "or legacy 'head_size' and 'head_bias'");
+    }
+
+    if (head_kernel_size < 1)
+    {
+      throw std::runtime_error("Layer array " + std::to_string(i) + ": head.kernel_size must be >= 1");
+    }
+
     const auto dilations = layer_config["dilations"];
     const size_t num_layers = dilations.size();
 
@@ -921,8 +1088,6 @@ nam::wavenet::WaveNetConfig nam::wavenet::parse_config_json(const nlohmann::json
       secondary_activation_configs.resize(num_layers, activations::ActivationConfig{});
     }
 
-    const bool head_bias = layer_config["head_bias"];
-
     // Parse head1x1 parameters
     bool head1x1_active = false;
     int head1x1_out_channels = channels;
@@ -967,7 +1132,7 @@ nam::wavenet::WaveNetConfig nam::wavenet::parse_config_json(const nlohmann::json
     }
 
     wc.layer_array_params.push_back(nam::wavenet::LayerArrayParams(
-      input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), dilations,
+      input_size, condition_size, head_size, head_kernel_size, channels, bottleneck, std::move(kernel_sizes), dilations,
       std::move(activation_configs), std::move(gating_modes), head_bias, groups, groups_input_mixin, layer1x1_params,
       head1x1_params, std::move(secondary_activation_configs), conv_pre_film_params, conv_post_film_params,
       input_mixin_pre_film_params, input_mixin_post_film_params, activation_pre_film_params,
@@ -981,14 +1146,44 @@ nam::wavenet::WaveNetConfig nam::wavenet::parse_config_json(const nlohmann::json
   if (wc.layer_array_params.empty())
     throw std::runtime_error("WaveNet config requires at least one layer array");
 
+  if (wc.with_head)
+  {
+    const nlohmann::json& hj = config["head"];
+    WaveNetHeadParams hp;
+    const int implied_in = wc.layer_array_params.back().head_size;
+    // New trainer export omits in_channels (single source: last layer head_size). Legacy .nam may include it.
+    if (hj.find("in_channels") != hj.end() && !hj["in_channels"].is_null())
+    {
+      const int legacy_in = hj["in_channels"].get<int>();
+      if (legacy_in != implied_in)
+      {
+        std::stringstream ss;
+        ss << "WaveNet config: head.in_channels (" << legacy_in << ") must equal last layer's head_size (" << implied_in
+           << ")";
+        throw std::runtime_error(ss.str());
+      }
+    }
+    hp.in_channels = implied_in;
+    hp.channels = hj.at("channels").get<int>();
+    hp.out_channels = hj.at("out_channels").get<int>();
+    hp.kernel_sizes = hj.at("kernel_sizes").get<std::vector<int>>();
+    hp.activation_config = nam::activations::ActivationConfig::from_json(hj.at("activation"));
+    if (hp.kernel_sizes.empty())
+      throw std::runtime_error("WaveNet config: head.kernel_sizes must be non-empty");
+    wc.head_params = std::move(hp);
+  }
+  else
+    wc.head_params = std::nullopt;
+
   return wc;
 }
 
 // WaveNetConfig::create()
 std::unique_ptr<nam::DSP> nam::wavenet::WaveNetConfig::create(std::vector<float> weights, double sampleRate)
 {
-  return std::make_unique<nam::wavenet::WaveNet>(
-    in_channels, layer_array_params, head_scale, with_head, std::move(weights), std::move(condition_dsp), sampleRate);
+  return std::make_unique<nam::wavenet::WaveNet>(in_channels, layer_array_params, head_scale, with_head,
+                                                 std::move(head_params), std::move(weights), std::move(condition_dsp),
+                                                 sampleRate);
 }
 
 namespace
diff --git a/NAM/wavenet.h b/NAM/wavenet.h
index 6cb43e5..4aeafe3 100644
--- a/NAM/wavenet.h
+++ b/NAM/wavenet.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <memory>
+#include <optional>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -435,6 +436,7 @@ class LayerArrayParams
   /// \param dilations_ Vector of dilation factors, one per layer
   /// \param activation_configs_ Vector of primary activation configurations, one per layer
   /// \param gating_modes_ Vector of gating modes, one per layer
+  /// \param head_kernel_size_ Kernel size of the head rechannel conv (>= 1)
   /// \param head_bias_ Whether to use bias in the head rechannel
   /// \param groups_input Number of groups for input convolutions
   /// \param groups_input_mixin_ Number of groups for input mixin convolutions
@@ -451,8 +453,9 @@ class LayerArrayParams
   /// \param head1x1_post_film_params_ FiLM parameters after head1x1 convolutions
   /// \throws std::invalid_argument If dilations, activation_configs, gating_modes, or secondary_activation_configs
   /// sizes don't match
-  LayerArrayParams(const int input_size_, const int condition_size_, const int head_size_, const int channels_,
-                   const int bottleneck_, const std::vector<int>&& kernel_sizes_, const std::vector<int>&& dilations_,
+  LayerArrayParams(const int input_size_, const int condition_size_, const int head_size_, const int head_kernel_size_,
+                   const int channels_, const int bottleneck_, const std::vector<int>&& kernel_sizes_,
+                   const std::vector<int>&& dilations_,
                    const std::vector<activations::ActivationConfig>&& activation_configs_,
                    const std::vector<GatingMode>&& gating_modes_, const bool head_bias_, const int groups_input,
                    const int groups_input_mixin_, const Layer1x1Params& layer1x1_params_,
@@ -465,6 +468,7 @@ class LayerArrayParams
   : input_size(input_size_)
   , condition_size(condition_size_)
   , head_size(head_size_)
+  , head_kernel_size(head_kernel_size_)
   , channels(channels_)
   , bottleneck(bottleneck_)
   , kernel_sizes(std::move(kernel_sizes_))
@@ -486,6 +490,10 @@ class LayerArrayParams
   , _layer1x1_post_film_params(_layer1x1_post_film_params_)
   , head1x1_post_film_params(head1x1_post_film_params_)
   {
+    if (head_kernel_size < 1)
+    {
+      throw std::invalid_argument("LayerArrayParams: head_kernel_size must be >= 1");
+    }
     const size_t num_layers = dilations.size();
     if (kernel_sizes.empty())
     {
@@ -518,6 +526,7 @@ class LayerArrayParams
   const int input_size; ///< Input size (number of channels)
   const int condition_size; ///< Size of conditioning input
   const int head_size; ///< Size of head output (after rechannel)
+  const int head_kernel_size; ///< Kernel size of head rechannel convolution (>= 1)
   const int channels; ///< Number of channels in each layer
   const int bottleneck; ///< Bottleneck size (internal channel count)
   std::vector<int> kernel_sizes; ///< Per-layer kernel sizes, one per layer
@@ -628,8 +637,8 @@ class _LayerArray
   // Size is _head_output_size (= head1x1.out_channels if head1x1 active, else bottleneck)
   Eigen::MatrixXf _head_inputs;
 
-  // Rechannel for the head (_head_output_size -> head_size)
-  Conv1x1 _head_rechannel;
+  // Rechannel for the head (_head_output_size -> head_size), causal Conv1D (dilation 1)
+  Conv1D _head_rechannel;
 
   // Head output size from each layer (head1x1.out_channels if active, else bottleneck)
   const int _head_output_size;
@@ -639,6 +648,42 @@ class _LayerArray
   void ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames);
 };
 
+/// \brief Parameters for the optional post-stack head (matches Python ``nam.models.wavenet._head.Head``).
+/// JSON export omits ``in_channels`` (implied by last layer array ``head_size``); load sets it from there.
+struct WaveNetHeadParams
+{
+  int in_channels = 0; ///< Input channels of the first conv; WaveNet requires it to equal the last head_size
+  int channels = 0; ///< Output channels of every conv except the last
+  int out_channels = 0; ///< Output channels of the last conv
+  std::vector<int> kernel_sizes; ///< One entry per conv; must be non-empty with every entry >= 1
+  activations::ActivationConfig activation_config; ///< Activation applied to each conv's input
+};
+
+/// \brief Post-stack head: repeated (activation → Conv1D) with dilation 1, stride 1, valid (causal streaming) conv.
+class PostStackHead
+{
+public:
+  explicit PostStackHead(const WaveNetHeadParams& params); ///< \throws std::runtime_error for invalid params
+  /// Loads each conv's weights in stage order through the caller's iterator.
+  void set_weights_(std::vector<float>::iterator& weights);
+  void SetMaxBufferSize(int maxBufferSize); ///< Forwarded to every conv in the stack
+  long receptive_field() const; ///< 1 + sum of (kernel_size - 1) over the convs
+  int in_channels() const { return _in_channels; } ///< Copied from params.in_channels at construction
+  int out_channels() const { return _out_channels; } ///< Copied from params.out_channels at construction
+  /// Runs the stages in order; each stage applies its activation in place to its input, then its conv.
+  /// \param work Input buffer (in_channels × maxBufferSize); first in_channels×num_frames scaled by head_scale;
+  ///             may be modified in place.
+  void process(Eigen::MatrixXf& work, int num_frames);
+  /// Output buffer of the last conv in the stack (the head's result once process() has run).
+  const Eigen::MatrixXf& get_last_output() const { return _convs.back().GetOutput(); }
+
+private:
+  std::vector<nam::Conv1D> _convs; ///< One conv per kernel size, in processing order
+  std::vector<nam::activations::Activation::Ptr> _activations; ///< Parallel to _convs; applied to each conv's input
+  int _in_channels; ///< params.in_channels
+  int _out_channels; ///< params.out_channels
+};
+
 /// \brief The main WaveNet model
 ///
 /// WaveNet is a dilated convolutional neural network architecture for audio processing.
@@ -657,13 +702,14 @@ class WaveNet : public DSP
   /// \param in_channels Number of input channels
   /// \param layer_array_params Parameters for each layer array
   /// \param head_scale Scaling factor applied to the final head output
-  /// \param with_head Whether to use a custom "head" module that further processes the output (not currently supported)
+  /// \param with_head Whether to apply the optional post-stack head (Conv1D stack after layer arrays)
+  /// \param head_params Configuration for the post-stack head when ``with_head`` is true
   /// \param weights Model weights (will be consumed during construction)
   /// \param condition_dsp Optional DSP module for processing the conditioning input
   /// \param expected_sample_rate Expected sample rate in Hz (-1.0 if unknown)
   WaveNet(const int in_channels, const std::vector<LayerArrayParams>& layer_array_params, const float head_scale,
-          const bool with_head, std::vector<float> weights, std::unique_ptr<DSP> condition_dsp,
-          const double expected_sample_rate = -1.0);
+          const bool with_head, std::optional<WaveNetHeadParams> head_params, std::vector<float> weights,
+          std::unique_ptr<DSP> condition_dsp, const double expected_sample_rate = -1.0);
 
   /// \brief Destructor
   ~WaveNet() = default;
@@ -725,6 +771,10 @@ class WaveNet : public DSP
 
   float _head_scale;
 
+  std::unique_ptr<PostStackHead> _post_stack_head;
+  /// Scratch (in_channels × maxBufferSize) for scaled head input when ``_post_stack_head`` is used
+  Eigen::MatrixXf _scaled_head_scratch;
+
   int mPrewarmSamples = 0; // Pre-compute during initialization
   int PrewarmSamples() override { return mPrewarmSamples; };
 };
@@ -736,6 +786,7 @@ struct WaveNetConfig : public ModelConfig
   std::vector<LayerArrayParams> layer_array_params;
   float head_scale;
   bool with_head;
+  std::optional<WaveNetHeadParams> head_params;
   std::unique_ptr<DSP> condition_dsp;
 
   // Move-only due to unique_ptr
diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp
index 38aa5b5..5908dbc 100644
--- a/tools/run_tests.cpp
+++ b/tools/run_tests.cpp
@@ -18,6 +18,8 @@
 #include "test/test_wavenet/test_real_time_safe.cpp"
 #include "test/test_wavenet/test_condition_processing.cpp"
 #include "test/test_wavenet/test_head1x1.cpp"
+#include "test/test_wavenet/test_output_head.cpp"
+#include "test/test_wavenet/test_layer_head_config.cpp"
 #include "test/test_wavenet/test_layer1x1.cpp"
 #include "test/test_wavenet/test_factory.cpp"
 #include "test/test_gating_activations.cpp"
@@ -159,6 +161,11 @@ int main()
   test_wavenet::test_layer_array::test_layer_array_with_head_input();
   test_wavenet::test_layer_array::test_layer_array_different_activations();
   test_wavenet::test_full::test_wavenet_model();
+  test_wavenet::test_output_head::test_post_stack_head_receptive_field();
+  test_wavenet::test_output_head::test_wavenet_with_post_stack_head_processes();
+  test_wavenet::test_output_head::test_wavenet_with_two_layer_post_stack_head_applies_activation_per_layer_input();
+  test_wavenet::test_layer_head_config::test_legacy_head_size_and_head_bias_implies_kernel_one();
+  test_wavenet::test_layer_head_config::test_nested_head_with_kernel_size_three();
   test_wavenet::test_full::test_wavenet_multiple_arrays();
   test_wavenet::test_full::test_wavenet_zero_input();
   test_wavenet::test_full::test_wavenet_different_buffer_sizes();
@@ -190,6 +197,7 @@ int main()
   test_wavenet::test_layer_post_activation_film_blended_realtime_safe();
   test_wavenet::test_layer_array_process_realtime_safe();
   test_wavenet::test_process_realtime_safe();
+  test_wavenet::test_process_with_post_stack_head_realtime_safe();
   test_wavenet::test_process_3in_2out_realtime_safe();
   test_wavenet::test_condition_processing::test_with_condition_dsp();
   test_wavenet::test_condition_processing::test_with_condition_dsp_multichannel();
diff --git a/tools/test/test_wavenet/test_condition_processing.cpp b/tools/test/test_wavenet/test_condition_processing.cpp
index 5a7e0d0..f929483 100644
--- a/tools/test/test_wavenet/test_condition_processing.cpp
+++ b/tools/test/test_wavenet/test_condition_processing.cpp
@@ -2,6 +2,7 @@
 
 #include <Eigen/Dense>
 #include <cassert>
+#include <optional>
 #include <cmath>
 #include <iostream>
 #include <vector>
@@ -35,7 +36,7 @@ static nam::wavenet::LayerArrayParams make_layer_array_params(
   std::vector<nam::activations::ActivationConfig> secondary_activation_configs(
     dilations.size(), secondary_activation_config);
   return nam::wavenet::LayerArrayParams(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups_input, groups_input_mixin,
     layer1x1_params, head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params,
     film_params, film_params, film_params, film_params, film_params);
@@ -143,7 +144,7 @@ std::unique_ptr<nam::wavenet::WaveNet> create_simple_wavenet(
   weights.push_back(head_scale);
 
   return std::make_unique<nam::wavenet::WaveNet>(
-    in_channels, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    in_channels, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 }
 
 // Test condition processing with condition_dsp
diff --git a/tools/test/test_wavenet/test_full.cpp b/tools/test/test_wavenet/test_full.cpp
index 7f18b68..0c09c2e 100644
--- a/tools/test/test_wavenet/test_full.cpp
+++ b/tools/test/test_wavenet/test_full.cpp
@@ -4,6 +4,7 @@
 #include <cassert>
 #include <cmath>
 #include <iostream>
+#include <optional>
 #include <vector>
 
 #include "NAM/wavenet.h"
@@ -34,7 +35,7 @@ static nam::wavenet::LayerArrayParams make_layer_array_params(
   std::vector<nam::activations::ActivationConfig> secondary_activation_configs(
     dilations.size(), secondary_activation_config);
   return nam::wavenet::LayerArrayParams(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups_input, groups_input_mixin,
     layer1x1_params, head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params,
     film_params, film_params, film_params, film_params, film_params);
@@ -82,7 +83,7 @@ void test_wavenet_model()
 
   std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
   auto wavenet = std::make_unique<nam::wavenet::WaveNet>(
-    input_size, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    input_size, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 
   const int numFrames = 4;
   const int maxBufferSize = 64;
@@ -151,7 +152,7 @@ void test_wavenet_multiple_arrays()
 
   std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
   auto wavenet = std::make_unique<nam::wavenet::WaveNet>(
-    input_size, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    input_size, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 
   const int numFrames = 4;
   const int maxBufferSize = 64;
@@ -204,7 +205,7 @@ void test_wavenet_zero_input()
 
   std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
   auto wavenet = std::make_unique<nam::wavenet::WaveNet>(
-    input_size, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    input_size, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 
   const int numFrames = 4;
   wavenet->Reset(48000.0, numFrames);
@@ -256,7 +257,7 @@ void test_wavenet_different_buffer_sizes()
 
   std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
   auto wavenet = std::make_unique<nam::wavenet::WaveNet>(
-    input_size, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    input_size, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 
   // Test with different buffer sizes
   wavenet->Reset(48000.0, 64);
@@ -331,7 +332,7 @@ void test_wavenet_prewarm()
 
   std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
   auto wavenet = std::make_unique<nam::wavenet::WaveNet>(
-    input_size, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    input_size, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 
   // Test that prewarm can be called without errors
   wavenet->Reset(48000.0, 64);
diff --git a/tools/test/test_wavenet/test_layer_array.cpp b/tools/test/test_wavenet/test_layer_array.cpp
index f91ff98..0dc1fde 100644
--- a/tools/test/test_wavenet/test_layer_array.cpp
+++ b/tools/test/test_wavenet/test_layer_array.cpp
@@ -38,7 +38,7 @@ static nam::wavenet::_LayerArray make_layer_array(const int input_size, const in
   std::vector<int> dilations_copy = dilations; // Make a copy since we need to move it
   std::vector<int> kernel_sizes(dilations.size(), kernel_size);
   nam::wavenet::LayerArrayParams params(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations_copy),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations_copy),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups_input, groups_input_mixin,
     layer1x1_params, head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params,
     film_params, film_params, film_params, film_params, film_params);
@@ -223,7 +223,7 @@ void test_layer_array_different_activations()
   auto film_params = make_default_film_params();
   std::vector<int> kernel_sizes(dilations.size(), kernel_size);
   nam::wavenet::LayerArrayParams params(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups, groups_input_mixin, layer1x1_params,
     head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params, film_params,
     film_params, film_params, film_params, film_params);
@@ -306,7 +306,7 @@ void test_layer_array_different_activations()
     dilations_all_relu.size(), nam::activations::ActivationConfig{});
   std::vector<int> kernel_sizes_all_relu(dilations_all_relu.size(), kernel_size);
   nam::wavenet::LayerArrayParams params_all_relu(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes_all_relu),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes_all_relu),
     std::move(dilations_all_relu), std::move(all_relu_configs), std::move(all_none_gating_modes), head_bias, groups,
     groups_input_mixin, layer1x1_params, head1x1_params, std::move(all_empty_secondary_configs), film_params,
     film_params, film_params, film_params, film_params, film_params, film_params, film_params);
diff --git a/tools/test/test_wavenet/test_layer_head_config.cpp b/tools/test/test_wavenet/test_layer_head_config.cpp
new file mode 100644
index 0000000..847810a
--- /dev/null
+++ b/tools/test/test_wavenet/test_layer_head_config.cpp
@@ -0,0 +1,68 @@
+// Layer-array head JSON: legacy head_size/head_bias vs nested "head" (out_channels, kernel_size, bias)
+
+#include <cassert>
+#include <string>
+
+#include "json.hpp"
+
+#include "NAM/wavenet.h"
+
+namespace test_wavenet
+{
+namespace test_layer_head_config
+{
+
+void test_legacy_head_size_and_head_bias_implies_kernel_one()
+{
+  const std::string legacy_config_text = R"({
+    "layers": [{
+      "input_size": 1,
+      "condition_size": 1,
+      "head_size": 2,
+      "channels": 2,
+      "kernel_size": 1,
+      "dilations": [1],
+      "activation": "ReLU",
+      "head_bias": false
+    }],
+    "head_scale": 1.0
+  })";
+  // Only the flat legacy keys are present (no nested "head" object); the kernel size must come out as 1.
+  const nlohmann::json parsed_json = nlohmann::json::parse(legacy_config_text);
+  const auto wavenet_config = nam::wavenet::parse_config_json(parsed_json, 48000.0);
+  assert(wavenet_config.layer_array_params.size() == 1);
+  const auto& array_params = wavenet_config.layer_array_params[0];
+  assert(array_params.head_size == 2);
+  assert(array_params.head_kernel_size == 1);
+  assert(array_params.head_bias == false);
+}
+
+void test_nested_head_with_kernel_size_three()
+{
+  const std::string nested_config_text = R"({
+    "layers": [{
+      "input_size": 1,
+      "condition_size": 1,
+      "head": {"out_channels": 1, "kernel_size": 3, "bias": true},
+      "channels": 2,
+      "kernel_size": 1,
+      "dilations": [1],
+      "activation": "ReLU"
+    }],
+    "head_scale": 1.0
+  })";
+  // The nested "head" object supplies out_channels, kernel_size and bias in place of the legacy flat keys.
+  const nlohmann::json parsed_json = nlohmann::json::parse(nested_config_text);
+  const auto wavenet_config = nam::wavenet::parse_config_json(parsed_json, 48000.0);
+  assert(wavenet_config.layer_array_params.size() == 1);
+  const auto& array_params = wavenet_config.layer_array_params[0];
+  assert(array_params.head_size == 1);
+  assert(array_params.head_kernel_size == 3);
+  assert(array_params.head_bias == true);
+  // One k=1 layer adds nothing; the k=3 head rechannel contributes (3 - 1), hence 2.
+  nam::wavenet::_LayerArray layer_array(array_params);
+  assert(layer_array.get_receptive_field() == 2);
+}
+
+} // namespace test_layer_head_config
+} // namespace test_wavenet
diff --git a/tools/test/test_wavenet/test_output_head.cpp b/tools/test/test_wavenet/test_output_head.cpp
new file mode 100644
index 0000000..4762df5
--- /dev/null
+++ b/tools/test/test_wavenet/test_output_head.cpp
@@ -0,0 +1,189 @@
+// Tests for WaveNet post-stack head (Python ``Head`` module)
+
+#include <Eigen/Dense>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <optional>
+#include <vector>
+
+#include "NAM/wavenet.h"
+
+namespace test_wavenet
+{
+namespace test_output_head
+{
+
+static nam::wavenet::_FiLMParams make_inactive_film()
+{
+  return nam::wavenet::_FiLMParams(false, false); // both ctor flags false; presumably (active, shift) - TODO confirm
+}
+
+static nam::wavenet::LayerArrayParams make_layer_array_params(
+  const int input_size, const int condition_size, const int head_size, const int channels, const int bottleneck,
+  std::vector<int>&& kernel_sizes, std::vector<int>&& dilations,
+  const nam::activations::ActivationConfig& activation_config, const nam::wavenet::GatingMode gating_mode,
+  const bool head_bias, const int groups_input, const int groups_input_mixin,
+  const nam::wavenet::Layer1x1Params& layer1x1_params, const nam::wavenet::Head1x1Params& head1x1_params,
+  const nam::activations::ActivationConfig& secondary_activation_config)
+{
+  const int head_kernel_size = 1; // these tests keep the per-array head rechannel at kernel size 1
+  const auto no_film = make_inactive_film();
+  std::vector<nam::activations::ActivationConfig> activations_per_layer(dilations.size(), activation_config);
+  std::vector<nam::wavenet::GatingMode> gating_per_layer(dilations.size(), gating_mode);
+  std::vector<nam::activations::ActivationConfig> secondary_per_layer(dilations.size(), secondary_activation_config);
+  return nam::wavenet::LayerArrayParams(
+    input_size, condition_size, head_size, head_kernel_size, channels, bottleneck, std::move(kernel_sizes),
+    std::move(dilations), std::move(activations_per_layer), std::move(gating_per_layer), head_bias, groups_input,
+    groups_input_mixin, layer1x1_params, head1x1_params, std::move(secondary_per_layer), no_film, no_film,
+    no_film, no_film, no_film, no_film, no_film, no_film);
+}
+
+void test_post_stack_head_receptive_field()
+{
+  nam::wavenet::WaveNetHeadParams head_params;
+  head_params.kernel_sizes = {3, 5};
+  head_params.in_channels = 2;
+  head_params.channels = 3;
+  head_params.out_channels = 1;
+  head_params.activation_config = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::Tanh);
+  nam::wavenet::PostStackHead post_stack_head(head_params);
+  // Expected 7 = 1 + (3 - 1) + (5 - 1), matching the Python reference for kernel sizes {3, 5}.
+  assert(post_stack_head.receptive_field() == 7);
+}
+
+void test_wavenet_with_post_stack_head_processes()
+{
+  // Smoke test: a WaveNet built with a one-layer post-stack head must process a block and emit finite samples.
+  const int channels = 1;
+  const int bottleneck = channels;
+  const int input_size = 1;
+  const int condition_size = 1;
+  const int head_size = 1;
+  const int kernel_size = 1;
+  std::vector<int> dilations{1};
+  std::vector<int> kernel_sizes(dilations.size(), kernel_size);
+  const auto relu = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU);
+  const auto gating = nam::wavenet::GatingMode::NONE;
+  const bool head_bias = false;
+  const float head_scale = 0.5f;
+  const bool with_head = true;
+  const int groups = 1;
+  const int groups_input_mixin = 1;
+  nam::wavenet::Layer1x1Params layer1x1_params(true, 1);
+  nam::wavenet::Head1x1Params head1x1_params(false, channels, 1);
+  nam::activations::ActivationConfig no_secondary_activation{};
+  std::vector<nam::wavenet::LayerArrayParams> arrays;
+  arrays.push_back(make_layer_array_params(
+    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations), relu,
+    gating, head_bias, groups, groups_input_mixin, layer1x1_params, head1x1_params, no_secondary_activation));
+
+  nam::wavenet::WaveNetHeadParams head_params;
+  head_params.kernel_sizes = {1};
+  head_params.in_channels = 1;
+  head_params.channels = 1;
+  head_params.out_channels = 1;
+  head_params.activation_config = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::Tanh);
+
+  std::vector<float> weights{
+    1.0f, // Rechannel
+    1.0f, 0.0f, 1.0f, 1.0f, 0.0f, // Layer 0
+    1.0f, // Head rechannel
+    1.0f, // Post-stack conv weight (1x1)
+    0.0f, // Post-stack conv bias
+    head_scale};
+
+  std::unique_ptr<nam::wavenet::WaveNet> no_condition_dsp;
+  auto model = std::make_unique<nam::wavenet::WaveNet>(
+    input_size, arrays, head_scale, with_head, std::optional<nam::wavenet::WaveNetHeadParams>(std::move(head_params)),
+    std::move(weights), std::move(no_condition_dsp), 48000.0);
+
+  const int frame_count = 8;
+  const int max_buffer_size = 64;
+  model->Reset(48000.0, max_buffer_size);
+  model->prewarm();
+
+  std::vector<NAM_SAMPLE> input(frame_count, 0.1f);
+  std::vector<NAM_SAMPLE> output(frame_count, 0.0f);
+  NAM_SAMPLE* input_channels[] = {input.data()};
+  NAM_SAMPLE* output_channels[] = {output.data()};
+
+  model->process(input_channels, output_channels, frame_count);
+
+  for (const NAM_SAMPLE sample : output)
+    assert(std::isfinite(sample));
+}
+
+void test_wavenet_with_two_layer_post_stack_head_applies_activation_per_layer_input()
+{
+  // Regression for multi-layer post-stack head execution: each head layer must apply its activation to that
+  // layer's own input. The drive has to reach the head as a non-zero value, otherwise every stage is trivially
+  // 0 and the final assertion cannot tell a correct chain from a broken one; hence the positive test input.
+  const int input_size = 1;
+  const int condition_size = 1;
+  const int head_size = 1;
+  const int channels = 1;
+  const int bottleneck = channels;
+  const int kernel_size = 1;
+  std::vector<int> dilations{1};
+  std::vector<int> kernel_sizes(dilations.size(), kernel_size);
+  const auto activation = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU);
+  const nam::wavenet::GatingMode gating_mode = nam::wavenet::GatingMode::NONE;
+  const bool head_bias = false;
+  const float head_scale = 1.0f;
+  const bool with_head = true;
+  const int groups = 1;
+  const int groups_input_mixin = 1;
+  nam::wavenet::Layer1x1Params layer1x1_params(true, 1);
+  nam::wavenet::Head1x1Params head1x1_params(false, channels, 1);
+  nam::activations::ActivationConfig empty_config{};
+  nam::wavenet::LayerArrayParams layer_params = make_layer_array_params(
+    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
+    activation, gating_mode, head_bias, groups, groups_input_mixin, layer1x1_params, head1x1_params, empty_config);
+  std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
+  layer_array_params.push_back(std::move(layer_params));
+
+  nam::wavenet::WaveNetHeadParams hp;
+  hp.in_channels = 1;
+  hp.channels = 1;
+  hp.out_channels = 1;
+  hp.kernel_sizes = {1, 1};
+  hp.activation_config = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU);
+
+  std::vector<float> weights;
+  // Main WaveNet (single 1x1 layer array, ReLU activation); assumed to hand the head ReLU(2 * x) - TODO confirm:
+  weights.push_back(1.0f); // Rechannel weight
+  weights.insert(weights.end(), {1.0f, 0.0f, 1.0f, 1.0f, 0.0f}); // Layer 0 weights
+  weights.push_back(1.0f); // Head rechannel weight
+  // Post-stack head (2x [ReLU -> Conv1d(k=1)]):
+  // First conv: y = -1*x + 0
+  // Second conv: y = 2*x + 0
+  // For a positive head input h: -h after the first conv, 0 after the second ReLU, so the output must be 0.
+  weights.push_back(-1.0f); // Head layer 0 conv weight
+  weights.push_back(0.0f); // Head layer 0 conv bias
+  weights.push_back(2.0f); // Head layer 1 conv weight
+  weights.push_back(0.0f); // Head layer 1 conv bias
+  weights.push_back(head_scale);
+
+  std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
+  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head,
+                                                         std::optional<nam::wavenet::WaveNetHeadParams>(std::move(hp)),
+                                                         std::move(weights), std::move(condition_dsp), 48000.0);
+
+  const int numFrames = 8;
+  const int maxBufferSize = 64;
+  wavenet->Reset(48000.0, maxBufferSize);
+  wavenet->prewarm();
+
+  std::vector<NAM_SAMPLE> input(numFrames, 0.25f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  wavenet->process(inputPtrs, outputPtrs, numFrames);
+
+  for (int i = 0; i < numFrames; i++)
+    assert(std::fabs(output[i]) < 1.0e-6f);
+}
+
+} // namespace test_output_head
+} // namespace test_wavenet
diff --git a/tools/test/test_wavenet/test_real_time_safe.cpp b/tools/test/test_wavenet/test_real_time_safe.cpp
index 9bea8e9..b590551 100644
--- a/tools/test/test_wavenet/test_real_time_safe.cpp
+++ b/tools/test/test_wavenet/test_real_time_safe.cpp
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <functional>
 #include <iostream>
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -60,7 +61,7 @@ static nam::wavenet::_LayerArray make_layer_array(const int input_size, const in
   std::vector<int> dilations_copy = dilations; // Make a copy since we need to move it
   std::vector<int> kernel_sizes(dilations.size(), kernel_size);
   nam::wavenet::LayerArrayParams params(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations_copy),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations_copy),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups_input, groups_input_mixin,
     layer1x1_params, head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params,
     film_params, film_params, film_params, film_params, film_params);
@@ -83,7 +84,7 @@ static nam::wavenet::LayerArrayParams make_layer_array_params(
   std::vector<nam::activations::ActivationConfig> secondary_activation_configs(
     dilations.size(), secondary_activation_config);
   return nam::wavenet::LayerArrayParams(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups_input, groups_input_mixin,
     layer1x1_params, head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params,
     film_params, film_params, film_params, film_params, film_params);
@@ -1032,7 +1033,7 @@ void test_process_realtime_safe()
 
   std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
   auto wavenet = std::make_unique<nam::wavenet::WaveNet>(
-    input_size, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    input_size, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 
   const int maxBufferSize = 256;
   wavenet->Reset(48000.0, maxBufferSize);
@@ -1154,7 +1155,7 @@ void test_process_3in_2out_realtime_safe()
   const int in_channels = 3;
   std::unique_ptr<nam::wavenet::WaveNet> condition_dsp = nullptr;
   auto wavenet = std::make_unique<nam::wavenet::WaveNet>(
-    in_channels, layer_array_params, head_scale, with_head, weights, std::move(condition_dsp), 48000.0);
+    in_channels, layer_array_params, head_scale, with_head, std::nullopt, weights, std::move(condition_dsp), 48000.0);
 
   const int maxBufferSize = 256;
   wavenet->Reset(48000.0, maxBufferSize);
@@ -1194,4 +1195,78 @@ void test_process_3in_2out_realtime_safe()
     }
   }
 }
+
+// Real-time safety: process() on a WaveNet carrying a two-layer PostStackHead must neither allocate nor free.
+void test_process_with_post_stack_head_realtime_safe()
+{
+  const int channels = 1;
+  const int bottleneck = channels;
+  const int input_size = 1;
+  const int condition_size = 1;
+  const int head_size = 1;
+  const int kernel_size = 1;
+  std::vector<int> dilations{1};
+  std::vector<int> kernel_sizes(dilations.size(), kernel_size);
+  const auto relu = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU);
+  const auto gating = nam::wavenet::GatingMode::NONE;
+  const bool head_bias = false;
+  const float head_scale = 1.0f;
+  const bool with_head = true;
+  const int groups = 1;
+  const int groups_input_mixin = 1;
+
+  nam::wavenet::Layer1x1Params layer1x1_params(true, 1);
+  nam::wavenet::Head1x1Params head1x1_params(false, channels, 1);
+  std::vector<nam::wavenet::LayerArrayParams> arrays;
+  arrays.push_back(make_layer_array_params(
+    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations), relu,
+    gating, head_bias, groups, groups_input_mixin, layer1x1_params, head1x1_params,
+    nam::activations::ActivationConfig{}));
+
+  nam::wavenet::WaveNetHeadParams post_head;
+  post_head.kernel_sizes = {1, 1};
+  post_head.in_channels = 1;
+  post_head.channels = 1;
+  post_head.out_channels = 1;
+  post_head.activation_config = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU);
+
+  std::vector<float> weights{
+    1.0f, // Rechannel
+    1.0f, 0.0f, 1.0f, 1.0f, 0.0f, // Layer 0
+    1.0f, // Head rechannel
+    -1.0f, // Post-stack head layer 0 conv weight
+    0.0f, // Post-stack head layer 0 conv bias
+    2.0f, // Post-stack head layer 1 conv weight
+    0.0f, // Post-stack head layer 1 conv bias
+    head_scale};
+
+  std::unique_ptr<nam::wavenet::WaveNet> no_condition_dsp;
+  auto model = std::make_unique<nam::wavenet::WaveNet>(
+    input_size, arrays, head_scale, with_head,
+    std::optional<nam::wavenet::WaveNetHeadParams>(std::move(post_head)), std::move(weights),
+    std::move(no_condition_dsp), 48000.0);
+
+  const int max_buffer_size = 256;
+  model->Reset(48000.0, max_buffer_size);
+
+  const std::vector<int> block_sizes{1, 8, 16, 32, 64, 128, 256};
+  for (const int frames : block_sizes)
+  {
+    std::vector<NAM_SAMPLE> input(frames, -0.25f);
+    std::vector<NAM_SAMPLE> output(frames, 0.0f);
+
+    const std::string label = "WaveNet process (post-stack head) - Buffer size " + std::to_string(frames);
+    // Both sample vectors exist before the checked call; the callable only builds pointer arrays on the stack.
+    const auto process_block = [&]() {
+      NAM_SAMPLE* input_channels[] = {input.data()};
+      NAM_SAMPLE* output_channels[] = {output.data()};
+      model->process(input_channels, output_channels, frames);
+    };
+    run_allocation_test_no_allocations(
+      nullptr, process_block, nullptr, label.c_str());
+
+    for (const NAM_SAMPLE sample : output)
+      assert(std::isfinite(sample));
+  }
+}
 } // namespace test_wavenet
diff --git a/tools/test/test_wavenet_configurable_gating.cpp b/tools/test/test_wavenet_configurable_gating.cpp
index ee07d94..62d5a17 100644
--- a/tools/test/test_wavenet_configurable_gating.cpp
+++ b/tools/test/test_wavenet_configurable_gating.cpp
@@ -49,7 +49,7 @@ static nam::wavenet::LayerArrayParams make_layer_array_params(
   std::vector<nam::activations::ActivationConfig> secondary_activation_configs(
     dilations.size(), secondary_activation_config);
   return nam::wavenet::LayerArrayParams(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups_input, groups_input_mixin,
     layer1x1_params, head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params,
     film_params, film_params, film_params, film_params, film_params);
@@ -75,7 +75,7 @@ static nam::wavenet::_LayerArray make_layer_array(const int input_size, const in
   std::vector<int> dilations_copy = dilations; // Make a copy since we need to move it
   std::vector<int> kernel_sizes(dilations.size(), kernel_size);
   nam::wavenet::LayerArrayParams params(
-    input_size, condition_size, head_size, channels, bottleneck, std::move(kernel_sizes), std::move(dilations_copy),
+    input_size, condition_size, head_size, 1, channels, bottleneck, std::move(kernel_sizes), std::move(dilations_copy),
     std::move(activation_configs), std::move(gating_modes), head_bias, groups_input, groups_input_mixin,
     layer1x1_params, head1x1_params, std::move(secondary_activation_configs), film_params, film_params, film_params,
     film_params, film_params, film_params, film_params, film_params);