Realtime Compositor: Implement Ghost Glare node

This patch implements the Ghost Glare node. It is implemented using
direct convolution as opposed to a recursive one, which produces
slightly different, though more accurate, results. However, since the
ghosts are attenuated where it matters, the difference is barely
visible and is acceptable as far as I can tell.

A possible performance improvement is to implement all passes in a
single shader dispatch, where an array of all scales and color
modulators is computed recursively on the host and then used in the
shader to add all ghosts, avoiding the use of global memory and
unnecessary copies. This optimization will be implemented separately.

Differential Revision: https://developer.blender.org/D16641

Reviewed By: Clement Foucault
Omar Emara 2022-12-09 16:50:52 +02:00
parent a45284b855
commit fa27a5d066
13 changed files with 786 additions and 98 deletions


@@ -60,8 +60,10 @@ set(SRC
COM_utilities.hh
algorithms/intern/algorithm_parallel_reduction.cc
algorithms/intern/symmetric_separable_blur.cc
algorithms/COM_algorithm_parallel_reduction.hh
algorithms/COM_algorithm_symmetric_separable_blur.hh
cached_resources/intern/morphological_distance_feather_weights.cc
cached_resources/intern/symmetric_blur_weights.cc
@@ -96,6 +98,10 @@ set(GLSL_SRC
shaders/compositor_ellipse_mask.glsl
shaders/compositor_filter.glsl
shaders/compositor_flip.glsl
shaders/compositor_glare_ghost_accumulate.glsl
shaders/compositor_glare_ghost_base.glsl
shaders/compositor_glare_highlights.glsl
shaders/compositor_glare_mix.glsl
shaders/compositor_image_crop.glsl
shaders/compositor_morphological_distance.glsl
shaders/compositor_morphological_distance_feather.glsl
@@ -181,6 +187,7 @@ set(SRC_SHADER_CREATE_INFOS
shaders/infos/compositor_ellipse_mask_info.hh
shaders/infos/compositor_filter_info.hh
shaders/infos/compositor_flip_info.hh
shaders/infos/compositor_glare_info.hh
shaders/infos/compositor_image_crop_info.hh
shaders/infos/compositor_morphological_distance_feather_info.hh
shaders/infos/compositor_morphological_distance_info.hh


@@ -105,6 +105,11 @@ class Result {
* and release the result's texture. */
Result(ResultType type, TexturePool &texture_pool);
/* Identical to the standard constructor but initializes the reference count to 1. This is useful
* to construct temporary results that are created and released by the developer manually, which
* are typically used in operations that need temporary intermediate results. */
static Result Temporary(ResultType type, TexturePool &texture_pool);
/* Declare the result to be a texture result, allocate a texture of an appropriate type with
* the size of the given domain from the result's texture pool, and set the domain of the result
* to the given domain. */
@@ -125,8 +130,9 @@ class Result {
void bind_as_texture(GPUShader *shader, const char *texture_name) const;
/* Bind the texture of the result to the image unit with the given name in the currently bound
* given shader. */
void bind_as_image(GPUShader *shader, const char *image_name) const;
* given shader. If read is true, a memory barrier will be inserted for image reads to ensure any
* prior writes to the image are reflected before reading from it. */
void bind_as_image(GPUShader *shader, const char *image_name, bool read = false) const;
/* Unbind the texture which was previously bound using bind_as_texture. */
void unbind_as_texture() const;


@@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "BLI_math_vec_types.hh"
#include "COM_context.hh"
#include "COM_result.hh"
namespace blender::realtime_compositor {
/* Blur the input using horizontal and vertical separable blur passes given a certain radius and
* filter type using SymmetricSeparableBlurWeights. The output is written to the given output
* result, which will be allocated internally and is thus expected not to be previously allocated.
* If extend_bounds is true, the output will have an extra radius amount of pixels on the boundary
* of the image, where blurring can take place assuming fully transparent out-of-bounds values. If
* gamma_correct is true, the input will be gamma corrected before blurring and then uncorrected
* after blurring, using a gamma coefficient of 2. */
void symmetric_separable_blur(Context &context,
Result &input,
Result &output,
float2 radius,
int filter_type,
bool extend_bounds,
bool gamma_correct);
} // namespace blender::realtime_compositor


@@ -0,0 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_math_base.hh"
#include "BLI_math_vec_types.hh"
#include "BLI_math_vector.hh"
#include "GPU_shader.h"
#include "GPU_texture.h"
#include "COM_context.hh"
#include "COM_utilities.hh"
#include "COM_algorithm_symmetric_separable_blur.hh"
#include "COM_symmetric_separable_blur_weights.hh"
namespace blender::realtime_compositor {
static Result horizontal_pass(Context &context,
Result &input,
float radius,
int filter_type,
bool extend_bounds,
bool gamma_correct)
{
GPUShader *shader = context.shader_manager().get("compositor_symmetric_separable_blur");
GPU_shader_bind(shader);
GPU_shader_uniform_1b(shader, "extend_bounds", extend_bounds);
GPU_shader_uniform_1b(shader, "gamma_correct_input", gamma_correct);
GPU_shader_uniform_1b(shader, "gamma_uncorrect_output", false);
input.bind_as_texture(shader, "input_tx");
const SymmetricSeparableBlurWeights &weights =
context.cache_manager().get_symmetric_separable_blur_weights(filter_type, radius);
weights.bind_as_texture(shader, "weights_tx");
Domain domain = input.domain();
if (extend_bounds) {
domain.size.x += int(math::ceil(radius)) * 2;
}
/* We allocate an output image of a transposed size, that is, with a height equivalent to the
* width of the input and vice versa. This is done as a performance optimization. The shader
* will blur the image horizontally and write it to the intermediate output transposed. Then
* the vertical pass will execute the same horizontal blur shader, but since its input is
* transposed, it will effectively do a vertical blur and write to the output transposed,
* effectively undoing the transposition in the horizontal pass. This is done to improve
* spatial cache locality in the shader and to avoid having two separate shaders for each blur
* pass. */
const int2 transposed_domain = int2(domain.size.y, domain.size.x);
Result output = Result::Temporary(ResultType::Color, context.texture_pool());
output.allocate_texture(transposed_domain);
output.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, domain.size);
GPU_shader_unbind();
input.unbind_as_texture();
weights.unbind_as_texture();
output.unbind_as_image();
return output;
}
static void vertical_pass(Context &context,
Result &original_input,
Result &horizontal_pass_result,
Result &output,
float2 radius,
int filter_type,
bool extend_bounds,
bool gamma_correct)
{
GPUShader *shader = context.shader_manager().get("compositor_symmetric_separable_blur");
GPU_shader_bind(shader);
GPU_shader_uniform_1b(shader, "extend_bounds", extend_bounds);
GPU_shader_uniform_1b(shader, "gamma_correct_input", false);
GPU_shader_uniform_1b(shader, "gamma_uncorrect_output", gamma_correct);
horizontal_pass_result.bind_as_texture(shader, "input_tx");
const SymmetricSeparableBlurWeights &weights =
context.cache_manager().get_symmetric_separable_blur_weights(filter_type, radius.y);
weights.bind_as_texture(shader, "weights_tx");
Domain domain = original_input.domain();
if (extend_bounds) {
/* Add a radius amount of pixels on both sides of the image, hence the multiplication by 2. */
domain.size += int2(math::ceil(radius)) * 2;
}
output.allocate_texture(domain);
output.bind_as_image(shader, "output_img");
/* Notice that the domain is transposed, see the note on the horizontal pass method for more
* information on the reasoning behind this. */
compute_dispatch_threads_at_least(shader, int2(domain.size.y, domain.size.x));
GPU_shader_unbind();
horizontal_pass_result.unbind_as_texture();
output.unbind_as_image();
weights.unbind_as_texture();
}
void symmetric_separable_blur(Context &context,
Result &input,
Result &output,
float2 radius,
int filter_type,
bool extend_bounds,
bool gamma_correct)
{
Result horizontal_pass_result = horizontal_pass(
context, input, radius.x, filter_type, extend_bounds, gamma_correct);
vertical_pass(context,
input,
horizontal_pass_result,
output,
radius,
filter_type,
extend_bounds,
gamma_correct);
horizontal_pass_result.release();
}
} // namespace blender::realtime_compositor


@@ -18,6 +18,13 @@ Result::Result(ResultType type, TexturePool &texture_pool)
{
}
Result Result::Temporary(ResultType type, TexturePool &texture_pool)
{
Result result = Result(type, texture_pool);
result.increment_reference_count();
return result;
}
void Result::allocate_texture(Domain domain)
{
is_single_value_ = false;
@@ -79,8 +86,13 @@ void Result::bind_as_texture(GPUShader *shader, const char *texture_name) const
GPU_texture_bind(texture_, texture_image_unit);
}
void Result::bind_as_image(GPUShader *shader, const char *image_name) const
void Result::bind_as_image(GPUShader *shader, const char *image_name, bool read) const
{
/* Make sure any prior writes to the texture are reflected before reading from it. */
if (read) {
GPU_memory_barrier(GPU_BARRIER_SHADER_IMAGE_ACCESS);
}
const int image_unit = GPU_shader_get_texture_binding(shader, image_name);
GPU_texture_image_bind(texture_, image_unit);
}


@@ -0,0 +1,37 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
ivec2 input_size = texture_size(input_ghost_tx);
/* Add 0.5 to evaluate the input sampler at the center of the pixel and divide by the image size
* to get the coordinates into the sampler's expected [0, 1] range. */
vec2 coordinates = (vec2(texel) + vec2(0.5)) / input_size;
/* We accumulate four variants of the input ghost texture, each is scaled by some amount and
* possibly multiplied by some color as a form of color modulation. */
vec4 accumulated_ghost = vec4(0.0);
for (int i = 0; i < 4; i++) {
float scale = scales[i];
vec4 color_modulator = color_modulators[i];
/* Scale the coordinates for the ghost, pre subtract 0.5 and post add 0.5 to use 0.5 as the
* origin of the scaling. */
vec2 scaled_coordinates = (coordinates - 0.5) * scale + 0.5;
/* The value of the ghost is attenuated by a scalar multiple of the inverse distance to the
* center, such that it is maximum at the center and becomes zero further from the center,
* making sure to take the scale into account. The scalar multiple of 1 / 4 is chosen using
* visual judgement. */
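/* For example, at the image center the attenuator is 1.0 / 4.0, and it falls linearly to zero
* once the product of distance_to_center and the absolute scale reaches 1.0. */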
float distance_to_center = distance(coordinates, vec2(0.5)) * 2.0;
float attenuator = max(0.0, 1.0 - distance_to_center * abs(scale)) / 4.0;
/* Accumulate the scaled ghost after attenuating and color modulating its value. */
vec4 multiplier = attenuator * color_modulator;
accumulated_ghost += texture(input_ghost_tx, scaled_coordinates) * multiplier;
}
vec4 current_accumulated_ghost = imageLoad(accumulated_ghost_img, texel);
imageStore(accumulated_ghost_img, texel, current_accumulated_ghost + accumulated_ghost);
}


@@ -0,0 +1,37 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
ivec2 input_size = texture_size(small_ghost_tx);
/* Add 0.5 to evaluate the input sampler at the center of the pixel and divide by the image size
* to get the coordinates into the sampler's expected [0, 1] range. */
vec2 coordinates = (vec2(texel) + vec2(0.5)) / input_size;
/* The small ghost is scaled down with the origin as the center of the image by a factor of 2.13,
* while the big ghost is flipped and scaled up with the origin as the center of the image by a
* factor of 0.97. Note that 1) The negative scale implements the flipping. 2) Factors larger
* than 1 actually scale down the image since the factor multiplies the coordinates and not the
* image itself. 3) The values are arbitrarily chosen using visual judgement. */
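/* As an illustration of point 2, a coordinate 0.25 away from the center maps to 0.25 * 2.13 =
* 0.5325 away from the center, so each output pixel samples input content that lies further out,
* which pulls the ghost inward and shrinks it. */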
float small_ghost_scale = 2.13;
float big_ghost_scale = -0.97;
/* Scale the coordinates for the small and big ghosts, pre subtract 0.5 and post add 0.5 to use
* 0.5 as the origin of the scaling. Notice that the big ghost is flipped due to the negative
* scale. */
vec2 small_ghost_coordinates = (coordinates - 0.5) * small_ghost_scale + 0.5;
vec2 big_ghost_coordinates = (coordinates - 0.5) * big_ghost_scale + 0.5;
/* The values of the ghosts are attenuated by the inverse distance to the center, such that they
* are maximum at the center and become zero further from the center, making sure to take the
* aforementioned scale into account. */
float distance_to_center = distance(coordinates, vec2(0.5)) * 2.0;
float small_ghost_attenuator = max(0.0, 1.0 - distance_to_center * small_ghost_scale);
float big_ghost_attenuator = max(0.0, 1.0 - distance_to_center * abs(big_ghost_scale));
vec4 small_ghost = texture(small_ghost_tx, small_ghost_coordinates) * small_ghost_attenuator;
vec4 big_ghost = texture(big_ghost_tx, big_ghost_coordinates) * big_ghost_attenuator;
imageStore(combined_ghost_img, texel, small_ghost + big_ghost);
}


@@ -0,0 +1,31 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
/* The dispatch domain covers the output image size, which might be a fraction of the input image
* size, so you will notice the output image size used throughout the shader instead of the input
* one. */
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
/* Since the output image might be a fraction of the input image size, and since we want to
* evaluate the input sampler at the center of the output pixel, we add an offset equal to half
* the number of input pixels that covers a single output pixel. In case the input and output
* have the same size, this will be 0.5, which is the offset required to evaluate the sampler at
* the center of the pixel. */
vec2 offset = (texture_size(input_tx) / imageSize(output_img)) / 2.0;
/* Add the aforementioned offset and divide by the output image size to get the coordinates into
* the sampler's expected [0, 1] range. */
vec2 normalized_coordinates = (vec2(texel) + offset) / imageSize(output_img);
vec4 input_color = texture(input_tx, normalized_coordinates);
float luminance = dot(input_color.rgb, luminance_coefficients);
/* A pixel whose luminance is less than the threshold luminance is not considered part of the
* highlights and is given a value of zero. Otherwise, the pixel is considered part of the
* highlights, and its value is the difference to the threshold value clamped to a lower bound
* of zero. */
bool is_highlights = luminance >= threshold;
vec3 highlights = is_highlights ? max(vec3(0.0), input_color.rgb - threshold) : vec3(0.0);
imageStore(output_img, texel, vec4(highlights, 1.0));
}


@@ -0,0 +1,28 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
/* Add 0.5 to evaluate the input sampler at the center of the pixel and divide by the input image
* size to get the relevant coordinates into the sampler's expected [0, 1] range. Make sure the
* input color is not negative to avoid a subtractive effect when mixing the glare. */
vec2 normalized_coordinates = (vec2(texel) + vec2(0.5)) / texture_size(input_tx);
vec4 glare_color = texture(glare_tx, normalized_coordinates);
vec4 input_color = max(vec4(0.0), texture_load(input_tx, texel));
/* The mix factor is in the range [-1, 1] and linearly interpolates between the three values such
* that:
* 1 => Glare only.
* 0 => Input + Glare.
* -1 => Input only.
* We implement that as a weighted sum as follows. When the mix factor is 1, the glare weight
* should be 1 and the input weight should be 0. When the mix factor is -1, the glare weight
* should be 0 and the input weight should be 1. When the mix factor is 0, both weights should
* be 1. This can be expressed using the following compact min max expressions. */
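/* For example, a mix factor of 0.5 gives an input weight of 0.5 and a glare weight of 1.0, while
* a mix factor of -0.5 gives an input weight of 1.0 and a glare weight of 0.5. */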
float input_weight = 1.0 - max(0.0, mix_factor);
float glare_weight = 1.0 + min(0.0, mix_factor);
vec3 highlights = input_weight * input_color.rgb + glare_weight * glare_color.rgb;
imageStore(output_img, texel, vec4(highlights, input_color.a));
}


@@ -0,0 +1,46 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include "gpu_shader_create_info.hh"
/* -------
* Common.
* ------- */
GPU_SHADER_CREATE_INFO(compositor_glare_highlights)
.local_group_size(16, 16)
.push_constant(Type::FLOAT, "threshold")
.push_constant(Type::VEC3, "luminance_coefficients")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_glare_highlights.glsl")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_glare_mix)
.local_group_size(16, 16)
.push_constant(Type::FLOAT, "mix_factor")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "glare_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_glare_mix.glsl")
.do_static_compilation(true);
/* ------------
* Ghost Glare.
* ------------ */
GPU_SHADER_CREATE_INFO(compositor_glare_ghost_base)
.local_group_size(16, 16)
.sampler(0, ImageType::FLOAT_2D, "small_ghost_tx")
.sampler(1, ImageType::FLOAT_2D, "big_ghost_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "combined_ghost_img")
.compute_source("compositor_glare_ghost_base.glsl")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_glare_ghost_accumulate)
.local_group_size(16, 16)
.push_constant(Type::VEC4, "scales")
.push_constant(Type::VEC4, "color_modulators", 4)
.sampler(0, ImageType::FLOAT_2D, "input_ghost_tx")
.image(0, GPU_RGBA16F, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "accumulated_ghost_img")
.compute_source("compositor_glare_ghost_accumulate.glsl")
.do_static_compilation(true);


@@ -2066,6 +2066,14 @@ typedef enum CMPNodeTrackPositionMode {
CMP_NODE_TRACK_POSITION_ABSOLUTE_FRAME = 3,
} CMPNodeTrackPositionMode;
/* Glare Node. Stored in NodeGlare.type. */
typedef enum CMPNodeGlareType {
CMP_NODE_GLARE_SIMPLE_STAR = 0,
CMP_NODE_GLARE_FOG_GLOW = 1,
CMP_NODE_GLARE_STREAKS = 2,
CMP_NODE_GLARE_GHOST = 3,
} CMPNodeGlareType;
/* Plane track deform node. */
enum {


@@ -19,9 +19,9 @@
#include "GPU_state.h"
#include "GPU_texture.h"
#include "COM_algorithm_symmetric_separable_blur.hh"
#include "COM_node_operation.hh"
#include "COM_symmetric_blur_weights.hh"
#include "COM_symmetric_separable_blur_weights.hh"
#include "COM_utilities.hh"
#include "node_composite_util.hh"
@@ -34,8 +34,14 @@ NODE_STORAGE_FUNCS(NodeBlurData)
static void cmp_node_blur_declare(NodeDeclarationBuilder &b)
{
b.add_input<decl::Color>(N_("Image")).default_value({1.0f, 1.0f, 1.0f, 1.0f});
b.add_input<decl::Float>(N_("Size")).default_value(1.0f).min(0.0f).max(1.0f);
b.add_input<decl::Color>(N_("Image"))
.default_value({1.0f, 1.0f, 1.0f, 1.0f})
.compositor_domain_priority(0);
b.add_input<decl::Float>(N_("Size"))
.default_value(1.0f)
.min(0.0f)
.max(1.0f)
.compositor_domain_priority(1);
b.add_output<decl::Color>(N_("Image"));
}
@@ -101,8 +107,13 @@ class BlurOperation : public NodeOperation {
}
if (use_separable_filter()) {
GPUTexture *horizontal_pass_result = execute_separable_blur_horizontal_pass();
execute_separable_blur_vertical_pass(horizontal_pass_result);
symmetric_separable_blur(context(),
get_input("Image"),
get_result("Image"),
compute_blur_radius(),
node_storage(bnode()).filtertype,
get_extend_bounds(),
node_storage(bnode()).gamma);
}
else {
execute_blur();
@@ -144,94 +155,6 @@ class BlurOperation : public NodeOperation {
weights.unbind_as_texture();
}
GPUTexture *execute_separable_blur_horizontal_pass()
{
GPUShader *shader = shader_manager().get("compositor_symmetric_separable_blur");
GPU_shader_bind(shader);
GPU_shader_uniform_1b(shader, "extend_bounds", get_extend_bounds());
GPU_shader_uniform_1b(shader, "gamma_correct_input", node_storage(bnode()).gamma);
GPU_shader_uniform_1b(shader, "gamma_uncorrect_output", false);
const Result &input_image = get_input("Image");
input_image.bind_as_texture(shader, "input_tx");
const float2 blur_radius = compute_blur_radius();
const SymmetricSeparableBlurWeights &weights =
context().cache_manager().get_symmetric_separable_blur_weights(
node_storage(bnode()).filtertype, blur_radius.x);
weights.bind_as_texture(shader, "weights_tx");
Domain domain = compute_domain();
if (get_extend_bounds()) {
domain.size.x += int(math::ceil(blur_radius.x)) * 2;
}
/* We allocate an output image of a transposed size, that is, with a height equivalent to the
* width of the input and vice versa. This is done as a performance optimization. The shader
* will blur the image horizontally and write it to the intermediate output transposed. Then
* the vertical pass will execute the same horizontal blur shader, but since its input is
* transposed, it will effectively do a vertical blur and write to the output transposed,
* effectively undoing the transposition in the horizontal pass. This is done to improve
* spatial cache locality in the shader and to avoid having two separate shaders for each blur
* pass. */
const int2 transposed_domain = int2(domain.size.y, domain.size.x);
GPUTexture *horizontal_pass_result = texture_pool().acquire_color(transposed_domain);
const int image_unit = GPU_shader_get_texture_binding(shader, "output_img");
GPU_texture_image_bind(horizontal_pass_result, image_unit);
compute_dispatch_threads_at_least(shader, domain.size);
GPU_shader_unbind();
input_image.unbind_as_texture();
weights.unbind_as_texture();
GPU_texture_image_unbind(horizontal_pass_result);
return horizontal_pass_result;
}
void execute_separable_blur_vertical_pass(GPUTexture *horizontal_pass_result)
{
GPUShader *shader = shader_manager().get("compositor_symmetric_separable_blur");
GPU_shader_bind(shader);
GPU_shader_uniform_1b(shader, "extend_bounds", get_extend_bounds());
GPU_shader_uniform_1b(shader, "gamma_correct_input", false);
GPU_shader_uniform_1b(shader, "gamma_uncorrect_output", node_storage(bnode()).gamma);
GPU_memory_barrier(GPU_BARRIER_TEXTURE_FETCH);
const int texture_image_unit = GPU_shader_get_texture_binding(shader, "input_tx");
GPU_texture_bind(horizontal_pass_result, texture_image_unit);
const float2 blur_radius = compute_blur_radius();
const SymmetricSeparableBlurWeights &weights =
context().cache_manager().get_symmetric_separable_blur_weights(
node_storage(bnode()).filtertype, blur_radius.y);
weights.bind_as_texture(shader, "weights_tx");
Domain domain = compute_domain();
if (get_extend_bounds()) {
/* Add a radius amount of pixels in both sides of the image, hence the multiply by 2. */
domain.size += int2(math::ceil(compute_blur_radius())) * 2;
}
Result &output_image = get_result("Image");
output_image.allocate_texture(domain);
output_image.bind_as_image(shader, "output_img");
/* Notice that the domain is transposed, see the note on the horizontal pass method for more
* information on the reasoning behind this. */
compute_dispatch_threads_at_least(shader, int2(domain.size.y, domain.size.x));
GPU_shader_unbind();
output_image.unbind_as_image();
weights.unbind_as_texture();
GPU_texture_unbind(horizontal_pass_result);
}
float2 compute_blur_radius()
{
const float size = math::clamp(get_input("Size").get_float_value_default(1.0f), 0.0f, 1.0f);


@@ -5,20 +5,40 @@
* \ingroup cmpnodes
*/
#include <array>
#include "BLI_assert.h"
#include "BLI_index_range.hh"
#include "BLI_math_vec_types.hh"
#include "DNA_scene_types.h"
#include "RNA_access.h"
#include "UI_interface.h"
#include "UI_resources.h"
#include "IMB_colormanagement.h"
#include "GPU_shader.h"
#include "GPU_state.h"
#include "GPU_texture.h"
#include "COM_algorithm_symmetric_separable_blur.hh"
#include "COM_node_operation.hh"
#include "COM_utilities.hh"
#include "node_composite_util.hh"
namespace blender::nodes::node_composite_glare_cc {
NODE_STORAGE_FUNCS(NodeGlare)
static void cmp_node_glare_declare(NodeDeclarationBuilder &b)
{
b.add_input<decl::Color>(N_("Image")).default_value({1.0f, 1.0f, 1.0f, 1.0f});
b.add_input<decl::Color>(N_("Image"))
.default_value({1.0f, 1.0f, 1.0f, 1.0f})
.compositor_domain_priority(0);
b.add_output<decl::Color>(N_("Image"));
}
@@ -85,7 +105,381 @@ class GlareOperation : public NodeOperation {
void execute() override
{
get_input("Image").pass_through(get_result("Image"));
if (is_identity()) {
get_input("Image").pass_through(get_result("Image"));
return;
}
Result highlights_result = execute_highlights();
Result glare_result = execute_glare(highlights_result);
execute_mix(glare_result);
}
bool is_identity()
{
if (get_input("Image").is_single_value()) {
return true;
}
/* A mix factor of -1 indicates that the original image is returned as is. See the execute_mix
* method for more information. */
if (node_storage(bnode()).mix == -1.0f) {
return true;
}
/* Only the ghost operation is currently supported. */
switch (node_storage(bnode()).type) {
case CMP_NODE_GLARE_SIMPLE_STAR:
return true;
case CMP_NODE_GLARE_FOG_GLOW:
return true;
case CMP_NODE_GLARE_STREAKS:
return true;
case CMP_NODE_GLARE_GHOST:
return false;
default:
BLI_assert_unreachable();
return true;
}
return false;
}
Result execute_glare(Result &highlights_result)
{
switch (node_storage(bnode()).type) {
case CMP_NODE_GLARE_SIMPLE_STAR:
return execute_simple_star(highlights_result);
case CMP_NODE_GLARE_FOG_GLOW:
return execute_fog_glow(highlights_result);
case CMP_NODE_GLARE_STREAKS:
return execute_streaks(highlights_result);
case CMP_NODE_GLARE_GHOST:
return execute_ghost(highlights_result);
default:
BLI_assert_unreachable();
return Result(ResultType::Color, texture_pool());
}
}
/* -----------------
* Glare Highlights.
* ----------------- */
Result execute_highlights()
{
GPUShader *shader = shader_manager().get("compositor_glare_highlights");
GPU_shader_bind(shader);
float luminance_coefficients[3];
IMB_colormanagement_get_luminance_coefficients(luminance_coefficients);
GPU_shader_uniform_3fv(shader, "luminance_coefficients", luminance_coefficients);
GPU_shader_uniform_1f(shader, "threshold", node_storage(bnode()).threshold);
const Result &input_image = get_input("Image");
input_image.bind_as_texture(shader, "input_tx");
GPU_texture_filter_mode(input_image.texture(), true);
const int2 glare_size = get_glare_size();
Result highlights_result = Result::Temporary(ResultType::Color, texture_pool());
highlights_result.allocate_texture(glare_size);
highlights_result.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, glare_size);
GPU_shader_unbind();
input_image.unbind_as_texture();
highlights_result.unbind_as_image();
return highlights_result;
}
/* ------------------
* Simple Star Glare.
* ------------------ */
/* Not yet implemented. Unreachable code due to the is_identity method. */
Result execute_simple_star(Result &highlights_result)
{
BLI_assert_unreachable();
return Result(ResultType::Color, texture_pool());
}
/* ---------------
* Fog Glow Glare.
* --------------- */
/* Not yet implemented. Unreachable code due to the is_identity method. */
Result execute_fog_glow(Result &highlights_result)
{
BLI_assert_unreachable();
return Result(ResultType::Color, texture_pool());
}
/* --------------
* Streaks Glare.
* -------------- */
/* Not yet implemented. Unreachable code due to the is_identity method. */
Result execute_streaks(Result &highlights_result)
{
BLI_assert_unreachable();
return Result(ResultType::Color, texture_pool());
}
/* ------------
* Ghost Glare.
* ------------ */
Result execute_ghost(Result &highlights_result)
{
Result base_ghost_result = compute_base_ghost(highlights_result);
GPUShader *shader = shader_manager().get("compositor_glare_ghost_accumulate");
GPU_shader_bind(shader);
/* Color modulators are constant across iterations. */
std::array<float4, 4> color_modulators = compute_ghost_color_modulators();
GPU_shader_uniform_4fv_array(shader,
"color_modulators",
color_modulators.size(),
(const float(*)[4])color_modulators.data());
/* Create an initially zero image where ghosts will be accumulated. */
const float4 zero_color = float4(0.0f);
const int2 glare_size = get_glare_size();
Result accumulated_ghost_result = Result::Temporary(ResultType::Color, texture_pool());
accumulated_ghost_result.allocate_texture(glare_size);
GPU_texture_clear(accumulated_ghost_result.texture(), GPU_DATA_FLOAT, zero_color);
/* For the given number of iterations, accumulate four ghosts with different scales and color
* modulators. The result of the previous iteration is used as the input of the current
* iteration. We start from index 1 because we are not interested in the scales produced for
* the first iteration according to visual judgement, see the compute_ghost_scales method. */
Result &input_ghost_result = base_ghost_result;
const IndexRange iterations_range = IndexRange(get_number_of_iterations()).drop_front(1);
for (const int i : iterations_range) {
std::array<float, 4> scales = compute_ghost_scales(i);
GPU_shader_uniform_4fv(shader, "scales", scales.data());
input_ghost_result.bind_as_texture(shader, "input_ghost_tx");
accumulated_ghost_result.bind_as_image(shader, "accumulated_ghost_img", true);
compute_dispatch_threads_at_least(shader, glare_size);
input_ghost_result.unbind_as_texture();
accumulated_ghost_result.unbind_as_image();
/* The accumulated result serves as the input for the next iteration, so copy the result to
* the input result since it can't be used for reading and writing simultaneously. Skip
* copying for the last iteration since it is not needed. */
if (i != iterations_range.last()) {
GPU_memory_barrier(GPU_BARRIER_TEXTURE_UPDATE);
GPU_texture_copy(input_ghost_result.texture(), accumulated_ghost_result.texture());
}
}
GPU_shader_unbind();
input_ghost_result.release();
return accumulated_ghost_result;
}
/* Computes two ghosts by blurring the highlights with two different radii, then adds them into a
* single base ghost image after scaling them by some factor and flipping the bigger ghost along
* the center of the image. */
Result compute_base_ghost(Result &highlights_result)
{
Result small_ghost_result = Result::Temporary(ResultType::Color, texture_pool());
symmetric_separable_blur(context(),
highlights_result,
small_ghost_result,
float2(get_small_ghost_radius()),
R_FILTER_GAUSS,
false,
false);
Result big_ghost_result = Result::Temporary(ResultType::Color, texture_pool());
symmetric_separable_blur(context(),
highlights_result,
big_ghost_result,
float2(get_big_ghost_radius()),
R_FILTER_GAUSS,
false,
false);
highlights_result.release();
GPUShader *shader = shader_manager().get("compositor_glare_ghost_base");
GPU_shader_bind(shader);
small_ghost_result.bind_as_texture(shader, "small_ghost_tx");
GPU_texture_filter_mode(small_ghost_result.texture(), true);
GPU_texture_wrap_mode(small_ghost_result.texture(), false, false);
big_ghost_result.bind_as_texture(shader, "big_ghost_tx");
GPU_texture_filter_mode(big_ghost_result.texture(), true);
GPU_texture_wrap_mode(big_ghost_result.texture(), false, false);
const int2 glare_size = get_glare_size();
Result base_ghost_result = Result::Temporary(ResultType::Color, texture_pool());
base_ghost_result.allocate_texture(glare_size);
base_ghost_result.bind_as_image(shader, "combined_ghost_img");
compute_dispatch_threads_at_least(shader, glare_size);
GPU_shader_unbind();
small_ghost_result.unbind_as_texture();
big_ghost_result.unbind_as_texture();
base_ghost_result.unbind_as_image();
small_ghost_result.release();
big_ghost_result.release();
return base_ghost_result;
}
/* In each iteration of ghost accumulation, four ghosts are accumulated, each of which might be
* modulated by multiplying by some color modulator. This function generates a color modulator
* for each of the four ghosts. The first ghost is always unmodulated, so its modulator is the
* multiplicative identity of 1. The second ghost gets only its green and blue channels modulated,
* the third ghost gets only its red and green channels modulated, and the fourth ghost gets only
* its red and blue channels modulated. */
std::array<float4, 4> compute_ghost_color_modulators()
{
const float color_modulation_factor = get_ghost_color_modulation_factor();
std::array<float4, 4> color_modulators;
color_modulators[0] = float4(1.0f);
color_modulators[1] = float4(1.0f, color_modulation_factor, color_modulation_factor, 1.0f);
color_modulators[2] = float4(color_modulation_factor, color_modulation_factor, 1.0f, 1.0f);
color_modulators[3] = float4(color_modulation_factor, 1.0f, color_modulation_factor, 1.0f);
return color_modulators;
}
/* In each iteration of ghost accumulation, four ghosts with different scales are accumulated.
* Given the index of a certain iteration, this method computes the 4 scales for it. Assuming we
* have n number of iterations, that means the total number of accumulations is 4 * n. To get a
* variety of scales, we generate an arithmetic progression that starts from 2.1 and ends at
* zero exclusive, containing 4 * n elements. The start scale of 2.1 is chosen arbitrarily using
* visual judgement. To get more scale variations, every other scale is inverted with a slight
* change in scale such that it alternates between scaling down and up; additionally, every other
* ghost is flipped across the image center by negating its scale. Finally, to get variations
* across the number of iterations, a shift of 0.5 is introduced when the number of iterations is
* odd; that way, the user gets variations when changing the number of iterations as opposed to
* just getting fewer or more ghosts. */
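/* As a concrete illustration, with 3 iterations the offset is 0.5 and the progression has 12
* steps, so the first executed iteration (index 1, since index 0 is skipped) yields the scales
* 1.3125, -0.870, 0.9625 and -1.257, alternating between unflipped and flipped ghosts. */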
std::array<float, 4> compute_ghost_scales(int iteration)
{
/* Shift scales by 0.5 for odd number of iterations as discussed in the method description. */
const float offset = (get_number_of_iterations() % 2 == 1) ? 0.5f : 0.0f;
std::array<float, 4> scales;
for (const int i : IndexRange(scales.size())) {
/* Global index in all accumulations. */
const int global_i = iteration * 4 + i;
/* Arithmetic progression in the range [0, 1) + offset. */
const float progression = (global_i + offset) / (get_number_of_iterations() * 4);
/* Remap range [0, 1) to [1, 0) and multiply to remap to [2.1, 0). */
scales[i] = 2.1f * (1.0f - progression);
/* Invert the scale with a slight variation and flip it across the image center through
* negation for odd scales as discussed in the method description. */
if (i % 2 == 1) {
scales[i] = -0.99f / scales[i];
}
}
return scales;
}
/* The operation computes two base ghosts by blurring the highlights with two different radii;
* this method computes the blur radius for the smaller one. The value is chosen using visual
* judgement. Make sure to take the quality factor into account, see the get_quality_factor
* method for more information. */
float get_small_ghost_radius()
{
return 16.0f / get_quality_factor();
}
/* Computes the blur radius of the bigger ghost, which is double the blur radius of the smaller
* one, see the get_small_ghost_radius method for more information. */
float get_big_ghost_radius()
{
return get_small_ghost_radius() * 2.0f;
}
/* The color channels of the glare can be modulated by being multiplied by this factor. In the
* user interface, 0 means no modulation and 1 means full modulation. But since the factor is
* multiplied, 1 corresponds to no modulation and 0 corresponds to full modulation, so we
* subtract from one. */
float get_ghost_color_modulation_factor()
{
return 1.0f - node_storage(bnode()).colmod;
}
/* ----------
* Glare Mix.
* ---------- */
void execute_mix(Result &glare_result)
{
GPUShader *shader = shader_manager().get("compositor_glare_mix");
GPU_shader_bind(shader);
GPU_shader_uniform_1f(shader, "mix_factor", node_storage(bnode()).mix);
const Result &input_image = get_input("Image");
input_image.bind_as_texture(shader, "input_tx");
glare_result.bind_as_texture(shader, "glare_tx");
GPU_texture_filter_mode(glare_result.texture(), true);
const Domain domain = compute_domain();
Result &output_image = get_result("Image");
output_image.allocate_texture(domain);
output_image.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, domain.size);
GPU_shader_unbind();
output_image.unbind_as_image();
input_image.unbind_as_texture();
glare_result.unbind_as_texture();
glare_result.release();
}
/* -------
* Common.
* ------- */
/* As a performance optimization, the operation can compute the glare on a fraction of the input
* image size, which is what this method returns. */
int2 get_glare_size()
{
return compute_domain().size / get_quality_factor();
}
int get_number_of_iterations()
{
return node_storage(bnode()).iter;
}
/* The glare node can compute the glare on a fraction of the input image size to improve
* performance. The quality values and their corresponding quality factors are as follows:
*
* - High Quality => Quality Value: 0 => Quality Factor: 1.
* - Medium Quality => Quality Value: 1 => Quality Factor: 2.
* - Low Quality => Quality Value: 2 => Quality Factor: 4.
*
* Dividing the image size by the quality factor gives the size where the glare should be
* computed. The glare algorithm should also take the quality factor into account to compensate
* for the reduced size, perhaps by dividing blur radii and similar values by the quality
* factor. */
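/* For example, at Medium quality a 1920x1080 input is processed at 960x540, and the small ghost
* blur radius drops from 16 to 8 pixels, see get_small_ghost_radius. */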
int get_quality_factor()
{
return 1 << node_storage(bnode()).quality;
}
};