Cleanup: refactor to make number of channels for shader evaluation variable

2021-10-13 19:13:35 +02:00 · 2021-10-13 19:13:35 +02:00 · 2ba7c3aa65
parent 70376154a0
commit 2ba7c3aa65
10 changed files with 53 additions and 36 deletions
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@ -54,7 +54,7 @@ class CPUKernels {
  /* Shader evaluation. */

  using ShaderEvalFunction = CPUKernelFunction<void (*)(
-      const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>;
+      const KernelGlobals *kg, const KernelShaderEvalInput *, float *, const int)>;

  ShaderEvalFunction shader_eval_displace;
  ShaderEvalFunction shader_eval_background;
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@ -34,9 +34,10 @@ ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), pr
 }

 bool ShaderEval::eval(const ShaderEvalType type,
-                      const int max_num_points,
+                      const int max_num_inputs,
+                      const int num_channels,
                      const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
-                      const function<void(device_vector<float4> &)> &read_output)
+                      const function<void(device_vector<float> &)> &read_output)
 {
  bool first_device = true;
  bool success = true;
@ -50,26 +51,27 @@ bool ShaderEval::eval(const ShaderEvalType type,
    first_device = false;

    device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
-    device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+    device_vector<float> output(device, "ShaderEval output", MEM_READ_WRITE);

    /* Allocate and copy device buffers. */
    DCHECK_EQ(input.device, device);
    DCHECK_EQ(output.device, device);
    DCHECK_LE(output.size(), input.size());

-    input.alloc(max_num_points);
+    input.alloc(max_num_inputs);
    int num_points = fill_input(input);
    if (num_points == 0) {
      return;
    }

    input.copy_to_device();
-    output.alloc(num_points);
+    output.alloc(num_points * num_channels);
    output.zero_to_device();

    /* Evaluate on CPU or GPU. */
-    success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
-                                                  eval_gpu(device, type, input, output);
+    success = (device->info.type == DEVICE_CPU) ?
+                  eval_cpu(device, type, input, output, num_points) :
+                  eval_gpu(device, type, input, output, num_points);

    /* Copy data back from device if not canceled. */
    if (success) {
@ -87,7 +89,8 @@ bool ShaderEval::eval(const ShaderEvalType type,
 bool ShaderEval::eval_cpu(Device *device,
                          const ShaderEvalType type,
                          device_vector<KernelShaderEvalInput> &input,
-                          device_vector<float4> &output)
+                          device_vector<float> &output,
+                          const int64_t work_size)
 {
  vector<CPUKernelThreadGlobals> kernel_thread_globals;
  device->get_cpu_kernel_thread_globals(kernel_thread_globals);
@ -96,9 +99,8 @@ bool ShaderEval::eval_cpu(Device *device,
  const CPUKernels &kernels = *(device->get_cpu_kernels());

  /* Simple parallel_for over all work items. */
-  const int64_t work_size = output.size();
  KernelShaderEvalInput *input_data = input.data();
-  float4 *output_data = output.data();
+  float *output_data = output.data();
  bool success = true;

  tbb::task_arena local_arena(device->info.cpu_threads);
@ -130,7 +132,8 @@ bool ShaderEval::eval_cpu(Device *device,
 bool ShaderEval::eval_gpu(Device *device,
                          const ShaderEvalType type,
                          device_vector<KernelShaderEvalInput> &input,
-                          device_vector<float4> &output)
+                          device_vector<float> &output,
+                          const int64_t work_size)
 {
  /* Find required kernel function. */
  DeviceKernel kernel;
@ -151,7 +154,6 @@ bool ShaderEval::eval_gpu(Device *device,
   * TODO : query appropriate size from device.*/
  const int64_t chunk_size = 65536;

-  const int64_t work_size = output.size();
  void *d_input = (void *)input.device_pointer;
  void *d_output = (void *)output.device_pointer;

--- a/intern/cycles/integrator/shader_eval.h
+++ b/intern/cycles/integrator/shader_eval.h
@ -40,19 +40,22 @@ class ShaderEval {
  /* Evaluate shader at points specified by KernelShaderEvalInput and write out
   * RGBA colors to output. */
  bool eval(const ShaderEvalType type,
-            const int max_num_points,
+            const int max_num_inputs,
+            const int num_channels,
            const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
-            const function<void(device_vector<float4> &)> &read_output);
+            const function<void(device_vector<float> &)> &read_output);

 protected:
  bool eval_cpu(Device *device,
                const ShaderEvalType type,
                device_vector<KernelShaderEvalInput> &input,
-                device_vector<float4> &output);
+                device_vector<float> &output,
+                const int64_t work_size);
  bool eval_gpu(Device *device,
                const ShaderEvalType type,
                device_vector<KernelShaderEvalInput> &input,
-                device_vector<float4> &output);
+                device_vector<float> &output,
+                const int64_t work_size);

  Device *device_;
  Progress &progress_;
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@ -58,11 +58,11 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);

 void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
                                                       const KernelShaderEvalInput *input,
-                                                       float4 *output,
+                                                       float *output,
                                                       const int offset);
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
                                                     const KernelShaderEvalInput *input,
-                                                     float4 *output,
+                                                     float *output,
                                                     const int offset);

 /* --------------------------------------------------------------------
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@ -114,7 +114,7 @@ DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)

 void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
                                                     const KernelShaderEvalInput *input,
-                                                     float4 *output,
+                                                     float *output,
                                                     const int offset)
 {
 #ifdef KERNEL_STUB
@ -126,7 +126,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,

 void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
                                                       const KernelShaderEvalInput *input,
-                                                       float4 *output,
+                                                       float *output,
                                                       const int offset)
 {
 #ifdef KERNEL_STUB
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@ -615,7 +615,7 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)

 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
-                                    float4 *output,
+                                    float *output,
                                    const int offset,
                                    const int work_size)
 {
@ -629,7 +629,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)

 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
-                                      float4 *output,
+                                      float *output,
                                      const int offset,
                                      const int work_size)
 {
--- a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@ -85,7 +85,8 @@ ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS,
    if (num_recorded_hits > 0) {
      sort_intersections(isect, num_recorded_hits);

-      /* Write intersection result into global integrator state memory. */
+      /* Write intersection result into global integrator state memory.
+       * It may be more efficient to do this directly from the intersection kernel. */
      for (int hit = 0; hit < num_recorded_hits; hit++) {
        integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit);
      }
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@ -26,7 +26,7 @@ CCL_NAMESPACE_BEGIN

 ccl_device void kernel_displace_evaluate(ccl_global const KernelGlobals *kg,
                                         ccl_global const KernelShaderEvalInput *input,
-                                         ccl_global float4 *output,
+                                         ccl_global float *output,
                                         const int offset)
 {
  /* Setup shader data. */
@ -53,12 +53,14 @@ ccl_device void kernel_displace_evaluate(ccl_global const KernelGlobals *kg,
  D = ensure_finite3(D);

  /* Write output. */
-  output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
+  output[offset * 3 + 0] += D.x;
+  output[offset * 3 + 1] += D.y;
+  output[offset * 3 + 2] += D.z;
 }

 ccl_device void kernel_background_evaluate(ccl_global const KernelGlobals *kg,
                                           ccl_global const KernelShaderEvalInput *input,
-                                           ccl_global float4 *output,
+                                           ccl_global float *output,
                                           const int offset)
 {
  /* Setup ray */
@ -88,7 +90,9 @@ ccl_device void kernel_background_evaluate(ccl_global const KernelGlobals *kg,
  color = ensure_finite3(color);

  /* Write output. */
-  output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
+  output[offset * 3 + 0] += color.x;
+  output[offset * 3 + 1] += color.y;
+  output[offset * 3 + 2] += color.z;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@ -50,6 +50,7 @@ static void shade_background_pixels(Device *device,
  device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));

  const int size = width * height;
+  const int num_channels = 3;
  pixels.resize(size);

  /* Evaluate shader on device. */
@ -57,6 +58,7 @@ static void shade_background_pixels(Device *device,
  shader_eval.eval(
      SHADER_EVAL_BACKGROUND,
      size,
+      num_channels,
      [&](device_vector<KernelShaderEvalInput> &d_input) {
        /* Fill coordinates for shading. */
        KernelShaderEvalInput *d_input_data = d_input.data();
@ -77,15 +79,15 @@ static void shade_background_pixels(Device *device,

        return size;
      },
-      [&](device_vector<float4> &d_output) {
+      [&](device_vector<float> &d_output) {
        /* Copy output to pixel buffer. */
-        float4 *d_output_data = d_output.data();
+        float *d_output_data = d_output.data();

        for (int y = 0; y < height; y++) {
          for (int x = 0; x < width; x++) {
-            pixels[y * width + x].x = d_output_data[y * width + x].x;
-            pixels[y * width + x].y = d_output_data[y * width + x].y;
-            pixels[y * width + x].z = d_output_data[y * width + x].z;
+            pixels[y * width + x].x = d_output_data[(y * width + x) * num_channels + 0];
+            pixels[y * width + x].y = d_output_data[(y * width + x) * num_channels + 1];
+            pixels[y * width + x].z = d_output_data[(y * width + x) * num_channels + 2];
          }
        }
      });
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@ -115,7 +115,7 @@ static int fill_shader_input(const Scene *scene,
 /* Read back mesh displacement shader output. */
 static void read_shader_output(const Scene *scene,
                               Mesh *mesh,
-                               const device_vector<float4> &d_output)
+                               const device_vector<float> &d_output)
 {
  const array<int> &mesh_shaders = mesh->get_shader();
  const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
@ -125,7 +125,7 @@ static void read_shader_output(const Scene *scene,
  const int num_motion_steps = mesh->get_motion_steps();
  vector<bool> done(num_verts, false);

-  const float4 *d_output_data = d_output.data();
+  const float *d_output_data = d_output.data();
  int d_output_index = 0;

  Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
@ -144,7 +144,11 @@ static void read_shader_output(const Scene *scene,
    for (int j = 0; j < 3; j++) {
      if (!done[t.v[j]]) {
        done[t.v[j]] = true;
-        float3 off = float4_to_float3(d_output_data[d_output_index++]);
+        float3 off = make_float3(d_output_data[d_output_index + 0],
+                                 d_output_data[d_output_index + 1],
+                                 d_output_data[d_output_index + 2]);
+        d_output_index += 3;
+
        /* Avoid illegal vertex coordinates. */
        off = ensure_finite3(off);
        mesh_verts[t.v[j]] += off;
@ -194,6 +198,7 @@ bool GeometryManager::displace(
  ShaderEval shader_eval(device, progress);
  if (!shader_eval.eval(SHADER_EVAL_DISPLACE,
                        num_verts,
+                        3,
                        function_bind(&fill_shader_input, scene, mesh, object_index, _1),
                        function_bind(&read_shader_output, scene, mesh, _1))) {
    return false;