Cycles: make transform inverse match Embree exactly

Helps improve ray-tracing precision. This is a bit complicated as it requires different implementation depending on the CPU architecture.
2022-08-09 14:28:19 +02:00 · 2022-08-09 14:28:19 +02:00 · 230f9ade64
parent 286e535071
commit 230f9ade64
7 changed files with 142 additions and 39 deletions
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -326,6 +326,7 @@ set(SRC_UTIL_HEADERS
  ../util/rect.h
  ../util/static_assert.h
  ../util/transform.h
+  ../util/transform_inverse.h
  ../util/texture.h
  ../util/types.h
  ../util/types_float2.h
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@ -26,6 +26,8 @@ set(SRC
  thread.cpp
  time.cpp
  transform.cpp
+  transform_avx2.cpp
+  transform_sse41.cpp
  windows.cpp
 )

@ -138,6 +140,13 @@ set(SRC_HEADERS
  xml.h
 )

+if(CXX_HAS_SSE)
+  set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
+if(CXX_HAS_AVX2)
+  set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})

--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN

 /* Transform Inverse */

-static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
+static bool projection_matrix4_inverse(float R[][4], float M[][4])
 {
  /* SPDX-License-Identifier: BSD-3-Clause
   * Adapted from code:
@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
  memcpy(R, &tfmR, sizeof(R));
  memcpy(M, &tfm, sizeof(M));

-  if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
+  if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
    return projection_identity();
  }

--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@ -11,6 +11,10 @@
 #include "util/math.h"
 #include "util/types.h"

+#ifndef __KERNEL_GPU__
+#  include "util/system.h"
+#endif
+
 CCL_NAMESPACE_BEGIN

 /* Affine transformation, stored as 4x3 matrix. */
@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
  float4 x, y, z, w;
 } DecomposedTransform;

+CCL_NAMESPACE_END
+
+#include "util/transform_inverse.h"
+
+CCL_NAMESPACE_BEGIN
+
 /* Functions */

 #ifdef __KERNEL_METAL__
@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 #endif /* defined(__KERNEL_GPU_RAYTRACING__) */
 }

+#ifndef __KERNEL_GPU__
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
+#endif
+
 ccl_device_inline Transform transform_inverse(const Transform tfm)
 {
-  /* This implementation matches the one in Embree exactly, to ensure consistent
-   * results with the ray intersection of instances. */
-  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
-  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
-  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
-  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-
-  /* Compute determinant. */
-  float det = dot(x, cross(y, z));
-
-  if (det == 0.0f) {
-    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
-     * never be in this situation, but try to invert it anyway with tweak.
-     *
-     * This logic does not match Embree which would just give an invalid
-     * matrix. A better solution would be to remove this and ensure any object
-     * matrix is valid. */
-    x.x += 1e-8f;
-    y.y += 1e-8f;
-    z.z += 1e-8f;
-
-    det = dot(x, cross(y, z));
-    if (det == 0.0f) {
-      det = FLT_MAX;
-    }
+  /* Optimized transform implementations. */
+#ifndef __KERNEL_GPU__
+  if (system_cpu_support_avx2()) {
+    Transform itfm;
+    transform_inverse_cpu_avx2(tfm, itfm);
+    return itfm;
  }
+  else if (system_cpu_support_sse41()) {
+    Transform itfm;
+    transform_inverse_cpu_sse41(tfm, itfm);
+    return itfm;
+  }
+#endif

-  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
-  const float3 inverse_x = cross(y, z) / det;
-  const float3 inverse_y = cross(z, x) / det;
-  const float3 inverse_z = cross(x, y) / det;
-
-  /* Compute translation and fill transform. */
-  Transform itfm;
-  itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
-  itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
-  itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
-
-  return itfm;
+  return transform_inverse_impl(tfm);
 }

 ccl_device_inline void transform_compose(ccl_private Transform *tfm,
--- a/intern/cycles/util/transform_avx2.cpp
+++ b/intern/cycles/util/transform_avx2.cpp
@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/util/transform_inverse.h
+++ b/intern/cycles/util/transform_inverse.h
@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Custom cross and dot implementations that match Embree bit for bit.
+ * Normally we don't use SSE41/AVX outside the kernel, but for this it's
+ * important to match exactly for ray tracing precision. */
+
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+{
+#ifdef __AVX2__
+  const ssef sse_a = (const __m128 &)a;
+  const ssef sse_b = (const __m128 &)b;
+  const ssef r = shuffle<1, 2, 0, 3>(
+      ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+  return (const float3 &)r;
+#endif
+
+  return cross(a, b);
+}
+
+ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+{
+#ifdef __SSE4_1__
+  return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#endif
+
+  return dot(a, b);
+}
+
+ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
+{
+  /* This implementation matches the one in Embree exactly, to ensure consistent
+   * results with the ray intersection of instances. */
+  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+  /* Compute determinant. */
+  float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
+
+  if (det == 0.0f) {
+    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+     * never be in this situation, but try to invert it anyway with tweak.
+     *
+     * This logic does not match Embree which would just give an invalid
+     * matrix. A better solution would be to remove this and ensure any object
+     * matrix is valid. */
+    x.x += 1e-8f;
+    y.y += 1e-8f;
+    z.z += 1e-8f;
+
+    det = transform_inverse_dot(x, cross(y, z));
+    if (det == 0.0f) {
+      det = FLT_MAX;
+    }
+  }
+
+  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+  const float3 inverse_x = transform_inverse_cross(y, z) / det;
+  const float3 inverse_y = transform_inverse_cross(z, x) / det;
+  const float3 inverse_z = transform_inverse_cross(x, y) / det;
+
+  /* Compute translation and fill transform. */
+  Transform itfm;
+  itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
+  itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
+  itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
+
+  return itfm;
+}
+CCL_NAMESPACE_END
--- a/intern/cycles/util/transform_sse41.cpp
+++ b/intern/cycles/util/transform_sse41.cpp
@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END