Cycles: make transform inverse match Embree exactly

Helps improve ray-tracing precision. This is a bit complicated as it requires
different implementation depending on the CPU architecture.
This commit is contained in:
Brecht Van Lommel 2022-08-09 14:28:19 +02:00
parent 286e535071
commit 230f9ade64
7 changed files with 142 additions and 39 deletions

View File

@ -326,6 +326,7 @@ set(SRC_UTIL_HEADERS
../util/rect.h
../util/static_assert.h
../util/transform.h
../util/transform_inverse.h
../util/texture.h
../util/types.h
../util/types_float2.h

View File

@ -26,6 +26,8 @@ set(SRC
thread.cpp
time.cpp
transform.cpp
transform_avx2.cpp
transform_sse41.cpp
windows.cpp
)
@ -138,6 +140,13 @@ set(SRC_HEADERS
xml.h
)
if(CXX_HAS_SSE)
set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})

View File

@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
/* Transform Inverse */
static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
static bool projection_matrix4_inverse(float R[][4], float M[][4])
{
/* SPDX-License-Identifier: BSD-3-Clause
* Adapted from code:
@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
memcpy(R, &tfmR, sizeof(R));
memcpy(M, &tfm, sizeof(M));
if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
return projection_identity();
}

View File

@ -11,6 +11,10 @@
#include "util/math.h"
#include "util/types.h"
#ifndef __KERNEL_GPU__
# include "util/system.h"
#endif
CCL_NAMESPACE_BEGIN
/* Affine transformation, stored as 4x3 matrix. */
@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
float4 x, y, z, w;
} DecomposedTransform;
CCL_NAMESPACE_END
#include "util/transform_inverse.h"
CCL_NAMESPACE_BEGIN
/* Functions */
#ifdef __KERNEL_METAL__
@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
#endif /* defined(__KERNEL_GPU_RAYTRACING__) */
}
#ifndef __KERNEL_GPU__
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
#endif
ccl_device_inline Transform transform_inverse(const Transform tfm)
{
/* This implementation matches the one in Embree exactly, to ensure consistent
* results with the ray intersection of instances. */
float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
/* Compute determinant. */
float det = dot(x, cross(y, z));
if (det == 0.0f) {
/* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
* never be in this situation, but try to invert it anyway with tweak.
*
* This logic does not match Embree which would just give an invalid
* matrix. A better solution would be to remove this and ensure any object
* matrix is valid. */
x.x += 1e-8f;
y.y += 1e-8f;
z.z += 1e-8f;
det = dot(x, cross(y, z));
if (det == 0.0f) {
det = FLT_MAX;
}
/* Optimized transform implementations. */
#ifndef __KERNEL_GPU__
if (system_cpu_support_avx2()) {
Transform itfm;
transform_inverse_cpu_avx2(tfm, itfm);
return itfm;
}
else if (system_cpu_support_sse41()) {
Transform itfm;
transform_inverse_cpu_sse41(tfm, itfm);
return itfm;
}
#endif
/* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
const float3 inverse_x = cross(y, z) / det;
const float3 inverse_y = cross(z, x) / det;
const float3 inverse_z = cross(x, y) / det;
/* Compute translation and fill transform. */
Transform itfm;
itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
return itfm;
return transform_inverse_impl(tfm);
}
ccl_device_inline void transform_compose(ccl_private Transform *tfm,

View File

@ -0,0 +1,13 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#include "util/transform.h"
CCL_NAMESPACE_BEGIN
void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
{
itfm = transform_inverse_impl(tfm);
}
CCL_NAMESPACE_END

View File

@ -0,0 +1,76 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#pragma once
CCL_NAMESPACE_BEGIN
/* Custom cross and dot implementations that match Embree bit for bit.
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
* important to match exactly for ray tracing precision. */
ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
{
#ifdef __AVX2__
const ssef sse_a = (const __m128 &)a;
const ssef sse_b = (const __m128 &)b;
const ssef r = shuffle<1, 2, 0, 3>(
ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
return (const float3 &)r;
#endif
return cross(a, b);
}
ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
{
#ifdef __SSE4_1__
return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
#endif
return dot(a, b);
}
ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
{
/* This implementation matches the one in Embree exactly, to ensure consistent
* results with the ray intersection of instances. */
float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
/* Compute determinant. */
float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
if (det == 0.0f) {
/* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
* never be in this situation, but try to invert it anyway with tweak.
*
* This logic does not match Embree which would just give an invalid
* matrix. A better solution would be to remove this and ensure any object
* matrix is valid. */
x.x += 1e-8f;
y.y += 1e-8f;
z.z += 1e-8f;
det = transform_inverse_dot(x, cross(y, z));
if (det == 0.0f) {
det = FLT_MAX;
}
}
/* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
const float3 inverse_x = transform_inverse_cross(y, z) / det;
const float3 inverse_y = transform_inverse_cross(z, x) / det;
const float3 inverse_z = transform_inverse_cross(x, y) / det;
/* Compute translation and fill transform. */
Transform itfm;
itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
return itfm;
}
CCL_NAMESPACE_END

View File

@ -0,0 +1,13 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#include "util/transform.h"
CCL_NAMESPACE_BEGIN
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
{
itfm = transform_inverse_impl(tfm);
}
CCL_NAMESPACE_END