Cycles: make transform inverse match Embree exactly
Helps improve ray-tracing precision. This is a bit complicated as it requires different implementation depending on the CPU architecture.
This commit is contained in:
parent
286e535071
commit
230f9ade64
|
@ -326,6 +326,7 @@ set(SRC_UTIL_HEADERS
|
|||
../util/rect.h
|
||||
../util/static_assert.h
|
||||
../util/transform.h
|
||||
../util/transform_inverse.h
|
||||
../util/texture.h
|
||||
../util/types.h
|
||||
../util/types_float2.h
|
||||
|
|
|
@ -26,6 +26,8 @@ set(SRC
|
|||
thread.cpp
|
||||
time.cpp
|
||||
transform.cpp
|
||||
transform_avx2.cpp
|
||||
transform_sse41.cpp
|
||||
windows.cpp
|
||||
)
|
||||
|
||||
|
@ -138,6 +140,13 @@ set(SRC_HEADERS
|
|||
xml.h
|
||||
)
|
||||
|
||||
if(CXX_HAS_SSE)
|
||||
set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
endif()
|
||||
if(CXX_HAS_AVX2)
|
||||
set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
include_directories(${INC})
|
||||
include_directories(SYSTEM ${INC_SYS})
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
|
|||
|
||||
/* Transform Inverse */
|
||||
|
||||
static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
|
||||
static bool projection_matrix4_inverse(float R[][4], float M[][4])
|
||||
{
|
||||
/* SPDX-License-Identifier: BSD-3-Clause
|
||||
* Adapted from code:
|
||||
|
@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
|
|||
memcpy(R, &tfmR, sizeof(R));
|
||||
memcpy(M, &tfm, sizeof(M));
|
||||
|
||||
if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
|
||||
if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
|
||||
return projection_identity();
|
||||
}
|
||||
|
||||
|
|
|
@ -11,6 +11,10 @@
|
|||
#include "util/math.h"
|
||||
#include "util/types.h"
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
# include "util/system.h"
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Affine transformation, stored as 4x3 matrix. */
|
||||
|
@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
|
|||
float4 x, y, z, w;
|
||||
} DecomposedTransform;
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#include "util/transform_inverse.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Functions */
|
||||
|
||||
#ifdef __KERNEL_METAL__
|
||||
|
@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
|
|||
#endif /* defined(__KERNEL_GPU_RAYTRACING__) */
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
|
||||
void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
|
||||
#endif
|
||||
|
||||
ccl_device_inline Transform transform_inverse(const Transform tfm)
|
||||
{
|
||||
/* This implementation matches the one in Embree exactly, to ensure consistent
|
||||
* results with the ray intersection of instances. */
|
||||
float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
|
||||
float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
|
||||
float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
|
||||
float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
|
||||
|
||||
/* Compute determinant. */
|
||||
float det = dot(x, cross(y, z));
|
||||
|
||||
if (det == 0.0f) {
|
||||
/* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
|
||||
* never be in this situation, but try to invert it anyway with tweak.
|
||||
*
|
||||
* This logic does not match Embree which would just give an invalid
|
||||
* matrix. A better solution would be to remove this and ensure any object
|
||||
* matrix is valid. */
|
||||
x.x += 1e-8f;
|
||||
y.y += 1e-8f;
|
||||
z.z += 1e-8f;
|
||||
|
||||
det = dot(x, cross(y, z));
|
||||
if (det == 0.0f) {
|
||||
det = FLT_MAX;
|
||||
}
|
||||
/* Optimized transform implementations. */
|
||||
#ifndef __KERNEL_GPU__
|
||||
if (system_cpu_support_avx2()) {
|
||||
Transform itfm;
|
||||
transform_inverse_cpu_avx2(tfm, itfm);
|
||||
return itfm;
|
||||
}
|
||||
else if (system_cpu_support_sse41()) {
|
||||
Transform itfm;
|
||||
transform_inverse_cpu_sse41(tfm, itfm);
|
||||
return itfm;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
|
||||
const float3 inverse_x = cross(y, z) / det;
|
||||
const float3 inverse_y = cross(z, x) / det;
|
||||
const float3 inverse_z = cross(x, y) / det;
|
||||
|
||||
/* Compute translation and fill transform. */
|
||||
Transform itfm;
|
||||
itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
|
||||
itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
|
||||
itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
|
||||
|
||||
return itfm;
|
||||
return transform_inverse_impl(tfm);
|
||||
}
|
||||
|
||||
ccl_device_inline void transform_compose(ccl_private Transform *tfm,
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#include "util/transform.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
|
||||
{
|
||||
itfm = transform_inverse_impl(tfm);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,76 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#pragma once
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Custom cross and dot implementations that match Embree bit for bit.
|
||||
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
|
||||
* important to match exactly for ray tracing precision. */
|
||||
|
||||
ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
|
||||
{
|
||||
#ifdef __AVX2__
|
||||
const ssef sse_a = (const __m128 &)a;
|
||||
const ssef sse_b = (const __m128 &)b;
|
||||
const ssef r = shuffle<1, 2, 0, 3>(
|
||||
ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
|
||||
return (const float3 &)r;
|
||||
#endif
|
||||
|
||||
return cross(a, b);
|
||||
}
|
||||
|
||||
ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
|
||||
{
|
||||
#ifdef __SSE4_1__
|
||||
return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
|
||||
#endif
|
||||
|
||||
return dot(a, b);
|
||||
}
|
||||
|
||||
ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
|
||||
{
|
||||
/* This implementation matches the one in Embree exactly, to ensure consistent
|
||||
* results with the ray intersection of instances. */
|
||||
float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
|
||||
float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
|
||||
float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
|
||||
float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
|
||||
|
||||
/* Compute determinant. */
|
||||
float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
|
||||
|
||||
if (det == 0.0f) {
|
||||
/* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
|
||||
* never be in this situation, but try to invert it anyway with tweak.
|
||||
*
|
||||
* This logic does not match Embree which would just give an invalid
|
||||
* matrix. A better solution would be to remove this and ensure any object
|
||||
* matrix is valid. */
|
||||
x.x += 1e-8f;
|
||||
y.y += 1e-8f;
|
||||
z.z += 1e-8f;
|
||||
|
||||
det = transform_inverse_dot(x, cross(y, z));
|
||||
if (det == 0.0f) {
|
||||
det = FLT_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
/* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
|
||||
const float3 inverse_x = transform_inverse_cross(y, z) / det;
|
||||
const float3 inverse_y = transform_inverse_cross(z, x) / det;
|
||||
const float3 inverse_z = transform_inverse_cross(x, y) / det;
|
||||
|
||||
/* Compute translation and fill transform. */
|
||||
Transform itfm;
|
||||
itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
|
||||
itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
|
||||
itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
|
||||
|
||||
return itfm;
|
||||
}
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,13 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#include "util/transform.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
|
||||
{
|
||||
itfm = transform_inverse_impl(tfm);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
Loading…
Reference in New Issue