Merge branch 'blender-v3.3-release'
This commit is contained in:
commit
752fb5dd08
|
@ -326,6 +326,7 @@ set(SRC_UTIL_HEADERS
|
|||
../util/rect.h
|
||||
../util/static_assert.h
|
||||
../util/transform.h
|
||||
../util/transform_inverse.h
|
||||
../util/texture.h
|
||||
../util/types.h
|
||||
../util/types_float2.h
|
||||
|
|
|
@ -33,6 +33,30 @@ ccl_device_forceinline float intersection_t_offset(const float t)
|
|||
return __uint_as_float(bits);
|
||||
}
|
||||
|
||||
/* Ray offset to avoid self intersection.
|
||||
*
|
||||
* This function can be used to compute a modified ray start position for rays
|
||||
* leaving from a surface. This is from:
|
||||
* "A Fast and Robust Method for Avoiding Self-Intersection"
|
||||
* Ray Tracing Gems, chapter 6.
|
||||
*/
|
||||
ccl_device_inline float3 ray_offset(const float3 P, const float3 Ng)
|
||||
{
|
||||
const float int_scale = 256.0f;
|
||||
const int3 of_i = make_int3(
|
||||
(int)(int_scale * Ng.x), (int)(int_scale * Ng.y), (int)(int_scale * Ng.z));
|
||||
|
||||
const float3 p_i = make_float3(
|
||||
__int_as_float(__float_as_int(P.x) + ((P.x < 0) ? -of_i.x : of_i.x)),
|
||||
__int_as_float(__float_as_int(P.y) + ((P.y < 0) ? -of_i.y : of_i.y)),
|
||||
__int_as_float(__float_as_int(P.z) + ((P.z < 0) ? -of_i.z : of_i.z)));
|
||||
const float origin = 1.0f / 32.0f;
|
||||
const float float_scale = 1.0f / 65536.0f;
|
||||
return make_float3(fabsf(P.x) < origin ? P.x + float_scale * Ng.x : p_i.x,
|
||||
fabsf(P.y) < origin ? P.y + float_scale * Ng.y : p_i.y,
|
||||
fabsf(P.z) < origin ? P.z + float_scale * Ng.z : p_i.z);
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
ccl_device int intersections_compare(const void *a, const void *b)
|
||||
{
|
||||
|
|
|
@ -31,6 +31,52 @@ ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
|
|||
shader_setup_from_ray(kg, sd, &ray, &isect);
|
||||
}
|
||||
|
||||
ccl_device_forceinline float3 integrate_surface_ray_offset(KernelGlobals kg,
|
||||
const ccl_private ShaderData *sd,
|
||||
const float3 ray_P,
|
||||
const float3 ray_D)
|
||||
{
|
||||
/* No ray offset needed for other primitive types. */
|
||||
if (!(sd->type & PRIMITIVE_TRIANGLE)) {
|
||||
return ray_P;
|
||||
}
|
||||
|
||||
/* Self intersection tests already account for the case where a ray hits the
|
||||
* same primitive. However precision issues can still cause neighboring
|
||||
* triangles to be hit. Here we test if the ray-triangle intersection with
|
||||
* the same primitive would miss, implying that a neighbouring triangle would
|
||||
* be hit instead.
|
||||
*
|
||||
* This relies on triangle intersection to be watertight, and the object inverse
|
||||
* object transform to match the one used by ray intersection exactly.
|
||||
*
|
||||
* Potential improvements:
|
||||
* - It appears this happens when either barycentric coordinates are small,
|
||||
* or dot(sd->Ng, ray_D) is small. Detect such cases and skip test?
|
||||
* - Instead of ray offset, can we tweak P to lie within the triangle?
|
||||
*/
|
||||
const uint tri_vindex = kernel_data_fetch(tri_vindex, sd->prim).w;
|
||||
const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
|
||||
tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
|
||||
tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
|
||||
|
||||
float3 local_ray_P = ray_P;
|
||||
float3 local_ray_D = ray_D;
|
||||
|
||||
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
|
||||
const Transform itfm = object_get_inverse_transform(kg, sd);
|
||||
local_ray_P = transform_point(&itfm, local_ray_P);
|
||||
local_ray_D = transform_direction(&itfm, local_ray_D);
|
||||
}
|
||||
|
||||
if (ray_triangle_intersect_self(local_ray_P, local_ray_D, tri_a, tri_b, tri_c)) {
|
||||
return ray_P;
|
||||
}
|
||||
else {
|
||||
return ray_offset(ray_P, sd->Ng);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __HOLDOUT__
|
||||
ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg,
|
||||
ConstIntegratorState state,
|
||||
|
@ -200,6 +246,10 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
|
|||
# endif
|
||||
}
|
||||
|
||||
if (ray.self.object != OBJECT_NONE) {
|
||||
ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
|
||||
}
|
||||
|
||||
/* Write shadow ray and associated state to global memory. */
|
||||
integrator_state_write_shadow_ray(kg, shadow_state, &ray);
|
||||
// Save memory by storing the light and object indices in the shadow_isect
|
||||
|
@ -328,8 +378,9 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
|
|||
}
|
||||
else {
|
||||
/* Setup ray with changed origin and direction. */
|
||||
INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
|
||||
const float3 D = normalize(bsdf_omega_in);
|
||||
INTEGRATOR_STATE_WRITE(state, ray, P) = integrate_surface_ray_offset(kg, sd, sd->P, D);
|
||||
INTEGRATOR_STATE_WRITE(state, ray, D) = D;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
|
||||
INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
|
||||
#ifdef __RAY_DIFFERENTIALS__
|
||||
|
@ -423,6 +474,9 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
|
|||
Ray ray ccl_optional_struct_init;
|
||||
ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self);
|
||||
ray.D = ao_D;
|
||||
if (skip_self) {
|
||||
ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
|
||||
}
|
||||
ray.tmin = 0.0f;
|
||||
ray.tmax = kernel_data.integrator.ao_bounces_distance;
|
||||
ray.time = sd->time;
|
||||
|
|
|
@ -26,6 +26,8 @@ set(SRC
|
|||
thread.cpp
|
||||
time.cpp
|
||||
transform.cpp
|
||||
transform_avx2.cpp
|
||||
transform_sse41.cpp
|
||||
windows.cpp
|
||||
)
|
||||
|
||||
|
@ -137,6 +139,13 @@ set(SRC_HEADERS
|
|||
xml.h
|
||||
)
|
||||
|
||||
if(CXX_HAS_SSE)
|
||||
set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
endif()
|
||||
if(CXX_HAS_AVX2)
|
||||
set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
include_directories(${INC})
|
||||
include_directories(SYSTEM ${INC_SYS})
|
||||
|
||||
|
|
|
@ -105,6 +105,51 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
|
|||
return false;
|
||||
}
|
||||
|
||||
/* Custom rcp, cross and dot implementations that match Embree bit for bit. */
|
||||
ccl_device_forceinline float ray_triangle_rcp(const float x)
|
||||
{
|
||||
#ifdef __KERNEL_NEON__
|
||||
/* Move scalar to vector register and do rcp. */
|
||||
__m128 a;
|
||||
a[0] = x;
|
||||
float32x4_t reciprocal = vrecpeq_f32(a);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
|
||||
return reciprocal[0];
|
||||
#elif defined(__KERNEL_SSE__)
|
||||
const __m128 a = _mm_set_ss(x);
|
||||
const __m128 r = _mm_rcp_ss(a);
|
||||
|
||||
# ifdef __KERNEL_AVX2_
|
||||
return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
|
||||
# else
|
||||
return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
|
||||
# endif
|
||||
#else
|
||||
return 1.0f / x;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
|
||||
#else
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
|
||||
msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
|
||||
msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
|
||||
#else
|
||||
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
|
||||
const float3 ray_D,
|
||||
const float ray_tmin,
|
||||
|
@ -130,9 +175,9 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
|
|||
const float3 e2 = v1 - v2;
|
||||
|
||||
/* Perform edge tests. */
|
||||
const float U = dot(cross(e0, v2 + v0), ray_D);
|
||||
const float V = dot(cross(e1, v0 + v1), ray_D);
|
||||
const float W = dot(cross(e2, v1 + v2), ray_D);
|
||||
const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D);
|
||||
const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D);
|
||||
const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D);
|
||||
|
||||
const float UVW = U + V + W;
|
||||
const float eps = FLT_EPSILON * fabsf(UVW);
|
||||
|
@ -144,7 +189,7 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
|
|||
}
|
||||
|
||||
/* Calculate geometry normal and denominator. */
|
||||
const float3 Ng1 = cross(e1, e0);
|
||||
const float3 Ng1 = ray_triangle_cross(e1, e0);
|
||||
const float3 Ng = Ng1 + Ng1;
|
||||
const float den = dot(Ng, ray_D);
|
||||
/* Avoid division by 0. */
|
||||
|
@ -159,13 +204,46 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
|
|||
return false;
|
||||
}
|
||||
|
||||
const float rcp_UVW = (fabsf(UVW) < 1e-18f) ? 0.0f : 1.0f / UVW;
|
||||
*isect_u = min(U * rcp_UVW, 1.0f);
|
||||
*isect_v = min(V * rcp_UVW, 1.0f);
|
||||
const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW);
|
||||
*isect_u = min(U * rcp_uvw, 1.0f);
|
||||
*isect_v = min(V * rcp_uvw, 1.0f);
|
||||
*isect_t = t;
|
||||
return true;
|
||||
}
|
||||
|
||||
ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P,
|
||||
const float3 ray_D,
|
||||
const float3 tri_a,
|
||||
const float3 tri_b,
|
||||
const float3 tri_c)
|
||||
{
|
||||
/* Matches logic in ray_triangle_intersect, self intersection test to validate
|
||||
* if a ray is going to hit self or might incorrectly hit a neighboring triangle. */
|
||||
|
||||
/* Calculate vertices relative to ray origin. */
|
||||
const float3 v0 = tri_a - ray_P;
|
||||
const float3 v1 = tri_b - ray_P;
|
||||
const float3 v2 = tri_c - ray_P;
|
||||
|
||||
/* Calculate triangle edges. */
|
||||
const float3 e0 = v2 - v0;
|
||||
const float3 e1 = v0 - v1;
|
||||
const float3 e2 = v1 - v2;
|
||||
|
||||
/* Perform edge tests. */
|
||||
const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D);
|
||||
const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D);
|
||||
const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D);
|
||||
|
||||
const float eps = FLT_EPSILON * fabsf(U + V + W);
|
||||
const float minUVW = min(U, min(V, W));
|
||||
const float maxUVW = max(U, max(V, W));
|
||||
|
||||
/* Note the extended epsilon compared to ray_triangle_intersect, to account
|
||||
* for intersections with neighboring triangles that have an epsilon. */
|
||||
return (minUVW >= eps || maxUVW <= -eps);
|
||||
}
|
||||
|
||||
/* Tests for an intersection between a ray and a quad defined by
|
||||
* its midpoint, normal and sides.
|
||||
* If ellipse is true, hits outside the ellipse that's enclosed by the
|
||||
|
|
|
@ -128,53 +128,42 @@ int system_cpu_bits()
|
|||
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
|
||||
|
||||
struct CPUCapabilities {
|
||||
bool x64;
|
||||
bool mmx;
|
||||
bool sse;
|
||||
bool sse2;
|
||||
bool sse3;
|
||||
bool ssse3;
|
||||
bool sse41;
|
||||
bool sse42;
|
||||
bool sse4a;
|
||||
bool avx;
|
||||
bool f16c;
|
||||
bool avx2;
|
||||
bool xop;
|
||||
bool fma3;
|
||||
bool fma4;
|
||||
bool bmi1;
|
||||
bool bmi2;
|
||||
};
|
||||
|
||||
static CPUCapabilities &system_cpu_capabilities()
|
||||
{
|
||||
static CPUCapabilities caps;
|
||||
static CPUCapabilities caps = {};
|
||||
static bool caps_init = false;
|
||||
|
||||
if (!caps_init) {
|
||||
int result[4], num;
|
||||
|
||||
memset(&caps, 0, sizeof(caps));
|
||||
|
||||
__cpuid(result, 0);
|
||||
num = result[0];
|
||||
|
||||
if (num >= 1) {
|
||||
__cpuid(result, 0x00000001);
|
||||
caps.mmx = (result[3] & ((int)1 << 23)) != 0;
|
||||
caps.sse = (result[3] & ((int)1 << 25)) != 0;
|
||||
caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
|
||||
caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
|
||||
const bool sse = (result[3] & ((int)1 << 25)) != 0;
|
||||
const bool sse2 = (result[3] & ((int)1 << 26)) != 0;
|
||||
const bool sse3 = (result[2] & ((int)1 << 0)) != 0;
|
||||
|
||||
caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
|
||||
caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
|
||||
caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
|
||||
const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
|
||||
const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
|
||||
/* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */
|
||||
|
||||
caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
|
||||
caps.avx = false;
|
||||
bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
|
||||
bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
|
||||
const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
|
||||
const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
|
||||
const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
|
||||
|
||||
/* Simplify to combined capabilities for which we specialize kernels. */
|
||||
caps.sse2 = sse && sse2;
|
||||
caps.sse3 = sse && sse2 && sse3 && ssse3;
|
||||
caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
|
||||
|
||||
if (os_uses_xsave_xrestore && cpu_avx_support) {
|
||||
// Check if the OS will save the YMM registers
|
||||
|
@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities()
|
|||
# else
|
||||
xcr_feature_mask = 0;
|
||||
# endif
|
||||
caps.avx = (xcr_feature_mask & 0x6) == 0x6;
|
||||
const bool avx = (xcr_feature_mask & 0x6) == 0x6;
|
||||
const bool f16c = (result[2] & ((int)1 << 29)) != 0;
|
||||
|
||||
__cpuid(result, 0x00000007);
|
||||
bool bmi1 = (result[1] & ((int)1 << 3)) != 0;
|
||||
bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
|
||||
bool avx2 = (result[1] & ((int)1 << 5)) != 0;
|
||||
|
||||
caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
|
||||
caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
|
||||
bmi2;
|
||||
}
|
||||
|
||||
caps.f16c = (result[2] & ((int)1 << 29)) != 0;
|
||||
|
||||
__cpuid(result, 0x00000007);
|
||||
caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
|
||||
caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
|
||||
caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
|
||||
}
|
||||
|
||||
caps_init = true;
|
||||
|
@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities()
|
|||
bool system_cpu_support_sse2()
|
||||
{
|
||||
CPUCapabilities &caps = system_cpu_capabilities();
|
||||
return caps.sse && caps.sse2;
|
||||
return caps.sse2;
|
||||
}
|
||||
|
||||
bool system_cpu_support_sse3()
|
||||
{
|
||||
CPUCapabilities &caps = system_cpu_capabilities();
|
||||
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
|
||||
return caps.sse3;
|
||||
}
|
||||
|
||||
bool system_cpu_support_sse41()
|
||||
{
|
||||
CPUCapabilities &caps = system_cpu_capabilities();
|
||||
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
|
||||
return caps.sse41;
|
||||
}
|
||||
|
||||
bool system_cpu_support_avx()
|
||||
{
|
||||
CPUCapabilities &caps = system_cpu_capabilities();
|
||||
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
|
||||
return caps.avx;
|
||||
}
|
||||
|
||||
bool system_cpu_support_avx2()
|
||||
{
|
||||
CPUCapabilities &caps = system_cpu_capabilities();
|
||||
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c &&
|
||||
caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
|
||||
return caps.avx2;
|
||||
}
|
||||
#else
|
||||
|
||||
|
@ -264,26 +255,6 @@ bool system_cpu_support_avx2()
|
|||
|
||||
#endif
|
||||
|
||||
bool system_call_self(const vector<string> &args)
|
||||
{
|
||||
/* Escape program and arguments in case they contain spaces. */
|
||||
string cmd = "\"" + Sysutil::this_program_path() + "\"";
|
||||
|
||||
for (int i = 0; i < args.size(); i++) {
|
||||
cmd += " \"" + args[i] + "\"";
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
/* Use cmd /S to avoid issues with spaces in arguments. */
|
||||
cmd = "cmd /S /C \"" + cmd + " > nul \"";
|
||||
#else
|
||||
/* Quiet output. */
|
||||
cmd += " > /dev/null";
|
||||
#endif
|
||||
|
||||
return (system(cmd.c_str()) == 0);
|
||||
}
|
||||
|
||||
size_t system_physical_ram()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
|
|
|
@ -4,15 +4,17 @@
|
|||
#ifndef __UTIL_SYSTEM_H__
|
||||
#define __UTIL_SYSTEM_H__
|
||||
|
||||
#include "util/string.h"
|
||||
#include "util/vector.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Get width in characters of the current console output. */
|
||||
int system_console_width();
|
||||
|
||||
string system_cpu_brand_string();
|
||||
std::string system_cpu_brand_string();
|
||||
int system_cpu_bits();
|
||||
bool system_cpu_support_sse2();
|
||||
bool system_cpu_support_sse3();
|
||||
|
@ -22,9 +24,6 @@ bool system_cpu_support_avx2();
|
|||
|
||||
size_t system_physical_ram();
|
||||
|
||||
/* Start a new process of the current application with the given arguments. */
|
||||
bool system_call_self(const vector<string> &args);
|
||||
|
||||
/* Get identifier of the currently running process. */
|
||||
uint64_t system_self_process_id();
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
|
|||
|
||||
/* Transform Inverse */
|
||||
|
||||
static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
|
||||
static bool projection_matrix4_inverse(float R[][4], float M[][4])
|
||||
{
|
||||
/* SPDX-License-Identifier: BSD-3-Clause
|
||||
* Adapted from code:
|
||||
|
@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
|
|||
memcpy(R, &tfmR, sizeof(R));
|
||||
memcpy(M, &tfm, sizeof(M));
|
||||
|
||||
if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
|
||||
if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
|
||||
return projection_identity();
|
||||
}
|
||||
|
||||
|
|
|
@ -11,6 +11,10 @@
|
|||
#include "util/math.h"
|
||||
#include "util/types.h"
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
# include "util/system.h"
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Affine transformation, stored as 4x3 matrix. */
|
||||
|
@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
|
|||
float4 x, y, z, w;
|
||||
} DecomposedTransform;
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#include "util/transform_inverse.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Functions */
|
||||
|
||||
#ifdef __KERNEL_METAL__
|
||||
|
@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
|
|||
#endif /* defined(__KERNEL_GPU_RAYTRACING__) */
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
|
||||
void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
|
||||
#endif
|
||||
|
||||
ccl_device_inline Transform transform_inverse(const Transform tfm)
|
||||
{
|
||||
/* This implementation matches the one in Embree exactly, to ensure consistent
|
||||
* results with the ray intersection of instances. */
|
||||
float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
|
||||
float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
|
||||
float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
|
||||
float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
|
||||
|
||||
/* Compute determinant. */
|
||||
float det = dot(x, cross(y, z));
|
||||
|
||||
if (det == 0.0f) {
|
||||
/* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
|
||||
* never be in this situation, but try to invert it anyway with tweak.
|
||||
*
|
||||
* This logic does not match Embree which would just give an invalid
|
||||
* matrix. A better solution would be to remove this and ensure any object
|
||||
* matrix is valid. */
|
||||
x.x += 1e-8f;
|
||||
y.y += 1e-8f;
|
||||
z.z += 1e-8f;
|
||||
|
||||
det = dot(x, cross(y, z));
|
||||
if (det == 0.0f) {
|
||||
det = FLT_MAX;
|
||||
}
|
||||
/* Optimized transform implementations. */
|
||||
#ifndef __KERNEL_GPU__
|
||||
if (system_cpu_support_avx2()) {
|
||||
Transform itfm;
|
||||
transform_inverse_cpu_avx2(tfm, itfm);
|
||||
return itfm;
|
||||
}
|
||||
else if (system_cpu_support_sse41()) {
|
||||
Transform itfm;
|
||||
transform_inverse_cpu_sse41(tfm, itfm);
|
||||
return itfm;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
|
||||
const float3 inverse_x = cross(y, z) / det;
|
||||
const float3 inverse_y = cross(z, x) / det;
|
||||
const float3 inverse_z = cross(x, y) / det;
|
||||
|
||||
/* Compute translation and fill transform. */
|
||||
Transform itfm;
|
||||
itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
|
||||
itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
|
||||
itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
|
||||
|
||||
return itfm;
|
||||
return transform_inverse_impl(tfm);
|
||||
}
|
||||
|
||||
ccl_device_inline void transform_compose(ccl_private Transform *tfm,
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#include "util/transform.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
|
||||
{
|
||||
itfm = transform_inverse_impl(tfm);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,76 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#pragma once
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Custom cross and dot implementations that match Embree bit for bit.
|
||||
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
|
||||
* important to match exactly for ray tracing precision. */
|
||||
|
||||
ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
|
||||
{
|
||||
#ifdef __AVX2__
|
||||
const ssef sse_a = (const __m128 &)a;
|
||||
const ssef sse_b = (const __m128 &)b;
|
||||
const ssef r = shuffle<1, 2, 0, 3>(
|
||||
ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
|
||||
return (const float3 &)r;
|
||||
#endif
|
||||
|
||||
return cross(a, b);
|
||||
}
|
||||
|
||||
ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
|
||||
{
|
||||
#ifdef __SSE4_1__
|
||||
return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
|
||||
#endif
|
||||
|
||||
return dot(a, b);
|
||||
}
|
||||
|
||||
ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
|
||||
{
|
||||
/* This implementation matches the one in Embree exactly, to ensure consistent
|
||||
* results with the ray intersection of instances. */
|
||||
float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
|
||||
float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
|
||||
float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
|
||||
float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
|
||||
|
||||
/* Compute determinant. */
|
||||
float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
|
||||
|
||||
if (det == 0.0f) {
|
||||
/* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
|
||||
* never be in this situation, but try to invert it anyway with tweak.
|
||||
*
|
||||
* This logic does not match Embree which would just give an invalid
|
||||
* matrix. A better solution would be to remove this and ensure any object
|
||||
* matrix is valid. */
|
||||
x.x += 1e-8f;
|
||||
y.y += 1e-8f;
|
||||
z.z += 1e-8f;
|
||||
|
||||
det = transform_inverse_dot(x, cross(y, z));
|
||||
if (det == 0.0f) {
|
||||
det = FLT_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
/* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
|
||||
const float3 inverse_x = transform_inverse_cross(y, z) / det;
|
||||
const float3 inverse_y = transform_inverse_cross(z, x) / det;
|
||||
const float3 inverse_z = transform_inverse_cross(x, y) / det;
|
||||
|
||||
/* Compute translation and fill transform. */
|
||||
Transform itfm;
|
||||
itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
|
||||
itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
|
||||
itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
|
||||
|
||||
return itfm;
|
||||
}
|
||||
CCL_NAMESPACE_END
|
|
@ -0,0 +1,13 @@
|
|||
/* SPDX-License-Identifier: Apache-2.0
|
||||
* Copyright 2011-2022 Blender Foundation */
|
||||
|
||||
#include "util/transform.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
|
||||
{
|
||||
itfm = transform_inverse_impl(tfm);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@ -10,7 +10,6 @@
|
|||
|
||||
#include "util/aligned_malloc.h"
|
||||
#include "util/guarded_allocator.h"
|
||||
#include "util/types.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
|
|
Loading…
Reference in New Issue