Fix anisotropic Beckmann regression test failing on Metal

The lookup table method on CPU and the numerical root finding method on
GPU give quite different results. This commit deletes the Beckmann lookup
table and uses numerical root finding on all devices. For the numerical
root finding, a combined bisection-Newton method with precision control
is used.

Differential Revision: https://developer.blender.org/D17050
This commit is contained in:
Weizhen Huang 2023-01-19 20:02:35 +01:00
parent fa67b84c34
commit f71bfe4655
6 changed files with 30 additions and 166 deletions

View File

@ -71,7 +71,6 @@ ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals kg,
*G1i = G1;
#if defined(__KERNEL_GPU__)
/* Based on paper from Wenzel Jakob
* An Improved Visible Normal Sampling Routine for the Beckmann Distribution
*
@ -87,38 +86,38 @@ ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals kg,
* exp(-ierf(x)^2) ~= 1 - x * x
* solve y = 1 + b + K * (1 - b * b)
*/
float K = tan_theta_i * SQRT_PI_INV;
float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
float y_exact = randu * (1.0f + erf_a + K * exp_a2);
const float K = tan_theta_i * SQRT_PI_INV;
const float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
const float y_exact = randu * (1.0f + erf_a + K * exp_a2);
float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
/* Perform newton step to refine toward the true root. */
float inv_erf = fast_ierff(b);
float value = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
/* Check if we are close enough already,
* this also avoids NaNs as we get close to the root.
*/
if (fabsf(value) > 1e-6f) {
b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 1. */
inv_erf = fast_ierff(b);
value = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 2. */
/* Compute the slope from the refined value. */
*slope_x = fast_ierff(b);
}
else {
/* We are close enough already. */
*slope_x = inv_erf;
}
*slope_y = fast_ierff(2.0f * randv - 1.0f);
#else
/* Use precomputed table on CPU, it gives better performance. */
int beckmann_table_offset = kernel_data.tables.beckmann_offset;
float2 begin = make_float2(-1.0f, -y_exact);
float2 end = make_float2(erf_a, 1.0f + erf_a + K * exp_a2 - y_exact);
float2 current = make_float2(b, 1.0f + b + K * expf(-sqr(inv_erf)) - y_exact);
*slope_x = lookup_table_read_2D(
kg, randu, cos_theta_i, beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE);
/* Find root in a monotonic interval using newton method, under given precision and maximal
* iterations. Falls back to bisection if newton step produces results outside of the valid
* interval.*/
const float precision = 1e-6f;
const int max_iter = 3;
int iter = 0;
while (fabsf(current.y) > precision && iter++ < max_iter) {
if (signf(begin.y) == signf(current.y)) {
begin.x = current.x;
begin.y = current.y;
}
else {
end.x = current.x;
}
const float newton_x = current.x - current.y / (1.0f - inv_erf * tan_theta_i);
current.x = (newton_x >= begin.x && newton_x <= end.x) ? newton_x : 0.5f * (begin.x + end.x);
inv_erf = fast_ierff(current.x);
current.y = 1.0f + current.x + K * expf(-sqr(inv_erf)) - y_exact;
}
*slope_x = inv_erf;
*slope_y = fast_ierff(2.0f * randv - 1.0f);
#endif
}
/* GGX microfacet importance sampling from:

View File

@ -34,8 +34,6 @@ CCL_NAMESPACE_BEGIN
#define VOLUME_BOUNDS_MAX 1024
#define BECKMANN_TABLE_SIZE 256
#define SHADER_NONE (~0)
#define OBJECT_NONE (~0)
#define PRIM_NONE (~0)
@ -1187,9 +1185,8 @@ typedef enum KernelBVHLayout {
#include "kernel/data_template.h"
typedef struct KernelTables {
int beckmann_offset;
int filter_table_offset;
int pad1, pad2;
int pad1, pad2, pad3;
} KernelTables;
static_assert_align(KernelTables, 16);

View File

@ -32,114 +32,6 @@ namespace OCIO = OCIO_NAMESPACE;
CCL_NAMESPACE_BEGIN
thread_mutex ShaderManager::lookup_table_mutex;
vector<float> ShaderManager::beckmann_table;
bool ShaderManager::beckmann_table_ready = false;
/* Beckmann sampling precomputed table, see bsdf_microfacet.h */
/* 2D slope distribution (alpha = 1.0) */
static float beckmann_table_P22(const float slope_x, const float slope_y)
{
return expf(-(slope_x * slope_x + slope_y * slope_y));
}
/* maximal slope amplitude (range that contains 99.99% of the distribution) */
static float beckmann_table_slope_max()
{
return 6.0;
}
/* MSVC 2015 needs this ugly hack to prevent a codegen bug on x86
* see T50176 for details
*/
#if defined(_MSC_VER) && (_MSC_VER == 1900)
# define MSVC_VOLATILE volatile
#else
# define MSVC_VOLATILE
#endif
/* Paper used: Importance Sampling Microfacet-Based BSDFs with the
* Distribution of Visible Normals. Supplemental Material 2/2.
*
* http://hal.inria.fr/docs/01/00/66/20/ANNEX/supplemental2.pdf
*/
static void beckmann_table_rows(float *table, int row_from, int row_to)
{
/* allocate temporary data */
const int DATA_TMP_SIZE = 512;
vector<double> slope_x(DATA_TMP_SIZE);
vector<double> CDF_P22_omega_i(DATA_TMP_SIZE);
/* loop over incident directions */
for (int index_theta = row_from; index_theta < row_to; index_theta++) {
/* incident vector */
const float cos_theta = index_theta / (BECKMANN_TABLE_SIZE - 1.0f);
const float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
/* for a given incident vector
* integrate P22_{omega_i}(x_slope, 1, 1), Eq. (10) */
slope_x[0] = (double)-beckmann_table_slope_max();
CDF_P22_omega_i[0] = 0;
for (MSVC_VOLATILE int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) {
/* slope_x */
slope_x[index_slope_x] = (double)(-beckmann_table_slope_max() +
2.0f * beckmann_table_slope_max() * index_slope_x /
(DATA_TMP_SIZE - 1.0f));
/* dot product with incident vector */
float dot_product = fmaxf(0.0f, -(float)slope_x[index_slope_x] * sin_theta + cos_theta);
/* marginalize P22_{omega_i}(x_slope, 1, 1), Eq. (10) */
float P22_omega_i = 0.0f;
for (int j = 0; j < 100; ++j) {
float slope_y = -beckmann_table_slope_max() +
2.0f * beckmann_table_slope_max() * j * (1.0f / 99.0f);
P22_omega_i += dot_product * beckmann_table_P22((float)slope_x[index_slope_x], slope_y);
}
/* CDF of P22_{omega_i}(x_slope, 1, 1), Eq. (10) */
CDF_P22_omega_i[index_slope_x] = CDF_P22_omega_i[index_slope_x - 1] + (double)P22_omega_i;
}
/* renormalize CDF_P22_omega_i */
for (int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x)
CDF_P22_omega_i[index_slope_x] /= CDF_P22_omega_i[DATA_TMP_SIZE - 1];
/* loop over random number U1 */
int index_slope_x = 0;
for (int index_U = 0; index_U < BECKMANN_TABLE_SIZE; ++index_U) {
const double U = 0.0000001 + 0.9999998 * index_U / (double)(BECKMANN_TABLE_SIZE - 1);
/* inverse CDF_P22_omega_i, solve Eq.(11) */
while (CDF_P22_omega_i[index_slope_x] <= U)
++index_slope_x;
const double interp = (CDF_P22_omega_i[index_slope_x] - U) /
(CDF_P22_omega_i[index_slope_x] - CDF_P22_omega_i[index_slope_x - 1]);
/* store value */
table[index_U + index_theta * BECKMANN_TABLE_SIZE] =
(float)(interp * slope_x[index_slope_x - 1] + (1.0 - interp) * slope_x[index_slope_x]);
}
}
}
#undef MSVC_VOLATILE
static void beckmann_table_build(vector<float> &table)
{
table.resize(BECKMANN_TABLE_SIZE * BECKMANN_TABLE_SIZE);
/* multithreaded build */
TaskPool pool;
for (int i = 0; i < BECKMANN_TABLE_SIZE; i += 8)
pool.push(function_bind(&beckmann_table_rows, &table[0], i, i + 8));
pool.wait_work();
}
/* Shader */
@ -491,7 +383,6 @@ bool Shader::need_update_geometry() const
ShaderManager::ShaderManager()
{
update_flags = UPDATE_ALL;
beckmann_table_offset = TABLE_OFFSET_INVALID;
init_xyz_transforms();
}
@ -663,22 +554,6 @@ void ShaderManager::device_update_common(Device * /*device*/,
dscene->shaders.copy_to_device();
/* lookup tables */
KernelTables *ktables = &dscene->data.tables;
/* beckmann lookup table */
if (beckmann_table_offset == TABLE_OFFSET_INVALID) {
if (!beckmann_table_ready) {
thread_scoped_lock lock(lookup_table_mutex);
if (!beckmann_table_ready) {
beckmann_table_build(beckmann_table);
beckmann_table_ready = true;
}
}
beckmann_table_offset = scene->lookup_tables->add_table(dscene, beckmann_table);
}
ktables->beckmann_offset = (int)beckmann_table_offset;
/* integrator */
KernelIntegrator *kintegrator = &dscene->data.integrator;
kintegrator->use_volumes = has_volumes;
@ -700,8 +575,6 @@ void ShaderManager::device_update_common(Device * /*device*/,
void ShaderManager::device_free_common(Device *, DeviceScene *dscene, Scene *scene)
{
scene->lookup_tables->remove_table(&beckmann_table_offset);
dscene->shaders.free();
}
@ -844,7 +717,6 @@ uint ShaderManager::get_kernel_features(Scene *scene)
void ShaderManager::free_memory()
{
beckmann_table.free_memory();
#ifdef WITH_OSL
OSLShaderManager::free_memory();

View File

@ -232,10 +232,6 @@ class ShaderManager {
AttributeIDMap unique_attribute_id;
static thread_mutex lookup_table_mutex;
static vector<float> beckmann_table;
static bool beckmann_table_ready;
size_t beckmann_table_offset;
uint get_graph_kernel_features(ShaderGraph *graph);

@ -1 +1 @@
Subproject commit bf49eeaa14c445d3c53068203fdf91bff568fe64
Subproject commit 6fcd157f2497d9ba4ba82191cb2abf3de11a0394

@ -1 +1 @@
Subproject commit 0f72f6c85c3743a9072273acb6a8a34b1cf1064b
Subproject commit 9d538629bb8a425991c7d10a49bab1ba0788c18f