Merge branch 'master' into blender2.8

This commit is contained in:
Sergey Sharybin 2018-08-29 16:09:59 +02:00
commit 31278eb4bc
40 changed files with 5088 additions and 67 deletions

View File

@ -63,6 +63,11 @@ else()
set(BOOST_BUILD_COMMAND ./b2)
set(BOOST_BUILD_OPTIONS cxxflags=${PLATFORM_CXXFLAGS} --disable-icu boost.locale.icu=off)
set(BOOST_PATCH_COMMAND echo .)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(BOOST_ADDRESS_MODEL 64)
else()
set(BOOST_ADDRESS_MODEL 32)
endif()
endif()
set(BOOST_OPTIONS

View File

@ -45,7 +45,6 @@ if(WITH_OPENAL)
find_package(OpenAL)
if(OPENAL_FOUND)
set(WITH_OPENAL ON)
set(OPENAL_INCLUDE_DIR "${LIBDIR}/openal/include/AL")
else()
set(WITH_OPENAL OFF)
endif()

View File

@ -30,7 +30,10 @@ set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/${LIBDIR_NAME})
if(EXISTS ${LIBDIR})
file(GLOB LIB_SUBDIRS ${LIBDIR}/*)
set(CMAKE_PREFIX_PATH ${LIB_SUBDIRS})
# NOTE: Make sure the "proper" compiled zlib comes before the one
# which is part of OpenCollada. They have different ABIs, and we
# do need to use the official one.
set(CMAKE_PREFIX_PATH ${LIBDIR}/zlib ${LIB_SUBDIRS})
set(WITH_STATIC_LIBS ON)
set(WITH_OPENMP_STATIC ON)
endif()

View File

@ -54,6 +54,7 @@ enum_displacement_methods = (
enum_bvh_layouts = (
('BVH2', "BVH2", "", 1),
('BVH4', "BVH4", "", 2),
('BVH8', "BVH8", "", 4),
)
enum_bvh_types = (
@ -678,7 +679,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
debug_bvh_layout: EnumProperty(
name="BVH Layout",
items=enum_bvh_layouts,
default='BVH4',
default='BVH8',
)
debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False)

View File

@ -635,7 +635,15 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
params.texture_limit = 0;
}
params.bvh_layout = DebugFlags().cpu.bvh_layout;
/* TODO(sergey): Once OSL supports per-microarchitecture optimization, get
* rid of this.
*/
if (params.shadingsystem == SHADINGSYSTEM_OSL) {
params.bvh_layout = BVH_LAYOUT_BVH4;
}
else {
params.bvh_layout = DebugFlags().cpu.bvh_layout;
}
return params;
}
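For context, a minimal sketch of the fallback above as a standalone helper (hypothetical, not part of this patch; the name clamp_bvh_layout_for_osl is invented for illustration):

static BVHLayout clamp_bvh_layout_for_osl(BVHLayout requested,
                                          ShadingSystem shadingsystem)
{
	/* OSL is not compiled per-microarchitecture, so the 8-wide (AVX2)
	 * traversal is not available there; fall back to the 4-wide layout. */
	if(shadingsystem == SHADINGSYSTEM_OSL) {
		return BVH_LAYOUT_BVH4;
	}
	return requested;
}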

View File

@ -10,6 +10,7 @@ set(SRC
bvh.cpp
bvh2.cpp
bvh4.cpp
bvh8.cpp
bvh_binning.cpp
bvh_build.cpp
bvh_node.cpp
@ -22,6 +23,7 @@ set(SRC_HEADERS
bvh.h
bvh2.h
bvh4.h
bvh8.h
bvh_binning.h
bvh_build.h
bvh_node.h

View File

@ -22,6 +22,7 @@
#include "bvh/bvh2.h"
#include "bvh/bvh4.h"
#include "bvh/bvh8.h"
#include "bvh/bvh_build.h"
#include "bvh/bvh_node.h"
@ -38,6 +39,7 @@ const char *bvh_layout_name(BVHLayout layout)
switch(layout) {
case BVH_LAYOUT_BVH2: return "BVH2";
case BVH_LAYOUT_BVH4: return "BVH4";
case BVH_LAYOUT_BVH8: return "BVH8";
case BVH_LAYOUT_NONE: return "NONE";
case BVH_LAYOUT_ALL: return "ALL";
}
@ -92,6 +94,8 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
return new BVH2(params, objects);
case BVH_LAYOUT_BVH4:
return new BVH4(params, objects);
case BVH_LAYOUT_BVH8:
return new BVH8(params, objects);
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
break;
@ -215,6 +219,38 @@ void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility)
}
}
visibility |= ob->visibility_for_tracing();
}
}
bool BVH::leaf_check(const BVHNode *node, BVH_TYPE bvh)
{
if(node->is_leaf()) {
return node->is_unaligned;
}
else {
return node_is_unaligned(node, bvh);
}
}
bool BVH::node_is_unaligned(const BVHNode *node, BVH_TYPE bvh)
{
const BVHNode *node0 = node->get_child(0);
const BVHNode *node1 = node->get_child(1);
switch(bvh) {
case bvh2:
return node0->is_unaligned || node1->is_unaligned;
break;
case bvh4:
return leaf_check(node0, bvh2) || leaf_check(node1, bvh2);
break;
case bvh8:
return leaf_check(node0, bvh4) || leaf_check(node1, bvh4);
break;
default:
assert(0);
return false;
}
}
@ -291,8 +327,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
* BVH's are stored in global arrays. This function merges them into the
* top level BVH, adjusting indexes and offsets where appropriate.
*/
/* TODO(sergey): This code needs adjustment for wider BVH than 4. */
const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);
/* Adjust primitive index to point to the triangle in the global array, for
* meshes with transform applied and already in the top level BVH.
@ -469,14 +505,26 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
size_t nsize, nsize_bbox;
if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
nsize = use_qbvh
? BVH_UNALIGNED_QNODE_SIZE
: BVH_UNALIGNED_NODE_SIZE;
nsize_bbox = (use_qbvh)? 13: 0;
if(use_obvh) {
nsize = BVH_UNALIGNED_ONODE_SIZE;
nsize_bbox = BVH_UNALIGNED_ONODE_SIZE-1;
}
else {
nsize = use_qbvh
? BVH_UNALIGNED_QNODE_SIZE
: BVH_UNALIGNED_NODE_SIZE;
nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE-1 : 0;
}
}
else {
nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
nsize_bbox = (use_qbvh)? 7: 0;
if(use_obvh) {
nsize = BVH_ONODE_SIZE;
nsize_bbox = BVH_ONODE_SIZE-1;
}
else {
nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
nsize_bbox = (use_qbvh)? BVH_QNODE_SIZE-1 : 0;
}
}
memcpy(pack_nodes + pack_nodes_offset,
@ -485,16 +533,29 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
/* Modify offsets into arrays */
int4 data = bvh_nodes[i + nsize_bbox];
data.z += (data.z < 0)? -noffset_leaf: noffset;
data.w += (data.w < 0)? -noffset_leaf: noffset;
if(use_qbvh) {
data.x += (data.x < 0)? -noffset_leaf: noffset;
data.y += (data.y < 0)? -noffset_leaf: noffset;
int4 data1 = bvh_nodes[i + nsize_bbox-1];
if(use_obvh) {
data.z += (data.z < 0) ? -noffset_leaf : noffset;
data.w += (data.w < 0) ? -noffset_leaf : noffset;
data.x += (data.x < 0) ? -noffset_leaf : noffset;
data.y += (data.y < 0) ? -noffset_leaf : noffset;
data1.z += (data1.z < 0) ? -noffset_leaf : noffset;
data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
}
else {
data.z += (data.z < 0) ? -noffset_leaf : noffset;
data.w += (data.w < 0) ? -noffset_leaf : noffset;
if(use_qbvh) {
data.x += (data.x < 0)? -noffset_leaf: noffset;
data.y += (data.y < 0)? -noffset_leaf: noffset;
}
}
pack_nodes[pack_nodes_offset + nsize_bbox] = data;
if(use_obvh) {
pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
}
/* Usually this copies nothing, but we better
* be prepared for possible node size extension.
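The node stride selection above is duplicated across the aligned and unaligned branches; a hypothetical refactor (editorial, not in the patch) that mirrors the same logic in one place:

static void bvh_instance_node_stride(bool use_obvh, bool use_qbvh,
                                     bool is_unaligned,
                                     size_t *nsize, size_t *nsize_bbox)
{
	/* nsize is the node stride in float4 units, nsize_bbox the offset of
	 * the int4 carrying the child references for the active layout. */
	if(is_unaligned) {
		if(use_obvh) {
			*nsize = BVH_UNALIGNED_ONODE_SIZE;
			*nsize_bbox = BVH_UNALIGNED_ONODE_SIZE - 1;
		}
		else {
			*nsize = use_qbvh ? BVH_UNALIGNED_QNODE_SIZE : BVH_UNALIGNED_NODE_SIZE;
			*nsize_bbox = use_qbvh ? BVH_UNALIGNED_QNODE_SIZE - 1 : 0;
		}
	}
	else {
		if(use_obvh) {
			*nsize = BVH_ONODE_SIZE;
			*nsize_bbox = BVH_ONODE_SIZE - 1;
		}
		else {
			*nsize = use_qbvh ? BVH_QNODE_SIZE : BVH_NODE_SIZE;
			*nsize_bbox = use_qbvh ? BVH_QNODE_SIZE - 1 : 0;
		}
	}
}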

View File

@ -73,6 +73,12 @@ struct PackedBVH {
}
};
enum BVH_TYPE {
bvh2,
bvh4,
bvh8
};
/* BVH */
class BVH
@ -93,6 +99,8 @@ protected:
/* Refit range of primitives. */
void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility);
static __forceinline bool leaf_check(const BVHNode *node, BVH_TYPE bvh);
static bool node_is_unaligned(const BVHNode *node, BVH_TYPE bvh);
/* triangles and strands */
void pack_primitives();

View File

@ -25,13 +25,6 @@
CCL_NAMESPACE_BEGIN
static bool node_bvh_is_unaligned(const BVHNode *node)
{
const BVHNode *node0 = node->get_child(0),
*node1 = node->get_child(1);
return node0->is_unaligned || node1->is_unaligned;
}
BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
{
@ -195,7 +188,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
nextNodeIdx += node_bvh_is_unaligned(root)
nextNodeIdx += node_is_unaligned(root, bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}
@ -218,7 +211,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
idx[i] = nextNodeIdx;
nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
nextNodeIdx += node_is_unaligned(e.node->get_child(i), bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}

View File

@ -30,27 +30,6 @@ CCL_NAMESPACE_BEGIN
* Perhaps we can merge nodes in actual tree and make our
* life easier all over the place.
*/
static bool node_qbvh_is_unaligned(const BVHNode *node)
{
const BVHNode *node0 = node->get_child(0),
*node1 = node->get_child(1);
bool has_unaligned = false;
if(node0->is_leaf()) {
has_unaligned |= node0->is_unaligned;
}
else {
has_unaligned |= node0->get_child(0)->is_unaligned;
has_unaligned |= node0->get_child(1)->is_unaligned;
}
if(node1->is_leaf()) {
has_unaligned |= node1->is_unaligned;
}
else {
has_unaligned |= node1->get_child(0)->is_unaligned;
has_unaligned |= node1->get_child(1)->is_unaligned;
}
return has_unaligned;
}
BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
@ -304,7 +283,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
nextNodeIdx += node_qbvh_is_unaligned(root)
nextNodeIdx += node_is_unaligned(root, bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}
@ -348,7 +327,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
idx = nextNodeIdx;
nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
nextNodeIdx += node_is_unaligned(nodes[i], bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}
@ -438,7 +417,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
4);
num_nodes);
}
else {
pack_aligned_node(idx,
@ -447,7 +426,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
4);
num_nodes);
}
}
}

515
intern/cycles/bvh/bvh8.cpp Normal file
View File

@ -0,0 +1,515 @@
/*
Copyright (c) 2017, Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bvh/bvh8.h"
#include "render/mesh.h"
#include "render/object.h"
#include "bvh/bvh_node.h"
#include "bvh/bvh_unaligned.h"
CCL_NAMESPACE_BEGIN
BVH8::BVH8(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
{
}
void BVH8::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
{
float4 data[BVH_ONODE_LEAF_SIZE];
memset(data, 0, sizeof(data));
if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
/* object */
data[0].x = __int_as_float(~(leaf->lo));
data[0].y = __int_as_float(0);
}
else {
/* triangle */
data[0].x = __int_as_float(leaf->lo);
data[0].y = __int_as_float(leaf->hi);
}
data[0].z = __uint_as_float(leaf->visibility);
if(leaf->num_triangles() != 0) {
data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
}
memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
}
void BVH8::pack_inner(const BVHStackEntry& e,
const BVHStackEntry *en,
int num)
{
bool has_unaligned = false;
/* Check whether we have to create an unaligned node, or whether all children
* are aligned and we can cut some corners here.
*/
if(params.use_unaligned_nodes) {
for(int i = 0; i < num; i++) {
if(en[i].node->is_unaligned) {
has_unaligned = true;
break;
}
}
}
if(has_unaligned) {
/* Create an unaligned node with an orientation transform for each of
* the children.
*/
pack_unaligned_inner(e, en, num);
}
else {
/* All children are aligned, pack into a plain AABB node. */
pack_aligned_inner(e, en, num);
}
}
void BVH8::pack_aligned_inner(const BVHStackEntry& e,
const BVHStackEntry *en,
int num)
{
BoundBox bounds[8];
int child[8];
for(int i = 0; i < num; ++i) {
bounds[i] = en[i].node->bounds;
child[i] = en[i].encodeIdx();
}
pack_aligned_node(e.idx,
bounds,
child,
e.node->visibility,
e.node->time_from,
e.node->time_to,
num);
}
void BVH8::pack_aligned_node(int idx,
const BoundBox *bounds,
const int *child,
const uint visibility,
const float time_from,
const float time_to,
const int num)
{
float8 data[8];
memset(data, 0, sizeof(data));
data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
data[0].b = time_from;
data[0].c = time_to;
for(int i = 0; i < num; i++) {
float3 bb_min = bounds[i].min;
float3 bb_max = bounds[i].max;
data[1][i] = bb_min.x;
data[2][i] = bb_max.x;
data[3][i] = bb_min.y;
data[4][i] = bb_max.y;
data[5][i] = bb_min.z;
data[6][i] = bb_max.z;
data[7][i] = __int_as_float(child[i]);
}
for(int i = num; i < 8; i++) {
/* We store a bounding box which will never be recorded as an intersection,
* so the kernel can safely assume there are always 8 child nodes.
*/
data[1][i] = FLT_MAX;
data[2][i] = -FLT_MAX;
data[3][i] = FLT_MAX;
data[4][i] = -FLT_MAX;
data[5][i] = FLT_MAX;
data[6][i] = -FLT_MAX;
data[7][i] = __int_as_float(0);
}
memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE);
}
void BVH8::pack_unaligned_inner(const BVHStackEntry& e,
const BVHStackEntry *en,
int num)
{
Transform aligned_space[8];
BoundBox bounds[8];
int child[8];
for(int i = 0; i < num; ++i) {
aligned_space[i] = en[i].node->get_aligned_space();
bounds[i] = en[i].node->bounds;
child[i] = en[i].encodeIdx();
}
pack_unaligned_node(e.idx,
aligned_space,
bounds,
child,
e.node->visibility,
e.node->time_from,
e.node->time_to,
num);
}
void BVH8::pack_unaligned_node(int idx,
const Transform *aligned_space,
const BoundBox *bounds,
const int *child,
const uint visibility,
const float time_from,
const float time_to,
const int num)
{
float8 data[BVH_UNALIGNED_ONODE_SIZE];
memset(data, 0, sizeof(data));
data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
data[0].b = time_from;
data[0].c = time_to;
for(int i = 0; i < num; i++) {
Transform space = BVHUnaligned::compute_node_transform(
bounds[i],
aligned_space[i]);
data[1][i] = space.x.x;
data[2][i] = space.x.y;
data[3][i] = space.x.z;
data[4][i] = space.y.x;
data[5][i] = space.y.y;
data[6][i] = space.y.z;
data[7][i] = space.z.x;
data[8][i] = space.z.y;
data[9][i] = space.z.z;
data[10][i] = space.x.w;
data[11][i] = space.y.w;
data[12][i] = space.z.w;
data[13][i] = __int_as_float(child[i]);
}
for(int i = num; i < 8; i++) {
/* We store a bounding box which will never be recorded as an intersection,
* so the kernel can safely assume there are always 8 child nodes.
*/
data[1][i] = 1.0f;
data[2][i] = 0.0f;
data[3][i] = 0.0f;
data[4][i] = 0.0f;
data[5][i] = 0.0f;
data[6][i] = 0.0f;
data[7][i] = 0.0f;
data[8][i] = 0.0f;
data[9][i] = 0.0f;
data[10][i] = -FLT_MAX;
data[11][i] = -FLT_MAX;
data[12][i] = -FLT_MAX;
data[13][i] = __int_as_float(0);
}
memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_ONODE_SIZE);
}
/* Octo SIMD Nodes */
void BVH8::pack_nodes(const BVHNode *root)
{
/* Calculate size of the arrays required. */
const size_t num_nodes = root->getSubtreeSize(BVH_STAT_ONODE_COUNT);
const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
assert(num_leaf_nodes <= num_nodes);
const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
size_t node_size;
if(params.use_unaligned_nodes) {
const size_t num_unaligned_nodes =
root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_ONODE_COUNT);
node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) +
(num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE;
}
else {
node_size = num_inner_nodes * BVH_ONODE_SIZE;
}
/* Resize arrays. */
pack.nodes.clear();
pack.leaf_nodes.clear();
/* For top level BVH, first merge existing BVH's so we know the offsets. */
if(params.top_level) {
pack_instances(node_size, num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
}
else {
pack.nodes.resize(node_size);
pack.leaf_nodes.resize(num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
}
int nextNodeIdx = 0, nextLeafNodeIdx = 0;
vector<BVHStackEntry> stack;
stack.reserve(BVHParams::MAX_DEPTH*2);
if(root->is_leaf()) {
stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
nextNodeIdx += node_is_unaligned(root, bvh8)
? BVH_UNALIGNED_ONODE_SIZE
: BVH_ONODE_SIZE;
}
while(stack.size()) {
BVHStackEntry e = stack.back();
stack.pop_back();
if(e.node->is_leaf()) {
/* leaf node */
const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
pack_leaf(e, leaf);
}
else {
/* Inner node. */
const BVHNode *node = e.node;
const BVHNode *node0 = node->get_child(0);
const BVHNode *node1 = node->get_child(1);
/* Collect nodes. */
const BVHNode *nodes[8];
int numnodes = 0;
if(node0->is_leaf()) {
nodes[numnodes++] = node0;
}
else {
const BVHNode *node00 = node0->get_child(0),
*node01 = node0->get_child(1);
if(node00->is_leaf()) {
nodes[numnodes++] = node00;
}
else {
nodes[numnodes++] = node00->get_child(0);
nodes[numnodes++] = node00->get_child(1);
}
if(node01->is_leaf()) {
nodes[numnodes++] = node01;
}
else {
nodes[numnodes++] = node01->get_child(0);
nodes[numnodes++] = node01->get_child(1);
}
}
if(node1->is_leaf()) {
nodes[numnodes++] = node1;
}
else {
const BVHNode *node10 = node1->get_child(0),
*node11 = node1->get_child(1);
if(node10->is_leaf()) {
nodes[numnodes++] = node10;
}
else {
nodes[numnodes++] = node10->get_child(0);
nodes[numnodes++] = node10->get_child(1);
}
if(node11->is_leaf()) {
nodes[numnodes++] = node11;
}
else {
nodes[numnodes++] = node11->get_child(0);
nodes[numnodes++] = node11->get_child(1);
}
}
/* Push entries on the stack. */
for(int i = 0; i < numnodes; ++i) {
int idx;
if(nodes[i]->is_leaf()) {
idx = nextLeafNodeIdx++;
}
else {
idx = nextNodeIdx;
nextNodeIdx += node_is_unaligned(nodes[i], bvh8)
? BVH_UNALIGNED_ONODE_SIZE
: BVH_ONODE_SIZE;
}
stack.push_back(BVHStackEntry(nodes[i], idx));
}
/* Set node. */
pack_inner(e, &stack[stack.size() - numnodes], numnodes);
}
}
assert(node_size == nextNodeIdx);
/* Root index to start traversal at, to handle case of single leaf node. */
pack.root_index = (root->is_leaf()) ? -1 : 0;
}
void BVH8::refit_nodes()
{
assert(!params.top_level);
BoundBox bbox = BoundBox::empty;
uint visibility = 0;
refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
}
void BVH8::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
{
if(leaf) {
int4 *data = &pack.leaf_nodes[idx];
int4 c = data[0];
/* Refit leaf node. */
for(int prim = c.x; prim < c.y; prim++) {
int pidx = pack.prim_index[prim];
int tob = pack.prim_object[prim];
Object *ob = objects[tob];
if(pidx == -1) {
/* Object instance. */
bbox.grow(ob->bounds);
}
else {
/* Primitives. */
const Mesh *mesh = ob->mesh;
if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
/* Curves. */
int str_offset = (params.top_level) ? mesh->curve_offset : 0;
Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
visibility |= PATH_RAY_CURVE;
/* Motion curves. */
if(mesh->use_motion_blur) {
Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if(attr) {
size_t mesh_size = mesh->curve_keys.size();
size_t steps = mesh->motion_steps - 1;
float3 *key_steps = attr->data_float3();
for(size_t i = 0; i < steps; i++) {
curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
}
}
}
}
else {
/* Triangles. */
int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
const float3 *vpos = &mesh->verts[0];
triangle.bounds_grow(vpos, bbox);
/* Motion triangles. */
if(mesh->use_motion_blur) {
Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if(attr) {
size_t mesh_size = mesh->verts.size();
size_t steps = mesh->motion_steps - 1;
float3 *vert_steps = attr->data_float3();
for(size_t i = 0; i < steps; i++) {
triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
}
}
}
}
}
visibility |= ob->visibility;
}
float4 leaf_data[BVH_ONODE_LEAF_SIZE];
leaf_data[0].x = __int_as_float(c.x);
leaf_data[0].y = __int_as_float(c.y);
leaf_data[0].z = __uint_as_float(visibility);
leaf_data[0].w = __uint_as_float(c.w);
memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
}
else {
int4 *data = &pack.nodes[idx];
bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
int4 c;
if(is_unaligned) {
c = data[BVH_UNALIGNED_ONODE_SIZE-1];
}
else {
c = data[BVH_ONODE_SIZE-1];
}
/* Refit inner node, set bbox from children. */
BoundBox child_bbox[8] = { BoundBox::empty, BoundBox::empty,
BoundBox::empty, BoundBox::empty,
BoundBox::empty, BoundBox::empty,
BoundBox::empty, BoundBox::empty };
uint child_visibility[8] = { 0 };
int num_nodes = 0;
for(int i = 0; i < 8; ++i) {
if(c[i] != 0) {
refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
child_bbox[i], child_visibility[i]);
++num_nodes;
bbox.grow(child_bbox[i]);
visibility |= child_visibility[i];
}
}
if(is_unaligned) {
Transform aligned_space[8] = { transform_identity(), transform_identity(),
transform_identity(), transform_identity(),
transform_identity(), transform_identity(),
transform_identity(), transform_identity()};
pack_unaligned_node(idx,
aligned_space,
child_bbox,
&c[0],
visibility,
0.0f,
1.0f,
num_nodes);
}
else {
pack_aligned_node(idx,
child_bbox,
&c[0],
visibility,
0.0f,
1.0f,
num_nodes);
}
}
}
CCL_NAMESPACE_END
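A minimal usage sketch (editorial; assumes a populated vector<Object*> named objects): the factory added to bvh.cpp earlier in this commit returns a BVH8 when the 8-wide layout is requested, which per the device_cpu.cpp change below is only advertised on AVX2-capable CPUs.

BVHParams bparams;
bparams.bvh_layout = BVH_LAYOUT_BVH8;
bparams.use_unaligned_nodes = false;
BVH *bvh = BVH::create(bparams, objects);  /* dispatches to new BVH8(...) */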

97
intern/cycles/bvh/bvh8.h Normal file
View File

@ -0,0 +1,97 @@
/*
Copyright (c) 2017, Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __BVH8_H__
#define __BVH8_H__
#include "bvh/bvh.h"
#include "bvh/bvh_params.h"
#include "util/util_types.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class BVHNode;
struct BVHStackEntry;
class BVHParams;
class BoundBox;
class LeafNode;
class Object;
class Progress;
#define BVH_ONODE_SIZE 16
#define BVH_ONODE_LEAF_SIZE 1
#define BVH_UNALIGNED_ONODE_SIZE 28
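The three sizes above are measured in float4 units. As an editorial sanity sketch (assuming float8 packs eight floats, i.e. two float4, as in the util_types_float8.h header added by this commit), they follow from the row layouts written by pack_aligned_node() and pack_unaligned_node() in bvh8.cpp:

/* Aligned 8-wide node: 8 float8 rows (header, six bound rows, child indices).
 * Unaligned 8-wide node: 14 float8 rows (header, twelve transform rows,
 * child indices). Leaf data fits in a single float4. */
static_assert(BVH_ONODE_SIZE == 8 * 2, "aligned OBVH node is 8 float8 rows");
static_assert(BVH_UNALIGNED_ONODE_SIZE == 14 * 2, "unaligned OBVH node is 14 float8 rows");
static_assert(BVH_ONODE_LEAF_SIZE == 1, "leaf data is one float4");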
/* BVH8
*
* Octo BVH, with each node having eight children, to use with SIMD instructions.
*/
class BVH8 : public BVH {
protected:
/* constructor */
friend class BVH;
BVH8(const BVHParams& params, const vector<Object*>& objects);
/* pack */
void pack_nodes(const BVHNode *root);
void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
void pack_aligned_inner(const BVHStackEntry& e,
const BVHStackEntry *en,
int num);
void pack_aligned_node(int idx,
const BoundBox *bounds,
const int *child,
const uint visibility,
const float time_from,
const float time_to,
const int num);
void pack_unaligned_inner(const BVHStackEntry& e,
const BVHStackEntry *en,
int num);
void pack_unaligned_node(int idx,
const Transform *aligned_space,
const BoundBox *bounds,
const int *child,
const uint visibility,
const float time_from,
const float time_to,
const int num);
/* refit */
void refit_nodes();
void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
};
CCL_NAMESPACE_END
#endif /* __BVH8_H__ */

View File

@ -61,6 +61,55 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
}
}
return cnt;
case BVH_STAT_ONODE_COUNT:
cnt = 1;
for(int i = 0; i < num_children(); i++) {
BVHNode *node = get_child(i);
if(node->is_leaf()) {
cnt += 1;
}
else {
for(int j = 0; j < node->num_children(); j++)
{
BVHNode *node_next = node->get_child(j);
if(node_next->is_leaf()) {
cnt += 1;
}
else {
for(int k = 0; k < node_next->num_children(); k++) {
cnt += node_next->get_child(k)->getSubtreeSize(stat);
}
}
}
}
}
return cnt;
case BVH_STAT_UNALIGNED_INNER_ONODE_COUNT:
{
bool has_unaligned = false;
for(int i = 0; i < num_children(); i++) {
BVHNode *node = get_child(i);
if(node->is_leaf()) {
has_unaligned |= node->is_unaligned;
}
else {
for(int j = 0; j < node->num_children(); j++) {
BVHNode *node_next = node->get_child(j);
if(node_next->is_leaf()) {
has_unaligned |= node_next->is_unaligned;
}
else {
for(int k = 0; k < node_next->num_children(); k++) {
cnt += node_next->get_child(k)->getSubtreeSize(stat);
has_unaligned |= node_next->get_child(k)->is_unaligned;
}
}
}
}
}
cnt += has_unaligned? 1: 0;
}
return cnt;
case BVH_STAT_ALIGNED_COUNT:
if(!is_unaligned) {
cnt = 1;

View File

@ -39,6 +39,8 @@ enum BVH_STAT {
BVH_STAT_ALIGNED_LEAF_COUNT,
BVH_STAT_UNALIGNED_LEAF_COUNT,
BVH_STAT_DEPTH,
BVH_STAT_ONODE_COUNT,
BVH_STAT_UNALIGNED_INNER_ONODE_COUNT,
};
class BVHParams;

View File

@ -1031,6 +1031,9 @@ void device_cpu_info(vector<DeviceInfo>& devices)
if(system_cpu_support_sse2()) {
info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
}
if (system_cpu_support_avx2()) {
info.bvh_layout_mask |= BVH_LAYOUT_BVH8;
}
info.has_volume_decoupled = true;
info.has_osl = true;
info.has_half_images = true;
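An illustrative sketch (hypothetical helper, not part of this patch) of how a caller could pick the widest layout a device advertises; the Python enum earlier in this commit maps BVH2/BVH4/BVH8 to the bit values 1/2/4, so the mask can be tested directly:

static BVHLayout widest_supported_layout(int bvh_layout_mask)
{
	if(bvh_layout_mask & BVH_LAYOUT_BVH8) {
		return BVH_LAYOUT_BVH8;  /* AVX2 CPUs only, per the check above */
	}
	if(bvh_layout_mask & BVH_LAYOUT_BVH4) {
		return BVH_LAYOUT_BVH4;
	}
	return BVH_LAYOUT_BVH2;
}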

View File

@ -76,6 +76,12 @@ set(SRC_BVH_HEADERS
bvh/qbvh_traversal.h
bvh/qbvh_volume.h
bvh/qbvh_volume_all.h
bvh/obvh_nodes.h
bvh/obvh_shadow_all.h
bvh/obvh_local.h
bvh/obvh_traversal.h
bvh/obvh_volume.h
bvh/obvh_volume_all.h
)
set(SRC_HEADERS
@ -270,6 +276,8 @@ set(SRC_UTIL_HEADERS
../util/util_types_float3_impl.h
../util/util_types_float4.h
../util/util_types_float4_impl.h
../util/util_types_float8.h
../util/util_types_float8_impl.h
../util/util_types_int2.h
../util/util_types_int2_impl.h
../util/util_types_int3.h

View File

@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN
/* Common QBVH functions. */
#ifdef __QBVH__
# include "kernel/bvh/qbvh_nodes.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_nodes.h"
#endif
#endif
/* Regular BVH traversal */

View File

@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_local.h"
# ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_local.h"
# endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@ -254,6 +257,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
int max_hits)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
local_isect,
local_object,
lcg_state,
max_hits);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,

View File

@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_shadow_all.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_shadow_all.h"
#endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
uint *num_hits)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect_array,
visibility,
max_hits,
num_hits);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,

View File

@ -20,6 +20,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_traversal.h"
#endif
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_traversal.h"
#endif
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect,
visibility
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
, lcg_state,
difl,
extmax
# endif
);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,

View File

@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
#define BVH_STACK_SIZE 192
#define BVH_QSTACK_SIZE 384
#define BVH_OSTACK_SIZE 768
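/* Editorial note: the traversal stack scales with the branching factor.
 * Relative to the 2-wide stack (192 entries), the 4-wide stack doubles it
 * (192 * 2 = 384) and the 8-wide stack doubles it again (384 * 2 = 768),
 * presumably to leave headroom for the extra children a wider node can push
 * per visited inner node. */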
/* BVH intersection function variations */
#define BVH_INSTANCING 1

View File

@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_volume.h"
#endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect,
visibility);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,

View File

@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume_all.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_volume_all.h"
#endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@ -386,6 +389,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect_array,
max_hits,
visibility);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,

View File

@ -0,0 +1,409 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This is a template BVH traversal function for subsurface scattering, where
* various features can be enabled/disabled. This way we can compile optimized
* versions for each case without new features slowing things down.
*
* BVH_MOTION: motion blur rendering
*
*/
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT obvh_node_intersect
#else
# define NODE_INTERSECT obvh_aligned_node_intersect
#endif
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
uint *lcg_state,
int max_hits)
{
/* Traversal stack in thread-local memory. */
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
/* Traversal variables in registers. */
int stack_ptr = 0;
int node_addr = kernel_tex_fetch(__object_node, local_object);
/* Ray parameters in registers. */
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE;
float isect_t = ray->t;
local_isect->num_hits = 0;
const int object_flag = kernel_tex_fetch(__object_flag, local_object);
if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
isect_t = bvh_instance_motion_push(kg,
local_object,
ray,
&P,
&dir,
&idir,
isect_t,
&ob_itfm);
#else
isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
#endif
object = local_object;
}
#ifndef __KERNEL_SSE41__
if(!isfinite(P.x)) {
return false;
}
#endif
avxf tnear(0.0f), tfar(isect_t);
#if BVH_FEATURE(BVH_HAIR)
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
#endif
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
int near_x, near_y, near_z;
int far_x, far_y, far_z;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
/* Traversal loop. */
do {
do {
/* Traverse internal nodes. */
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
avxf dist;
int child_mask = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
&dist);
if(child_mask != 0) {
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
avxf cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
}
else
#endif
{
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
}
/* One child is hit, continue with that child. */
int r = __bscf(child_mask);
if(child_mask == 0) {
node_addr = __float_as_int(cnodes[r]);
continue;
}
/* Two children are hit, push far child, and continue with
* closer child.
*/
int c0 = __float_as_int(cnodes[r]);
float d0 = ((float*)&dist)[r];
r = __bscf(child_mask);
int c1 = __float_as_int(cnodes[r]);
float d1 = ((float*)&dist)[r];
if(child_mask == 0) {
if(d1 < d0) {
node_addr = c1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
continue;
}
else {
node_addr = c0;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
continue;
}
}
/* Here starts the slow path for 3 or more hit children. We push
* all nodes onto the stack to sort them there.
*/
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
/* Three children are hit, push all onto stack and sort 3
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c2 = __float_as_int(cnodes[r]);
float d2 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Four children are hit, push all onto stack and sort 4
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c3 = __float_as_int(cnodes[r]);
float d3 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
/* Five children are hit, push all onto stack and sort 5
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c4 = __float_as_int(cnodes[r]);
float d4 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Six children are hit, push all onto stack and sort 6
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c5 = __float_as_int(cnodes[r]);
float d5 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
/* Seven children are hit, push all onto stack and sort 7
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c6 = __float_as_int(cnodes[r]);
float d6 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Eight children are hit, push all onto stack and sort 8
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c7 = __float_as_int(cnodes[r]);
float d7 = ((float*)&dist)[r];
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c7;
traversal_stack[stack_ptr].dist = d7;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6],
&traversal_stack[stack_ptr - 7]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
/* If node is leaf, fetch triangle list. */
if(node_addr < 0) {
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
int prim_addr = __float_as_int(leaf.x);
int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
/* Primitive intersection. */
switch(type & PRIMITIVE_ALL) {
case PRIMITIVE_TRIANGLE: {
/* Intersect ray against primitive. */
for(; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if(triangle_intersect_local(kg,
local_isect,
P,
dir,
object,
local_object,
prim_addr,
isect_t,
lcg_state,
max_hits))
{
return true;
}
}
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
/* Intersect ray against primitive. */
for(; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if(motion_triangle_intersect_local(kg,
local_isect,
P,
dir,
ray->time,
object,
local_object,
prim_addr,
isect_t,
lcg_state,
max_hits))
{
return true;
}
}
break;
}
#endif
default:
break;
}
}
} while(node_addr != ENTRYPOINT_SENTINEL);
} while(node_addr != ENTRYPOINT_SENTINEL);
return false;
}
#undef NODE_INTERSECT

View File

@ -0,0 +1,532 @@
/*
* Copyright 2011-2014, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Aligned nodes intersection AVX code is adapted from Embree.
*/
struct OBVHStackItem {
int addr;
float dist;
};
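/* Note (editorial): addr follows the node_addr convention of the traversal
 * loops: non-negative values index inner nodes in __bvh_nodes, negative
 * values encode leaves and are fetched as __bvh_leaf_nodes[-addr - 1]. */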
ccl_device_inline void obvh_near_far_idx_calc(const float3& idir,
int *ccl_restrict near_x,
int *ccl_restrict near_y,
int *ccl_restrict near_z,
int *ccl_restrict far_x,
int *ccl_restrict far_y,
int *ccl_restrict far_z)
{
#ifdef __KERNEL_SSE__
*near_x = 0; *far_x = 1;
*near_y = 2; *far_y = 3;
*near_z = 4; *far_z = 5;
const size_t mask = movemask(ssef(idir.m128));
const int mask_x = mask & 1;
const int mask_y = (mask & 2) >> 1;
const int mask_z = (mask & 4) >> 2;
*near_x += mask_x; *far_x -= mask_x;
*near_y += mask_y; *far_y -= mask_y;
*near_z += mask_z; *far_z -= mask_z;
#else
if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; }
if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; }
if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; }
#endif
}
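A worked example of the index selection (editorial): with idir = (+0.5, -2.0, +1.0), only the Y component is negative, so the near/far offsets are swapped for Y while X and Z keep their defaults.

int near_x, near_y, near_z, far_x, far_y, far_z;
obvh_near_far_idx_calc(make_float3(0.5f, -2.0f, 1.0f),
                       &near_x, &near_y, &near_z,
                       &far_x, &far_y, &far_z);
/* near = {0, 3, 4}, far = {1, 2, 5}: for Y the "upper bound" row is read as
 * the near slab, which is what a ray travelling in -Y needs. */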
ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a,
OBVHStackItem *ccl_restrict b)
{
OBVHStackItem tmp = *a;
*a = *b;
*b = tmp;
}
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
OBVHStackItem *ccl_restrict s2,
OBVHStackItem *ccl_restrict s3)
{
if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
}
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
OBVHStackItem *ccl_restrict s2,
OBVHStackItem *ccl_restrict s3,
OBVHStackItem *ccl_restrict s4)
{
if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); }
if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); }
if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); }
if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
}
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
OBVHStackItem *ccl_restrict s2,
OBVHStackItem *ccl_restrict s3,
OBVHStackItem *ccl_restrict s4,
OBVHStackItem *ccl_restrict s5)
{
obvh_stack_sort(s1, s2, s3, s4);
if(s5->dist < s4->dist) {
obvh_item_swap(s4, s5);
if(s4->dist < s3->dist) {
obvh_item_swap(s3, s4);
if(s3->dist < s2->dist) {
obvh_item_swap(s2, s3);
if(s2->dist < s1->dist) {
obvh_item_swap(s1, s2);
}
}
}
}
}
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
OBVHStackItem *ccl_restrict s2,
OBVHStackItem *ccl_restrict s3,
OBVHStackItem *ccl_restrict s4,
OBVHStackItem *ccl_restrict s5,
OBVHStackItem *ccl_restrict s6)
{
obvh_stack_sort(s1, s2, s3, s4, s5);
if(s6->dist < s5->dist) {
obvh_item_swap(s5, s6);
if(s5->dist < s4->dist) {
obvh_item_swap(s4, s5);
if(s4->dist < s3->dist) {
obvh_item_swap(s3, s4);
if(s3->dist < s2->dist) {
obvh_item_swap(s2, s3);
if(s2->dist < s1->dist) {
obvh_item_swap(s1, s2);
}
}
}
}
}
}
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
OBVHStackItem *ccl_restrict s2,
OBVHStackItem *ccl_restrict s3,
OBVHStackItem *ccl_restrict s4,
OBVHStackItem *ccl_restrict s5,
OBVHStackItem *ccl_restrict s6,
OBVHStackItem *ccl_restrict s7)
{
obvh_stack_sort(s1, s2, s3, s4, s5, s6);
if(s7->dist < s6->dist) {
obvh_item_swap(s6, s7);
if(s6->dist < s5->dist) {
obvh_item_swap(s5, s6);
if(s5->dist < s4->dist) {
obvh_item_swap(s4, s5);
if(s4->dist < s3->dist) {
obvh_item_swap(s3, s4);
if(s3->dist < s2->dist) {
obvh_item_swap(s2, s3);
if(s2->dist < s1->dist) {
obvh_item_swap(s1, s2);
}
}
}
}
}
}
}
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
OBVHStackItem *ccl_restrict s2,
OBVHStackItem *ccl_restrict s3,
OBVHStackItem *ccl_restrict s4,
OBVHStackItem *ccl_restrict s5,
OBVHStackItem *ccl_restrict s6,
OBVHStackItem *ccl_restrict s7,
OBVHStackItem *ccl_restrict s8)
{
obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7);
if(s8->dist < s7->dist) {
obvh_item_swap(s7, s8);
if(s7->dist < s6->dist) {
obvh_item_swap(s6, s7);
if(s6->dist < s5->dist) {
obvh_item_swap(s5, s6);
if(s5->dist < s4->dist) {
obvh_item_swap(s4, s5);
if(s4->dist < s3->dist) {
obvh_item_swap(s3, s4);
if(s3->dist < s2->dist) {
obvh_item_swap(s2, s3);
if(s2->dist < s1->dist) {
obvh_item_swap(s1, s2);
}
}
}
}
}
}
}
}
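An editorial usage sketch of the three-way overload: the traversal loops below pass the top-of-stack entry as s1 and then pop it, so after sorting the nearest child is visited first.

OBVHStackItem s[3] = {{/*addr*/ 30, /*dist*/ 3.0f},
                      {20, 1.0f},
                      {10, 5.0f}};
obvh_stack_sort(&s[2], &s[1], &s[0]);
/* Now s[2] == {20, 1.0f}: the entry with the smallest distance sits where
 * the caller pops next; s[1] == {30, 3.0f} and s[0] == {10, 5.0f}. */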
/* Axis-aligned nodes intersection */
ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
const avxf& isect_near,
const avxf& isect_far,
#ifdef __KERNEL_AVX2__
const avx3f& org_idir,
#else
const avx3f& org,
#endif
const avx3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int node_addr,
avxf *ccl_restrict dist)
{
const int offset = node_addr + 2;
#ifdef __KERNEL_AVX2__
const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x);
const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y);
const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z);
const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x);
const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y);
const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z);
const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
const avxb vmask = tnear <= tfar;
int mask = (int)movemask(vmask);
*dist = tnear;
return mask;
#else
return 0;
#endif
}
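For reference, an editorial scalar equivalent of the slab test above for a single child (the function name is invented; bb_near/bb_far stand for the per-axis bound planes already selected via the near_*/far_* offsets). It spells out what the msub form computes: bound*idir - P*idir == (bound - P)*idir.

ccl_device_inline bool obvh_slab_test_one_child(float3 P, float3 idir,
                                                float3 bb_near, float3 bb_far,
                                                float t_near, float t_far)
{
	const float tnear = fmaxf(fmaxf((bb_near.x - P.x) * idir.x,
	                                (bb_near.y - P.y) * idir.y),
	                          fmaxf((bb_near.z - P.z) * idir.z, t_near));
	const float tfar = fminf(fminf((bb_far.x - P.x) * idir.x,
	                               (bb_far.y - P.y) * idir.y),
	                         fminf((bb_far.z - P.z) * idir.z, t_far));
	return tnear <= tfar;
}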
ccl_device_inline int obvh_aligned_node_intersect_robust(
KernelGlobals *ccl_restrict kg,
const avxf& isect_near,
const avxf& isect_far,
#ifdef __KERNEL_AVX2__
const avx3f& P_idir,
#else
const avx3f& P,
#endif
const avx3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int node_addr,
const float difl,
avxf *ccl_restrict dist)
{
const int offset = node_addr + 2;
#ifdef __KERNEL_AVX2__
const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x);
const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x);
const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y);
const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y);
const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z);
const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z);
const float round_down = 1.0f - difl;
const float round_up = 1.0f + difl;
const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
const avxb vmask = round_down*tnear <= round_up*tfar;
int mask = (int)movemask(vmask);
*dist = tnear;
return mask;
#else
return 0;
#endif
}
/* Unaligned nodes intersection */
ccl_device_inline int obvh_unaligned_node_intersect(
KernelGlobals *ccl_restrict kg,
const avxf& isect_near,
const avxf& isect_far,
#ifdef __KERNEL_AVX2__
const avx3f& org_idir,
#endif
const avx3f& org,
const avx3f& dir,
const avx3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int node_addr,
avxf *ccl_restrict dist)
{
const int offset = node_addr;
const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,
aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
const avxf neg_one(-1.0f);
const avxf nrdir_x = neg_one / aligned_dir_x,
nrdir_y = neg_one / aligned_dir_y,
nrdir_z = neg_one / aligned_dir_z;
const avxf tlower_x = aligned_P_x * nrdir_x,
tlower_y = aligned_P_y * nrdir_y,
tlower_z = aligned_P_z * nrdir_z;
const avxf tupper_x = tlower_x - nrdir_x,
tupper_y = tlower_y - nrdir_y,
tupper_z = tlower_z - nrdir_z;
const avxf tnear_x = min(tlower_x, tupper_x);
const avxf tnear_y = min(tlower_y, tupper_y);
const avxf tnear_z = min(tlower_z, tupper_z);
const avxf tfar_x = max(tlower_x, tupper_x);
const avxf tfar_y = max(tlower_y, tupper_y);
const avxf tfar_z = max(tlower_z, tupper_z);
const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
const avxb vmask = tnear <= tfar;
*dist = tnear;
return movemask(vmask);
}
ccl_device_inline int obvh_unaligned_node_intersect_robust(
KernelGlobals *ccl_restrict kg,
const avxf& isect_near,
const avxf& isect_far,
#ifdef __KERNEL_AVX2__
const avx3f& P_idir,
#endif
const avx3f& P,
const avx3f& dir,
const avx3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int node_addr,
const float difl,
avxf *ccl_restrict dist)
{
const int offset = node_addr;
const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
const avxf neg_one(-1.0f);
const avxf nrdir_x = neg_one / aligned_dir_x,
nrdir_y = neg_one / aligned_dir_y,
nrdir_z = neg_one / aligned_dir_z;
const avxf tlower_x = aligned_P_x * nrdir_x,
tlower_y = aligned_P_y * nrdir_y,
tlower_z = aligned_P_z * nrdir_z;
const avxf tupper_x = tlower_x - nrdir_x,
tupper_y = tlower_y - nrdir_y,
tupper_z = tlower_z - nrdir_z;
const float round_down = 1.0f - difl;
const float round_up = 1.0f + difl;
const avxf tnear_x = min(tlower_x, tupper_x);
const avxf tnear_y = min(tlower_y, tupper_y);
const avxf tnear_z = min(tlower_z, tupper_z);
const avxf tfar_x = max(tlower_x, tupper_x);
const avxf tfar_y = max(tlower_y, tupper_y);
const avxf tfar_z = max(tlower_z, tupper_z);
const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
const avxb vmask = round_down*tnear <= round_up*tfar;
*dist = tnear;
return movemask(vmask);
}
/* Intersector wrappers.
*
* They check the node type and call the appropriate intersection code.
*/
ccl_device_inline int obvh_node_intersect(
KernelGlobals *ccl_restrict kg,
const avxf& isect_near,
const avxf& isect_far,
#ifdef __KERNEL_AVX2__
const avx3f& org_idir,
#endif
const avx3f& org,
const avx3f& dir,
const avx3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int node_addr,
avxf *ccl_restrict dist)
{
const int offset = node_addr;
const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
return obvh_unaligned_node_intersect(kg,
isect_near,
isect_far,
#ifdef __KERNEL_AVX2__
org_idir,
#endif
org,
dir,
idir,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
dist);
}
else {
return obvh_aligned_node_intersect(kg,
isect_near,
isect_far,
#ifdef __KERNEL_AVX2__
org_idir,
#else
org,
#endif
idir,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
dist);
}
}
ccl_device_inline int obvh_node_intersect_robust(
KernelGlobals *ccl_restrict kg,
const avxf& isect_near,
const avxf& isect_far,
#ifdef __KERNEL_AVX2__
const avx3f& P_idir,
#endif
const avx3f& P,
const avx3f& dir,
const avx3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int node_addr,
const float difl,
avxf *ccl_restrict dist)
{
const int offset = node_addr;
const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
return obvh_unaligned_node_intersect_robust(kg,
isect_near,
isect_far,
#ifdef __KERNEL_AVX2__
P_idir,
#endif
P,
dir,
idir,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
difl,
dist);
}
else {
return obvh_aligned_node_intersect_robust(kg,
isect_near,
isect_far,
#ifdef __KERNEL_AVX2__
P_idir,
#else
P,
#endif
idir,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
difl,
dist);
}
}

View File

@ -0,0 +1,687 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
*
* BVH_INSTANCING: object instancing
* BVH_HAIR: hair curve rendering
* BVH_MOTION: motion blur rendering
*
*/
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT obvh_node_intersect
#else
# define NODE_INTERSECT obvh_aligned_node_intersect
#endif
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const int skip_object,
const uint max_hits,
uint *num_hits)
{
/* TODO(sergey):
* - Test if pushing distance on the stack helps.
* - Likely and unlikely for if() statements.
* - Test restrict attribute for pointers.
*/
/* Traversal stack in thread-local memory. */
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
/* Traversal variables in registers. */
int stack_ptr = 0;
int node_addr = kernel_data.bvh.root;
/* Ray parameters in registers. */
const float tmax = ray->t;
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE;
float isect_t = tmax;
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
#endif
*num_hits = 0;
isect_array->t = tmax;
#ifndef __KERNEL_SSE41__
if(!isfinite(P.x)) {
return false;
}
#endif
#if BVH_FEATURE(BVH_INSTANCING)
int num_hits_in_instance = 0;
#endif
avxf tnear(0.0f), tfar(isect_t);
#if BVH_FEATURE(BVH_HAIR)
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
#endif
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
int near_x, near_y, near_z;
int far_x, far_y, far_z;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
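/* With the near/far plane indices fixed per axis up front, the node test can
 * load the correct bounding plane for all eight children with indexed fetches
 * instead of branching on the ray direction sign inside the traversal loop. */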
/* Traversal loop. */
do {
do {
/* Traverse internal nodes. */
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
(void)inodes;
if(false
#ifdef __VISIBILITY_FLAG__
|| ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
#endif
#if BVH_FEATURE(BVH_MOTION)
|| UNLIKELY(ray->time < inodes.y)
|| UNLIKELY(ray->time > inodes.z)
#endif
) {
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
avxf dist;
int child_mask = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
&dist);
if(child_mask != 0) {
avxf cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
}
else
#endif
{
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
}
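/* child_mask has one bit set per intersected child; __bscf() returns the
 * index of the lowest set bit and clears it, so hit children are peeled
 * off one at a time below. */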
/* One child is hit, continue with that child. */
int r = __bscf(child_mask);
if(child_mask == 0) {
node_addr = __float_as_int(cnodes[r]);
continue;
}
/* Two children are hit, push far child, and continue with
* closer child.
*/
int c0 = __float_as_int(cnodes[r]);
float d0 = ((float*)&dist)[r];
r = __bscf(child_mask);
int c1 = __float_as_int(cnodes[r]);
float d1 = ((float*)&dist)[r];
if(child_mask == 0) {
if(d1 < d0) {
node_addr = c1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
continue;
}
else {
node_addr = c0;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
continue;
}
}
/* Here starts the slow path for 3 or more hit children. We push
* all nodes onto the stack to sort them there.
*/
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
/* Three children are hit, push all onto stack and sort 3
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c2 = __float_as_int(cnodes[r]);
float d2 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Four children are hit, push all onto stack and sort 4
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c3 = __float_as_int(cnodes[r]);
float d3 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
/* Five children are hit, push all onto stack and sort 5
* stack items, continue with closest child
*/
r = __bscf(child_mask);
int c4 = __float_as_int(cnodes[r]);
float d4 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Six children are hit, push all onto stack and sort 6
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c5 = __float_as_int(cnodes[r]);
float d5 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
/* Seven children are hit, push all onto stack and sort 7
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c6 = __float_as_int(cnodes[r]);
float d6 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Eight children are hit, push all onto stack and sort 8
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c7 = __float_as_int(cnodes[r]);
float d7 = ((float*)&dist)[r];
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c7;
traversal_stack[stack_ptr].dist = d7;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6],
&traversal_stack[stack_ptr - 7]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
/* If node is leaf, fetch triangle list. */
if(node_addr < 0) {
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
#ifdef __VISIBILITY_FLAG__
if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
#endif
int prim_addr = __float_as_int(leaf.x);
#if BVH_FEATURE(BVH_INSTANCING)
if(prim_addr >= 0) {
#endif
int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
const uint p_type = type & PRIMITIVE_ALL;
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
/* Primitive intersection. */
if(p_type == PRIMITIVE_TRIANGLE) {
int prim_count = prim_addr2 - prim_addr;
if(prim_count < 3) {
while(prim_addr < prim_addr2) {
kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
int hit = triangle_intersect(kg,
isect_array,
P,
dir,
PATH_RAY_SHADOW,
object,
prim_addr);
/* Shadow ray early termination. */
if(hit) {
/* detect if this surface has a shader with transparent shadows */
/* todo: optimize so primitive visibility flag indicates if
* the primitive has a transparent shadow shader? */
int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
int shader = 0;
#ifdef __HAIR__
if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
shader = kernel_tex_fetch(__tri_shader, prim);
}
#ifdef __HAIR__
else {
float4 str = kernel_tex_fetch(__curves, prim);
shader = __float_as_int(str.z);
}
#endif
int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
/* if no transparent shadows, all light is blocked */
if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
return true;
}
/* if maximum number of hits reached, block all light */
else if(*num_hits == max_hits) {
return true;
}
/* move on to next entry in intersections array */
isect_array++;
(*num_hits)++;
#if BVH_FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
#endif
isect_array->t = isect_t;
}
prim_addr++;
} // while
}
else {
kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type);
#if BVH_FEATURE(BVH_INSTANCING)
int* nhiptr = &num_hits_in_instance;
#else
int nhi = 0;
int *nhiptr = &nhi;
#endif
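/* Leaf holds three or more triangles: intersect up to eight of them at once
 * on the AVX path; a result of 2 signals that traversal can stop (light is
 * fully blocked or the maximum number of hits was reached). */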
int result = triangle_intersect8(kg,
&isect_array,
P,
dir,
PATH_RAY_SHADOW,
object,
prim_addr,
prim_count,
num_hits,
max_hits,
nhiptr,
isect_t);
if(result == 2) {
return true;
}
} // prim_count
} // PRIMITIVE_TRIANGLE
else {
while(prim_addr < prim_addr2) {
kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
#ifdef __SHADOW_TRICKS__
uint tri_object = (object == OBJECT_NONE)
? kernel_tex_fetch(__prim_object, prim_addr)
: object;
if(tri_object == skip_object) {
++prim_addr;
continue;
}
#endif
bool hit;
/* TODO: Specialized intersect functions which don't fill in
 * isect unless needed, and which check SD_HAS_TRANSPARENT_SHADOW,
 * might give a few percent performance improvement. */
switch(p_type) {
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
hit = motion_triangle_intersect(kg,
isect_array,
P,
dir,
ray->time,
PATH_RAY_SHADOW,
object,
prim_addr);
break;
}
#endif
#if BVH_FEATURE(BVH_HAIR)
case PRIMITIVE_CURVE:
case PRIMITIVE_MOTION_CURVE: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
hit = cardinal_curve_intersect(kg,
isect_array,
P,
dir,
PATH_RAY_SHADOW,
object,
prim_addr,
ray->time,
curve_type,
NULL,
0, 0);
}
else {
hit = curve_intersect(kg,
isect_array,
P,
dir,
PATH_RAY_SHADOW,
object,
prim_addr,
ray->time,
curve_type,
NULL,
0, 0);
}
break;
}
#endif
default: {
hit = false;
break;
}
}
/* Shadow ray early termination. */
if(hit) {
/* detect if this surface has a shader with transparent shadows */
/* todo: optimize so primitive visibility flag indicates if
* the primitive has a transparent shadow shader? */
int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
int shader = 0;
#ifdef __HAIR__
if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
shader = kernel_tex_fetch(__tri_shader, prim);
}
#ifdef __HAIR__
else {
float4 str = kernel_tex_fetch(__curves, prim);
shader = __float_as_int(str.z);
}
#endif
int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
/* if no transparent shadows, all light is blocked */
if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
return true;
}
/* if maximum number of hits reached, block all light */
else if(*num_hits == max_hits) {
return true;
}
/* move on to next entry in intersections array */
isect_array++;
(*num_hits)++;
#if BVH_FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
#endif
isect_array->t = isect_t;
}
prim_addr++;
} // while prim
}
}
#if BVH_FEATURE(BVH_INSTANCING)
else {
/* Instance push. */
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
# if BVH_FEATURE(BVH_MOTION)
isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
# else
isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
# endif
num_hits_in_instance = 0;
isect_array->t = isect_t;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect_t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
node_addr = kernel_tex_fetch(__object_node, object);
}
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
#if BVH_FEATURE(BVH_INSTANCING)
if(stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
if(num_hits_in_instance) {
float t_fac;
# if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
# else
bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
# endif
/* Scale isect->t to adjust for instancing. */
for(int i = 0; i < num_hits_in_instance; i++) {
(isect_array-i-1)->t *= t_fac;
}
}
else {
# if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
# else
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
# endif
}
isect_t = tmax;
isect_array->t = isect_t;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect_t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
return false;
}
#undef NODE_INTERSECT

View File

@@ -0,0 +1,642 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
*
* BVH_INSTANCING: object instancing
* BVH_HAIR: hair curve rendering
* BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
* BVH_MOTION: motion blur rendering
*
*/
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT obvh_node_intersect
# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust
#else
# define NODE_INTERSECT obvh_aligned_node_intersect
# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust
#endif
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility
#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
,uint *lcg_state,
float difl,
float extmax
#endif
)
{
/* Traversal stack in thread-local memory. */
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
traversal_stack[0].dist = -FLT_MAX;
/* Traversal variables in registers. */
int stack_ptr = 0;
int node_addr = kernel_data.bvh.root;
float node_dist = -FLT_MAX;
/* Ray parameters in registers. */
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE;
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
#endif
#ifndef __KERNEL_SSE41__
if(!isfinite(P.x)) {
return false;
}
#endif
isect->t = ray->t;
isect->u = 0.0f;
isect->v = 0.0f;
isect->prim = PRIM_NONE;
isect->object = OBJECT_NONE;
BVH_DEBUG_INIT();
avxf tnear(0.0f), tfar(ray->t);
#if BVH_FEATURE(BVH_HAIR)
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
#endif
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
int near_x, near_y, near_z;
int far_x, far_y, far_z;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
/* Traversal loop. */
do {
do {
/* Traverse internal nodes. */
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
(void)inodes;
if(UNLIKELY(node_dist > isect->t)
#if BVH_FEATURE(BVH_MOTION)
|| UNLIKELY(ray->time < inodes.y)
|| UNLIKELY(ray->time > inodes.z)
#endif
#ifdef __VISIBILITY_FLAG__
|| (__float_as_uint(inodes.x) & visibility) == 0
#endif
)
{
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
int child_mask;
avxf dist;
BVH_DEBUG_NEXT_NODE();
#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(difl != 0.0f) {
/* NOTE: We extend all the child bounding boxes instead of fetching
 * and checking visibility flags for each of them.
 *
 * Need to test if doing the opposite would be any faster.
*/
child_mask = NODE_INTERSECT_ROBUST(kg,
tnear,
tfar,
# ifdef __KERNEL_AVX2__
P_idir4,
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
# endif
# if BVH_FEATURE(BVH_HAIR)
dir4,
# endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
difl,
&dist);
}
else
#endif /* BVH_HAIR_MINIMUM_WIDTH */
{
child_mask = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
&dist);
}
if(child_mask != 0) {
avxf cnodes;
/* TODO(sergey): Investigate whether moving cnodes upwards
* gives a speedup (will be different cache pattern but will
 * avoid extra check here).
*/
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
}
else
#endif
{
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
}
/* One child is hit, continue with that child. */
int r = __bscf(child_mask);
float d0 = ((float*)&dist)[r];
if(child_mask == 0) {
node_addr = __float_as_int(cnodes[r]);
node_dist = d0;
continue;
}
/* Two children are hit, push far child, and continue with
* closer child.
*/
int c0 = __float_as_int(cnodes[r]);
r = __bscf(child_mask);
int c1 = __float_as_int(cnodes[r]);
float d1 = ((float*)&dist)[r];
if(child_mask == 0) {
if(d1 < d0) {
node_addr = c1;
node_dist = d1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
continue;
}
else {
node_addr = c0;
node_dist = d0;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
continue;
}
}
/* Here starts the slow path for 3 or more hit children. We push
* all nodes onto the stack to sort them there.
*/
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
/* Three children are hit, push all onto stack and sort 3
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c2 = __float_as_int(cnodes[r]);
float d2 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2]);
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
/* Four children are hit, push all onto stack and sort 4
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c3 = __float_as_int(cnodes[r]);
float d3 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3]);
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
/* Five children are hit, push all onto stack and sort 5
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c4 = __float_as_int(cnodes[r]);
float d4 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4]);
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
/* Six children are hit, push all onto stack and sort 6
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c5 = __float_as_int(cnodes[r]);
float d5 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5]);
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
/* Seven children are hit, push all onto stack and sort 7
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c6 = __float_as_int(cnodes[r]);
float d6 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6]);
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
/* Eight children are hit, push all onto stack and sort 8
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c7 = __float_as_int(cnodes[r]);
float d7 = ((float*)&dist)[r];
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c7;
traversal_stack[stack_ptr].dist = d7;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6],
&traversal_stack[stack_ptr - 7]);
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
}
/* If node is leaf, fetch triangle list. */
if(node_addr < 0) {
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
#ifdef __VISIBILITY_FLAG__
if(UNLIKELY((node_dist > isect->t) ||
((__float_as_uint(leaf.z) & visibility) == 0)))
#else
if(UNLIKELY((node_dist > isect->t)))
#endif
{
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
continue;
}
int prim_addr = __float_as_int(leaf.x);
#if BVH_FEATURE(BVH_INSTANCING)
if(prim_addr >= 0) {
#endif
int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
/* Primitive intersection. */
switch(type & PRIMITIVE_ALL) {
case PRIMITIVE_TRIANGLE: {
int prim_count = prim_addr2 - prim_addr;
if(prim_count < 3) {
for(; prim_addr < prim_addr2; prim_addr++) {
BVH_DEBUG_NEXT_INTERSECTION();
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if(triangle_intersect(kg,
isect,
P,
dir,
visibility,
object,
prim_addr))
{
tfar = avxf(isect->t);
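/* Shrink the SIMD far distance so later node tests cull anything
 * beyond the closest hit found so far. */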
/* Shadow ray early termination. */
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
return true;
}
}
} // for
}
else {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if(triangle_intersect8(kg,
&isect,
P,
dir,
visibility,
object,
prim_addr,
prim_count,
0,
0,
NULL,
0.0f))
{
tfar = avxf(isect->t);
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
return true;
}
}
} // prim count
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
for(; prim_addr < prim_addr2; prim_addr++) {
BVH_DEBUG_NEXT_INTERSECTION();
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if(motion_triangle_intersect(kg,
isect,
P,
dir,
ray->time,
visibility,
object,
prim_addr))
{
tfar = avxf(isect->t);
/* Shadow ray early termination. */
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
return true;
}
}
}
break;
}
#endif /* BVH_FEATURE(BVH_MOTION) */
#if BVH_FEATURE(BVH_HAIR)
case PRIMITIVE_CURVE:
case PRIMITIVE_MOTION_CURVE: {
for(; prim_addr < prim_addr2; prim_addr++) {
BVH_DEBUG_NEXT_INTERSECTION();
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
bool hit;
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
hit = cardinal_curve_intersect(kg,
isect,
P,
dir,
visibility,
object,
prim_addr,
ray->time,
curve_type,
lcg_state,
difl,
extmax);
}
else {
hit = curve_intersect(kg,
isect,
P,
dir,
visibility,
object,
prim_addr,
ray->time,
curve_type,
lcg_state,
difl,
extmax);
}
if(hit) {
tfar = avxf(isect->t);
/* Shadow ray early termination. */
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
return true;
}
}
}
break;
}
#endif /* BVH_FEATURE(BVH_HAIR) */
}
}
#if BVH_FEATURE(BVH_INSTANCING)
else {
/* Instance push. */
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
# if BVH_FEATURE(BVH_MOTION)
qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
# else
qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
# endif
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
traversal_stack[stack_ptr].dist = -FLT_MAX;
node_addr = kernel_tex_fetch(__object_node, object);
BVH_DEBUG_NEXT_INSTANCE();
}
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
#if BVH_FEATURE(BVH_INSTANCING)
if(stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
# if BVH_FEATURE(BVH_MOTION)
isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
# else
isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
# endif
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr].addr;
node_dist = traversal_stack[stack_ptr].dist;
--stack_ptr;
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
return (isect->prim != PRIM_NONE);
}
#undef NODE_INTERSECT
#undef NODE_INTERSECT_ROBUST

View File

@@ -0,0 +1,483 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This is a template BVH traversal function for volumes, where
* various features can be enabled/disabled. This way we can compile optimized
* versions for each case without new features slowing things down.
*
* BVH_INSTANCING: object instancing
* BVH_MOTION: motion blur rendering
*
*/
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT obvh_node_intersect
#else
# define NODE_INTERSECT obvh_aligned_node_intersect
#endif
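/* Finds the closest intersection with a volume-enabled object; primitives and
 * instances whose objects lack SD_OBJECT_HAS_VOLUME are skipped. */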
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
{
/* Traversal stack in thread-local memory. */
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
/* Traversal variables in registers. */
int stack_ptr = 0;
int node_addr = kernel_data.bvh.root;
/* Ray parameters in registers. */
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE;
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
#endif
#ifndef __KERNEL_SSE41__
if(!isfinite(P.x)) {
return false;
}
#endif
isect->t = ray->t;
isect->u = 0.0f;
isect->v = 0.0f;
isect->prim = PRIM_NONE;
isect->object = OBJECT_NONE;
avxf tnear(0.0f), tfar(ray->t);
#if BVH_FEATURE(BVH_HAIR)
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
#endif
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
int near_x, near_y, near_z;
int far_x, far_y, far_z;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
/* Traversal loop. */
do {
do {
/* Traverse internal nodes. */
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
#ifdef __VISIBILITY_FLAG__
if((__float_as_uint(inodes.x) & visibility) == 0) {
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
#endif
avxf dist;
int child_mask = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
&dist);
if(child_mask != 0) {
avxf cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
}
else
#endif
{
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
}
/* One child is hit, continue with that child. */
int r = __bscf(child_mask);
if(child_mask == 0) {
node_addr = __float_as_int(cnodes[r]);
continue;
}
/* Two children are hit, push far child, and continue with
* closer child.
*/
int c0 = __float_as_int(cnodes[r]);
float d0 = ((float*)&dist)[r];
r = __bscf(child_mask);
int c1 = __float_as_int(cnodes[r]);
float d1 = ((float*)&dist)[r];
if(child_mask == 0) {
if(d1 < d0) {
node_addr = c1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
continue;
}
else {
node_addr = c0;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
continue;
}
}
/* Here starts the slow path for 3 or more hit children. We push
* all nodes onto the stack to sort them there.
*/
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
/* Three children are hit, push all onto stack and sort 3
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c2 = __float_as_int(cnodes[r]);
float d2 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Four children are hit, push all onto stack and sort 4
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c3 = __float_as_int(cnodes[r]);
float d3 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
/* Five children are hit, push all onto stack and sort 5
* stack items, continue with closest child
*/
r = __bscf(child_mask);
int c4 = __float_as_int(cnodes[r]);
float d4 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Six children are hit, push all onto stack and sort 6
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c5 = __float_as_int(cnodes[r]);
float d5 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
/* Seven children are hit, push all onto stack and sort 7
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c6 = __float_as_int(cnodes[r]);
float d6 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Eight children are hit, push all onto stack and sort 8
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c7 = __float_as_int(cnodes[r]);
float d7 = ((float*)&dist)[r];
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c7;
traversal_stack[stack_ptr].dist = d7;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6],
&traversal_stack[stack_ptr - 7]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
/* If node is leaf, fetch triangle list. */
if(node_addr < 0) {
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
if((__float_as_uint(leaf.z) & visibility) == 0) {
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
int prim_addr = __float_as_int(leaf.x);
#if BVH_FEATURE(BVH_INSTANCING)
if(prim_addr >= 0) {
#endif
int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
const uint p_type = type & PRIMITIVE_ALL;
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
/* Primitive intersection. */
switch(p_type) {
case PRIMITIVE_TRIANGLE: {
for(; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
/* Only primitives from volume object. */
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
/* Intersect ray against primitive. */
triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
}
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
for(; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
/* Only primitives from volume object. */
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
/* Intersect ray against primitive. */
motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr);
}
break;
}
#endif
}
}
#if BVH_FEATURE(BVH_INSTANCING)
else {
/* Instance push. */
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
int object_flag = kernel_tex_fetch(__object_flag, object);
if(object_flag & SD_OBJECT_HAS_VOLUME) {
# if BVH_FEATURE(BVH_MOTION)
isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
# else
isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
# endif
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
node_addr = kernel_tex_fetch(__object_node, object);
}
else {
/* Pop. */
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
}
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
#if BVH_FEATURE(BVH_INSTANCING)
if(stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
# if BVH_FEATURE(BVH_MOTION)
isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
# else
isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
# endif
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
return (isect->prim != PRIM_NONE);
}
#undef NODE_INTERSECT

View File

@@ -0,0 +1,554 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This is a template BVH traversal function for volumes, where
* various features can be enabled/disabled. This way we can compile optimized
* versions for each case without new features slowing things down.
*
* BVH_INSTANCING: object instancing
* BVH_MOTION: motion blur rendering
*
*/
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT obvh_node_intersect
#else
# define NODE_INTERSECT obvh_aligned_node_intersect
#endif
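/* Records every volume intersection along the ray (up to max_hits) into
 * isect_array and returns the number of hits found. */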
ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
const uint visibility)
{
/* Traversal stack in thread-local memory. */
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
/* Traversal variables in registers. */
int stack_ptr = 0;
int node_addr = kernel_data.bvh.root;
/* Ray parameters in registers. */
const float tmax = ray->t;
float3 P = ray->P;
float3 dir = bvh_clamp_direction(ray->D);
float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE;
float isect_t = tmax;
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
#endif
uint num_hits = 0;
isect_array->t = tmax;
#ifndef __KERNEL_SSE41__
if(!isfinite(P.x)) {
return 0;
}
#endif
#if BVH_FEATURE(BVH_INSTANCING)
int num_hits_in_instance = 0;
#endif
avxf tnear(0.0f), tfar(isect_t);
#if BVH_FEATURE(BVH_HAIR)
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
#endif
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
int near_x, near_y, near_z;
int far_x, far_y, far_z;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
/* Traversal loop. */
do {
do {
/* Traverse internal nodes. */
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
#ifdef __VISIBILITY_FLAG__
if((__float_as_uint(inodes.x) & visibility) == 0) {
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
#endif
avxf dist;
int child_mask = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
node_addr,
&dist);
if(child_mask != 0) {
avxf cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
}
else
#endif
{
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
}
/* One child is hit, continue with that child. */
int r = __bscf(child_mask);
if(child_mask == 0) {
node_addr = __float_as_int(cnodes[r]);
continue;
}
/* Two children are hit, push far child, and continue with
* closer child.
*/
int c0 = __float_as_int(cnodes[r]);
float d0 = ((float*)&dist)[r];
r = __bscf(child_mask);
int c1 = __float_as_int(cnodes[r]);
float d1 = ((float*)&dist)[r];
if(child_mask == 0) {
if(d1 < d0) {
node_addr = c1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
continue;
}
else {
node_addr = c0;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
continue;
}
}
/* Here starts the slow path for 3 or more hit children. We push
* all nodes onto the stack to sort them there.
*/
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c1;
traversal_stack[stack_ptr].dist = d1;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c0;
traversal_stack[stack_ptr].dist = d0;
/* Three children are hit, push all onto stack and sort 3
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c2 = __float_as_int(cnodes[r]);
float d2 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Four children are hit, push all onto stack and sort 4
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c3 = __float_as_int(cnodes[r]);
float d3 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c3;
traversal_stack[stack_ptr].dist = d3;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c2;
traversal_stack[stack_ptr].dist = d2;
/* Five children are hit, push all onto stack and sort 5
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c4 = __float_as_int(cnodes[r]);
float d4 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Six children are hit, push all onto stack and sort 6
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c5 = __float_as_int(cnodes[r]);
float d5 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c5;
traversal_stack[stack_ptr].dist = d5;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c4;
traversal_stack[stack_ptr].dist = d4;
/* Seven children are hit, push all onto stack and sort 7
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c6 = __float_as_int(cnodes[r]);
float d6 = ((float*)&dist)[r];
if(child_mask == 0) {
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
/* Eight children are hit, push all onto stack and sort 8
* stack items, continue with closest child.
*/
r = __bscf(child_mask);
int c7 = __float_as_int(cnodes[r]);
float d7 = ((float*)&dist)[r];
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c7;
traversal_stack[stack_ptr].dist = d7;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = c6;
traversal_stack[stack_ptr].dist = d6;
obvh_stack_sort(&traversal_stack[stack_ptr],
&traversal_stack[stack_ptr - 1],
&traversal_stack[stack_ptr - 2],
&traversal_stack[stack_ptr - 3],
&traversal_stack[stack_ptr - 4],
&traversal_stack[stack_ptr - 5],
&traversal_stack[stack_ptr - 6],
&traversal_stack[stack_ptr - 7]);
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
/* If node is leaf, fetch triangle list. */
if(node_addr < 0) {
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
if((__float_as_uint(leaf.z) & visibility) == 0) {
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
continue;
}
int prim_addr = __float_as_int(leaf.x);
#if BVH_FEATURE(BVH_INSTANCING)
if(prim_addr >= 0) {
#endif
int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
const uint p_type = type & PRIMITIVE_ALL;
bool hit;
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
/* Primitive intersection. */
switch(p_type) {
case PRIMITIVE_TRIANGLE: {
for(; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
/* Only primitives from volume object. */
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
/* Intersect ray against primitive. */
hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
if(hit) {
/* Move on to next entry in intersections array. */
isect_array++;
num_hits++;
#if BVH_FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
#endif
isect_array->t = isect_t;
if(num_hits == max_hits) {
#if BVH_FEATURE(BVH_INSTANCING)
# if BVH_FEATURE(BVH_MOTION)
float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
# else
Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
float t_fac = 1.0f / len(transform_direction(&itfm, dir));
# endif
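/* Rescale the hit distances recorded inside this instance so they match
 * the caller's ray parameterization before returning early. */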
for(int i = 0; i < num_hits_in_instance; i++) {
(isect_array-i-1)->t *= t_fac;
}
#endif /* BVH_FEATURE(BVH_INSTANCING) */
return num_hits;
}
}
}
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
for(; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
/* Only primitives from volume object. */
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
/* Intersect ray against primitive. */
hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
if(hit) {
/* Move on to next entry in intersections array. */
isect_array++;
num_hits++;
# if BVH_FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
# endif
isect_array->t = isect_t;
if(num_hits == max_hits) {
# if BVH_FEATURE(BVH_INSTANCING)
# if BVH_FEATURE(BVH_MOTION)
float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
# else
Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
float t_fac = 1.0f / len(transform_direction(&itfm, dir));
# endif
for(int i = 0; i < num_hits_in_instance; i++) {
(isect_array-i-1)->t *= t_fac;
}
# endif /* BVH_FEATURE(BVH_INSTANCING) */
return num_hits;
}
}
}
break;
}
#endif
}
}
#if BVH_FEATURE(BVH_INSTANCING)
else {
/* Instance push. */
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
int object_flag = kernel_tex_fetch(__object_flag, object);
if(object_flag & SD_OBJECT_HAS_VOLUME) {
# if BVH_FEATURE(BVH_MOTION)
isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
# else
isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
# endif
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect_t);
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
num_hits_in_instance = 0;
isect_array->t = isect_t;
++stack_ptr;
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
node_addr = kernel_tex_fetch(__object_node, object);
}
else {
/* Pop. */
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
}
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
#if BVH_FEATURE(BVH_INSTANCING)
if(stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
if(num_hits_in_instance) {
float t_fac;
# if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
# else
bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
# endif
/* Scale isect->t to adjust for instancing. */
for(int i = 0; i < num_hits_in_instance; i++) {
(isect_array-i-1)->t *= t_fac;
}
}
else {
# if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
# else
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
# endif
}
isect_t = tmax;
isect_array->t = isect_t;
obvh_near_far_idx_calc(idir,
&near_x, &near_y, &near_z,
&far_x, &far_y, &far_z);
tfar = avxf(isect_t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
}
#endif /* FEATURE(BVH_INSTANCING) */
} while(node_addr != ENTRYPOINT_SENTINEL);
return num_hits;
}
#undef NODE_INTERSECT
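/* For reference, a minimal scalar sketch of what the t_fac loops above do: hits that
 * were recorded while traversing inside an instance are stored in instance space, so
 * once max_hits is reached (or the instance is popped) their distances are rescaled
 * back to world space. The helper name and signature are illustrative only. */
ccl_device_inline void rescale_instance_hits(Intersection *isect_array,
                                             int num_hits_in_instance,
                                             float t_fac)
{
  /* isect_array points one past the last recorded hit, as in the traversal above. */
  for(int i = 0; i < num_hits_in_instance; i++) {
    (isect_array - i - 1)->t *= t_fac;
  }
}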

View File

@ -85,7 +85,8 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
/* Axis-aligned nodes intersection */
ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
//ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
const ssef& isect_near,
const ssef& isect_far,
#ifdef __KERNEL_AVX2__

View File

@ -1,4 +1,4 @@
/*
/*
* Copyright 2014, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -70,7 +70,479 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
return false;
}
/* Special ray intersection routines for local intersection. In that case we
#define cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D))
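/* cross256() above evaluates A*B - C*D with a single fused multiply-subtract per lane;
 * three such calls yield the three components of a cross product. A scalar sketch of
 * one call, with an illustrative helper name: */
ccl_device_inline float cross256_component_sketch(float a, float b, float c, float d)
{
  return a * b - c * d;  /* e.g. cross(v, e).x = v.y*e.z - v.z*e.y */
}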
#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
ccl_device_inline
#else
ccl_device_forceinline
#endif
int ray_triangle_intersect8(KernelGlobals *kg,
float3 ray_P,
float3 ray_dir,
Intersection **isect,
uint visibility,
int object,
__m256 *triA,
__m256 *triB,
__m256 *triC,
int prim_addr,
int prim_num,
uint *num_hits,
uint max_hits,
int *num_hits_in_instance,
float isec_t)
{
const unsigned char prim_num_mask = (1 << prim_num) - 1;
const __m256i zero256 = _mm256_setzero_si256();
const __m256 Px256 = _mm256_set1_ps(ray_P.x);
const __m256 Py256 = _mm256_set1_ps(ray_P.y);
const __m256 Pz256 = _mm256_set1_ps(ray_P.z);
const __m256 dirx256 = _mm256_set1_ps(ray_dir.x);
const __m256 diry256 = _mm256_set1_ps(ray_dir.y);
const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
/* Calculate vertices relative to ray origin. */
/* const float3 v0 = tri_c - P;
const float3 v1 = tri_a - P;
const float3 v2 = tri_b - P; */
__m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
__m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
__m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
__m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256);
__m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256);
__m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256);
__m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256);
__m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256);
__m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256);
__m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256);
__m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256);
__m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256);
__m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256);
__m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256);
__m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256);
__m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256);
__m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
__m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
/* Calculate triangle edges.
const float3 e0 = v2 - v0;
const float3 e1 = v0 - v1;
const float3 e2 = v1 - v2;*/
__m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
__m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
__m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
__m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256);
__m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256);
__m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256);
__m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256);
__m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
__m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
/* Perform edge tests.
const float U = dot(cross(v2 + v0, e0), ray_dir);
const float V = dot(cross(v0 + v1, e1), ray_dir);
const float W = dot(cross(v1 + v2, e2), ray_dir);*/
//cross(A,B) = (AyBz - AzBy, AzBx - AxBz, AxBy - AyBx)
__m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
__m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
__m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
//vertical dot
__m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256));
U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256));
__m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
__m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
__m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
//vertical dot
__m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256));
V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256));
__m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
__m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
__m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
//vertical dot
__m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256));
W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256));
//const float minUVW = min(U, min(V, W));
//const float maxUVW = max(U, max(V, W));
#if 0
__m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256));
__m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256));
//if(minUVW < 0.0f && maxUVW > 0.0f)
__m256i mask_minmaxUVW_256 = _mm256_and_si256(
_mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)),
//_mm256_castps_si256(minUVW_256),
_mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256));
#else
__m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
__m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
__m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
__m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1);
const __m256i one256 = _mm256_set1_epi32(1);
const __m256i two256 = _mm256_set1_epi32(2);
__m256i mask_minmaxUVW_256 = _mm256_or_si256(
_mm256_cmpeq_epi32(one256, UVW_256_1),
_mm256_cmpeq_epi32(two256, UVW_256_1) );
#endif
unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set
return false;
}
/* Calculate geometry normal and denominator. */
// const float3 Ng1 = cross(e1, e0);
//const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
__m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
__m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
__m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
//const float3 Ng = Ng1 + Ng1;
Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
//const float den = dot3(Ng, dir);
//vertical dot
__m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256));
den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256));
// __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
/* Perform depth test. */
//const float T = dot3(v0, Ng);
__m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256));
T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256));
//const int sign_den = (__float_as_int(den) & 0x80000000);
const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
__m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
//const float sign_T = xor_signmask(T, sign_den);
__m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
/* if((sign_T < 0.0f) || mask_minmaxUVW_pos) { return false; } */
unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
return false;
} /**/
__m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
ccl_align(32) unsigned int mask_minmaxUVW8[8];
if(visibility == PATH_RAY_SHADOW_OPAQUE){
__m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256
__m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
__m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
__m256 rayt_256 = _mm256_set1_ps((*isect)->t);
__m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
_mm256_castps_si256(
_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
)
);
/* __m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256,
_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256),
_CMP_GT_OS
) );*/
mask0 = _mm256_or_si256(mask1, mask0);
//unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0));
//unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256));
//unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos));
mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
if((mask_final & prim_num_mask) == 0) { //all bits NOT set
return false;
} /**/
unsigned long i = 0;
#if defined(_MSC_VER)
unsigned char res = _BitScanForward(&i, (unsigned long)mask_final);
#else
i = __builtin_ffs(mask_final)-1;
#endif
den_256 = _mm256_rcp_ps(den_256); //inv_den
U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den
V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den
T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den
_mm256_store_ps(U8, U_256);
_mm256_store_ps(V8, V_256);
_mm256_store_ps(T8, T_256);
//here we assume (kernel_tex_fetch(__prim_visibility, (prim_addr +i)) & visibility) is always true
(*isect)->u = U8[i];
(*isect)->v = V8[i];
(*isect)->t = T8[i];
(*isect)->prim = (prim_addr + i);
(*isect)->object = object;
(*isect)->type = PRIMITIVE_TRIANGLE;
return true;
}
else {
_mm256_store_ps(den8, den_256);
_mm256_store_ps(U8, U_256);
_mm256_store_ps(V8, V_256);
_mm256_store_ps(T8, T_256);
_mm256_store_ps(sign_T8, sign_T_256);
_mm256_store_ps(xor_signmask8, xor_signmask_256);
_mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
int ret = false;
if(visibility == PATH_RAY_SHADOW) {
for(int i = 0; i < prim_num; i++) {
if(!mask_minmaxUVW8[i]) {
#ifdef __VISIBILITY_FLAG__
if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
#endif
{
if((sign_T8[i] >= 0.0f) &&
(sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
{
if(den8[i]) {
const float inv_den = 1.0f / den8[i];
(*isect)->u = U8[i] * inv_den;
(*isect)->v = V8[i] * inv_den;
(*isect)->t = T8[i] * inv_den;
(*isect)->prim = (prim_addr + i);
(*isect)->object = object;
(*isect)->type = PRIMITIVE_TRIANGLE;
int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
int shader = 0;
#ifdef __HAIR__
if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
shader = kernel_tex_fetch(__tri_shader, prim);
}
#ifdef __HAIR__
else {
float4 str = kernel_tex_fetch(__curves, prim);
shader = __float_as_int(str.z);
}
#endif
int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
/* if no transparent shadows, all light is blocked */
if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
return 2;
}
/* if maximum number of hits reached, block all light */
else if(*num_hits == max_hits) {
return 2;
}
/* move on to next entry in intersections array */
ret = true;
(*isect)++;
(*num_hits)++;
(*num_hits_in_instance)++;
(*isect)->t = isec_t;
} //den
} //if sign
} //vis
}//if mask
} //for
}
else { //default case
for(int i = 0; i < prim_num; i++) {
if(!mask_minmaxUVW8[i]) {
#ifdef __VISIBILITY_FLAG__
if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
#endif
{
if((sign_T8[i] >= 0.0f) &&
(sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
{
if(den8[i]) {
const float inv_den = 1.0f / den8[i];
(*isect)->u = U8[i] * inv_den;
(*isect)->v = V8[i] * inv_den;
(*isect)->t = T8[i] * inv_den;
(*isect)->prim = (prim_addr + i);
(*isect)->object = object;
(*isect)->type = PRIMITIVE_TRIANGLE;
ret = true;
} //den
} //if sign
} //vis
}//if mask
} //for
} //default
return ret;
}// else PATH_RAY_SHADOW_OPAQUE
}
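/* A scalar reference for what each of the eight lanes above computes, with the same
 * edge-sign and depth tests and the division deferred to the very end. The helper name
 * and the simplified sign handling are illustrative, not the kernel's exact code. */
ccl_device_inline bool ray_triangle_intersect_scalar_sketch(float3 P, float3 dir, float tmax,
                                                            float3 tri_a, float3 tri_b, float3 tri_c,
                                                            float *u, float *v, float *t)
{
  /* Vertices relative to the ray origin, same v0/v1/v2 assignment as above. */
  const float3 v0 = tri_c - P;
  const float3 v1 = tri_a - P;
  const float3 v2 = tri_b - P;
  /* Triangle edges. */
  const float3 e0 = v2 - v0;
  const float3 e1 = v0 - v1;
  const float3 e2 = v1 - v2;
  /* Edge tests: U, V and W may not have mixed signs. */
  const float U = dot(cross(v2 + v0, e0), dir);
  const float V = dot(cross(v0 + v1, e1), dir);
  const float W = dot(cross(v1 + v2, e2), dir);
  if(min(U, min(V, W)) < 0.0f && max(U, max(V, W)) > 0.0f) {
    return false;
  }
  /* Geometric normal, denominator and depth test. */
  const float3 Ng = 2.0f * cross(e1, e0);
  const float den = dot(Ng, dir);
  const float T = dot(v0, Ng);
  if(den == 0.0f || T * den < 0.0f || fabsf(T) > tmax * fabsf(den)) {
    return false;
  }
  /* Only now pay for the division. */
  const float inv_den = 1.0f / den;
  *u = U * inv_den;
  *v = V * inv_den;
  *t = T * inv_den;
  return true;
}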
//vz static
ccl_device_inline
int triangle_intersect8(KernelGlobals *kg,
Intersection **isect,
float3 P,
float3 dir,
uint visibility,
int object,
int prim_addr,
int prim_num,
uint *num_hits,
uint max_hits,
int *num_hits_in_instance,
float isec_t)
{
__m128 tri_a[8], tri_b[8], tri_c[8];
__m256 tritmp[12], tri[12];
__m256 triA[3], triB[3], triC[3];
int i, r;
uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
for(i = 0; i < prim_num; i++) {
tri_a[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
tri_b[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
tri_c[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
}
//create 9 or 12 placeholders
tri[0] = _mm256_castps128_ps256(tri_a[0]); //_mm256_zextps128_ps256
tri[1] = _mm256_castps128_ps256(tri_b[0]);//_mm256_zextps128_ps256
tri[2] = _mm256_castps128_ps256(tri_c[0]);//_mm256_zextps128_ps256
tri[3] = _mm256_castps128_ps256(tri_a[1]); //_mm256_zextps128_ps256
tri[4] = _mm256_castps128_ps256(tri_b[1]);//_mm256_zextps128_ps256
tri[5] = _mm256_castps128_ps256(tri_c[1]);//_mm256_zextps128_ps256
tri[6] = _mm256_castps128_ps256(tri_a[2]); //_mm256_zextps128_ps256
tri[7] = _mm256_castps128_ps256(tri_b[2]);//_mm256_zextps128_ps256
tri[8] = _mm256_castps128_ps256(tri_c[2]);//_mm256_zextps128_ps256
if(prim_num > 3) {
tri[9] = _mm256_castps128_ps256(tri_a[3]); //_mm256_zextps128_ps256
tri[10] = _mm256_castps128_ps256(tri_b[3]);//_mm256_zextps128_ps256
tri[11] = _mm256_castps128_ps256(tri_c[3]);//_mm256_zextps128_ps256
}
for(i = 4, r = 0; i < prim_num; i++, r += 3) {
tri[r] = _mm256_insertf128_ps(tri[r] , tri_a[i], 1);
tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1);
tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1);
}
//------------------------------------------------
//0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1
//1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1
//2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1
//3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1
//4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1
//5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1
//6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1
//7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1
//8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1
//9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1
//10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1
//11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1
//"transpose"
tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); //0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5
tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); //1! Za0 Za1 1 1 Za4 Za5 1 1
tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); //2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7
tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); //3! Za2 Za3 1 1 Za6 Za7 1 1
tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); //4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5
tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); //5! Zb0 Zb1 1 1 Zb4 Zb5 1 1
tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); //6! Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7
tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); //7! Zb2 Zb3 1 1 Zb6 Zb7 1 1
tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); //8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5
tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); //9! Zc0 Zc1 1 1 Zc4 Zc5 1 1
tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); //10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7
tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); //11! Zc2 Zc3 1 1 Zc6 Zc7 1 1
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
triA[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
triA[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
triA[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]), _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
triB[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb5 Xb7
triB[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb5 Yb7
triB[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]), _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb5 Zb7
triC[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
triC[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
triC[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]), _mm256_castps_pd(tritmp[11]))); //Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
int result = ray_triangle_intersect8(kg, P,
dir,
isect,
visibility, object,
triA,
triB,
triC,
prim_addr,
prim_num,
num_hits,
max_hits,
num_hits_in_instance,
isec_t);
return result;
}
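/* The gather/transpose above converts up to eight triangles from array-of-structures
 * vertex storage into structure-of-arrays registers (triA/triB/triC hold all x, then
 * all y, then all z). A plain scalar sketch of that layout; names are illustrative: */
ccl_device_inline void gather_tri_soa_sketch(const float4 *verts,  /* three float4 per triangle */
                                             int prim_num,
                                             float ax[8], float ay[8], float az[8])
{
  /* After the transpose, triA[0][k], triA[1][k], triA[2][k] are the x/y/z of vertex a
   * of triangle k; lanes k >= prim_num are left undefined, as in the code above. */
  for(int k = 0; k < prim_num; k++) {
    ax[k] = verts[3 * k + 0].x;
    ay[k] = verts[3 * k + 0].y;
    az[k] = verts[3 * k + 0].z;
  }
}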
/* Special ray intersection routines for subsurface scattering. In that case we
* only want to intersect with primitives in the same object, and in case of
* multiple hits we pick a single random primitive as the intersection point.
* Returns whether traversal should be stopped.
@ -83,7 +555,7 @@ ccl_device_inline bool triangle_intersect_local(
float3 P,
float3 dir,
int object,
int local_object,
int local_object,
int prim_addr,
float tmax,
uint *lcg_state,

View File

@ -71,15 +71,13 @@ CCL_NAMESPACE_BEGIN
/* Texture types to be compatible with CUDA textures. These are really just
 * simple arrays and after inlining the fetch hopefully reverts to being a
 * simple pointer lookup. */
template<typename T> struct texture {
ccl_always_inline const T& fetch(int index)
{
kernel_assert(index >= 0 && index < width);
return data[index];
}
#ifdef __KERNEL_AVX__
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* Reads 256 bits but indexes in blocks of 128 bits to maintain
 * compatibility with existing indices and data structures.
*/
@ -90,7 +88,6 @@ template<typename T> struct texture {
ssef *ssef_node_data = &ssef_data[index];
return _mm256_loadu_ps((float *)ssef_node_data);
}
#endif
#ifdef __KERNEL_SSE2__
@ -148,6 +145,10 @@ ccl_device_inline void print_sse3i(const char *label, sse3i& a)
print_ssei(label, a.z);
}
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
typedef vector3<avxf> avx3f;
#endif
#endif
CCL_NAMESPACE_END

View File

@ -1384,8 +1384,9 @@ typedef enum KernelBVHLayout {
BVH_LAYOUT_BVH2 = (1 << 0),
BVH_LAYOUT_BVH4 = (1 << 1),
BVH_LAYOUT_BVH8 = (1 << 2),
BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4,
BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
BVH_LAYOUT_ALL = (unsigned int)(-1),
} KernelBVHLayout;
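/* The layout values are distinct bits so that a backend can advertise a mask of the
 * layouts it supports and a requested layout can be validated with a bitwise test.
 * A minimal sketch of that idea; the helper is illustrative, not the Cycles API: */
ccl_device_inline bool bvh_layout_supported_sketch(KernelBVHLayout requested,
                                                   unsigned int supported_mask)
{
  /* e.g. supported_mask = BVH_LAYOUT_BVH2 | BVH_LAYOUT_BVH8 */
  return (requested & supported_mask) != 0;
}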

View File

@ -78,6 +78,7 @@ set(SRC_HEADERS
util_sky_model.h
util_sky_model_data.h
util_avxf.h
util_avxb.h
util_sseb.h
util_ssef.h
util_ssei.h
@ -98,7 +99,9 @@ set(SRC_HEADERS
util_types_float3_impl.h
util_types_float4.h
util_types_float4_impl.h
util_types_int2.h
util_types_float8.h
util_types_float8_impl.h
util_types_int2.h
util_types_int2_impl.h
util_types_int3.h
util_types_int3_impl.h

View File

@ -0,0 +1,192 @@
/*
* Copyright 2011-2013 Intel Corporation
* Modifications Copyright 2014, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0(the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __UTIL_AVXB_H__
#define __UTIL_AVXB_H__
CCL_NAMESPACE_BEGIN
struct avxf;
/*! 8-wide AVX bool type. */
struct avxb
{
typedef avxb Mask; // mask type
typedef avxf Float; // float type
enum { size = 8 }; // number of SIMD elements
union { __m256 m256; int32_t v[8]; }; // data
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline avxb ( ) {}
__forceinline avxb ( const avxb& other ) { m256 = other.m256; }
__forceinline avxb& operator=( const avxb& other ) { m256 = other.m256; return *this; }
__forceinline avxb( const __m256 input ) : m256(input) {}
__forceinline operator const __m256&( void ) const { return m256; }
__forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
__forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
//__forceinline avxb ( bool a )
// : m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
//__forceinline avxb ( bool a, bool b)
// : m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
//__forceinline avxb ( bool a, bool b, bool c, bool d)
// : m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
//__forceinline avxb(int mask) {
// assert(mask >= 0 && mask < 16);
// m128 = _mm_lookupmask_ps[mask];
//}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
__forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {}
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; }
__forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; }
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); }
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); }
__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); }
__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; }
__forceinline const avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; }
__forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
__forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
#if defined(__KERNEL_SSE41__)
return _mm256_blendv_ps(f, t, m);
#else
return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
#endif
}
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
__forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }
#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
__forceinline const avxb shuffle( const avxb& a ) {
return _mm256_cvtepi32_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0)));
}
/*
template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) {
return _mm_movelh_ps(a, a);
}
template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
return _mm_movehl_ps(a, a);
}
template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
return _mm_movelh_ps(a, b);
}
template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
return _mm_movehl_ps(b, a);
}
#if defined(__KERNEL_SSE3__)
template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
#endif
#if defined(__KERNEL_SSE41__)
template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
#endif
*/
////////////////////////////////////////////////////////////////////////////////
/// Reduction Operations
////////////////////////////////////////////////////////////////////////////////
#if defined(__KERNEL_SSE41__)
__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
#else
__forceinline size_t popcnt( const avxb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3])+bool(a[4])+
bool(a[5])+bool(a[6])+bool(a[7]); }
#endif
__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
__forceinline bool reduce_or ( const avxb& a ) { return _mm256_movemask_ps(a) != 0x0; }
__forceinline bool all ( const avxb& b ) { return _mm256_movemask_ps(b) == 0xff; }
__forceinline bool any ( const avxb& b ) { return _mm256_movemask_ps(b) != 0x0; }
__forceinline bool none ( const avxb& b ) { return _mm256_movemask_ps(b) == 0x0; }
__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
////////////////////////////////////////////////////////////////////////////////
/// Debug Functions
////////////////////////////////////////////////////////////////////////////////
ccl_device_inline void print_avxb(const char *label, const avxb &a)
{
printf("%s: %df %df %df %df %df %df %df %d\n",
label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}
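/* Typical use of the mask helpers above: build a lane mask from a comparison, branch
 * on it, then select per lane. Variable names are illustrative only. */
ccl_device_inline void avxb_usage_sketch(const __m256 a, const __m256 b,
                                         const avxb& t, const avxb& f)
{
  const avxb m(_mm256_cmp_ps(a, b, _CMP_LT_OS));  /* lanewise a < b */
  if(any(m)) {
    const size_t lanes = popcnt(m);               /* number of true lanes */
    const avxb picked = select(m, t, f);          /* per-lane choice between t and f */
    (void)lanes; (void)picked;
  }
}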
CCL_NAMESPACE_END
#endif /* __UTIL_AVXB_H__ */

View File

@ -19,7 +19,8 @@
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_AVX__
struct avxb;
struct avxf
{
typedef avxf Float;
@ -53,6 +54,9 @@ struct avxf
__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
__forceinline avxf(float3 a) :
m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) {}
__forceinline avxf(int a3, int a2, int a1, int a0)
{
@ -73,8 +77,24 @@ struct avxf
m256 = _mm256_insertf128_ps(foo, b, 1);
}
__forceinline const float& operator [](const size_t i) const { assert(i < 8); return f[i]; }
__forceinline float& operator [](const size_t i) { assert(i < 8); return f[i]; }
};
__forceinline avxf cross(const avxf& a, const avxf& b)
{
avxf r(0.0, a[4]*b[5] - a[5]*b[4], a[6]*b[4] - a[4]*b[6], a[5]*b[6] - a[6]*b[5],
0.0, a[0]*b[1] - a[1]*b[0], a[2]*b[0] - a[0]*b[2], a[1]*b[2] - a[2]*b[1]);
return r;
}
__forceinline void dot3(const avxf& a, const avxf& b, float &den, float &den2)
{
const avxf t = _mm256_mul_ps(a.m256, b.m256);
den = ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
den2 = ((float*)&t)[4] + ((float*)&t)[5] + ((float*)&t)[6];
}
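/* cross() above computes one 3-component cross product per 128-bit half (each half laid
 * out as x, y, z, 0 from lane 0 upwards), and dot3() reduces each half separately. A
 * sketch of evaluating two triangle normals and denominators at once; names are
 * illustrative: */
ccl_device_inline void two_plane_dens_sketch(const float3 e1_a, const float3 e0_a,
                                             const float3 e1_b, const float3 e0_b,
                                             const float3 dir,
                                             float& den_a, float& den_b)
{
  /* avxf(a7, ..., a0) stores a0 in the lowest lane, so each half reads x, y, z, 0. */
  const avxf e1(0.0f, e1_b.z, e1_b.y, e1_b.x, 0.0f, e1_a.z, e1_a.y, e1_a.x);
  const avxf e0(0.0f, e0_b.z, e0_b.y, e0_b.x, 0.0f, e0_a.z, e0_a.y, e0_a.x);
  const avxf Ng = cross(e1, e0);  /* two geometric normals, one per half */
  const avxf D(0.0f, dir.z, dir.y, dir.x, 0.0f, dir.z, dir.y, dir.x);
  dot3(Ng, D, den_a, den_b);      /* den_a = dot(Ng_a, dir), den_b = dot(Ng_b, dir) */
}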
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
@ -107,6 +127,9 @@ __forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256
__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
__forceinline const avxf max(const avxf& a, const avxf& b) { return _mm256_max_ps(a.m256, b.m256); }
__forceinline const avxf min(const avxf& a, const avxf& b) { return _mm256_min_ps(a.m256, b.m256); }
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
@ -160,6 +183,18 @@ ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
}
//#if defined(__KERNEL_SSE41__)
__forceinline avxf maxi(const avxf& a, const avxf& b) {
const avxf ci = _mm256_max_ps(a, b);
return ci;
}
__forceinline avxf mini(const avxf& a, const avxf& b) {
const avxf ci = _mm256_min_ps(a, b);
return ci;
}
//#endif
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
@ -178,6 +213,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
return c-(a*b);
#endif
}
__forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) {
return _mm256_fmsub_ps(a, b, c);
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
#ifdef __KERNEL_AVX2__
__forceinline const avxb operator <=(const avxf& a, const avxf& b) {
return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
}
#endif
#endif
#ifndef _mm256_set_m128
@ -190,4 +238,3 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
CCL_NAMESPACE_END
#endif

View File

@ -57,7 +57,19 @@ void DebugFlags::CPU::reset()
#undef STRINGIFY
#undef CHECK_CPU_FLAGS
bvh_layout = BVH_LAYOUT_DEFAULT;
if(getenv("CYCLES_BVH2") != NULL) {
bvh_layout = BVH_LAYOUT_BVH2;
}
else if(getenv("CYCLES_BVH4") != NULL) {
bvh_layout = BVH_LAYOUT_BVH4;
}
else if(getenv("CYCLES_BVH8") != NULL) {
bvh_layout = BVH_LAYOUT_BVH8;
}
else {
bvh_layout = BVH_LAYOUT_DEFAULT;
}
split_kernel = false;
}
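/* The first matching variable above wins, so CYCLES_BVH2 overrides CYCLES_BVH4, which
 * overrides CYCLES_BVH8; with none of them set the compile-time default is used. A
 * hedged sketch of forcing a layout from a host application, assuming the DebugFlags()
 * singleton accessor is available; setenv() is POSIX <stdlib.h> (use _putenv_s() on
 * Windows): */
static void force_bvh4_layout_sketch()
{
  setenv("CYCLES_BVH4", "1", 1);   /* must happen before the flags are (re)initialized */
  DebugFlags().cpu.reset();        /* bvh_layout is now BVH_LAYOUT_BVH4 */
}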

View File

@ -121,6 +121,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2.h"
#include "util/util_types_float3.h"
#include "util/util_types_float4.h"
#include "util/util_types_float8.h"
#include "util/util_types_vector3.h"
@ -140,6 +141,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2_impl.h"
#include "util/util_types_float3_impl.h"
#include "util/util_types_float4_impl.h"
#include "util/util_types_float8_impl.h"
#include "util/util_types_vector3_impl.h"
@ -148,7 +150,10 @@ CCL_NAMESPACE_END
# include "util/util_sseb.h"
# include "util/util_ssei.h"
# include "util/util_ssef.h"
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# include "util/util_avxb.h"
# include "util/util_avxf.h"
#endif
#endif
#endif /* __UTIL_TYPES_H__ */

View File

@ -0,0 +1,70 @@
/*
Copyright (c) 2017, Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __UTIL_TYPES_FLOAT8_H__
#define __UTIL_TYPES_FLOAT8_H__
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util_types.h instead."
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_GPU__
struct ccl_try_align(16) float8 {
#ifdef __KERNEL_AVX2__
union {
__m256 m256;
struct { float a, b, c, d, e, f, g, h; };
};
__forceinline float8();
__forceinline float8(const float8& a);
__forceinline explicit float8(const __m256& a);
__forceinline operator const __m256&(void) const;
__forceinline operator __m256&(void);
__forceinline float8& operator =(const float8& a);
#else /* __KERNEL_AVX2__ */
float a, b, c, d, e, f, g, h;
#endif /* __KERNEL_AVX2__ */
__forceinline float operator[](int i) const;
__forceinline float& operator[](int i);
};
ccl_device_inline float8 make_float8(float f);
ccl_device_inline float8 make_float8(float a, float b, float c, float d,
float e, float f, float g, float h);
#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_FLOAT8_H__ */

View File

@ -0,0 +1,113 @@
/*
Copyright (c) 2017, Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
#define __UTIL_TYPES_FLOAT8_IMPL_H__
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util_types.h instead."
#endif
#ifndef __KERNEL_GPU__
# include <cstdio>
#endif
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_GPU__
#ifdef __KERNEL_AVX2__
__forceinline float8::float8()
{
}
__forceinline float8::float8(const float8& f)
: m256(f.m256)
{
}
__forceinline float8::float8(const __m256& f)
: m256(f)
{
}
__forceinline float8::operator const __m256&(void) const
{
return m256;
}
__forceinline float8::operator __m256&(void)
{
return m256;
}
__forceinline float8& float8::operator =(const float8& f)
{
m256 = f.m256;
return *this;
}
#endif /* __KERNEL_AVX2__ */
__forceinline float float8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
__forceinline float& float8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
ccl_device_inline float8 make_float8(float f)
{
#ifdef __KERNEL_AVX2__
float8 r(_mm256_set1_ps(f));
#else
float8 r = {f, f, f, f, f, f, f, f};
#endif
return r;
}
ccl_device_inline float8 make_float8(float a, float b, float c, float d,
float e, float f, float g, float h)
{
#ifdef __KERNEL_AVX2__
float8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));  /* setr keeps 'a' in lane 0, matching the scalar path and operator[] */
#else
float8 r = {a, b, c, d, e, f, g, h};
#endif
return r;
}
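/* Basic usage of the helpers above; with the lane order kept consistent between the
 * AVX2 and scalar paths of make_float8(), indexing behaves the same on both: */
ccl_device_inline float float8_usage_sketch()
{
  const float8 v = make_float8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
  return v[2];  /* 3.0f */
}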
#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */