Merge branch 'master' into blender2.8

commit 31278eb4bc

@@ -63,6 +63,11 @@ else()
set(BOOST_BUILD_COMMAND ./b2)
set(BOOST_BUILD_OPTIONS cxxflags=${PLATFORM_CXXFLAGS} --disable-icu boost.locale.icu=off)
set(BOOST_PATCH_COMMAND echo .)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(BOOST_ADDRESS_MODEL 64)
else()
set(BOOST_ADDRESS_MODEL 32)
endif()
endif()

set(BOOST_OPTIONS
@@ -45,7 +45,6 @@ if(WITH_OPENAL)
find_package(OpenAL)
if(OPENAL_FOUND)
set(WITH_OPENAL ON)
set(OPENAL_INCLUDE_DIR "${LIBDIR}/openal/include/AL")
else()
set(WITH_OPENAL OFF)
endif()
@@ -30,7 +30,10 @@ set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/${LIBDIR_NAME})

if(EXISTS ${LIBDIR})
file(GLOB LIB_SUBDIRS ${LIBDIR}/*)
set(CMAKE_PREFIX_PATH ${LIB_SUBDIRS})
# NOTE: Make sure "proper" compiled zlib comes first before the one
# which is a part of OpenCollada. They have different ABI, and we
# do need to use the official one.
set(CMAKE_PREFIX_PATH ${LIBDIR}/zlib ${LIB_SUBDIRS})
set(WITH_STATIC_LIBS ON)
set(WITH_OPENMP_STATIC ON)
endif()
@@ -54,6 +54,7 @@ enum_displacement_methods = (
enum_bvh_layouts = (
('BVH2', "BVH2", "", 1),
('BVH4', "BVH4", "", 2),
('BVH8', "BVH8", "", 4),
)

enum_bvh_types = (

@@ -678,7 +679,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
debug_bvh_layout: EnumProperty(
name="BVH Layout",
items=enum_bvh_layouts,
default='BVH4',
default='BVH8',
)
debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False)
@@ -635,7 +635,15 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
params.texture_limit = 0;
}

params.bvh_layout = DebugFlags().cpu.bvh_layout;
/* TODO(sergey): Once OSL supports per-microarchitecture optimization get
 * rid of this.
 */
if (params.shadingsystem == SHADINGSYSTEM_OSL) {
params.bvh_layout = BVH_LAYOUT_BVH4;
}
else {
params.bvh_layout = DebugFlags().cpu.bvh_layout;
}

return params;
}
@@ -10,6 +10,7 @@ set(SRC
bvh.cpp
bvh2.cpp
bvh4.cpp
bvh8.cpp
bvh_binning.cpp
bvh_build.cpp
bvh_node.cpp

@@ -22,6 +23,7 @@ set(SRC_HEADERS
bvh.h
bvh2.h
bvh4.h
bvh8.h
bvh_binning.h
bvh_build.h
bvh_node.h
@@ -22,6 +22,7 @@

#include "bvh/bvh2.h"
#include "bvh/bvh4.h"
#include "bvh/bvh8.h"
#include "bvh/bvh_build.h"
#include "bvh/bvh_node.h"

@@ -38,6 +39,7 @@ const char *bvh_layout_name(BVHLayout layout)
switch(layout) {
case BVH_LAYOUT_BVH2: return "BVH2";
case BVH_LAYOUT_BVH4: return "BVH4";
case BVH_LAYOUT_BVH8: return "BVH8";
case BVH_LAYOUT_NONE: return "NONE";
case BVH_LAYOUT_ALL: return "ALL";
}

@@ -92,6 +94,8 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
return new BVH2(params, objects);
case BVH_LAYOUT_BVH4:
return new BVH4(params, objects);
case BVH_LAYOUT_BVH8:
return new BVH8(params, objects);
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
break;
@@ -215,6 +219,38 @@ void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility)
}
}
visibility |= ob->visibility_for_tracing();

}
}

bool BVH::leaf_check(const BVHNode *node, BVH_TYPE bvh)
{
if(node->is_leaf()) {
return node->is_unaligned;
}
else {
return node_is_unaligned(node, bvh);
}
}

bool BVH::node_is_unaligned(const BVHNode *node, BVH_TYPE bvh)
{
const BVHNode *node0 = node->get_child(0);
const BVHNode *node1 = node->get_child(1);

switch(bvh) {
case bvh2:
return node0->is_unaligned || node1->is_unaligned;
break;
case bvh4:
return leaf_check(node0, bvh2) || leaf_check(node1, bvh2);
break;
case bvh8:
return leaf_check(node0, bvh4) || leaf_check(node1, bvh4);
break;
default:
assert(0);
return false;
}
}
@@ -291,8 +327,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 * BVH's are stored in global arrays. This function merges them into the
 * top level BVH, adjusting indexes and offsets where appropriate.
 */
/* TODO(sergey): This code needs adjustment for wider BVH than 4. */
const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);

/* Adjust primitive index to point to the triangle in the global array, for
 * meshes with transform applied and already in the top level BVH.
@@ -469,14 +505,26 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
size_t nsize, nsize_bbox;
if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
nsize = use_qbvh
? BVH_UNALIGNED_QNODE_SIZE
: BVH_UNALIGNED_NODE_SIZE;
nsize_bbox = (use_qbvh)? 13: 0;
if(use_obvh) {
nsize = BVH_UNALIGNED_ONODE_SIZE;
nsize_bbox = BVH_UNALIGNED_ONODE_SIZE-1;
}
else {
nsize = use_qbvh
? BVH_UNALIGNED_QNODE_SIZE
: BVH_UNALIGNED_NODE_SIZE;
nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE-1 : 0;
}
}
else {
nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
nsize_bbox = (use_qbvh)? 7: 0;
if(use_obvh) {
nsize = BVH_ONODE_SIZE;
nsize_bbox = BVH_ONODE_SIZE-1;
}
else {
nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
nsize_bbox = (use_qbvh)? BVH_QNODE_SIZE-1 : 0;
}
}

memcpy(pack_nodes + pack_nodes_offset,
@@ -485,16 +533,29 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)

/* Modify offsets into arrays */
int4 data = bvh_nodes[i + nsize_bbox];

data.z += (data.z < 0)? -noffset_leaf: noffset;
data.w += (data.w < 0)? -noffset_leaf: noffset;

if(use_qbvh) {
data.x += (data.x < 0)? -noffset_leaf: noffset;
data.y += (data.y < 0)? -noffset_leaf: noffset;
int4 data1 = bvh_nodes[i + nsize_bbox-1];
if(use_obvh) {
data.z += (data.z < 0) ? -noffset_leaf : noffset;
data.w += (data.w < 0) ? -noffset_leaf : noffset;
data.x += (data.x < 0) ? -noffset_leaf : noffset;
data.y += (data.y < 0) ? -noffset_leaf : noffset;
data1.z += (data1.z < 0) ? -noffset_leaf : noffset;
data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
}
else {
data.z += (data.z < 0) ? -noffset_leaf : noffset;
data.w += (data.w < 0) ? -noffset_leaf : noffset;
if(use_qbvh) {
data.x += (data.x < 0)? -noffset_leaf: noffset;
data.y += (data.y < 0)? -noffset_leaf: noffset;
}
}

pack_nodes[pack_nodes_offset + nsize_bbox] = data;
if(use_obvh) {
pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
}

/* Usually this copies nothing, but we better
 * be prepared for possible node size extension.
@@ -73,6 +73,12 @@ struct PackedBVH {
}
};

enum BVH_TYPE {
bvh2,
bvh4,
bvh8
};

/* BVH */

class BVH

@@ -93,6 +99,8 @@ protected:

/* Refit range of primitives. */
void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility);
static __forceinline bool leaf_check(const BVHNode *node, BVH_TYPE bvh);
static bool node_is_unaligned(const BVHNode *node, BVH_TYPE bvh);

/* triangles and strands */
void pack_primitives();
@@ -25,13 +25,6 @@

CCL_NAMESPACE_BEGIN

static bool node_bvh_is_unaligned(const BVHNode *node)
{
const BVHNode *node0 = node->get_child(0),
*node1 = node->get_child(1);
return node0->is_unaligned || node1->is_unaligned;
}

BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
{
@@ -195,7 +188,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
nextNodeIdx += node_bvh_is_unaligned(root)
nextNodeIdx += node_is_unaligned(root, bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}

@@ -218,7 +211,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
idx[i] = nextNodeIdx;
nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
nextNodeIdx += node_is_unaligned(e.node->get_child(i), bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}
@@ -30,27 +30,6 @@ CCL_NAMESPACE_BEGIN
 * Perhaps we can merge nodes in actual tree and make our
 * life easier all over the place.
 */
static bool node_qbvh_is_unaligned(const BVHNode *node)
{
const BVHNode *node0 = node->get_child(0),
*node1 = node->get_child(1);
bool has_unaligned = false;
if(node0->is_leaf()) {
has_unaligned |= node0->is_unaligned;
}
else {
has_unaligned |= node0->get_child(0)->is_unaligned;
has_unaligned |= node0->get_child(1)->is_unaligned;
}
if(node1->is_leaf()) {
has_unaligned |= node1->is_unaligned;
}
else {
has_unaligned |= node1->get_child(0)->is_unaligned;
has_unaligned |= node1->get_child(1)->is_unaligned;
}
return has_unaligned;
}

BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
@@ -304,7 +283,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
nextNodeIdx += node_qbvh_is_unaligned(root)
nextNodeIdx += node_is_unaligned(root, bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}

@@ -348,7 +327,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
idx = nextNodeIdx;
nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
nextNodeIdx += node_is_unaligned(nodes[i], bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}
@@ -438,7 +417,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
4);
num_nodes);
}
else {
pack_aligned_node(idx,

@@ -447,7 +426,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
4);
num_nodes);
}
}
}
@@ -0,0 +1,515 @@
|
|||
/*
|
||||
Copyright (c) 2017, Intel Corporation
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "bvh/bvh8.h"
|
||||
|
||||
#include "render/mesh.h"
|
||||
#include "render/object.h"
|
||||
|
||||
#include "bvh/bvh_node.h"
|
||||
#include "bvh/bvh_unaligned.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
BVH8::BVH8(const BVHParams& params_, const vector<Object*>& objects_)
|
||||
: BVH(params_, objects_)
|
||||
{
|
||||
}
|
||||
|
||||
void BVH8::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
|
||||
{
|
||||
float4 data[BVH_ONODE_LEAF_SIZE];
|
||||
memset(data, 0, sizeof(data));
|
||||
if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
|
||||
/* object */
|
||||
data[0].x = __int_as_float(~(leaf->lo));
|
||||
data[0].y = __int_as_float(0);
|
||||
}
|
||||
else {
|
||||
/* triangle */
|
||||
data[0].x = __int_as_float(leaf->lo);
|
||||
data[0].y = __int_as_float(leaf->hi);
|
||||
}
|
||||
data[0].z = __uint_as_float(leaf->visibility);
|
||||
if(leaf->num_triangles() != 0) {
|
||||
data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
|
||||
}
|
||||
|
||||
memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
|
||||
}
|
||||
|
||||
void BVH8::pack_inner(const BVHStackEntry& e,
|
||||
const BVHStackEntry *en,
|
||||
int num)
|
||||
{
|
||||
bool has_unaligned = false;
|
||||
/* Check whether we have to create unaligned node or all nodes are aligned
|
||||
* and we can cut some corner here.
|
||||
*/
|
||||
if(params.use_unaligned_nodes) {
|
||||
for(int i = 0; i < num; i++) {
|
||||
if(en[i].node->is_unaligned) {
|
||||
has_unaligned = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(has_unaligned) {
|
||||
/* Create unaligned node with orientation transform for each of the
 * children.
 */
|
||||
pack_unaligned_inner(e, en, num);
|
||||
}
|
||||
else {
|
||||
/* There are no unaligned children, pack into AABB node. */
|
||||
pack_aligned_inner(e, en, num);
|
||||
}
|
||||
}
|
||||
|
||||
void BVH8::pack_aligned_inner(const BVHStackEntry& e,
|
||||
const BVHStackEntry *en,
|
||||
int num)
|
||||
{
|
||||
BoundBox bounds[8];
|
||||
int child[8];
|
||||
for(int i = 0; i < num; ++i) {
|
||||
bounds[i] = en[i].node->bounds;
|
||||
child[i] = en[i].encodeIdx();
|
||||
}
|
||||
pack_aligned_node(e.idx,
|
||||
bounds,
|
||||
child,
|
||||
e.node->visibility,
|
||||
e.node->time_from,
|
||||
e.node->time_to,
|
||||
num);
|
||||
}
|
||||
|
||||
void BVH8::pack_aligned_node(int idx,
|
||||
const BoundBox *bounds,
|
||||
const int *child,
|
||||
const uint visibility,
|
||||
const float time_from,
|
||||
const float time_to,
|
||||
const int num)
|
||||
{
|
||||
float8 data[8];
|
||||
memset(data, 0, sizeof(data));
|
||||
|
||||
data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
|
||||
data[0].b = time_from;
|
||||
data[0].c = time_to;
|
||||
for(int i = 0; i < num; i++) {
|
||||
float3 bb_min = bounds[i].min;
|
||||
float3 bb_max = bounds[i].max;
|
||||
|
||||
data[1][i] = bb_min.x;
|
||||
data[2][i] = bb_max.x;
|
||||
data[3][i] = bb_min.y;
|
||||
data[4][i] = bb_max.y;
|
||||
data[5][i] = bb_min.z;
|
||||
data[6][i] = bb_max.z;
|
||||
|
||||
data[7][i] = __int_as_float(child[i]);
|
||||
}
|
||||
|
||||
for(int i = num; i < 8; i++) {
|
||||
/* We store BB which would never be recorded as intersection
|
||||
* so kernel might safely assume there are always 8 child nodes.
|
||||
*/
|
||||
data[1][i] = FLT_MAX;
|
||||
data[2][i] = -FLT_MAX;
|
||||
|
||||
data[3][i] = FLT_MAX;
|
||||
data[4][i] = -FLT_MAX;
|
||||
|
||||
data[5][i] = FLT_MAX;
|
||||
data[6][i] = -FLT_MAX;
|
||||
|
||||
data[7][i] = __int_as_float(0);
|
||||
}
|
||||
memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE);
|
||||
}
|
||||
|
||||
void BVH8::pack_unaligned_inner(const BVHStackEntry& e,
|
||||
const BVHStackEntry *en,
|
||||
int num)
|
||||
{
|
||||
Transform aligned_space[8];
|
||||
BoundBox bounds[8];
|
||||
int child[8];
|
||||
for(int i = 0; i < num; ++i) {
|
||||
aligned_space[i] = en[i].node->get_aligned_space();
|
||||
bounds[i] = en[i].node->bounds;
|
||||
child[i] = en[i].encodeIdx();
|
||||
}
|
||||
pack_unaligned_node(e.idx,
|
||||
aligned_space,
|
||||
bounds,
|
||||
child,
|
||||
e.node->visibility,
|
||||
e.node->time_from,
|
||||
e.node->time_to,
|
||||
num);
|
||||
}
|
||||
|
||||
void BVH8::pack_unaligned_node(int idx,
|
||||
const Transform *aligned_space,
|
||||
const BoundBox *bounds,
|
||||
const int *child,
|
||||
const uint visibility,
|
||||
const float time_from,
|
||||
const float time_to,
|
||||
const int num)
|
||||
{
|
||||
float8 data[BVH_UNALIGNED_ONODE_SIZE];
|
||||
memset(data, 0, sizeof(data));
|
||||
data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
|
||||
data[0].b = time_from;
|
||||
data[0].c = time_to;
|
||||
|
||||
for(int i = 0; i < num; i++) {
|
||||
Transform space = BVHUnaligned::compute_node_transform(
|
||||
bounds[i],
|
||||
aligned_space[i]);
|
||||
|
||||
data[1][i] = space.x.x;
|
||||
data[2][i] = space.x.y;
|
||||
data[3][i] = space.x.z;
|
||||
|
||||
data[4][i] = space.y.x;
|
||||
data[5][i] = space.y.y;
|
||||
data[6][i] = space.y.z;
|
||||
|
||||
data[7][i] = space.z.x;
|
||||
data[8][i] = space.z.y;
|
||||
data[9][i] = space.z.z;
|
||||
|
||||
data[10][i] = space.x.w;
|
||||
data[11][i] = space.y.w;
|
||||
data[12][i] = space.z.w;
|
||||
|
||||
data[13][i] = __int_as_float(child[i]);
|
||||
}
|
||||
|
||||
for(int i = num; i < 8; i++) {
|
||||
/* We store BB which would never be recorded as intersection
|
||||
* so kernel might safely assume there are always 8 child nodes.
|
||||
*/
|
||||
|
||||
data[1][i] = 1.0f;
|
||||
data[2][i] = 0.0f;
|
||||
data[3][i] = 0.0f;
|
||||
|
||||
data[4][i] = 0.0f;
|
||||
data[5][i] = 0.0f;
|
||||
data[6][i] = 0.0f;
|
||||
|
||||
data[7][i] = 0.0f;
|
||||
data[8][i] = 0.0f;
|
||||
data[9][i] = 0.0f;
|
||||
|
||||
data[10][i] = -FLT_MAX;
|
||||
data[11][i] = -FLT_MAX;
|
||||
data[12][i] = -FLT_MAX;
|
||||
|
||||
data[13][i] = __int_as_float(0);
|
||||
}
|
||||
|
||||
memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_ONODE_SIZE);
|
||||
}
|
||||
|
||||
/* Octo SIMD Nodes */
|
||||
|
||||
void BVH8::pack_nodes(const BVHNode *root)
|
||||
{
|
||||
/* Calculate size of the arrays required. */
|
||||
const size_t num_nodes = root->getSubtreeSize(BVH_STAT_ONODE_COUNT);
|
||||
const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
|
||||
assert(num_leaf_nodes <= num_nodes);
|
||||
const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
|
||||
size_t node_size;
|
||||
if(params.use_unaligned_nodes) {
|
||||
const size_t num_unaligned_nodes =
|
||||
root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_ONODE_COUNT);
|
||||
node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) +
|
||||
(num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE;
|
||||
}
|
||||
else {
|
||||
node_size = num_inner_nodes * BVH_ONODE_SIZE;
|
||||
}
|
||||
/* Resize arrays. */
|
||||
pack.nodes.clear();
|
||||
pack.leaf_nodes.clear();
|
||||
/* For top level BVH, first merge existing BVH's so we know the offsets. */
|
||||
if(params.top_level) {
|
||||
pack_instances(node_size, num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
|
||||
}
|
||||
else {
|
||||
pack.nodes.resize(node_size);
|
||||
pack.leaf_nodes.resize(num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
|
||||
}
|
||||
|
||||
int nextNodeIdx = 0, nextLeafNodeIdx = 0;
|
||||
|
||||
vector<BVHStackEntry> stack;
|
||||
stack.reserve(BVHParams::MAX_DEPTH*2);
|
||||
if(root->is_leaf()) {
|
||||
stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
|
||||
}
|
||||
else {
|
||||
stack.push_back(BVHStackEntry(root, nextNodeIdx));
|
||||
nextNodeIdx += node_is_unaligned(root, bvh8)
|
||||
? BVH_UNALIGNED_ONODE_SIZE
|
||||
: BVH_ONODE_SIZE;
|
||||
}
|
||||
|
||||
while(stack.size()) {
|
||||
BVHStackEntry e = stack.back();
|
||||
stack.pop_back();
|
||||
|
||||
if(e.node->is_leaf()) {
|
||||
/* leaf node */
|
||||
const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
|
||||
pack_leaf(e, leaf);
|
||||
}
|
||||
else {
|
||||
/* Inner node. */
|
||||
const BVHNode *node = e.node;
|
||||
const BVHNode *node0 = node->get_child(0);
|
||||
const BVHNode *node1 = node->get_child(1);
|
||||
/* Collect nodes. */
|
||||
const BVHNode *nodes[8];
|
||||
int numnodes = 0;
|
||||
if(node0->is_leaf()) {
|
||||
nodes[numnodes++] = node0;
|
||||
}
|
||||
else {
|
||||
const BVHNode *node00 = node0->get_child(0),
|
||||
*node01 = node0->get_child(1);
|
||||
if(node00->is_leaf()) {
|
||||
nodes[numnodes++] = node00;
|
||||
}
|
||||
else {
|
||||
nodes[numnodes++] = node00->get_child(0);
|
||||
nodes[numnodes++] = node00->get_child(1);
|
||||
}
|
||||
if(node01->is_leaf()) {
|
||||
nodes[numnodes++] = node01;
|
||||
}
|
||||
else {
|
||||
nodes[numnodes++] = node01->get_child(0);
|
||||
nodes[numnodes++] = node01->get_child(1);
|
||||
}
|
||||
}
|
||||
if(node1->is_leaf()) {
|
||||
nodes[numnodes++] = node1;
|
||||
}
|
||||
else {
|
||||
const BVHNode *node10 = node1->get_child(0),
|
||||
*node11 = node1->get_child(1);
|
||||
if(node10->is_leaf()) {
|
||||
nodes[numnodes++] = node10;
|
||||
}
|
||||
else {
|
||||
nodes[numnodes++] = node10->get_child(0);
|
||||
nodes[numnodes++] = node10->get_child(1);
|
||||
}
|
||||
if(node11->is_leaf()) {
|
||||
nodes[numnodes++] = node11;
|
||||
}
|
||||
else {
|
||||
nodes[numnodes++] = node11->get_child(0);
|
||||
nodes[numnodes++] = node11->get_child(1);
|
||||
}
|
||||
}
|
||||
/* Push entries on the stack. */
|
||||
for(int i = 0; i < numnodes; ++i) {
|
||||
int idx;
|
||||
if(nodes[i]->is_leaf()) {
|
||||
idx = nextLeafNodeIdx++;
|
||||
}
|
||||
else {
|
||||
idx = nextNodeIdx;
|
||||
nextNodeIdx += node_is_unaligned(nodes[i], bvh8)
|
||||
? BVH_UNALIGNED_ONODE_SIZE
|
||||
: BVH_ONODE_SIZE;
|
||||
}
|
||||
stack.push_back(BVHStackEntry(nodes[i], idx));
|
||||
}
|
||||
/* Set node. */
|
||||
pack_inner(e, &stack[stack.size() - numnodes], numnodes);
|
||||
}
|
||||
}
|
||||
assert(node_size == nextNodeIdx);
|
||||
/* Root index to start traversal at, to handle case of single leaf node. */
|
||||
pack.root_index = (root->is_leaf()) ? -1 : 0;
|
||||
}
|
||||
|
||||
void BVH8::refit_nodes()
|
||||
{
|
||||
assert(!params.top_level);
|
||||
|
||||
BoundBox bbox = BoundBox::empty;
|
||||
uint visibility = 0;
|
||||
refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
|
||||
}
|
||||
|
||||
void BVH8::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
|
||||
{
|
||||
if(leaf) {
|
||||
int4 *data = &pack.leaf_nodes[idx];
|
||||
int4 c = data[0];
|
||||
/* Refit leaf node. */
|
||||
for(int prim = c.x; prim < c.y; prim++) {
|
||||
int pidx = pack.prim_index[prim];
|
||||
int tob = pack.prim_object[prim];
|
||||
Object *ob = objects[tob];
|
||||
|
||||
if(pidx == -1) {
|
||||
/* Object instance. */
|
||||
bbox.grow(ob->bounds);
|
||||
}
|
||||
else {
|
||||
/* Primitives. */
|
||||
const Mesh *mesh = ob->mesh;
|
||||
|
||||
if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
|
||||
/* Curves. */
|
||||
int str_offset = (params.top_level) ? mesh->curve_offset : 0;
|
||||
Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
|
||||
int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
|
||||
|
||||
curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
|
||||
|
||||
visibility |= PATH_RAY_CURVE;
|
||||
|
||||
/* Motion curves. */
|
||||
if(mesh->use_motion_blur) {
|
||||
Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
|
||||
|
||||
if(attr) {
|
||||
size_t mesh_size = mesh->curve_keys.size();
|
||||
size_t steps = mesh->motion_steps - 1;
|
||||
float3 *key_steps = attr->data_float3();
|
||||
|
||||
for(size_t i = 0; i < steps; i++) {
|
||||
curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Triangles. */
|
||||
int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
|
||||
Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
|
||||
const float3 *vpos = &mesh->verts[0];
|
||||
|
||||
triangle.bounds_grow(vpos, bbox);
|
||||
|
||||
/* Motion triangles. */
|
||||
if(mesh->use_motion_blur) {
|
||||
Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
|
||||
|
||||
if(attr) {
|
||||
size_t mesh_size = mesh->verts.size();
|
||||
size_t steps = mesh->motion_steps - 1;
|
||||
float3 *vert_steps = attr->data_float3();
|
||||
|
||||
for(size_t i = 0; i < steps; i++) {
|
||||
triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
visibility |= ob->visibility;
|
||||
}
|
||||
|
||||
float4 leaf_data[BVH_ONODE_LEAF_SIZE];
|
||||
leaf_data[0].x = __int_as_float(c.x);
|
||||
leaf_data[0].y = __int_as_float(c.y);
|
||||
leaf_data[0].z = __uint_as_float(visibility);
|
||||
leaf_data[0].w = __uint_as_float(c.w);
|
||||
memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
|
||||
}
|
||||
else {
|
||||
int4 *data = &pack.nodes[idx];
|
||||
bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
|
||||
int4 c;
|
||||
if(is_unaligned) {
|
||||
c = data[BVH_UNALIGNED_ONODE_SIZE-1];
|
||||
}
|
||||
else {
|
||||
c = data[BVH_ONODE_SIZE-1];
|
||||
}
|
||||
/* Refit inner node, set bbox from children. */
|
||||
BoundBox child_bbox[8] = { BoundBox::empty, BoundBox::empty,
|
||||
BoundBox::empty, BoundBox::empty,
|
||||
BoundBox::empty, BoundBox::empty,
|
||||
BoundBox::empty, BoundBox::empty };
|
||||
uint child_visibility[8] = { 0 };
|
||||
int num_nodes = 0;
|
||||
|
||||
for(int i = 0; i < 8; ++i) {
|
||||
if(c[i] != 0) {
|
||||
refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
|
||||
child_bbox[i], child_visibility[i]);
|
||||
++num_nodes;
|
||||
bbox.grow(child_bbox[i]);
|
||||
visibility |= child_visibility[i];
|
||||
}
|
||||
}
|
||||
|
||||
if(is_unaligned) {
|
||||
Transform aligned_space[8] = { transform_identity(), transform_identity(),
|
||||
transform_identity(), transform_identity(),
|
||||
transform_identity(), transform_identity(),
|
||||
transform_identity(), transform_identity()};
|
||||
pack_unaligned_node(idx,
|
||||
aligned_space,
|
||||
child_bbox,
|
||||
&c[0],
|
||||
visibility,
|
||||
0.0f,
|
||||
1.0f,
|
||||
num_nodes);
|
||||
}
|
||||
else {
|
||||
pack_aligned_node(idx,
|
||||
child_bbox,
|
||||
&c[0],
|
||||
visibility,
|
||||
0.0f,
|
||||
1.0f,
|
||||
num_nodes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
|
@@ -0,0 +1,97 @@
|
|||
/*
|
||||
Copyright (c) 2017, Intel Corporation
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __BVH8_H__
|
||||
#define __BVH8_H__
|
||||
|
||||
#include "bvh/bvh.h"
|
||||
#include "bvh/bvh_params.h"
|
||||
|
||||
#include "util/util_types.h"
|
||||
#include "util/util_vector.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class BVHNode;
|
||||
struct BVHStackEntry;
|
||||
class BVHParams;
|
||||
class BoundBox;
|
||||
class LeafNode;
|
||||
class Object;
|
||||
class Progress;
|
||||
|
||||
#define BVH_ONODE_SIZE 16
|
||||
#define BVH_ONODE_LEAF_SIZE 1
|
||||
#define BVH_UNALIGNED_ONODE_SIZE 28
|
||||
|
||||
/* BVH8
|
||||
*
|
||||
* Octo BVH, with each node having eight children, to use with SIMD instructions.
|
||||
*/
|
||||
class BVH8 : public BVH {
|
||||
protected:
|
||||
/* constructor */
|
||||
friend class BVH;
|
||||
BVH8(const BVHParams& params, const vector<Object*>& objects);
|
||||
|
||||
/* pack */
|
||||
void pack_nodes(const BVHNode *root);
|
||||
|
||||
void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
|
||||
void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
|
||||
|
||||
void pack_aligned_inner(const BVHStackEntry& e,
|
||||
const BVHStackEntry *en,
|
||||
int num);
|
||||
void pack_aligned_node(int idx,
|
||||
const BoundBox *bounds,
|
||||
const int *child,
|
||||
const uint visibility,
|
||||
const float time_from,
|
||||
const float time_to,
|
||||
const int num);
|
||||
|
||||
void pack_unaligned_inner(const BVHStackEntry& e,
|
||||
const BVHStackEntry *en,
|
||||
int num);
|
||||
void pack_unaligned_node(int idx,
|
||||
const Transform *aligned_space,
|
||||
const BoundBox *bounds,
|
||||
const int *child,
|
||||
const uint visibility,
|
||||
const float time_from,
|
||||
const float time_to,
|
||||
const int num);
|
||||
|
||||
/* refit */
|
||||
void refit_nodes();
|
||||
void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __BVH8_H__ */
|
|
@@ -61,6 +61,55 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
}
}
return cnt;
case BVH_STAT_ONODE_COUNT:
cnt = 1;
for(int i = 0; i < num_children(); i++) {
BVHNode *node = get_child(i);
if(node->is_leaf()) {
cnt += 1;
}
else {
for(int j = 0; j < node->num_children(); j++)
{
BVHNode *node_next = node->get_child(j);
if(node_next->is_leaf()) {
cnt += 1;
}
else {
for(int k = 0; k < node_next->num_children(); k++) {
cnt += node_next->get_child(k)->getSubtreeSize(stat);
}
}
}
}
}
return cnt;
case BVH_STAT_UNALIGNED_INNER_ONODE_COUNT:
{
bool has_unaligned = false;
for(int i = 0; i < num_children(); i++) {
BVHNode *node = get_child(i);
if(node->is_leaf()) {
has_unaligned |= node->is_unaligned;
}
else {
for(int j = 0; j < node->num_children(); j++) {
BVHNode *node_next = node->get_child(j);
if(node_next->is_leaf()) {
has_unaligned |= node_next->is_unaligned;
}
else {
for(int k = 0; k < node_next->num_children(); k++) {
cnt += node_next->get_child(k)->getSubtreeSize(stat);
has_unaligned |= node_next->get_child(k)->is_unaligned;
}
}
}
}
}
cnt += has_unaligned? 1: 0;
}
return cnt;
case BVH_STAT_ALIGNED_COUNT:
if(!is_unaligned) {
cnt = 1;
@@ -39,6 +39,8 @@ enum BVH_STAT {
BVH_STAT_ALIGNED_LEAF_COUNT,
BVH_STAT_UNALIGNED_LEAF_COUNT,
BVH_STAT_DEPTH,
BVH_STAT_ONODE_COUNT,
BVH_STAT_UNALIGNED_INNER_ONODE_COUNT,
};

class BVHParams;
@@ -1031,6 +1031,9 @@ void device_cpu_info(vector<DeviceInfo>& devices)
if(system_cpu_support_sse2()) {
info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
}
if (system_cpu_support_avx2()) {
info.bvh_layout_mask |= BVH_LAYOUT_BVH8;
}
info.has_volume_decoupled = true;
info.has_osl = true;
info.has_half_images = true;
@@ -76,6 +76,12 @@ set(SRC_BVH_HEADERS
bvh/qbvh_traversal.h
bvh/qbvh_volume.h
bvh/qbvh_volume_all.h
bvh/obvh_nodes.h
bvh/obvh_shadow_all.h
bvh/obvh_local.h
bvh/obvh_traversal.h
bvh/obvh_volume.h
bvh/obvh_volume_all.h
)

set(SRC_HEADERS

@@ -270,6 +276,8 @@ set(SRC_UTIL_HEADERS
../util/util_types_float3_impl.h
../util/util_types_float4.h
../util/util_types_float4_impl.h
../util/util_types_float8.h
../util/util_types_float8_impl.h
../util/util_types_int2.h
../util/util_types_int2_impl.h
../util/util_types_int3.h
@@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN
/* Common QBVH functions. */
#ifdef __QBVH__
# include "kernel/bvh/qbvh_nodes.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_nodes.h"
#endif
#endif

/* Regular BVH traversal */
@@ -19,6 +19,9 @@

#ifdef __QBVH__
# include "kernel/bvh/qbvh_local.h"
# ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_local.h"
# endif
#endif

#if BVH_FEATURE(BVH_HAIR)

@@ -254,6 +257,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
int max_hits)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
local_isect,
local_object,
lcg_state,
max_hits);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
@@ -19,6 +19,9 @@

#ifdef __QBVH__
# include "kernel/bvh/qbvh_shadow_all.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_shadow_all.h"
#endif
#endif

#if BVH_FEATURE(BVH_HAIR)

@@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
uint *num_hits)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect_array,
visibility,
max_hits,
num_hits);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
@@ -20,6 +20,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_traversal.h"
#endif
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_traversal.h"
#endif

#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect

@@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect,
visibility
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
, lcg_state,
difl,
extmax
# endif
);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
@@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
#define BVH_STACK_SIZE 192
#define BVH_QSTACK_SIZE 384

#define BVH_OSTACK_SIZE 768
/* BVH intersection function variations */

#define BVH_INSTANCING 1
@@ -19,6 +19,9 @@

#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_volume.h"
#endif
#endif

#if BVH_FEATURE(BVH_HAIR)

@@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect,
visibility);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
@@ -19,6 +19,9 @@

#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume_all.h"
#ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_volume_all.h"
#endif
#endif

#if BVH_FEATURE(BVH_HAIR)

@@ -386,6 +389,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
#ifdef __KERNEL_AVX2__
case BVH_LAYOUT_BVH8:
return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
ray,
isect_array,
max_hits,
visibility);
#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
@@ -0,0 +1,409 @@
|
|||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* This is a template BVH traversal function for subsurface scattering, where
|
||||
* various features can be enabled/disabled. This way we can compile optimized
|
||||
* versions for each case without new features slowing things down.
|
||||
*
|
||||
* BVH_MOTION: motion blur rendering
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT obvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT obvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
LocalIntersection *local_isect,
|
||||
int local_object,
|
||||
uint *lcg_state,
|
||||
int max_hits)
|
||||
{
|
||||
/* Traversal stack in CUDA thread-local memory. */
|
||||
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
|
||||
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stack_ptr = 0;
|
||||
int node_addr = kernel_tex_fetch(__object_node, local_object);
|
||||
|
||||
/* Ray parameters in registers. */
|
||||
float3 P = ray->P;
|
||||
float3 dir = bvh_clamp_direction(ray->D);
|
||||
float3 idir = bvh_inverse_direction(dir);
|
||||
int object = OBJECT_NONE;
|
||||
float isect_t = ray->t;
|
||||
|
||||
local_isect->num_hits = 0;
|
||||
|
||||
const int object_flag = kernel_tex_fetch(__object_flag, local_object);
|
||||
if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
Transform ob_itfm;
|
||||
isect_t = bvh_instance_motion_push(kg,
|
||||
local_object,
|
||||
ray,
|
||||
&P,
|
||||
&dir,
|
||||
&idir,
|
||||
isect_t,
|
||||
&ob_itfm);
|
||||
#else
|
||||
isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
|
||||
#endif
|
||||
object = local_object;
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_SSE41__
|
||||
if(!isfinite(P.x)) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
avxf tnear(0.0f), tfar(isect_t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
#endif
|
||||
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
int near_x, near_y, near_z;
|
||||
int far_x, far_y, far_z;
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
|
||||
/* Traversal loop. */
|
||||
do {
|
||||
do {
|
||||
/* Traverse internal nodes. */
|
||||
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
|
||||
avxf dist;
|
||||
int child_mask = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
&dist);
|
||||
|
||||
if(child_mask != 0) {
|
||||
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
|
||||
avxf cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(child_mask);
|
||||
if(child_mask == 0) {
|
||||
node_addr = __float_as_int(cnodes[r]);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Two children are hit, push far child, and continue with
|
||||
* closer child.
|
||||
*/
|
||||
int c0 = __float_as_int(cnodes[r]);
|
||||
float d0 = ((float*)&dist)[r];
|
||||
r = __bscf(child_mask);
|
||||
int c1 = __float_as_int(cnodes[r]);
|
||||
float d1 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
if(d1 < d0) {
|
||||
node_addr = c1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
node_addr = c0;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Here starts the slow path for 3 or more hit children. We push
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c2 = __float_as_int(cnodes[r]);
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Four children are hit, push all onto stack and sort 4
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
|
||||
/* Five children are hit, push all onto stack and sort 5
|
||||
* stack items, continue with closest child
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c4 = __float_as_int(cnodes[r]);
|
||||
float d4 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
/* Six children are hit, push all onto stack and sort 6
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c5 = __float_as_int(cnodes[r]);
|
||||
float d5 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
|
||||
/* Seven children are hit, push all onto stack and sort 7
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c6 = __float_as_int(cnodes[r]);
|
||||
float d6 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
/* Eight children are hit, push all onto stack and sort 8
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c7 = __float_as_int(cnodes[r]);
|
||||
float d7 = ((float*)&dist)[r];
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c7;
|
||||
traversal_stack[stack_ptr].dist = d7;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6],
|
||||
&traversal_stack[stack_ptr - 7]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
|
||||
/* If node is leaf, fetch triangle list. */
|
||||
if(node_addr < 0) {
|
||||
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
|
||||
int prim_addr = __float_as_int(leaf.x);
|
||||
|
||||
int prim_addr2 = __float_as_int(leaf.y);
|
||||
const uint type = __float_as_int(leaf.w);
|
||||
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
|
||||
/* Primitive intersection. */
|
||||
switch(type & PRIMITIVE_ALL) {
|
||||
case PRIMITIVE_TRIANGLE: {
|
||||
/* Intersect ray against primitive, */
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
if(triangle_intersect_local(kg,
|
||||
local_isect,
|
||||
P,
|
||||
dir,
|
||||
object,
|
||||
local_object,
|
||||
prim_addr,
|
||||
isect_t,
|
||||
lcg_state,
|
||||
max_hits))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
case PRIMITIVE_MOTION_TRIANGLE: {
|
||||
/* Intersect ray against primitive. */
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
if(motion_triangle_intersect_local(kg,
|
||||
local_isect,
|
||||
P,
|
||||
dir,
|
||||
ray->time,
|
||||
object,
|
||||
local_object,
|
||||
prim_addr,
|
||||
isect_t,
|
||||
lcg_state,
|
||||
max_hits))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
return false;
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
|
|
@@ -0,0 +1,532 @@
|
|||
/*
|
||||
* Copyright 2011-2014, Blender Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
 * Aligned nodes intersection AVX code is adapted from Embree.
|
||||
*/
|
||||
|
||||
struct OBVHStackItem {
|
||||
int addr;
|
||||
float dist;
|
||||
};
|
||||
|
||||
ccl_device_inline void obvh_near_far_idx_calc(const float3& idir,
|
||||
int *ccl_restrict near_x,
|
||||
int *ccl_restrict near_y,
|
||||
int *ccl_restrict near_z,
|
||||
int *ccl_restrict far_x,
|
||||
int *ccl_restrict far_y,
|
||||
int *ccl_restrict far_z)
|
||||
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
*near_x = 0; *far_x = 1;
|
||||
*near_y = 2; *far_y = 3;
|
||||
*near_z = 4; *far_z = 5;
|
||||
|
||||
const size_t mask = movemask(ssef(idir.m128));
|
||||
|
||||
const int mask_x = mask & 1;
|
||||
const int mask_y = (mask & 2) >> 1;
|
||||
const int mask_z = (mask & 4) >> 2;
|
||||
|
||||
*near_x += mask_x; *far_x -= mask_x;
|
||||
*near_y += mask_y; *far_y -= mask_y;
|
||||
*near_z += mask_z; *far_z -= mask_z;
|
||||
#else
|
||||
if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; }
|
||||
if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; }
|
||||
if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; }
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a,
|
||||
OBVHStackItem *ccl_restrict b)
|
||||
{
|
||||
OBVHStackItem tmp = *a;
|
||||
*a = *b;
|
||||
*b = tmp;
|
||||
}
|
||||
|
||||
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
|
||||
OBVHStackItem *ccl_restrict s2,
|
||||
OBVHStackItem *ccl_restrict s3)
|
||||
{
|
||||
if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
|
||||
if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
|
||||
if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
|
||||
}
|
||||
|
||||
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
|
||||
OBVHStackItem *ccl_restrict s2,
|
||||
OBVHStackItem *ccl_restrict s3,
|
||||
OBVHStackItem *ccl_restrict s4)
|
||||
{
|
||||
if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
|
||||
if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); }
|
||||
if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); }
|
||||
if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); }
|
||||
if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
|
||||
}
|
||||
|
||||
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
|
||||
OBVHStackItem *ccl_restrict s2,
|
||||
OBVHStackItem *ccl_restrict s3,
|
||||
OBVHStackItem *ccl_restrict s4,
|
||||
OBVHStackItem *ccl_restrict s5)
|
||||
{
|
||||
obvh_stack_sort(s1, s2, s3, s4);
|
||||
if(s5->dist < s4->dist) {
|
||||
obvh_item_swap(s4, s5);
|
||||
if(s4->dist < s3->dist) {
|
||||
obvh_item_swap(s3, s4);
|
||||
if(s3->dist < s2->dist) {
|
||||
obvh_item_swap(s2, s3);
|
||||
if(s2->dist < s1->dist) {
|
||||
obvh_item_swap(s1, s2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
|
||||
OBVHStackItem *ccl_restrict s2,
|
||||
OBVHStackItem *ccl_restrict s3,
|
||||
OBVHStackItem *ccl_restrict s4,
|
||||
OBVHStackItem *ccl_restrict s5,
|
||||
OBVHStackItem *ccl_restrict s6)
|
||||
{
|
||||
obvh_stack_sort(s1, s2, s3, s4, s5);
|
||||
if(s6->dist < s5->dist) {
|
||||
obvh_item_swap(s5, s6);
|
||||
if(s5->dist < s4->dist) {
|
||||
obvh_item_swap(s4, s5);
|
||||
if(s4->dist < s3->dist) {
|
||||
obvh_item_swap(s3, s4);
|
||||
if(s3->dist < s2->dist) {
|
||||
obvh_item_swap(s2, s3);
|
||||
if(s2->dist < s1->dist) {
|
||||
obvh_item_swap(s1, s2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
|
||||
OBVHStackItem *ccl_restrict s2,
|
||||
OBVHStackItem *ccl_restrict s3,
|
||||
OBVHStackItem *ccl_restrict s4,
|
||||
OBVHStackItem *ccl_restrict s5,
|
||||
OBVHStackItem *ccl_restrict s6,
|
||||
OBVHStackItem *ccl_restrict s7)
|
||||
{
|
||||
obvh_stack_sort(s1, s2, s3, s4, s5, s6);
|
||||
if(s7->dist < s6->dist) {
|
||||
obvh_item_swap(s6, s7);
|
||||
if(s6->dist < s5->dist) {
|
||||
obvh_item_swap(s5, s6);
|
||||
if(s5->dist < s4->dist) {
|
||||
obvh_item_swap(s4, s5);
|
||||
if(s4->dist < s3->dist) {
|
||||
obvh_item_swap(s3, s4);
|
||||
if(s3->dist < s2->dist) {
|
||||
obvh_item_swap(s2, s3);
|
||||
if(s2->dist < s1->dist) {
|
||||
obvh_item_swap(s1, s2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
|
||||
OBVHStackItem *ccl_restrict s2,
|
||||
OBVHStackItem *ccl_restrict s3,
|
||||
OBVHStackItem *ccl_restrict s4,
|
||||
OBVHStackItem *ccl_restrict s5,
|
||||
OBVHStackItem *ccl_restrict s6,
|
||||
OBVHStackItem *ccl_restrict s7,
|
||||
OBVHStackItem *ccl_restrict s8)
|
||||
{
|
||||
obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7);
|
||||
if(s8->dist < s7->dist) {
|
||||
obvh_item_swap(s7, s8);
|
||||
if(s7->dist < s6->dist) {
|
||||
obvh_item_swap(s6, s7);
|
||||
if(s6->dist < s5->dist) {
|
||||
obvh_item_swap(s5, s6);
|
||||
if(s5->dist < s4->dist) {
|
||||
obvh_item_swap(s4, s5);
|
||||
if(s4->dist < s3->dist) {
|
||||
obvh_item_swap(s3, s4);
|
||||
if(s3->dist < s2->dist) {
|
||||
obvh_item_swap(s2, s3);
|
||||
if(s2->dist < s1->dist) {
|
||||
obvh_item_swap(s1, s2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
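The obvh_stack_sort() overloads above are small fixed-size sorting networks: each one reorders the just-pushed stack entries by ascending distance so the traversal loop pops the nearest child first. A minimal stand-alone check of the 3-item network, assuming hypothetical names Item and sort3 in place of OBVHStackItem and obvh_stack_sort:

#include <cassert>
#include <utility>

struct Item { int addr; float dist; };

static void sort3(Item *a, Item *b, Item *c)
{
  /* Same compare-and-swap sequence as the 3-item kernel version above. */
  if(b->dist < a->dist) std::swap(*a, *b);
  if(c->dist < b->dist) std::swap(*b, *c);
  if(b->dist < a->dist) std::swap(*a, *b);
}

int main()
{
  Item s1 = {1, 3.0f}, s2 = {2, 1.0f}, s3 = {3, 2.0f};
  sort3(&s1, &s2, &s3);
  assert(s1.dist <= s2.dist && s2.dist <= s3.dist);
  return 0;
}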
|
||||
|
||||
/* Axis-aligned nodes intersection */
|
||||
|
||||
ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
|
||||
const avxf& isect_near,
|
||||
const avxf& isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avx3f& org_idir,
|
||||
#else
|
||||
const avx3f& org,
|
||||
#endif
|
||||
const avx3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int node_addr,
|
||||
avxf *ccl_restrict dist)
|
||||
{
|
||||
const int offset = node_addr + 2;
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x);
|
||||
const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y);
|
||||
const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z);
|
||||
const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x);
|
||||
const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y);
|
||||
const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z);
|
||||
|
||||
const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
|
||||
const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
|
||||
const avxb vmask = tnear <= tfar;
|
||||
int mask = (int)movemask(vmask);
|
||||
*dist = tnear;
|
||||
return mask;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
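Under __KERNEL_AVX2__ the test above precomputes org_idir = P*idir once per ray, so each slab distance becomes a single fused multiply-subtract via the identity (bound - P) * idir == bound * idir - P * idir. A scalar illustration of that identity, where msub is a hypothetical stand-in for the SIMD helper of the same name:

#include <cassert>
#include <cmath>

static inline float msub(float a, float b, float c)
{
  /* a * b - c, the scalar analogue of the SIMD msub() used above. */
  return a * b - c;
}

int main()
{
  const float P = 0.25f, idir = 2.0f, bound = 1.5f;
  const float P_idir = P * idir;                    /* Precomputed once per ray. */
  const float t_direct = (bound - P) * idir;        /* Textbook slab distance. */
  const float t_fused = msub(bound, idir, P_idir);  /* What the kernel computes. */
  assert(std::fabs(t_direct - t_fused) < 1e-6f);
  return 0;
}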
|
||||
|
||||
ccl_device_inline int obvh_aligned_node_intersect_robust(
|
||||
KernelGlobals *ccl_restrict kg,
|
||||
const avxf& isect_near,
|
||||
const avxf& isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avx3f& P_idir,
|
||||
#else
|
||||
const avx3f& P,
|
||||
#endif
|
||||
const avx3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int node_addr,
|
||||
const float difl,
|
||||
avxf *ccl_restrict dist)
|
||||
{
|
||||
const int offset = node_addr + 2;
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x);
|
||||
const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x);
|
||||
const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y);
|
||||
const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y);
|
||||
const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z);
|
||||
const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z);
|
||||
|
||||
const float round_down = 1.0f - difl;
|
||||
const float round_up = 1.0f + difl;
|
||||
const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
|
||||
const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
|
||||
const avxb vmask = round_down*tnear <= round_up*tfar;
|
||||
int mask = (int)movemask(vmask);
|
||||
*dist = tnear;
|
||||
return mask;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
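The _robust variant above widens the slab acceptance test by the factor difl used for minimum-width hair, so segments whose bounds the ray only grazes are not culled. A scalar sketch of that acceptance test, with illustrative naming and no SIMD:

#include <cassert>

static inline bool slab_hit_robust(float tnear, float tfar, float difl)
{
  const float round_down = 1.0f - difl;
  const float round_up = 1.0f + difl;
  return round_down * tnear <= round_up * tfar;
}

int main()
{
  assert(slab_hit_robust(1.0f, 2.0f, 0.0f));    /* Plain tnear <= tfar. */
  assert(!slab_hit_robust(2.0f, 1.0f, 0.0f));   /* A clear miss stays a miss. */
  assert(slab_hit_robust(1.02f, 1.0f, 0.05f));  /* A near miss is accepted once widened. */
  return 0;
}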
|
||||
|
||||
/* Unaligned nodes intersection */
|
||||
|
||||
ccl_device_inline int obvh_unaligned_node_intersect(
|
||||
KernelGlobals *ccl_restrict kg,
|
||||
const avxf& isect_near,
|
||||
const avxf& isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avx3f& org_idir,
|
||||
#endif
|
||||
const avx3f& org,
|
||||
const avx3f& dir,
|
||||
const avx3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int node_addr,
|
||||
avxf *ccl_restrict dist)
|
||||
{
|
||||
const int offset = node_addr;
|
||||
const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
|
||||
const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
|
||||
const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
|
||||
|
||||
const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
|
||||
const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
|
||||
const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
|
||||
|
||||
const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
|
||||
const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
|
||||
const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
|
||||
|
||||
const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
|
||||
const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
|
||||
const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
|
||||
|
||||
const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
|
||||
aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
|
||||
aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
|
||||
|
||||
const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,
|
||||
aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
|
||||
aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
|
||||
|
||||
const avxf neg_one(-1.0f);
|
||||
const avxf nrdir_x = neg_one / aligned_dir_x,
|
||||
nrdir_y = neg_one / aligned_dir_y,
|
||||
nrdir_z = neg_one / aligned_dir_z;
|
||||
|
||||
const avxf tlower_x = aligned_P_x * nrdir_x,
|
||||
tlower_y = aligned_P_y * nrdir_y,
|
||||
tlower_z = aligned_P_z * nrdir_z;
|
||||
|
||||
const avxf tupper_x = tlower_x - nrdir_x,
|
||||
tupper_y = tlower_y - nrdir_y,
|
||||
tupper_z = tlower_z - nrdir_z;
|
||||
|
||||
const avxf tnear_x = min(tlower_x, tupper_x);
|
||||
const avxf tnear_y = min(tlower_y, tupper_y);
|
||||
const avxf tnear_z = min(tlower_z, tupper_z);
|
||||
const avxf tfar_x = max(tlower_x, tupper_x);
|
||||
const avxf tfar_y = max(tlower_y, tupper_y);
|
||||
const avxf tfar_z = max(tlower_z, tupper_z);
|
||||
const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
|
||||
const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
|
||||
const avxb vmask = tnear <= tfar;
|
||||
*dist = tnear;
|
||||
return movemask(vmask);
|
||||
}
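In the unaligned test above the ray is first transformed into the node's local space, where the bounds are the unit box; that is why nrdir = -1/dir yields both slab planes from one multiplication, since (0 - P)/dir = P*nrdir and (1 - P)/dir = P*nrdir - nrdir. A per-axis scalar sketch of that identity with illustrative names:

#include <cassert>
#include <cmath>

/* Slab distances against the unit interval [0, 1] along a single axis. */
static inline void unit_slab(float P, float dir, float *t_lo, float *t_hi)
{
  const float nrdir = -1.0f / dir;
  const float t0 = P * nrdir;    /* (0 - P) / dir */
  const float t1 = t0 - nrdir;   /* (1 - P) / dir */
  *t_lo = t0 < t1 ? t0 : t1;
  *t_hi = t0 < t1 ? t1 : t0;
}

int main()
{
  float t_lo, t_hi;
  unit_slab(-1.0f, 2.0f, &t_lo, &t_hi);  /* Enters the box at t=0.5, leaves at t=1.0. */
  assert(std::fabs(t_lo - 0.5f) < 1e-6f && std::fabs(t_hi - 1.0f) < 1e-6f);
  return 0;
}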
|
||||
|
||||
ccl_device_inline int obvh_unaligned_node_intersect_robust(
|
||||
KernelGlobals *ccl_restrict kg,
|
||||
const avxf& isect_near,
|
||||
const avxf& isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avx3f& P_idir,
|
||||
#endif
|
||||
const avx3f& P,
|
||||
const avx3f& dir,
|
||||
const avx3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int node_addr,
|
||||
const float difl,
|
||||
avxf *ccl_restrict dist)
|
||||
{
|
||||
const int offset = node_addr;
|
||||
const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
|
||||
const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
|
||||
const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
|
||||
|
||||
const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
|
||||
const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
|
||||
const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
|
||||
|
||||
const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
|
||||
const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
|
||||
const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
|
||||
|
||||
const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
|
||||
const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
|
||||
const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
|
||||
|
||||
const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
|
||||
aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
|
||||
aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
|
||||
|
||||
const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
|
||||
aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
|
||||
aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
|
||||
|
||||
const avxf neg_one(-1.0f);
|
||||
const avxf nrdir_x = neg_one / aligned_dir_x,
|
||||
nrdir_y = neg_one / aligned_dir_y,
|
||||
nrdir_z = neg_one / aligned_dir_z;
|
||||
|
||||
const avxf tlower_x = aligned_P_x * nrdir_x,
|
||||
tlower_y = aligned_P_y * nrdir_y,
|
||||
tlower_z = aligned_P_z * nrdir_z;
|
||||
|
||||
const avxf tupper_x = tlower_x - nrdir_x,
|
||||
tupper_y = tlower_y - nrdir_y,
|
||||
tupper_z = tlower_z - nrdir_z;
|
||||
|
||||
const float round_down = 1.0f - difl;
|
||||
const float round_up = 1.0f + difl;
|
||||
|
||||
const avxf tnear_x = min(tlower_x, tupper_x);
|
||||
const avxf tnear_y = min(tlower_y, tupper_y);
|
||||
const avxf tnear_z = min(tlower_z, tupper_z);
|
||||
const avxf tfar_x = max(tlower_x, tupper_x);
|
||||
const avxf tfar_y = max(tlower_y, tupper_y);
|
||||
const avxf tfar_z = max(tlower_z, tupper_z);
|
||||
|
||||
const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
|
||||
const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
|
||||
const avxb vmask = round_down*tnear <= round_up*tfar;
|
||||
*dist = tnear;
|
||||
return movemask(vmask);
|
||||
}
|
||||
|
||||
/* Intersectors wrappers.
|
||||
*
|
||||
* They'll check node type and call appropriate intersection code.
|
||||
*/
|
||||
|
||||
ccl_device_inline int obvh_node_intersect(
|
||||
KernelGlobals *ccl_restrict kg,
|
||||
const avxf& isect_near,
|
||||
const avxf& isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avx3f& org_idir,
|
||||
#endif
|
||||
const avx3f& org,
|
||||
const avx3f& dir,
|
||||
const avx3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int node_addr,
|
||||
avxf *ccl_restrict dist)
|
||||
{
|
||||
const int offset = node_addr;
|
||||
const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
|
||||
if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
return obvh_unaligned_node_intersect(kg,
|
||||
isect_near,
|
||||
isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
org_idir,
|
||||
#endif
|
||||
org,
|
||||
dir,
|
||||
idir,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
dist);
|
||||
}
|
||||
else {
|
||||
return obvh_aligned_node_intersect(kg,
|
||||
isect_near,
|
||||
isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
org_idir,
|
||||
#else
|
||||
org,
|
||||
#endif
|
||||
idir,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
dist);
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline int obvh_node_intersect_robust(
|
||||
KernelGlobals *ccl_restrict kg,
|
||||
const avxf& isect_near,
|
||||
const avxf& isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const avx3f& P_idir,
|
||||
#endif
|
||||
const avx3f& P,
|
||||
const avx3f& dir,
|
||||
const avx3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int node_addr,
|
||||
const float difl,
|
||||
avxf *ccl_restrict dist)
|
||||
{
|
||||
const int offset = node_addr;
|
||||
const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
|
||||
if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
return obvh_unaligned_node_intersect_robust(kg,
|
||||
isect_near,
|
||||
isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir,
|
||||
#endif
|
||||
P,
|
||||
dir,
|
||||
idir,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
difl,
|
||||
dist);
|
||||
}
|
||||
else {
|
||||
return obvh_aligned_node_intersect_robust(kg,
|
||||
isect_near,
|
||||
isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir,
|
||||
#else
|
||||
P,
|
||||
#endif
|
||||
idir,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
difl,
|
||||
dist);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,687 @@
|
|||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* This is a template BVH traversal function, where various features can be
|
||||
* enabled/disabled. This way we can compile optimized versions for each case
|
||||
* without new features slowing things down.
|
||||
*
|
||||
* BVH_INSTANCING: object instancing
|
||||
* BVH_HAIR: hair curve rendering
|
||||
* BVH_MOTION: motion blur rendering
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT obvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT obvh_aligned_node_intersect
|
||||
#endif
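The NODE_INTERSECT selection above relies on the compile-time feature mask described in the file header: the same template is compiled several times with different feature sets, so each specialized traversal function only pays for what it uses. A stand-alone sketch of the general pattern, with assumed macro names and bit values rather than the actual kernel headers:

#include <cstdio>

#define BVH_INSTANCING (1 << 0)
#define BVH_HAIR       (1 << 1)
#define BVH_MOTION     (1 << 2)

/* The file including the template would define BVH_FUNCTION_FEATURES first;
 * here one combination is picked by hand. */
#define BVH_FUNCTION_FEATURES (BVH_INSTANCING | BVH_MOTION)
#define BVH_FEATURE(f) ((BVH_FUNCTION_FEATURES & (f)) != 0)

int main()
{
#if BVH_FEATURE(BVH_HAIR)
  printf("compiled with hair support\n");
#else
  printf("compiled without hair support\n");
#endif
  return 0;
}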
|
||||
|
||||
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect_array,
|
||||
const int skip_object,
|
||||
const uint max_hits,
|
||||
uint *num_hits)
|
||||
{
|
||||
/* TODO(sergey):
|
||||
* - Test if pushing distance on the stack helps.
|
||||
* - Likely and unlikely for if() statements.
|
||||
* - Test restrict attribute for pointers.
|
||||
*/
|
||||
|
||||
/* Traversal stack in thread-local memory. */
|
||||
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
|
||||
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stack_ptr = 0;
|
||||
int node_addr = kernel_data.bvh.root;
|
||||
|
||||
/* Ray parameters in registers. */
|
||||
const float tmax = ray->t;
|
||||
float3 P = ray->P;
|
||||
float3 dir = bvh_clamp_direction(ray->D);
|
||||
float3 idir = bvh_inverse_direction(dir);
|
||||
int object = OBJECT_NONE;
|
||||
float isect_t = tmax;
|
||||
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
Transform ob_itfm;
|
||||
#endif
|
||||
|
||||
*num_hits = 0;
|
||||
isect_array->t = tmax;
|
||||
|
||||
#ifndef __KERNEL_SSE41__
|
||||
if(!isfinite(P.x)) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
int num_hits_in_instance = 0;
|
||||
#endif
|
||||
|
||||
avxf tnear(0.0f), tfar(isect_t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
#endif
|
||||
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
int near_x, near_y, near_z;
|
||||
int far_x, far_y, far_z;
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
|
||||
/* Traversal loop. */
|
||||
do {
|
||||
do {
|
||||
/* Traverse internal nodes. */
|
||||
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
|
||||
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
|
||||
(void)inodes;
|
||||
|
||||
if(false
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
|| ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
|| UNLIKELY(ray->time < inodes.y)
|
||||
|| UNLIKELY(ray->time > inodes.z)
|
||||
#endif
|
||||
) {
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
avxf dist;
|
||||
int child_mask = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
//#if !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
&dist);
|
||||
|
||||
if(child_mask != 0) {
|
||||
avxf cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(child_mask);
|
||||
if(child_mask == 0) {
|
||||
node_addr = __float_as_int(cnodes[r]);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Two children are hit, push far child, and continue with
|
||||
* closer child.
|
||||
*/
|
||||
int c0 = __float_as_int(cnodes[r]);
|
||||
float d0 = ((float*)&dist)[r];
|
||||
r = __bscf(child_mask);
|
||||
int c1 = __float_as_int(cnodes[r]);
|
||||
float d1 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
if(d1 < d0) {
|
||||
node_addr = c1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
node_addr = c0;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Here starts the slow path for 3 or more hit children. We push
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c2 = __float_as_int(cnodes[r]);
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Four children are hit, push all onto stack and sort 4
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
|
||||
/* Five children are hit, push all onto stack and sort 5
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c4 = __float_as_int(cnodes[r]);
|
||||
float d4 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Six children are hit, push all onto stack and sort 6
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c5 = __float_as_int(cnodes[r]);
|
||||
float d5 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
|
||||
/* Seven children are hit, push all onto stack and sort 7
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c6 = __float_as_int(cnodes[r]);
|
||||
float d6 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Eight children are hit, push all onto stack and sort 8
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c7 = __float_as_int(cnodes[r]);
|
||||
float d7 = ((float*)&dist)[r];
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c7;
|
||||
traversal_stack[stack_ptr].dist = d7;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6],
|
||||
&traversal_stack[stack_ptr - 7]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
|
||||
/* If node is leaf, fetch triangle list. */
|
||||
if(node_addr < 0) {
|
||||
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
int prim_addr = __float_as_int(leaf.x);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(prim_addr >= 0) {
|
||||
#endif
|
||||
int prim_addr2 = __float_as_int(leaf.y);
|
||||
const uint type = __float_as_int(leaf.w);
|
||||
const uint p_type = type & PRIMITIVE_ALL;
|
||||
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
|
||||
/* Primitive intersection. */
|
||||
if(p_type == PRIMITIVE_TRIANGLE) {
|
||||
int prim_count = prim_addr2 - prim_addr;
|
||||
if(prim_count < 3) {
|
||||
while(prim_addr < prim_addr2) {
|
||||
kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
|
||||
int hit = triangle_intersect(kg,
|
||||
isect_array,
|
||||
P,
|
||||
dir,
|
||||
PATH_RAY_SHADOW,
|
||||
object,
|
||||
prim_addr);
|
||||
/* Shadow ray early termination. */
|
||||
if(hit) {
|
||||
/* detect if this surface has a shader with transparent shadows */
|
||||
|
||||
/* todo: optimize so primitive visibility flag indicates if
|
||||
* the primitive has a transparent shadow shader? */
|
||||
int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
|
||||
int shader = 0;
|
||||
|
||||
#ifdef __HAIR__
|
||||
if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
|
||||
#endif
|
||||
{
|
||||
shader = kernel_tex_fetch(__tri_shader, prim);
|
||||
}
|
||||
#ifdef __HAIR__
|
||||
else {
|
||||
float4 str = kernel_tex_fetch(__curves, prim);
|
||||
shader = __float_as_int(str.z);
|
||||
}
|
||||
#endif
|
||||
int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
|
||||
|
||||
/* if no transparent shadows, all light is blocked */
|
||||
if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
|
||||
return true;
|
||||
}
|
||||
/* if maximum number of hits reached, block all light */
|
||||
else if(*num_hits == max_hits) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* move on to next entry in intersections array */
|
||||
isect_array++;
|
||||
(*num_hits)++;
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
num_hits_in_instance++;
|
||||
#endif
|
||||
|
||||
isect_array->t = isect_t;
|
||||
}
|
||||
|
||||
prim_addr++;
|
||||
} //while
|
||||
} else {
|
||||
kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
int* nhiptr = &num_hits_in_instance;
|
||||
#else
|
||||
int nhi = 0;
|
||||
int *nhiptr = &nhi;
|
||||
#endif
|
||||
|
||||
int result = triangle_intersect8(kg,
|
||||
&isect_array,
|
||||
P,
|
||||
dir,
|
||||
PATH_RAY_SHADOW,
|
||||
object,
|
||||
prim_addr,
|
||||
prim_count,
|
||||
num_hits,
|
||||
max_hits,
|
||||
nhiptr,
|
||||
isect_t);
|
||||
if(result == 2) {
|
||||
return true;
|
||||
}
|
||||
} // prim_count
|
||||
} // PRIMITIVE_TRIANGLE
|
||||
else {
|
||||
while(prim_addr < prim_addr2) {
|
||||
kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
|
||||
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
uint tri_object = (object == OBJECT_NONE)
|
||||
? kernel_tex_fetch(__prim_object, prim_addr)
|
||||
: object;
|
||||
if(tri_object == skip_object) {
|
||||
++prim_addr;
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool hit;
|
||||
|
||||
/* todo: specialized intersect functions which don't fill in
|
||||
* isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
|
||||
* might give a few % performance improvement */
|
||||
|
||||
switch(p_type) {
|
||||
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
case PRIMITIVE_MOTION_TRIANGLE: {
|
||||
hit = motion_triangle_intersect(kg,
|
||||
isect_array,
|
||||
P,
|
||||
dir,
|
||||
ray->time,
|
||||
PATH_RAY_SHADOW,
|
||||
object,
|
||||
prim_addr);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
case PRIMITIVE_CURVE:
|
||||
case PRIMITIVE_MOTION_CURVE: {
|
||||
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
|
||||
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
|
||||
hit = cardinal_curve_intersect(kg,
|
||||
isect_array,
|
||||
P,
|
||||
dir,
|
||||
PATH_RAY_SHADOW,
|
||||
object,
|
||||
prim_addr,
|
||||
ray->time,
|
||||
curve_type,
|
||||
NULL,
|
||||
0, 0);
|
||||
}
|
||||
else {
|
||||
hit = curve_intersect(kg,
|
||||
isect_array,
|
||||
P,
|
||||
dir,
|
||||
PATH_RAY_SHADOW,
|
||||
object,
|
||||
prim_addr,
|
||||
ray->time,
|
||||
curve_type,
|
||||
NULL,
|
||||
0, 0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
default: {
|
||||
hit = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Shadow ray early termination. */
|
||||
if(hit) {
|
||||
/* detect if this surface has a shader with transparent shadows */
|
||||
|
||||
/* todo: optimize so primitive visibility flag indicates if
|
||||
* the primitive has a transparent shadow shader? */
|
||||
int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
|
||||
int shader = 0;
|
||||
|
||||
#ifdef __HAIR__
|
||||
if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
|
||||
#endif
|
||||
{
|
||||
shader = kernel_tex_fetch(__tri_shader, prim);
|
||||
}
|
||||
#ifdef __HAIR__
|
||||
else {
|
||||
float4 str = kernel_tex_fetch(__curves, prim);
|
||||
shader = __float_as_int(str.z);
|
||||
}
|
||||
#endif
|
||||
int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
|
||||
|
||||
/* if no transparent shadows, all light is blocked */
|
||||
if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
|
||||
return true;
|
||||
}
|
||||
/* if maximum number of hits reached, block all light */
|
||||
else if(*num_hits == max_hits) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* move on to next entry in intersections array */
|
||||
isect_array++;
|
||||
(*num_hits)++;
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
num_hits_in_instance++;
|
||||
#endif
|
||||
|
||||
isect_array->t = isect_t;
|
||||
}
|
||||
|
||||
prim_addr++;
|
||||
} //while prim
|
||||
}
|
||||
}
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
else {
|
||||
/* Instance push. */
|
||||
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
|
||||
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
|
||||
# else
|
||||
isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
|
||||
# endif
|
||||
|
||||
num_hits_in_instance = 0;
|
||||
isect_array->t = isect_t;
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
node_addr = kernel_tex_fetch(__object_node, object);
|
||||
|
||||
}
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(stack_ptr >= 0) {
|
||||
kernel_assert(object != OBJECT_NONE);
|
||||
|
||||
/* Instance pop. */
|
||||
if(num_hits_in_instance) {
|
||||
float t_fac;
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
|
||||
# else
|
||||
bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
|
||||
# endif
|
||||
/* Scale isect->t to adjust for instancing. */
|
||||
for(int i = 0; i < num_hits_in_instance; i++) {
|
||||
(isect_array-i-1)->t *= t_fac;
|
||||
}
|
||||
}
|
||||
else {
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
|
||||
# else
|
||||
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
|
||||
# endif
|
||||
}
|
||||
|
||||
isect_t = tmax;
|
||||
isect_array->t = isect_t;
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
object = OBJECT_NONE;
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
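Both primitive loops in the function above follow the same record-all policy for shadow rays: an opaque hit, or a full intersection array, means the light is fully blocked, otherwise the transparent hit is stored and traversal continues. A stand-alone sketch of that policy, using illustrative types and names (Hit, shadow_blocked_all) rather than kernel ones:

#include <cstdio>
#include <vector>

struct Hit { float t; bool opaque; };

static bool shadow_blocked_all(const std::vector<Hit> &candidates,
                               unsigned max_hits,
                               std::vector<Hit> *recorded)
{
  unsigned num_hits = 0;
  for(const Hit &hit : candidates) {
    if(hit.opaque) {
      return true;              /* No transparent shadow: light fully blocked. */
    }
    if(num_hits == max_hits) {
      return true;              /* Too many transparent layers: treat as blocked. */
    }
    recorded->push_back(hit);   /* Keep the hit for transparent shadow shading. */
    num_hits++;
  }
  return false;                 /* Light reaches the point, possibly attenuated. */
}

int main()
{
  std::vector<Hit> recorded;
  const bool blocked = shadow_blocked_all({{0.2f, false}, {0.7f, true}}, 8, &recorded);
  printf("blocked=%d transparent_hits=%zu\n", blocked, recorded.size());
  return 0;
}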
|
|
@ -0,0 +1,642 @@
|
|||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* This is a template BVH traversal function, where various features can be
|
||||
* enabled/disabled. This way we can compile optimized versions for each case
|
||||
* without new features slowing things down.
|
||||
*
|
||||
* BVH_INSTANCING: object instancing
|
||||
* BVH_HAIR: hair curve rendering
|
||||
* BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
|
||||
* BVH_MOTION: motion blur rendering
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT obvh_node_intersect
|
||||
# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust
|
||||
#else
|
||||
# define NODE_INTERSECT obvh_aligned_node_intersect
|
||||
# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust
|
||||
#endif
|
||||
|
||||
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect,
|
||||
const uint visibility
|
||||
#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
,uint *lcg_state,
|
||||
float difl,
|
||||
float extmax
|
||||
#endif
|
||||
)
|
||||
{
|
||||
/* Traversal stack in thread-local memory. */
|
||||
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
|
||||
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
traversal_stack[0].dist = -FLT_MAX;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stack_ptr = 0;
|
||||
int node_addr = kernel_data.bvh.root;
|
||||
float node_dist = -FLT_MAX;
|
||||
|
||||
/* Ray parameters in registers. */
|
||||
float3 P = ray->P;
|
||||
float3 dir = bvh_clamp_direction(ray->D);
|
||||
float3 idir = bvh_inverse_direction(dir);
|
||||
int object = OBJECT_NONE;
|
||||
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
Transform ob_itfm;
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL_SSE41__
|
||||
if(!isfinite(P.x)) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
isect->t = ray->t;
|
||||
isect->u = 0.0f;
|
||||
isect->v = 0.0f;
|
||||
isect->prim = PRIM_NONE;
|
||||
isect->object = OBJECT_NONE;
|
||||
|
||||
BVH_DEBUG_INIT();
|
||||
avxf tnear(0.0f), tfar(ray->t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
#endif
|
||||
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
int near_x, near_y, near_z;
|
||||
int far_x, far_y, far_z;
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
/* Traversal loop. */
|
||||
do {
|
||||
do {
|
||||
/* Traverse internal nodes. */
|
||||
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
|
||||
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
|
||||
(void)inodes;
|
||||
|
||||
if(UNLIKELY(node_dist > isect->t)
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
|| UNLIKELY(ray->time < inodes.y)
|
||||
|| UNLIKELY(ray->time > inodes.z)
|
||||
#endif
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
|| (__float_as_uint(inodes.x) & visibility) == 0
|
||||
#endif
|
||||
)
|
||||
{
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
int child_mask;
|
||||
avxf dist;
|
||||
|
||||
BVH_DEBUG_NEXT_NODE();
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
if(difl != 0.0f) {
|
||||
/* NOTE: We extend all the child BB instead of fetching
|
||||
* and checking visibility flags for each of them.
|
||||
*
|
||||
* Need to test if doing the opposite would be any faster.
|
||||
*/
|
||||
child_mask = NODE_INTERSECT_ROBUST(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
# endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
difl,
|
||||
&dist);
|
||||
}
|
||||
else
|
||||
#endif /* BVH_HAIR_MINIMUM_WIDTH */
|
||||
{
|
||||
child_mask = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
&dist);
|
||||
}
|
||||
|
||||
if(child_mask != 0) {
|
||||
avxf cnodes;
|
||||
/* TODO(sergey): Investigate whether moving cnodes upwards
|
||||
* gives a speedup (will be different cache pattern but will
|
||||
* avoid extra check here).
|
||||
*/
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(child_mask);
|
||||
float d0 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
node_addr = __float_as_int(cnodes[r]);
|
||||
node_dist = d0;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Two children are hit, push far child, and continue with
|
||||
* closer child.
|
||||
*/
|
||||
int c0 = __float_as_int(cnodes[r]);
|
||||
r = __bscf(child_mask);
|
||||
int c1 = __float_as_int(cnodes[r]);
|
||||
float d1 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
if(d1 < d0) {
|
||||
node_addr = c1;
|
||||
node_dist = d1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
node_addr = c0;
|
||||
node_dist = d0;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Here starts the slow path for 3 or more hit children. We push
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c2 = __float_as_int(cnodes[r]);
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Four children are hit, push all onto stack and sort 4
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
|
||||
/* Five children are hit, push all onto stack and sort 5
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c4 = __float_as_int(cnodes[r]);
|
||||
float d4 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Six children are hit, push all onto stack and sort 6
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c5 = __float_as_int(cnodes[r]);
|
||||
float d5 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
|
||||
/* Seven children are hit, push all onto stack and sort 7
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c6 = __float_as_int(cnodes[r]);
|
||||
float d6 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Eight children are hit, push all onto stack and sort 8
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c7 = __float_as_int(cnodes[r]);
|
||||
float d7 = ((float*)&dist)[r];
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c7;
|
||||
traversal_stack[stack_ptr].dist = d7;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6],
|
||||
&traversal_stack[stack_ptr - 7]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
}
|
||||
|
||||
/* If node is leaf, fetch triangle list. */
|
||||
if(node_addr < 0) {
|
||||
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
|
||||
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
if(UNLIKELY((node_dist > isect->t) ||
|
||||
((__float_as_uint(leaf.z) & visibility) == 0)))
|
||||
#else
|
||||
if(UNLIKELY((node_dist > isect->t)))
|
||||
#endif
|
||||
{
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
int prim_addr = __float_as_int(leaf.x);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(prim_addr >= 0) {
|
||||
#endif
|
||||
int prim_addr2 = __float_as_int(leaf.y);
|
||||
const uint type = __float_as_int(leaf.w);
|
||||
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
|
||||
/* Primitive intersection. */
|
||||
switch(type & PRIMITIVE_ALL) {
|
||||
case PRIMITIVE_TRIANGLE: {
|
||||
int prim_count = prim_addr2 - prim_addr;
|
||||
if(prim_count < 3) {
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
BVH_DEBUG_NEXT_INTERSECTION();
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
if(triangle_intersect(kg,
|
||||
isect,
|
||||
P,
|
||||
dir,
|
||||
visibility,
|
||||
object,
|
||||
prim_addr))
|
||||
{
|
||||
tfar = avxf(isect->t);
|
||||
/* Shadow ray early termination. */
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} //for
|
||||
}
|
||||
else {
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
if(triangle_intersect8(kg,
|
||||
&isect,
|
||||
P,
|
||||
dir,
|
||||
visibility,
|
||||
object,
|
||||
prim_addr,
|
||||
prim_count,
|
||||
0,
|
||||
0,
|
||||
NULL,
|
||||
0.0f))
|
||||
{
|
||||
tfar = avxf(isect->t);
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} //prim count
|
||||
break;
|
||||
}
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
case PRIMITIVE_MOTION_TRIANGLE: {
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
BVH_DEBUG_NEXT_INTERSECTION();
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
if(motion_triangle_intersect(kg,
|
||||
isect,
|
||||
P,
|
||||
dir,
|
||||
ray->time,
|
||||
visibility,
|
||||
object,
|
||||
prim_addr))
|
||||
{
|
||||
tfar = avxf(isect->t);
|
||||
/* Shadow ray early termination. */
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif /* BVH_FEATURE(BVH_MOTION) */
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
case PRIMITIVE_CURVE:
|
||||
case PRIMITIVE_MOTION_CURVE: {
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
BVH_DEBUG_NEXT_INTERSECTION();
|
||||
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
|
||||
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
|
||||
bool hit;
|
||||
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
|
||||
hit = cardinal_curve_intersect(kg,
|
||||
isect,
|
||||
P,
|
||||
dir,
|
||||
visibility,
|
||||
object,
|
||||
prim_addr,
|
||||
ray->time,
|
||||
curve_type,
|
||||
lcg_state,
|
||||
difl,
|
||||
extmax);
|
||||
}
|
||||
else {
|
||||
hit = curve_intersect(kg,
|
||||
isect,
|
||||
P,
|
||||
dir,
|
||||
visibility,
|
||||
object,
|
||||
prim_addr,
|
||||
ray->time,
|
||||
curve_type,
|
||||
lcg_state,
|
||||
difl,
|
||||
extmax);
|
||||
}
|
||||
if(hit) {
|
||||
tfar = avxf(isect->t);
|
||||
/* Shadow ray early termination. */
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif /* BVH_FEATURE(BVH_HAIR) */
|
||||
}
|
||||
}
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
else {
|
||||
/* Instance push. */
|
||||
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
|
||||
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
|
||||
# else
|
||||
qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
|
||||
# endif
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
|
||||
traversal_stack[stack_ptr].dist = -FLT_MAX;
|
||||
|
||||
node_addr = kernel_tex_fetch(__object_node, object);
|
||||
|
||||
BVH_DEBUG_NEXT_INSTANCE();
|
||||
}
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(stack_ptr >= 0) {
|
||||
kernel_assert(object != OBJECT_NONE);
|
||||
|
||||
/* Instance pop. */
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
|
||||
# else
|
||||
isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
|
||||
# endif
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
object = OBJECT_NONE;
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
node_dist = traversal_stack[stack_ptr].dist;
|
||||
--stack_ptr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
return (isect->prim != PRIM_NONE);
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
|
||||
#undef NODE_INTERSECT_ROBUST
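The instance push/pop blocks in the traversal functions above move the ray into object space and later rescale the recorded distances back; with a unit-length world direction and an object-space direction that is re-normalized after the transform, an object-space parameter s corresponds to the world-space parameter t = s / |M*D|, which matches the t_fac scaling done in the shadow variant. A small numeric check of that identity with illustrative types, not kernel code:

#include <cassert>
#include <cmath>

struct V3 { float x, y, z; };

/* A pure scale stands in for the world-to-object transform. */
static V3 mul_scale(const V3 &v, float s)
{
  return {v.x * s, v.y * s, v.z * s};
}

static float length(const V3 &v)
{
  return std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
}

int main()
{
  const V3 D = {1.0f, 0.0f, 0.0f};       /* Unit-length world direction. */
  const float t_world = 3.0f;            /* True world-space hit distance. */
  const V3 D_ob = mul_scale(D, 2.0f);    /* Object space is scaled by 2. */
  const float len = length(D_ob);
  const float s_object = t_world * len;  /* Parameter along the re-normalized object dir. */
  const float t_fac = 1.0f / len;
  assert(std::fabs(s_object * t_fac - t_world) < 1e-6f);
  return 0;
}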
|
|
@ -0,0 +1,483 @@
|
|||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* This is a template BVH traversal function for volumes, where
|
||||
* various features can be enabled/disabled. This way we can compile optimized
|
||||
* versions for each case without new features slowing things down.
|
||||
*
|
||||
* BVH_INSTANCING: object instancing
|
||||
* BVH_MOTION: motion blur rendering
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT obvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT obvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect,
|
||||
const uint visibility)
|
||||
{
|
||||
/* Traversal stack in thread-local memory. */
|
||||
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
|
||||
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stack_ptr = 0;
|
||||
int node_addr = kernel_data.bvh.root;
|
||||
|
||||
/* Ray parameters in registers. */
|
||||
float3 P = ray->P;
|
||||
float3 dir = bvh_clamp_direction(ray->D);
|
||||
float3 idir = bvh_inverse_direction(dir);
|
||||
int object = OBJECT_NONE;
|
||||
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
Transform ob_itfm;
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL_SSE41__
|
||||
if(!isfinite(P.x)) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
isect->t = ray->t;
|
||||
isect->u = 0.0f;
|
||||
isect->v = 0.0f;
|
||||
isect->prim = PRIM_NONE;
|
||||
isect->object = OBJECT_NONE;
|
||||
|
||||
avxf tnear(0.0f), tfar(ray->t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
#endif
|
||||
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
int near_x, near_y, near_z;
|
||||
int far_x, far_y, far_z;
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
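/* Illustrative sketch (not part of the patch): per axis, the "near" bound of
 * a node is its lower slab when the ray direction is positive and its upper
 * slab when it is negative.  obvh_near_far_idx_calc() precomputes which side
 * to read for each axis; a simplified scalar version of that decision,
 * assuming 0/1 stand for the lower/upper slab, looks like this. */
static inline void near_far_idx_sketch(float idir_x, int *near_x, int *far_x)
{
	if(idir_x >= 0.0f) { *near_x = 0; *far_x = 1; }  /* ray goes +x: min slab is near */
	else               { *near_x = 1; *far_x = 0; }  /* ray goes -x: max slab is near */
}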
|
||||
|
||||
/* Traversal loop. */
|
||||
do {
|
||||
do {
|
||||
/* Traverse internal nodes. */
|
||||
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
|
||||
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
|
||||
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
if((__float_as_uint(inodes.x) & visibility) == 0) {
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
avxf dist;
|
||||
int child_mask = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
&dist);
|
||||
|
||||
if(child_mask != 0) {
|
||||
avxf cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(child_mask);
|
||||
if(child_mask == 0) {
|
||||
node_addr = __float_as_int(cnodes[r]);
|
||||
continue;
|
||||
}
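/* Illustrative sketch (not part of the patch): __bscf() is used here as a
 * "bit scan forward and clear" - it yields the index of the lowest set bit of
 * the hit mask and clears that bit, so each call extracts the next hit child.
 * A portable scalar equivalent, assuming a GCC/Clang style builtin: */
static inline int bscf_sketch(unsigned int *mask)
{
	int r = __builtin_ctz(*mask);  /* index of lowest set bit */
	*mask &= *mask - 1;            /* clear that bit */
	return r;
}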
|
||||
|
||||
/* Two children are hit, push far child, and continue with
|
||||
* closer child.
|
||||
*/
|
||||
int c0 = __float_as_int(cnodes[r]);
|
||||
float d0 = ((float*)&dist)[r];
|
||||
r = __bscf(child_mask);
|
||||
int c1 = __float_as_int(cnodes[r]);
|
||||
float d1 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
if(d1 < d0) {
|
||||
node_addr = c1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
node_addr = c0;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
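/* Illustrative sketch (not part of the patch): with exactly two hit children
 * the closer child is traversed immediately and the farther one is pushed, so
 * it is only revisited later if the ray has not been cut short by a closer
 * intersection in the meantime. */
static inline void order_two_children_sketch(int c0, float d0, int c1, float d1,
                                             int *traverse_next,
                                             int *push_addr, float *push_dist)
{
	if(d1 < d0) { *traverse_next = c1; *push_addr = c0; *push_dist = d0; }
	else        { *traverse_next = c0; *push_addr = c1; *push_dist = d1; }
}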
|
||||
|
||||
/* Here starts the slow path for 3 or 4 hit children. We push
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c2 = __float_as_int(cnodes[r]);
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
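/* Illustrative sketch (not part of the patch): obvh_stack_sort() orders the
 * topmost stack items by distance so the closest child ends up on top and is
 * popped first.  A minimal three-item version of that idea, written as a
 * small sorting network: */
static inline void stack_sort3_sketch(OBVHStackItem *s1,  /* top of stack */
                                      OBVHStackItem *s2,
                                      OBVHStackItem *s3)
{
	OBVHStackItem tmp;
	if(s2->dist < s1->dist) { tmp = *s2; *s2 = *s1; *s1 = tmp; }
	if(s3->dist < s2->dist) { tmp = *s3; *s3 = *s2; *s2 = tmp; }
	if(s2->dist < s1->dist) { tmp = *s2; *s2 = *s1; *s1 = tmp; }
}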
|
||||
|
||||
/* Four children are hit, push all onto stack and sort 4
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
|
||||
/* Five children are hit, push all onto stack and sort 5
|
||||
* stack items, continue with closest child
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c4 = __float_as_int(cnodes[r]);
|
||||
float d4 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Six children are hit, push all onto stack and sort 6
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c5 = __float_as_int(cnodes[r]);
|
||||
float d5 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
|
||||
/* Seven children are hit, push all onto stack and sort 7
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c6 = __float_as_int(cnodes[r]);
|
||||
float d6 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Eight children are hit, push all onto stack and sort 8
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c7 = __float_as_int(cnodes[r]);
|
||||
float d7 = ((float*)&dist)[r];
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c7;
|
||||
traversal_stack[stack_ptr].dist = d7;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6],
|
||||
&traversal_stack[stack_ptr - 7]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
|
||||
/* If node is leaf, fetch triangle list. */
|
||||
if(node_addr < 0) {
|
||||
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
|
||||
|
||||
if((__float_as_uint(leaf.z) & visibility) == 0) {
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
int prim_addr = __float_as_int(leaf.x);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(prim_addr >= 0) {
|
||||
#endif
|
||||
int prim_addr2 = __float_as_int(leaf.y);
|
||||
const uint type = __float_as_int(leaf.w);
|
||||
const uint p_type = type & PRIMITIVE_ALL;
|
||||
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
|
||||
/* Primitive intersection. */
|
||||
switch(p_type) {
|
||||
case PRIMITIVE_TRIANGLE: {
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
/* Only primitives from volume object. */
|
||||
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
|
||||
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
|
||||
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
|
||||
continue;
|
||||
}
|
||||
/* Intersect ray against primitive. */
|
||||
triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
case PRIMITIVE_MOTION_TRIANGLE: {
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
/* Only primitives from volume object. */
|
||||
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
|
||||
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
|
||||
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
|
||||
continue;
|
||||
}
|
||||
/* Intersect ray against primitive. */
|
||||
motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
else {
|
||||
/* Instance push. */
|
||||
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
|
||||
int object_flag = kernel_tex_fetch(__object_flag, object);
|
||||
if(object_flag & SD_OBJECT_HAS_VOLUME) {
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
|
||||
# else
|
||||
isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
|
||||
# endif
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
node_addr = kernel_tex_fetch(__object_node, object);
|
||||
}
|
||||
else {
|
||||
/* Pop. */
|
||||
object = OBJECT_NONE;
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(stack_ptr >= 0) {
|
||||
kernel_assert(object != OBJECT_NONE);
|
||||
|
||||
/* Instance pop. */
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
|
||||
# else
|
||||
isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
|
||||
# endif
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
object = OBJECT_NONE;
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
return (isect->prim != PRIM_NONE);
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
@ -0,0 +1,554 @@
|
|||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* This is a template BVH traversal function for volumes, where
 * various features can be enabled/disabled. This way we can compile optimized
 * versions for each case without new features slowing things down.
 *
 * BVH_INSTANCING: object instancing
 * BVH_MOTION: motion blur rendering
 *
 */

#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT obvh_node_intersect
#else
# define NODE_INTERSECT obvh_aligned_node_intersect
#endif
|
||||
|
||||
ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect_array,
|
||||
const uint max_hits,
|
||||
const uint visibility)
|
||||
{
|
||||
/* Traversal stack in CUDA thread-local memory. */
|
||||
OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
|
||||
traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stack_ptr = 0;
|
||||
int node_addr = kernel_data.bvh.root;
|
||||
|
||||
/* Ray parameters in registers. */
|
||||
const float tmax = ray->t;
|
||||
float3 P = ray->P;
|
||||
float3 dir = bvh_clamp_direction(ray->D);
|
||||
float3 idir = bvh_inverse_direction(dir);
|
||||
int object = OBJECT_NONE;
|
||||
float isect_t = tmax;
|
||||
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
Transform ob_itfm;
|
||||
#endif
|
||||
|
||||
uint num_hits = 0;
|
||||
isect_array->t = tmax;
|
||||
|
||||
#ifndef __KERNEL_SSE41__
|
||||
if(!isfinite(P.x)) {
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
int num_hits_in_instance = 0;
|
||||
#endif
|
||||
|
||||
avxf tnear(0.0f), tfar(isect_t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
#endif
|
||||
avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
int near_x, near_y, near_z;
|
||||
int far_x, far_y, far_z;
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
|
||||
/* Traversal loop. */
|
||||
do {
|
||||
do {
|
||||
/* Traverse internal nodes. */
|
||||
while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
|
||||
float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
|
||||
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
if((__float_as_uint(inodes.x) & visibility) == 0) {
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
avxf dist;
|
||||
int child_mask = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
node_addr,
|
||||
&dist);
|
||||
|
||||
if(child_mask != 0) {
|
||||
avxf cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(child_mask);
|
||||
if(child_mask == 0) {
|
||||
node_addr = __float_as_int(cnodes[r]);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Two children are hit, push far child, and continue with
|
||||
* closer child.
|
||||
*/
|
||||
int c0 = __float_as_int(cnodes[r]);
|
||||
float d0 = ((float*)&dist)[r];
|
||||
r = __bscf(child_mask);
|
||||
int c1 = __float_as_int(cnodes[r]);
|
||||
float d1 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
if(d1 < d0) {
|
||||
node_addr = c1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
node_addr = c0;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Here starts the slow path for 3 or 4 hit children. We push
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c1;
|
||||
traversal_stack[stack_ptr].dist = d1;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c0;
|
||||
traversal_stack[stack_ptr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c2 = __float_as_int(cnodes[r]);
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Four children are hit, push all onto stack and sort 4
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c3;
|
||||
traversal_stack[stack_ptr].dist = d3;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c2;
|
||||
traversal_stack[stack_ptr].dist = d2;
|
||||
|
||||
/* Five children are hit, push all onto stack and sort 5
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c4 = __float_as_int(cnodes[r]);
|
||||
float d4 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Six children are hit, push all onto stack and sort 6
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c5 = __float_as_int(cnodes[r]);
|
||||
float d5 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c5;
|
||||
traversal_stack[stack_ptr].dist = d5;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c4;
|
||||
traversal_stack[stack_ptr].dist = d4;
|
||||
|
||||
/* Seven children are hit, push all onto stack and sort 7
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c6 = __float_as_int(cnodes[r]);
|
||||
float d6 = ((float*)&dist)[r];
|
||||
if(child_mask == 0) {
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Eight children are hit, push all onto stack and sort 8
|
||||
* stack items, continue with closest child.
|
||||
*/
|
||||
r = __bscf(child_mask);
|
||||
int c7 = __float_as_int(cnodes[r]);
|
||||
float d7 = ((float*)&dist)[r];
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c7;
|
||||
traversal_stack[stack_ptr].dist = d7;
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = c6;
|
||||
traversal_stack[stack_ptr].dist = d6;
|
||||
obvh_stack_sort(&traversal_stack[stack_ptr],
|
||||
&traversal_stack[stack_ptr - 1],
|
||||
&traversal_stack[stack_ptr - 2],
|
||||
&traversal_stack[stack_ptr - 3],
|
||||
&traversal_stack[stack_ptr - 4],
|
||||
&traversal_stack[stack_ptr - 5],
|
||||
&traversal_stack[stack_ptr - 6],
|
||||
&traversal_stack[stack_ptr - 7]);
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
|
||||
/* If node is leaf, fetch triangle list. */
|
||||
if(node_addr < 0) {
|
||||
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
|
||||
|
||||
if((__float_as_uint(leaf.z) & visibility) == 0) {
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
int prim_addr = __float_as_int(leaf.x);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(prim_addr >= 0) {
|
||||
#endif
|
||||
int prim_addr2 = __float_as_int(leaf.y);
|
||||
const uint type = __float_as_int(leaf.w);
|
||||
const uint p_type = type & PRIMITIVE_ALL;
|
||||
bool hit;
|
||||
|
||||
/* Pop. */
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
|
||||
/* Primitive intersection. */
|
||||
switch(p_type) {
|
||||
case PRIMITIVE_TRIANGLE: {
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
/* Only primitives from volume object. */
|
||||
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
|
||||
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
|
||||
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
|
||||
continue;
|
||||
}
|
||||
/* Intersect ray against primitive. */
|
||||
hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
|
||||
if(hit) {
|
||||
/* Move on to next entry in intersections array. */
|
||||
isect_array++;
|
||||
num_hits++;
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
num_hits_in_instance++;
|
||||
#endif
|
||||
isect_array->t = isect_t;
|
||||
if(num_hits == max_hits) {
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
|
||||
# else
|
||||
Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
|
||||
float t_fac = 1.0f / len(transform_direction(&itfm, dir));
|
||||
# endif
|
||||
for(int i = 0; i < num_hits_in_instance; i++) {
|
||||
(isect_array-i-1)->t *= t_fac;
|
||||
}
|
||||
#endif /* BVH_FEATURE(BVH_INSTANCING) */
|
||||
return num_hits;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
case PRIMITIVE_MOTION_TRIANGLE: {
|
||||
for(; prim_addr < prim_addr2; prim_addr++) {
|
||||
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
|
||||
/* Only primitives from volume object. */
|
||||
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
|
||||
int object_flag = kernel_tex_fetch(__object_flag, tri_object);
|
||||
if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
|
||||
continue;
|
||||
}
|
||||
/* Intersect ray against primitive. */
|
||||
hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
|
||||
if(hit) {
|
||||
/* Move on to next entry in intersections array. */
|
||||
isect_array++;
|
||||
num_hits++;
|
||||
# if BVH_FEATURE(BVH_INSTANCING)
|
||||
num_hits_in_instance++;
|
||||
# endif
|
||||
isect_array->t = isect_t;
|
||||
if(num_hits == max_hits) {
|
||||
# if BVH_FEATURE(BVH_INSTANCING)
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
|
||||
# else
|
||||
Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
|
||||
float t_fac = 1.0f / len(transform_direction(&itfm, dir));
|
||||
# endif
|
||||
for(int i = 0; i < num_hits_in_instance; i++) {
|
||||
(isect_array-i-1)->t *= t_fac;
|
||||
}
|
||||
# endif /* BVH_FEATURE(BVH_INSTANCING) */
|
||||
return num_hits;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
else {
|
||||
/* Instance push. */
|
||||
object = kernel_tex_fetch(__prim_object, -prim_addr-1);
|
||||
int object_flag = kernel_tex_fetch(__object_flag, object);
|
||||
if(object_flag & SD_OBJECT_HAS_VOLUME) {
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
|
||||
# else
|
||||
isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
|
||||
# endif
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect_t);
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
num_hits_in_instance = 0;
|
||||
isect_array->t = isect_t;
|
||||
|
||||
++stack_ptr;
|
||||
kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
|
||||
traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
node_addr = kernel_tex_fetch(__object_node, object);
|
||||
}
|
||||
else {
|
||||
/* Pop. */
|
||||
object = OBJECT_NONE;
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
#if BVH_FEATURE(BVH_INSTANCING)
|
||||
if(stack_ptr >= 0) {
|
||||
kernel_assert(object != OBJECT_NONE);
|
||||
|
||||
/* Instance pop. */
|
||||
if(num_hits_in_instance) {
|
||||
float t_fac;
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
|
||||
# else
|
||||
bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
|
||||
# endif
|
||||
/* Scale isect->t to adjust for instancing. */
|
||||
for(int i = 0; i < num_hits_in_instance; i++) {
|
||||
(isect_array-i-1)->t *= t_fac;
|
||||
}
|
||||
}
|
||||
else {
|
||||
# if BVH_FEATURE(BVH_MOTION)
|
||||
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
|
||||
# else
|
||||
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
|
||||
# endif
|
||||
}
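/* Illustrative sketch (not part of the patch): hits recorded while traversing
 * inside an instance store object-space distances; on the way out they are
 * rescaled to world space with the factor used above, i.e. one over the
 * length of the world-space direction transformed into object space. */
ccl_device_inline float object_to_world_t_sketch(KernelGlobals *kg,
                                                 int object,
                                                 float3 world_dir,
                                                 float object_t)
{
	Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
	const float t_fac = 1.0f / len(transform_direction(&itfm, world_dir));
	return object_t * t_fac;
}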
|
||||
|
||||
isect_t = tmax;
|
||||
isect_array->t = isect_t;
|
||||
|
||||
obvh_near_far_idx_calc(idir,
|
||||
&near_x, &near_y, &near_z,
|
||||
&far_x, &far_y, &far_z);
|
||||
tfar = avxf(isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
|
||||
# endif
|
||||
idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
|
||||
# endif
|
||||
|
||||
object = OBJECT_NONE;
|
||||
node_addr = traversal_stack[stack_ptr].addr;
|
||||
--stack_ptr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
} while(node_addr != ENTRYPOINT_SENTINEL);
|
||||
|
||||
return num_hits;
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
@ -85,7 +85,8 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
|
|||
|
||||
/* Axis-aligned nodes intersection */
|
||||
|
||||
ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
|
||||
//ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
|
||||
static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
|
||||
const ssef& isect_near,
|
||||
const ssef& isect_far,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
@ -1,4 +1,4 @@
|
|||
/*
|
||||
* Copyright 2014, Blender Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
@ -70,7 +70,479 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
|
|||
return false;
|
||||
}
|
||||
|
||||
/* Special ray intersection routines for local intersection. In that case we
|
||||
#define cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D))
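/* Scalar reference for the macro above (illustrative only): per AVX lane it
 * computes A*B - C*D with a fused multiply-subtract, which is exactly one
 * component of a cross product, e.g. cross(u, v).x = u.y*v.z - u.z*v.y. */
static inline float cross256_scalar_ref(float a, float b, float c, float d)
{
	return a * b - c * d;
}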
|
||||
#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
|
||||
ccl_device_inline
|
||||
#else
|
||||
ccl_device_forceinline
|
||||
#endif
|
||||
int ray_triangle_intersect8(KernelGlobals *kg,
|
||||
float3 ray_P,
|
||||
float3 ray_dir,
|
||||
Intersection **isect,
|
||||
uint visibility,
|
||||
int object,
|
||||
__m256 *triA,
|
||||
__m256 *triB,
|
||||
__m256 *triC,
|
||||
int prim_addr,
|
||||
int prim_num,
|
||||
uint *num_hits,
|
||||
uint max_hits,
|
||||
int *num_hits_in_instance,
|
||||
float isec_t)
|
||||
{
|
||||
|
||||
const unsigned char prim_num_mask = (1 << prim_num) - 1;
|
||||
|
||||
const __m256i zero256 = _mm256_setzero_si256();
|
||||
|
||||
const __m256 Px256 = _mm256_set1_ps(ray_P.x);
|
||||
const __m256 Py256 = _mm256_set1_ps(ray_P.y);
|
||||
const __m256 Pz256 = _mm256_set1_ps(ray_P.z);
|
||||
|
||||
const __m256 dirx256 = _mm256_set1_ps(ray_dir.x);
|
||||
const __m256 diry256 = _mm256_set1_ps(ray_dir.y);
|
||||
const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
|
||||
|
||||
/* Calculate vertices relative to ray origin. */
|
||||
/* const float3 v0 = tri_c - P;
|
||||
const float3 v1 = tri_a - P;
|
||||
const float3 v2 = tri_b - P; */
|
||||
|
||||
__m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
|
||||
__m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
|
||||
__m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
|
||||
|
||||
__m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256);
|
||||
__m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256);
|
||||
__m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256);
|
||||
|
||||
__m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256);
|
||||
__m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256);
|
||||
__m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256);
|
||||
|
||||
__m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256);
|
||||
__m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256);
|
||||
__m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256);
|
||||
|
||||
__m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256);
|
||||
__m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256);
|
||||
__m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256);
|
||||
|
||||
__m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256);
|
||||
__m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
|
||||
__m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
|
||||
|
||||
/* Calculate triangle edges.
|
||||
const float3 e0 = v2 - v0;
|
||||
const float3 e1 = v0 - v1;
|
||||
const float3 e2 = v1 - v2;*/
|
||||
|
||||
__m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
|
||||
__m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
|
||||
__m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
|
||||
|
||||
__m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256);
|
||||
__m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256);
|
||||
__m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256);
|
||||
|
||||
__m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256);
|
||||
__m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
|
||||
__m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
|
||||
|
||||
/* Perform edge tests.
|
||||
const float U = dot(cross(v2 + v0, e0), ray_dir);
|
||||
const float V = dot(cross(v0 + v1, e1), ray_dir);
|
||||
const float W = dot(cross(v1 + v2, e2), ray_dir);*/
|
||||
|
||||
//cross (AyBz - AzBy, AzBx -AxBz, AxBy - AyBx)
|
||||
__m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
|
||||
__m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
|
||||
__m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
|
||||
//vertical dot
|
||||
__m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
|
||||
U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256));
|
||||
U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256));
|
||||
|
||||
__m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
|
||||
__m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
|
||||
__m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
|
||||
//vertical dot
|
||||
__m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
|
||||
V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256));
|
||||
V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256));
|
||||
|
||||
__m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
|
||||
__m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
|
||||
__m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
|
||||
//vertical dot
|
||||
__m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
|
||||
W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256));
|
||||
W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256));
|
||||
|
||||
//const float minUVW = min(U, min(V, W));
|
||||
//const float maxUVW = max(U, max(V, W));
|
||||
#if 0
|
||||
__m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256));
|
||||
__m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256));
|
||||
|
||||
//if(minUVW < 0.0f && maxUVW > 0.0f)
|
||||
__m256i mask_minmaxUVW_256 = _mm256_and_si256(
|
||||
_mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)),
|
||||
//_mm256_castps_si256(minUVW_256),
|
||||
_mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256));
|
||||
#else
|
||||
__m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
|
||||
__m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
|
||||
__m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
|
||||
__m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1);
|
||||
|
||||
const __m256i one256 = _mm256_set1_epi32(1);
|
||||
const __m256i two256 = _mm256_set1_epi32(2);
|
||||
|
||||
__m256i mask_minmaxUVW_256 = _mm256_or_si256(
|
||||
_mm256_cmpeq_epi32(one256, UVW_256_1),
|
||||
_mm256_cmpeq_epi32(two256, UVW_256_1) );
|
||||
#endif
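/* Scalar sketch of the sign test built above (illustrative only): a triangle
 * can only be hit when U, V and W share the same sign, so the code sums the
 * three IEEE sign bits per lane - a sum of 1 or 2 means mixed signs and that
 * lane is masked out as a miss. */
static inline bool edge_test_rejects_sketch(float U, float V, float W)
{
	const int sign_bits = (U < 0.0f) + (V < 0.0f) + (W < 0.0f);
	return sign_bits == 1 || sign_bits == 2;  /* mixed signs -> no hit */
}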
|
||||
|
||||
unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
|
||||
if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Calculate geometry normal and denominator. */
|
||||
// const float3 Ng1 = cross(e1, e0);
|
||||
//const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
|
||||
|
||||
__m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
|
||||
__m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
|
||||
__m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
|
||||
|
||||
//const float3 Ng = Ng1 + Ng1;
|
||||
Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
|
||||
Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
|
||||
Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
|
||||
|
||||
//const float den = dot3(Ng, dir);
|
||||
//vertical dot
|
||||
__m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
|
||||
den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256));
|
||||
den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256));
|
||||
|
||||
// __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
|
||||
|
||||
/* Perform depth test. */
|
||||
//const float T = dot3(v0, Ng);
|
||||
__m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
|
||||
T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256));
|
||||
T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256));
|
||||
|
||||
//const int sign_den = (__float_as_int(den) & 0x80000000);
|
||||
const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
|
||||
__m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
|
||||
|
||||
//const float sign_T = xor_signmask(T, sign_den);
|
||||
__m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
|
||||
|
||||
/*if((sign_T < 0.0f) || mask_minmaxUVW_pos { return false;} */
|
||||
unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
|
||||
if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
|
||||
return false;
|
||||
} /**/
|
||||
|
||||
__m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
|
||||
|
||||
|
||||
ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
|
||||
ccl_align(32) unsigned int mask_minmaxUVW8[8];
|
||||
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE){
|
||||
__m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256
|
||||
|
||||
__m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
|
||||
|
||||
__m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
|
||||
__m256 rayt_256 = _mm256_set1_ps((*isect)->t);
|
||||
|
||||
__m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
|
||||
_mm256_castps_si256(
|
||||
_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
|
||||
)
|
||||
);
|
||||
/* __m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256,
|
||||
_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256),
|
||||
_CMP_GT_OS
|
||||
) );*/
|
||||
|
||||
mask0 = _mm256_or_si256(mask1, mask0);
|
||||
//unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0));
|
||||
//unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256));
|
||||
//unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos));
|
||||
mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
|
||||
mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
|
||||
|
||||
unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
|
||||
if((mask_final & prim_num_mask) == 0) { //all bits NOT set
|
||||
return false;
|
||||
} /**/
|
||||
|
||||
unsigned long i = 0;
|
||||
#if defined(_MSC_VER)
|
||||
unsigned char res = _BitScanForward(&i, (unsigned long)mask_final);
|
||||
#else
|
||||
i = __builtin_ffs(mask_final)-1;
|
||||
#endif
|
||||
|
||||
den_256 = _mm256_rcp_ps(den_256); //inv_den
|
||||
U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den
|
||||
V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den
|
||||
T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den
|
||||
|
||||
_mm256_store_ps(U8, U_256);
|
||||
_mm256_store_ps(V8, V_256);
|
||||
_mm256_store_ps(T8, T_256);
|
||||
|
||||
|
||||
//here we assume (kernel_tex_fetch(__prim_visibility, (prim_addr +i)) & visibility) is always true
|
||||
|
||||
(*isect)->u = U8[i];
|
||||
(*isect)->v = V8[i];
|
||||
(*isect)->t = T8[i];
|
||||
|
||||
(*isect)->prim = (prim_addr + i);
|
||||
(*isect)->object = object;
|
||||
(*isect)->type = PRIMITIVE_TRIANGLE;
|
||||
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
_mm256_store_ps(den8, den_256);
|
||||
_mm256_store_ps(U8, U_256);
|
||||
_mm256_store_ps(V8, V_256);
|
||||
_mm256_store_ps(T8, T_256);
|
||||
|
||||
_mm256_store_ps(sign_T8, sign_T_256);
|
||||
_mm256_store_ps(xor_signmask8, xor_signmask_256);
|
||||
_mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
|
||||
|
||||
int ret = false;
|
||||
|
||||
if(visibility == PATH_RAY_SHADOW) {
|
||||
for(int i = 0; i < prim_num; i++) {
|
||||
if(!mask_minmaxUVW8[i]) {
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
|
||||
#endif
|
||||
{
|
||||
if((sign_T8[i] >= 0.0f) &&
|
||||
(sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
|
||||
{
|
||||
if(den8[i]) {
|
||||
const float inv_den = 1.0f / den8[i];
|
||||
|
||||
(*isect)->u = U8[i] * inv_den;
|
||||
(*isect)->v = V8[i] * inv_den;
|
||||
(*isect)->t = T8[i] * inv_den;
|
||||
|
||||
(*isect)->prim = (prim_addr + i);
|
||||
(*isect)->object = object;
|
||||
(*isect)->type = PRIMITIVE_TRIANGLE;
|
||||
|
||||
int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
|
||||
int shader = 0;
|
||||
|
||||
#ifdef __HAIR__
|
||||
if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
|
||||
#endif
|
||||
{
|
||||
shader = kernel_tex_fetch(__tri_shader, prim);
|
||||
}
|
||||
#ifdef __HAIR__
|
||||
else {
|
||||
float4 str = kernel_tex_fetch(__curves, prim);
|
||||
shader = __float_as_int(str.z);
|
||||
}
|
||||
#endif
|
||||
int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
|
||||
|
||||
/* if no transparent shadows, all light is blocked */
|
||||
if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
|
||||
return 2;
|
||||
}
|
||||
/* if maximum number of hits reached, block all light */
|
||||
else if(*num_hits == max_hits) {
|
||||
return 2;
|
||||
}
|
||||
/* move on to next entry in intersections array */
|
||||
ret = true;
|
||||
|
||||
(*isect)++;
|
||||
(*num_hits)++;
|
||||
|
||||
(*num_hits_in_instance)++;
|
||||
|
||||
(*isect)->t = isec_t;
|
||||
|
||||
} //den
|
||||
} //if sign
|
||||
} //vis
|
||||
}//if mask
|
||||
} //for
|
||||
}
|
||||
else { //default case
|
||||
for(int i = 0; i < prim_num; i++) {
|
||||
if(!mask_minmaxUVW8[i]) {
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
|
||||
#endif
|
||||
{
|
||||
if((sign_T8[i] >= 0.0f) &&
|
||||
(sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
|
||||
{
|
||||
if(den8[i]) {
|
||||
const float inv_den = 1.0f / den8[i];
|
||||
|
||||
(*isect)->u = U8[i] * inv_den;
|
||||
(*isect)->v = V8[i] * inv_den;
|
||||
(*isect)->t = T8[i] * inv_den;
|
||||
|
||||
(*isect)->prim = (prim_addr + i);
|
||||
(*isect)->object = object;
|
||||
(*isect)->type = PRIMITIVE_TRIANGLE;
|
||||
|
||||
ret = true;
|
||||
} //den
|
||||
} //if sign
|
||||
} //vis
|
||||
}//if mask
|
||||
} //for
|
||||
} //default
|
||||
return ret;
|
||||
}// else PATH_RAY_SHADOW_OPAQUE
|
||||
|
||||
}
|
||||
|
||||
//vz static
|
||||
ccl_device_inline
|
||||
int triangle_intersect8(KernelGlobals *kg,
|
||||
Intersection **isect,
|
||||
float3 P,
|
||||
float3 dir,
|
||||
uint visibility,
|
||||
int object,
|
||||
int prim_addr,
|
||||
int prim_num,
|
||||
uint *num_hits,
|
||||
uint max_hits,
|
||||
int *num_hits_in_instance,
|
||||
float isec_t)
|
||||
{
|
||||
__m128 tri_a[8], tri_b[8], tri_c[8];
|
||||
__m256 tritmp[12], tri[12];
|
||||
__m256 triA[3], triB[3], triC[3];
|
||||
|
||||
int i, r;
|
||||
|
||||
uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
|
||||
for(i = 0; i < prim_num; i++) {
|
||||
tri_a[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
|
||||
tri_b[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
|
||||
tri_c[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
|
||||
}
|
||||
//create 9 or 12 placeholders
|
||||
tri[0] = _mm256_castps128_ps256(tri_a[0]); //_mm256_zextps128_ps256
|
||||
tri[1] = _mm256_castps128_ps256(tri_b[0]);//_mm256_zextps128_ps256
|
||||
tri[2] = _mm256_castps128_ps256(tri_c[0]);//_mm256_zextps128_ps256
|
||||
|
||||
tri[3] = _mm256_castps128_ps256(tri_a[1]); //_mm256_zextps128_ps256
|
||||
tri[4] = _mm256_castps128_ps256(tri_b[1]);//_mm256_zextps128_ps256
|
||||
tri[5] = _mm256_castps128_ps256(tri_c[1]);//_mm256_zextps128_ps256
|
||||
|
||||
tri[6] = _mm256_castps128_ps256(tri_a[2]); //_mm256_zextps128_ps256
|
||||
tri[7] = _mm256_castps128_ps256(tri_b[2]);//_mm256_zextps128_ps256
|
||||
tri[8] = _mm256_castps128_ps256(tri_c[2]);//_mm256_zextps128_ps256
|
||||
|
||||
if(prim_num > 3) {
|
||||
tri[9] = _mm256_castps128_ps256(tri_a[3]); //_mm256_zextps128_ps256
|
||||
tri[10] = _mm256_castps128_ps256(tri_b[3]);//_mm256_zextps128_ps256
|
||||
tri[11] = _mm256_castps128_ps256(tri_c[3]);//_mm256_zextps128_ps256
|
||||
}
|
||||
|
||||
for(i = 4, r = 0; i < prim_num; i ++, r += 3) {
|
||||
tri[r] = _mm256_insertf128_ps(tri[r] , tri_a[i], 1);
|
||||
tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1);
|
||||
tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1);
|
||||
}
|
||||
|
||||
//------------------------------------------------
|
||||
//0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1
|
||||
//1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1
|
||||
//2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1
|
||||
|
||||
//3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1
|
||||
//4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1
|
||||
//5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1
|
||||
|
||||
//6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1
|
||||
//7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1
|
||||
//8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1
|
||||
|
||||
//9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1
|
||||
//10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1
|
||||
//11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1
|
||||
|
||||
//"transpose"
|
||||
tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); //0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5
|
||||
tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); //1! Za0 Za1 1 1 Za4 Za5 1 1
|
||||
|
||||
tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); //2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7
|
||||
tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); //3! Za2 Za3 1 1 Za6 Za7 1 1
|
||||
|
||||
tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); //4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5
|
||||
tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); //5! Zb0 Zb1 1 1 Zb4 Zb5 1 1
|
||||
|
||||
tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); //6! Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7
|
||||
tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); //7! Zb2 Zb3 1 1 Zb6 Zb7 1 1
|
||||
|
||||
tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); //8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5
|
||||
tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); //9! Zc0 Zc1 1 1 Zc4 Zc5 1 1
|
||||
|
||||
tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); //10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7
|
||||
tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); //11! Zc2 Zc3 1 1 Zc6 Zc7 1 1
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
triA[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
|
||||
triA[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
|
||||
triA[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]), _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
|
||||
|
||||
triB[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb6 Xb7
triB[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb6 Yb7
triB[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]), _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb6 Zb7
|
||||
|
||||
triC[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
|
||||
triC[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
|
||||
triC[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]), _mm256_castps_pd(tritmp[11]))); //Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
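/* Scalar reference for the repacking above (illustrative only, assumed input
 * layout of three consecutive float4 vertices per triangle): after the
 * unpack/transpose steps, triA[0][k], triA[1][k], triA[2][k] hold the x, y, z
 * of vertex A of triangle k for k = 0..7, and likewise triB/triC for the
 * other two vertices, so up to 8 triangles can be tested in one AVX pass. */
static inline void repack_vertex_a_sketch(const float4 *verts,  /* a,b,c per triangle */
                                          int prim_num,
                                          float ax[8], float ay[8], float az[8])
{
	for(int k = 0; k < prim_num; k++) {
		ax[k] = verts[3 * k + 0].x;
		ay[k] = verts[3 * k + 0].y;
		az[k] = verts[3 * k + 0].z;
	}
}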
|
||||
|
||||
int result = ray_triangle_intersect8(kg, P,
|
||||
dir,
|
||||
isect,
|
||||
visibility, object,
|
||||
triA,
|
||||
triB,
|
||||
triC,
|
||||
prim_addr,
|
||||
prim_num,
|
||||
num_hits,
|
||||
max_hits,
|
||||
num_hits_in_instance,
|
||||
isec_t);
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Special ray intersection routines for subsurface scattering. In that case we
|
||||
* only want to intersect with primitives in the same object, and in case of
|
||||
* multiple hits we pick a single random primitive as the intersection point.
|
||||
* Returns whether traversal should be stopped.
|
||||
|
@ -83,7 +555,7 @@ ccl_device_inline bool triangle_intersect_local(
|
|||
float3 P,
|
||||
float3 dir,
|
||||
int object,
|
||||
int local_object,
|
||||
int local_object,
|
||||
int prim_addr,
|
||||
float tmax,
|
||||
uint *lcg_state,
|
||||
@ -71,15 +71,13 @@ CCL_NAMESPACE_BEGIN
|
|||
/* Texture types to be compatible with CUDA textures. These are really just
|
||||
* simple arrays and after inlining fetch hopefully revert to being a simple
|
||||
* pointer lookup. */
|
||||
|
||||
template<typename T> struct texture {
|
||||
ccl_always_inline const T& fetch(int index)
|
||||
{
|
||||
kernel_assert(index >= 0 && index < width);
|
||||
return data[index];
|
||||
}
|
||||
|
||||
#ifdef __KERNEL_AVX__
|
||||
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
|
||||
/* Reads a 256-bit vector but indexes in 128-bit (ssef) blocks to maintain
* compatibility with existing indices and data structures.
|
||||
*/
|
||||
|
@ -90,7 +88,6 @@ template<typename T> struct texture {
|
|||
ssef *ssef_node_data = &ssef_data[index];
|
||||
return _mm256_loadu_ps((float *)ssef_node_data);
|
||||
}
|
||||
|
||||
#endif
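/* Illustrative sketch (a simplified standalone equivalent of the fetch above,
 * requires <immintrin.h>): the index addresses 128-bit ssef blocks while the
 * load pulls a full 256-bit AVX vector, so two consecutive ssef entries are
 * read per call. */
static inline __m256 fetch_avx_pair_sketch(const float *data, int ssef_index)
{
	return _mm256_loadu_ps(data + 4 * ssef_index);  /* 4 floats per 128-bit block */
}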
|
||||
|
||||
#ifdef __KERNEL_SSE2__
|
||||
|
@ -148,6 +145,10 @@ ccl_device_inline void print_sse3i(const char *label, sse3i& a)
|
|||
print_ssei(label, a.z);
|
||||
}
|
||||
|
||||
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
|
||||
typedef vector3<avxf> avx3f;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
@ -1384,8 +1384,9 @@ typedef enum KernelBVHLayout {
|
|||
|
||||
BVH_LAYOUT_BVH2 = (1 << 0),
|
||||
BVH_LAYOUT_BVH4 = (1 << 1),
|
||||
BVH_LAYOUT_BVH8 = (1 << 2),
|
||||
|
||||
BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4,
|
||||
BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
|
||||
BVH_LAYOUT_ALL = (unsigned int)(-1),
|
||||
} KernelBVHLayout;
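/* Illustrative sketch (not part of the patch): since the layouts are bit
 * flags, one mask can describe every layout a device supports and a requested
 * layout is checked with a plain bitwise test. */
unsigned int supported_layouts = BVH_LAYOUT_BVH2 | BVH_LAYOUT_BVH4 | BVH_LAYOUT_BVH8;
bool can_use_bvh8 = (supported_layouts & BVH_LAYOUT_BVH8) != 0;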
|
||||
|
||||
@ -78,6 +78,7 @@ set(SRC_HEADERS
|
|||
util_sky_model.h
|
||||
util_sky_model_data.h
|
||||
util_avxf.h
|
||||
util_avxb.h
|
||||
util_sseb.h
|
||||
util_ssef.h
|
||||
util_ssei.h
|
||||
|
@ -98,7 +99,9 @@ set(SRC_HEADERS
|
|||
util_types_float3_impl.h
|
||||
util_types_float4.h
|
||||
util_types_float4_impl.h
|
||||
util_types_int2.h
|
||||
util_types_float8.h
|
||||
util_types_float8_impl.h
|
||||
util_types_int2.h
|
||||
util_types_int2_impl.h
|
||||
util_types_int3.h
|
||||
util_types_int3_impl.h
|
||||
@ -0,0 +1,192 @@
|
|||
/*
|
||||
* Copyright 2011-2013 Intel Corporation
|
||||
* Modifications Copyright 2014, Blender Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0(the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __UTIL_AVXB_H__
|
||||
#define __UTIL_AVXB_H__
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
struct avxf;
|
||||
|
||||
/*! 8-wide AVX bool type. */
|
||||
struct avxb
|
||||
{
|
||||
typedef avxb Mask; // mask type
|
||||
typedef avxf Float; // float type
|
||||
|
||||
enum { size = 8 }; // number of SIMD elements
|
||||
union { __m256 m256; int32_t v[8]; }; // data
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Constructors, Assignment & Cast Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline avxb ( ) {}
|
||||
__forceinline avxb ( const avxb& other ) { m256 = other.m256; }
|
||||
__forceinline avxb& operator=( const avxb& other ) { m256 = other.m256; return *this; }
|
||||
|
||||
__forceinline avxb( const __m256 input ) : m256(input) {}
|
||||
__forceinline operator const __m256&( void ) const { return m256; }
|
||||
__forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
|
||||
__forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
|
||||
|
||||
//__forceinline avxb ( bool a )
|
||||
// : m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
|
||||
//__forceinline avxb ( bool a, bool b)
|
||||
// : m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
|
||||
//__forceinline avxb ( bool a, bool b, bool c, bool d)
|
||||
// : m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
|
||||
//__forceinline avxb(int mask) {
|
||||
// assert(mask >= 0 && mask < 16);
|
||||
// m128 = _mm_lookupmask_ps[mask];
|
||||
//}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Constants
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
|
||||
__forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Array Access
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; }
|
||||
__forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; }
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Unary Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Binary Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); }
|
||||
__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); }
|
||||
__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Assignment Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline const avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; }
|
||||
__forceinline const avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; }
|
||||
__forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Comparison Operators + Select
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
|
||||
__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
|
||||
|
||||
__forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
|
||||
#if defined(__KERNEL_SSE41__)
|
||||
return _mm256_blendv_ps(f, t, m);
|
||||
#else
|
||||
return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
|
||||
#endif
|
||||
}
|
||||
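Per lane, select() above computes m ? t : f; the SSE4.1-style blend and the and/andnot fallback agree whenever every lane of the mask is all-ones or all-zeros. A scalar sketch of that meaning, for illustration only:

/* Scalar equivalent of one lane of select(m, t, f). */
static inline bool select_lane_sketch(bool m, bool t, bool f)
{
	return m ? t : f;
}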

////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////

__forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
__forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }

#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
                                                         ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
__forceinline const avxb shuffle( const avxb& a ) {
	return _mm256_cvtepi32_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0)));
}

/*
template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) {
	return _mm_movelh_ps(a, a);
}

template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
	return _mm_movehl_ps(a, a);
}

template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}

template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
	return _mm_movelh_ps(a, b);
}

template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
	return _mm_movehl_ps(b, a);
}

#if defined(__KERNEL_SSE3__)
template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
#endif

#if defined(__KERNEL_SSE41__)
template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
#endif
*/

////////////////////////////////////////////////////////////////////////////////
/// Reduction Operations
////////////////////////////////////////////////////////////////////////////////

#if defined(__KERNEL_SSE41__)
__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
#else
__forceinline size_t popcnt( const avxb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3])+bool(a[4])+
                                                      bool(a[5])+bool(a[6])+bool(a[7]); }
#endif
__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
__forceinline bool reduce_or ( const avxb& a ) { return _mm256_movemask_ps(a) != 0x0; }
__forceinline bool all ( const avxb& b ) { return _mm256_movemask_ps(b) == 0xff; }
__forceinline bool any ( const avxb& b ) { return _mm256_movemask_ps(b) != 0x0; }
__forceinline bool none ( const avxb& b ) { return _mm256_movemask_ps(b) == 0x0; }

__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }

////////////////////////////////////////////////////////////////////////////////
/// Debug Functions
////////////////////////////////////////////////////////////////////////////////

ccl_device_inline void print_avxb(const char *label, const avxb &a)
{
	printf("%s: %df %df %df %df %df %df %df %d\n",
	       label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}

#endif

CCL_NAMESPACE_END

//#endif
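A small usage sketch for the avxb helpers defined above, assuming util_types.h is included with AVX enabled and the TrueTy/FalseTy constants from the existing SSE utility headers; the variable names are illustrative:

avxb m(True);                    /* all 8 lanes set */
avxb n(False);                   /* all 8 lanes cleared */
avxb mixed = select(m, m, n);    /* per lane: m ? m : n */
size_t set_lanes = popcnt(m);    /* 8 */
bool every_lane = all(m);        /* movemask == 0xff */
bool some_lane  = any(m ^ n);    /* at least one lane set */
size_t bits     = movemask(m);   /* one bit per lane */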
@ -19,7 +19,8 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifdef __KERNEL_AVX__
|
||||
struct avxb;
|
||||
|
||||
struct avxf
|
||||
{
|
||||
typedef avxf Float;
|
||||
|
@ -53,6 +54,9 @@ struct avxf
	__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
		m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}

	__forceinline avxf(float3 a) :
		m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) {}

	__forceinline avxf(int a3, int a2, int a1, int a0)
	{
@ -73,8 +77,24 @@ struct avxf
		m256 = _mm256_insertf128_ps(foo, b, 1);
	}

	__forceinline const float& operator [](const size_t i) const { assert(i < 8); return f[i]; }
	__forceinline float& operator [](const size_t i) { assert(i < 8); return f[i]; }
};

__forceinline avxf cross(const avxf& a, const avxf& b)
{
	avxf r(0.0, a[4]*b[5] - a[5]*b[4], a[6]*b[4] - a[4]*b[6], a[5]*b[6] - a[6]*b[5],
	       0.0, a[0]*b[1] - a[1]*b[0], a[2]*b[0] - a[0]*b[2], a[1]*b[2] - a[2]*b[1]);
	return r;
}

__forceinline void dot3(const avxf& a, const avxf& b, float &den, float &den2)
{
	const avxf t = _mm256_mul_ps(a.m256, b.m256);
	den = ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
	den2 = ((float*)&t)[4] + ((float*)&t)[5] + ((float*)&t)[6];
}

////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
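cross() and dot3() above treat one avxf as two packed float3 values: lanes 0-2 hold the first vector and lanes 4-6 the second, with lanes 3 and 7 as padding. Written out in scalar form, for illustration only:

/* den and den2 receive the dot products of the two packed float3 values. */
static inline void dot3_scalar_sketch(const float a[8], const float b[8], float *den, float *den2)
{
	*den  = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
	*den2 = a[4] * b[4] + a[5] * b[5] + a[6] * b[6];
}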
@ -107,6 +127,9 @@ __forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256

__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }

__forceinline const avxf max(const avxf& a, const avxf& b) { return _mm256_max_ps(a.m256, b.m256); }
__forceinline const avxf min(const avxf& a, const avxf& b) { return _mm256_min_ps(a.m256, b.m256); }

////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
@ -160,6 +183,18 @@ ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
	return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
}

//#if defined(__KERNEL_SSE41__)
__forceinline avxf maxi(const avxf& a, const avxf& b) {
	const avxf ci = _mm256_max_ps(a, b);
	return ci;
}

__forceinline avxf mini(const avxf& a, const avxf& b) {
	const avxf ci = _mm256_min_ps(a, b);
	return ci;
}
//#endif

////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
@ -178,6 +213,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
	return c-(a*b);
#endif
}
__forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) {
	return _mm256_fmsub_ps(a, b, c);
}

////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
#ifdef __KERNEL_AVX2__
__forceinline const avxb operator <=(const avxf& a, const avxf& b) {
	return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
}
#endif

#endif

#ifndef _mm256_set_m128
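For reference, the fused ternary ops in this header reduce per lane to simple scalar expressions; a sketch (msub is the operation added here, nmadd is shown above):

static inline float msub_scalar_sketch (float a, float b, float c) { return a * b - c; }
static inline float nmadd_scalar_sketch(float a, float b, float c) { return c - a * b; }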
@ -190,4 +238,3 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {

CCL_NAMESPACE_END

#endif
@ -57,7 +57,19 @@ void DebugFlags::CPU::reset()
|
|||
#undef STRINGIFY
|
||||
#undef CHECK_CPU_FLAGS
|
||||
|
||||
bvh_layout = BVH_LAYOUT_DEFAULT;
|
||||
if (getenv("CYCLES_BVH2") != NULL) {
|
||||
bvh_layout = BVH_LAYOUT_BVH2;
|
||||
}
|
||||
else if (getenv("CYCLES_BVH4") != NULL) {
|
||||
bvh_layout = BVH_LAYOUT_BVH4;
|
||||
}
|
||||
else if (getenv("CYCLES_BVH8") != NULL) {
|
||||
bvh_layout = BVH_LAYOUT_BVH8;
|
||||
}
|
||||
else {
|
||||
bvh_layout = BVH_LAYOUT_DEFAULT;
|
||||
}
|
||||
|
||||
split_kernel = false;
|
||||
}
|
||||
|
||||
|
|
|
@ -121,6 +121,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2.h"
#include "util/util_types_float3.h"
#include "util/util_types_float4.h"
#include "util/util_types_float8.h"

#include "util/util_types_vector3.h"

@ -140,6 +141,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2_impl.h"
#include "util/util_types_float3_impl.h"
#include "util/util_types_float4_impl.h"
#include "util/util_types_float8_impl.h"

#include "util/util_types_vector3_impl.h"

@ -148,7 +150,10 @@ CCL_NAMESPACE_END
# include "util/util_sseb.h"
# include "util/util_ssei.h"
# include "util/util_ssef.h"
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# include "util/util_avxb.h"
# include "util/util_avxf.h"
#endif
#endif

#endif /* __UTIL_TYPES_H__ */
@ -0,0 +1,70 @@
/*
Copyright (c) 2017, Intel Corporation

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
  may be used to endorse or promote products derived from this software
  without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __UTIL_TYPES_FLOAT8_H__
#define __UTIL_TYPES_FLOAT8_H__

#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util_types.h instead."
#endif

CCL_NAMESPACE_BEGIN

#ifndef __KERNEL_GPU__

struct ccl_try_align(16) float8 {
#ifdef __KERNEL_AVX2__
	union {
		__m256 m256;
		struct { float a, b, c, d, e, f, g, h; };
	};

	__forceinline float8();
	__forceinline float8(const float8& a);
	__forceinline explicit float8(const __m256& a);

	__forceinline operator const __m256&(void) const;
	__forceinline operator __m256&(void);

	__forceinline float8& operator =(const float8& a);
#else /* __KERNEL_AVX2__ */
	float a, b, c, d, e, f, g, h;
#endif /* __KERNEL_AVX2__ */

	__forceinline float operator[](int i) const;
	__forceinline float& operator[](int i);
};

ccl_device_inline float8 make_float8(float f);
ccl_device_inline float8 make_float8(float a, float b, float c, float d,
                                     float e, float f, float g, float h);
#endif /* __KERNEL_GPU__ */

CCL_NAMESPACE_END

#endif /* __UTIL_TYPES_FLOAT8_H__ */
@ -0,0 +1,113 @@
/*
Copyright (c) 2017, Intel Corporation

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
  may be used to endorse or promote products derived from this software
  without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
#define __UTIL_TYPES_FLOAT8_IMPL_H__

#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util_types.h instead."
#endif

#ifndef __KERNEL_GPU__
# include <cstdio>
#endif

CCL_NAMESPACE_BEGIN

#ifndef __KERNEL_GPU__
#ifdef __KERNEL_AVX2__
__forceinline float8::float8()
{
}

__forceinline float8::float8(const float8& f)
	: m256(f.m256)
{
}

__forceinline float8::float8(const __m256& f)
	: m256(f)
{
}

__forceinline float8::operator const __m256&(void) const
{
	return m256;
}

__forceinline float8::operator __m256&(void)
{
	return m256;
}

__forceinline float8& float8::operator =(const float8& f)
{
	m256 = f.m256;
	return *this;
}
#endif /* __KERNEL_AVX2__ */

__forceinline float float8::operator[](int i) const
{
	util_assert(i >= 0);
	util_assert(i < 8);
	return *(&a + i);
}

__forceinline float& float8::operator[](int i)
{
	util_assert(i >= 0);
	util_assert(i < 8);
	return *(&a + i);
}

ccl_device_inline float8 make_float8(float f)
{
#ifdef __KERNEL_AVX2__
	float8 r(_mm256_set1_ps(f));
#else
	float8 r = {f, f, f, f, f, f, f, f};
#endif
	return r;
}

ccl_device_inline float8 make_float8(float a, float b, float c, float d,
                                     float e, float f, float g, float h)
{
#ifdef __KERNEL_AVX2__
	float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h));
#else
	float8 r = {a, b, c, d, e, f, g, h};
#endif
	return r;
}

#endif /* __KERNEL_GPU__ */

CCL_NAMESPACE_END

#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
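A brief usage sketch for the float8 helpers above; it works with either the AVX2 or the scalar path, and the values are only illustrative:

float8 v = make_float8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
float sum = 0.0f;
for (int i = 0; i < 8; i++) {
	sum += v[i];  /* operator[] asserts 0 <= i < 8 */
}
/* sum == 36.0f regardless of how the lanes are laid out internally. */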