Cycles: Pack kernel textures into buffers for OpenCL

Image textures were being packed into a single buffer for OpenCL, which
limited the amount of memory available for images to the size of one
buffer (usually 4gb on AMD hardware). By packing textures into multiple
buffers that limit is removed, while simultaneously reducing the number
of buffers that need to be passed to each kernel.

Benchmarks were within 2%.

Fixes T51554.

Differential Revision: https://developer.blender.org/D2745
This commit is contained in:
Mai Lavelle 2017-08-08 07:12:04 -04:00
parent b53e35c655
commit ec8ae4d5e9
Notes: blender-bot 2023-02-14 06:57:57 +01:00
Referenced by issue #53249, [regression] OpenCL performance becomes very random with big scenes.
Referenced by issue #51554, OpenCL Textures exceed available single buffer allocation memory limit
25 changed files with 685 additions and 328 deletions

View File

@ -34,11 +34,13 @@ set(SRC
set(SRC_OPENCL
opencl/opencl.h
opencl/memory_manager.h
opencl/opencl_base.cpp
opencl/opencl_mega.cpp
opencl/opencl_split.cpp
opencl/opencl_util.cpp
opencl/memory_manager.cpp
)
if(WITH_CYCLES_NETWORK)

View File

@ -379,11 +379,9 @@ DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
info.num = 0;
info.has_bindless_textures = true;
info.pack_images = false;
foreach(DeviceInfo &device, subdevices) {
assert(device.type == info.multi_devices[0].type);
info.pack_images |= device.pack_images;
info.has_bindless_textures &= device.has_bindless_textures;
}

View File

@ -53,7 +53,6 @@ public:
int num;
bool display_device;
bool advanced_shading;
bool pack_images;
bool has_bindless_textures; /* flag for GPU and Multi device */
bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
vector<DeviceInfo> multi_devices;
@ -65,7 +64,6 @@ public:
num = 0;
display_device = false;
advanced_shading = true;
pack_images = false;
has_bindless_textures = false;
use_split_kernel = false;
}

View File

@ -977,7 +977,6 @@ void device_cpu_info(vector<DeviceInfo>& devices)
info.id = "CPU";
info.num = 0;
info.advanced_shading = true;
info.pack_images = false;
devices.insert(devices.begin(), info);
}

View File

@ -2164,7 +2164,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
info.advanced_shading = (major >= 2);
info.has_bindless_textures = (major >= 3);
info.pack_images = false;
int pci_location[3] = {0, 0, 0};
cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);

View File

@ -95,7 +95,6 @@ void device_opencl_info(vector<DeviceInfo>& devices)
/* We don't know if it's used for display, but assume it is. */
info.display_device = true;
info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
info.pack_images = true;
info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
device_type);
info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;

View File

@ -0,0 +1,253 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_OPENCL
#include "util/util_foreach.h"
#include "device/opencl/opencl.h"
#include "device/opencl/memory_manager.h"
CCL_NAMESPACE_BEGIN
void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation)
{
allocations.push_back(&allocation);
}
void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device)
{
bool need_realloc = false;
/* Calculate total size and remove any freed. */
size_t total_size = 0;
for(int i = allocations.size()-1; i >= 0; i--) {
Allocation* allocation = allocations[i];
/* Remove allocations that have been freed. */
if(!allocation->mem || allocation->mem->memory_size() == 0) {
allocation->device_buffer = NULL;
allocation->size = 0;
allocations.erase(allocations.begin()+i);
need_realloc = true;
continue;
}
/* Get actual size for allocation. */
size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
if(allocation->size != alloc_size) {
/* Allocation is either new or resized. */
allocation->size = alloc_size;
allocation->needs_copy_to_device = true;
need_realloc = true;
}
total_size += alloc_size;
}
if(need_realloc) {
cl_ulong max_buffer_size;
clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
if(total_size > max_buffer_size) {
device->set_error("Scene too complex to fit in available memory.");
return;
}
device_memory *new_buffer = new device_memory;
new_buffer->resize(total_size);
device->mem_alloc(string_printf("buffer_%p", this).data(), *new_buffer, MEM_READ_ONLY);
size_t offset = 0;
foreach(Allocation* allocation, allocations) {
if(allocation->needs_copy_to_device) {
/* Copy from host to device. */
opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
CL_MEM_PTR(new_buffer->device_pointer),
CL_FALSE,
offset,
allocation->mem->memory_size(),
(void*)allocation->mem->data_pointer,
0, NULL, NULL
));
allocation->needs_copy_to_device = false;
}
else {
/* Fast copy from memory already on device. */
opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue,
CL_MEM_PTR(buffer->device_pointer),
CL_MEM_PTR(new_buffer->device_pointer),
allocation->desc.offset,
offset,
allocation->mem->memory_size(),
0, NULL, NULL
));
}
allocation->desc.offset = offset;
offset += allocation->size;
}
device->mem_free(*buffer);
delete buffer;
buffer = new_buffer;
}
else {
assert(total_size == buffer->data_size);
size_t offset = 0;
foreach(Allocation* allocation, allocations) {
if(allocation->needs_copy_to_device) {
/* Copy from host to device. */
opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
CL_MEM_PTR(buffer->device_pointer),
CL_FALSE,
offset,
allocation->mem->memory_size(),
(void*)allocation->mem->data_pointer,
0, NULL, NULL
));
allocation->needs_copy_to_device = false;
}
offset += allocation->size;
}
}
/* Not really necessary, but seems to improve responsiveness for some reason. */
clFinish(device->cqCommandQueue);
}
void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *device)
{
device->mem_free(*buffer);
}
MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer()
{
DeviceBuffer* smallest = device_buffers;
foreach(DeviceBuffer& device_buffer, device_buffers) {
if(device_buffer.size < smallest->size) {
smallest = &device_buffer;
}
}
return smallest;
}
MemoryManager::MemoryManager(OpenCLDeviceBase *device) : device(device), need_update(false)
{
}
void MemoryManager::free()
{
foreach(DeviceBuffer& device_buffer, device_buffers) {
device_buffer.free(device);
}
}
void MemoryManager::alloc(const char *name, device_memory& mem)
{
Allocation& allocation = allocations[name];
allocation.mem = &mem;
allocation.needs_copy_to_device = true;
if(!allocation.device_buffer) {
DeviceBuffer* device_buffer = smallest_device_buffer();
allocation.device_buffer = device_buffer;
allocation.desc.device_buffer = device_buffer - device_buffers;
device_buffer->add_allocation(allocation);
device_buffer->size += mem.memory_size();
}
need_update = true;
}
bool MemoryManager::free(device_memory& mem)
{
foreach(AllocationsMap::value_type& value, allocations) {
Allocation& allocation = value.second;
if(allocation.mem == &mem) {
allocation.device_buffer->size -= mem.memory_size();
allocation.mem = NULL;
allocation.needs_copy_to_device = false;
need_update = true;
return true;
}
}
return false;
}
MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
{
update_device_memory();
Allocation& allocation = allocations[name];
return allocation.desc;
}
void MemoryManager::update_device_memory()
{
if(!need_update) {
return;
}
need_update = false;
foreach(DeviceBuffer& device_buffer, device_buffers) {
device_buffer.update_device_memory(device);
}
}
void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
{
update_device_memory();
foreach(DeviceBuffer& device_buffer, device_buffers) {
if(device_buffer.buffer->device_pointer) {
device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
}
else {
device->kernel_set_args(kernel, (*narg)++, device->null_mem);
}
}
}
CCL_NAMESPACE_END
#endif /* WITH_OPENCL */

View File

@ -0,0 +1,105 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "device/device.h"
#include "util/util_map.h"
#include "util/util_vector.h"
#include "util/util_string.h"
#include "clew.h"
CCL_NAMESPACE_BEGIN
class OpenCLDeviceBase;
class MemoryManager {
public:
static const int NUM_DEVICE_BUFFERS = 8;
struct BufferDescriptor {
uint device_buffer;
cl_ulong offset;
};
private:
struct DeviceBuffer;
struct Allocation {
device_memory *mem;
DeviceBuffer *device_buffer;
size_t size; /* Size of actual allocation, may be larger than requested. */
BufferDescriptor desc;
bool needs_copy_to_device;
Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
{
}
};
struct DeviceBuffer {
device_memory *buffer;
vector<Allocation*> allocations;
size_t size; /* Size of all allocations. */
DeviceBuffer() : buffer(new device_memory), size(0)
{
}
~DeviceBuffer() {
delete buffer;
buffer = NULL;
}
void add_allocation(Allocation& allocation);
void update_device_memory(OpenCLDeviceBase *device);
void free(OpenCLDeviceBase *device);
};
OpenCLDeviceBase *device;
DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
typedef unordered_map<string, Allocation> AllocationsMap;
AllocationsMap allocations;
bool need_update;
DeviceBuffer* smallest_device_buffer();
public:
MemoryManager(OpenCLDeviceBase *device);
void free(); /* Free all memory. */
void alloc(const char *name, device_memory& mem);
bool free(device_memory& mem);
BufferDescriptor get_descriptor(string name);
void update_device_memory();
void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
};
CCL_NAMESPACE_END

View File

@ -25,6 +25,8 @@
#include "clew.h"
#include "device/opencl/memory_manager.h"
CCL_NAMESPACE_BEGIN
/* Disable workarounds, seems to be working fine on latest drivers. */
@ -224,6 +226,18 @@ public:
static string get_kernel_md5();
};
#define opencl_device_assert(device, stmt) \
{ \
cl_int err = stmt; \
\
if(err != CL_SUCCESS) { \
string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
if((device)->error_msg == "") \
(device)->error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
} \
} (void)0
#define opencl_assert(stmt) \
{ \
cl_int err = stmt; \
@ -344,6 +358,7 @@ public:
size_t global_size_round_up(int group_size, int global_size);
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
void shader(DeviceTask& task);
@ -525,6 +540,34 @@ protected:
virtual string build_options_for_base_program(
const DeviceRequestedFeatures& /*requested_features*/);
private:
MemoryManager memory_manager;
friend MemoryManager;
struct tex_info_t {
uint buffer, padding;
cl_ulong offset;
uint width, height, depth, options;
};
static_assert_align(tex_info_t, 16);
vector<tex_info_t> texture_descriptors;
device_memory texture_descriptors_buffer;
struct Texture {
device_memory* mem;
InterpolationType interpolation;
ExtensionType extension;
};
typedef map<string, Texture> TexturesMap;
TexturesMap textures;
bool textures_need_update;
protected:
void flush_texture_buffers();
};
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);

View File

@ -63,7 +63,7 @@ void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
}
OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
: Device(info, stats, background_)
: Device(info, stats, background_), memory_manager(this)
{
cpPlatform = NULL;
cdDevice = NULL;
@ -71,6 +71,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
cqCommandQueue = NULL;
null_mem = 0;
device_initialized = false;
textures_need_update = true;
vector<OpenCLPlatformDevice> usable_devices;
OpenCLInfo::get_usable_devices(&usable_devices);
@ -126,6 +127,12 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
return;
}
/* Allocate this right away so that texture_descriptors_buffer is placed at offset 0 in the device memory buffers */
texture_descriptors.resize(1);
texture_descriptors_buffer.resize(1);
texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
fprintf(stderr, "Device init success\n");
device_initialized = true;
}
@ -134,6 +141,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
{
task_pool.stop();
memory_manager.free();
if(null_mem)
clReleaseMemObject(CL_MEM_PTR(null_mem));
@ -493,29 +502,31 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
void OpenCLDeviceBase::tex_alloc(const char *name,
device_memory& mem,
InterpolationType /*interpolation*/,
ExtensionType /*extension*/)
InterpolationType interpolation,
ExtensionType extension)
{
VLOG(1) << "Texture allocate: " << name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
mem_alloc(NULL, mem, MEM_READ_ONLY);
mem_copy_to(mem);
assert(mem_map.find(name) == mem_map.end());
mem_map.insert(MemMap::value_type(name, mem.device_pointer));
memory_manager.alloc(name, mem);
textures[name] = {&mem, interpolation, extension};
textures_need_update = true;
}
void OpenCLDeviceBase::tex_free(device_memory& mem)
{
if(mem.device_pointer) {
foreach(const MemMap::value_type& value, mem_map) {
if(value.second == mem.device_pointer) {
mem_map.erase(value.first);
break;
}
}
if(memory_manager.free(mem)) {
textures_need_update = true;
}
mem_free(mem);
foreach(TexturesMap::value_type& value, textures) {
if(value.second.mem == &mem) {
textures.erase(value.first);
break;
}
}
}
@ -581,6 +592,104 @@ void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const
opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
}
void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
{
flush_texture_buffers();
memory_manager.set_kernel_arg_buffers(kernel, narg);
}
void OpenCLDeviceBase::flush_texture_buffers()
{
if(!textures_need_update) {
return;
}
textures_need_update = false;
/* Setup slots for textures. */
int num_slots = 0;
struct texture_slot_t {
string name;
int slot;
};
vector<texture_slot_t> texture_slots;
#define KERNEL_TEX(type, ttype, name) \
if(textures.find(#name) != textures.end()) { \
texture_slots.push_back({#name, num_slots}); \
} \
num_slots++;
#include "kernel/kernel_textures.h"
int num_data_slots = num_slots;
foreach(TexturesMap::value_type& tex, textures) {
string name = tex.first;
if(string_startswith(name, "__tex_image")) {
int pos = name.rfind("_");
int id = atoi(name.data() + pos + 1);
texture_slots.push_back({name, num_data_slots + id});
num_slots = max(num_slots, num_data_slots + id + 1);
}
}
/* Realloc texture descriptors buffer. */
memory_manager.free(texture_descriptors_buffer);
texture_descriptors.resize(num_slots);
texture_descriptors_buffer.resize(num_slots * sizeof(tex_info_t));
texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
/* Fill in descriptors */
foreach(texture_slot_t& slot, texture_slots) {
Texture& tex = textures[slot.name];
tex_info_t& info = texture_descriptors[slot.slot];
MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
info.offset = desc.offset;
info.buffer = desc.device_buffer;
if(string_startswith(slot.name, "__tex_image")) {
info.width = tex.mem->data_width;
info.height = tex.mem->data_height;
info.depth = tex.mem->data_depth;
info.options = 0;
if(tex.interpolation == INTERPOLATION_CLOSEST) {
info.options |= (1 << 0);
}
switch(tex.extension) {
case EXTENSION_REPEAT:
info.options |= (1 << 1);
break;
case EXTENSION_EXTEND:
info.options |= (1 << 2);
break;
case EXTENSION_CLIP:
info.options |= (1 << 3);
break;
default:
break;
}
}
}
/* Force write of descriptors. */
memory_manager.free(texture_descriptors_buffer);
memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
}
void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
{
/* cast arguments to cl types */
@ -605,10 +714,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_
d_rgba,
d_buffer);
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
#include "kernel/kernel_textures.h"
#undef KERNEL_TEX
set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
start_arg_index += kernel_set_args(ckFilmConvertKernel,
start_arg_index,
@ -1030,10 +1136,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
d_output_luma);
}
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(kernel, &start_arg_index, #name);
#include "kernel/kernel_textures.h"
#undef KERNEL_TEX
set_kernel_arg_buffers(kernel, &start_arg_index);
start_arg_index += kernel_set_args(kernel,
start_arg_index,

View File

@ -82,10 +82,7 @@ public:
d_buffer,
d_rng_state);
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
#include "kernel/kernel_textures.h"
#undef KERNEL_TEX
set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index);
start_arg_index += kernel_set_args(ckPathTraceKernel,
start_arg_index,

View File

@ -99,6 +99,8 @@ public:
void thread_run(DeviceTask *task)
{
flush_texture_buffers();
if(task->type == DeviceTask::FILM_CONVERT) {
film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
}
@ -113,10 +115,19 @@ public:
*/
typedef struct KernelGlobals {
ccl_constant KernelData *data;
ccl_global char *buffers[8];
typedef struct _tex_info_t {
uint buffer, padding;
ulong offset;
uint width, height, depth, options;
} _tex_info_t;
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name;
_tex_info_t name;
#include "kernel/kernel_textures.h"
#undef KERNEL_TEX
SplitData split_data;
SplitParams split_param_data;
} KernelGlobals;
@ -217,11 +228,7 @@ public:
*cached_memory.ray_state,
*cached_memory.rng_state);
/* TODO(sergey): Avoid map lookup here. */
#define KERNEL_TEX(type, ttype, name) \
device->set_kernel_arg_mem(program(), &start_arg_index, #name);
#include "kernel/kernel_textures.h"
#undef KERNEL_TEX
device->set_kernel_arg_buffers(program(), &start_arg_index);
start_arg_index +=
device->kernel_set_args(program(),
@ -352,11 +359,7 @@ public:
ray_state,
rtile.rng_state);
/* TODO(sergey): Avoid map lookup here. */
#define KERNEL_TEX(type, ttype, name) \
device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
#include "kernel/kernel_textures.h"
#undef KERNEL_TEX
device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
start_arg_index +=
device->kernel_set_args(device->program_data_init(),

View File

@ -142,7 +142,7 @@
/* data lookup defines */
#define kernel_data (*kg->data)
#define kernel_tex_fetch(t, index) kg->t[index]
#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)]
/* define NULL */
#define NULL 0

View File

@ -23,6 +23,10 @@
# include "util/util_vector.h"
#endif
#ifdef __KERNEL_OPENCL__
# include "util/util_atomic.h"
#endif
CCL_NAMESPACE_BEGIN
/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@ -109,11 +113,22 @@ typedef struct KernelGlobals {
#ifdef __KERNEL_OPENCL__
# define KERNEL_TEX(type, ttype, name) \
typedef type name##_t;
# include "kernel/kernel_textures.h"
typedef struct tex_info_t {
uint buffer, padding;
ulong offset;
uint width, height, depth, options;
} tex_info_t;
typedef ccl_addr_space struct KernelGlobals {
ccl_constant KernelData *data;
ccl_global char *buffers[8];
# define KERNEL_TEX(type, ttype, name) \
ccl_global type *name;
tex_info_t name;
# include "kernel/kernel_textures.h"
# ifdef __SPLIT_KERNEL__
@ -122,6 +137,57 @@ typedef ccl_addr_space struct KernelGlobals {
# endif
} KernelGlobals;
#define KERNEL_BUFFER_PARAMS \
ccl_global char *buffer0, \
ccl_global char *buffer1, \
ccl_global char *buffer2, \
ccl_global char *buffer3, \
ccl_global char *buffer4, \
ccl_global char *buffer5, \
ccl_global char *buffer6, \
ccl_global char *buffer7
#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
{
#ifdef __SPLIT_KERNEL__
if(ccl_local_id(0) + ccl_local_id(1) == 0)
#endif
{
kg->buffers[0] = buffer0;
kg->buffers[1] = buffer1;
kg->buffers[2] = buffer2;
kg->buffers[3] = buffer3;
kg->buffers[4] = buffer4;
kg->buffers[5] = buffer5;
kg->buffers[6] = buffer6;
kg->buffers[7] = buffer7;
}
# ifdef __SPLIT_KERNEL__
ccl_barrier(CCL_LOCAL_MEM_FENCE);
# endif
}
ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
{
# ifdef __SPLIT_KERNEL__
if(ccl_local_id(0) + ccl_local_id(1) == 0)
# endif
{
ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0];
# define KERNEL_TEX(type, ttype, name) \
kg->name = *(info++);
# include "kernel/kernel_textures.h"
}
# ifdef __SPLIT_KERNEL__
ccl_barrier(CCL_LOCAL_MEM_FENCE);
# endif
}
#endif /* __KERNEL_OPENCL__ */
/* Interpolated lookup table access */

View File

@ -15,30 +15,42 @@
*/
/* For OpenCL all images are packed in a single array, and we do manual lookup
* and interpolation. */
/* For OpenCL we do manual lookup and interpolation. */
ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) {
const uint tex_offset = id
#define KERNEL_TEX(type, ttype, name) + 1
#include "kernel/kernel_textures.h"
;
return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset];
}
#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)]
ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
{
const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
const int texture_type = kernel_tex_type(id);
/* Float4 */
if(texture_type == IMAGE_DATA_TYPE_FLOAT4) {
return kernel_tex_fetch(__tex_image_float4_packed, offset);
return tex_fetch(float4, info, offset);
}
/* Byte4 */
else if(texture_type == IMAGE_DATA_TYPE_BYTE4) {
uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
uchar4 r = tex_fetch(uchar4, info, offset);
float f = 1.0f/255.0f;
return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
}
/* Float */
else if(texture_type == IMAGE_DATA_TYPE_FLOAT) {
float f = kernel_tex_fetch(__tex_image_float_packed, offset);
float f = tex_fetch(float, info, offset);
return make_float4(f, f, f, 1.0f);
}
/* Byte */
else {
uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
uchar r = tex_fetch(uchar, info, offset);
float f = r * (1.0f/255.0f);
return make_float4(f, f, f, 1.0f);
}
@ -64,17 +76,17 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
return x - (float)i;
}
ccl_device_inline uint kernel_decode_image_interpolation(uint4 info)
ccl_device_inline uint kernel_decode_image_interpolation(uint info)
{
return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
return (info & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
}
ccl_device_inline uint kernel_decode_image_extension(uint4 info)
ccl_device_inline uint kernel_decode_image_extension(uint info)
{
if(info.w & (1 << 1)) {
if(info & (1 << 1)) {
return EXTENSION_REPEAT;
}
else if(info.w & (1 << 2)) {
else if(info & (1 << 2)) {
return EXTENSION_EXTEND;
}
else {
@ -84,13 +96,16 @@ ccl_device_inline uint kernel_decode_image_extension(uint4 info)
ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
{
uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
uint width = info.x;
uint height = info.y;
uint offset = info.z;
const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
uint width = info->width;
uint height = info->height;
uint offset = 0;
/* Decode image options. */
uint interpolation = kernel_decode_image_interpolation(info);
uint extension = kernel_decode_image_extension(info);
uint interpolation = kernel_decode_image_interpolation(info->options);
uint extension = kernel_decode_image_extension(info->options);
/* Actual sampling. */
float4 r;
int ix, iy, nix, niy;
@ -150,14 +165,17 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
{
uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
uint width = info.x;
uint height = info.y;
uint offset = info.z;
uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
uint width = info->width;
uint height = info->height;
uint offset = 0;
uint depth = info->depth;
/* Decode image options. */
uint interpolation = kernel_decode_image_interpolation(info);
uint extension = kernel_decode_image_extension(info);
uint interpolation = kernel_decode_image_interpolation(info->options);
uint extension = kernel_decode_image_extension(info->options);
/* Actual sampling. */
float4 r;
int ix, iy, iz, nix, niy, niz;

View File

@ -184,15 +184,8 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
# else
/* bindless textures */
KERNEL_TEX(uint, texture_uint, __bindless_mapping)
# endif
#endif
/* packed image (opencl) */
KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed)
KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed)
KERNEL_TEX(float, texture_float, __tex_image_float_packed)
KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)
# endif /* __CUDA_ARCH__ */
#endif /* __KERNEL_CUDA__ */
#undef KERNEL_TEX
#undef KERNEL_IMAGE_TEX

View File

@ -52,9 +52,7 @@ __kernel void kernel_ocl_path_trace(
ccl_global float *buffer,
ccl_global uint *rng_state,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
int sample,
int sx, int sy, int sw, int sh, int offset, int stride)
@ -63,9 +61,8 @@ __kernel void kernel_ocl_path_trace(
kg->data = data;
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
int y = sy + ccl_global_id(1);
@ -82,9 +79,7 @@ __kernel void kernel_ocl_shader(
ccl_global float4 *output,
ccl_global float *output_luma,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
int type, int sx, int sw, int offset, int sample)
{
@ -92,9 +87,8 @@ __kernel void kernel_ocl_shader(
kg->data = data;
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
@ -114,9 +108,7 @@ __kernel void kernel_ocl_bake(
ccl_global uint4 *input,
ccl_global float4 *output,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
int type, int filter, int sx, int sw, int offset, int sample)
{
@ -124,9 +116,8 @@ __kernel void kernel_ocl_bake(
kg->data = data;
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
@ -144,9 +135,7 @@ __kernel void kernel_ocl_convert_to_byte(
ccl_global uchar4 *rgba,
ccl_global float *buffer,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
float sample_scale,
int sx, int sy, int sw, int sh, int offset, int stride)
@ -155,9 +144,8 @@ __kernel void kernel_ocl_convert_to_byte(
kg->data = data;
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
int y = sy + ccl_global_id(1);
@ -171,9 +159,7 @@ __kernel void kernel_ocl_convert_to_half_float(
ccl_global uchar4 *rgba,
ccl_global float *buffer,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
float sample_scale,
int sx, int sy, int sw, int sh, int offset, int stride)
@ -182,9 +168,8 @@ __kernel void kernel_ocl_convert_to_half_float(
kg->data = data;
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
int y = sy + ccl_global_id(1);

View File

@ -25,11 +25,7 @@ __kernel void kernel_ocl_path_trace_data_init(
int num_elements,
ccl_global char *ray_state,
ccl_global uint *rng_state,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
int start_sample,
int end_sample,
int sx, int sy, int sw, int sh, int offset, int stride,
@ -46,10 +42,7 @@ __kernel void kernel_ocl_path_trace_data_init(
num_elements,
ray_state,
rng_state,
#define KERNEL_TEX(type, ttype, name) name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_ARGS,
start_sample,
end_sample,
sx, sy, sw, sh, offset, stride,

View File

@ -25,9 +25,7 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
ccl_global char *ray_state,
ccl_global uint *rng_state,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
ccl_global int *queue_index,
ccl_global char *use_queues_flag,
@ -52,12 +50,9 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
kg

View File

@ -52,9 +52,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
ccl_global uint *rng_state,
#ifdef __KERNEL_OPENCL__
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
KERNEL_BUFFER_PARAMS,
#endif
int start_sample,
@ -100,9 +98,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
#ifdef __KERNEL_OPENCL__
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
kernel_set_buffer_info(kg);
#endif
int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);

View File

@ -43,7 +43,6 @@ static bool isfinite(half /*value*/)
ImageManager::ImageManager(const DeviceInfo& info)
{
need_update = true;
pack_images = false;
osl_texture_system = NULL;
animation_frame = 0;
@ -87,11 +86,6 @@ ImageManager::~ImageManager()
}
}
void ImageManager::set_pack_images(bool pack_images_)
{
pack_images = pack_images_;
}
void ImageManager::set_osl_texture_system(void *texture_system)
{
osl_texture_system = texture_system;
@ -742,7 +736,7 @@ void ImageManager::device_load_image(Device *device,
pixels[3] = TEX_IMAGE_MISSING_A;
}
if(!pack_images) {
{
thread_scoped_lock device_lock(device_mutex);
device->tex_alloc(name.c_str(),
tex_img,
@ -771,7 +765,7 @@ void ImageManager::device_load_image(Device *device,
pixels[0] = TEX_IMAGE_MISSING_R;
}
if(!pack_images) {
{
thread_scoped_lock device_lock(device_mutex);
device->tex_alloc(name.c_str(),
tex_img,
@ -803,7 +797,7 @@ void ImageManager::device_load_image(Device *device,
pixels[3] = (TEX_IMAGE_MISSING_A * 255);
}
if(!pack_images) {
{
thread_scoped_lock device_lock(device_mutex);
device->tex_alloc(name.c_str(),
tex_img,
@ -831,7 +825,7 @@ void ImageManager::device_load_image(Device *device,
pixels[0] = (TEX_IMAGE_MISSING_R * 255);
}
if(!pack_images) {
{
thread_scoped_lock device_lock(device_mutex);
device->tex_alloc(name.c_str(),
tex_img,
@ -862,7 +856,7 @@ void ImageManager::device_load_image(Device *device,
pixels[3] = TEX_IMAGE_MISSING_A;
}
if(!pack_images) {
{
thread_scoped_lock device_lock(device_mutex);
device->tex_alloc(name.c_str(),
tex_img,
@ -890,7 +884,7 @@ void ImageManager::device_load_image(Device *device,
pixels[0] = TEX_IMAGE_MISSING_R;
}
if(!pack_images) {
{
thread_scoped_lock device_lock(device_mutex);
device->tex_alloc(name.c_str(),
tex_img,
@ -1047,9 +1041,6 @@ void ImageManager::device_update(Device *device,
pool.wait_work();
if(pack_images)
device_pack_images(device, dscene, progress);
need_update = false;
}
@ -1079,141 +1070,6 @@ void ImageManager::device_update_slot(Device *device,
}
}
uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot)
{
uint8_t options = 0;
/* Image Options are packed into one uint:
* bit 0 -> Interpolation
* bit 1 + 2 + 3 -> Extension
*/
if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) {
options |= (1 << 0);
}
if(images[type][slot]->extension == EXTENSION_REPEAT) {
options |= (1 << 1);
}
else if(images[type][slot]->extension == EXTENSION_EXTEND) {
options |= (1 << 2);
}
else /* EXTENSION_CLIP */ {
options |= (1 << 3);
}
return options;
}
template<typename T>
void ImageManager::device_pack_images_type(
ImageDataType type,
const vector<device_vector<T>*>& cpu_textures,
device_vector<T> *device_image,
uint4 *info)
{
size_t size = 0, offset = 0;
/* First step is to calculate size of the texture we need. */
for(size_t slot = 0; slot < images[type].size(); slot++) {
if(images[type][slot] == NULL) {
continue;
}
device_vector<T>& tex_img = *cpu_textures[slot];
size += tex_img.size();
}
/* Now we know how much memory we need, so we can allocate and fill. */
T *pixels = device_image->resize(size);
for(size_t slot = 0; slot < images[type].size(); slot++) {
if(images[type][slot] == NULL) {
continue;
}
device_vector<T>& tex_img = *cpu_textures[slot];
uint8_t options = pack_image_options(type, slot);
const int index = type_index_to_flattened_slot(slot, type) * 2;
info[index] = make_uint4(tex_img.data_width,
tex_img.data_height,
offset,
options);
info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
memcpy(pixels + offset,
(void*)tex_img.data_pointer,
tex_img.memory_size());
offset += tex_img.size();
}
}
void ImageManager::device_pack_images(Device *device,
DeviceScene *dscene,
Progress& /*progess*/)
{
/* For OpenCL, we pack all image textures into a single large texture, and
* do our own interpolation in the kernel.
*/
/* TODO(sergey): This will over-allocate a bit, but this is constant memory
* so should be fine for a short term.
*/
const size_t info_size = max4(max_flattened_slot(IMAGE_DATA_TYPE_FLOAT4),
max_flattened_slot(IMAGE_DATA_TYPE_BYTE4),
max_flattened_slot(IMAGE_DATA_TYPE_FLOAT),
max_flattened_slot(IMAGE_DATA_TYPE_BYTE));
uint4 *info = dscene->tex_image_packed_info.resize(info_size*2);
/* Pack byte4 textures. */
device_pack_images_type(IMAGE_DATA_TYPE_BYTE4,
dscene->tex_byte4_image,
&dscene->tex_image_byte4_packed,
info);
/* Pack float4 textures. */
device_pack_images_type(IMAGE_DATA_TYPE_FLOAT4,
dscene->tex_float4_image,
&dscene->tex_image_float4_packed,
info);
/* Pack byte textures. */
device_pack_images_type(IMAGE_DATA_TYPE_BYTE,
dscene->tex_byte_image,
&dscene->tex_image_byte_packed,
info);
/* Pack float textures. */
device_pack_images_type(IMAGE_DATA_TYPE_FLOAT,
dscene->tex_float_image,
&dscene->tex_image_float_packed,
info);
/* Push textures to the device. */
if(dscene->tex_image_byte4_packed.size()) {
if(dscene->tex_image_byte4_packed.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
device->tex_free(dscene->tex_image_byte4_packed);
}
device->tex_alloc("__tex_image_byte4_packed", dscene->tex_image_byte4_packed);
}
if(dscene->tex_image_float4_packed.size()) {
if(dscene->tex_image_float4_packed.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
device->tex_free(dscene->tex_image_float4_packed);
}
device->tex_alloc("__tex_image_float4_packed", dscene->tex_image_float4_packed);
}
if(dscene->tex_image_byte_packed.size()) {
if(dscene->tex_image_byte_packed.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
device->tex_free(dscene->tex_image_byte_packed);
}
device->tex_alloc("__tex_image_byte_packed", dscene->tex_image_byte_packed);
}
if(dscene->tex_image_float_packed.size()) {
if(dscene->tex_image_float_packed.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
device->tex_free(dscene->tex_image_float_packed);
}
device->tex_alloc("__tex_image_float_packed", dscene->tex_image_float_packed);
}
if(dscene->tex_image_packed_info.size()) {
if(dscene->tex_image_packed_info.device_pointer) {
thread_scoped_lock device_lock(device_mutex);
device->tex_free(dscene->tex_image_packed_info);
}
device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info);
}
}
void ImageManager::device_free_builtin(Device *device, DeviceScene *dscene)
{
for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
@ -1239,18 +1095,6 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene)
dscene->tex_float_image.clear();
dscene->tex_byte_image.clear();
dscene->tex_half_image.clear();
device->tex_free(dscene->tex_image_float4_packed);
device->tex_free(dscene->tex_image_byte4_packed);
device->tex_free(dscene->tex_image_float_packed);
device->tex_free(dscene->tex_image_byte_packed);
device->tex_free(dscene->tex_image_packed_info);
dscene->tex_image_float4_packed.clear();
dscene->tex_image_byte4_packed.clear();
dscene->tex_image_float_packed.clear();
dscene->tex_image_byte_packed.clear();
dscene->tex_image_packed_info.clear();
}
CCL_NAMESPACE_END

View File

@ -76,7 +76,6 @@ public:
void device_free_builtin(Device *device, DeviceScene *dscene);
void set_osl_texture_system(void *texture_system);
void set_pack_images(bool pack_images_);
bool set_animation_frame_update(int frame);
bool need_update;
@ -130,7 +129,6 @@ private:
vector<Image*> images[IMAGE_DATA_NUM_TYPES];
void *osl_texture_system;
bool pack_images;
bool file_load_image_generic(Image *img,
ImageInput **in,
@ -152,8 +150,6 @@ private:
int flattened_slot_to_type_index(int flat_slot, ImageDataType *type);
string name_from_type(int type);
uint8_t pack_image_options(ImageDataType type, size_t slot);
void device_load_image(Device *device,
DeviceScene *dscene,
Scene *scene,
@ -164,17 +160,6 @@ private:
DeviceScene *dscene,
ImageDataType type,
int slot);
template<typename T>
void device_pack_images_type(
ImageDataType type,
const vector<device_vector<T>*>& cpu_textures,
device_vector<T> *device_image,
uint4 *info);
void device_pack_images(Device *device,
DeviceScene *dscene,
Progress& progess);
};
CCL_NAMESPACE_END

View File

@ -1925,16 +1925,7 @@ void MeshManager::device_update_displacement_images(Device *device,
if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
continue;
}
if(device->info.pack_images) {
/* If device requires packed images we need to update all
* images now, even if they're not used for displacement.
*/
image_manager->device_update(device,
dscene,
scene,
progress);
return;
}
ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode*>(node);
int slot = image_node->slot;
if(slot != -1) {

View File

@ -148,8 +148,6 @@ void Scene::device_update(Device *device_, Progress& progress)
* - Film needs light manager to run for use_light_visibility
* - Lookup tables are done a second time to handle film tables
*/
image_manager->set_pack_images(device->info.pack_images);
progress.set_status("Updating Shaders");
shader_manager->device_update(device, &dscene, this, progress);

View File

@ -121,13 +121,6 @@ public:
vector<device_vector<uchar>* > tex_byte_image;
vector<device_vector<half>* > tex_half_image;
/* opencl images */
device_vector<float4> tex_image_float4_packed;
device_vector<uchar4> tex_image_byte4_packed;
device_vector<float> tex_image_float_packed;
device_vector<uchar> tex_image_byte_packed;
device_vector<uint4> tex_image_packed_info;
KernelData data;
};