Cycles / CUDA: Increase maximum image textures on GPU.

Instead of 95, we can now use 145 image textures. This only affects Kepler and above (sm_30, sm_35 and sm_50).

This can be increased further if needed, but let's first test whether this comes with a performance impact.

Originally developed during my GSoC 2013.
This commit is contained in:
Thomas Dinges 2014-05-11 03:38:39 +02:00
parent 8904eaf504
commit c08c931fb6
Notes: blender-bot 2023-12-22 20:14:11 +01:00
Referenced by issue #40379, Cycles : CUDA Rendering : Environment with MIS uses double memory usage
Referenced by issue #40172, LibFree() crashes Blender
Referenced by issue #40182, BGE crash
9 changed files with 139 additions and 13 deletions

View File

@ -54,6 +54,7 @@ public:
bool display_device;
bool advanced_shading;
bool pack_images;
bool extended_images; /* flag for GPU and Multi device */
vector<DeviceInfo> multi_devices;
DeviceInfo()
@ -64,6 +65,7 @@ public:
display_device = false;
advanced_shading = true;
pack_images = false;
extended_images = false;
}
};

View File

@ -1150,6 +1150,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
int major, minor;
cuDeviceComputeCapability(&major, &minor, num);
info.advanced_shading = (major >= 2);
info.extended_images = (major >= 3);
info.pack_images = false;
/* if device has a kernel timeout, assume it is used for display */

View File

@ -328,6 +328,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
info.advanced_shading = with_advanced_shading;
info.pack_images = false;
info.extended_images = true;
foreach(DeviceInfo& subinfo, devices) {
if(subinfo.type == type) {
@ -351,6 +352,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
if(subinfo.display_device)
info.display_device = true;
info.pack_images = info.pack_images || subinfo.pack_images;
info.extended_images = info.extended_images && subinfo.extended_images;
num_added++;
}
}

View File

@ -174,6 +174,61 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099)
/* Kepler and above: additional byte (uchar4) image texture slots 100..150.
 * These are only declared when compiling for __CUDA_ARCH__ >= 300, so any code
 * that refers to them has to be guarded by the same preprocessor condition.
 *
 * NOTE(review): this range declares 51 slots, while the host-side byte image
 * limit only grows by 50 (TEX_NUM_IMAGES 95 -> TEX_EXTENDED_NUM_IMAGES_GPU 145).
 * Confirm whether __tex_image_150 can ever be bound. */
#if __CUDA_ARCH__ >= 300
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_102)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_103)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_104)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_105)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_106)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_107)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_108)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_109)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_110)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_111)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_112)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_113)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_114)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_115)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_116)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_117)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_118)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_119)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_120)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_121)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_122)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_123)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_124)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_125)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_126)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_127)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_128)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_129)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_130)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_131)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_132)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_133)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_134)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_135)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_136)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_137)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_138)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_139)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_140)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_141)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_142)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_143)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_144)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_145)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_146)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_147)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_148)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_149)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_150)
#endif
/* packed image (opencl) */
KERNEL_TEX(uchar4, texture_uchar4, __tex_image_packed)
KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)

View File

@ -149,8 +149,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
* - group by size and use a 3d texture, performance impact
* - group into larger texture with some padding for correct lerp
*
* also note that cuda has 128 textures limit, we use 100 now, since
* we still need some for other storage */
* also note that cuda has a texture limit (128 for Fermi, 256 for Kepler),
* and we cannot use all of them since we still need some for other storage */
switch(id) {
case 0: r = kernel_tex_image_interp(__tex_image_float_000, x, y); break;
@ -253,7 +253,62 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
default:
#if __CUDA_ARCH__ >= 300
case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break;
case 103: r = kernel_tex_image_interp(__tex_image_103, x, y); break;
case 104: r = kernel_tex_image_interp(__tex_image_104, x, y); break;
case 105: r = kernel_tex_image_interp(__tex_image_105, x, y); break;
case 106: r = kernel_tex_image_interp(__tex_image_106, x, y); break;
case 107: r = kernel_tex_image_interp(__tex_image_107, x, y); break;
case 108: r = kernel_tex_image_interp(__tex_image_108, x, y); break;
case 109: r = kernel_tex_image_interp(__tex_image_109, x, y); break;
case 110: r = kernel_tex_image_interp(__tex_image_110, x, y); break;
case 111: r = kernel_tex_image_interp(__tex_image_111, x, y); break;
case 112: r = kernel_tex_image_interp(__tex_image_112, x, y); break;
case 113: r = kernel_tex_image_interp(__tex_image_113, x, y); break;
case 114: r = kernel_tex_image_interp(__tex_image_114, x, y); break;
case 115: r = kernel_tex_image_interp(__tex_image_115, x, y); break;
case 116: r = kernel_tex_image_interp(__tex_image_116, x, y); break;
case 117: r = kernel_tex_image_interp(__tex_image_117, x, y); break;
case 118: r = kernel_tex_image_interp(__tex_image_118, x, y); break;
case 119: r = kernel_tex_image_interp(__tex_image_119, x, y); break;
case 120: r = kernel_tex_image_interp(__tex_image_120, x, y); break;
case 121: r = kernel_tex_image_interp(__tex_image_121, x, y); break;
case 122: r = kernel_tex_image_interp(__tex_image_122, x, y); break;
case 123: r = kernel_tex_image_interp(__tex_image_123, x, y); break;
case 124: r = kernel_tex_image_interp(__tex_image_124, x, y); break;
case 125: r = kernel_tex_image_interp(__tex_image_125, x, y); break;
case 126: r = kernel_tex_image_interp(__tex_image_126, x, y); break;
case 127: r = kernel_tex_image_interp(__tex_image_127, x, y); break;
case 128: r = kernel_tex_image_interp(__tex_image_128, x, y); break;
case 129: r = kernel_tex_image_interp(__tex_image_129, x, y); break;
case 130: r = kernel_tex_image_interp(__tex_image_130, x, y); break;
case 131: r = kernel_tex_image_interp(__tex_image_131, x, y); break;
case 132: r = kernel_tex_image_interp(__tex_image_132, x, y); break;
case 133: r = kernel_tex_image_interp(__tex_image_133, x, y); break;
case 134: r = kernel_tex_image_interp(__tex_image_134, x, y); break;
case 135: r = kernel_tex_image_interp(__tex_image_135, x, y); break;
case 136: r = kernel_tex_image_interp(__tex_image_136, x, y); break;
case 137: r = kernel_tex_image_interp(__tex_image_137, x, y); break;
case 138: r = kernel_tex_image_interp(__tex_image_138, x, y); break;
case 139: r = kernel_tex_image_interp(__tex_image_139, x, y); break;
case 140: r = kernel_tex_image_interp(__tex_image_140, x, y); break;
case 141: r = kernel_tex_image_interp(__tex_image_141, x, y); break;
case 142: r = kernel_tex_image_interp(__tex_image_142, x, y); break;
case 143: r = kernel_tex_image_interp(__tex_image_143, x, y); break;
case 144: r = kernel_tex_image_interp(__tex_image_144, x, y); break;
case 145: r = kernel_tex_image_interp(__tex_image_145, x, y); break;
case 146: r = kernel_tex_image_interp(__tex_image_146, x, y); break;
case 147: r = kernel_tex_image_interp(__tex_image_147, x, y); break;
case 148: r = kernel_tex_image_interp(__tex_image_148, x, y); break;
case 149: r = kernel_tex_image_interp(__tex_image_149, x, y); break;
case 150: r = kernel_tex_image_interp(__tex_image_150, x, y); break;
#endif
default:
kernel_assert(0);
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}

View File

@ -59,11 +59,16 @@ void ImageManager::set_osl_texture_system(void *texture_system)
osl_texture_system = texture_system;
}
void ImageManager::set_extended_image_limits(void)
void ImageManager::set_extended_image_limits(const DeviceInfo& info)
{
tex_num_images = TEX_EXTENDED_NUM_IMAGES;
tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES;
tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START;
if(info.type == DEVICE_CPU) {
tex_num_images = TEX_EXTENDED_NUM_IMAGES_CPU;
tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES;
tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START;
}
else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.extended_images) {
tex_num_images = TEX_EXTENDED_NUM_IMAGES_GPU;
}
}
bool ImageManager::set_animation_frame_update(int frame)

View File

@ -17,6 +17,7 @@
#ifndef __IMAGE_H__
#define __IMAGE_H__
#include "device.h"
#include "device_memory.h"
#include "util_string.h"
@ -27,11 +28,16 @@
CCL_NAMESPACE_BEGIN
/* generic */
#define TEX_NUM_IMAGES 95
#define TEX_IMAGE_BYTE_START TEX_NUM_FLOAT_IMAGES
/* extended gpu */
#define TEX_EXTENDED_NUM_IMAGES_GPU 145
/* extended cpu */
#define TEX_EXTENDED_NUM_FLOAT_IMAGES 1024
#define TEX_EXTENDED_NUM_IMAGES 1024
#define TEX_EXTENDED_NUM_IMAGES_CPU 1024
#define TEX_EXTENDED_IMAGE_BYTE_START TEX_EXTENDED_NUM_FLOAT_IMAGES
/* color to use when textures are not found */
@ -59,7 +65,7 @@ public:
void set_osl_texture_system(void *texture_system);
void set_pack_images(bool pack_images_);
void set_extended_image_limits(void);
void set_extended_image_limits(const DeviceInfo& info);
bool set_animation_frame_update(int frame);
bool need_update;

View File

@ -63,8 +63,8 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
else
shader_manager = ShaderManager::create(this, SceneParams::SVM);
if (device_info_.type == DEVICE_CPU)
image_manager->set_extended_image_limits();
/* Extended image limits for CPU and GPUs */
image_manager->set_extended_image_limits(device_info_);
}
Scene::~Scene()

View File

@ -105,8 +105,8 @@ public:
/* integrator */
device_vector<uint> sobol_directions;
/* images */
device_vector<uchar4> tex_image[TEX_EXTENDED_NUM_IMAGES];
/* cpu images */
device_vector<uchar4> tex_image[TEX_EXTENDED_NUM_IMAGES_CPU];
device_vector<float4> tex_float_image[TEX_EXTENDED_NUM_FLOAT_IMAGES];
/* opencl images */