
doug65536_opencl_patch2_IGNORE_OTHER_ONE.diff

File Metadata

Author
Doug Gale (doug65536)
Created
Nov 13 2013, 4:29 PM


Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt (revision 47903)
+++ CMakeLists.txt (working copy)
@@ -126,7 +126,7 @@
option(WITH_PYTHON_MODULE "Enable building as a python module (experimental, only enable for development)" OFF)
option(WITH_BUILDINFO "Include extra build details (only disable for development & faster builds)" ON)
option(WITH_IK_ITASC "Enable ITASC IK solver (only disable for development & for incompatible C++ compilers)" ON)
-option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke and audio effects)" OFF)
+option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke and audio effects)" ON)
option(WITH_BULLET "Enable Bullet (Physics Engine)" ON)
option(WITH_GAMEENGINE "Enable Game Engine" ON)
option(WITH_PLAYER "Build Player" OFF)
@@ -146,8 +146,9 @@
mark_as_advanced(WITH_AUDASPACE)
-# (unix defaults to OpenMP On)
-if(UNIX AND NOT APPLE)
+# (unix defaults to OpenMP On; kept off on Windows because
+# VS OpenMP wastes too much CPU spinning)
+if(UNIX AND NOT APPLE)
set(PLATFORM_DEFAULT ON)
else()
set(PLATFORM_DEFAULT OFF)
@@ -178,7 +179,7 @@
option(WITH_MOD_REMESH "Enable Remesh Modifier" ON)
option(WITH_MOD_CLOTH_ELTOPO "Enable Experimental cloth solver" OFF)
mark_as_advanced(WITH_MOD_CLOTH_ELTOPO)
-option(WITH_MOD_OCEANSIM "Enable Ocean Modifier" OFF)
+option(WITH_MOD_OCEANSIM "Enable Ocean Modifier" ON)
# Image format support
option(WITH_IMAGE_OPENEXR "Enable OpenEXR Support (http://www.openexr.com)" ON)
@@ -191,16 +192,16 @@
option(WITH_IMAGE_FRAMESERVER "Enable image FrameServer Support for rendering" ON)
# Audio/Video format support
-option(WITH_CODEC_FFMPEG "Enable FFMPeg Support (http://ffmpeg.org)" OFF)
+option(WITH_CODEC_FFMPEG "Enable FFMPeg Support (http://ffmpeg.org)" ON)
-option(WITH_CODEC_SNDFILE "Enable libsndfile Support (http://www.mega-nerd.com/libsndfile)" OFF)
+option(WITH_CODEC_SNDFILE "Enable libsndfile Support (http://www.mega-nerd.com/libsndfile)" ON)
if(APPLE OR (WIN32 AND NOT UNIX))
option(WITH_CODEC_QUICKTIME "Enable Quicktime Support" OFF)
endif()
# 3D format support
# disable opencollada on non-apple unix because opencollada has no package for debian
-option(WITH_OPENCOLLADA "Enable OpenCollada Support (http://www.opencollada.org)" OFF)
+option(WITH_OPENCOLLADA "Enable OpenCollada Support (http://www.opencollada.org)" ON)
# Sound output
option(WITH_SDL "Enable SDL for sound and joystick support" ON)
@@ -235,7 +236,7 @@
# Cycles
option(WITH_CYCLES "Enable cycles Render Engine" ON)
-option(WITH_CYCLES_TEST "Build cycles test application" OFF)
+option(WITH_CYCLES_TEST "Build cycles test application" ON)
option(WITH_CYCLES_OSL "Build Cycles with OSL support" OFF)
option(WITH_CYCLES_CUDA_BINARIES "Build cycles CUDA binaries" OFF)
set(CYCLES_CUDA_BINARIES_ARCH sm_13 sm_20 sm_21 CACHE STRING "CUDA architectures to build binaries for")
@@ -247,10 +248,10 @@
mark_as_advanced(WITH_MEM_JEMALLOC)
# Debug
-option(WITH_CXX_GUARDEDALLOC "Enable GuardedAlloc for C++ memory allocation tracking (only enable for development)" OFF)
+option(WITH_CXX_GUARDEDALLOC "Enable GuardedAlloc for C++ memory allocation tracking (only enable for development)" ON)
mark_as_advanced(WITH_CXX_GUARDEDALLOC)
-option(WITH_ASSERT_ABORT "Call abort() when raising an assertion through BLI_assert()" OFF)
+option(WITH_ASSERT_ABORT "Call abort() when raising an assertion through BLI_assert()" ON)
mark_as_advanced(WITH_ASSERT_ABORT)
@@ -755,6 +756,9 @@
else()
set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/windows)
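+	# /MP compiles multiple source files in parallel (MSVC)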
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+
# Setup 64bit and 64bit windows systems
if(CMAKE_CL_64)
message("64 bit compiler detected.")
@@ -814,25 +818,25 @@
add_definitions(/D_CRT_NONSTDC_NO_DEPRECATE /D_CRT_SECURE_NO_DEPRECATE /D_SCL_SECURE_NO_DEPRECATE /D_CONSOLE /D_LIB)
- set(CMAKE_CXX_FLAGS "/nologo /J /Gd /EHsc" CACHE STRING "MSVC MT C++ flags " FORCE)
- set(CMAKE_C_FLAGS "/nologo /J /Gd" CACHE STRING "MSVC MT C++ flags " FORCE)
+ set(CMAKE_CXX_FLAGS "/openmp- /MP /nologo /J /Gd /EHsc" CACHE STRING "MSVC MT C++ flags " FORCE)
+ set(CMAKE_C_FLAGS "/openmp- /MP /nologo /J /Gd" CACHE STRING "MSVC MT C++ flags " FORCE)
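+	# Note: /openmp- disables MSVC's OpenMP runtime (it wastes CPU spinning, see above); /MP parallelizes compilation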
if(CMAKE_CL_64)
- set(CMAKE_CXX_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /Zi" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_CXX_FLAGS_DEBUG "/MP /Od /RTC1 /MTd /Zi" CACHE STRING "MSVC MT flags " FORCE)
else()
- set(CMAKE_CXX_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /ZI" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_CXX_FLAGS_DEBUG "/MP /Od /RTC1 /MTd /ZI" CACHE STRING "MSVC MT flags " FORCE)
endif()
- set(CMAKE_CXX_FLAGS_RELEASE "/O2 /Ob2 /MT" CACHE STRING "MSVC MT flags " FORCE)
- set(CMAKE_CXX_FLAGS_MINSIZEREL "/O1 /Ob1 /MT" CACHE STRING "MSVC MT flags " FORCE)
- set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /Ob1 /MT /Zi" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /Ox /Gy /Ob2 /MT" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} /Os /Gy /Ob2 /MT" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /Ox /Gy /favor:INTEL64 /Ob1 /MT /Zi" CACHE STRING "MSVC MT flags " FORCE)
if(CMAKE_CL_64)
- set(CMAKE_C_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /Zi" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} /Od /RTC1 /MTd /Zi" CACHE STRING "MSVC MT flags " FORCE)
else()
- set(CMAKE_C_FLAGS_DEBUG "/Od /Gm /RTC1 /MTd /ZI" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} /Od /RTC1 /MTd /ZI" CACHE STRING "MSVC MT flags " FORCE)
endif()
- set(CMAKE_C_FLAGS_RELEASE "/O2 /Ob2 /MT" CACHE STRING "MSVC MT flags " FORCE)
- set(CMAKE_C_FLAGS_MINSIZEREL "/O1 /Ob1 /MT" CACHE STRING "MSVC MT flags " FORCE)
- set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /Ob1 /MT /Zi" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS} /MP /Ox /Ob2 /Gy /MT" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS} /MP /Os /Ob2 /Gy /MT" CACHE STRING "MSVC MT flags " FORCE)
+ set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} /MP /Ox /Ob2 /Gy /MT /Zi" CACHE STRING "MSVC MT flags " FORCE)
# most msvc warnings are C & C++
set(_WARNINGS "/W3 /wd4018 /wd4244 /wd4305 /wd4800 /wd4181 /wd4065 /wd4267 /we4013 /wd4200")
Index: intern/cycles/kernel/kernel_projection.h
===================================================================
--- intern/cycles/kernel/kernel_projection.h (revision 47903)
+++ intern/cycles/kernel/kernel_projection.h (working copy)
@@ -97,7 +97,12 @@
if(r > 1.0f)
return make_float3(0.0f, 0.0f, 0.0f);
- float phi = acosf((r != 0.0f)? u/r: 0.0f);
+ // Intel OpenCL doesn't like '?' operator with floating point types
+ float phi;
+ if (r != 0.0f)
+ phi = acosf(u/r);
+ else
+ phi = acosf(0.0f);
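+		// (acosf(0.0f) is simply pi/2)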
float theta = asinf(r) * (fov / M_PI_F);
if(v < 0.0f) phi = -phi;
@@ -132,7 +137,12 @@
if(r > rmax)
return make_float3(0.0f, 0.0f, 0.0f);
- float phi = acosf((r != 0.0f)? u/r: 0.0f);
+ // Intel OpenCL doesn't like '?' operator with floating point types
+ float phi;
+ if (r != 0.0f)
+ phi = acosf(u/r);
+ else
+ phi = acosf(0.0f);
float theta = 2.0f * asinf(r/(2.0f * lens));
if(v < 0.0f) phi = -phi;
Index: intern/cycles/device/device_opencl.cpp
===================================================================
--- intern/cycles/device/device_opencl.cpp (revision 47903)
+++ intern/cycles/device/device_opencl.cpp (working copy)
@@ -22,6 +22,8 @@
#include <stdlib.h>
#include <string.h>
+#include "util_thread.h"
+
#include "device.h"
#include "device_intern.h"
@@ -43,18 +45,30 @@
public:
cl_context cxContext;
cl_command_queue cqCommandQueue;
+ cl_command_queue cqPeekQueue;
cl_platform_id cpPlatform;
cl_device_id cdDevice;
cl_program cpProgram;
cl_kernel ckPathTraceKernel;
cl_kernel ckFilmConvertKernel;
cl_int ciErr;
- map<string, device_vector<uchar>*> const_mem_map;
- map<string, device_memory*> mem_map;
+ typedef map<string, device_vector<uchar>*> const_mem_map_t;
+ const_mem_map_t const_mem_map;
+ typedef map<string, device_memory*> mem_map_t;
+ mem_map_t mem_map;
device_ptr null_mem;
bool device_initialized;
string platform_name;
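+	// Wait ids handed out by the *_async methods, mapped to their cl_event (guarded by waitsLock)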
+ typedef map<int, cl_event> WaitMap;
+ WaitMap waits;
+ thread_mutex waitsLock;
+ int nextWaitId;
+
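+	// Adaptive throttle on kernel enqueues; depth floats between 4 and 128 (see the path trace enqueue below)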
+ cl_event throttleEvent;
+ int throttleCount, throttleLevel;
+ double throttleTime;
+
const char *opencl_error_string(cl_int err)
{
switch (err) {
@@ -140,6 +154,12 @@
#endif
}
}
+
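+	// Context error callback handed to clCreateContext below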
+ static void error_notify(const char *msg, const void *ptr, size_t size, void *tp)
+ {
+ printf("OpenCL error, msg=\"%s\", ptr=%p, size=%d\n",
+ msg, ptr, (int)size);
+ }
OpenCLDevice(DeviceInfo& info, bool background_)
{
@@ -147,12 +167,17 @@
cpPlatform = NULL;
cxContext = NULL;
cqCommandQueue = NULL;
+ cqPeekQueue = NULL;
cpProgram = NULL;
ckPathTraceKernel = NULL;
ckFilmConvertKernel = NULL;
null_mem = 0;
device_initialized = false;
-
+ throttleEvent = 0;
+ throttleCount = 0;
+ throttleLevel = 6;
+ nextWaitId = 1;
+
/* setup platform */
cl_uint num_platforms;
@@ -165,38 +190,62 @@
return;
}
- ciErr = clGetPlatformIDs(1, &cpPlatform, NULL);
+ vector<cl_platform_id> platform_ids;
+ platform_ids.resize(num_platforms);
+
+ ciErr = clGetPlatformIDs(num_platforms, &platform_ids[0], NULL);
if(opencl_error(ciErr))
return;
- char name[256];
- clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
- platform_name = name;
-
- /* get devices */
+ cl_uint num_devices;
vector<cl_device_id> device_ids;
- cl_uint num_devices;
+ cl_uint offset = 0;
+ cdDevice = 0;
- if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &num_devices)))
- return;
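+	// Scan every platform; info.num indexes the flattened device list across all platforms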
+ for (int platform = 0; platform < num_platforms; ++platform, offset += num_devices)
+ {
+ cpPlatform = platform_ids[platform];
- if(info.num > num_devices) {
- if(num_devices == 0)
- opencl_error("OpenCL: no devices found.");
- else
- opencl_error("OpenCL: specified device not found.");
- return;
+ char name[256];
+ clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
+ platform_name = name;
+
+ /* get devices */
+
+ num_devices = 0;
+ if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)))
+ continue;
+
+ if(num_devices == 0 || info.num - offset > num_devices)
+ continue;
+
+		// Skip this platform if the device identified by info.num lies beyond it
+ if (info.num >= offset + num_devices)
+ continue;
+
+ device_ids.resize(num_devices);
+
+ if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL)))
+ return;
+
+ cdDevice = device_ids[info.num - offset];
+ break;
}
- device_ids.resize(num_devices);
-
- if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, num_devices, &device_ids[0], NULL)))
+ if (cdDevice == 0)
+ {
+ cpPlatform = 0;
return;
+ }
- cdDevice = device_ids[info.num];
+ cl_context_properties props[] =
+ {
+ CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
+ 0, 0
+ };
/* create context */
- cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
+ cxContext = clCreateContext(props, 1, &cdDevice, error_notify, NULL, &ciErr);
if(opencl_error(ciErr))
return;
@@ -205,6 +254,7 @@
return;
null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
+
device_initialized = true;
}
@@ -441,12 +491,28 @@
if(null_mem)
clReleaseMemObject(CL_MEM_PTR(null_mem));
- map<string, device_vector<uchar>*>::iterator mt;
+ const_mem_map_t::iterator mt;
for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
mem_free(*(mt->second));
delete mt->second;
}
+ // Cleanup abandoned waits
+ thread_scoped_lock hold(waitsLock);
+ foreach (WaitMap::value_type &wait, waits)
+ {
+ if (wait.second)
+ {
+ clReleaseEvent(wait.second);
+ wait.second = 0;
+ }
+ }
+ waits.clear();
+ hold.unlock();
+
+ if (throttleEvent != 0)
+ clReleaseEvent(throttleEvent);
+
if(ckPathTraceKernel)
clReleaseKernel(ckPathTraceKernel);
if(ckFilmConvertKernel)
@@ -455,10 +521,135 @@
clReleaseProgram(cpProgram);
if(cqCommandQueue)
clReleaseCommandQueue(cqCommandQueue);
+ if (cqPeekQueue)
+ clReleaseCommandQueue(cqPeekQueue);
if(cxContext)
clReleaseContext(cxContext);
+
+ // Don't leave values hanging around
+ throttleEvent = 0;
+ ckPathTraceKernel = 0;
+ ckFilmConvertKernel = 0;
+ cpProgram = 0;
+ cqCommandQueue = 0;
+ cqPeekQueue = 0;
+ cxContext = 0;
}
+ void async_begin()
+ {
+ if (!asyncBatch)
+ {
+ asyncBatch = new AsyncBatchType;
+ asyncBatch->reserve(peakBatch);
+ }
+ }
+
+ void async_end()
+ {
+ if (!asyncBatch)
+ return;
+
+ if (asyncBatch->size())
+ {
+ int count = mem_async_wait_multiple(&asyncBatch->front(), asyncBatch->size());
+ if (peakBatch < count)
+ peakBatch = count;
+ }
+ delete asyncBatch;
+ asyncBatch = 0;
+ }
+
+ bool is_async()
+ {
+ return asyncBatch != 0;
+ }
+
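+	// Create a slot in the wait map; callers store the cl_event of the enqueued operation in it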
+ WaitMap::iterator new_wait()
+ {
+ thread_scoped_lock hold(waitsLock);
+ int waitId = nextWaitId++;
+ pair<WaitMap::iterator,bool> p = waits.insert(
+ WaitMap::value_type(waitId,
+ WaitMap::value_type::second_type(0)));
+ hold.unlock();
+ assert(p.second);
+ return p.first;
+ }
+
+ int mem_async_wait_multiple(const int *items, int count)
+ {
+ if (!count)
+ return 0;
+
+ std::vector<cl_event> seqEvents(0);
+ seqEvents.reserve(count);
+
+ // Lookup all the cl_event objects into seqEvents
+ thread_scoped_lock hold(waitsLock);
+ int done;
+ for (int i = 0; i < count; ++i)
+ {
+ int id = items[i];
+ WaitMap::iterator p = waits.find(id);
+ if (p != waits.end() && p->second != 0)
+ seqEvents.push_back(p->second);
+ }
+ hold.unlock();
+
+		if (!seqEvents.empty())
+		{
+			// clWaitForEvents rejects an empty list; skip when every wait was already satisfied
+			ciErr = clWaitForEvents(seqEvents.size(), &seqEvents[0]);
+			opencl_assert(ciErr);
+		}
+
+ // Release cl_event objects and erase waits
+ done = 0;
+ hold.lock();
+ for (int i = 0; i < count; ++i)
+ {
+ int id = items[i];
+ WaitMap::iterator p = waits.find(id);
+ if (p != waits.end())
+ {
+ ciErr = clReleaseEvent(p->second);
+ opencl_assert(ciErr);
+ p->second = 0;
+ waits.erase(p);
+ ++done;
+ }
+ }
+
+ return done;
+ }
+
void mem_alloc(device_memory& mem, MemoryType type)
{
size_t size = mem.memory_size();
@@ -475,21 +666,81 @@
void mem_copy_to(device_memory& mem)
{
+ if (asyncBatch)
+ return asyncBatch->push_back(mem_copy_to_async(mem));
+
/* this is blocking */
size_t size = mem.memory_size();
- ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL);
+ ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE,
+ 0, size, (void*)mem.data_pointer, 0, NULL, NULL);
opencl_assert(ciErr);
}
- void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+ int mem_copy_to_async(device_memory &mem)
{
+ WaitMap::iterator wait = new_wait();
+
+ size_t size = mem.memory_size();
+ ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_FALSE,
+ 0, size, (void*)mem.data_pointer, 0, NULL, &wait->second);
+ opencl_assert(ciErr);
+
+ return wait->first;
+ }
+
+ void mem_copy_from(device_memory& mem, int y, int w, int h, int elem, bool peek)
+ {
+ if (asyncBatch)
+ return asyncBatch->push_back(mem_copy_from_async(mem, y, w, h, elem, peek));
+
size_t offset = elem*y*w;
size_t size = elem*w*h;
- ciErr = clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL);
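+		// The out-of-order 'peek' queue lets GUI readbacks avoid waiting behind queued kernel work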
+ if (peek && cqPeekQueue == NULL)
+ cqPeekQueue = clCreateCommandQueue(cxContext, cdDevice,
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ciErr);
+
+ ciErr = clEnqueueReadBuffer(peek ? cqPeekQueue : cqCommandQueue,
+ CL_MEM_PTR(mem.device_pointer), CL_TRUE,
+ offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL);
opencl_assert(ciErr);
}
+ void mem_async_wait(int id)
+ {
+ thread_scoped_lock hold(waitsLock);
+
+ WaitMap::iterator p = waits.find(id);
+ if (p != waits.end())
+ {
+ if (p->second)
+ {
+ clWaitForEvents(1, &p->second);
+ clReleaseEvent(p->second);
+ }
+ waits.erase(p);
+ }
+ }
+
+ int mem_copy_from_async(device_memory& mem, int y, int w, int h, int elem, bool peek)
+ {
+ size_t offset = elem*y*w;
+ size_t size = elem*w*h;
+
+ WaitMap::iterator wait = new_wait();
+
+ if (peek && cqPeekQueue == NULL)
+ cqPeekQueue = clCreateCommandQueue(cxContext, cdDevice,
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ciErr);
+
+ ciErr = clEnqueueReadBuffer(peek ? cqPeekQueue : cqCommandQueue,
+ CL_MEM_PTR(mem.device_pointer), CL_FALSE,
+ offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, &wait->second);
+ opencl_assert(ciErr);
+
+ return wait->first;
+ }
+
void mem_zero(device_memory& mem)
{
if(mem.device_pointer) {
@@ -498,6 +749,16 @@
}
}
+ int mem_zero_async(device_memory &mem)
+ {
+ int waitId = 0;
+ if(mem.data_pointer) {
+ memset((void*)mem.data_pointer, 0, mem.memory_size());
+ waitId = mem_copy_to_async(mem);
+ }
+ return waitId;
+ }
+
void mem_free(device_memory& mem)
{
if(mem.device_pointer) {
@@ -507,27 +768,43 @@
}
}
- void const_copy_to(const char *name, void *host, size_t size)
+ device_memory &const_copy_to_common(const char *name, void *host, size_t size)
{
- if(const_mem_map.find(name) == const_mem_map.end()) {
+ const_mem_map_t::iterator p = const_mem_map.find(name);
+ if(p == const_mem_map.end()) {
device_vector<uchar> *data = new device_vector<uchar>();
data->copy((uchar*)host, size);
mem_alloc(*data, MEM_READ_ONLY);
- const_mem_map[name] = data;
+ p = const_mem_map.insert(const_mem_map_t::value_type(
+ name, data)).first;
}
- else {
- device_vector<uchar> *data = const_mem_map[name];
+ else
+ {
+ device_vector<uchar> *data = p->second;
data->copy((uchar*)host, size);
}
- mem_copy_to(*const_mem_map[name]);
+ return *p->second;
}
+ void const_copy_to(const char *name, void *host, size_t size)
+ {
+ if (asyncBatch)
+ return asyncBatch->push_back(const_copy_to_async(name, host, size));
+ mem_copy_to(const_copy_to_common(name, host, size));
+ }
+
+ int const_copy_to_async(const char *name, void *host, size_t size)
+ {
+ return mem_copy_to_async(const_copy_to_common(name, host, size));
+ }
+
void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
{
mem_alloc(mem, MEM_READ_ONLY);
mem_copy_to(mem);
+ assert(mem_map.find(name) == mem_map.end());
mem_map[name] = &mem;
}
@@ -589,10 +866,47 @@
size_t local_size[2] = {workgroup_size, workgroup_size};
size_t global_size[2] = {global_size_round_up(local_size[0], d_w), global_size_round_up(local_size[1], d_h)};
+ cl_event *throttlePtr = 0;
+
+		// Limit queue depth (prevents excessive GUI slowdown when rendering on the display GPU)
+ //opencl_assert(clFinish(cqCommandQueue));
+ if (++throttleCount >= throttleLevel)
+ {
+ throttleCount -= throttleLevel;
+
+			// Synchronize with the command issued throttleLevel commands ago
+ double nowtime = time_dt();
+
+ if (throttleEvent != 0)
+ {
+ double elap = nowtime - throttleTime;
+
+ // Dynamically adjust throttle depth
+ if (elap > 2.5 && throttleLevel > 4)
+ {
+ throttleLevel--;
+ printf("Throttle set to %d, elap %lf\n", throttleLevel, elap);
+ }
+ else if (elap < 1.5 && throttleLevel < 128)
+ {
+ throttleLevel++;
+ printf("Throttle set to %d, elap %lf\n", throttleLevel, elap);
+ }
+
+ clWaitForEvents(1, &throttleEvent);
+ clReleaseEvent(throttleEvent);
+ throttleEvent = 0;
+ }
+
+ // Queue up an event that we'll wait for next time
+ throttlePtr = &throttleEvent;
+
+ throttleTime = nowtime;
+ }
+
/* run kernel */
- ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckPathTraceKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
+ ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckPathTraceKernel, 2, NULL, global_size, local_size, 0, NULL, throttlePtr);
opencl_assert(ciErr);
- opencl_assert(clFinish(cqCommandQueue));
}
cl_int set_kernel_arg_mem(cl_kernel kernel, int *narg, const char *name)
@@ -667,7 +981,7 @@
/* run kernel */
ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckFilmConvertKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
opencl_assert(ciErr);
- opencl_assert(clFinish(cqCommandQueue));
+ //opencl_assert(clFinish(cqCommandQueue));
}
void task_add(DeviceTask& maintask)
@@ -690,6 +1004,7 @@
void task_wait()
{
+ clFlush(cqCommandQueue);
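+		// Flush only; completion comes from blocking reads, event waits and the enqueue throttle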
}
void task_cancel()
@@ -713,37 +1028,50 @@
if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0)
return;
- if(clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS)
- return;
+ vector<cl_platform_id> platform_ids;
+ platform_ids.resize(num_platforms);
- if(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &num_devices) != CL_SUCCESS)
+ if(clGetPlatformIDs(num_platforms, &platform_ids.front(), NULL) != CL_SUCCESS)
return;
-
- device_ids.resize(num_devices);
- if(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, num_devices, &device_ids[0], NULL) != CL_SUCCESS)
- return;
-
- /* add devices */
- for(int num = 0; num < num_devices; num++) {
- cl_device_id device_id = device_ids[num];
- char name[1024];
+ cl_uint offset = 0;
- if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
+ for (int platform = 0; platform < num_platforms; ++platform, offset += num_devices)
+ {
+ platform_id = platform_ids[platform];
+
+ if(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices) != CL_SUCCESS)
continue;
- DeviceInfo info;
+ device_ids.resize(num_devices);
- info.type = DEVICE_OPENCL;
- info.description = string(name);
- info.id = string_printf("OPENCL_%d", num);
- info.num = num;
- /* we don't know if it's used for display, but assume it is */
- info.display_device = true;
- info.advanced_shading = false;
- info.pack_images = true;
+		if(num_devices == 0 || clGetDeviceIDs(platform_id,
+ CL_DEVICE_TYPE_ALL,
+ num_devices, &device_ids[0], NULL) != CL_SUCCESS)
+ continue;
+
+ /* add devices */
+ for(int num = 0; num < num_devices; num++) {
+ cl_device_id device_id = device_ids[num];
+ char name[1024];
- devices.push_back(info);
+ if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
+ continue;
+
+ DeviceInfo info;
+
+ info.type = DEVICE_OPENCL;
+ info.description = string(name);
+ info.num = num + offset;
+ info.multi_group = platform + 1;
+ info.id = string_printf("OPENCL_%d", info.num);
+ /* we don't know if it's used for display, but assume it is */
+ info.display_device = true;
+ info.advanced_shading = false;
+ info.pack_images = true;
+
+ devices.push_back(info);
+ }
}
}
Index: intern/cycles/device/device.cpp
===================================================================
--- intern/cycles/device/device.cpp (revision 47903)
+++ intern/cycles/device/device.cpp (working copy)
@@ -30,6 +30,7 @@
#include "util_opengl.h"
#include "util_types.h"
#include "util_vector.h"
+#include "util_thread.h"
CCL_NAMESPACE_BEGIN
@@ -76,6 +77,7 @@
}
}
else {
+ // Split the work item vertically across the devices
num = min(h, num);
for(int i = 0; i < num; i++) {
@@ -94,14 +96,17 @@
/* Device */
+// Thread-local implicit batch used between async_begin and async_end
+__thread Device::AsyncBatchType *Device::asyncBatch;
+
void Device::pixels_alloc(device_memory& mem)
{
mem_alloc(mem, MEM_READ_WRITE);
}
-void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
+void Device::pixels_copy_from(device_memory& mem, int y, int w, int h, bool peek)
{
- mem_copy_from(mem, y, w, h, sizeof(uint8_t)*4);
+ mem_copy_from(mem, y, w, h, sizeof(uint8_t)*4, peek);
}
void Device::pixels_free(device_memory& mem)
@@ -111,7 +116,7 @@
void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent)
{
- pixels_copy_from(rgba, y, w, h);
+ pixels_copy_from(rgba, y, w, h, true);
if(transparent) {
glEnable(GL_BLEND);
@@ -249,6 +254,9 @@
static bool devices_init = false;
if(!devices_init) {
+
+ device_cpu_info(devices);
+
#ifdef WITH_CUDA
if(cuLibraryInit())
device_cuda_info(devices);
@@ -263,8 +271,6 @@
device_multi_info(devices);
#endif
- device_cpu_info(devices);
-
#ifdef WITH_NETWORK
device_network_info(devices);
#endif
Index: intern/cycles/device/device_multi.cpp
===================================================================
--- intern/cycles/device/device_multi.cpp (revision 47903)
+++ intern/cycles/device/device_multi.cpp (working copy)
@@ -27,25 +27,73 @@
#include "util_list.h"
#include "util_map.h"
#include "util_time.h"
+#include "util_thread.h"
CCL_NAMESPACE_BEGIN
+// Each synchronous request is split into async requests issued to every subdevice
+// (in parallel across subdevices when OpenMP is available), followed by a wait.
+// async_begin causes all synchronous requests to run asynchronously until async_end.
+
class MultiDevice : public Device
{
public:
+ struct proxy_device_memory
+ : public device_memory
+ {
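+		// Per-subdevice view of a shared device_memory: host-side fields are copied, device_pointer is per subdevice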
+ proxy_device_memory(const device_memory &rhs)
+ {
+ data_type = rhs.data_type;
+ data_elements = rhs.data_elements;
+ data_pointer = rhs.data_pointer;
+ data_size = rhs.data_size;
+ data_width = rhs.data_width;
+ data_height = rhs.data_height;
+ device_pointer = 0;
+ }
+
+ proxy_device_memory(const proxy_device_memory &rhs)
+ {
+ data_type = rhs.data_type;
+ data_elements = rhs.data_elements;
+ data_pointer = rhs.data_pointer;
+ data_size = rhs.data_size;
+ data_width = rhs.data_width;
+ data_height = rhs.data_height;
+ device_pointer = rhs.device_pointer;
+ }
+ };
+
struct SubDevice {
SubDevice(Device *device_)
- : device(device_) {}
+ : device(device_) {}
Device *device;
- map<device_ptr, device_ptr> ptr_map;
+
+ typedef map<device_ptr, proxy_device_memory> PtrMap;
+ typedef pair<PtrMap::iterator, bool> PtrMapIns;
+ PtrMap ptr_map;
};
- list<SubDevice> devices;
+ vector<SubDevice> devices;
device_ptr unique_ptr;
+	// Locks access to the subdevice ptr_map (a reader/writer lock would be nicer)
+ thread_mutex subLock;
+
+ typedef map<int, vector<int> > WaitMap;
+ WaitMap waits;
+ thread_mutex waitsLock;
+ int nextWaitId;
+
+ AsyncBatchType *asyncBatch;
+ int peakBatch;
+
MultiDevice(DeviceInfo& info, bool background_)
- : unique_ptr(1)
+ : unique_ptr(1)
+ , nextWaitId(1)
+ , asyncBatch(0)
+ , peakBatch(0)
{
Device *device;
background = background_;
@@ -70,6 +118,67 @@
#endif
}
+ void async_begin()
+ {
+ if (!asyncBatch)
+ {
+ foreach(SubDevice& sub, devices) {
+ sub.device->async_begin();
+ }
+
+ asyncBatch = new AsyncBatchType(0);
+ asyncBatch->reserve(peakBatch);
+ }
+ }
+
+ void async_end()
+ {
+ if (asyncBatch)
+ {
+ // Wait in reverse order, last wait probably finishes last
+ // This could minimize the overhead of a lot of completion waits
+ // The opencl queue is in order now, so this optimization should work
+ // Hopefully it will fly through the rest of the finished waits
+
+ if (asyncBatch->size())
+ {
+ mem_async_wait_multiple(&asyncBatch->front(), asyncBatch->size());
+ if (peakBatch < asyncBatch->size())
+ peakBatch = asyncBatch->size();
+ }
+ delete asyncBatch;
+ asyncBatch = 0;
+
+ foreach(SubDevice& sub, devices) {
+ sub.device->async_end();
+ }
+ }
+ }
+
+ int mem_async_wait_multiple(const int *items, int count)
+ {
+ // Group subdevice ids by device
+		std::vector<std::vector<int> > subdevIds(devices.size());
+ for (int d = 0; d < (int)devices.size(); ++d)
+ subdevIds[d].resize(count);
+
+ // Lookup each wait's vector of subdevice waits
+ for (int i = 0; i < count; ++i)
+ {
+ WaitMap::iterator p = waits.find(items[i]);
+ assert(p != waits.end());
+
+ for (int d = 0; d < (int)devices.size(); ++d)
+ subdevIds[d][i] = p->second[d];
+ }
+
+	// Use the subdevice's multiple-wait facility; it's probably more efficient
+ for (int d = 0; d < (int)devices.size(); ++d)
+ devices[d].device->mem_async_wait_multiple(&subdevIds[d][0], count);
+
+ return count;
+ }
+
~MultiDevice()
{
foreach(SubDevice& sub, devices)
@@ -89,190 +198,444 @@
return error_msg;
}
+ // Lookup subdevice device_memory objects and update the data_pointers
+ void prepare_subdevices(std::vector<SubDevice::PtrMap::iterator> &result,
+ device_memory &mem)
+ {
+ thread_scoped_lock hold(subLock);
+ if (mem.device_pointer != 0)
+ {
+ for (int i = 0; i < devices.size(); ++i)
+ {
+ SubDevice::PtrMap::iterator p = devices[i].ptr_map.find(mem.device_pointer);
+ assert(p != devices[i].ptr_map.end());
+ p->second.data_pointer = mem.data_pointer;
+ result[i] = p;
+ }
+ }
+ else
+ {
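+			// First use of this memory: create one ptr_map entry per subdevice and hand out a new virtual pointer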
+ for (int i = 0; i < devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ pair<SubDevice::PtrMap::iterator,bool> p = sub.ptr_map.insert(
+ SubDevice::PtrMap::value_type(unique_ptr,
+ SubDevice::PtrMap::value_type::second_type(mem)));
+ assert(p.second);
+ result[i] = p.first;
+ }
+ mem.device_pointer = unique_ptr++;
+ }
+ }
+
+ WaitMap::iterator new_wait()
+ {
+ thread_scoped_lock hold(waitsLock);
+ int waitId = nextWaitId++;
+ pair<WaitMap::iterator, bool> waitIns = waits.insert(
+ WaitMap::value_type(waitId,
+ WaitMap::value_type::second_type(devices.size())));
+ hold.unlock();
+ assert(waitIns.second);
+ return waitIns.first;
+ }
+
+ void mem_async_wait(int id)
+ {
+ thread_scoped_lock hold(waitsLock);
+ WaitMap::iterator p = waits.find(id);
+ assert(p != waits.end());
+ hold.unlock();
+
+ if (p == waits.end())
+ return;
+
+ // Wait in reverse order, probably more efficient
+ for (int i = (int)devices.size(); i > 0; --i)
+ devices[i-1].device->mem_async_wait(p->second[i-1]);
+
+ hold.lock();
+ waits.erase(p);
+ }
+
bool load_kernels(bool experimental)
{
- foreach(SubDevice& sub, devices)
- if(!sub.device->load_kernels(experimental))
- return false;
+ bool ok = true;
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ if(!devices[i].device->load_kernels(experimental))
+ ok = false;
- return true;
+ return ok;
}
void mem_alloc(device_memory& mem, MemoryType type)
{
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = 0;
- sub.device->mem_alloc(mem, type);
- sub.ptr_map[unique_ptr] = mem.device_pointer;
+ assert(mem.device_pointer == 0);
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ subdevs[i]->second.device_pointer = 0;
+ sub.device->mem_alloc(subdevs[i]->second, type);
+ assert(subdevs[i]->second.device_pointer != 0);
}
-
- mem.device_pointer = unique_ptr++;
}
void mem_copy_to(device_memory& mem)
{
+ if (asyncBatch)
+ return asyncBatch->push_back(mem_copy_to_async(mem));
+ else
+ mem_async_wait(mem_copy_to_async(mem));
+ }
+
+ int mem_copy_to_async(device_memory &mem)
+ {
device_ptr tmp = mem.device_pointer;
+ assert(tmp > 0 && tmp < unique_ptr);
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = sub.ptr_map[tmp];
- sub.device->mem_copy_to(mem);
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ WaitMap::iterator wait = new_wait();
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ wait->second[i] = sub.device->mem_copy_to_async(subdevs[i]->second);
}
- mem.device_pointer = tmp;
+ return wait->first;
}
- void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+ // Work is divided among subdevices
+ void mem_copy_from(device_memory& mem, int y, int w, int h, int elem, bool peek)
{
+ if (asyncBatch)
+ asyncBatch->push_back(mem_copy_from_async(mem, y, w, h, elem, peek));
+ else
+ mem_async_wait(mem_copy_from_async(mem, y, w, h, elem, peek));
+ }
+
+ // Work is divided among subdevices
+ int mem_copy_from_async(device_memory& mem, int y, int w, int h, int elem, bool peek)
+ {
device_ptr tmp = mem.device_pointer;
+ assert(tmp > 0 && tmp < unique_ptr);
+
int i = 0, sub_h = h/devices.size();
- foreach(SubDevice& sub, devices) {
+ WaitMap::iterator wait = new_wait();
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
int sy = y + i*sub_h;
int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
- mem.device_pointer = sub.ptr_map[tmp];
- sub.device->mem_copy_from(mem, sy, w, sh, elem);
- i++;
+ int subWait = sub.device->mem_copy_from_async(subdevs[i]->second, sy, w, sh, elem, peek);
+
+ wait->second[i] = subWait;
}
- mem.device_pointer = tmp;
+ return wait->first;
}
void mem_zero(device_memory& mem)
{
+ if (asyncBatch)
+ asyncBatch->push_back(mem_zero_async(mem));
+ else
+ mem_async_wait(mem_zero_async(mem));
+ }
+
+ int mem_zero_async(device_memory &mem)
+ {
device_ptr tmp = mem.device_pointer;
+ assert(tmp > 0 && tmp < unique_ptr);
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = sub.ptr_map[tmp];
- sub.device->mem_zero(mem);
+ WaitMap::iterator wait = new_wait();
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ int subWait = devices[i].device->mem_zero_async(subdevs[i]->second);
+ wait->second[i] = subWait;
}
- mem.device_pointer = tmp;
+ return wait->first;
}
void mem_free(device_memory& mem)
{
device_ptr tmp = mem.device_pointer;
+ assert(tmp > 0 && tmp < unique_ptr);
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = sub.ptr_map[tmp];
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(tmp));
+ if (!tmp)
+ return;
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ sub.device->mem_free(subdevs[i]->second);
}
+ thread_scoped_lock hold(subLock);
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ sub.ptr_map.erase(subdevs[i]);
+ }
+
mem.device_pointer = 0;
}
void const_copy_to(const char *name, void *host, size_t size)
{
- foreach(SubDevice& sub, devices)
- sub.device->const_copy_to(name, host, size);
+ if (asyncBatch)
+ asyncBatch->push_back(const_copy_to_async(name, host, size));
+ else
+ mem_async_wait(const_copy_to_async(name, host, size));
}
- void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+ int const_copy_to_async(const char *name, void *host, size_t size)
{
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = 0;
- sub.device->tex_alloc(name, mem, interpolation, periodic);
- sub.ptr_map[unique_ptr] = mem.device_pointer;
+ WaitMap::iterator wait = new_wait();
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ wait->second[i] = sub.device->const_copy_to_async(name, host, size);
}
- mem.device_pointer = unique_ptr++;
+ return wait->first;
}
+ void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+ {
+ assert(mem.device_pointer == 0);
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ subdevs[i]->second.data_pointer = mem.data_pointer;
+ subdevs[i]->second.device_pointer = 0;
+ sub.device->tex_alloc(name, subdevs[i]->second, interpolation, periodic);
+ assert(subdevs[i]->second.device_pointer != 0);
+ }
+ }
+
void tex_free(device_memory& mem)
{
device_ptr tmp = mem.device_pointer;
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = sub.ptr_map[tmp];
- sub.device->tex_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(tmp));
+ if (!tmp)
+ return;
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ sub.device->tex_free(subdevs[i]->second);
}
+ thread_scoped_lock hold(subLock);
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ sub.ptr_map.erase(subdevs[i]);
+ }
+
mem.device_pointer = 0;
}
void pixels_alloc(device_memory& mem)
{
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = 0;
- sub.device->pixels_alloc(mem);
- sub.ptr_map[unique_ptr] = mem.device_pointer;
+ assert(mem.device_pointer == 0);
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+
+ subdevs[i]->second.device_pointer = 0;
+ sub.device->pixels_alloc(subdevs[i]->second);
+ assert(subdevs[i]->second.device_pointer != 0);
}
-
- mem.device_pointer = unique_ptr++;
}
void pixels_free(device_memory& mem)
{
device_ptr tmp = mem.device_pointer;
- foreach(SubDevice& sub, devices) {
- mem.device_pointer = sub.ptr_map[tmp];
- sub.device->pixels_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(tmp));
+ if (!tmp)
+ return;
+
+ printf("multi pixels_free %d\n", mem.device_pointer);
+
+ assert(tmp > 0 && tmp < unique_ptr);
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ sub.device->pixels_free(subdevs[i]->second);
}
+ thread_scoped_lock hold(subLock);
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ sub.ptr_map.erase(subdevs[i]);
+ }
+
mem.device_pointer = 0;
}
- void pixels_copy_from(device_memory& mem, int y, int w, int h)
+ // Work is divided among subdevices
+ void pixels_copy_from(device_memory& mem, int y, int w, int h, bool peek)
{
+ if (asyncBatch)
+ return asyncBatch->push_back(pixels_copy_from_async(mem, y, w, h, peek));
+ else
+ mem_async_wait(pixels_copy_from_async(mem, y, w, h, peek));
+ }
+
+ // Work is divided among subdevices
+ int pixels_copy_from_async(device_memory& mem, int y, int w, int h, bool peek)
+ {
device_ptr tmp = mem.device_pointer;
+ assert(tmp > 0 && tmp < unique_ptr);
+
int i = 0, sub_h = h/devices.size();
- foreach(SubDevice& sub, devices) {
+ WaitMap::iterator wait = new_wait();
+
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ #pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
int sy = y + i*sub_h;
int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
- mem.device_pointer = sub.ptr_map[tmp];
- sub.device->pixels_copy_from(mem, sy, w, sh);
- i++;
+ wait->second[i] = sub.device->pixels_copy_from_async(subdevs[i]->second, sy, w, sh, peek);
}
- mem.device_pointer = tmp;
+ return wait->first;
}
- void draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent)
+ // Work is divided among subdevices
+ void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
{
- device_ptr tmp = rgba.device_pointer;
int i = 0, sub_h = h/devices.size();
int sub_height = height/devices.size();
- foreach(SubDevice& sub, devices) {
+ std::vector<SubDevice::PtrMap::iterator> subdevs(devices.size());
+ prepare_subdevices(subdevs, mem);
+
+ //#pragma omp parallel for
+ for (int i = 0; i < (int)devices.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
int sy = y + i*sub_h;
int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
int sheight = (i == (int)devices.size() - 1)? height - sub_height*i: sub_height;
int sdy = dy + i*sub_height;
/* adjust math for w/width */
- rgba.device_pointer = sub.ptr_map[tmp];
- sub.device->draw_pixels(rgba, sy, w, sh, sdy, width, sheight, transparent);
- i++;
+ sub.device->draw_pixels(subdevs[i]->second, sy, w, sh, sdy, width, sheight, transparent);
}
-
- rgba.device_pointer = tmp;
}
+ // Work is divided vertically among subdevices using task.split
void task_add(DeviceTask& task)
{
list<DeviceTask> tasks;
task.split(tasks, devices.size());
+ vector<DeviceTask> taskVector(tasks.begin(), tasks.end());
- foreach(SubDevice& sub, devices) {
- if(!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
+ tasks.clear();
- if(task.buffer) subtask.buffer = sub.ptr_map[task.buffer];
- if(task.rng_state) subtask.rng_state = sub.ptr_map[task.rng_state];
- if(task.rgba) subtask.rgba = sub.ptr_map[task.rgba];
- if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
- if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
+ thread_scoped_lock hold(subLock);
- sub.device->task_add(subtask);
+ #pragma omp parallel for
+ for (int i = 0; i < (int)taskVector.size(); ++i)
+ {
+ SubDevice &sub = devices[i];
+ DeviceTask &subtask = taskVector[i];
+
+ SubDevice::PtrMap::iterator p;
+
+ if(task.buffer)
+ {
+ p = sub.ptr_map.find(task.buffer);
+ assert(p != sub.ptr_map.end());
+ subtask.buffer = p->second.device_pointer;
}
+
+ if(task.rng_state)
+ {
+ p = sub.ptr_map.find(task.rng_state);
+ assert(p != sub.ptr_map.end());
+ subtask.rng_state = p->second.device_pointer;
+ }
+
+ if(task.rgba)
+ {
+ p = sub.ptr_map.find(task.rgba);
+ assert(p != sub.ptr_map.end());
+ subtask.rgba = p->second.device_pointer;
+ }
+
+ if(task.shader_input)
+ {
+ p = sub.ptr_map.find(task.shader_input);
+ assert(p != sub.ptr_map.end());
+ subtask.shader_input = p->second.device_pointer;
+ }
+
+ if(task.shader_output)
+ {
+ p = sub.ptr_map.find(task.shader_output);
+ assert(p != sub.ptr_map.end());
+ subtask.shader_output = p->second.device_pointer;
+ }
+
+ sub.device->task_add(subtask);
}
}
@@ -294,7 +657,7 @@
return new MultiDevice(info, background);
}
-static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool with_display, bool with_advanced_shading, const char *id_fmt, int num)
+static bool device_multi_add(vector<DeviceInfo>& devices)
{
DeviceInfo info;
@@ -303,20 +666,29 @@
map<string, int>::iterator dt;
int num_added = 0, num_display = 0;
- info.advanced_shading = with_advanced_shading;
- info.pack_images = false;
+ int addcount = 0;
- foreach(DeviceInfo& subinfo, devices) {
- if(subinfo.type == type) {
- if(subinfo.advanced_shading != info.advanced_shading)
- continue;
- if(subinfo.display_device) {
- if(with_display)
- num_display++;
- else
- continue;
- }
+ // Go through every possible combination of devices
+ vector<DeviceInfo*> comboDevices;
+ vector<DeviceInfo> newDevicesFront, newDevicesBack;
+ vector<bool> enable(devices.size());
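+	// Each bit of 'combo' selects one device; e.g. with 3 devices, combo 5 (binary 101) pairs devices 0 and 2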
+ for (int combo = 1; combo < (1 << devices.size()); ++combo)
+ {
+ comboDevices.clear();
+ for (int scan = 0; scan < devices.size(); ++scan)
+ if (combo & (1 << scan))
+ comboDevices.push_back(&devices[scan]);
+ if (comboDevices.size() < 2)
+ continue;
+ // Found a combination
+
+ info.display_device = true;
+ info.advanced_shading = true;
+ info.pack_images = false;
+ info.multi_devices.clear();
+ foreach(DeviceInfo* subinfoptr, comboDevices) {
+ DeviceInfo &subinfo = *subinfoptr;
string key = subinfo.description;
if(dupli_map.find(key) == dupli_map.end())
@@ -325,69 +697,81 @@
dupli_map[key]++;
info.multi_devices.push_back(subinfo);
- if(subinfo.display_device)
- info.display_device = true;
+ if(!subinfo.display_device)
+ info.display_device = false;
+ if(!subinfo.advanced_shading)
+ info.advanced_shading = false;
info.pack_images = info.pack_images || subinfo.pack_images;
num_added++;
}
- }
- if(num_added <= 1 || (with_display && num_display == 0))
- return false;
+ if(num_added <= 1)
+ continue;
- /* generate string */
- stringstream desc;
- vector<string> last_tokens;
- bool first = true;
+ /* generate string */
+ stringstream desc;
- for(dt = dupli_map.begin(); dt != dupli_map.end(); dt++) {
- if(!first) desc << " + ";
- first = false;
-
- /* get name and count */
- string name = dt->first;
- int count = dt->second;
-
- /* strip common prefixes */
- vector<string> tokens;
- string_split(tokens, dt->first);
-
- if(tokens.size() > 1) {
- int i;
-
- for(i = 0; i < tokens.size() && i < last_tokens.size(); i++)
- if(tokens[i] != last_tokens[i])
- break;
-
- name = "";
- for(; i < tokens.size(); i++) {
- name += tokens[i];
- if(i != tokens.size() - 1)
- name += " ";
- }
+ bool first = true;
+ foreach(DeviceInfo &d, info.multi_devices)
+ {
+ desc << (first ? "" : " + ") << d.description;
+ first = false;
}
- last_tokens = tokens;
+ /* add info */
+ info.type = DEVICE_MULTI;
+ info.description = desc.str();
+ info.num = addcount++;
+ info.id = string_printf("MULTI_%d", info.num);
- /* add */
- if(count > 1)
- desc << name << " (" << count << "x)";
+ if(info.display_device)
+ newDevicesBack.push_back(info);
else
- desc << name;
+ newDevicesFront.insert(newDevicesFront.begin(), info);
}
-
- /* add info */
- info.type = DEVICE_MULTI;
- info.description = desc.str();
- info.id = string_printf(id_fmt, num);
- info.display_device = with_display;
- info.num = 0;
-
- if(with_display)
- devices.push_back(info);
- else
- devices.insert(devices.begin(), info);
-
+ foreach(DeviceInfo &d, newDevicesBack)
+ devices.push_back(d);
+ foreach(DeviceInfo &d, newDevicesFront)
+ devices.insert(devices.begin(), d);
return true;
}
@@ -395,16 +779,7 @@
{
int num = 0;
- if(!device_multi_add(devices, DEVICE_CUDA, false, true, "CUDA_MULTI_%d", num++))
- device_multi_add(devices, DEVICE_CUDA, false, false, "CUDA_MULTI_%d", num++);
- if(!device_multi_add(devices, DEVICE_CUDA, true, true, "CUDA_MULTI_%d", num++))
- device_multi_add(devices, DEVICE_CUDA, true, false, "CUDA_MULTI_%d", num++);
-
- num = 0;
- if(!device_multi_add(devices, DEVICE_OPENCL, false, true, "OPENCL_MULTI_%d", num++))
- device_multi_add(devices, DEVICE_OPENCL, false, false, "OPENCL_MULTI_%d", num++);
- if(!device_multi_add(devices, DEVICE_OPENCL, true, true, "OPENCL_MULTI_%d", num++))
- device_multi_add(devices, DEVICE_OPENCL, true, false, "OPENCL_MULTI_%d", num++);
+ device_multi_add(devices);
}
CCL_NAMESPACE_END
Index: intern/cycles/device/device.h
===================================================================
--- intern/cycles/device/device.h (revision 47903)
+++ intern/cycles/device/device.h (working copy)
@@ -51,6 +51,7 @@
string description;
string id;
int num;
+ int multi_group;
bool display_device;
bool advanced_shading;
bool pack_images;
@@ -61,6 +62,7 @@
type = DEVICE_CPU;
id = "CPU";
num = 0;
+ multi_group = 0;
display_device = false;
advanced_shading = true;
pack_images = false;
@@ -97,14 +99,23 @@
class Device {
protected:
- Device() {}
+ Device()
+ : peakBatch(0) {}
bool background;
string error_msg;
+ typedef std::vector<int> AsyncBatchType;
+ static __thread AsyncBatchType *asyncBatch;
+ int peakBatch;
+
public:
virtual ~Device() {}
+ virtual void async_begin() {}
+ virtual void async_end() {}
+ virtual bool is_async() { return false; }
+
/* info */
DeviceInfo info;
virtual const string& error_message() { return error_msg; }
@@ -112,8 +123,17 @@
/* regular memory */
virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
virtual void mem_copy_to(device_memory& mem) = 0;
+ // Implement the one with peek if you have ordering latency on your device
virtual void mem_copy_from(device_memory& mem,
- int y, int w, int h, int elem) = 0;
+ int y, int w, int h, int elem)
+ {
+ mem_copy_from(mem, y, w, h, elem, false);
+ }
+ virtual void mem_copy_from(device_memory& mem,
+ int y, int w, int h, int elem, bool peek)
+ {
+ mem_copy_from(mem, y, w, h, elem);
+ }
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
@@ -122,12 +142,17 @@
/* texture memory */
virtual void tex_alloc(const char *name, device_memory& mem,
- bool interpolation = false, bool periodic = false) {};
- virtual void tex_free(device_memory& mem) {};
+ bool interpolation = false, bool periodic = false) {}
+ virtual void tex_free(device_memory& mem) {}
/* pixel memory */
virtual void pixels_alloc(device_memory& mem);
- virtual void pixels_copy_from(device_memory& mem, int y, int w, int h);
+ // Implement the one with peek if you have ordering latency on your device
+ virtual void pixels_copy_from(device_memory& mem, int y, int w, int h)
+ {
+ pixels_copy_from(mem, y, w, h, false);
+ }
+ virtual void pixels_copy_from(device_memory& mem, int y, int w, int h, bool peek);
virtual void pixels_free(device_memory& mem);
/* open shading language, only for CPU device */
@@ -140,6 +165,42 @@
virtual void task_add(DeviceTask& task) = 0;
virtual void task_wait() = 0;
virtual void task_cancel() = 0;
+
+ /* async ops */
+ /* These run the operation asynchronously, and return an id that can
+ be later passed to mem_async_wait to wait for completion */
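+	/* The defaults below simply perform the blocking operation and return id 0 */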
+ virtual void mem_async_wait(int waitid) {}
+
+ virtual int mem_copy_from_async(device_memory& mem,
+ int y, int w, int h, int elem, bool peek)
+ {
+ mem_copy_from(mem, y, w, h, elem, peek);
+ return 0;
+ }
+
+ virtual int pixels_copy_from_async(device_memory& mem, int y, int w, int h, bool peek)
+ {
+ pixels_copy_from(mem, y, w, h, peek);
+ return 0;
+ }
+
+ virtual int mem_copy_to_async(device_memory& mem)
+ {
+ mem_copy_to(mem);
+ return 0;
+ }
+
+ virtual int const_copy_to_async(const char *name, void *host, size_t size)
+ {
+ const_copy_to(name, host, size);
+ return 0;
+ }
+
+ virtual int mem_zero_async(device_memory &mem)
+ {
+ mem_zero(mem);
+ return 0;
+ }
/* opengl drawing */
virtual void draw_pixels(device_memory& mem, int y, int w, int h,
@@ -157,6 +218,24 @@
static string string_from_type(DeviceType type);
static vector<DeviceType>& available_types();
static vector<DeviceInfo>& available_devices();
+
+ // Pass an iterator of std::pair<Device*,int> of events to wait for
+ // This version (with no 'this' pointer) gets device pointers from pair
+ template<typename Titer>
+ static void mem_async_wait_multiple_device(Titer &b, Titer &e)
+ {
+ for (Titer p = b; p != e; ++p)
+ p->first->mem_async_wait(p->second);
+ }
+
+ // Subclasses can specialize multiple wait if it can go faster
+ // Sequentially wait for each using subclass mem_async_wait
+ virtual int mem_async_wait_multiple(const int *items, int count)
+ {
+ for (int i = count; i > 0; --i)
+ mem_async_wait(items[i-1]);
+ return count;
+ }
};
CCL_NAMESPACE_END
Index: intern/cycles/render/session.cpp
===================================================================
--- intern/cycles/render/session.cpp (revision 47903)
+++ intern/cycles/render/session.cpp (working copy)
@@ -549,7 +549,15 @@
/* update scene */
if(scene->need_update())
+ {
+ // Make all memory transfers in-order but asynchronous
+ //device->async_begin();
+
scene->device_update(device, progress);
+
+ // Synchronize upload
+ //device->async_end();
+ }
}
void Session::update_status_time(bool show_pause, bool show_done)
Index: intern/cycles/render/mesh.cpp
===================================================================
--- intern/cycles/render/mesh.cpp (revision 47903)
+++ intern/cycles/render/mesh.cpp (working copy)
@@ -251,7 +251,7 @@
if(!transform_applied) {
string msg = "Updating Mesh BVH ";
- if(name == "")
+ if(name.empty())
msg += string_printf("%u/%u", (uint)(n+1), (uint)total);
else
msg += string_printf("%s %u/%u", name.c_str(), (uint)(n+1), (uint)total);
Index: intern/cycles/render/buffers.cpp
===================================================================
--- intern/cycles/render/buffers.cpp (revision 47903)
+++ intern/cycles/render/buffers.cpp (working copy)
@@ -125,12 +125,12 @@
device->mem_copy_to(rng_state);
}
-bool RenderBuffers::copy_from_device()
+bool RenderBuffers::copy_from_device(bool peek)
{
if(!buffer.device_pointer)
return false;
- device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+ device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float), peek);
return true;
}
@@ -335,7 +335,7 @@
return;
/* read buffer from device */
- device->pixels_copy_from(rgba, 0, w, h);
+ device->pixels_copy_from(rgba, 0, w, h, true);
/* write image */
ImageOutput *out = ImageOutput::create(filename);
Index: intern/cycles/render/buffers.h
===================================================================
--- intern/cycles/render/buffers.h (revision 47903)
+++ intern/cycles/render/buffers.h (working copy)
@@ -79,7 +79,7 @@
void reset(Device *device, BufferParams& params);
- bool copy_from_device();
+ bool copy_from_device(bool peek);
bool get_pass(PassType type, float exposure, int sample, int components, float *pixels);
protected:
Index: intern/cycles/blender/blender_session.cpp
===================================================================
--- intern/cycles/blender/blender_session.cpp (revision 47903)
+++ intern/cycles/blender/blender_session.cpp (working copy)
@@ -237,20 +237,20 @@
break;
/* write result */
- write_render_result();
+ write_render_result(false);
}
/* delete render result */
RE_engine_end_result((RenderEngine*)b_engine.ptr.data, (RenderResult*)b_rr.ptr.data);
}
-void BlenderSession::write_render_result()
+void BlenderSession::write_render_result(bool peek)
{
/* get state */
RenderBuffers *buffers = session->buffers;
/* copy data from device */
- if(!buffers->copy_from_device())
+ if(!buffers->copy_from_device(peek))
return;
BufferParams& params = buffers->params;
@@ -435,7 +435,7 @@
/* offline render, redraw if timeout passed */
if(time_dt() - last_redraw_time > 1.0) {
- write_render_result();
+ write_render_result(true);
engine_tag_redraw((RenderEngine*)b_engine.ptr.data);
last_redraw_time = time_dt();
}
Index: intern/cycles/blender/blender_session.h
===================================================================
--- intern/cycles/blender/blender_session.h (revision 47903)
+++ intern/cycles/blender/blender_session.h (working copy)
@@ -46,7 +46,7 @@
/* offline render */
void render();
- void write_render_result();
+ void write_render_result(bool peek);
/* interactive updates */
void synchronize();
Index: intern/cycles/util/util_task.cpp
===================================================================
--- intern/cycles/util/util_task.cpp (revision 47903)
+++ intern/cycles/util/util_task.cpp (working copy)
@@ -66,7 +66,7 @@
bool found_entry = false;
list<TaskScheduler::Entry>::iterator it;
- for(it = TaskScheduler::queue.begin(); it != TaskScheduler::queue.end(); it++) {
+ for(it = TaskScheduler::queue.begin(); it != TaskScheduler::queue.end(); ++it) {
TaskScheduler::Entry& entry = *it;
if(entry.pool == this) {
Index: intern/cycles/util/util_thread.h
===================================================================
--- intern/cycles/util/util_thread.h (revision 47903)
+++ intern/cycles/util/util_thread.h (working copy)
@@ -89,7 +89,7 @@
#else
-#ifdef __WIN32
+#if defined(__WIN32) || defined(WIN32) || defined(_WIN32)
#define __thread __declspec(thread)
#endif
Index: source/blender/windowmanager/intern/wm_window.c
===================================================================
--- source/blender/windowmanager/intern/wm_window.c (revision 47903)
+++ source/blender/windowmanager/intern/wm_window.c (working copy)
@@ -969,7 +969,7 @@
/* no event, we sleep 5 milliseconds */
if (hasevent == 0)
- PIL_sleep_ms(5);
+ PIL_sleep_ms(16);
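+		/* 16ms is roughly one display frame at 60Hz */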
}
void wm_window_process_events_nosleep(void)
Index: source/blender/compositor/intern/COM_CompositorContext.h
===================================================================
--- source/blender/compositor/intern/COM_CompositorContext.h (revision 47903)
+++ source/blender/compositor/intern/COM_CompositorContext.h (working copy)
@@ -149,8 +149,8 @@
/**
* @brief set has this system active openclDevices?
*/
- void setHasActiveOpenCLDevices(bool hasAvtiveOpenCLDevices) {
- this->hasActiveOpenCLDevices = hasAvtiveOpenCLDevices;
+ void setHasActiveOpenCLDevices(bool hasActiveOpenCLDevices) {
+ this->hasActiveOpenCLDevices = hasActiveOpenCLDevices;
}
int getChunksize() { return this->getbNodeTree()->chunksize; }
Index: source/blender/compositor/intern/COM_WorkScheduler.cpp
===================================================================
--- source/blender/compositor/intern/COM_WorkScheduler.cpp (revision 47146)
+++ source/blender/compositor/intern/COM_WorkScheduler.cpp (working copy)
@@ -28,7 +28,7 @@
#include "COM_OpenCLDevice.h"
#include "OCL_opencl.h"
#include "stdio.h"
-#include "COM_OpenCLKernels.cl.cpp"
+#include "COM_OpenCLKernels.cl.h"
#include "BKE_global.h"
#if COM_CURRENT_THREADING_MODEL == COM_TM_NOTHREAD
@@ -40,7 +40,7 @@
/// @brief global state of the WorkScheduler.
-static WorkSchedulerState state;
+//static WorkSchedulerState state;
/// @brief list of all CPUDevices. for every hardware thread an instance of CPUDevice is created
static vector<CPUDevice*> cpudevices;
@@ -104,7 +104,7 @@
return NULL;
}
-bool WorkScheduler::isStopping() {return state == COM_WSS_STOPPING;}
+//bool WorkScheduler::isStopping() {return state == COM_WSS_STOPPING;}
#endif
@@ -237,60 +237,55 @@
error = clGetPlatformIDs(0, 0, &numberOfPlatforms);
if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
if (G.f & G_DEBUG) printf("%d number of platforms\n", numberOfPlatforms);
- cl_platform_id *platforms = new cl_platform_id[numberOfPlatforms];
- error = clGetPlatformIDs(numberOfPlatforms, platforms, 0);
+ vector<cl_platform_id> platforms(numberOfPlatforms);
+ error = clGetPlatformIDs(numberOfPlatforms, &platforms[0], 0);
unsigned int indexPlatform;
- cl_uint totalNumberOfDevices = 0;
+
+ // Separate context per platform
for (indexPlatform = 0 ; indexPlatform < numberOfPlatforms ; indexPlatform ++) {
- cl_platform_id platform = platforms[indexPlatform];
cl_uint numberOfDevices;
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices);
- totalNumberOfDevices += numberOfDevices;
- }
+ clGetDeviceIDs(platforms[indexPlatform], CL_DEVICE_TYPE_ALL, 0, 0, &numberOfDevices);
+ vector<cl_device_id> cldevices(numberOfDevices);
+ clGetDeviceIDs(platforms[indexPlatform], CL_DEVICE_TYPE_ALL, numberOfDevices, &cldevices[0], 0);
- cl_device_id *cldevices = new cl_device_id[totalNumberOfDevices];
- unsigned int numberOfDevicesReceived = 0;
- for (indexPlatform = 0 ; indexPlatform < numberOfPlatforms ; indexPlatform ++) {
- cl_platform_id platform = platforms[indexPlatform];
- cl_uint numberOfDevices;
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices);
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices+numberOfDevicesReceived*sizeof (cl_device_id), 0);
- numberOfDevicesReceived += numberOfDevices;
- }
- context = clCreateContext(NULL, totalNumberOfDevices, cldevices, clContextError, NULL, &error);
- if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
- program = clCreateProgramWithSource(context, 1, &sourcecode, 0, &error);
- error = clBuildProgram(program, totalNumberOfDevices, cldevices, 0, 0, 0);
- if (error != CL_SUCCESS) {
- cl_int error2;
- size_t ret_val_size;
- printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
- error2 = clGetProgramBuildInfo(program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
- if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
- char *build_log = new char[ret_val_size+1];
- error2 = clGetProgramBuildInfo(program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
- if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
- build_log[ret_val_size] = '\0';
- printf("%s", build_log);
- delete build_log;
+ context = clCreateContext(NULL, numberOfDevices, &cldevices[0], clContextError, NULL, &error);
+ if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
- }
- unsigned int indexDevices;
- for (indexDevices = 0 ; indexDevices < totalNumberOfDevices ; indexDevices ++) {
- cl_device_id device = cldevices[indexDevices];
- OpenCLDevice *clDevice = new OpenCLDevice(context, device, program);
- clDevice->initialize(),
- gpudevices.push_back(clDevice);
- if (G.f & G_DEBUG) {
- char resultString[32];
- error = clGetDeviceInfo(device, CL_DEVICE_NAME, 32, resultString, 0);
- printf("OPENCL_DEVICE: %s, ", resultString);
- error = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 32, resultString, 0);
- printf("%s\n", resultString);
+ program = clCreateProgramWithSource(context, 1, &sourcecode, 0, &error);
+ if (error == CL_SUCCESS)
+ error = clBuildProgram(program, numberOfDevices, &cldevices[0], 0, 0, 0);
+
+ if (error == CL_SUCCESS)
+ {
+ unsigned int indexDevices;
+ for (indexDevices = 0 ; indexDevices < numberOfDevices ; indexDevices ++)
+ {
+ cl_device_id device = cldevices[indexDevices];
+ OpenCLDevice *clDevice = new OpenCLDevice(context, device, program);
+ clDevice->initialize();
+ gpudevices.push_back(clDevice);
+ char resultString[32];
+ error = clGetDeviceInfo(device, CL_DEVICE_NAME, 32, resultString, 0);
+ printf("OPENCL_DEVICE: %s, ", resultString);
+ error = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 32, resultString, 0);
+ printf("%s\n", resultString);
+ }
}
+ else
+ {
+ cl_int error2;
+ size_t ret_val_size;
+ printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
+ error2 = clGetProgramBuildInfo(program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+ if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
+ char *build_log = new char[ret_val_size+1];
+ error2 = clGetProgramBuildInfo(program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+ if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
+ build_log[ret_val_size] = '\0';
+ printf("%s", build_log);
+ delete [] build_log;
+ }
}
- delete [] cldevices;
- delete [] platforms;
}
#endif
#endif
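
The rewritten initialization fixes several problems at once: the old code created one context spanning devices from every platform (an OpenCL context may only contain devices of a single platform), advanced the typed device pointer by `numberOfDevicesReceived*sizeof (cl_device_id)` — scaling by the element size twice — enumerated only `CL_DEVICE_TYPE_GPU`, and managed the arrays with raw `new[]`/`delete[]`, now replaced by `std::vector`. A reduced sketch of the per-platform pattern, with error handling trimmed:

// One cl_context per platform; element indexing, not byte offsets.
#include <vector>
#include <CL/cl.h>

void create_contexts(std::vector<cl_context> &contexts)
{
	cl_uint num_platforms = 0;
	clGetPlatformIDs(0, 0, &num_platforms);
	if (num_platforms == 0)
		return;
	std::vector<cl_platform_id> platforms(num_platforms);
	clGetPlatformIDs(num_platforms, &platforms[0], 0);

	for (cl_uint p = 0; p < num_platforms; p++) {
		cl_uint num_devices = 0;
		clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, 0, &num_devices);
		if (num_devices == 0)
			continue;
		std::vector<cl_device_id> devices(num_devices);
		clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, num_devices, &devices[0], 0);

		// A context is created from this platform's devices only.
		cl_int error = CL_SUCCESS;
		cl_context ctx = clCreateContext(NULL, num_devices, &devices[0], NULL, NULL, &error);
		if (error == CL_SUCCESS)
			contexts.push_back(ctx);
	}
}

int main()
{
	std::vector<cl_context> contexts;
	create_contexts(contexts);
	for (size_t i = 0; i < contexts.size(); i++)
		clReleaseContext(contexts[i]);
	return 0;
}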
Index: extern/libmv/third_party/ceres/internal/ceres/visibility_based_preconditioner.h
===================================================================
--- extern/libmv/third_party/ceres/internal/ceres/visibility_based_preconditioner.h (revision 47903)
+++ extern/libmv/third_party/ceres/internal/ceres/visibility_based_preconditioner.h (working copy)
@@ -69,7 +69,7 @@
class BlockRandomAccessSparseMatrix;
class BlockSparseMatrixBase;
-class CompressedRowBlockStructure;
+struct CompressedRowBlockStructure;
class SchurEliminatorBase;
// This class implements three preconditioners for Structure from
Index: extern/libmv/third_party/ceres/internal/ceres/canonical_views_clustering.h
===================================================================
--- extern/libmv/third_party/ceres/internal/ceres/canonical_views_clustering.h (revision 47903)
+++ extern/libmv/third_party/ceres/internal/ceres/canonical_views_clustering.h (working copy)
@@ -52,7 +52,7 @@
namespace ceres {
namespace internal {
-class CanonicalViewsClusteringOptions;
+struct CanonicalViewsClusteringOptions;
// Compute a partitioning of the vertices of the graph using the
// canonical views clustering algorithm.
Index: extern/libmv/third_party/ceres/internal/ceres/visibility.h
===================================================================
--- extern/libmv/third_party/ceres/internal/ceres/visibility.h (revision 47903)
+++ extern/libmv/third_party/ceres/internal/ceres/visibility.h (working copy)
@@ -42,7 +42,7 @@
namespace ceres {
namespace internal {
-class CompressedRowBlockStructure;
+struct CompressedRowBlockStructure;
// Given a compressed row block structure, computes the set of
// e_blocks "visible" to each f_block. If an e_block co-occurs with an
Index: extern/libmv/third_party/ceres/internal/ceres/block_jacobi_preconditioner.h
===================================================================
--- extern/libmv/third_party/ceres/internal/ceres/block_jacobi_preconditioner.h (revision 47903)
+++ extern/libmv/third_party/ceres/internal/ceres/block_jacobi_preconditioner.h (working copy)
@@ -37,7 +37,7 @@
namespace ceres {
namespace internal {
-class CompressedRowBlockStructure;
+struct CompressedRowBlockStructure;
class LinearOperator;
class SparseMatrix;
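
The three `class` → `struct` hunks above make the forward declarations match the actual definitions in ceres. Either keyword names the same type in standard C++, but MSVC warns about the mismatch (C4099), and the tag has historically fed into MSVC's name decoration, so keeping declarations consistent avoids both the warning noise and potential link surprises. A minimal illustration:

// A type defined with `struct` should be forward-declared with `struct`;
// MSVC emits C4099 if the tags disagree, even though the program is valid.
struct CompressedRowBlockStructure;                 // matches the definition below

void visit(const CompressedRowBlockStructure *bs);  // needs only the declaration

struct CompressedRowBlockStructure {                // the definition uses `struct`
	int rows;
};

void visit(const CompressedRowBlockStructure *bs) { (void)bs; }

int main()
{
	CompressedRowBlockStructure bs = { 3 };
	visit(&bs);
	return 0;
}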
