Cycles: CUDA support for rendering scenes that don't fit on GPU.

When a scene does not fit in GPU memory, Cycles can now fall back to CPU
memory, at the cost of reduced performance. For scenes that do fit in GPU
memory, this commit should not cause any noticeable slowdowns.

We don't use all physical system RAM, since that can cause OS instability.
We leave at least half of system RAM or 4 GB free for other software,
whichever is smaller. For example, a 16 GB system keeps 4 GB free (a
12 GB mapping limit), while a 6 GB system keeps 3 GB free.

For image textures in host memory, rendering was roughly 20-30% slower in
our tests, although this is highly hardware and scene dependent. Once other
types of data no longer fit on the GPU, performance can be an order of
magnitude slower (e.g. 10x), and at that point it's probably better to just
render on the CPU.
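
As background (not part of this commit's text), the fallback builds on CUDA's
mapped, zero-copy host memory. A minimal standalone sketch of that mechanism
with the driver API could look like the following; the CU_CHECK macro and the
64 MB size are illustrative assumptions, not Cycles code:

#include <cuda.h>
#include <stdio.h>

/* Reduce driver-API error handling to a bare minimum for the sketch. */
#define CU_CHECK(x) \
	do { \
		CUresult res_ = (x); \
		if(res_ != CUDA_SUCCESS) { \
			fprintf(stderr, "CUDA error %d at %s:%d\n", (int)res_, __FILE__, __LINE__); \
			return 1; \
		} \
	} while(0)

int main()
{
	CU_CHECK(cuInit(0));

	CUdevice dev;
	CU_CHECK(cuDeviceGet(&dev, 0));

	/* The device must report that it can map host memory at all. */
	int can_map_host = 0;
	CU_CHECK(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
	if(!can_map_host) {
		return 1;
	}

	/* The context must be created with CU_CTX_MAP_HOST for mapping to work. */
	CUcontext ctx;
	CU_CHECK(cuCtxCreate(&ctx, CU_CTX_MAP_HOST, dev));

	/* Pinned host allocation that the device can address directly. */
	const size_t size = 64 * 1024 * 1024;
	void *host_ptr = NULL;
	CU_CHECK(cuMemHostAlloc(&host_ptr, size,
	                        CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED));

	/* Device-side alias of the same physical memory; kernels read it over
	 * PCIe instead of from on-board VRAM, no explicit copy needed. */
	CUdeviceptr device_ptr = 0;
	CU_CHECK(cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0));

	/* ... launch kernels that take device_ptr as an argument ... */

	CU_CHECK(cuMemFreeHost(host_ptr));
	CU_CHECK(cuCtxDestroy(ctx));
	return 0;
}

This mirrors the allocation path added to generic_alloc() below: the context
gains CU_CTX_MAP_HOST when can_map_host is set, cuMemHostAlloc provides the
pinned allocation, and cuMemHostGetDevicePointer provides the pointer the
kernels actually use.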

Differential Revision: https://developer.blender.org/D2056
Brecht Van Lommel 2017-11-04 00:33:38 +01:00
parent 6699454fb6
commit c621832d3d
6 changed files with 294 additions and 22 deletions

intern/cycles/device/device_cuda.cpp

@@ -37,6 +37,7 @@
# include <cudaGL.h>
#endif
#include "util/util_debug.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
#include "util/util_map.h"
#include "util/util_md5.h"
@@ -128,6 +129,12 @@ public:
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule, cuFilterModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int cuDevId;
int cuDevArchitecture;
bool first_error;
@@ -135,12 +142,15 @@ public:
struct CUDAMem {
CUDAMem()
: texobject(0), array(0) {}
: texobject(0), array(0), map_host_pointer(0), free_map_host(false) {}
CUtexObject texobject;
CUarray array;
void *map_host_pointer;
bool free_map_host;
};
map<device_memory*, CUDAMem> cuda_mem_map;
typedef map<device_memory*, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
struct PixelMem {
GLuint cuPBO;
@@ -240,6 +250,13 @@ public:
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
/* Initialize CUDA. */
if(cuda_error(cuInit(0)))
return;
@@ -248,9 +265,16 @@ public:
if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
return;
/* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
if(can_map_host) {
ctx_flags |= CU_CTX_MAP_HOST;
init_host_memory();
}
/* Create context. */
CUresult result;
@@ -611,6 +635,50 @@ public:
VLOG(1) << "Local memory reserved "
<< string_human_readable_number(free_before - free_after) << " bytes. ("
<< string_human_readable_size(free_before - free_after) << ")";
#if 0
/* For testing mapped host memory, fill up device memory. */
const size_t keep_mb = 1024;
while(free_after > keep_mb * 1024 * 1024LL) {
CUdeviceptr tmp;
cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
cuMemGetInfo(&free_after, &total);
}
#endif
}
void init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if(system_ram > 0) {
if(system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG(1) << "Mapped host memory limit set to "
<< string_human_readable_number(map_host_limit) << " bytes. ("
<< string_human_readable_size(map_host_limit) << ")";
}
void load_texture_info()
@@ -621,20 +689,167 @@ public:
}
}
CUDAMem *generic_alloc(device_memory& mem, size_t padding = 0)
void move_textures_to_host(size_t size, bool for_texture)
{
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while(size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
foreach(CUDAMemMap::value_type& pair, cuda_mem_map) {
device_memory& mem = *pair.first;
CUDAMem *cmem = &pair.second;
bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if(!is_texture || cmem->array) {
continue;
}
/* Already in host memory. */
if(cmem->map_host_pointer) {
continue;
}
/* For other textures, only move image textures. */
if(for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if(is_image > max_is_image ||
(is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
/* Move to host memory. This part is mutex protected since
* multiple CUDA devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if(max_mem) {
VLOG(1) << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
/* Preserve the original device pointer; in the multi device case
* we can't change it because the pointer mapping would break. */
device_ptr prev_pointer = max_mem->device_pointer;
size_t prev_size = max_mem->device_size;
tex_free(*max_mem);
tex_alloc(*max_mem);
size = (max_size >= size)? 0: size - max_size;
max_mem->device_pointer = prev_pointer;
max_mem->device_size = prev_size;
}
else {
break;
}
}
/* Update texture info array with new pointers. */
load_texture_info();
move_texture_to_host = false;
}
CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0)
{
CUDAContextScope scope(this);
CUdeviceptr device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture)? device_texture_headroom:
device_working_headroom;
size_t total = 0, free = 0;
cuMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if(!move_texture_to_host && !is_image && (size + headroom) >= free) {
move_textures_to_host(size + headroom - free, is_texture);
cuMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if(!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = cuMemAlloc(&device_pointer, size);
if(mem_alloc_result == CUDA_SUCCESS) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *map_host_pointer = 0;
bool free_map_host = false;
if(mem_alloc_result != CUDA_SUCCESS && can_map_host &&
map_host_used + size < map_host_limit) {
if(mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = CUDA_SUCCESS;
map_host_pointer = mem.shared_pointer;
}
else {
/* Allocate host memory ourselves. */
mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size,
CU_MEMHOSTALLOC_DEVICEMAP |
CU_MEMHOSTALLOC_WRITECOMBINED);
mem.shared_pointer = map_host_pointer;
free_map_host = true;
}
if(mem_alloc_result == CUDA_SUCCESS) {
cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
map_host_used += size;
status = " in host memory";
/* Replace host pointer with our host allocation. Only works if
* CUDA memory layout is the same and has no pitch padding. */
if(pitch_padding == 0 && mem.host_pointer && mem.host_pointer != mem.shared_pointer) {
memcpy(mem.shared_pointer, mem.host_pointer, size);
mem.host_free();
mem.host_pointer = mem.shared_pointer;
}
}
}
if(mem_alloc_result != CUDA_SUCCESS) {
cuda_assert(mem_alloc_result);
status = " failed, out of memory";
}
if(mem.name) {
VLOG(1) << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
<< string_human_readable_size(mem.memory_size()) << ")"
<< status;
}
/* Allocate memory on device. */
CUdeviceptr device_pointer = 0;
size_t size = mem.memory_size();
cuda_assert(cuMemAlloc(&device_pointer, size + padding));
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
@@ -645,6 +860,8 @@ public:
/* Insert into map of allocations. */
CUDAMem *cmem = &cuda_mem_map[&mem];
cmem->map_host_pointer = map_host_pointer;
cmem->free_map_host = free_map_host;
return cmem;
}
@@ -652,7 +869,12 @@ public:
{
if(mem.host_pointer && mem.device_pointer) {
CUDAContextScope scope(this);
cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
if(mem.host_pointer != mem.shared_pointer) {
cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
mem.host_pointer,
mem.memory_size()));
}
}
}
@@ -660,8 +882,24 @@ public:
{
if(mem.device_pointer) {
CUDAContextScope scope(this);
const CUDAMem& cmem = cuda_mem_map[&mem];
cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
if(cmem.map_host_pointer) {
/* Free host memory. */
if(cmem.free_map_host) {
cuMemFreeHost(cmem.map_host_pointer);
if(mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
mem.shared_pointer = 0;
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
cuMemFree(mem.device_pointer);
}
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
@@ -735,7 +973,8 @@ public:
memset(mem.host_pointer, 0, mem.memory_size());
}
if(mem.device_pointer) {
if(mem.device_pointer &&
(!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
CUDAContextScope scope(this);
cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
}
@@ -774,10 +1013,6 @@ public:
{
CUDAContextScope scope(this);
VLOG(1) << "Texture allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
/* Check if we are on sm_30 or above, for bindless textures. */
bool has_fermi_limits = info.has_fermi_limits;
@@ -881,6 +1116,10 @@ public:
desc.NumChannels = mem.data_elements;
desc.Flags = 0;
VLOG(1) << "Array 3D allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
cuda_assert(cuArray3DCreate(&array_3d, &desc));
if(!array_3d) {

intern/cycles/device/device_memory.cpp

@@ -35,7 +35,8 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
extension(EXTENSION_REPEAT),
device(device),
device_pointer(0),
host_pointer(0)
host_pointer(0),
shared_pointer(0)
{
}

intern/cycles/device/device_memory.h

@@ -197,10 +197,13 @@ public:
Device *device;
device_ptr device_pointer;
void *host_pointer;
void *shared_pointer;
virtual ~device_memory();
protected:
friend class CUDADevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);

intern/cycles/device/device_multi.cpp

@@ -48,11 +48,17 @@ public:
MultiDevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(info, stats, background_), unique_key(1)
{
Device *device;
foreach(DeviceInfo& subinfo, info.multi_devices) {
device = Device::create(subinfo, sub_stats_, background);
devices.push_back(SubDevice(device));
Device *device = Device::create(subinfo, sub_stats_, background);
/* Always add CPU devices at the back since GPU devices can change
* host memory pointers, which the CPU uses as its device pointer. */
if(subinfo.type == DEVICE_CPU) {
devices.push_back(SubDevice(device));
}
else {
devices.push_front(SubDevice(device));
}
}
#ifdef WITH_NETWORK
@@ -63,7 +69,7 @@ public:
vector<string> servers = discovery.get_server_list();
foreach(string& server, servers) {
device = device_network_create(info, stats, server.c_str());
Device *device = device_network_create(info, stats, server.c_str());
if(device)
devices.push_back(SubDevice(device));
}

intern/cycles/util/util_system.cpp

@@ -292,5 +292,26 @@ bool system_cpu_support_avx2()
#endif
size_t system_physical_ram()
{
#ifdef _WIN32
MEMORYSTATUSEX ram;
ram.dwLength = sizeof (ram);
GlobalMemoryStatusEx(&ram);
return ram.ullTotalPhys; /* ullTotalPhys is already in bytes. */
#elif defined(__APPLE__)
uint64_t ram = 0;
size_t len = sizeof(ram);
if (sysctlbyname("hw.memsize", &ram, &len, NULL, 0) == 0) {
return ram;
}
return 0;
#else
size_t ps = sysconf(_SC_PAGESIZE);
size_t pn = sysconf(_SC_PHYS_PAGES);
return ps * pn;
#endif
}
CCL_NAMESPACE_END
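
For illustration only (not part of the diff), a small sketch of how a caller
could combine system_physical_ram() with the "leave half or 4 GB, whichever
is smaller" rule from the commit message; the mapped_host_limit() helper
below is hypothetical and simply mirrors init_host_memory() in
device_cuda.cpp:

#include <stddef.h>

size_t system_physical_ram();  /* declared in util/util_system.h; namespace omitted for the sketch */

static size_t mapped_host_limit()
{
	const size_t reserve = 4 * 1024 * 1024 * 1024LL;  /* always keep 4 GB free */
	const size_t ram = system_physical_ram();

	if(ram == 0) {
		return 0;  /* RAM size unknown: disable mapped host memory */
	}
	if(ram / 2 > reserve) {
		return ram - reserve;  /* e.g. 16 GB RAM -> 12 GB limit */
	}
	return ram / 2;  /* e.g. 6 GB RAM -> 3 GB limit */
}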

intern/cycles/util/util_system.h

@@ -42,6 +42,8 @@ bool system_cpu_support_sse41();
bool system_cpu_support_avx();
bool system_cpu_support_avx2();
size_t system_physical_ram();
CCL_NAMESPACE_END
#endif /* __UTIL_SYSTEM_H__ */