Cycles: reduce triangle memory usage with packed_float3

Depends on D13243 Differential Revision: https://developer.blender.org/D13244
2021-11-16 14:03:59 +01:00 · 2021-11-16 14:03:59 +01:00 · 063ad8635e
parent 9937d5379c
commit 063ad8635e
20 changed files with 237 additions and 157 deletions
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@ -159,9 +159,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()

  /* Record geometric normal. */
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
-  const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
-  const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0);
+  const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1);
+  const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

  /* Continue tracing (without this the trace call would return after the first hit). */
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@ -106,9 +106,9 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals kg,
 {
  Transform tfm;

-  tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0);
-  tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1);
-  tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2);
+  tfm.x = kernel_tex_fetch(__attributes_float4, desc.offset + 0);
+  tfm.y = kernel_tex_fetch(__attributes_float4, desc.offset + 1);
+  tfm.z = kernel_tex_fetch(__attributes_float4, desc.offset + 2);

  return tfm;
 }
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@ -126,8 +126,8 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
    int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
    int k1 = k0 + 1;

-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);

 #  ifdef __RAY_DIFFERENTIALS__
    if (dx)
@ -149,7 +149,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
    }
    else {
      return make_float3(0.0f, 0.0f, 0.0f);
@ -168,8 +168,8 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
    int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
    int k1 = k0 + 1;

-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + k0);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + k1);

 #  ifdef __RAY_DIFFERENTIALS__
    if (dx)
@ -191,7 +191,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
    }
    else {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
--- a/intern/cycles/kernel/geom/motion_curve.h
+++ b/intern/cycles/kernel/geom/motion_curve.h
@ -48,8 +48,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg,

    offset += step * numkeys;

-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
  }
 }

@ -106,10 +106,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg,

    offset += step * numkeys;

-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
-    keys[2] = kernel_tex_fetch(__attributes_float3, offset + k2);
-    keys[3] = kernel_tex_fetch(__attributes_float3, offset + k3);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
+    keys[2] = kernel_tex_fetch(__attributes_float4, offset + k2);
+    keys[3] = kernel_tex_fetch(__attributes_float4, offset + k3);
  }
 }

--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@ -43,9 +43,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 {
  if (step == numsteps) {
    /* center step: regular vertex location */
-    verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+    verts[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+    verts[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+    verts[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
  }
  else {
    /* center step not store in this array */
@ -54,9 +54,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,

    offset += step * numverts;

-    verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    verts[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    verts[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    verts[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
  }
 }

@ -70,9 +70,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
 {
  if (step == numsteps) {
    /* center step: regular vertex location */
-    normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
  }
  else {
    /* center step is not stored in this array */
@ -81,9 +81,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,

    offset += step * numverts;

-    normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
  }
 }

--- a/intern/cycles/kernel/geom/patch.h
+++ b/intern/cycles/kernel/geom/patch.h
@ -380,7 +380,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals kg,
    *dv = make_float3(0.0f, 0.0f, 0.0f);

  for (int i = 0; i < num_control; i++) {
-    float3 v = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + indices[i]));
+    float3 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);

    val += v * weights[i];
    if (du)
@ -417,7 +417,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals kg,
    *dv = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

  for (int i = 0; i < num_control; i++) {
-    float4 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
+    float4 v = kernel_tex_fetch(__attributes_float4, offset + indices[i]);

    val += v * weights[i];
    if (du)
--- a/intern/cycles/kernel/geom/primitive.h
+++ b/intern/cycles/kernel/geom/primitive.h
@ -284,18 +284,33 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
    int numverts, numkeys;
    object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);

-    /* lookup attributes */
-    motion_pre = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
-    desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE) ? numverts : numkeys;
-    motion_post = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
 #ifdef __HAIR__
-    if (is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
-      object_position_transform(kg, sd, &motion_pre);
-      object_position_transform(kg, sd, &motion_post);
+    if (is_curve_primitive) {
+      motion_pre = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+      desc.offset += numkeys;
+      motion_post = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+
+      /* Curve */
+      if ((sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+        object_position_transform(kg, sd, &motion_pre);
+        object_position_transform(kg, sd, &motion_post);
+      }
    }
+    else
 #endif
+        if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+      /* Triangle */
+      if (subd_triangle_patch(kg, sd) == ~0) {
+        motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+      else {
+        motion_pre = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+    }
  }

  /* object motion. note that depending on the mesh having motion vectors, this
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@ -443,8 +443,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
    if (dy)
      *dy = make_float3(0.0f, 0.0f, 0.0f);

-    return float4_to_float3(
-        kernel_tex_fetch(__attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch)));
+    return kernel_tex_fetch(__attributes_float3,
+                            desc.offset + subd_triangle_patch_face(kg, patch));
  }
  else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
    float2 uv[3];
@ -452,10 +452,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

    uint4 v = subd_triangle_patch_indices(kg, patch);

-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.x));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.y));
-    float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.z));
-    float3 f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.w));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
+    float3 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
+    float3 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@ -484,10 +484,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

    float3 f0, f1, f2, f3;

-    f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
-    f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
-    f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
-    f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
+    f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
+    f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
+    f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
+    f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@ -513,7 +513,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
    if (dy)
      *dy = make_float3(0.0f, 0.0f, 0.0f);

-    return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+    return kernel_tex_fetch(__attributes_float3, desc.offset);
  }
  else {
    if (dx)
@ -590,7 +590,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
    if (dy)
      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

-    return kernel_tex_fetch(__attributes_float3,
+    return kernel_tex_fetch(__attributes_float4,
                            desc.offset + subd_triangle_patch_face(kg, patch));
  }
  else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
@ -599,10 +599,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,

    uint4 v = subd_triangle_patch_indices(kg, patch);

-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
-    float4 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
-    float4 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + v.x);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + v.y);
+    float4 f2 = kernel_tex_fetch(__attributes_float4, desc.offset + v.z);
+    float4 f3 = kernel_tex_fetch(__attributes_float4, desc.offset + v.w);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@ -642,10 +642,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
          color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset)));
    }
    else {
-      f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
-      f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
-      f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
-      f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
+      f0 = kernel_tex_fetch(__attributes_float4, corners[0] + desc.offset);
+      f1 = kernel_tex_fetch(__attributes_float4, corners[1] + desc.offset);
+      f2 = kernel_tex_fetch(__attributes_float4, corners[2] + desc.offset);
+      f3 = kernel_tex_fetch(__attributes_float4, corners[3] + desc.offset);
    }

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
@ -672,7 +672,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
    if (dy)
      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
  }
  else {
    if (dx)
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@ -29,9 +29,9 @@ ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderDat
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-  const float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);

  /* return normal */
  if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
@ -54,9 +54,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
  /* compute point */
  float t = 1.0f - u - v;
  *P = (u * v0 + v * v1 + t * v2);
@ -78,9 +78,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3])
 {
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
 }

 /* Triangle vertex locations and vertex normals */
@ -91,12 +91,12 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg,
                                                     float3 N[3])
 {
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
-  N[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  N[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  N[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  N[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  N[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  N[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
 }

 /* Interpolate smooth vertex normal from vertices */
@ -106,9 +106,9 @@ triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);

  float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1);

@ -120,9 +120,9 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);

  /* ensure that the normals are in object space */
  if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
@ -145,9 +145,9 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
 {
  /* fetch triangle vertex coordinates */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  const float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 p0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 p1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 p2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);

  /* compute derivatives of P w.r.t. uv */
  *dPdu = (p0 - p2);
@ -267,15 +267,15 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
+      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
    }
    else {
      const int tri = desc.offset + sd->prim * 3;
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+      f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
+      f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
+      f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
    }

 #ifdef __RAY_DIFFERENTIALS__
@ -298,7 +298,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                               desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
    }
    else {
      return make_float3(0.0f, 0.0f, 0.0f);
@ -318,16 +318,16 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+      f0 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.z);
    }
    else {
      const int tri = desc.offset + sd->prim * 3;
      if (desc.element == ATTR_ELEMENT_CORNER) {
-        f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
-        f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
-        f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
+        f0 = kernel_tex_fetch(__attributes_float4, tri + 0);
+        f1 = kernel_tex_fetch(__attributes_float4, tri + 1);
+        f2 = kernel_tex_fetch(__attributes_float4, tri + 2);
      }
      else {
        f0 = color_srgb_to_linear_v4(
@ -359,7 +359,7 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                               desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
    }
    else {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@ -40,7 +40,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
 #if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
 #else
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
 #endif
@ -51,9 +51,9 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
 #if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
                             ssef_verts,
 #else
-                             float4_to_float3(tri_a),
-                             float4_to_float3(tri_b),
-                             float4_to_float3(tri_c),
+                             tri_a,
+                             tri_b,
+                             tri_c,
 #endif
                             &u,
                             &v,
@ -109,9 +109,9 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
 #  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
 #  else
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
 #  endif
  float t, u, v;
  if (!ray_triangle_intersect(P,
@ -179,9 +179,9 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,

  /* Record geometric normal. */
 #  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
 #  endif
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

@ -223,9 +223,9 @@ ccl_device_inline float3 triangle_refine(KernelGlobals kg,
  P = P + D * t;

  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
@ -280,9 +280,9 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals kg,

 #  ifdef __INTERSECTION_REFINE__
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@ -75,7 +75,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
                                          const AttributeDescriptor desc)
 {
  if (desc.element & (ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
  }
  else if (desc.element == ATTR_ELEMENT_VOXEL) {
    /* todo: optimize this so we don't have to transform both here and in
--- a/intern/cycles/kernel/textures.h
+++ b/intern/cycles/kernel/textures.h
@ -40,11 +40,11 @@ KERNEL_TEX(DecomposedTransform, __camera_motion)

 /* triangles */
 KERNEL_TEX(uint, __tri_shader)
-KERNEL_TEX(float4, __tri_vnormal)
+KERNEL_TEX(packed_float3, __tri_vnormal)
 KERNEL_TEX(uint4, __tri_vindex)
 KERNEL_TEX(uint, __tri_patch)
 KERNEL_TEX(float2, __tri_patch_uv)
-KERNEL_TEX(float4, __tri_verts)
+KERNEL_TEX(packed_float3, __tri_verts)

 /* curves */
 KERNEL_TEX(KernelCurve, __curves)
@ -58,7 +58,8 @@ KERNEL_TEX(uint, __patches)
 KERNEL_TEX(uint4, __attributes_map)
 KERNEL_TEX(float, __attributes_float)
 KERNEL_TEX(float2, __attributes_float2)
-KERNEL_TEX(float4, __attributes_float3)
+KERNEL_TEX(packed_float3, __attributes_float3)
+KERNEL_TEX(float4, __attributes_float4)
 KERNEL_TEX(uchar4, __attributes_uchar4)

 /* lights */
--- a/intern/cycles/scene/attribute.cpp
+++ b/intern/cycles/scene/attribute.cpp
@ -404,6 +404,10 @@ AttrKernelDataType Attribute::kernel_type(const Attribute &attr)
    return AttrKernelDataType::FLOAT2;
  }

+  if (attr.type == TypeFloat4 || attr.type == TypeRGBA || attr.type == TypeDesc::TypeMatrix) {
+    return AttrKernelDataType::FLOAT4;
+  }
+
  return AttrKernelDataType::FLOAT3;
 }

@ -585,7 +589,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
        attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE);
        break;
      case ATTR_STD_MOTION_VERTEX_POSITION:
-        attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE_KEY_MOTION);
+        attr = add(name, TypeDesc::TypeFloat4, ATTR_ELEMENT_CURVE_KEY_MOTION);
        break;
      case ATTR_STD_CURVE_INTERCEPT:
        attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY);
--- a/intern/cycles/scene/attribute.h
+++ b/intern/cycles/scene/attribute.h
@ -47,12 +47,7 @@ struct Transform;
 *
 * The values of this enumeration are also used as flags to detect changes in AttributeSet. */

-enum AttrKernelDataType {
-  FLOAT = 0,
-  FLOAT2 = 1,
-  FLOAT3 = 2,
-  UCHAR4 = 3,
-};
+enum AttrKernelDataType { FLOAT = 0, FLOAT2 = 1, FLOAT3 = 2, FLOAT4 = 3, UCHAR4 = 4, NUM = 5 };

 /* Attribute
 *
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@ -551,6 +551,7 @@ static void update_attribute_element_size(Geometry *geom,
                                          size_t *attr_float_size,
                                          size_t *attr_float2_size,
                                          size_t *attr_float3_size,
+                                          size_t *attr_float4_size,
                                          size_t *attr_uchar4_size)
 {
  if (mattr) {
@ -569,7 +570,10 @@ static void update_attribute_element_size(Geometry *geom,
      *attr_float2_size += size;
    }
    else if (mattr->type == TypeDesc::TypeMatrix) {
-      *attr_float3_size += size * 4;
+      *attr_float4_size += size * 4;
+    }
+    else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) {
+      *attr_float4_size += size;
    }
    else {
      *attr_float3_size += size;
@ -582,8 +586,10 @@ void GeometryManager::update_attribute_element_offset(Geometry *geom,
                                                      size_t &attr_float_offset,
                                                      device_vector<float2> &attr_float2,
                                                      size_t &attr_float2_offset,
-                                                      device_vector<float4> &attr_float3,
+                                                      device_vector<packed_float3> &attr_float3,
                                                      size_t &attr_float3_offset,
+                                                      device_vector<float4> &attr_float4,
+                                                      size_t &attr_float4_offset,
                                                      device_vector<uchar4> &attr_uchar4,
                                                      size_t &attr_uchar4_offset,
                                                      Attribute *mattr,
@ -646,18 +652,30 @@ void GeometryManager::update_attribute_element_offset(Geometry *geom,
    }
    else if (mattr->type == TypeDesc::TypeMatrix) {
      Transform *tfm = mattr->data_transform();
-      offset = attr_float3_offset;
+      offset = attr_float4_offset;

-      assert(attr_float3.size() >= offset + size * 3);
+      assert(attr_float4.size() >= offset + size * 3);
      if (mattr->modified) {
        for (size_t k = 0; k < size * 3; k++) {
-          attr_float3[offset + k] = (&tfm->x)[k];
+          attr_float4[offset + k] = (&tfm->x)[k];
        }
      }
-      attr_float3_offset += size * 3;
+      attr_float4_offset += size * 3;
+    }
+    else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) {
+      float4 *data = mattr->data_float4();
+      offset = attr_float4_offset;
+
+      assert(attr_float4.size() >= offset + size);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size; k++) {
+          attr_float4[offset + k] = data[k];
+        }
+      }
+      attr_float4_offset += size;
    }
    else {
-      float4 *data = mattr->data_float4();
+      float3 *data = mattr->data_float3();
      offset = attr_float3_offset;

      assert(attr_float3.size() >= offset + size);
@ -783,6 +801,7 @@ void GeometryManager::device_update_attributes(Device *device,
  size_t attr_float_size = 0;
  size_t attr_float2_size = 0;
  size_t attr_float3_size = 0;
+  size_t attr_float4_size = 0;
  size_t attr_uchar4_size = 0;

  for (size_t i = 0; i < scene->geometry.size(); i++) {
@ -797,6 +816,7 @@ void GeometryManager::device_update_attributes(Device *device,
                                    &attr_float_size,
                                    &attr_float2_size,
                                    &attr_float3_size,
+                                    &attr_float4_size,
                                    &attr_uchar4_size);

      if (geom->is_mesh()) {
@ -809,6 +829,7 @@ void GeometryManager::device_update_attributes(Device *device,
                                      &attr_float_size,
                                      &attr_float2_size,
                                      &attr_float3_size,
+                                      &attr_float4_size,
                                      &attr_uchar4_size);
      }
    }
@ -824,6 +845,7 @@ void GeometryManager::device_update_attributes(Device *device,
                                    &attr_float_size,
                                    &attr_float2_size,
                                    &attr_float3_size,
+                                    &attr_float4_size,
                                    &attr_uchar4_size);
    }
  }
@ -831,19 +853,22 @@ void GeometryManager::device_update_attributes(Device *device,
  dscene->attributes_float.alloc(attr_float_size);
  dscene->attributes_float2.alloc(attr_float2_size);
  dscene->attributes_float3.alloc(attr_float3_size);
+  dscene->attributes_float4.alloc(attr_float4_size);
  dscene->attributes_uchar4.alloc(attr_uchar4_size);

  /* The order of those flags needs to match that of AttrKernelDataType. */
-  const bool attributes_need_realloc[4] = {
+  const bool attributes_need_realloc[AttrKernelDataType::NUM] = {
      dscene->attributes_float.need_realloc(),
      dscene->attributes_float2.need_realloc(),
      dscene->attributes_float3.need_realloc(),
+      dscene->attributes_float4.need_realloc(),
      dscene->attributes_uchar4.need_realloc(),
  };

  size_t attr_float_offset = 0;
  size_t attr_float2_offset = 0;
  size_t attr_float3_offset = 0;
+  size_t attr_float4_offset = 0;
  size_t attr_uchar4_offset = 0;

  /* Fill in attributes. */
@ -868,6 +893,8 @@ void GeometryManager::device_update_attributes(Device *device,
                                      attr_float2_offset,
                                      dscene->attributes_float3,
                                      attr_float3_offset,
+                                      dscene->attributes_float4,
+                                      attr_float4_offset,
                                      dscene->attributes_uchar4,
                                      attr_uchar4_offset,
                                      attr,
@ -891,6 +918,8 @@ void GeometryManager::device_update_attributes(Device *device,
                                        attr_float2_offset,
                                        dscene->attributes_float3,
                                        attr_float3_offset,
+                                        dscene->attributes_float4,
+                                        attr_float4_offset,
                                        dscene->attributes_uchar4,
                                        attr_uchar4_offset,
                                        subd_attr,
@ -923,6 +952,8 @@ void GeometryManager::device_update_attributes(Device *device,
                                      attr_float2_offset,
                                      dscene->attributes_float3,
                                      attr_float3_offset,
+                                      dscene->attributes_float4,
+                                      attr_float4_offset,
                                      dscene->attributes_uchar4,
                                      attr_uchar4_offset,
                                      attr,
@ -954,6 +985,7 @@ void GeometryManager::device_update_attributes(Device *device,
  dscene->attributes_float.copy_to_device_if_modified();
  dscene->attributes_float2.copy_to_device_if_modified();
  dscene->attributes_float3.copy_to_device_if_modified();
+  dscene->attributes_float4.copy_to_device_if_modified();
  dscene->attributes_uchar4.copy_to_device_if_modified();

  if (progress.get_cancel())
@ -1080,9 +1112,9 @@ void GeometryManager::device_update_mesh(Device *,
    /* normals */
    progress.set_status("Updating Mesh", "Computing normals");

-    float4 *tri_verts = dscene->tri_verts.alloc(tri_size * 3);
+    packed_float3 *tri_verts = dscene->tri_verts.alloc(tri_size * 3);
    uint *tri_shader = dscene->tri_shader.alloc(tri_size);
-    float4 *vnormal = dscene->tri_vnormal.alloc(vert_size);
+    packed_float3 *vnormal = dscene->tri_vnormal.alloc(vert_size);
    uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
    uint *tri_patch = dscene->tri_patch.alloc(tri_size);
    float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
@ -1293,18 +1325,21 @@ enum {
  ATTR_FLOAT_MODIFIED = (1 << 2),
  ATTR_FLOAT2_MODIFIED = (1 << 3),
  ATTR_FLOAT3_MODIFIED = (1 << 4),
-  ATTR_UCHAR4_MODIFIED = (1 << 5),
+  ATTR_FLOAT4_MODIFIED = (1 << 5),
+  ATTR_UCHAR4_MODIFIED = (1 << 6),

-  CURVE_DATA_NEED_REALLOC = (1 << 6),
-  MESH_DATA_NEED_REALLOC = (1 << 7),
+  CURVE_DATA_NEED_REALLOC = (1 << 7),
+  MESH_DATA_NEED_REALLOC = (1 << 8),

-  ATTR_FLOAT_NEEDS_REALLOC = (1 << 8),
-  ATTR_FLOAT2_NEEDS_REALLOC = (1 << 9),
-  ATTR_FLOAT3_NEEDS_REALLOC = (1 << 10),
-  ATTR_UCHAR4_NEEDS_REALLOC = (1 << 11),
+  ATTR_FLOAT_NEEDS_REALLOC = (1 << 9),
+  ATTR_FLOAT2_NEEDS_REALLOC = (1 << 10),
+  ATTR_FLOAT3_NEEDS_REALLOC = (1 << 11),
+  ATTR_FLOAT4_NEEDS_REALLOC = (1 << 12),
+  ATTR_UCHAR4_NEEDS_REALLOC = (1 << 13),

  ATTRS_NEED_REALLOC = (ATTR_FLOAT_NEEDS_REALLOC | ATTR_FLOAT2_NEEDS_REALLOC |
-                        ATTR_FLOAT3_NEEDS_REALLOC | ATTR_UCHAR4_NEEDS_REALLOC),
+                        ATTR_FLOAT3_NEEDS_REALLOC | ATTR_FLOAT4_NEEDS_REALLOC |
+                        ATTR_UCHAR4_NEEDS_REALLOC),
  DEVICE_MESH_DATA_NEEDS_REALLOC = (MESH_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC),
  DEVICE_CURVE_DATA_NEEDS_REALLOC = (CURVE_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC),
 };
@ -1332,10 +1367,17 @@ static void update_device_flags_attribute(uint32_t &device_update_flags,
        device_update_flags |= ATTR_FLOAT3_MODIFIED;
        break;
      }
+      case AttrKernelDataType::FLOAT4: {
+        device_update_flags |= ATTR_FLOAT4_MODIFIED;
+        break;
+      }
      case AttrKernelDataType::UCHAR4: {
        device_update_flags |= ATTR_UCHAR4_MODIFIED;
        break;
      }
+      case AttrKernelDataType::NUM: {
+        break;
+      }
    }
  }
 }
@ -1352,6 +1394,9 @@ static void update_attribute_realloc_flags(uint32_t &device_update_flags,
  if (attributes.modified(AttrKernelDataType::FLOAT3)) {
    device_update_flags |= ATTR_FLOAT3_NEEDS_REALLOC;
  }
+  if (attributes.modified(AttrKernelDataType::FLOAT4)) {
+    device_update_flags |= ATTR_FLOAT4_NEEDS_REALLOC;
+  }
  if (attributes.modified(AttrKernelDataType::UCHAR4)) {
    device_update_flags |= ATTR_UCHAR4_NEEDS_REALLOC;
  }
@ -1553,6 +1598,14 @@ void GeometryManager::device_update_preprocess(Device *device, Scene *scene, Pro
    dscene->attributes_float3.tag_modified();
  }

+  if (device_update_flags & ATTR_FLOAT4_NEEDS_REALLOC) {
+    dscene->attributes_map.tag_realloc();
+    dscene->attributes_float4.tag_realloc();
+  }
+  else if (device_update_flags & ATTR_FLOAT4_MODIFIED) {
+    dscene->attributes_float4.tag_modified();
+  }
+
  if (device_update_flags & ATTR_UCHAR4_NEEDS_REALLOC) {
    dscene->attributes_map.tag_realloc();
    dscene->attributes_uchar4.tag_realloc();
@ -2014,6 +2067,7 @@ void GeometryManager::device_update(Device *device,
  dscene->attributes_float.clear_modified();
  dscene->attributes_float2.clear_modified();
  dscene->attributes_float3.clear_modified();
+  dscene->attributes_float4.clear_modified();
  dscene->attributes_uchar4.clear_modified();
 }

@ -2041,6 +2095,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc
  dscene->attributes_float.free_if_need_realloc(force_free);
  dscene->attributes_float2.free_if_need_realloc(force_free);
  dscene->attributes_float3.free_if_need_realloc(force_free);
+  dscene->attributes_float4.free_if_need_realloc(force_free);
  dscene->attributes_uchar4.free_if_need_realloc(force_free);

  /* Signal for shaders like displacement not to do ray tracing. */
--- a/intern/cycles/scene/geometry.h
+++ b/intern/cycles/scene/geometry.h
@ -257,8 +257,10 @@ class GeometryManager {
                                              size_t &attr_float_offset,
                                              device_vector<float2> &attr_float2,
                                              size_t &attr_float2_offset,
-                                              device_vector<float4> &attr_float3,
+                                              device_vector<packed_float3> &attr_float3,
                                              size_t &attr_float3_offset,
+                                              device_vector<float4> &attr_float4,
+                                              size_t &attr_float4_offset,
                                              device_vector<uchar4> &attr_uchar4,
                                              size_t &attr_uchar4_offset,
                                              Attribute *mattr,
--- a/intern/cycles/scene/mesh.cpp
+++ b/intern/cycles/scene/mesh.cpp
@ -707,7 +707,7 @@ void Mesh::pack_shaders(Scene *scene, uint *tri_shader)
  }
 }

-void Mesh::pack_normals(float4 *vnormal)
+void Mesh::pack_normals(packed_float3 *vnormal)
 {
  Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL);
  if (attr_vN == NULL) {
@ -727,11 +727,14 @@ void Mesh::pack_normals(float4 *vnormal)
    if (do_transform)
      vNi = safe_normalize(transform_direction(&ntfm, vNi));

-    vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f);
+    vnormal[i] = make_float3(vNi.x, vNi.y, vNi.z);
  }
 }

-void Mesh::pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, float2 *tri_patch_uv)
+void Mesh::pack_verts(packed_float3 *tri_verts,
+                      uint4 *tri_vindex,
+                      uint *tri_patch,
+                      float2 *tri_patch_uv)
 {
  size_t verts_size = verts.size();

@ -752,9 +755,9 @@ void Mesh::pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, flo

    tri_patch[i] = (!get_num_subd_faces()) ? -1 : (triangle_patch[i] * 8 + patch_offset);

-    tri_verts[i * 3] = float3_to_float4(verts[t.v[0]]);
-    tri_verts[i * 3 + 1] = float3_to_float4(verts[t.v[1]]);
-    tri_verts[i * 3 + 2] = float3_to_float4(verts[t.v[2]]);
+    tri_verts[i * 3] = verts[t.v[0]];
+    tri_verts[i * 3 + 1] = verts[t.v[1]];
+    tri_verts[i * 3 + 2] = verts[t.v[2]];
  }
 }

--- a/intern/cycles/scene/mesh.h
+++ b/intern/cycles/scene/mesh.h
@ -223,8 +223,11 @@ class Mesh : public Geometry {
  void get_uv_tiles(ustring map, unordered_set<int> &tiles) override;

  void pack_shaders(Scene *scene, uint *shader);
-  void pack_normals(float4 *vnormal);
-  void pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, float2 *tri_patch_uv);
+  void pack_normals(packed_float3 *vnormal);
+  void pack_verts(packed_float3 *tri_verts,
+                  uint4 *tri_vindex,
+                  uint *tri_patch,
+                  float2 *tri_patch_uv);
  void pack_patches(uint *patch_data);

  PrimitiveType primitive_type() const override;
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@ -74,6 +74,7 @@ DeviceScene::DeviceScene(Device *device)
      attributes_float(device, "__attributes_float", MEM_GLOBAL),
      attributes_float2(device, "__attributes_float2", MEM_GLOBAL),
      attributes_float3(device, "__attributes_float3", MEM_GLOBAL),
+      attributes_float4(device, "__attributes_float4", MEM_GLOBAL),
      attributes_uchar4(device, "__attributes_uchar4", MEM_GLOBAL),
      light_distribution(device, "__light_distribution", MEM_GLOBAL),
      lights(device, "__lights", MEM_GLOBAL),
--- a/intern/cycles/scene/scene.h
+++ b/intern/cycles/scene/scene.h
@ -81,9 +81,9 @@ class DeviceScene {
  device_vector<float2> prim_time;

  /* mesh */
-  device_vector<float4> tri_verts;
+  device_vector<packed_float3> tri_verts;
  device_vector<uint> tri_shader;
-  device_vector<float4> tri_vnormal;
+  device_vector<packed_float3> tri_vnormal;
  device_vector<uint4> tri_vindex;
  device_vector<uint> tri_patch;
  device_vector<float2> tri_patch_uv;
@ -108,7 +108,8 @@ class DeviceScene {
  device_vector<uint4> attributes_map;
  device_vector<float> attributes_float;
  device_vector<float2> attributes_float2;
-  device_vector<float4> attributes_float3;
+  device_vector<packed_float3> attributes_float3;
+  device_vector<float4> attributes_float4;
  device_vector<uchar4> attributes_uchar4;

  /* lights */