Skip L1 cache when loading nodes.

Node access is very irregular as each thread descends the BVH tree. Each node is only 16 bytes, so the 128 byte cache line size in the L1 cache means that a lot of useless data is often fetched. Using some embedded PTX, we can force the L1 cache to be skipped, going directly to L2. The L2 cache line is 32 bytes long, which means that both children in a binary tree will be cached at the same time. This improves the speed on the default generated binary trees, but does not help an optimized tree yet.
author: Stan Seibert <stan@mtrr.org> 2012-02-06 11:15:57 -0500
committer: tlatorre <tlatorre@uchicago.edu> 2021-05-09 08:42:38 -0700
commit: f1f102beae06f098e74073492b824da237f89fab (patch)
tree: 45278a1590b2ffc68a8469748ccdd49f05ae90e3
parent: 82c5b61619cc707d907bc598b7b82633d0059934 (diff)
download: chroma-f1f102beae06f098e74073492b824da237f89fab.tar.gz
chroma-f1f102beae06f098e74073492b824da237f89fab.tar.bz2
chroma-f1f102beae06f098e74073492b824da237f89fab.zip
1 files changed, 10 insertions, 1 deletions
diff --git a/chroma/cuda/geometry.h b/chroma/cuda/geometry.h
index 86ced6c..577da72 100644
--- a/chroma/cuda/geometry.h
+++ b/chroma/cuda/geometry.h
@@ -13,10 +13,19 @@ to_float3(const uint3 &a)
   return make_float3(a.x, a.y, a.z);
 }
 
+__device__ uint4 read_skip_l1(uint4 *ptr)
+{
+  uint4 val;
+  asm(" ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];"
+      : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) 
+      : "l"(ptr) : "memory");
+  return val;
+}
+
 __device__ Node
 get_node(Geometry *geometry, const unsigned int &i)
 {
-    uint4 node = geometry->nodes[i];
+    uint4 node = read_skip_l1(geometry->nodes + i);
     Node node_struct;
 
     if (node.x == 0) {
author	Stan Seibert <stan@mtrr.org>	2012-02-06 11:15:57 -0500
committer	tlatorre <tlatorre@uchicago.edu>	2021-05-09 08:42:38 -0700
commit	f1f102beae06f098e74073492b824da237f89fab (patch)
tree	45278a1590b2ffc68a8469748ccdd49f05ae90e3
parent	82c5b61619cc707d907bc598b7b82633d0059934 (diff)
download	chroma-f1f102beae06f098e74073492b824da237f89fab.tar.gz chroma-f1f102beae06f098e74073492b824da237f89fab.tar.bz2 chroma-f1f102beae06f098e74073492b824da237f89fab.zip