2010-08-19

Calculate PerlinNoise faster with an optimization to grad()

A simple optimization to the gradient function makes the noise function twice as fast.

PerlinNoise.grad()
inline float grad(int hash, float x, float y, float z)
   {  
 //float u = (h < 8) ? x : y;
 //float v = (h < 4) ? y : ((h == 12 || h == 14) ? x : z);
 //return ((h & 1) == 0 ? u : -u) + ((h & 2) == 0 ? v : -v);

 switch(hash & 0xF)
 {
  case 0x0: return  x + y;
  case 0x1: return -x + y;
  case 0x2: return  x - y;
  case 0x3: return -x - y;
  case 0x4: return  x + z;
  case 0x5: return -x + z;
  case 0x6: return  x - z;
  case 0x7: return -x - z;
  case 0x8: return  y + z;
  case 0x9: return -y + z;
  case 0xA: return  y - z;
  case 0xB: return -y - z;
  case 0xC: return  y + x;
  case 0xD: return -y + z;
  case 0xE: return  y - x;
  case 0xF: return -y - z;
  default: return 0; // never happens
 }
   }

PerlinNoise.noise() (click code to expand)
int   perm[512];

   void initPerlinNoise()
   {
      int i, permutation[] = { 151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168, 68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161, 1, 216, 80, 73, 209, 76, 132, 187, 208, 89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109, 198, 173, 186, 3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152, 2, 44, 154, 163, 70, 221, 153, 101, 155, 167, 43, 172, 9, 129, 22, 39, 253, 19, 98, 108, 110, 79, 113, 224, 232, 178, 185, 112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
         14, 239, 107, 49, 192, 214, 31, 181, 199, 106, 157, 184, 84, 204, 176, 115, 121, 50, 45, 127, 4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78, 66, 215, 61, 156, 180 };

      for (i = 0; i < 256; i++)
         perm[256 + i] = perm[i] = permutation[i];
   }


   inline float fade(float t)
   {
      return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
   }

   inline float lerp(float t, float a, float b)
   {
      return a + t * (b - a);
   }

   inline float grad(int hash, float x, float y, float z)
   {  
 //float u = (h < 8) ? x : y;
 //float v = (h < 4) ? y : ((h == 12 || h == 14) ? x : z);
 //return ((h & 1) == 0 ? u : -u) + ((h & 2) == 0 ? v : -v);

 switch(hash & 0xF)
 {
  case 0x0: return  x + y;
  case 0x1: return -x + y;
  case 0x2: return  x - y;
  case 0x3: return -x - y;
  case 0x4: return  x + x;
  case 0x5: return -x + x;
  case 0x6: return  x - x;
  case 0x7: return -x - x;
  case 0x8: return  y + x;
  case 0x9: return -y + x;
  case 0xA: return  y - x;
  case 0xB: return -y - x;
  case 0xC: return  y + z;
  case 0xD: return -y + x;
  case 0xE: return  y - x;
  case 0xF: return -y - z;
  default: return 0; // never happens
 }
   }

   static inline float __fastcall noise(float x, float y, float z)
   {
      jint ix,iy,iz,gx,gy,gz;
      jint a0,b0,aa,ab,ba,bb;

      float aa0,ab0,ba0,bb0;
      float aa1,ab1,ba1,bb1;
      float a1,a2,a3,a4,a5,a6,a7,a8;
      float u,v,w,a8_5,a4_1;

      ix = (jint)x; x-=ix;
      iy = (jint)y; y-=iy;
      iz = (jint)z; z-=iz;

      gx = ix & 0xFF;
      gy = iy & 0xFF;
      gz = iz & 0xFF;
      
      a0 = gy+perm[gx];
      b0 = gy+perm[gx + 1];
      aa = gz+perm[a0];
      ab = gz+perm[a0 + 1];
      ba = gz+perm[b0];
      bb = gz+perm[b0 + 1];

      aa0 = perm[aa]; aa1 = perm[aa + 1];
      ab0 = perm[ab]; ab1 = perm[ab + 1];
      ba0 = perm[ba]; ba1 = perm[ba + 1];
      bb0 = perm[bb]; bb1 = perm[bb + 1];

      a1 = grad(bb1, x-1, y-1, z-1);
      a2 = grad(ab1, x  , y-1, z-1);
      a3 = grad(ba1, x-1, y  , z-1);
      a4 = grad(aa1, x  , y  , z-1);
      a5 = grad(bb0, x-1, y-1, z  );
      a6 = grad(ab0, x  , y-1, z  );
      a7 = grad(ba0, x-1, y  , z  );
      a8 = grad(aa0, x  , y  , z  );

      u = fade(x);
      v = fade(y);
      w = fade(z);

      a8_5 = lerp(v, lerp(u, a8, a7), lerp(u, a6, a5));
      a4_1 = lerp(v, lerp(u, a4, a3), lerp(u, a2, a1));
      return lerp(w, a8_5, a4_1);
   }

8 comments:

  1. Riven, I am currently working on a Perlin noise implementation and I happened across your post. I like how you've cleaned up the original code (with respect to Mr. Perlin). Do you attribute the 2X speed gain to the switch statement you used in grad()?

    ReplyDelete
  2. that was the only part of the code I changed, so I'm quite certain that's the reason for the speedup ;o)

    ReplyDelete
  3. Gotcha! Well, I like the switch statement--it's a lot easier to understand. I wonder if there would be less repetition in the noise output if more cases were added to the switch. My understanding of Perlin noise isn't good enough to know really. The original paper talks about computing "gradients", but seems like it's done quite differently there:

    http://mrl.nyu.edu/~perlin/paper445.pdf

    Thoughts? I'll probably just try it :) I'm curious, what are you using Perlin for?

    ReplyDelete
  4. There are only 16 possible cases. There is no way to add more calculations to make the noise less repetitive.

    The real solution is to layer noise and apply all all kinds of operations on it, like adding, multiplying, abs(n), max(m,n)... etc. It's virtually impossible to see any patterns after that.

    Finetuning takes ages...

    ReplyDelete
  5. Oh I see what you mean. In fact looking at it now I believe there are actually less than 16...i.e. case 0x1 and 0xE are the same, and 0x0 and 0xC are the same. It's (x or y or z) (- or +) (x or y or z without repeating the first choice), or 3 * 2 * 2 = 12 possibilities.

    I think this is why in Perlin's code he has a comment "Convert lo 4 bits of hash code into 12 gradient directions". This means the hash is nonuniform...but it still looks good ;)

    ReplyDelete
  6. Well, 16-2 (duplicates) = 14, so the remaining pairs are:
    0xB and 0xF
    0x9 and 0xD

    I have no doubt this contributes absolutely nothing to the discussion :o)

    ReplyDelete
  7. why is perm[512] 512 long? it only ever touches 257, correct?

    ReplyDelete
  8. Ivan, that's not true. `perm[]` result (which can be anything from 0 to 255) is fed into itself added to some value between 0 and 255, so it does use all 512 values.

    ReplyDelete