# Instruction Level Parallelism

Original Author: Luke Hutchinson

I remember a long time ago reading Michael Abrash’s Ramblings in Real Time articles in DDJ, 15 years back when they were working on Quake, where he explained how they made use of the expensive floating point divide on the original Pentium (DDJ, October 1996).

Here is the code again. This time each line has been commented with a clock cycle and also a letter. The letters are simply to make referring to individual lines simpler in the explanation below.

```
// Baseline PPU 4x4 matrix * vector.  The trailing comment on each line gives
// the cycle on which the instruction can issue (vec_vmaddfp's 12-cycle
// latency serializes the dependent chain F->G->H->I), plus a letter used to
// refer to the line in the text below.
inline vec_float4 mat4_mul_vec4( const mat4 & m, vec_float4 v ){

vec_float4 zero;

vec_float4 ret;

vec_float4 x, y, z, w;

zero = (vec_float4) vec_vspltisw( 0 );    // -1       [A]

x = vec_vspltw( v, 0 );                   //  0       [B]

y = vec_vspltw( v, 1 );                   //  1       [C]

z = vec_vspltw( v, 2 );                   //  2       [D]

w = vec_vspltw( v, 3 );                   //  3       [E]

ret = vec_vmaddfp( m.col0, x, zero );     //  4       [F]

ret = vec_vmaddfp( m.col1, y, ret );      // 16       [G]

ret = vec_vmaddfp( m.col2, z, ret );      // 28       [H]

ret = vec_vmaddfp( m.col3, w, ret );      // 40       [I]

return ret;                               // 52       [J]

}

```

WTF does cycle -1 mean? Have we built a time machine? No, but I’ll get to that in a minute.

On cycles 0-3, we execute the vec_vspltws [B]-[E]. On cycle 4, the results of instructions [A] and [B] are ready so we can use them as inputs to the vec_vmaddfp [F]. [F] has a 12 cycle latency, so the next vec_vmaddfp at [G] is blocked until cycle 16, and so on.

So what’s the go with the magic -1? [A] does not depend on either input argument m or v, so depending on how mat4_mul_vec4() is used, it may not be important. If, as is often the case, m and/or v are calculated just before calling mat4_mul_vec4(), then [A] can generally be slotted in somewhere for free.

For example,

```
inline vec_float foo( mat4 m, vec_float4 v0, vec_float4 v1 ){

return mat4_mul_vec4( m, vec_vaddfp( v0, v1 ) );

}

```

Here the vec_vspltisw can be scheduled in the latency window between vec_vaddfp and the first vec_vspltw.

So how do we do better than the first implementation ?

```
// Latency-optimized version: the single dependent chain is split into two
// independent chains (t0 and t1) so the second vec_vmaddfp of each pair can
// issue one cycle after the first instead of 12 cycles later.  The extra
// vec_vaddfp merging the halves still leaves the critical path at 41+1
// cycles vs 52+1 for the baseline.
inline vec_float4 mat4_mul_vec4( const mat4 & m, vec_float4 v ){

vec_float4 zero;

vec_float4 ret, t0, t1;

vec_float4 x, y, z, w;

zero = (vec_float4) vec_vspltisw( 0 );    // -1       [A]

x = vec_vspltw( v, 0 );                   //  0       [B]

z = vec_vspltw( v, 2 );                   //  1       [C]

y = vec_vspltw( v, 1 );                   //  2       [D]

w = vec_vspltw( v, 3 );                   //  3       [E]

t0 = vec_vmaddfp( m.col0, x, zero );      //  4       [F]

t1 = vec_vmaddfp( m.col2, z, zero );      //  5       [G]

t0 = vec_vmaddfp( m.col1, y, t0 );        //  16      [H]

t1 = vec_vmaddfp( m.col3, w, t1 );        //  17      [I]

ret = vec_vaddfp( t0, t1 );               //  29      [J]

return ret;                               //  41      [K]

}

```

We’ve got more instructions here, there’s an extra vec_vaddfp. If you were just looking at the intrinsics and considering them as little functions in your mental model, then this implementation would look like crack smoking. Yet this second implementation is 41+1 cycles vs 52+1 cycles, that’s more than 20% faster.

There is an underlying pattern to what we did here to optimize for latency: reducing dependency chains. This is really just the same as any other scheduling problem outside of computer science, be it project management, manufacturing production lines, etc.

Let’s look at the two versions again, this time with the instruction dependencies shown. The ASCII art here isn’t brilliant, but if you look at it for a second hopefully it makes sense. Where one intrinsic depends on the result of a previous, we’ve depicted this with an arc ending with an ‘o‘. The “critical path” is the longest dependency chain; this has been highlighted.

```
// Dependency diagram for the baseline version.  An arc ending in 'o' marks
// where an instruction consumes a prior result; '=' highlights the critical
// path (the longest dependency chain).
inline vec_float4 mat4_mul_vec4(

const mat4 & m,                            // -.

vec_float4 v ){                            // ===+

//  . |

vec_float4 zero;                          //  . |

vec_float4 ret;                           //  . |

vec_float4 x, y, z, w;                    //  . |

//  . |

zero = (vec_float4) vec_vspltisw( 0 );    // -.-|-.

//  . | .

x = vec_vspltw( v, 0 );                   // ===o===+

//  . . . |

y = vec_vspltw( v, 1 );                   // -.-o-.-|-.

//  . . . | .

z = vec_vspltw( v, 2 );                   // -.-o-.-|-.-.

//  . . . | . .

w = vec_vspltw( v, 3 );                   // -.-o-.-|-.-.-.

//  .   . | . . .

ret = vec_vmaddfp( m.col0, x, zero );     // =o===o=o========+

//  .       . . .  |

ret = vec_vmaddfp( m.col1, y, ret );      // =o=======o======o=+

//  .         . .    |

ret = vec_vmaddfp( m.col2, z, ret );      // =o=========o======o=+

//  .           .      |

ret = vec_vmaddfp( m.col3, w, ret );      // =o===========o======o=+

//                       |

return ret;                               // ======================o

}

```

In the original version, the critical path is

```
v

x = vec_vspltw( v, 0 );

ret = vec_vmaddfp( m.col0, x, zero );

ret = vec_vmaddfp( m.col1, y, ret );

ret = vec_vmaddfp( m.col2, z, ret );

ret = vec_vmaddfp( m.col3, w, ret );

return ret;

```

And the optimized version,

```
// Dependency diagram for the optimized version.  An arc ending in 'o' marks
// where an instruction consumes a prior result; '=' highlights the critical
// path, which is now one vec_vmaddfp shorter than in the baseline.
inline vec_float4 mat4_mul_vec4(

const mat4 & m,                            // -.

vec_float4 v ){                            // ===+

//  . |

vec_float4 zero;                          //  . |

vec_float4 ret, t0, t1;                   //  . |

vec_float4 x, y, z, w;                    //  . |

//  . |

zero = (vec_float4) vec_vspltisw( 0 );    // -.-|-.

//  . | .

x = vec_vspltw( v, 0 );                   // -.-o-.-.

//  . | . .

z = vec_vspltw( v, 2 );                   // ===o=====+

//  . . . . |

y = vec_vspltw( v, 1 );                   // -.-o-.-.-|-.

//  . . . . | .

w = vec_vspltw( v, 3 );                   // -.-o-.-.-|-.-.

//  .   . . | . .

t0 = vec_vmaddfp( m.col0, x, zero );      // -o---o-o-|-.-.-.

//  .   .   | . . .

t1 = vec_vmaddfp( m.col2, z, zero );      // =o===o===o=======+

//  .         . . . |

t0 = vec_vmaddfp( m.col1, y, t0 );        // -o---------o-.-.-|-.

//  .           . . | .

t1 = vec_vmaddfp( m.col3, w, t1 );        // =o===========o===o===+

//                    . |

ret = vec_vaddfp( t0, t1 );               // ===================o=o=+

//                        |

return ret;                               // =======================o

}

```

The critical path,

```
v

z = vec_vspltw( v, 2 );

t1 = vec_vmaddfp( m.col2, z, zero );

t1 = vec_vmaddfp( m.col3, w, t1 );

ret = vec_vaddfp( t0, t1 );

return ret;

```

Comparing the two versions this way makes the reason for the speed up clear.

Obviously commenting the dependency chains like this in your code is a bit unwieldy, but I find visualizing it this way is very helpful.

Earlier I said that we had just used PPC as an example, and that this even applied to SSE. Out-of-order execution does throw an extra complication into the mix, but at the end of the day, the CPU isn’t doing anything magic; it can never run faster than the longest dependency chain. But as assumptions are dangerous, and to make sure we haven’t gone all theoretical and missed some important practical issues, let’s check this.

These tests are run on a Core i7 Nehalem running Ubuntu GNU/Linux, and on a PS3 running Yellow Dog GNU/Linux. Each test displays the clock cycle count for the original ‘slower’ version, and the new ‘faster’ version, plus the percentage speed up the faster version achieved. For the PPU and SPU versions where we have predicted what the speed up should be, the measured results match up very closely (the SPU results are predicted in comments in the code). Due to the myriad of different SSE implementations, attempting to predict the exact speed up is less useful, but the end result is still a good one. The output of the tests is

```
sse

slower 0x000538f4

faster 0x000482a8

speed up 13.635%

ppu

slower 0x000032c7

faster 0x0000280a

speed up 21.148%

spu

slower 0x00001d4e

faster 0x0000186c

speed up 16.662%

```

main.c++

```
#include <stdio.h>

#include <string.h>

#if defined ( __PPU__ )

# include <altivec.h>

# include <vec_types.h>

typedef vec_float4 float4;

#elif defined ( __SPU__ )

# include <spu_intrinsics.h>

typedef qword float4;

#else

# include <xmmintrin.h>

typedef __m128 float4;

#endif

extern float4 latency( const float *m4data, const float *v4data );

int main( int argc, char **argv ){

    // 16-byte alignment is required for the SIMD loads performed in
    // latency().  The matrix is the identity, the vector is (2,3,4,5).
    static const float m4data[16] __attribute__((aligned(16))) = {
        1.f, 0.f, 0.f, 0.f,
        0.f, 1.f, 0.f, 0.f,
        0.f, 0.f, 1.f, 0.f,
        0.f, 0.f, 0.f, 1.f,
    };

    static const float v4data[4] __attribute__((aligned(16))) = {
        2.f, 3.f, 4.f, 5.f,
    };

    // latency() prints its own results; its return value exists only to keep
    // the benchmarked computation from being optimized away.
    (void) latency( m4data, v4data );

    return 0;
}

```

latency_sse.c++

```
#include <stdint.h>

#include <stdio.h>

#include <xmmintrin.h>

// A 4x4 matrix stored as four column vectors, one __m128 per column.
struct mat4{

__m128 col0, col1, col2, col3;

};

// Slower version where all addps' are dependant on each other.
//
// Notice we are being careful here with temporary variables to make sure that
// the destination and one source for each intrinsic is the same.  Most SSE
// (but not AVX) instructions modify one of the source operands.  By
// explicitly copying v up front, GCC ((Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5)
// generates slightly better code.
static inline __m128 mat4_mul_vec4_slower( const mat4 & m, __m128 v ){

    __m128 t0 = v;
    __m128 t1 = v;
    __m128 t2 = v;
    __m128 t3 = v;

    // _mm_shuffle_ps selects element ((mask >> 2*i) & 3), so 0x00 splats x,
    // 0x55 splats y, 0xaa splats z and 0xff splats w.  The published listing
    // paired w with col0 and x with col3, which computes M * reverse(v); the
    // masks below restore the conventional col0*x + col1*y + col2*z + col3*w,
    // matching the PPU/SPU versions.  Timing behavior is unchanged.
    t0 = _mm_shuffle_ps( t0, t0, 0x00 );
    t1 = _mm_shuffle_ps( t1, t1, 0x55 );
    t2 = _mm_shuffle_ps( t2, t2, 0xaa );
    t3 = _mm_shuffle_ps( t3, t3, 0xff );

    t0 = _mm_mul_ps( t0, m.col0 );
    t1 = _mm_mul_ps( t1, m.col1 );
    t2 = _mm_mul_ps( t2, m.col2 );
    t3 = _mm_mul_ps( t3, m.col3 );

    // One serial chain of adds: each addps must wait for the previous one.
    t0 = _mm_add_ps( t0, t1 );
    t0 = _mm_add_ps( t0, t2 );
    t0 = _mm_add_ps( t0, t3 );

    return t0;
}

// Faster version with two addps' in parallel: the first two sums are
// independent, shortening the add chain's critical path by one addps latency.
static inline __m128 mat4_mul_vec4_faster( const mat4 & m, __m128 v ){

    __m128 t0 = v;
    __m128 t1 = v;
    __m128 t2 = v;
    __m128 t3 = v;

    // Masks corrected to match the slower version: 0x00 splats x, 0x55 splats
    // y, 0xaa splats z, 0xff splats w, so the product is col0*x + col1*y +
    // col2*z + col3*w as in the PPU/SPU implementations.
    t0 = _mm_shuffle_ps( t0, t0, 0x00 );
    t1 = _mm_shuffle_ps( t1, t1, 0x55 );
    t2 = _mm_shuffle_ps( t2, t2, 0xaa );
    t3 = _mm_shuffle_ps( t3, t3, 0xff );

    t0 = _mm_mul_ps( t0, m.col0 );
    t1 = _mm_mul_ps( t1, m.col1 );
    t2 = _mm_mul_ps( t2, m.col2 );
    t3 = _mm_mul_ps( t3, m.col3 );

    // The first two adds are independent and can execute in parallel.
    t0 = _mm_add_ps( t0, t1 );
    t2 = _mm_add_ps( t2, t3 );
    t0 = _mm_add_ps( t0, t2 );

    return t0;
}

// Read the 64-bit time stamp counter via rdtscp, which waits for prior
// instructions to retire before sampling the counter.  rdtscp also writes
// IA32_TSC_AUX to ECX, so that output must be captured (and is discarded).
// NOTE(review): subsequent instructions may still begin before the read;
// acceptable for this relative A/B timing comparison.
static inline uint64_t clock(){

uint32_t lo, hi, pid;

asm volatile( "rdtscp" : "=a"(lo), "=d"(hi), "=c"(pid) );

return ((uint64_t)hi<<32)|lo;

}

// Benchmark driver: times NUM_LOOPS serially-dependent calls of each variant
// and prints the raw TSC deltas plus the relative speed up.
__m128 latency( const float *m4data, const float *v4data ){

    // NOTE(review): the published listing had an empty initializer here
    // (zero matrix).  The aligned data supplied by main() and the matching
    // PPU/SPU versions strongly suggest the columns were meant to be loaded;
    // confirm against the original article source.
    mat4 m4={
        _mm_load_ps( m4data + 0 ),
        _mm_load_ps( m4data + 4 ),
        _mm_load_ps( m4data + 8 ),
        _mm_load_ps( m4data + 12 ),
    };

    __m128 v4 = _mm_load_ps( v4data );

    enum { NUM_LOOPS=10000 };

    // Profile slower code
    uint64_t t0;
    {
        // Call once before we read the time stamp counter, to get rid of
        // cache effects
        v4 = mat4_mul_vec4_slower( m4, v4 );

        t0 = clock();

        // Feeding v4 back in makes each iteration depend on the previous one,
        // so the loop measures latency rather than throughput.
        for ( unsigned i=0; i<NUM_LOOPS; ++i ){
            v4 = mat4_mul_vec4_slower( m4, v4 );
        }

        t0 = clock() - t0;
    }

    // Profile faster code
    uint64_t t1;
    {
        v4 = mat4_mul_vec4_faster( m4, v4 );

        t1 = clock();

        for ( unsigned i=0; i<NUM_LOOPS; ++i ){
            v4 = mat4_mul_vec4_faster( m4, v4 );
        }

        t1 = clock() - t1;
    }

    // "\n" escapes restored -- the published listing lost its backslashes.
    printf( "slower 0x%08x\nfaster 0x%08x\nspeed up %.3f%%\n",
            (unsigned)t0, (unsigned)t1, 100.f*(float)((int64_t)(t0-t1))/(float)t0 );

    // Return final result so the calculations cannot be optimized out
    return v4;
}

```

latency_ppu.c++

```
#include <altivec.h>

#include <ppu_intrinsics.h>

#include <stdint.h>

#include <stdio.h>

#include <vec_types.h>

// A 4x4 matrix stored as four column vectors, one vec_float4 per column.
struct mat4{

vec_float4 col0, col1, col2, col3;

};

// Slower version where all vmaddfp's are dependant on each other.  Note that

// passing m by reference here is actually important.  ppu-g++ (4.1.1 from

// Barcelona Supercomputing Center) generates significantly worse code when

// passed by value.

// Trailing numbers are the predicted issue cycle of each instruction: the
// four vmaddfp's form one dependent chain, each stalling 12 cycles on the
// previous result, for a 52(+1 for the return) cycle critical path.
static inline vec_float4 mat4_mul_vec4_slower( const mat4 & m, vec_float4 v ){

vec_float4 zero;

vec_float4 ret;

vec_float4 x, y, z, w;

zero = (vec_float4) vec_vspltisw( 0 );    // -1

x = vec_vspltw( v, 0 );                   //  0

y = vec_vspltw( v, 1 );                   //  1

z = vec_vspltw( v, 2 );                   //  2

w = vec_vspltw( v, 3 );                   //  3

ret = vec_vmaddfp( m.col0, x, zero );     //  4

ret = vec_vmaddfp( m.col1, y, ret );      //  16

ret = vec_vmaddfp( m.col2, z, ret );      //  28

ret = vec_vmaddfp( m.col3, w, ret );      //  40

return ret;                               //  52

}

// Faster version with two vmaddfp's in parallel.

// Splitting the accumulation into two independent chains (t0 and t1) lets
// each pair of vmaddfp's issue back to back; the extra vec_vaddfp merging
// the halves still yields a 41(+1) cycle critical path vs 52(+1) above.
static inline vec_float4 mat4_mul_vec4_faster( const mat4 & m, vec_float4 v ){

vec_float4 zero;

vec_float4 ret, t0, t1;

vec_float4 x, y, z, w;

zero = (vec_float4) vec_vspltisw( 0 );    // -1

x = vec_vspltw( v, 0 );                   //  0

z = vec_vspltw( v, 2 );                   //  1

y = vec_vspltw( v, 1 );                   //  2

w = vec_vspltw( v, 3 );                   //  3

t0 = vec_vmaddfp( m.col0, x, zero );      //  4

t1 = vec_vmaddfp( m.col2, z, zero );      //  5

t0 = vec_vmaddfp( m.col1, y, t0 );        //  16

t1 = vec_vmaddfp( m.col3, w, t1 );        //  17

ret = vec_vaddfp( t0, t1 );               //  29

return ret;                               //  41

}

// Read the low 32 bits of the PowerPC time base register.
//
// As a workaround for a bug in the CellBE (where overflow from least
// significant word to most significant word is not atomic), we simply discard
// the most significant word.  Our test is quick enough that we don't need to
// worry about the least significant word overflowing more than once.
static inline uint32_t clock(){

    register uint64_t tb;

    // "\n\t" restored: the published listing lost the backslash escapes in
    // the asm template ("mftb  %0nt" would not assemble).
    asm volatile( "mftb  %0\n\t" : "=r"(tb) );

    return (uint32_t) tb;
}

// Benchmark driver: times NUM_LOOPS serially-dependent calls of each variant
// and prints the raw time base deltas plus the relative speed up.
vec_float4 latency( const float *m4data, const float *v4data ){

    mat4 m4={
        vec_lvxl( 0,  m4data ),
        vec_lvxl( 16, m4data ),
        vec_lvxl( 32, m4data ),
        vec_lvxl( 48, m4data ),
    };

    vec_float4 v4 = vec_lvxl( 0, v4data );

    enum { NUM_LOOPS=10000 };

    // Profile slower code
    uint32_t t0;
    {
        // Call once before we read the time stamp counter, to get rid of
        // cache effects
        v4 = mat4_mul_vec4_slower( m4, v4 );

        t0 = clock();

        // Feeding v4 back in makes each iteration depend on the previous one,
        // so the loop measures latency rather than throughput.
        for ( unsigned i=0; i<NUM_LOOPS; ++i ){
            v4 = mat4_mul_vec4_slower( m4, v4 );
        }

        t0 = clock() - t0;
    }

    // Profile faster code
    uint32_t t1;
    {
        v4 = mat4_mul_vec4_faster( m4, v4 );

        t1 = clock();

        for ( unsigned i=0; i<NUM_LOOPS; ++i ){
            v4 = mat4_mul_vec4_faster( m4, v4 );
        }

        t1 = clock() - t1;
    }

    // "\n" escapes restored -- the published listing lost its backslashes.
    printf( "slower 0x%08x\nfaster 0x%08x\nspeed up %.3f%%\n",
            (unsigned)t0, (unsigned)t1, 100.f*(float)((int32_t)(t0-t1))/(float)t0 );

    // Return final result so the calculations cannot be optimized out
    return v4;
}

```

latency_spu.c++

```
#include <spu_intrinsics.h>

#include <stdint.h>

#include <stdio.h>

// A 4x4 matrix stored as four column vectors, one qword per column.
struct mat4{

qword col0, col1, col2, col3;

};

// Slower version where all fm/fma's are dependant on each other.

// Trailing numbers are the predicted issue cycle of each instruction.  The
// shuffle patterns splat one 4-byte element across the qword: si_ila fills
// each word with 0x00010203 (bytes of element 0), and the si_orbi's offset
// that to select elements 1, 2 and 3.  The fm/fma chain advances 6 cycles
// per instruction, giving a 28(+1 for the return) cycle critical path.
static inline qword mat4_mul_vec4_slower( const mat4 & m, qword v ){

qword shufAAAA, shufBBBB, shufCCCC, shufDDDD;

qword ret;

qword x, y, z, w;

shufAAAA = si_ila( 0x10203 );             // -5

shufBBBB = si_orbi( shufAAAA, 4 );        // -3

shufCCCC = si_orbi( shufAAAA, 8 );        // -2

shufDDDD = si_orbi( shufAAAA, 12 );       // -1

x = si_shufb( v, v, shufAAAA );           //  0

y = si_shufb( v, v, shufBBBB );           //  1

z = si_shufb( v, v, shufCCCC );           //  2

w = si_shufb( v, v, shufDDDD );           //  3

ret = si_fm ( m.col0, x );                //  4

ret = si_fma( m.col1, y, ret );           //  10

ret = si_fma( m.col2, z, ret );           //  16

ret = si_fma( m.col3, w, ret );           //  22

return ret;                               //  28

}

// Faster version with two fm's and two fma's in parallel.

// As in the PPU version, the accumulation is split into two independent
// chains (xy and zw) merged by a final si_fa, cutting the predicted critical
// path from 28(+1) to 23(+1) cycles.
static inline qword mat4_mul_vec4_faster( const mat4 & m, qword v ){

qword shufAAAA, shufBBBB, shufCCCC, shufDDDD;

qword ret;

qword x, y, z, w;

qword xy, zw;

shufAAAA = si_ila( 0x10203 );             // -5

shufCCCC = si_orbi( shufAAAA, 8 );        // -3

shufBBBB = si_orbi( shufAAAA, 4 );        // -2

shufDDDD = si_orbi( shufAAAA, 12 );       // -1

x = si_shufb( v, v, shufAAAA );           //  0

z = si_shufb( v, v, shufCCCC );           //  1

y = si_shufb( v, v, shufBBBB );           //  2

w = si_shufb( v, v, shufDDDD );           //  3

xy = si_fm( m.col0, x );                  //  4

zw = si_fm( m.col2, z );                  //  5

xy = si_fma( m.col1, y, xy );             // 10

zw = si_fma( m.col3, w, zw );             // 11

ret = si_fa( xy, zw );                    // 17

return ret;                               // 23

}

static inline uint32_t clock(){
    // NOTE(review): the body of this function was lost in the published
    // listing (an empty body here is undefined behavior for a non-void
    // function).  latency() initializes the SPU decrementer to 0xffffffff
    // and the decrementer counts DOWN, so negating the channel read yields
    // an up-counting clock whose differences are elapsed ticks -- confirm
    // against the original article source.
    return 0U - spu_readch( SPU_RdDec );
}

// Benchmark driver: times NUM_LOOPS serially-dependent calls of each variant
// and prints the raw decrementer deltas plus the relative speed up.  Unlike
// the SSE/PPU versions there is no warm-up call; the SPU works out of local
// store, so there are no cache effects to flush.
qword latency( const float *m4data, const float *v4data ){

    mat4 m4={
        si_lqd( si_from_ptr(m4data), 0  ),
        si_lqd( si_from_ptr(m4data), 16 ),
        si_lqd( si_from_ptr(m4data), 32 ),
        si_lqd( si_from_ptr(m4data), 48 )
    };

    qword v4 = si_lqd( si_from_ptr(v4data), 0 );

    enum { NUM_LOOPS=10000 };

    // Initialize decrementer to maximum value
    spu_writech( SPU_WrDec, 0xffffffff );

    // Profile slower code
    uint32_t t0;
    {
        t0 = clock();

        // Feeding v4 back in makes each iteration depend on the previous one,
        // so the loop measures latency rather than throughput.
        for ( unsigned i=0; i<NUM_LOOPS; ++i ){
            v4 = mat4_mul_vec4_slower( m4, v4 );
        }

        t0 = clock() - t0;
    }

    // Profile faster code
    uint32_t t1;
    {
        t1 = clock();

        for ( unsigned i=0; i<NUM_LOOPS; ++i ){
            v4 = mat4_mul_vec4_faster( m4, v4 );
        }

        t1 = clock() - t1;
    }

    // "\n" escapes restored -- the published listing lost its backslashes.
    printf( "slower 0x%08x\nfaster 0x%08x\nspeed up %.3f%%\n",
            (unsigned)t0, (unsigned)t1, 100.f*(float)((int32_t)(t0-t1))/(float)t0 );

    // Return final result so the calculations cannot be optimized out
    return v4;
}

```

build.sh

```
#! /bin/sh

set -e

COMMON_CXXFLAGS="-O3 -Wall -Werror -pedantic"

g++     ${COMMON_CXXFLAGS}           main.c++ latency_sse.c++ -o latency_sse

ppu-g++ ${COMMON_CXXFLAGS} -maltivec main.c++ latency_ppu.c++ -o latency_ppu

spu-g++ ${COMMON_CXXFLAGS}           main.c++ latency_spu.c++ -o latency_spu

echo 'sse'

./latency_sse

echo '\nppu'

echo '\nspu'