/*
 * Drawn from roboticus' (1118303) & hdb's (1118256) iterative implementations
 * (algorithm also described by graff & bitingduck).
 *
 * Rotates the `size` elements of `p` left by `offset` positions in place:
 * afterwards p[ i ] holds the value that was at p[ ( i + offset ) % size ].
 * Elements are moved along cycles; each cycle starts at index `cmp` and is
 * closed by storing the saved first element once the source index comes back
 * round to `cmp`.
 *
 *   p       buffer to rotate; asserted to be 8-byte aligned
 *   size    number of elements in p; 0 is a no-op
 *   offset  rotation distance; asserted to be < size
 */
void xlateBuffersI( U64 *p, const U32 size, const U32 offset )
{
    U32 shift, wrap, cnt, dst = 0, cmp = 0;
    U64 tmp;

    /* An empty buffer has nothing to move (and p[ 0 ] must not be read). */
    if( size == 0 )
        return;

    assert( offset < size );
    assert( ( (U64)p & 7ull ) == 0 );

    /* With asserts compiled out, an oversized offset still wraps as before. */
    shift = offset % size;

    /*
     * A zero shift leaves the buffer unchanged.  Walking it as `size`
     * one-element cycles would end by reading p[ size ], past the end.
     */
    if( shift == 0 )
        return;

    /* dst + shift reaches or passes size exactly when dst >= wrap. */
    wrap = size - shift;
    tmp  = p[ 0 ];

    for( cnt = size; cnt != 0; --cnt ) {
        /* ( dst + shift ) % size, computed without risking U32 wrap-around. */
        const U32 src = ( dst >= wrap ) ? dst - wrap : dst + shift;

        if( src == cmp ) {
            /* Cycle closed: drop in the saved head and start the next cycle. */
            p[ dst ] = tmp;
            dst = ++cmp;
            if( cnt > 1 )           /* no further cycle after the last store */
                tmp = p[ dst ];
        }
        else {
            p[ dst ] = p[ src ];
            dst = src;
        }
    }
}

/* Exchanges the elements p[ x ] and p[ y ]. */
__forceinline void swapElems( U64 *p, const U32 x, const U32 y )
{
    const U64 held = p[ x ];

    p[ x ] = p[ y ];
    p[ y ] = held;
}

/*
 * Adapted from anonymonk's post 1118262.
 *
 * Block-swap rotation.  Each pass swaps the elements from x up to (but not
 * including) the current y with the elements starting at y, wrapping y back
 * to its starting value whenever it reaches `size`; the next pass continues
 * from the advanced x and y.  Called with x = 0 and y = offset this rotates
 * the buffer left by `offset`.
 *
 * The original tail recursion is written as a loop so stack use no longer
 * grows with the buffer size.  The function returns without touching the
 * buffer when y >= size or x >= y, since no pass could make progress.
 *
 *   p     buffer to rotate
 *   size  number of elements in p
 *   x     index of the first element still to be placed
 *   y     index of the element that belongs at x
 */
void xlateBuffersR( U64 *p, U32 size, U32 x, U32 y )
{
    while( y < size && x < y ) {
        const U32 savedY = y;

        /* x advances by one per swap, so it ends the pass equal to savedY. */
        while( x < savedY ) {
            if( y == size )
                y = savedY;
            swapElems( p, x++, y++ );
        }
    }
}