On 03/09/2011 04:25 PM, bearophile wrote:
Tom:
What is the most efficient way of implement a rotation of ubyte[4] array?
By rotation I mean: rotateRight([1, 2, 3, 4]) -> [4, 1, 2, 3]
Two versions, I have done no benchmarks so far:
import std.c.stdio: printf;
union Four {
ubyte[4] a;
uint u;
}
void showFour(Four f) {
printf("f.u: %u\n", f.u);
printf("f.a: [%d, %d, %d, %d]\n",
cast(int)f.a[0], cast(int)f.a[1],
cast(int)f.a[2], cast(int)f.a[3]);
}
void main() {
Four f;
f.a[] = [1, 2, 3, 4];
showFour(f);
f.u = (f.u<< 8) | (f.u>> 24);
showFour(f);
printf("\n");
// alternative
f.a[] = [1, 2, 3, 4];
uint u2 = f.u;
showFour(f);
printf("u2: %u\n", u2);
asm {
rol u2, 8;
}
f.u = u2;
showFour(f);
}
/*
dmd -O -release test.d
__Dmain comdat
push EBP
mov EBP,ESP
sub ESP,8
push 4
mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
push 4
push 3
push 2
push 1
push 4
mov dword ptr -8[EBP],0
push EAX
call near ptr __d_arrayliteralT
add ESP,018h
push EAX
lea EAX,-8[EBP]
push EAX
call near ptr _memcpy
mov EAX,-8[EBP]
call near ptr _D4test8showFourFS4test4FourZv
mov EAX,-8[EBP]
mov ECX,-8[EBP]
shl EAX,8 ;<=========
shr ECX,018h
or EAX,ECX
mov -8[EBP],EAX
mov EAX,-8[EBP]
call near ptr _D4test8showFourFS4test4FourZv
mov EAX,offset FLAT:_DATA[024h]
push EAX
call near ptr _printf
mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
push 4
push 4
push 3
push 2
push 1
push 4
push EAX
call near ptr __d_arrayliteralT
add ESP,018h
push EAX
lea EAX,-8[EBP]
push EAX
call near ptr _memcpy
mov EAX,-8[EBP]
mov -4[EBP],EAX
mov EAX,-8[EBP]
call near ptr _D4test8showFourFS4test4FourZv
mov EAX,offset FLAT:_DATA[028h]
push dword ptr -4[EBP]
push EAX
call near ptr _printf
add ESP,024h
rol -4[EBP],8 ;<=========
mov EAX,-4[EBP]
mov -8[EBP],EAX
mov EAX,-4[EBP]
call near ptr _D4test8showFourFS4test4FourZv
mov ESP,EBP
pop EBP
ret
*/
In theory a C/C++/D compiler has to compile an expression like (x<< 8)|(x>>24)
with a ROL instruction, in practice DMD doesn't do it. Months ago I have asked the two
(four in X86) roll instructions to be added to the Phobos core intrinsics module, but I am
not sure what Walter answered me.
Bye,
bearophile
I love it.
I've done a little benchmark that just repeats the rotate left a certain
number of times, and then a rotate right a certain number of times. It
looks like shifting ( << | >> ) is faster than the process of copying
the uint value, shifting it, and copying it back. If I move the
assignment for the rol outside of the for loop, the rol is about twice
as fast.
http://dl.dropbox.com/u/12135920/rotate.d
Both are anywhere from 30 to 80 times faster than the slicing method I
proposed (also included in the rotate.d file).
------
Rotating static array left to right 5000000 times
Finished in 1971.46 milliseconds
Rotating static array right to left 5000000 times
Finished in 1987.60 milliseconds
Rotating dynamic array left to right 5000000 times
Finished in 1932.40 milliseconds
Rotating dynamic array right to left 5000000 times
Finished in 1981.71 milliseconds
Shifting Union left to right 5000000 times
Finished in 33.46 milliseconds
Shifting Union right to left 5000000 times
Finished in 34.26 milliseconds
Rolling Union left to right 5000000 times
Finished in 67.51 milliseconds
Rolling Union right to left 5000000 times
Finished in 67.47 milliseconds
Rolling Union left to right 5000000 times with assignment with temporary
variable outside of the loop
Finished in 28.81 milliseconds
Rolling Union right to left 5000000 times with assignment with temporary
variable outside of the loop
Finished in 25.57 milliseconds