Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread bearophile
Tom:

> What is the most efficient way of implement a rotation of ubyte[4] array?
> 
> By rotation I mean: rotateRight([1, 2, 3, 4]) -> [4, 1, 2, 3]

Two versions, I have done no benchmarks so far:

import std.c.stdio: printf;

union Four {
ubyte[4] a;
uint u;
}

void showFour(Four f) {
printf("f.u: %u\n", f.u);
printf("f.a: [%d, %d, %d, %d]\n",
   cast(int)f.a[0], cast(int)f.a[1],
   cast(int)f.a[2], cast(int)f.a[3]);
}

void main() {
Four f;
f.a[] = [1, 2, 3, 4];
showFour(f);
f.u = (f.u << 8) | (f.u >> 24);
showFour(f);
printf("\n");

// alternative
f.a[] = [1, 2, 3, 4];
uint u2 = f.u;
showFour(f);
printf("u2: %u\n", u2);
asm {
rol  u2, 8;
}
f.u = u2;
showFour(f);
}

/*

dmd -O -release test.d

__Dmain comdat
pushEBP
mov EBP,ESP
sub ESP,8
push4
mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
push4
push3
push2
push1
push4
mov dword ptr -8[EBP],0
pushEAX
callnear ptr __d_arrayliteralT
add ESP,018h
pushEAX
lea EAX,-8[EBP]
pushEAX
callnear ptr _memcpy
mov EAX,-8[EBP]
callnear ptr _D4test8showFourFS4test4FourZv
mov EAX,-8[EBP]
mov ECX,-8[EBP]
shl EAX,8; <=
shr ECX,018h
or  EAX,ECX
mov -8[EBP],EAX
mov EAX,-8[EBP]
callnear ptr _D4test8showFourFS4test4FourZv
mov EAX,offset FLAT:_DATA[024h]
pushEAX
callnear ptr _printf
mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
push4
push4
push3
push2
push1
push4
pushEAX
callnear ptr __d_arrayliteralT
add ESP,018h
pushEAX
lea EAX,-8[EBP]
pushEAX
callnear ptr _memcpy
mov EAX,-8[EBP]
mov -4[EBP],EAX
mov EAX,-8[EBP]
callnear ptr _D4test8showFourFS4test4FourZv
mov EAX,offset FLAT:_DATA[028h]
pushdword ptr -4[EBP]
pushEAX
callnear ptr _printf
add ESP,024h
rol -4[EBP],8   ; <=
mov EAX,-4[EBP]
mov -8[EBP],EAX
mov EAX,-4[EBP]
callnear ptr _D4test8showFourFS4test4FourZv
mov ESP,EBP
pop EBP
ret
*/

In theory a C/C++/D compiler has to compile an expression like (x<< 8)|(x>>24) 
with a ROL instruction, in practice DMD doesn't do it. Months ago I have asked 
the two (four in X86) roll instructions to be added to the Phobos core 
intrinsics module, but I am not sure what Walter answered me.

Bye,
bearophile


Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread Kai Meyer

On 03/09/2011 03:41 PM, Tom wrote:

What is the most efficient way of implement a rotation of ubyte[4] array?

By rotation I mean: rotateRight([1, 2, 3, 4]) -> [4, 1, 2, 3]

TIA,
Tom;


I don't know of anything more efficient than:
ubyte[4] bytes = [1,2,3,4];
bytes = bytes[$-1] ~ bytes[0..$-1]; // Rotate left
bytes = bytes[1..$] ~ bytes[0]; // Rotate right

Both static arrays and dynamic arrays (ubyte[] bytes = [1,2,3,4];) 
perform about the same between 1 and 10 milling rotations in either 
direction. I think a temporary array might be created for the rhs, and 
then have the values of the rhs array copied to the lhs array, but I 
don't know. With static arrays, I'm not sure there would be a way to get 
around it with out at least a temporary value for the one that's moving 
between the first and last positions.


Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread Tom

El 09/03/2011 20:25, bearophile escribió:

Tom:


What is the most efficient way of implement a rotation of ubyte[4] array?

By rotation I mean: rotateRight([1, 2, 3, 4]) ->  [4, 1, 2, 3]


Two versions, I have done no benchmarks so far:

import std.c.stdio: printf;

union Four {
 ubyte[4] a;
 uint u;
}

void showFour(Four f) {
 printf("f.u: %u\n", f.u);
 printf("f.a: [%d, %d, %d, %d]\n",
cast(int)f.a[0], cast(int)f.a[1],
cast(int)f.a[2], cast(int)f.a[3]);
}

void main() {
 Four f;
 f.a[] = [1, 2, 3, 4];
 showFour(f);
 f.u = (f.u<<  8) | (f.u>>  24);
 showFour(f);
 printf("\n");

 // alternative
 f.a[] = [1, 2, 3, 4];
 uint u2 = f.u;
 showFour(f);
 printf("u2: %u\n", u2);
 asm {
 rol  u2, 8;
 }
 f.u = u2;
 showFour(f);
}

/*

dmd -O -release test.d

__Dmain comdat
 pushEBP
 mov EBP,ESP
 sub ESP,8
 push4
 mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
 push4
 push3
 push2
 push1
 push4
 mov dword ptr -8[EBP],0
 pushEAX
 callnear ptr __d_arrayliteralT
 add ESP,018h
 pushEAX
 lea EAX,-8[EBP]
 pushEAX
 callnear ptr _memcpy
 mov EAX,-8[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov EAX,-8[EBP]
 mov ECX,-8[EBP]
 shl EAX,8;<=
 shr ECX,018h
 or  EAX,ECX
 mov -8[EBP],EAX
 mov EAX,-8[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov EAX,offset FLAT:_DATA[024h]
 pushEAX
 callnear ptr _printf
 mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
 push4
 push4
 push3
 push2
 push1
 push4
 pushEAX
 callnear ptr __d_arrayliteralT
 add ESP,018h
 pushEAX
 lea EAX,-8[EBP]
 pushEAX
 callnear ptr _memcpy
 mov EAX,-8[EBP]
 mov -4[EBP],EAX
 mov EAX,-8[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov EAX,offset FLAT:_DATA[028h]
 pushdword ptr -4[EBP]
 pushEAX
 callnear ptr _printf
 add ESP,024h
 rol -4[EBP],8   ;<=
 mov EAX,-4[EBP]
 mov -8[EBP],EAX
 mov EAX,-4[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov ESP,EBP
 pop EBP
 ret
*/

In theory a C/C++/D compiler has to compile an expression like (x<<  8)|(x>>24) 
with a ROL instruction, in practice DMD doesn't do it. Months ago I have asked the two 
(four in X86) roll instructions to be added to the Phobos core intrinsics module, but I am 
not sure what Walter answered me.

Bye,
bearophile


Wow, thanks b, didn't know of the rol instruction...

Tom;


Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread Jonathan M Davis
On Wednesday, March 09, 2011 15:35:29 Kai Meyer wrote:
> On 03/09/2011 03:41 PM, Tom wrote:
> > What is the most efficient way of implement a rotation of ubyte[4] array?
> > 
> > By rotation I mean: rotateRight([1, 2, 3, 4]) -> [4, 1, 2, 3]
> > 
> > TIA,
> > Tom;
> 
> I don't know of anything more efficient than:
>  ubyte[4] bytes = [1,2,3,4];
>  bytes = bytes[$-1] ~ bytes[0..$-1]; // Rotate left

I'm stunned that this works. I'd even consider reporting it as a bug. You're 
concatenating a ubyte[] ont a ubyte...

>  bytes = bytes[1..$] ~ bytes[0]; // Rotate right

You're concatenating a ubyte onto a slice of the array (so it's ubyte[] instead 
of ubyte[4]). That will result in a temporary whose value will then be assigned 
to the original ubyte[4].

> Both static arrays and dynamic arrays (ubyte[] bytes = [1,2,3,4];)
> perform about the same between 1 and 10 milling rotations in either
> direction. I think a temporary array might be created for the rhs, and
> then have the values of the rhs array copied to the lhs array, but I
> don't know. With static arrays, I'm not sure there would be a way to get
> around it with out at least a temporary value for the one that's moving
> between the first and last positions.

Honestly, given that this is 4 ubytes, I would fully expect that the fastest 
way 
to do this would involve casting it to a unit and shifting it - something along 
the lines of what Bearophile suggested. I'd be _very_ suprised if this 
implementation were faster, since it involves creating a temporary array.

- Jonathan M Davis


Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread Kai Meyer

On 03/09/2011 04:25 PM, bearophile wrote:

Tom:


What is the most efficient way of implement a rotation of ubyte[4] array?

By rotation I mean: rotateRight([1, 2, 3, 4]) ->  [4, 1, 2, 3]


Two versions, I have done no benchmarks so far:

import std.c.stdio: printf;

union Four {
 ubyte[4] a;
 uint u;
}

void showFour(Four f) {
 printf("f.u: %u\n", f.u);
 printf("f.a: [%d, %d, %d, %d]\n",
cast(int)f.a[0], cast(int)f.a[1],
cast(int)f.a[2], cast(int)f.a[3]);
}

void main() {
 Four f;
 f.a[] = [1, 2, 3, 4];
 showFour(f);
 f.u = (f.u<<  8) | (f.u>>  24);
 showFour(f);
 printf("\n");

 // alternative
 f.a[] = [1, 2, 3, 4];
 uint u2 = f.u;
 showFour(f);
 printf("u2: %u\n", u2);
 asm {
 rol  u2, 8;
 }
 f.u = u2;
 showFour(f);
}

/*

dmd -O -release test.d

__Dmain comdat
 pushEBP
 mov EBP,ESP
 sub ESP,8
 push4
 mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
 push4
 push3
 push2
 push1
 push4
 mov dword ptr -8[EBP],0
 pushEAX
 callnear ptr __d_arrayliteralT
 add ESP,018h
 pushEAX
 lea EAX,-8[EBP]
 pushEAX
 callnear ptr _memcpy
 mov EAX,-8[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov EAX,-8[EBP]
 mov ECX,-8[EBP]
 shl EAX,8;<=
 shr ECX,018h
 or  EAX,ECX
 mov -8[EBP],EAX
 mov EAX,-8[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov EAX,offset FLAT:_DATA[024h]
 pushEAX
 callnear ptr _printf
 mov EAX,offset FLAT:_D12TypeInfo_xAh6__initZ
 push4
 push4
 push3
 push2
 push1
 push4
 pushEAX
 callnear ptr __d_arrayliteralT
 add ESP,018h
 pushEAX
 lea EAX,-8[EBP]
 pushEAX
 callnear ptr _memcpy
 mov EAX,-8[EBP]
 mov -4[EBP],EAX
 mov EAX,-8[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov EAX,offset FLAT:_DATA[028h]
 pushdword ptr -4[EBP]
 pushEAX
 callnear ptr _printf
 add ESP,024h
 rol -4[EBP],8   ;<=
 mov EAX,-4[EBP]
 mov -8[EBP],EAX
 mov EAX,-4[EBP]
 callnear ptr _D4test8showFourFS4test4FourZv
 mov ESP,EBP
 pop EBP
 ret
*/

In theory a C/C++/D compiler has to compile an expression like (x<<  8)|(x>>24) 
with a ROL instruction, in practice DMD doesn't do it. Months ago I have asked the two 
(four in X86) roll instructions to be added to the Phobos core intrinsics module, but I am 
not sure what Walter answered me.

Bye,
bearophile


I love it.
I've done a little benchmark that just repeats the rotate left a certain 
number of times, and then a rotate right a certain number of times. It 
looks like shifting ( << | >> ) is faster than the process of copying 
the uint value, shifting it, and copying it back. If I move the 
assignment for the rol outside of the for loop, the rol is about twice 
as fast.

http://dl.dropbox.com/u/12135920/rotate.d

Both are anywhere from 30 to 80 times faster than the slicing method I 
proposed (also included in the rotate.d file).



--
Rotating static array left to right 500 times
Finished in1971.46 milliseconds
Rotating static array right to left 500 times
Finished in1987.60 milliseconds
Rotating dynamic array left to right 500 times
Finished in1932.40 milliseconds
Rotating dynamic array right to left 500 times
Finished in1981.71 milliseconds
Shifting Union left to right 500 times
Finished in  33.46 milliseconds
Shifting Union right to left 500 times
Finished in  34.26 milliseconds
Rolling Union left to right 500 times
Finished in  67.51 milliseconds
Rolling Union right to left 500 times
Finished in  67.47 milliseconds
Rolling Union left to right 500 times with assignment with temporary 
variable outside of the loop

Finished in  28.81 milliseconds
Rolling Union right to left 500 times with assignment with temporary 
variable outside of the loop

Finished in  25.57 milliseconds



Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread spir

On 03/10/2011 12:55 AM, Jonathan M Davis wrote:

I don't know of anything more efficient than:
>ubyte[4] bytes = [1,2,3,4];
>bytes = bytes[$-1] ~ bytes[0..$-1]; // Rotate left

I'm stunned that this works. I'd even consider reporting it as a bug. You're
concatenating a ubyte[] ont a ubyte...


This works for other arrays as well. dmd understands.

Denis
--
_
vita es estrany
spir.wikidot.com



Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread U2 fan
== Quote from bearophile (bearophileh...@lycos.com)'s article
> Tom:
> > What is the most efficient way of implement a rotation of ubyte[4] array?
> >
> > By rotation I mean: rotateRight([1, 2, 3, 4]) -> [4, 1, 2, 3]
> Two versions, I have done no benchmarks so far:
> import std.c.stdio: printf;
> union Four {
> ubyte[4] a;
> uint u;
> }
> void showFour(Four f) {
> printf("f.u: %u\n", f.u);
> printf("f.a: [%d, %d, %d, %d]\n",
>cast(int)f.a[0], cast(int)f.a[1],
>cast(int)f.a[2], cast(int)f.a[3]);
> }
> void main() {
> Four f;
> f.a[] = [1, 2, 3, 4];
> showFour(f);
> f.u = (f.u << 8) | (f.u >> 24);
> showFour(f);
> printf("\n");
> // alternative
> f.a[] = [1, 2, 3, 4];
> uint u2 = f.u;
> showFour(f);
> printf("u2: %u\n", u2);
> asm {
> rol  u2, 8;
> }
> f.u = u2;
> showFour(f);
> }
> Bye,
> bearophile

I am offend!


Re: Best way in D2 to rotate a ubyte[4] array

2011-03-09 Thread Andrew Wiley
On Wed, Mar 9, 2011 at 7:25 PM, U2 fan  wrote:
> == Quote from bearophile (bearophileh...@lycos.com)'s article
>> Tom:
>> > What is the most efficient way of implement a rotation of ubyte[4] array?
>> >
>> > By rotation I mean: rotateRight([1, 2, 3, 4]) -> [4, 1, 2, 3]
>> Two versions, I have done no benchmarks so far:
>> import std.c.stdio: printf;
>> union Four {
>>     ubyte[4] a;
>>     uint u;
>> }
>> void showFour(Four f) {
>>     printf("f.u: %u\n", f.u);
>>     printf("f.a: [%d, %d, %d, %d]\n",
>>            cast(int)f.a[0], cast(int)f.a[1],
>>            cast(int)f.a[2], cast(int)f.a[3]);
>> }
>> void main() {
>>     Four f;
>>     f.a[] = [1, 2, 3, 4];
>>     showFour(f);
>>     f.u = (f.u << 8) | (f.u >> 24);
>>     showFour(f);
>>     printf("\n");
>>     // alternative
>>     f.a[] = [1, 2, 3, 4];
>>     uint u2 = f.u;
>>     showFour(f);
>>     printf("u2: %u\n", u2);
>>     asm {
>>         rol  u2, 8;
>>     }
>>     f.u = u2;
>>     showFour(f);
>> }
>> Bye,
>> bearophile
>
> I am offend!
>

Once I figured it out, I lol'd quite a bit.


Re: Best way in D2 to rotate a ubyte[4] array

2011-03-10 Thread bearophile
While creating the rotation code I have found two things I don't understand. 
Maybe some of you is able to help me understand.

This version of the code:


union Four {
uint u;
ubyte[4] a;
}
void main() {
Four f;
asm {
rol f.u, 8;
}
}


DMD 2.052 gives this error, do you know why?

test.d(8): bad type/size of operands 'f.u'

--

So to avoid wasting load asm instructions I have tried to write it like this:


union Four {
ubyte[4] arr;
uint ui;
}
void main() {
Four fo;
fo.arr[0] = 1;
fo.arr[1] = 2;
fo.arr[2] = 3;
fo.arr[3] = 4;
uint* uptr = &(fo.ui);
asm {
rol [uptr], 8;
}
asm {
rol uptr, 8;
}
}


but looking at the asm it produces, do you know why the rol with [uptr] and 
uptr are translated to the same instruction (so it rotates the pointer instead 
of the pointed uint)?


__Dmain comdat
push EBP
mov EBP,ESP
sub ESP,8
mov dword ptr -8[EBP],0
lea EAX,-8[EBP]
mov byte ptr -8[EBP],1
mov byte ptr -7[EBP],2
mov byte ptr -6[EBP],3
mov byte ptr -5[EBP],4
mov -4[EBP],EAX
rol -4[EBP],8  ; <==
rol -4[EBP],8  ; <==
mov ESP,EBP
pop EBP
ret

Bye and thank you,
bearophile


Re: Best way in D2 to rotate a ubyte[4] array

2011-03-10 Thread spir

On 03/10/2011 11:01 PM, bearophile wrote:

While creating the rotation code I have found two things I don't understand. 
Maybe some of you is able to help me understand.

This version of the code:


union Four {
 uint u;
 ubyte[4] a;
}
void main() {
 Four f;
 asm {
 rol f.u, 8;
 }
}


DMD 2.052 gives this error, do you know why?

test.d(8): bad type/size of operands 'f.u'

--

So to avoid wasting load asm instructions I have tried to write it like this:


union Four {
 ubyte[4] arr;
 uint ui;
}
void main() {
 Four fo;
 fo.arr[0] = 1;
 fo.arr[1] = 2;
 fo.arr[2] = 3;
 fo.arr[3] = 4;
 uint* uptr =&(fo.ui);
 asm {
 rol [uptr], 8;
 }
 asm {
 rol uptr, 8;
 }
}


but looking at the asm it produces, do you know why the rol with [uptr] and 
uptr are translated to the same instruction (so it rotates the pointer instead 
of the pointed uint)?


__Dmain comdat
 push EBP
 mov EBP,ESP
 sub ESP,8
 mov dword ptr -8[EBP],0
 lea EAX,-8[EBP]
 mov byte ptr -8[EBP],1
 mov byte ptr -7[EBP],2
 mov byte ptr -6[EBP],3
 mov byte ptr -5[EBP],4
 mov -4[EBP],EAX
 rol -4[EBP],8  ;<==
 rol -4[EBP],8  ;<==
 mov ESP,EBP
 pop EBP
 ret

Bye and thank you,
bearophile


About first point: looks like a bug at the interface between union and asm. I 
could rol either a pointer or an uint, but none of them member of a union. 
Dunno why. Indeed, using '<<' works as expected.

But I could have this work (which rather confirms the above interpretation):

union Four {
uint u;
ubyte[4] a;
}

void main() {
Four f;

f.u = 0x_01_02_03_04u;
writefln ("'0x%08X'" , f.u);

// deceiving D:
auto p = &(f.u);
auto u = *p;

asm {
rol u, 8;
}
writefln ("'0x%08X'" , u);

/*  output:
'0x01020304'
'0x02030401'
*/
}

I have no idea about the second point.

Denis
--
_
vita es estrany
spir.wikidot.com