inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) -- copy nr 32-bit words from src to dst using
 * x86-64 inline assembly (GCC-style extended asm).
 *
 *   dst  destination pointer; the caller's variable is not modified
 *   src  source pointer; the caller's variable is not modified
 *   nr   number of 32-bit words to copy; 0 copies nothing
 *
 * Steps:
 *   1. If bit 2 of the destination address is set, copy one dword so the
 *      following 64-bit stores land on 8-byte boundaries (this assumes dst
 *      is at least 4-byte aligned).
 *   2. Copy 32 bytes (8 dwords) per loop iteration with 64-bit moves.
 *   3. Copy the remaining 4, 2 and 1 dwords as selected by the low three
 *      bits of the remaining count.
 *
 * Data is moved front to back in chunks, so overlapping buffers are not
 * supported.
 */
#define COPY_DWORDS( dst, src, nr ) \
do { \
   const uint32_t *_src_ptr = (const uint32_t *)(src); \
   uint32_t *_dest_ptr = (uint32_t *)(dst); \
   uint32_t _size = (nr); \
   \
   /* A zero count must not reach the alignment step: it would copy one */ \
   /* dword and wrap the counter around to 0xFFFFFFFF. */ \
   if (_size != 0) { \
      __asm__ __volatile__ ( \
         /* step 1: align the destination to 8 bytes */ \
         "testb   $4, %%dil\n" \
         "je      1f\n" \
         "movl    (%%rsi), %%eax\n" \
         "addq    $4, %%rsi\n" \
         "movl    %%eax, (%%rdi)\n" \
         "addq    $4, %%rdi\n" \
         "decl    %%ecx\n" \
         /* keep the dword count in eax; ecx becomes the 32-byte count */ \
         "1:\n" \
         "movl    %%ecx, %%eax\n" \
         "shrl    $3, %%ecx\n" \
         "je      3f\n" \
         ".p2align 4\n" \
         /* step 2: 32 bytes per iteration */ \
         "2:\n" \
         "movq    (%%rsi), %%r8\n" \
         "movq    8(%%rsi), %%r9\n" \
         "addq    $32, %%rsi\n" \
         "movq    %%r8, (%%rdi)\n" \
         "movq    %%r9, 8(%%rdi)\n" \
         "addq    $32, %%rdi\n" \
         "movq    -16(%%rsi), %%r8\n" \
         "movq    -8(%%rsi), %%r9\n" \
         "decl    %%ecx\n" /* sets ZF; the movq below leave flags alone */ \
         "movq    %%r8, -16(%%rdi)\n" \
         "movq    %%r9, -8(%%rdi)\n" \
         "jnz     2b\n" \
         /* step 3: tail of 0..7 dwords */ \
         "3:\n" \
         "testb   $7, %%al\n" \
         "je      6f\n" \
         "testb   $4, %%al\n" \
         "je      4f\n" \
         "movq    (%%rsi), %%r8\n" \
         "movq    8(%%rsi), %%r9\n" \
         "addq    $16, %%rsi\n" \
         "movq    %%r8, (%%rdi)\n" \
         "movq    %%r9, 8(%%rdi)\n" \
         "addq    $16, %%rdi\n" \
         "4:\n" \
         "testb   $2, %%al\n" \
         "je      5f\n" \
         "movq    (%%rsi), %%r8\n" \
         "addq    $8, %%rsi\n" \
         "movq    %%r8, (%%rdi)\n" \
         "addq    $8, %%rdi\n" \
         "5:\n" \
         "testb   $1, %%al\n" \
         "je      6f\n" \
         "movl    (%%rsi), %%eax\n" \
         "movl    %%eax, (%%rdi)\n" \
         "6:\n" \
         /* rcx, rsi and rdi are all modified, so they are read-write */ \
         : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr) \
         : \
         /* "memory": the asm reads and writes through the pointers; */ \
         /* "cc": testb/decl/shrl/addq change the flags */ \
         : "rax", "r8", "r9", "cc", "memory" \
      ); \
   } \
} while (0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something 

Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get the _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell from my
testing, there was an issue with this optimizing pass and OPCODE_MUL.
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


possible-_mesa_remove_extra_moves-fix.patch
Description: Binary data
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 15:58:12 
> PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either.  I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 17:40:21 
> PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
should leave it to someone more experienced to make the call on which
one to choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>           What    |Removed                     |Added
> 
>  Attachment #35777|0                           |1
>        is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Its on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring  just 13 
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 05:53:36 
> PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(8)
>      0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>      1  x: FRACT       ,  PV0.y
>      2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(10)
>      0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>      1  y: FRACT       ,  PV0.w
>      2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>      4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very, very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring just 13
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) -- copy nr 32-bit words from src to dst using
 * x86-64 inline assembly (GCC-style extended asm).
 *
 *   dst  destination pointer; the caller's variable is not modified
 *   src  source pointer; the caller's variable is not modified
 *   nr   number of 32-bit words to copy; 0 copies nothing
 *
 * Steps:
 *   1. If bit 2 of the destination address is set, copy one dword so the
 *      following 64-bit stores land on 8-byte boundaries (this assumes dst
 *      is at least 4-byte aligned).
 *   2. Copy 32 bytes (8 dwords) per loop iteration with 64-bit moves.
 *   3. Copy the remaining 4, 2 and 1 dwords as selected by the low three
 *      bits of the remaining count.
 *
 * Data is moved front to back in chunks, so overlapping buffers are not
 * supported.
 */
#define COPY_DWORDS( dst, src, nr ) \
do { \
   const uint32_t *_src_ptr = (const uint32_t *)(src); \
   uint32_t *_dest_ptr = (uint32_t *)(dst); \
   uint32_t _size = (nr); \
   \
   /* A zero count must not reach the alignment step: it would copy one */ \
   /* dword and wrap the counter around to 0xFFFFFFFF. */ \
   if (_size != 0) { \
      __asm__ __volatile__ ( \
         /* step 1: align the destination to 8 bytes */ \
         "testb   $4, %%dil\n" \
         "je      1f\n" \
         "movl    (%%rsi), %%eax\n" \
         "addq    $4, %%rsi\n" \
         "movl    %%eax, (%%rdi)\n" \
         "addq    $4, %%rdi\n" \
         "decl    %%ecx\n" \
         /* keep the dword count in eax; ecx becomes the 32-byte count */ \
         "1:\n" \
         "movl    %%ecx, %%eax\n" \
         "shrl    $3, %%ecx\n" \
         "je      3f\n" \
         ".p2align 4\n" \
         /* step 2: 32 bytes per iteration */ \
         "2:\n" \
         "movq    (%%rsi), %%r8\n" \
         "movq    8(%%rsi), %%r9\n" \
         "addq    $32, %%rsi\n" \
         "movq    %%r8, (%%rdi)\n" \
         "movq    %%r9, 8(%%rdi)\n" \
         "addq    $32, %%rdi\n" \
         "movq    -16(%%rsi), %%r8\n" \
         "movq    -8(%%rsi), %%r9\n" \
         "decl    %%ecx\n" /* sets ZF; the movq below leave flags alone */ \
         "movq    %%r8, -16(%%rdi)\n" \
         "movq    %%r9, -8(%%rdi)\n" \
         "jnz     2b\n" \
         /* step 3: tail of 0..7 dwords */ \
         "3:\n" \
         "testb   $7, %%al\n" \
         "je      6f\n" \
         "testb   $4, %%al\n" \
         "je      4f\n" \
         "movq    (%%rsi), %%r8\n" \
         "movq    8(%%rsi), %%r9\n" \
         "addq    $16, %%rsi\n" \
         "movq    %%r8, (%%rdi)\n" \
         "movq    %%r9, 8(%%rdi)\n" \
         "addq    $16, %%rdi\n" \
         "4:\n" \
         "testb   $2, %%al\n" \
         "je      5f\n" \
         "movq    (%%rsi), %%r8\n" \
         "addq    $8, %%rsi\n" \
         "movq    %%r8, (%%rdi)\n" \
         "addq    $8, %%rdi\n" \
         "5:\n" \
         "testb   $1, %%al\n" \
         "je      6f\n" \
         "movl    (%%rsi), %%eax\n" \
         "movl    %%eax, (%%rdi)\n" \
         "6:\n" \
         /* rcx, rsi and rdi are all modified, so they are read-write */ \
         : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr) \
         : \
         /* "memory": the asm reads and writes through the pointers; */ \
         /* "cc": testb/decl/shrl/addq change the flags */ \
         : "rax", "r8", "r9", "cc", "memory" \
      ); \
   } \
} while (0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something we should
> just let gcc handle? It's pretty smart about inlining calls to memcpy.
> (Certainly your code is an improvement over "rep ; movsl" or "for ( j
> = 0 ; j < nr ; j++ ) dst[j] = ((int *)src)[j];" but really, why bother
> when gcc can do this in a much nicer more maintainable way?
>
> Matt
>

Matt,

GCC inlines memcpy with a REP MOVQ preceded by alignment checks for
byte, word and double word to align the destination on 8 byte
boundaries. It is also followed by further checks in case the number of
bytes is not evenly divisible by 4.

The inner loop is based on AMD's optimization guide's memcpy. This
also allows you to modify it to use the movnti instructions for
writing to the destination ram directly bypassing the cache if you
like.

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 
> 05:53:36 PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly
> 00 ALU: ADDR(32) CNT(8)
>       0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>       1  x: FRACT       ,  PV0.y
>       2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>       3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly
> 00 ALU: ADDR(32) CNT(10)
>       0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>       1  y: FRACT       ,  PV0.w
>       2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>       3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>       4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very, very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell with my
testing there was an issue with this optimizing pass and OPCODE_MUL .
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
-- next part --
A non-text attachment was scrubbed...
Name: possible-_mesa_remove_extra_moves-fix.patch
Type: application/octet-stream
Size: 3520 bytes
Desc: not available
URL: 



Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 
> 15:58:12 PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either.  I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 
> 17:40:21 PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
leave it to someone more experienced to make the call on which one to
choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>           What    |Removed                     |Added
> 
>  Attachment #35777|0                           |1
>        is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Its on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
anybody would like to use it. it could be slightly improved by writing
to 16 byte boundaries but its pretty near optimal when writing to
uncached ram.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) - copy nr 32-bit words from src to dst using
 * hand-written x86-64 assembly.
 *
 *   dst  destination pointer (any object pointer type; cast to uint32_t *)
 *   src  source pointer (any object pointer type; cast to const uint32_t *)
 *   nr   number of dwords to copy; converted to uint32_t
 *
 * Each argument is evaluated exactly once.
 *
 * How the copy proceeds:
 *   - nr == 0 returns immediately.  Without this check a destination with
 *     address bit 2 set would copy one dword and wrap the counter to
 *     0xFFFFFFFF.
 *   - If bit 2 of the destination address is set, one dword is copied first
 *     so that the following 64-bit stores land on an 8-byte boundary.
 *     NOTE(review): this only achieves 8-byte alignment when dst is at least
 *     4-byte aligned - confirm against callers.
 *   - The main loop (label 2) moves 32 bytes per iteration through r8/r9.
 *   - The tail copies 4, 2 and then 1 remaining dwords according to the low
 *     three bits of the count saved in eax.
 *
 * The copy runs forward in 16-byte load/store pairs, so overlapping buffers
 * are not handled the way memmove() handles them.
 *
 * Constraints: rcx, rsi and rdi are modified by the code and are therefore
 * read-write operands; rax, r8, r9, the flags and memory are clobbered.
 */
#define COPY_DWORDS( dst, src, nr )                                     \
do {                                                                    \
   const uint32_t *_src_ptr = (const uint32_t *)(src);                  \
   uint32_t *_dest_ptr = (uint32_t *)(dst);                             \
   uint32_t _size = (uint32_t)(nr);                                     \
                                                                        \
   __asm__ __volatile__ (                                               \
      "testl   %%ecx, %%ecx\n\t"                                        \
      "je      6f\n\t"                                                  \
      "testb   $4, %%dil\n\t"                                           \
      "je      1f\n\t"                                                  \
      "movl    (%%rsi), %%eax\n\t"                                      \
      "addq    $4, %%rsi\n\t"                                           \
      "movl    %%eax, (%%rdi)\n\t"                                      \
      "addq    $4, %%rdi\n\t"                                           \
      "decl    %%ecx\n"                                                 \
      "1:\n\t"                                                          \
      "movl    %%ecx, %%eax\n\t"                                        \
      "shrl    $3, %%ecx\n\t"                                           \
      "je      3f\n\t"                                                  \
      ".p2align 4\n"                                                    \
      "2:\n\t"                                                          \
      "movq    (%%rsi), %%r8\n\t"                                       \
      "movq    8(%%rsi), %%r9\n\t"                                      \
      "addq    $32, %%rsi\n\t"                                          \
      "movq    %%r8, (%%rdi)\n\t"                                       \
      "movq    %%r9, 8(%%rdi)\n\t"                                      \
      "addq    $32, %%rdi\n\t"                                          \
      "movq    -16(%%rsi), %%r8\n\t"                                    \
      "movq    -8(%%rsi), %%r9\n\t"                                     \
      "decl    %%ecx\n\t"                                               \
      "movq    %%r8, -16(%%rdi)\n\t"                                    \
      "movq    %%r9, -8(%%rdi)\n\t"                                     \
      "jnz     2b\n"                                                    \
      "3:\n\t"                                                          \
      "testb   $7, %%al\n\t"                                            \
      "je      6f\n\t"                                                  \
      "testb   $4, %%al\n\t"                                            \
      "je      4f\n\t"                                                  \
      "movq    (%%rsi), %%r8\n\t"                                       \
      "movq    8(%%rsi), %%r9\n\t"                                      \
      "addq    $16, %%rsi\n\t"                                          \
      "movq    %%r8, (%%rdi)\n\t"                                       \
      "movq    %%r9, 8(%%rdi)\n\t"                                      \
      "addq    $16, %%rdi\n"                                            \
      "4:\n\t"                                                          \
      "testb   $2, %%al\n\t"                                            \
      "je      5f\n\t"                                                  \
      "movq    (%%rsi), %%r8\n\t"                                       \
      "addq    $8, %%rsi\n\t"                                           \
      "movq    %%r8, (%%rdi)\n\t"                                       \
      "addq    $8, %%rdi\n"                                             \
      "5:\n\t"                                                          \
      "testb   $1, %%al\n\t"                                            \
      "je      6f\n\t"                                                  \
      "movl    (%%rsi), %%eax\n\t"                                      \
      "movl    %%eax, (%%rdi)\n"                                        \
      "6:\n"                                                            \
      : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr)                 \
      :                                                                 \
      : "rax", "r8", "r9", "cc", "memory"                               \
   );                                                                   \
} while (0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something 

Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell with my
testing there was an issue with this optimizing pass and OPCODE_MUL .
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


possible-_mesa_remove_extra_moves-fix.patch
Description: Binary data
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 15:58:12 
> PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either.  I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 17:40:21 
> PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
leave it to someone more experienced to make the call on which one to
choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>           What    |Removed                     |Added
> 
>  Attachment #35777|0                           |1
>        is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Its on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring  just 13 
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 05:53:36 
> PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(8)
>      0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>      1  x: FRACT       ,  PV0.y
>      2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(10)
>      0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>      1  y: FRACT       ,  PV0.w
>      2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>      4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very, very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 
> 05:53:36 PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;  ?Disassembly 
> 00 ALU: ADDR(32) CNT(8)
> ? ? ?0 ?y: MULADD ? ? ?R123.y, ?R0.x, ?(0x3E22F983, 0.1591549367f).x,
> ?0.5
> ? ? ? ? z: MOV ? ? ? ? R0.z, ?0.0f
> ? ? ? ? w: MOV ? ? ? ? R0.w, ?1.0f
> ? ? ?1 ?x: FRACT ? ? ? , ?PV0.y
> ? ? ?2 ?z: MULADD ? ? ?R123.z, ?PV1.x, ?(0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
> ? ? ?3 ?t: SIN ? ? ? ? R0.x, ?PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;  ?Disassembly 
> 00 ALU: ADDR(32) CNT(10)
> ? ? ?0 ?y: MOV ? ? ? ? R0.y, ?0.0f
> ? ? ? ? z: MOV ? ? ? ? R0.z, ?1.0f
> ? ? ? ? w: MULADD ? ? ?R123.w, ?R0.x, ?(0x3E22F983, 0.1591549367f).x,
> ?0.5
> ? ? ?1 ?y: FRACT ? ? ? , ?PV0.w
> ? ? ?2 ?x: MULADD ? ? ?R123.x, ?PV1.y, ?(0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
> ? ? ?3 ?z: MUL ? ? ? ? , ?PV2.x, ?(0x3E22F983, 0.1591549367f).x
> ? ? ?4 ?t: SIN ? ? ? ? R0.x, ?PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very, very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell with my
testing there was an issue with this optimizing pass and OPCODE_MUL .
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
-- next part --
A non-text attachment was scrubbed...
Name: possible-_mesa_remove_extra_moves-fix.patch
Type: application/octet-stream
Size: 3520 bytes
Desc: not available
URL: 



Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
> ?And any special case added for MUL would seem to apply to any ALU
> instruction. ?That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 
> 15:58:12 PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either. ?I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> + ? ?setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> + ? ?pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> + ? ?pAsm->S[1].src.reg ? = tmp2;
> + ? ?setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> + ? ?setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> + ? ?pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> + ? ?pAsm->S[2].src.reg ? = tmp2;
> + ? ?setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 
> 17:40:21 PDT ---
> Created an attachment (id=35777)
> ?View: https://bugs.freedesktop.org/attachment.cgi?id=35777
> ?Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD ?tmp.x, angle, 1/(2*PI), 0.5
> FRACT ? tmp.x, tmp.x
> ADD ? ? tmp.y, tmp.x, 1
> CNDGE ? tmp.x, tmp.x, tmp.x, tmp.y
> MULADD ?tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
leave it to someone more experienced to make the call on which one to
choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
> ? ? ? ? ? What ? ?|Removed ? ? ? ? ? ? ? ? ? ? |Added
> 
> ?Attachment #35777|0 ? ? ? ? ? ? ? ? ? ? ? ? ? |1
> ? ? ? ?is obsolete| ? ? ? ? ? ? ? ? ? ? ? ? ? ?|
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
> ?View: https://bugs.freedesktop.org/attachment.cgi?id=35787
> ?Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring ?just 13 
>> operations
>>
>> + ? ? ? ? ? ? ? pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> + ? ? ? ? ? ? ? pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> + ? ? ? ? ? ? ? pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> + ? ? ? ? ? ? ? pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> + ? ? ? ? ? ? ? pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> + ? ? ? ? ? ? ? pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /* ?tested and verified ?*/
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
> ? ? ? ?const unsigned char /*int*/ spread_bits[8] = {
> ? ? ? ? ? ? ? ?0x00, ?/* 0b000 to 0b0 */
> ? ? ? ? ? ? ? ?0x01, ?/* 0b001 to 0b1 */
> ? ? ? ? ? ? ? ?0x04, ?/* 0b010 to 0b00100 */
> ? ? ? ? ? ? ? ?0x05, ?/* 0b011 to 0b00101 */
> ? ? ? ? ? ? ? ?0x10, ?/* 0b100 to 0b1 */
> ? ? ? ? ? ? ? ?0x11, ?/* 0b101 to 0b10001 */
> ? ? ? ? ? ? ? ?0x14, ?/* 0b110 to 0b10100 */
> ? ? ? ? ? ? ? ?0x15, ?/* 0b111 to 0b10101 */
> ? ? ? ?};
>
> ? ? ? ?pixel_number |= spread_bits[x & 0x07];
> ? ? ? ?pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote, in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) -- copy nr 32-bit words from src to dst with
 * hand-written x86-64 assembly.
 *
 *   dst  destination pointer (cast to uint32_t *)
 *   src  source pointer (cast to const uint32_t *)
 *   nr   number of 32-bit words to copy (converted to uint32_t)
 *
 * Each argument is evaluated exactly once.  The regions are read and written
 * front to back, so this is a forward copy only.
 * NOTE(review): the alignment step only inspects bit 2 of dst, so dst is
 * presumably expected to be at least 4-byte aligned -- confirm with callers.
 *
 * Strategy:
 *   - if bit 2 of dst is set, copy a single dword first so the following
 *     64-bit stores land on an 8-byte boundary;
 *   - copy 32 bytes (8 dwords) per iteration through r8/r9;
 *   - finish the remaining 0..7 dwords as 4 + 2 + 1.
 *
 * Only numeric local labels are used, so the macro can be expanded several
 * times within one function.
 */
#define COPY_DWORDS( dst, src, nr ) \
do { \
    uint32_t *_dest_ptr = (uint32_t *)(dst); \
    const uint32_t *_src_ptr = (const uint32_t *)(src); \
    uint32_t _size = (uint32_t)(nr); \
    __asm__ __volatile__ ( \
        /* A zero count must not reach the alignment step below: it would */ \
        /* copy one dword and wrap the counter around to 0xFFFFFFFF.      */ \
        "testl  %%ecx, %%ecx\n" \
        "je     6f\n" \
        /* Bit 2 of dst set: move one dword to reach an 8-byte boundary. */ \
        "testb  $4, %%dil\n" \
        "je     1f\n" \
        "movl   (%%rsi), %%eax\n" \
        "addq   $4, %%rsi\n" \
        "movl   %%eax, (%%rdi)\n" \
        "addq   $4, %%rdi\n" \
        "decl   %%ecx\n" \
        "1:\n" \
        /* eax keeps the remaining dword count; ecx becomes the number */ \
        /* of full 8-dword (32-byte) iterations.                       */ \
        "movl   %%ecx, %%eax\n" \
        "shrl   $3, %%ecx\n" \
        "je     3f\n" \
        ".p2align 4\n" \
        "2:\n" \
        "movq   (%%rsi), %%r8\n" \
        "movq   8(%%rsi), %%r9\n" \
        "addq   $32, %%rsi\n" \
        "movq   %%r8, (%%rdi)\n" \
        "movq   %%r9, 8(%%rdi)\n" \
        "addq   $32, %%rdi\n" \
        "movq   -16(%%rsi), %%r8\n" \
        "movq   -8(%%rsi), %%r9\n" \
        /* movq leaves the flags alone, so jnz below tests this decl. */ \
        "decl   %%ecx\n" \
        "movq   %%r8, -16(%%rdi)\n" \
        "movq   %%r9, -8(%%rdi)\n" \
        "jnz    2b\n" \
        "3:\n" \
        /* Tail: the low three bits of the count select 4, 2 and 1 dwords. */ \
        "testb  $7, %%al\n" \
        "je     6f\n" \
        "testb  $4, %%al\n" \
        "je     4f\n" \
        "movq   (%%rsi), %%r8\n" \
        "movq   8(%%rsi), %%r9\n" \
        "addq   $16, %%rsi\n" \
        "movq   %%r8, (%%rdi)\n" \
        "movq   %%r9, 8(%%rdi)\n" \
        "addq   $16, %%rdi\n" \
        "4:\n" \
        "testb  $2, %%al\n" \
        "je     5f\n" \
        "movq   (%%rsi), %%r8\n" \
        "addq   $8, %%rsi\n" \
        "movq   %%r8, (%%rdi)\n" \
        "addq   $8, %%rdi\n" \
        "5:\n" \
        "testb  $1, %%al\n" \
        "je     6f\n" \
        "movl   (%%rsi), %%eax\n" \
        "movl   %%eax, (%%rdi)\n" \
        "6:\n" \
        /* rcx, rsi and rdi are all modified, so they are read-write      */ \
        /* operands; "memory" tells the compiler the buffers are touched. */ \
        : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr) \
        : \
        : "rax", "r8", "r9", "cc", "memory" \
    ); \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr ) ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> do { ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> uint32_t * _src_ptr; ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> uint32_t * _dest_ptr; ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> uint32_t _size; ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> ?_dest_ptr = (uint32_t *)dst; ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> ?_src_ptr = src; ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> ?_size =nr; ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> ? ? __asm__ __volatile__ ("testb ? ? ? $4, %%dil\n" ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 1f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movl ? (%%rsi), %%eax\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $4, %%rsi\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movl ? %%eax, (%%rdi)\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $4, %%rdi\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"decl ? %%ecx\n" ? ? ? ? ? ? ? ?\
>> "1: ? ? ? ? ? ? movl ? ?%%ecx, %%eax\n" ? ? ? ? \
>> ? ? ? ? ? ? ? ?"shrl ? $3, %%ecx\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 3f\n" ? ? ? ? ? ? ? ? ? \
>> ".p2align 4 \n" ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> "2: ? ? ? ? ? ? movq ? ?(%%rsi), %%r8\n" ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? 8(%%rsi), %%r9\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $32, %%rsi\n" ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? %%r8, (%%rdi)\n" ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r9, 8(%%rdi)\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $32, %%rdi\n" ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? -16(%%rsi), %%r8\n" ? ? \
>> ? ? ? ? ? ? ? ?"movq ? -8(%%rsi), %%r9\n" ? ? ?\
>> ? ? ? ? ? ? ? ?"decl ? %%ecx\n" ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r8, -16(%%rdi)\n" ? ? \
>> ? ? ? ? ? ? ? ?"movq ? %%r9, -8(%%rdi)\n" ? ? ?\
>> ? ? ? ? ? ? ? ?"jnz ? ?2b\n" ? ? ? ? ? ? ? ? ? \
>> "3: ? ? ? ? ? ? testb ?$7, %%al\n" ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 6f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"testb ?$4, %%al\n" ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"je ? ? 4f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? ?(%%rsi), %%r8\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? ?8(%%rsi), %%r9\n" ? ? ?\
>> ? ? ? ? ? ? ? ?"addq ? ?$16, %%rsi\n" ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r8, (%%rdi)\n" ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r9, 8(%%rdi)\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? ?$16, %%rdi\n" ? ? ? ? ?\
>> "4: ? ? testb ? $2, %%al\n" ? ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"je ? ? ?5f\n" ? ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? ?(%%rsi), %%r8\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $8, %%rsi\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ?%%r8, (%%rdi)\n" ? ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? ?$8, %%rdi\n" ? ? ? ? ? \
>> "5: ? ?testb ? $1, %%al\n" ? ? ? ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 6f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movl ? (%%rsi), %%eax\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"movl ? %%eax, (%%rdi)\n" ? ? ? \
>> "6: \n" ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?: "=%c" (_size) ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?: "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr) ? ? ? ? \
>> ? ? ? ? ? ? ? ?: "%eax", "%r8", "%r9" ? ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?); ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something we should
> just let gcc handle? It's pretty smart about inlining calls to memcpy.
> (Certainly your code is an improvement over "rep ; movsl" or "for ( j
> = 0 ; j < nr ; j++ ) dst[j] = ((int *)src)[j];" but really, why bother
> when gcc can do this in a much nicer more maintainable way?
>
> Matt
>

Matt,

GCC inlines memcpy as a REP MOVSQ preceded by alignment checks for
byte, word, and double word, to align the destination on an 8-byte
boundary. It's also followed by further checks in case the number of
bytes is not evenly divisible by 4.

The inner loop is based on AMD's optimization guide's memcpy. This
also allows you to modify it to use the movnti instructions for
writing to the destination ram directly bypassing the cache if you
like.

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote, in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) -- copy nr 32-bit words from src to dst with
 * hand-written x86-64 assembly.
 *
 *   dst  destination pointer (cast to uint32_t *)
 *   src  source pointer (cast to const uint32_t *)
 *   nr   number of 32-bit words to copy (converted to uint32_t)
 *
 * Each argument is evaluated exactly once.  The regions are read and written
 * front to back, so this is a forward copy only.
 * NOTE(review): the alignment step only inspects bit 2 of dst, so dst is
 * presumably expected to be at least 4-byte aligned -- confirm with callers.
 *
 * Strategy:
 *   - if bit 2 of dst is set, copy a single dword first so the following
 *     64-bit stores land on an 8-byte boundary;
 *   - copy 32 bytes (8 dwords) per iteration through r8/r9;
 *   - finish the remaining 0..7 dwords as 4 + 2 + 1.
 *
 * Only numeric local labels are used, so the macro can be expanded several
 * times within one function.
 */
#define COPY_DWORDS( dst, src, nr ) \
do { \
    uint32_t *_dest_ptr = (uint32_t *)(dst); \
    const uint32_t *_src_ptr = (const uint32_t *)(src); \
    uint32_t _size = (uint32_t)(nr); \
    __asm__ __volatile__ ( \
        /* A zero count must not reach the alignment step below: it would */ \
        /* copy one dword and wrap the counter around to 0xFFFFFFFF.      */ \
        "testl  %%ecx, %%ecx\n" \
        "je     6f\n" \
        /* Bit 2 of dst set: move one dword to reach an 8-byte boundary. */ \
        "testb  $4, %%dil\n" \
        "je     1f\n" \
        "movl   (%%rsi), %%eax\n" \
        "addq   $4, %%rsi\n" \
        "movl   %%eax, (%%rdi)\n" \
        "addq   $4, %%rdi\n" \
        "decl   %%ecx\n" \
        "1:\n" \
        /* eax keeps the remaining dword count; ecx becomes the number */ \
        /* of full 8-dword (32-byte) iterations.                       */ \
        "movl   %%ecx, %%eax\n" \
        "shrl   $3, %%ecx\n" \
        "je     3f\n" \
        ".p2align 4\n" \
        "2:\n" \
        "movq   (%%rsi), %%r8\n" \
        "movq   8(%%rsi), %%r9\n" \
        "addq   $32, %%rsi\n" \
        "movq   %%r8, (%%rdi)\n" \
        "movq   %%r9, 8(%%rdi)\n" \
        "addq   $32, %%rdi\n" \
        "movq   -16(%%rsi), %%r8\n" \
        "movq   -8(%%rsi), %%r9\n" \
        /* movq leaves the flags alone, so jnz below tests this decl. */ \
        "decl   %%ecx\n" \
        "movq   %%r8, -16(%%rdi)\n" \
        "movq   %%r9, -8(%%rdi)\n" \
        "jnz    2b\n" \
        "3:\n" \
        /* Tail: the low three bits of the count select 4, 2 and 1 dwords. */ \
        "testb  $7, %%al\n" \
        "je     6f\n" \
        "testb  $4, %%al\n" \
        "je     4f\n" \
        "movq   (%%rsi), %%r8\n" \
        "movq   8(%%rsi), %%r9\n" \
        "addq   $16, %%rsi\n" \
        "movq   %%r8, (%%rdi)\n" \
        "movq   %%r9, 8(%%rdi)\n" \
        "addq   $16, %%rdi\n" \
        "4:\n" \
        "testb  $2, %%al\n" \
        "je     5f\n" \
        "movq   (%%rsi), %%r8\n" \
        "addq   $8, %%rsi\n" \
        "movq   %%r8, (%%rdi)\n" \
        "addq   $8, %%rdi\n" \
        "5:\n" \
        "testb  $1, %%al\n" \
        "je     6f\n" \
        "movl   (%%rsi), %%eax\n" \
        "movl   %%eax, (%%rdi)\n" \
        "6:\n" \
        /* rcx, rsi and rdi are all modified, so they are read-write      */ \
        /* operands; "memory" tells the compiler the buffers are touched. */ \
        : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr) \
        : \
        : "rax", "r8", "r9", "cc", "memory" \
    ); \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something 

Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get the _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell from my
testing, there was an issue with this optimizing pass and OPCODE_MUL.
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5, 8, and 12 to 5, 9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


possible-_mesa_remove_extra_moves-fix.patch
Description: Binary data
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 15:58:12 
> PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either.  I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 17:40:21 
> PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
should leave it to someone more experienced to make the call on which
one to choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>           What    |Removed                     |Added
> 
>  Attachment #35777|0                           |1
>        is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

>>>> +static inline GLint r600_log2(GLint n)
>>>> +{
>>>> +       GLint log2 = 0;
>>>> +
>>>> +       while (n >>= 1)
>>>> +               ++log2;
>>>> +       return log2;
>>>> +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((((x & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 9) & 0x55) |
               ((((((y & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 8) & 0xAA);

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring  just 13 
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((((x & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 9) & 0x55) |
>>                ((((((y & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 8) & 0xAA);
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 05:53:36 
> PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(8)
>      0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>      1  x: FRACT       ,  PV0.y
>      2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(10)
>      0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>      1  y: FRACT       ,  PV0.w
>      2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>      4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very, very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get the _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell from my
testing, there was an issue with this optimizing pass and OPCODE_MUL.
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5, 8, and 12 to 5, 9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


possible-_mesa_remove_extra_moves-fix.patch
Description: Binary data
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 15:58:12 
> PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either.  I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 17:40:21 
> PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
should leave it to someone more experienced to make the call on which
one to choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>           What    |Removed                     |Added
> 
>  Attachment #35777|0                           |1
>        is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

>>>> +static inline GLint r600_log2(GLint n)
>>>> +{
>>>> +       GLint log2 = 0;
>>>> +
>>>> +       while (n >>= 1)
>>>> +               ++log2;
>>>> +       return log2;
>>>> +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring  just 13 
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 05:53:36 
> PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(8)
>      0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>      1  x: FRACT       ,  PV0.y
>      2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(10)
>      0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>      1  y: FRACT       ,  PV0.w
>      2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>      4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very, very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) - copy `nr` 32-bit words from `src` to `dst`
 * with x86-64 inline assembly.
 *
 * The macro works on private copies of its arguments, so the caller's
 * `dst` and `src` expressions are evaluated once and are not advanced.
 * The copy runs forward; overlapping regions get no special handling.
 *
 * Strategy:
 *   1. If bit 2 of the destination address is set, copy a single dword so
 *      that the following 8-byte stores land on 8-byte boundaries
 *      (assumes dst is at least 4-byte aligned -- TODO confirm with callers).
 *   2. Main loop: 32 bytes (8 dwords) per iteration through r8/r9.
 *   3. Tail: the remaining 0..7 dwords are copied as 4 + 2 + 1 dwords,
 *      selected by the low three bits of the remaining count (kept in eax).
 *
 * Operand notes:
 *   - The count and both pointers are "+" (read-write) operands because the
 *     asm modifies ecx, rsi and rdi; input-only operands must not be changed.
 *   - "memory" tells the compiler that memory is read and written behind its
 *     back; "cc" covers the flag-setting instructions.
 *   - nr == 0 is filtered out in C: the alignment step would otherwise copy
 *     one dword and wrap the 32-bit count around.
 */
#define COPY_DWORDS( dst, src, nr )                                     \
do {                                                                    \
    const uint32_t *_src_ptr = (const uint32_t *)(src);                 \
    uint32_t *_dest_ptr = (uint32_t *)(dst);                            \
    uint32_t _size = (uint32_t)(nr);                                    \
                                                                        \
    if (_size != 0) {                                                   \
        __asm__ __volatile__ (                                          \
            /* step 1: bring the destination to an 8-byte boundary */  \
            "testb   $4, %%dil\n\t"                                     \
            "je      1f\n\t"                                            \
            "movl    (%%rsi), %%eax\n\t"                                \
            "addq    $4, %%rsi\n\t"                                     \
            "movl    %%eax, (%%rdi)\n\t"                                \
            "addq    $4, %%rdi\n\t"                                     \
            "decl    %%ecx\n"                                           \
            /* eax keeps the dword count, ecx becomes the loop count */ \
            "1:\n\t"                                                    \
            "movl    %%ecx, %%eax\n\t"                                  \
            "shrl    $3, %%ecx\n\t"                                     \
            "je      3f\n\t"                                            \
            ".p2align 4\n"                                              \
            /* step 2: 32 bytes per iteration */                        \
            "2:\n\t"                                                    \
            "movq    (%%rsi), %%r8\n\t"                                 \
            "movq    8(%%rsi), %%r9\n\t"                                \
            "addq    $32, %%rsi\n\t"                                    \
            "movq    %%r8, (%%rdi)\n\t"                                 \
            "movq    %%r9, 8(%%rdi)\n\t"                                \
            "addq    $32, %%rdi\n\t"                                    \
            "movq    -16(%%rsi), %%r8\n\t"                              \
            "movq    -8(%%rsi), %%r9\n\t"                               \
            "decl    %%ecx\n\t"                                         \
            "movq    %%r8, -16(%%rdi)\n\t"                              \
            "movq    %%r9, -8(%%rdi)\n\t"                               \
            "jnz     2b\n"                                              \
            /* step 3: tail of 0..7 dwords */                           \
            "3:\n\t"                                                    \
            "testb   $7, %%al\n\t"                                      \
            "je      6f\n\t"                                            \
            "testb   $4, %%al\n\t"                                      \
            "je      4f\n\t"                                            \
            "movq    (%%rsi), %%r8\n\t"                                 \
            "movq    8(%%rsi), %%r9\n\t"                                \
            "addq    $16, %%rsi\n\t"                                    \
            "movq    %%r8, (%%rdi)\n\t"                                 \
            "movq    %%r9, 8(%%rdi)\n\t"                                \
            "addq    $16, %%rdi\n"                                      \
            "4:\n\t"                                                    \
            "testb   $2, %%al\n\t"                                      \
            "je      5f\n\t"                                            \
            "movq    (%%rsi), %%r8\n\t"                                 \
            "addq    $8, %%rsi\n\t"                                     \
            "movq    %%r8, (%%rdi)\n\t"                                 \
            "addq    $8, %%rdi\n"                                       \
            "5:\n\t"                                                    \
            "testb   $1, %%al\n\t"                                      \
            "je      6f\n\t"                                            \
            "movl    (%%rsi), %%eax\n\t"                                \
            "movl    %%eax, (%%rdi)\n"                                  \
            "6:\n"                                                      \
            : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr)           \
            :                                                           \
            : "eax", "r8", "r9", "cc", "memory"                         \
        );                                                              \
    }                                                                   \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something 

inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) - copy `nr` 32-bit words from `src` to `dst`
 * with x86-64 inline assembly.
 *
 * The macro works on private copies of its arguments, so the caller's
 * `dst` and `src` expressions are evaluated once and are not advanced.
 * The copy runs forward; overlapping regions get no special handling.
 *
 * Strategy:
 *   1. If bit 2 of the destination address is set, copy a single dword so
 *      that the following 8-byte stores land on 8-byte boundaries
 *      (assumes dst is at least 4-byte aligned -- TODO confirm with callers).
 *   2. Main loop: 32 bytes (8 dwords) per iteration through r8/r9.
 *   3. Tail: the remaining 0..7 dwords are copied as 4 + 2 + 1 dwords,
 *      selected by the low three bits of the remaining count (kept in eax).
 *
 * Operand notes:
 *   - The count and both pointers are "+" (read-write) operands because the
 *     asm modifies ecx, rsi and rdi; input-only operands must not be changed.
 *   - "memory" tells the compiler that memory is read and written behind its
 *     back; "cc" covers the flag-setting instructions.
 *   - nr == 0 is filtered out in C: the alignment step would otherwise copy
 *     one dword and wrap the 32-bit count around.
 */
#define COPY_DWORDS( dst, src, nr )                                     \
do {                                                                    \
    const uint32_t *_src_ptr = (const uint32_t *)(src);                 \
    uint32_t *_dest_ptr = (uint32_t *)(dst);                            \
    uint32_t _size = (uint32_t)(nr);                                    \
                                                                        \
    if (_size != 0) {                                                   \
        __asm__ __volatile__ (                                          \
            /* step 1: bring the destination to an 8-byte boundary */  \
            "testb   $4, %%dil\n\t"                                     \
            "je      1f\n\t"                                            \
            "movl    (%%rsi), %%eax\n\t"                                \
            "addq    $4, %%rsi\n\t"                                     \
            "movl    %%eax, (%%rdi)\n\t"                                \
            "addq    $4, %%rdi\n\t"                                     \
            "decl    %%ecx\n"                                           \
            /* eax keeps the dword count, ecx becomes the loop count */ \
            "1:\n\t"                                                    \
            "movl    %%ecx, %%eax\n\t"                                  \
            "shrl    $3, %%ecx\n\t"                                     \
            "je      3f\n\t"                                            \
            ".p2align 4\n"                                              \
            /* step 2: 32 bytes per iteration */                        \
            "2:\n\t"                                                    \
            "movq    (%%rsi), %%r8\n\t"                                 \
            "movq    8(%%rsi), %%r9\n\t"                                \
            "addq    $32, %%rsi\n\t"                                    \
            "movq    %%r8, (%%rdi)\n\t"                                 \
            "movq    %%r9, 8(%%rdi)\n\t"                                \
            "addq    $32, %%rdi\n\t"                                    \
            "movq    -16(%%rsi), %%r8\n\t"                              \
            "movq    -8(%%rsi), %%r9\n\t"                               \
            "decl    %%ecx\n\t"                                         \
            "movq    %%r8, -16(%%rdi)\n\t"                              \
            "movq    %%r9, -8(%%rdi)\n\t"                               \
            "jnz     2b\n"                                              \
            /* step 3: tail of 0..7 dwords */                           \
            "3:\n\t"                                                    \
            "testb   $7, %%al\n\t"                                      \
            "je      6f\n\t"                                            \
            "testb   $4, %%al\n\t"                                      \
            "je      4f\n\t"                                            \
            "movq    (%%rsi), %%r8\n\t"                                 \
            "movq    8(%%rsi), %%r9\n\t"                                \
            "addq    $16, %%rsi\n\t"                                    \
            "movq    %%r8, (%%rdi)\n\t"                                 \
            "movq    %%r9, 8(%%rdi)\n\t"                                \
            "addq    $16, %%rdi\n"                                      \
            "4:\n\t"                                                    \
            "testb   $2, %%al\n\t"                                      \
            "je      5f\n\t"                                            \
            "movq    (%%rsi), %%r8\n\t"                                 \
            "addq    $8, %%rsi\n\t"                                     \
            "movq    %%r8, (%%rdi)\n\t"                                 \
            "addq    $8, %%rdi\n"                                       \
            "5:\n\t"                                                    \
            "testb   $1, %%al\n\t"                                      \
            "je      6f\n\t"                                            \
            "movl    (%%rsi), %%eax\n\t"                                \
            "movl    %%eax, (%%rdi)\n"                                  \
            "6:\n"                                                      \
            : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr)           \
            :                                                           \
            : "eax", "r8", "r9", "cc", "memory"                         \
        );                                                              \
    }                                                                   \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something we should
> just let gcc handle? It's pretty smart about inlining calls to memcpy.
> (Certainly your code is an improvement over "rep ; movsl" or "for ( j
> = 0 ; j < nr ; j++ ) dst[j] = ((int *)src)[j];" but really, why bother
> when gcc can do this in a much nicer more maintainable way?
>
> Matt
>

Matt,

GCC inlines memcpy with a REP MOVQ preceded by alignment checks for
byte, word and double word to align the destination on 8-byte
boundaries. It is also followed by further checks in case the number of
bytes is not evenly divisible by 4.

The inner loop is based on AMD's optimization guide's memcpy. This
also allows you to modify it to use the movnti instructions for
writing to the destination ram directly bypassing the cache if you
like.

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 
> 05:53:36 PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(8)
>      0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>      1  x: FRACT       ,  PV0.y
>      2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(10)
>      0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>      1  y: FRACT       ,  PV0.w
>      2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>      4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very, very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get the _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell from my
testing, there was an issue with this optimizing pass and OPCODE_MUL.
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
-- next part --
A non-text attachment was scrubbed...
Name: possible-_mesa_remove_extra_moves-fix.patch
Type: application/octet-stream
Size: 3520 bytes
Desc: not available
URL: 



Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 
> 15:58:12 PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either.  I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 
> 17:40:21 PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
should leave it to someone more experienced to make the call on which
one to choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>            What    |Removed                     |Added
> ----------------------------------------------------------------------------
>   Attachment #35777|0                           |1
>         is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring just 13
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote, in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr): copy nr 32-bit words from src to dst with
 * hand-scheduled x86-64 assembly.
 *
 *   dst - destination pointer; only bit 2 of the address is examined when
 *         aligning, so dst is assumed to be at least 4-byte aligned
 *   src - source pointer
 *   nr  - number of dwords to copy; 0 copies nothing
 *
 * Each argument is evaluated exactly once and none of them is modified.
 *
 * The copy runs in three stages:
 *   1. if dst is not 8-byte aligned, move a single dword so that it is;
 *   2. move 32 bytes (8 dwords) per iteration of the main loop;
 *   3. finish the remaining 0-7 dwords as 16-, 8- and 4-byte pieces,
 *      selected by the low three bits of the count saved in %eax.
 *
 * rcx/rsi/rdi are declared as read-write operands because the asm advances
 * them; "memory" and "cc" are clobbered because the asm stores through rdi
 * and changes the flags.
 */
#define COPY_DWORDS( dst, src, nr )                                     \
do {                                                                    \
   const uint32_t *_src_ptr = (const uint32_t *)(src);                  \
   uint32_t *_dest_ptr = (uint32_t *)(dst);                             \
   uint32_t _size = (nr);                                               \
                                                                        \
   /* A zero count must never reach the asm: the alignment step would   \
    * decrement it to 0xffffffff and the loop would run away. */        \
   if (_size != 0) {                                                    \
      __asm__ __volatile__ (                                            \
         "testb   $4, %%dil\n"                                          \
         "je      1f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "addq    $4, %%rsi\n"                                          \
         "movl    %%eax, (%%rdi)\n"                                     \
         "addq    $4, %%rdi\n"                                          \
         "decl    %%ecx\n"                                              \
         "1:\n"                                                         \
         "movl    %%ecx, %%eax\n"                                       \
         "shrl    $3, %%ecx\n"                                          \
         "je      3f\n"                                                 \
         ".p2align 4\n"                                                 \
         "2:\n"                                                         \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $32, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $32, %%rdi\n"                                         \
         "movq    -16(%%rsi), %%r8\n"                                   \
         "movq    -8(%%rsi), %%r9\n"                                    \
         "decl    %%ecx\n"                                              \
         "movq    %%r8, -16(%%rdi)\n"                                   \
         "movq    %%r9, -8(%%rdi)\n"                                    \
         "jnz     2b\n"                                                 \
         "3:\n"                                                         \
         "testb   $7, %%al\n"                                           \
         "je      6f\n"                                                 \
         "testb   $4, %%al\n"                                           \
         "je      4f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $16, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $16, %%rdi\n"                                         \
         "4:\n"                                                         \
         "testb   $2, %%al\n"                                           \
         "je      5f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "addq    $8, %%rsi\n"                                          \
         "movq    %%r8, (%%rdi)\n"                                      \
         "addq    $8, %%rdi\n"                                          \
         "5:\n"                                                         \
         "testb   $1, %%al\n"                                           \
         "je      6f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "movl    %%eax, (%%rdi)\n"                                     \
         "6:\n"                                                         \
         : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr)              \
         :                                                              \
         : "rax", "r8", "r9", "cc", "memory"                            \
      );                                                                \
   }                                                                    \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr ) ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> do { ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> uint32_t * _src_ptr; ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> uint32_t * _dest_ptr; ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> uint32_t _size; ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> ?_dest_ptr = (uint32_t *)dst; ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> ?_src_ptr = src; ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> ?_size =nr; ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> ? ? __asm__ __volatile__ ("testb ? ? ? $4, %%dil\n" ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 1f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movl ? (%%rsi), %%eax\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $4, %%rsi\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movl ? %%eax, (%%rdi)\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $4, %%rdi\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"decl ? %%ecx\n" ? ? ? ? ? ? ? ?\
>> "1: ? ? ? ? ? ? movl ? ?%%ecx, %%eax\n" ? ? ? ? \
>> ? ? ? ? ? ? ? ?"shrl ? $3, %%ecx\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 3f\n" ? ? ? ? ? ? ? ? ? \
>> ".p2align 4 \n" ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> "2: ? ? ? ? ? ? movq ? ?(%%rsi), %%r8\n" ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? 8(%%rsi), %%r9\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $32, %%rsi\n" ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? %%r8, (%%rdi)\n" ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r9, 8(%%rdi)\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $32, %%rdi\n" ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? -16(%%rsi), %%r8\n" ? ? \
>> ? ? ? ? ? ? ? ?"movq ? -8(%%rsi), %%r9\n" ? ? ?\
>> ? ? ? ? ? ? ? ?"decl ? %%ecx\n" ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r8, -16(%%rdi)\n" ? ? \
>> ? ? ? ? ? ? ? ?"movq ? %%r9, -8(%%rdi)\n" ? ? ?\
>> ? ? ? ? ? ? ? ?"jnz ? ?2b\n" ? ? ? ? ? ? ? ? ? \
>> "3: ? ? ? ? ? ? testb ?$7, %%al\n" ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 6f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"testb ?$4, %%al\n" ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"je ? ? 4f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? ?(%%rsi), %%r8\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"movq ? ?8(%%rsi), %%r9\n" ? ? ?\
>> ? ? ? ? ? ? ? ?"addq ? ?$16, %%rsi\n" ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r8, (%%rdi)\n" ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? %%r9, 8(%%rdi)\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? ?$16, %%rdi\n" ? ? ? ? ?\
>> "4: ? ? testb ? $2, %%al\n" ? ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"je ? ? ?5f\n" ? ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ? ?(%%rsi), %%r8\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? $8, %%rsi\n" ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"movq ?%%r8, (%%rdi)\n" ? ? ? ? \
>> ? ? ? ? ? ? ? ?"addq ? ?$8, %%rdi\n" ? ? ? ? ? \
>> "5: ? ?testb ? $1, %%al\n" ? ? ? ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?"je ? ? 6f\n" ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?"movl ? (%%rsi), %%eax\n" ? ? ? \
>> ? ? ? ? ? ? ? ?"movl ? %%eax, (%%rdi)\n" ? ? ? \
>> "6: \n" ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?: "=%c" (_size) ? ? ? ? ? ? ? ? \
>> ? ? ? ? ? ? ? ?: "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr) ? ? ? ? \
>> ? ? ? ? ? ? ? ?: "%eax", "%r8", "%r9" ? ? ? ? ? ? ? ? ?\
>> ? ? ? ? ? ? ? ?); ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something we should
> just let gcc handle? It's pretty smart about inlining calls to memcpy.
> (Certainly your code is an improvement over "rep ; movsl" or "for ( j
> = 0 ; j < nr ; j++ ) dst[j] = ((int *)src)[j];" but really, why bother
> when gcc can do this in a much nicer more maintainable way?
>
> Matt
>

Matt,

GCC inlines memcpy with a REP MOVQ preceded by alignment checks for
byte, word, and double word to align the destination on an 8-byte
boundary. It's also followed by further checks in case the number of
bytes is not evenly divisible by 4.

The inner loop is based on AMD's optimization guide's memcpy. This
also allows you to modify it to use the movnti instructions for
writing to the destination ram directly bypassing the cache if you
like.

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 
> 05:53:36 PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;  ?Disassembly 
> 00 ALU: ADDR(32) CNT(8)
> ? ? ?0 ?y: MULADD ? ? ?R123.y, ?R0.x, ?(0x3E22F983, 0.1591549367f).x,
> ?0.5
> ? ? ? ? z: MOV ? ? ? ? R0.z, ?0.0f
> ? ? ? ? w: MOV ? ? ? ? R0.w, ?1.0f
> ? ? ?1 ?x: FRACT ? ? ? , ?PV0.y
> ? ? ?2 ?z: MULADD ? ? ?R123.z, ?PV1.x, ?(0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
> ? ? ?3 ?t: SIN ? ? ? ? R0.x, ?PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;  ?Disassembly 
> 00 ALU: ADDR(32) CNT(10)
> ? ? ?0 ?y: MOV ? ? ? ? R0.y, ?0.0f
> ? ? ? ? z: MOV ? ? ? ? R0.z, ?1.0f
> ? ? ? ? w: MULADD ? ? ?R123.w, ?R0.x, ?(0x3E22F983, 0.1591549367f).x,
> ?0.5
> ? ? ?1 ?y: FRACT ? ? ? , ?PV0.w
> ? ? ?2 ?x: MULADD ? ? ?R123.x, ?PV1.y, ?(0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
> ? ? ?3 ?z: MUL ? ? ? ? , ?PV2.x, ?(0x3E22F983, 0.1591549367f).x
> ? ? ?4 ?t: SIN ? ? ? ? R0.x, ?PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

It is very strange that AMD would change the instruction. I
wonder if it is a bug in their code. Perhaps we need someone with
an r700 to run the sin and cos tests in piglit. The proposed patch
passes on my rs780 (rv610).

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get the _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell from my
testing, there was an issue with this optimization pass and OPCODE_MUL.
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
-- next part --
A non-text attachment was scrubbed...
Name: possible-_mesa_remove_extra_moves-fix.patch
Type: application/octet-stream
Size: 3520 bytes
Desc: not available
URL: 



Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
> ?And any special case added for MUL would seem to apply to any ALU
> instruction. ?That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

I couldn't agree more about the testing, and that is the stage I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users). None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 
> 15:58:12 PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either. ?I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> + ? ?setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> + ? ?pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> + ? ?pAsm->S[1].src.reg ? = tmp2;
> + ? ?setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> + ? ?setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> + ? ?pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> + ? ?pAsm->S[2].src.reg ? = tmp2;
> + ? ?setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 
> 17:40:21 PDT ---
> Created an attachment (id=35777)
> ?View: https://bugs.freedesktop.org/attachment.cgi?id=35777
> ?Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD ?tmp.x, angle, 1/(2*PI), 0.5
> FRACT ? tmp.x, tmp.x
> ADD ? ? tmp.y, tmp.x, 1
> CNDGE ? tmp.x, tmp.x, tmp.x, tmp.y
> MULADD ?tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
should leave it to someone more experienced to make the call on which
one to choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
> ? ? ? ? ? What ? ?|Removed ? ? ? ? ? ? ? ? ? ? |Added
> 
> ?Attachment #35777|0 ? ? ? ? ? ? ? ? ? ? ? ? ? |1
> ? ? ? ?is obsolete| ? ? ? ? ? ? ? ? ? ? ? ? ? ?|
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
> ?View: https://bugs.freedesktop.org/attachment.cgi?id=35787
> ?Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring just 13
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote, in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr): copy nr 32-bit words from src to dst with
 * hand-scheduled x86-64 assembly.
 *
 *   dst - destination pointer; only bit 2 of the address is examined when
 *         aligning, so dst is assumed to be at least 4-byte aligned
 *   src - source pointer
 *   nr  - number of dwords to copy; 0 copies nothing
 *
 * Each argument is evaluated exactly once and none of them is modified.
 *
 * The copy runs in three stages:
 *   1. if dst is not 8-byte aligned, move a single dword so that it is;
 *   2. move 32 bytes (8 dwords) per iteration of the main loop;
 *   3. finish the remaining 0-7 dwords as 16-, 8- and 4-byte pieces,
 *      selected by the low three bits of the count saved in %eax.
 *
 * rcx/rsi/rdi are declared as read-write operands because the asm advances
 * them; "memory" and "cc" are clobbered because the asm stores through rdi
 * and changes the flags.
 */
#define COPY_DWORDS( dst, src, nr )                                     \
do {                                                                    \
   const uint32_t *_src_ptr = (const uint32_t *)(src);                  \
   uint32_t *_dest_ptr = (uint32_t *)(dst);                             \
   uint32_t _size = (nr);                                               \
                                                                        \
   /* A zero count must never reach the asm: the alignment step would   \
    * decrement it to 0xffffffff and the loop would run away. */        \
   if (_size != 0) {                                                    \
      __asm__ __volatile__ (                                            \
         "testb   $4, %%dil\n"                                          \
         "je      1f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "addq    $4, %%rsi\n"                                          \
         "movl    %%eax, (%%rdi)\n"                                     \
         "addq    $4, %%rdi\n"                                          \
         "decl    %%ecx\n"                                              \
         "1:\n"                                                         \
         "movl    %%ecx, %%eax\n"                                       \
         "shrl    $3, %%ecx\n"                                          \
         "je      3f\n"                                                 \
         ".p2align 4\n"                                                 \
         "2:\n"                                                         \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $32, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $32, %%rdi\n"                                         \
         "movq    -16(%%rsi), %%r8\n"                                   \
         "movq    -8(%%rsi), %%r9\n"                                    \
         "decl    %%ecx\n"                                              \
         "movq    %%r8, -16(%%rdi)\n"                                   \
         "movq    %%r9, -8(%%rdi)\n"                                    \
         "jnz     2b\n"                                                 \
         "3:\n"                                                         \
         "testb   $7, %%al\n"                                           \
         "je      6f\n"                                                 \
         "testb   $4, %%al\n"                                           \
         "je      4f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $16, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $16, %%rdi\n"                                         \
         "4:\n"                                                         \
         "testb   $2, %%al\n"                                           \
         "je      5f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "addq    $8, %%rsi\n"                                          \
         "movq    %%r8, (%%rdi)\n"                                      \
         "addq    $8, %%rdi\n"                                          \
         "5:\n"                                                         \
         "testb   $1, %%al\n"                                           \
         "je      6f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "movl    %%eax, (%%rdi)\n"                                     \
         "6:\n"                                                         \
         : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr)              \
         :                                                              \
         : "rax", "r8", "r9", "cc", "memory"                            \
      );                                                                \
   }                                                                    \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something 

Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell with my
testing there was an issue with this optimizing pass and OPCODE_MUL .
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


possible-_mesa_remove_extra_moves-fix.patch
Description: Binary data
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 15:58:12 
> PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, The patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either.  I think its working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it 
>> on
>> my RadeonHD 3100 and its my first attempt at r600 assembly so let me know if 
>> it
>> works for you.
>>
>> Note: it could probably be done so its faster but I'm still not to sure on 
>> how
>> everything works yet in the software
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not 
> as
> expected, you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work without success for now. A 
> note
> is that there may be a mistake on the last operand to the CNDGT instruction :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 17:40:21 
> PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (lightly different of yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't known if it is better or worse than yours beside the fact that it use
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI than yours)
> which fix the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
leave it to someone more experienced to make the call on which one to
choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>           What    |Removed                     |Added
> 
>  Attachment #35777|0                           |1
>        is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Its on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring  just 13 operations

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
               ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring  just 13 
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((x & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 9) & 0x55 ) |
>>                ((((y & 0x07) * 0x1111 & 0x8421) * 0x1249 >> 8) & 0xAA );
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 05:53:36 
> PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(8)
>      0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>      1  x: FRACT       ,  PV0.y
>      2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(10)
>      0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>      1  y: FRACT       ,  PV0.w
>      2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>      4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This is very very strange that amd would change the instruction. I
wonder if it is a bug in their code   Perhaps we need someone with
an r700 to run the sin and cos tests in piglit . The proposed patch
passes on my rs780 (rv610) .

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
Hello everybody,

Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
anybody would like to use it. it could be slightly improved by writing
to 16 byte boundaries but its pretty near optimal when writing to
uncached ram.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr)
 *
 * Copy nr 32-bit words from src to dst with hand-written x86-64 assembly.
 * dst, src and nr are each evaluated once and copied into locals, so the
 * caller's pointers are left unchanged.
 *
 * Strategy:
 *   - if bit 2 of the destination address is set, copy one dword first so
 *     that a 4-byte-aligned dst becomes 8-byte aligned for the qword stores;
 *   - copy 32 bytes (8 dwords) per iteration of the main loop;
 *   - finish the remaining 0..7 dwords as 16-, 8- and 4-byte pieces.
 *
 * Notes on the operand list:
 *   - nr == 0 is skipped up front; otherwise a destination with bit 2 set
 *     would copy one dword and decrement the count from 0 to 0xffffffff;
 *   - rcx/rsi/rdi are read-write ("+") operands because the asm modifies
 *     all three;
 *   - "cc" and "memory" are clobbered because the asm changes the flags and
 *     reads/writes memory the compiler cannot see through the operands.
 */
#define COPY_DWORDS( dst, src, nr )					\
do {									\
   uint32_t *_dest_ptr = (uint32_t *)(dst);				\
   const uint32_t *_src_ptr = (const uint32_t *)(src);			\
   uint32_t _size = (nr);						\
									\
   if (_size != 0) {							\
      __asm__ __volatile__ (						\
	 /* align the destination to 8 bytes if needed */		\
	 "testb   $4, %%dil\n\t"					\
	 "je      1f\n\t"						\
	 "movl    (%%rsi), %%eax\n\t"					\
	 "addq    $4, %%rsi\n\t"					\
	 "movl    %%eax, (%%rdi)\n\t"					\
	 "addq    $4, %%rdi\n\t"					\
	 "decl    %%ecx\n"						\
	 /* eax keeps the remaining count; ecx = number of 32-byte groups */ \
	 "1:\n\t"							\
	 "movl    %%ecx, %%eax\n\t"					\
	 "shrl    $3, %%ecx\n\t"					\
	 "je      3f\n\t"						\
	 ".p2align 4\n"							\
	 /* main loop: 32 bytes per iteration */			\
	 "2:\n\t"							\
	 "movq    (%%rsi), %%r8\n\t"					\
	 "movq    8(%%rsi), %%r9\n\t"					\
	 "addq    $32, %%rsi\n\t"					\
	 "movq    %%r8, (%%rdi)\n\t"					\
	 "movq    %%r9, 8(%%rdi)\n\t"					\
	 "addq    $32, %%rdi\n\t"					\
	 "movq    -16(%%rsi), %%r8\n\t"					\
	 "movq    -8(%%rsi), %%r9\n\t"					\
	 "decl    %%ecx\n\t"						\
	 "movq    %%r8, -16(%%rdi)\n\t"					\
	 "movq    %%r9, -8(%%rdi)\n\t"					\
	 "jnz     2b\n"							\
	 /* tail: low three bits of the count select 4, 2 and 1 dwords */ \
	 "3:\n\t"							\
	 "testb   $7, %%al\n\t"						\
	 "je      6f\n\t"						\
	 "testb   $4, %%al\n\t"						\
	 "je      4f\n\t"						\
	 "movq    (%%rsi), %%r8\n\t"					\
	 "movq    8(%%rsi), %%r9\n\t"					\
	 "addq    $16, %%rsi\n\t"					\
	 "movq    %%r8, (%%rdi)\n\t"					\
	 "movq    %%r9, 8(%%rdi)\n\t"					\
	 "addq    $16, %%rdi\n"						\
	 "4:\n\t"							\
	 "testb   $2, %%al\n\t"						\
	 "je      5f\n\t"						\
	 "movq    (%%rsi), %%r8\n\t"					\
	 "addq    $8, %%rsi\n\t"					\
	 "movq    %%r8, (%%rdi)\n\t"					\
	 "addq    $8, %%rdi\n"						\
	 "5:\n\t"							\
	 "testb   $1, %%al\n\t"						\
	 "je      6f\n\t"						\
	 "movl    (%%rsi), %%eax\n\t"					\
	 "movl    %%eax, (%%rdi)\n"					\
	 "6:\n"								\
	 : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr)		\
	 :								\
	 : "rax", "r8", "r9", "cc", "memory"				\
      );								\
   }									\
} while (0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


inlined asm x86-64 COPY_DWORDS macro

2010-04-19 Thread Conn Clark
On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner  wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark  wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> ___
>> dri-devel mailing list
>> dri-devel at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something we should
> just let gcc handle? It's pretty smart about inlining calls to memcpy.
> (Certainly your code is an improvement over "rep ; movsl" or "for ( j
> = 0 ; j < nr ; j++ ) dst[j] = ((int *)src)[j];" but really, why bother
> when gcc can do this in a much nicer more maintainable way?
>
> Matt
>

Matt,

GCC inlines memcpy with a REP MOVQ preceded by alignment checks for
byte, word, and double word to align the destination to be on 8 byte
boundaries. It's also followed by further checks in case the number of
bytes is not evenly divisible by 4.

The inner loop is based on AMD's optimization guide's memcpy. This
also allows you to modify it to use the movnti instructions for
writing to the destination ram directly bypassing the cache if you
like.

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-06-08 Thread Conn Clark
On Tue, Jun 8, 2010 at 5:53 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #18 from Andre Maasikas  2010-06-08 
> 05:53:36 PDT ---
> dont' have much net this week to review/test:(
> but i'm ok with it if you make last mul conditional on r700 as
> it has -1..1 range it seems, also amd shader analyzer gives this difference:
>
> RV610 hd2400
>
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(8)
>      0  y: MULADD      R123.y,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>         z: MOV         R0.z,  0.0f
>         w: MOV         R0.w,  1.0f
>      1  x: FRACT       ,  PV0.y
>      2  z: MULADD      R123.z,  PV1.x,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  t: SIN         R0.x,  PV2.z
> 01 EXP_DONE: PIX0, R0.xzzw
> END_OF_PROGRAM
>
> 4870 RV770
> ;   Disassembly 
> 00 ALU: ADDR(32) CNT(10)
>      0  y: MOV         R0.y,  0.0f
>         z: MOV         R0.z,  1.0f
>         w: MULADD      R123.w,  R0.x,  (0x3E22F983, 0.1591549367f).x,
>  0.5
>      1  y: FRACT       ,  PV0.w
>      2  x: MULADD      R123.x,  PV1.y,  (0x40C90FDB, 6.283185482f).y,
> -(0x40490FDB, 3.141592741f).x
>      3  z: MUL         ,  PV2.x,  (0x3E22F983, 0.1591549367f).x
>      4  t: SIN         R0.x,  PV3.z
> 01 EXP_DONE: PIX0, R0.xyyz
> END_OF_PROGRAM
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This is very very strange that amd would change the instruction. I
wonder if it is a bug in their code   Perhaps we need someone with
an r700 to run the sin and cos tests in piglit . The proposed patch
passes on my rs780 (rv610) .

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-05 Thread Conn Clark
Hello,

Here is a possible fix/hack to get _mesa_remove_extra_moves function
in shader/prog_optimize.c usable. As far as I could tell with my
testing there was an issue with this optimizing pass and OPCODE_MUL .
I just added an exception for this one instruction and made it easy
to add others should further testing indicate they need to be added
too.

It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
also reduced the testing runtime from 234 seconds to 225 seconds.


I have only tested on my radeon hd 3100 based laptop but would like to
hear results from other types of cards too.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
-- next part --
A non-text attachment was scrubbed...
Name: possible-_mesa_remove_extra_moves-fix.patch
Type: application/octet-stream
Size: 3520 bytes
Desc: not available
URL: 



Possible fix to _mesa_remove_extra_moves function in shader/prog_optimize.c (request testing)

2010-05-06 Thread Conn Clark
On Thu, May 6, 2010 at 2:50 PM, Brian Paul  wrote:
> Conn Clark wrote:
>>
>> Hello,
>>
>> Here is a possible fix/hack to get _mesa_remove_extra_moves function
>> in shader/prog_optimize.c usable. As far as I could tell with my
>> testing there was an issue with this optimizing pass and OPCODE_MUL .
>> I just added an exception to for this one instruction and made it easy
>> to add others should further testing indicate they need to be added
>> too.
>>
>> It bumped my Nexuiz scores on demo1 from 5,8,and 12 to 5,9, and 13. It
>> also reduced the testing runtime from 234 seconds to 225 seconds.
>>
>>
>> I have only tested on my radeon hd 3100 based laptop but would like to
>> hear results from other types of cards too.
>
> I'm a bit nervous about enabling that function without a _lot_ more testing.
>  And any special case added for MUL would seem to apply to any ALU
> instruction.  That tells me that there's probably other issues to shake out
> of the code before we can enable it.
>
> If you're interested, you should at least run the glean and piglits tests
> which exercise shaders and GPU programs.
>
> -Brian
>

Brian,

 I couldn't agree more about the testing and that is the stage where I
am at. If you read my patch you probably noticed that I had a few ALU
instructions ready to drop into the problematic slot. Of course there
is still the chance that the MUL problem is with the R600/R700 support
itself. So far I have had only 3 testers besides myself (all of them
radeon users).  None of them has reported any problems yet. I will run
piglit and try to get glean running, however.

Please don't take my current work as a push to get it included yet.

Thanks for your original work.

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-19 Thread Conn Clark
On Wed, May 19, 2010 at 3:58 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #4 from Alain Perrot  2010-05-19 
> 15:58:12 PDT ---
> (In reply to comment #3)
>> Alain,
>>
>> Okay, the patch I just posted might fix this bug. It doesn't cause any
>> additional errors in piglit either. I think it's working right with
>> http://github.com/jckarter/hello-gl-ch3 too. Of course I have only tested it
>> on my RadeonHD 3100 and it's my first attempt at r600 assembly, so let me
>> know if it works for you.
>>
>> Note: it could probably be done so it's faster, but I'm still not too sure
>> on how everything works yet in the software.
>>
>> Conn
>
> Thanks for your help.
>
> Unfortunately, your patch does not fix the cos/sin functions (at least on my
> Radeon HD 3870 / RV670). The hello-gl-ch3 example works better but still not
> as expected; you can compare with Mesa software rendering.
>
> I tried to play with your patch to make it work, without success for now. One
> note is that there may be a mistake in the last operand to the CNDGT
> instruction:
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[1].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> should probably be :
>
> +    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
> +    pAsm->S[2].src.rtype = DST_REG_TEMPORARY;
> +    pAsm->S[2].src.reg   = tmp2;
> +    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
>
> But this update does not make the cos/sin functions work.
>
> I will try again tomorrow.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

Okay I'll look at it some more tonight. At least I know I'm on the
right track. Thanks for testing.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-20 Thread Conn Clark
On Thu, May 20, 2010 at 5:40 PM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> --- Comment #8 from Alain Perrot  2010-05-20 
> 17:40:21 PDT ---
> Created an attachment (id=35777)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35777
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35777
>
> Alternative assemble_TRIG fix
>
> I can confirm that your patch seems to work for me too.
>
> By the way, you beat me at posting a working patch here :-)
>
> I also figured out that the 0.5 special constant was an issue in your patch,
> and I managed to get a working assemble_TRIG function which implements the
> following instruction sequence (slightly different from yours) to normalize the
> angle:
>
> MULADD  tmp.x, angle, 1/(2*PI), 0.5
> FRACT   tmp.x, tmp.x
> ADD     tmp.y, tmp.x, 1
> CNDGE   tmp.x, tmp.x, tmp.x, tmp.y
> MULADD  tmp.x, tmp.x, 2*PI, -PI
>
> I don't know if it is better or worse than yours, besides the fact that it uses
> only one helper variable.
>
> I attached my patch (updated to use the same extended value of PI as yours)
> which fixes the assemble_TRIG function, but not the assemble_SCS one.
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's a tough call on whose is the better solution. Yours uses one less
temp reg and mine will allow for a couple of operations to be done in
parallel in the future. I guess we both deserve a pat on the back and
should leave it to someone more experienced to make the call on which
one to choose.

Good job

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[Bug 27901] GLSL cos/sin functions broken on Mesa R600 driver

2010-05-21 Thread Conn Clark
On Fri, May 21, 2010 at 11:13 AM,   wrote:
> https://bugs.freedesktop.org/show_bug.cgi?id=27901
>
> Alain Perrot  changed:
>
>            What    |Removed                     |Added
> 
>  Attachment #35777|0                           |1
>         is obsolete|                            |
>
> --- Comment #12 from Alain Perrot  2010-05-21 
> 11:13:05 PDT ---
> Created an attachment (id=35787)
>  View: https://bugs.freedesktop.org/attachment.cgi?id=35787
>  Review: https://bugs.freedesktop.org/review?bug=27901&attachment=35787
>
> Alternative assemble_TRIG and assemble_SCS fix
>
> Conn,
>
> Attached is the updated patch which includes the assemble_SCS function.
> If it is ok for you, I will submit it (I guess that it should be sent to the
> dri-devel mailing list ?)
>
> --
> Configure bugmail: https://bugs.freedesktop.org/userprefs.cgi?tab=email
> --- You are receiving this mail because: ---
> You are the assignee for the bug.
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

Alain,

It's on the mailing list.

I'll inform them to merge it after I run piglit and verify it works.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
> Alex Deucher wrote:
>>
>> On Thu, May 27, 2010 at 10:55 AM, Matt Turner  wrote:

 +static inline GLint r600_log2(GLint n)
 +{
 +       GLint log2 = 0;
 +
 +       while (n >>= 1)
 +               ++log2;
 +       return log2;
 +}
>>>
>>> Does mesa not provide something like this?
>>
>> The only one I could find was a gallium utility function.
>
> There's a logbase2() function in teximage.c but it might not be equivalent.
>
> -Brian
>
> ___
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

This code could be written with a faster algorithm requiring just 13 operations:

+   pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+   pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+   pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+   pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+   pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+   pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]



/* suitable for all 16 bit or greater processors that can do an
unsigned 16 bit or greater multiply */
/*  tested and verified  */

pixel_number = ((((((x & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 9) & 0x55) |
               ((((((y & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 8) & 0xAA);

Note if it is known that x and y are less than or equal to 7 it can be
done in 11 operations.

Cheers

Conn
-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


[PATCH 1/3] r600: add span support for 2D tiling

2010-05-27 Thread Conn Clark
On Thu, May 27, 2010 at 4:01 PM, Frieder Ferlemann
 wrote:
> Hi,
>
> Am 28.05.2010 00:04, schrieb Conn Clark:
>> On Thu, May 27, 2010 at 8:51 AM, Brian Paul  wrote:
>>
>> This code could be written with a faster algorithm requiring just 13
>> operations
>>
>> +               pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
>> +               pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
>> +               pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
>> +               pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
>> +               pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
>> +               pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
>>
>
>
>> /* suitable for all 16 bit or greater processors that can do an
>> unsigned 16 bit or greater multiply */
>> /*  tested and verified  */
>>
>> pixel_number = ((((((x & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 9) & 0x55) |
>>                ((((((y & 0x07) * 0x1111) & 0x8421) * 0x1249) >> 8) & 0xAA);
>>
>> Note if it is known that x and y are less than or equal to 7 it can be
>> done in 11 operations.
>
> Cool. How does it compare to:
>
>        const unsigned char /*int*/ spread_bits[8] = {
>                0x00,  /* 0b000 to 0b00000 */
>                0x01,  /* 0b001 to 0b00001 */
>                0x04,  /* 0b010 to 0b00100 */
>                0x05,  /* 0b011 to 0b00101 */
>                0x10,  /* 0b100 to 0b10000 */
>                0x11,  /* 0b101 to 0b10001 */
>                0x14,  /* 0b110 to 0b10100 */
>                0x15,  /* 0b111 to 0b10101 */
>        };
>
>        pixel_number |= spread_bits[x & 0x07];
>        pixel_number |= spread_bits[y & 0x07] << 1;
>
>
> Greetings,
> Frieder
>

Look up tables have some hidden penalties but I think it might be a
win. Looks like we may have to benchmark the solutions against one
another to really know which is best in real life.

Conn

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.


  1   2   >