On 28-12-2021 23:35, Martin Frb via lazarus wrote:


"nx" has a single "1" in each of the 8 bytes in a Qword (based on 64bit).
If we regard each of these bytes as an entity of its own, then we can keep adding those "1"s.

I was also thinking in that direction, but more about how to optimize that loop using SSE2.

Some simple masking achieves the same (a 1 for each byte whose top two bits are %10) in 5 instructions, including the load.

Since 64-bit always supports SSE2, this could work:


{$mode objfpc}{$H+}
{$asmmode intel}

uses sysutils,strutils;

// 16 bytes of storage, declared as two int64 halves; utf8length stores its
// 16 per-lane byte counters into a variable of this type.
Type int128 = array[0..1] of int64;

// 16-byte constants loaded into XMM registers by utf8length.
const
     // $C0 in every byte: ANDed with the input to keep only the top two bits.
     mask3       :  array[0..15] of byte  = ( $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0);

      // $80 in every byte: the value the masked input is compared against
      // (top two bits equal to %10).
      mask4       :  array[0..15] of byte  = (   $80,$80,$80,$80,
                                                 $80,$80,$80,$80,
                                                 $80,$80,$80,$80,
                                                 $80,$80,$80,$80);


      // $01 in every byte: ANDed with the compare result to turn $FF/$00
      // into 1/0 per byte.
      mask2       :  array[0..15] of byte  = ( $1,$1,$1,$1,
                         $1,$1,$1,$1,
                                                 $1,$1,$1,$1,
                                                 $1,$1,$1,$1);

function utf8length(const s : pchar;var res:int128;len:integer):integer;
// Counts the bytes whose top two bits are %10 (UTF-8 continuation bytes)
// in the len*16 bytes starting at s, 16 bytes per iteration, using SSE2.
//
//   s    : start of the buffer; len*16 bytes must be readable. No alignment
//          is required (unaligned loads are used).
//   res  : receives 16 independent 8-bit counters, one per byte lane.
//   len  : number of 16-byte blocks to scan. Every lane can gain at most 1
//          per block, so the 8-bit counters wrap silently when len > 255.
//          len <= 0 scans nothing and zeroes res.
//
// Returns the sum of the 16 counters in res, i.e. the total number of
// matching bytes (only meaningful while len <= 255).
var
  i : integer;
begin
  // Guard the empty case: the asm loop below is a do-while and would
  // otherwise decrement 0 to -1 and run far past the end of the buffer.
  if len<=0 then
    begin
      res[0]:=0;
      res[1]:=0;
      exit(0);
    end;

 asm
  // Load the parameters by name into scratch registers that are not
  // parameter registers in either the Win64 or the System V ABI, so the
  // code does not depend on which register each argument arrived in.
  mov   rax,s            // rax = current read position
  mov   r10,res          // r10 = address of res (var parameter)
  mov   r11d,len         // r11d = remaining blocks; len is 32-bit, so use
                         // the 32-bit register only

  movdqu xmm1,[rip+mask3]         // movdqu is plain SSE2 (lddqu is the SSE3 one)
  movdqu xmm2,[rip+mask4]
  movdqu xmm3,[rip+mask2]
  pxor xmm4,xmm4                  // clear the 16 byte counters

@block:
  movdqu xmm0,[rax]
  pand  xmm0,xmm1      // keep only the top 2 bits of each byte ($C0)
  pcmpeqb xmm0,xmm2    // compare with $80: byte becomes $FF on match, else $00
  pand  xmm0,xmm3      // reduce $FF/$00 to 1/0 per byte
  paddb  xmm4,xmm0     // add to the per-lane counters

  add rax,16
  dec r11d
  jne @block

  movdqu [r10],xmm4

 end ['RAX','R10','R11']; // XMM0..XMM4 are also overwritten; they are volatile
                          // in both x86-64 ABIs and hold nothing live here.

  // Fold the 16 lane counters into the function result.
  result:=0;
  for i:=0 to 15 do
    inc(result,PByte(@res)[i]);
end;

function countmask(nx:int64):integer;
// Returns the sum of the 8 individual bytes of nx, treating each byte as an
// unsigned counter. Done as a three-step pairwise reduction:
// bytes -> 16-bit sums -> 32-bit sums -> total.
const
  EvenBytes = $00FF00FF00FF00FF;
  EvenWords = $0000FFFF0000FFFF;
  LowDword  = $00000000FFFFFFFF;
var
  wordsums, dwordsums : int64;
begin
  // add each odd byte to its even neighbour: four 16-bit partial sums
  wordsums  := ((nx shr 8) and EvenBytes) + (nx and EvenBytes);
  // add each odd word to its even neighbour: two 32-bit partial sums
  dwordsums := ((wordsums shr 16) and EvenWords) + (wordsums and EvenWords);
  // add the two halves together
  result := ((dwordsums shr 32) and LowDword) + (dwordsums and LowDword);
end;


// One byte for each possible value of the top two bits (%11, %10, %00, %01).
// Only pattern[1] (%10......) is a byte that utf8length counts.
const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001),
chr(%00001001),chr(%01001001));

// Number of 16-byte blocks in the test string.
const testblocks = 5;

var s : string;
    i,j,cnt : integer;
    r : int128;

begin
  randomize;
  setlength(s,testblocks*16);
  // Fill the string with random pattern bytes and keep a reference count of
  // the bytes whose top two bits are %10.
  cnt:=0;
  for i:=0 to testblocks*16-1 do
    begin
      j:=random(4);
      if j=1 then inc(cnt);
      s[i+1]:=pattern[j];
    end;

  // Scan exactly the testblocks blocks that the string contains. Passing
  // testblocks+1 would make the routine read 16 bytes beyond the string
  // data, and whatever is there could be counted as well.
  utf8length(pchar(s),r,testblocks);

  // Both numbers must be equal: reference count vs. sum of the 16 counters.
  writeln(cnt,' = ',countmask(r[0])+countmask(r[1]));
//  writeln(inttohex(r[0],16));
//  writeln(inttohex(r[1],16));

end.



--
_______________________________________________
lazarus mailing list
lazarus@lists.lazarus-ide.org
https://lists.lazarus-ide.org/listinfo/lazarus

Reply via email to