On Mon, Dec 27, 2021 at 10:02 PM Noel Duffy via lazarus <lazarus@lists.lazarus-ide.org> wrote:
> It's not just the euro, though. It's any utf-8 sequence. What I meant was that a single '€' (or any other single UTF8 "character") will not enter the mentioned block. Can you add some debug statements to display the values it uses in the calculation like I did in the 4th message in this thread? function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt; const {$ifdef CPU32} ONEMASK =$01010101; EIGHTYMASK=$80808080; {$endif} {$ifdef CPU64} ONEMASK =$0101010101010101; EIGHTYMASK=$8080808080808080; {$endif} var pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop nx: PtrInt; // values processed in block loop i,cnt,e: PtrInt; begin Result := 0; e := ix+ByteCount; // End marker // Handle any initial misaligned bytes. cnt := (not (ix-1)) and (sizeof(PtrInt)-1); if cnt>ByteCount then cnt := ByteCount; for i := 1 to cnt do begin // Is this byte NOT the first byte of a character? writeln('pn8^ = ',byte(pn8^).ToBinString); writeln('pn8^ shr 7 = ',Byte(Byte(pn8^) shr 7).ToBinString); writeln('not pn8^ = ',Byte(not pn8^).ToBinString); writeln('(not pn8^) shr 6 = ',Byte((not pn8^) shr 6).ToBinString); writeln; Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; // Handle complete blocks for i := 1 to (ByteCount-cnt) div sizeof(PtrUInt) do begin // Count bytes which are NOT the first byte of a character. { nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6); {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow. Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8); {$pop} } nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6); Result := Result + PopCnt(PtrUInt(nx)); inc(pnx); end; // Take care of any left-over bytes. while ix<e do begin // Is this byte NOT the first byte of a character? 
writeln('pn8^ = ',byte(pn8^).ToBinString); writeln('pn8^ shr 7 = ',Byte(Byte(pn8^) shr 7).ToBinString); writeln('not pn8^ = ',Byte(not pn8^).ToBinString); writeln('(not pn8^) shr 6 = ',Byte((not pn8^) shr 6).ToBinString); writeln; //writeln('',); Result += (pn8^ shr 7) and ((not pn8^) shr 6); inc(pn8); end; Result := ByteCount - Result; end; (Just put this in the main unit, no need to change and rebuild the LazUtf8 unit) Make sure your app has a console. Then just do something like: S := '€'; Len := Utf8LengthFast(PChar(S), Length(S)); It should output something like: pn8^ = 11100010 pn8^ shr 7 = 00000001 not pn8^ = 00011101 (not pn8^) shr 6 = 00000000 pn8^ = 10000010 pn8^ shr 7 = 00000001 not pn8^ = 01111101 (not pn8^) shr 6 = 00000001 pn8^ = 10101100 pn8^ shr 7 = 00000001 not pn8^ = 01010011 (not pn8^) shr 6 = 00000001 Notice that '€' in UTF8 is the byte sequence 11100010 10000010 10101100 (the values you should see in pn8^). -- Bart -- _______________________________________________ lazarus mailing list lazarus@lists.lazarus-ide.org https://lists.lazarus-ide.org/listinfo/lazarus