--- In [email protected], "darkxerion" <[EMAIL PROTECTED]> wrote:
>
> Hello all,
>
> Like the title, there is any way to find a string in a file or buffer
?
> the fastest one I hope ?
>
> Thanks,
>
Your basic loop to scan a file is pretty simple and
will look like this:

function StringInFile(strFind, strFileName: string): boolean;
const
   BUFSIZE = 8192;
var
   fstm: TFileStream;
   numread: Longint;
   buffer: array [0..BUFSIZE-1] of char;
   szFind: array [0..255] of char;
   found: boolean;
begin
   StrPCopy(szFind, strFind);
   found := False;
   fstm := TFileStream.Create(strFileName, fmOpenRead);
   repeat
     numread := fstrm.Read(Buffer, BUFSIZE);
     if BMFind(szFind, Buffer, numread) >= 0 then
       found := True
     else if numread = BUFSIZE then // more to scan
       fstm.Position := fstmPosition - (Length(strFind)-1);
   until found or (numread < BUFSIZE);
   fstm.Free;
   Result := found;
end;

1) The reason for backing up fstm.Position by nearly the length
of strFind is in case strFind crosses buffer boundaries.

2) The "BMFind" function used above is a Boyer-Moore search
as shown below. This is the fastest string search known.

function BMFind(szSubStr, buf: PChar; iBufSize: integer): integer;
{ Returns -1 if substring not found,
   or zero-based index into buffer if substring found }
var
   iSubStrLen: integer;
   skip: array [char] of integer;
   found: boolean;
   iMaxSubStrIdx: integer;
   iSubStrIdx: integer;
   iBufIdx: integer;
   iScanSubStr: integer;
   mismatch: boolean;
   iBufScanStart: integer;
   ch: char;
begin
   { Initialisations }
   found := False;
   Result := -1;
   { Check if trivial scan for empty string }
   iSubStrLen := StrLen(szSubStr);
   if iSubStrLen = 0 then
   begin
     Result := 0;
     Exit
   end;

   iMaxSubStrIdx := iSubStrLen - 1;
   { Initialise the skip table }
   for ch := Low(skip) to High(skip) do skip[ch] := iSubStrLen;
   for iSubStrIdx := 0 to (iMaxSubStrIdx - 1) do
     skip[szSubStr[iSubStrIdx]] := iMaxSubStrIdx - iSubStrIdx;

   { Scan the buffer, starting comparisons at the end of the substring }
   iBufScanStart := iMaxSubStrIdx;
   while (not found) and (iBufScanStart < iBufSize) do
   begin
     iBufIdx := iBufScanStart;
     iScanSubStr := iMaxSubStrIdx;
     repeat
       mismatch := (szSubStr[iScanSubStr] <> buf[iBufIdx]);
       if not mismatch then
         if iScanSubStr > 0 then
         begin // more characters to scan
           Dec(iBufIdx); Dec(iScanSubStr)
         end
         else
           found := True;
     until mismatch or found;
     if found then
       Result := iBufIdx
     else
       iBufScanStart := iBufScanStart + skip[buf[iBufScanStart]];
   end;
end;

I have included a "wholeword_only" flag n the BMFind below.
This confirms or rejects the "found" result, and will
cause the loop to keep searching if match is rejected.

function BMFind(szSubStr, buf: PChar; iBufSize: integer;
                 wholeword_only: boolean): integer;
{ Returns -1 if substring not found,
   or zero-based index into buffer if substring found }
var
   iSubStrLen: integer;
   skip: array [char] of integer;
   found: boolean;
   iMaxSubStrIdx: integer;
   iSubStrIdx: integer;
   iBufIdx: integer;
   iScanSubStr: integer;
   mismatch: boolean;
   iBufScanStart: integer;
   ch: char;
begin
   found := False;
   Result := -1;
   iSubStrLen := StrLen(szSubStr);
   if iSubStrLen = 0 then
   begin
     Result := 0;
     Exit
   end;

   iMaxSubStrIdx := iSubStrLen - 1;
   { Initialise the skip table }
   for ch := Low(skip) to High(skip) do skip[ch] := iSubStrLen;
   for iSubStrIdx := 0 to (iMaxSubStrIdx - 1) do
     skip[szSubStr[iSubStrIdx]] := iMaxSubStrIdx - iSubStrIdx;

   { Scan the buffer, starting comparisons at the end of the substring }
   iBufScanStart := iMaxSubStrIdx;
   while (not found) and (iBufScanStart < iBufSize) do
   begin
     iBufIdx := iBufScanStart;
     iScanSubStr := iMaxSubStrIdx;
     repeat
       mismatch := (szSubStr[iScanSubStr] <> buf[iBufIdx]);
       if not mismatch then
         if iScanSubStr > 0 then
         begin // more characters to scan
           Dec(iBufIdx); Dec(iScanSubStr)
         end
         else
           found := True;
     until mismatch or found;
     if found and wholeword_only then
     begin
       if (iBufIdx > 0) then
         found := not IsCharAlpha(buf[iBufIdx - 1]);
       if found then
         if iBufScanStart < (iBufSize - 1) then
           found := not IsCharAlpha(buf[iBufScanStart + 1]);
     end;
     if found then
       Result := iBufIdx
     else
       iBufScanStart := iBufScanStart + skip[buf[iBufScanStart]];
   end;
end;

Obviously you'll be tempted to increase BUFSIZE on the assumption
that it will improve performance. My experience is that it does
not, and that 8K is pretty optimum.


Reply via email to