--- In [email protected], "darkxerion" <[EMAIL PROTECTED]> wrote:
>
> Hello all,
>
> Like the title, there is any way to find a string in a file or buffer
?
> the fastest one I hope ?
>
> Thanks,
>
Your basic loop to scan a file is pretty simple and
will look like this:
function StringInFile(strFind, strFileName: string): boolean;
const
BUFSIZE = 8192;
var
fstm: TFileStream;
numread: Longint;
buffer: array [0..BUFSIZE-1] of char;
szFind: array [0..255] of char;
found: boolean;
begin
StrPCopy(szFind, strFind);
found := False;
fstm := TFileStream.Create(strFileName, fmOpenRead);
repeat
numread := fstrm.Read(Buffer, BUFSIZE);
if BMFind(szFind, Buffer, numread) >= 0 then
found := True
else if numread = BUFSIZE then // more to scan
fstm.Position := fstmPosition - (Length(strFind)-1);
until found or (numread < BUFSIZE);
fstm.Free;
Result := found;
end;
1) The reason for backing up fstm.Position by nearly the length
of strFind is in case strFind crosses buffer boundaries.
2) The "BMFind" function used above is a Boyer-Moore search
as shown below. This is the fastest string search known.
function BMFind(szSubStr, buf: PChar; iBufSize: integer): integer;
{ Returns -1 if substring not found,
or zero-based index into buffer if substring found }
var
iSubStrLen: integer;
skip: array [char] of integer;
found: boolean;
iMaxSubStrIdx: integer;
iSubStrIdx: integer;
iBufIdx: integer;
iScanSubStr: integer;
mismatch: boolean;
iBufScanStart: integer;
ch: char;
begin
{ Initialisations }
found := False;
Result := -1;
{ Check if trivial scan for empty string }
iSubStrLen := StrLen(szSubStr);
if iSubStrLen = 0 then
begin
Result := 0;
Exit
end;
iMaxSubStrIdx := iSubStrLen - 1;
{ Initialise the skip table }
for ch := Low(skip) to High(skip) do skip[ch] := iSubStrLen;
for iSubStrIdx := 0 to (iMaxSubStrIdx - 1) do
skip[szSubStr[iSubStrIdx]] := iMaxSubStrIdx - iSubStrIdx;
{ Scan the buffer, starting comparisons at the end of the substring }
iBufScanStart := iMaxSubStrIdx;
while (not found) and (iBufScanStart < iBufSize) do
begin
iBufIdx := iBufScanStart;
iScanSubStr := iMaxSubStrIdx;
repeat
mismatch := (szSubStr[iScanSubStr] <> buf[iBufIdx]);
if not mismatch then
if iScanSubStr > 0 then
begin // more characters to scan
Dec(iBufIdx); Dec(iScanSubStr)
end
else
found := True;
until mismatch or found;
if found then
Result := iBufIdx
else
iBufScanStart := iBufScanStart + skip[buf[iBufScanStart]];
end;
end;
I have included a "wholeword_only" flag n the BMFind below.
This confirms or rejects the "found" result, and will
cause the loop to keep searching if match is rejected.
function BMFind(szSubStr, buf: PChar; iBufSize: integer;
wholeword_only: boolean): integer;
{ Returns -1 if substring not found,
or zero-based index into buffer if substring found }
var
iSubStrLen: integer;
skip: array [char] of integer;
found: boolean;
iMaxSubStrIdx: integer;
iSubStrIdx: integer;
iBufIdx: integer;
iScanSubStr: integer;
mismatch: boolean;
iBufScanStart: integer;
ch: char;
begin
found := False;
Result := -1;
iSubStrLen := StrLen(szSubStr);
if iSubStrLen = 0 then
begin
Result := 0;
Exit
end;
iMaxSubStrIdx := iSubStrLen - 1;
{ Initialise the skip table }
for ch := Low(skip) to High(skip) do skip[ch] := iSubStrLen;
for iSubStrIdx := 0 to (iMaxSubStrIdx - 1) do
skip[szSubStr[iSubStrIdx]] := iMaxSubStrIdx - iSubStrIdx;
{ Scan the buffer, starting comparisons at the end of the substring }
iBufScanStart := iMaxSubStrIdx;
while (not found) and (iBufScanStart < iBufSize) do
begin
iBufIdx := iBufScanStart;
iScanSubStr := iMaxSubStrIdx;
repeat
mismatch := (szSubStr[iScanSubStr] <> buf[iBufIdx]);
if not mismatch then
if iScanSubStr > 0 then
begin // more characters to scan
Dec(iBufIdx); Dec(iScanSubStr)
end
else
found := True;
until mismatch or found;
if found and wholeword_only then
begin
if (iBufIdx > 0) then
found := not IsCharAlpha(buf[iBufIdx - 1]);
if found then
if iBufScanStart < (iBufSize - 1) then
found := not IsCharAlpha(buf[iBufScanStart + 1]);
end;
if found then
Result := iBufIdx
else
iBufScanStart := iBufScanStart + skip[buf[iBufScanStart]];
end;
end;
Obviously you'll be tempted to increase BUFSIZE on the assumption
that it will improve performance. My experience is that it does
not, and that 8K is pretty optimum.