Sivakatirswami wrote:
>
> I would like to make a small spider that would crawl over a section of our
> web site, check for the existence of certain files, if true then load and
> parse certain other html files and do other "stuff."
>
> so, is it possible to load a directory and not just a file?
>
> i.e. get a line-delimited list as the result of some URL command instead of
> a file?
>
> Hinduism Today
>
> Sivakatirswami
> Editor's Assistant/Production Manager
> www.HinduismToday.com
> [EMAIL PROTECTED]
Hi Sivakatirswami,
I'm working on something like that; it is not really finished yet. See if
the script below can help.
-- mrl1: fetch one page, index its text, harvest its links, then hand over
-- to mrl3 so the next pending page gets scanned.
--
-- Reads : fld "Scan" (URL to fetch; a built-in default is used when empty),
--         fld "HBuffy" (list of URLs still to visit / already visited).
-- Writes: globals Lurl (normalised start URL), Nref (URL being fetched),
--         Rref (links found on the page, one per line),
--         file "RefMerl" (index record for the page),
--         fld "Buffy" / fld "HBuffy" (results and work list).
-- Calls : mrl2 (makes the harvested links absolute), mrl3 (picks next URL).
on mrl1
  global Lurl,Rref,Nref
  set the cursor to watch

  -- Choose the URL to scan.
  if fld "Scan" is not empty
  then put fld "Scan" into Lurl
  else put "http://sam.w-max.com" into Lurl
  put empty into Nref
  put empty into Rref
  put Lurl into Nref

  -- A URL whose last "/"-item has no "." is treated as a directory:
  -- make sure it ends with a slash before fetching it.
  set itemdelimiter to "/"
  if "." is not in last item of Nref
  then if last char of Nref is not "/"
  then put "/" after Nref
  put Nref into Lurl
  put url Nref into fref

  -- Pull the text of the KEYWORDS meta tag (quoted form first, then the
  -- unquoted form) into indexs: everything from the tag name up to the
  -- closing ">".
  put empty into indexs
  put "1" into d
  put "META NAME=" & quote & "KEYWORDS" into i
  get offset(i,fref,d)
  if it is not "0" then
    put it+length(i)-1 into d
    get offset (">",fref,d)
    if it is not "0" then
      put char d to it+d-1 of fref into indexs
      put it+d into d
    end if
  else
    put "META NAME=KEYWORDS" into i
    get offset (i,fref,d)
    if it is not "0" then
      put it+length(i)-1 into d
      get offset (">",fref,d)
      if it is not "0" then
        put char d to it+d-1 of fref into indexs
        put it+d into d
      end if
    end if
  end if

  -- Drop everything up to and including the last "=" (the attribute
  -- names), then remove every quote character.
  repeat
    get offset("=",indexs)
    if it is not "0"
    then put empty into char 1 to it of indexs
    else exit repeat
  end repeat
  repeat
    get offset(quote,indexs)
    if it is not "0"
    then put empty into char it of indexs
    else exit repeat
  end repeat

  -- Walk the table cells of a working copy of the page. A cell whose
  -- content is longer than 100 chars and contains no "<" is appended to
  -- the index text. The loop stops at the first "<td" that has no later
  -- "</td" or no ">" after it.
  put fref into sindex
  repeat
    get offset("<td",sindex)
    if it is not "0" then
      put it into d
      put offset ("</td",sindex) into f
      if f > d then
        put offset (">",sindex,d) into v
        if v is not "0" then
          add v to d
          if length(char d to f-1 of sindex) > 100
          then if "<" is not in char d to f-1 of sindex
          then put " " & char d to f-1 of sindex after indexs
          delete char 1 to f of sindex
        else exit repeat
      else exit repeat
    else exit repeat
  end repeat

  -- Flatten line feeds and turn stray ">" characters into " -- ".
  replace numtochar(10) with " " in indexs
  replace " > " with " -- " in indexs
  replace " >" with " -- " in indexs
  replace "> " with " -- " in indexs

  -- Store the index record in file "RefMerl": the existing content is
  -- read first, the new record is added after it, and the whole text is
  -- written back from position 0.
  open file "RefMerl" for update
  read from file "RefMerl" until eof
  put it & cr & "#MTDA#" & Lurl & "#MTDB#" & cr & indexs & cr & "#MTFE#" & \
      cr into majml
  write majml to file "RefMerl" at 0
  close file "RefMerl"

  -- Harvest links into Rref, one per line.
  if "<frameset" is in fref then
    -- Frameset page: collect the SRC attributes found between <frameset
    -- and </frameset (or the end of the page) and relabel them as HREF
    -- so the rest of the code handles one format only.
    put offset("<frameset",fref) into a
    put offset("</frameset",fref) into b
    if b is "0"
    then put length(fref) into b
    put char a to b of fref into fref
    repeat for each line l in fref
      if l contains "SRC" then
        repeat
          get offset("SRC",l)
          if it is not "0" then
            put offset(">",l) into fin
            if fin is not "0" then
              put char it to fin-1 of l & cr after Rref
              delete char 1 to fin of l
            else exit repeat
          else exit repeat
        end repeat
      end if
    end repeat
    replace "SRC" with "HREF" in Rref
  else if fref contains "HREF" then
    -- Ordinary page: ignore everything before <BODY, then collect every
    -- HREF up to the next ">". In-page anchors (HREF=# or HREF="#") are
    -- discarded as soon as they are collected.
    get offset ("<BODY",fref)
    if it is not "0"
    then delete char 1 to it of fref
    repeat for each line l in fref
      if l contains "HREF" then
        repeat
          get offset("HREF",l)
          if it is not "0" then
            put offset(">",l) into fin
            if fin is not "0" then
              put char it to fin-1 of l & cr after Rref
              delete char 1 to fin of l
              if last line of Rref contains "HREF=#"
              then delete last line of Rref
              else if last line of Rref contains "HREF=" & quote & "#" & quote then
                delete last line of Rref
              end if
            else exit repeat
          else exit repeat
        end repeat
      end if
    end repeat
  end if

  -- Remove blank lines from the link list.
  repeat with c = the number of lines of Rref down to 1
    if line c of Rref is empty
    then delete line c of Rref
  end repeat

  -- Make the links absolute unless the list is empty or looks like an
  -- error page ("404" / "Not Found" appears in it).
  if Rref is not empty
  then if "404" is not in Rref
  then if "Not Found" is not in Rref
  then mrl2

  if length(Rref) is "0" and "*" is not in fld "HBuffy" then
    -- Nothing found and no page has been marked as visited yet:
    -- report the failure for the start URL.
    put quote & Nref & quote & cr & cr & "Erreur : machine.domaine introuvable" & \
        cr & "ou données dynamiques inaccessibles" & cr & "en" && quote & "GET" & \
        quote & "..." into fld "Buffy"
  else if "404" is in Rref or "Not Found" is in Rref then
    if "*" is not in fld "HBuffy" then
      put "url absente de l'arborescence du" & cr & "site (" & Nref & ") ..." into fld "Buffy"
    end if
  else
    -- Keep only distinct, non-mailto links that contain the start URL.
    sort lines of Rref
    repeat with l = the num of lines in Rref down to 1
      if line l of Rref is empty
      then delete line l of Rref
      else if line l of Rref contains "mailto"
      then delete line l of Rref
      else if line l of Rref = line l-1 of Rref
      then delete line l of Rref
      else if Lurl is not in line l of Rref
      then delete line l of Rref
    end repeat

    -- Keep only the text before the first space of each line (drops any
    -- further tag attributes that followed the URL).
    set itemdelimiter to " "
    repeat with c = the number of lines of Rref down to 1
      put item 1 of line c of Rref into line c of Rref
    end repeat

    -- First page: start both the result list and the work list.
    -- Later pages: append the new links to the result list.
    if fld "HBuffy" is empty then
      put Lurl & cr & urldecode(Rref) into fld "Buffy"
      put urldecode(Rref) into fld "HBuffy"
      show fld "Buffy"
      put the number of lines of fld "Buffy" && "=>"
    else put cr & urldecode(Rref) after fld "Buffy"

    -- Continue with the next unvisited URL (mrl3 calls mrl1 again).
    mrl3
  end if
end mrl1
-- mrl2: rewrite the raw "HREF=..." lines held in the global Rref as
-- absolute URLs.
--
-- Reads : globals Lurl (start URL) and Nref (URL of the page just fetched).
-- Writes: global Nref (trailing empty item / file name removed) and
--         global Rref (quotes stripped, unwanted lines deleted, the
--         "HREF=" prefix replaced by a base URL or removed).
on mrl2
  global Lurl,Nref,Rref
  set the cursor to watch
  set itemdelimiter to "/"

  -- Reduce Nref to its directory part: drop an empty last item, then a
  -- last item that contains a "." (treated as a file name).
  if last item of Nref is empty then
    delete last item of Nref
  end if
  if "." is in last item of Nref then
    delete last item of Nref
  end if

  -- Build the base that relative links are attached to.
  if "/" is not in Nref then
    put Nref & "//" into tBase
  else if "//" is not in Nref then
    put Nref & "/" into tBase
  else
    put Nref into tBase
  end if
  if item 3 of Lurl is not in tBase then
    put item 3 of Lurl after tBase
  end if

  replace quote with "" in Rref

  -- Walk the list bottom-up so deleting a line does not shift the lines
  -- that are still to be examined.
  repeat with tRow = the number of lines of Rref down to 1
    if "?" is in line tRow of Rref then
      -- links carrying a query string are dropped
      delete line tRow of Rref
    else if "/" is last char of line tRow of Rref then
      -- links ending with a slash are dropped
      delete line tRow of Rref
    else if "http://" is not in line tRow of Rref then
      -- relative link: prefix it with the base, adding the separating
      -- slash only when the link does not start with one
      if "HREF=/" is in line tRow of Rref then
        replace "HREF=" with tBase in line tRow of Rref
      else
        replace "HREF=" with tBase & "/" in line tRow of Rref
      end if
    else
      -- already absolute: just remove the attribute name
      replace "HREF=" with "" in line tRow of Rref
    end if

    -- Collapse one "//" found after the first 8 chars (i.e. past the
    -- scheme's own "//") by removing its first slash.
    put offset("//",line tRow of Rref,"8") into tHit
    if tHit is not "0" then
      put "" into char (tHit+8) of line tRow of Rref
    end if
  end repeat
end mrl2
-- mrl3: drive the crawl using fld "HBuffy" as the work list.
--
-- Each line of fld "HBuffy" is a URL; a trailing " *" marks a URL that has
-- already been handed to mrl1.
--   * While the last line is still unmarked, the first unmarked URL is
--     copied to fld "Scan", marked, and mrl1 is called to scan it
--     (mrl1 calls mrl3 again when it has finished).
--   * Once the last line is marked, the marks are removed, any URL not yet
--     listed in fld "Buffy" is appended to it, fld "Buffy" is sorted and
--     stripped of blank and duplicate lines, fld "HBuffy" is cleared, and
--     a summary is put into the message box.
on mrl3
  set the cursor to watch
  if last char of fld "HBuffy" is cr
  then delete last char of fld "HBuffy"
  if last char of last line of fld "HBuffy" is not "*" then
    -- Scan the first URL that has not been marked yet.
    repeat with l = 1 to the num of lines in fld "HBuffy"
      if "*" is not last char of line l of fld "HBuffy" then
        put line l of fld "HBuffy" into fld "Scan"
        put " *" after line l of fld "HBuffy"
        mrl1
        exit repeat
      end if
    end repeat
  else
    -- Every URL has been visited: remove the two-character " *" mark
    -- from each line.
    repeat with l = 1 to the num of lines in fld "HBuffy"
      delete last char of line l of fld "HBuffy"
      delete last char of line l of fld "HBuffy"
    end repeat
    -- Add to the result list every URL that does not already appear in it
    -- as a line (checked with a return after it and with a return before it).
    repeat with l = the num of lines in fld "HBuffy" down to 1
      if line l of fld "HBuffy" & cr is not in fld "Buffy" and \
          cr & line l of fld "HBuffy" is not in fld "Buffy"
      then put cr & line l of fld "HBuffy" after fld "Buffy"
    end repeat
    -- Sort, then drop blank lines and adjacent duplicates.
    sort lines of fld "Buffy"
    repeat with l = the num of lines in fld "Buffy" down to 1
      if line l of fld "Buffy" is empty
      then delete line l of fld "Buffy"
    end repeat
    repeat with l = the num of lines in fld "Buffy" down to 2
      if line l of fld "Buffy" = line l-1 of fld "Buffy"
      then delete line l of fld "Buffy"
    end repeat
    put empty into fld "HBuffy"
    -- Message box: first word already shown there, "<=>", final line count.
    put word 1 of msg && "<=>" && the number of lines of fld "Buffy"
  end if
end mrl3
Regards, Pierre Sahores
CRDP de l'academie de Creteil
WEB, DB, ASP, VPN, B2B design
Archives: http://www.mail-archive.com/metacard%40lists.best.com/
Info: http://www.xworlds.com/metacard/mailinglist.htm
Please send bug reports to <[EMAIL PROTECTED]>, not this list.