Sivakatirswami wrote:
> 
> I would like to make a small spider that would crawl over a section of our
> web site, check for the existence of certain files, if true then load and
> parse certain other html files and do other "stuff."
> 
> so, is it possible to load a directory and not just a file?
> 
> i.e. get a line-delimited list as the result of some URL command instead of
> a file?
> 
> Hinduism Today
> 
> Sivakatirswami
> Editor's Assistant/Production Manager
> www.HinduismToday.com
> [EMAIL PROTECTED]


Hi Sivakatirswami,

I'm working on something like that, though it is not really finished yet.
See if the script below can help.


-- mrl1: fetch one page and harvest the links it contains.
--
-- Reads the start URL from fld "Scan" (falling back to a default URL),
-- downloads the page, builds an index text from the KEYWORDS meta tag and
-- from long plain-text table cells, appends that index to the file
-- "RefMerl", then collects the SRC (frameset) or HREF fragments of the page
-- into the global Rref.  mrl2 turns those fragments into URLs and mrl3
-- schedules the next page to scan.
--
-- Globals:  Lurl  the URL being scanned
--           Nref  working copy of that URL (also read by mrl2)
--           Rref  line-delimited list of links found on the page
-- Fields:   "Scan" (input), "Buffy" (result / error messages),
--           "HBuffy" (queue of pages still to visit; handled lines end in " *")
--
-- Statements that had been split by e-mail line wrapping (some of them in
-- the middle of a string literal) are rejoined here so the handler compiles.
on mrl1
  global Lurl,Rref,Nref
  set the cursor to watch
  -- Pick the URL to scan.
  if fld "Scan" is not empty
  then put fld "Scan" into Lurl
  else put "http://sam.w-max.com" into Lurl
  put empty into Nref
  put empty into Rref
  put Lurl into Nref
  set itemdelimiter to "/"
  -- When the last path item has no "." in it and the URL does not already
  -- end with "/", append a "/".
  if "." is not in last item of Nref
  then if last char of Nref is not "/"
  then put "/" after Nref
  put Nref into Lurl
  -- Download the page.
  put url Nref into fref

  -- Look for the KEYWORDS meta tag, first with a quoted attribute value and
  -- then unquoted; keep the text up to the next ">" in indexs.
  put empty into indexs
  put "1" into d
  put "META NAME=" & quote & "KEYWORDS" into i
  get offset(i,fref,d)
  if it is not "0" then
    put it+length(i)-1 into d
    get offset (">",fref,d)
    if it is not "0" then
      put char d to it+d-1 of fref into indexs
      put it+d into d
    end if
  else
    put "META NAME=KEYWORDS" into i
    get offset (i,fref,d)
    if it is not "0" then
      put it+length(i)-1 into d
      get offset (">",fref,d)
      if it is not "0" then
        put char d to it+d-1 of fref into indexs
        put it+d into d
      end if
    end if
  end if
  -- Drop everything up to and including each "=" left in indexs.
  repeat
    get offset("=",indexs)
    if it is not "0"
    then put empty into char 1 to it of indexs
    else exit repeat
  end repeat
  -- Remove every double quote from indexs.
  repeat
    get offset(quote,indexs)
    if it is not "0"
    then put empty into char it of indexs
    else exit repeat
  end repeat

  -- Walk the "<td" ... "</td" cells of the page.  A cell whose extracted text
  -- is longer than 100 chars and contains no "<" is appended to indexs.  The
  -- scanned part is deleted from sindex on every pass.
  put fref into sindex
  repeat
    get offset("<td",sindex)
    if it is not "0" then
      put it into d
      put offset ("</td",sindex) into f
      if f > d then
        put offset (">",sindex,d) into v
        if v is not "0" then
          add v to d
          if length(char d to f-1 of sindex) > 100
          then if "<" is not in char d to f-1 of sindex
          then put " " & char d to f-1 of sindex after indexs
          delete char 1 to f of sindex
        else exit repeat
      else exit repeat
    else exit repeat
  end repeat

  -- Flatten linefeeds and turn ">" next to a space into " -- ".
  replace numtochar(10) with " " in indexs
  replace " > " with " -- " in indexs
  replace " >" with " -- " in indexs
  replace "> " with " -- " in indexs

  -- Append a record for this page (URL between #MTDA# / #MTDB#, index text,
  -- then #MTFE#) to the existing content of the file "RefMerl".
  open file "RefMerl" for update
  read from file "RefMerl" until eof
  put it & cr & "#MTDA#" & Lurl & "#MTDB#" & cr & indexs & cr & "#MTFE#" \
      & cr into majml
  write majml to file "RefMerl" at 0
  close file "RefMerl"

  -- Collect link fragments into Rref, one per line.
  if "<frameset" is in fref then
    -- Frameset page: keep only the frameset part and collect each "SRC..."
    -- fragment up to the next ">"; they are renamed to HREF afterwards so
    -- that mrl2 can treat both kinds the same way.
    put offset("<frameset",fref) into a
    put offset("</frameset",fref) into b
    if b is "0"
    then put length(fref) into b
    put char a to b of fref into fref
    repeat for each line l in fref
      if l contains "SRC" then
        repeat
          get offset("SRC",l)
          if it is not "0" then
            put offset(">",l) into fin
            if fin is not "0" then
              put char it to fin-1 of l & cr after Rref
              delete char 1 to fin of l
            else exit repeat
          else exit repeat
        end repeat
      end if
    end repeat
    replace "SRC" with "HREF" in Rref
  else if fref contains "HREF" then
    -- Ordinary page: skip everything up to "<BODY", then collect each
    -- "HREF..." fragment up to the next ">".  Bare "#" anchors are dropped.
    get offset ("<BODY",fref)
    if it is not "0"
    then delete char 1 to it of fref
    repeat for each line l in fref
      if l contains "HREF" then
        repeat
          get offset("HREF",l)
          if it is not "0" then
            put offset(">",l) into fin
            if fin is not "0" then
              put char it to fin-1 of l & cr after Rref
              delete char 1 to fin of l
              if last line of Rref contains "HREF=#"
              then delete last line of Rref
              else if last line of Rref contains "HREF=" & quote & "#" & quote then
                delete last line of Rref
              end if
            else exit repeat
          else exit repeat
        end repeat
      end if
    end repeat
  end if
  -- Remove empty lines from the collected list.
  repeat with c = the number of lines of Rref down to 1
    if line c of Rref is empty
    then delete line c of Rref
  end repeat
  -- Only hand the list to mrl2 when something was found and it does not
  -- contain "404" or "Not Found".
  if Rref is not empty
  then if "404" is not in Rref
  then if "Not Found" is not in Rref
  then mrl2
  if length(Rref) is "0" and "*" is not in fld "HBuffy" then
    -- Nothing collected and no page marked as handled in "HBuffy" yet:
    -- report the failure for the start URL.
    put quote & Nref & quote & cr & cr & "Erreur : machine.domaine introuvable" & \
        cr & "ou données dynamiques inaccessibles" & cr & "en" && quote & "GET" & \
        quote & "..." into fld "Buffy"
  else if "404" is in Rref or "Not Found" is in Rref then
    if "*" is not in fld "HBuffy" then
      put "url absente de l'arborescence du" & cr & "site (" & Nref & ") ..." into fld "Buffy"
    end if
  else
    -- Keep only lines that are non-empty, are not mailto links, differ from
    -- the previous line after sorting, and contain the scanned URL.
    sort lines of Rref
    repeat with l = the num of lines in Rref down to 1
      if line l of Rref is empty
      then delete line l of Rref
      else if line l of Rref contains "mailto"
      then delete line l of Rref
      else if line l of Rref = line l-1 of Rref
      then delete line l of Rref
      else if Lurl is not in line l of Rref
      then delete line l of Rref
    end repeat
    -- Keep only the text before the first space of each line.
    set itemdelimiter to " "
    repeat with c = the number of lines of Rref down to 1
      put item 1 of line c of Rref into line c of Rref
    end repeat
    if fld "HBuffy" is empty then
      -- First page: start the result list and the queue of pages to visit,
      -- and show a running count in the message box.
      put Lurl & cr & urldecode(Rref) into fld "Buffy"
      put urldecode(Rref) into fld "HBuffy"
      show fld "Buffy"
      put the number of lines of fld "Buffy" && "=>"
    else put cr & urldecode(Rref) after fld "Buffy"
    -- Move on to the next queued page, or finish.
    mrl3
  end if
end mrl1


-- mrl2: turn the "HREF=..." fragments held in the global Rref into URLs.
--
-- Builds a base address (dom) from the global Nref, strips the double quotes
-- from Rref, then rewrites Rref line by line.  Called by mrl1.
-- Globals: Lurl and Nref are read (Nref is also trimmed here), Rref is
-- rewritten in place.
on mrl2
  global Lurl,Nref,Rref
  set the cursor to watch
  set itemdelimiter to "/"
  -- Trim Nref: drop a trailing empty item (URL ending in "/") and then a
  -- last item that contains "." (presumably a file name -- TODO confirm).
  if last item of Nref is empty
  then delete last item of Nref
  if "." is in last item of Nref
  then delete last item of Nref
  -- Build dom from Nref, adding "//" or "/" when Nref has none.
  if "/" is not in Nref
  then put Nref & "//" into dom
  else if "//" is not in Nref
  then put Nref & "/" into dom
  else put Nref into dom
  -- Item 3 of Lurl with "/" as delimiter; presumably the host name when Lurl
  -- starts with "http://" -- TODO confirm.
  if item 3 of Lurl is not in dom
  then put item 3 of Lurl after dom
  replace quote with "" in Rref
  -- For each line, last to first:
  --   * delete lines that contain "?" or end with "/";
  --   * in lines without "http://", replace "HREF=" by dom (when the link
  --     starts with "/") or by dom & "/";
  --   * otherwise (as the code is indented) just remove "HREF=";
  --   * finally look for "//" with 8 passed as offset's third argument and,
  --     when one is found, remove the char at position it+8.
  repeat with l = the num of lines in Rref down to 1
    if "?" is in line l of Rref
    then delete line l of Rref
    else if "/" is last char of line l of Rref
    then delete line l of Rref
    else if "http://" is not in line l of Rref then
      if "HREF=/" is in line l of Rref
      then replace "HREF=" with dom in line l of Rref
      else replace "HREF=" with dom & "/" in line l of Rref
    else replace "HREF=" with "" in line l of Rref
    get offset("//",line l of Rref,"8")
    if it is not "0"
    then put "" into char (it+8) of line l of Rref
  end repeat
end mrl2


-- mrl3: pick the next page to scan, or finish the crawl.
--
-- fld "HBuffy" holds the pages still to visit, one per line; a line that has
-- already been handed to mrl1 ends in " *".  While the last line is not
-- marked, the first unmarked line is copied to fld "Scan", marked, and mrl1
-- is called for it.  Once the last line is marked, the marks are removed,
-- the lines are merged into fld "Buffy", which is then sorted and stripped
-- of empty and duplicate lines, and "HBuffy" is emptied.
--
-- The merge condition had been split in two by e-mail line wrapping; it is
-- rejoined here with a "\" continuation so the handler compiles.
on mrl3
  set the cursor to watch
  if last char of fld "HBuffy" is cr
  then delete last char of fld "HBuffy"
  if last char of last line of fld "HBuffy" is not "*" then
    -- Scan the first page that has not been marked yet.
    repeat with l = 1 to the num of lines in fld "HBuffy"
      if "*" is not last char of line l of fld "HBuffy" then
        put line l of fld "HBuffy" into fld "Scan"
        put " *" after line l of fld "HBuffy"
        mrl1
        exit repeat
      end if
    end repeat
  else
    -- Remove the two-char " *" mark from every line.
    repeat with l = 1 to the num of lines in fld "HBuffy"
      delete last char of line l of fld "HBuffy"
      delete last char of line l of fld "HBuffy"
    end repeat
    -- Add to "Buffy" each visited page that is not listed there yet.
    repeat with l = the num of lines in fld "HBuffy" down to 1
      if line l of fld "HBuffy" & cr is not in fld "Buffy" \
          and cr & line l of fld "HBuffy" is not in fld "Buffy"
      then put cr & line l of fld "HBuffy" after fld "Buffy"
    end repeat
    -- Sort, then drop empty lines and adjacent duplicates.
    sort lines of fld "Buffy"
    repeat with l = the num of lines in fld "Buffy" down to 1
      if line l of fld "Buffy" is empty
      then delete line l of fld "Buffy"
    end repeat
    repeat with l = the num of lines in fld "Buffy" down to 2
      if line l of fld "Buffy" = line l-1 of fld "Buffy"
      then delete line l of fld "Buffy"
    end repeat
    put empty into fld "HBuffy"
    -- Message box: first word of its current content, "<=>", final count.
    put word 1 of msg && "<=>" && the number of lines of fld "Buffy"
  end if
end mrl3


Regards, Pierre Sahores

CRDP de l'academie de Creteil
WEB, DB, ASP, VPN, B2B design

Archives: http://www.mail-archive.com/metacard%40lists.best.com/
Info: http://www.xworlds.com/metacard/mailinglist.htm
Please send bug reports to <[EMAIL PROTECTED]>, not this list.

Reply via email to