Hi Alex,

> So please take your time, and send it to me when ready.

here is the XML parser:

# expects well formed XML
# encoding by picolisp (utf8 "only", no utf16 etc.)
# trim whitespace except in cdata
# ignore <? <!-- <!DOCTYPE
# non-builtin entities as normal text: &ent; => ent
#
# (_xml ['In ['Char]]) -> lst | sym | NIL
# Reads one XML item from the current input channel.
#   In   - non-NIL when reading the content of an enclosing element.  A
#          processing instruction, comment or DOCTYPE is then skipped and
#          NIL is returned; with In = NIL the next item is parsed instead.
#   Char - non-NIL when the caller has already consumed the opening "<".
# Result:
#   element  -> (Tag Attrs Child ..), Attrs being a list of (Name . Value)
#               pairs; an empty element without attributes is just (Tag)
#   CDATA    -> the section's text as a single string, untrimmed
# Malformed input is reported with 'quit'.
# Attribute values and text go through 'xmlEsc', which is not defined here.
(de _xml (In Char)
   (unless Char
      (skip)
      (unless (= "<" (char))
         (quit "Bad XML") ) )
   (case (peek)
      ("?"
         (from "?>")
         (unless In (_xml In)) )
      ("!"
         (char)
         (case (peek)
            ("-"
               (ifn (= '`(chop "--") (list (char) (char)))
                  (quit "XML comment expected")
                  (from "-->")
                  (unless In (_xml In)) ) )
            ("D"
               (ifn
                  (=
                     '`(chop "DOCTYPE")
                     (list (char) (char) (char) (char) (char) (char) (char)) )
                  (quit "XML DOCTYPE expected")
                  # Skip an optional internal subset "[ .. ]".  Quoted
                  # strings and comments inside it may contain "]".
                  (when (= "[" (from "[" ">"))
                     (use X
                        (loop
                           # 'from' yields NIL at end of input: stop instead
                           # of looping forever on a truncated document
                           (NIL (setq X (from "]" "\"" "'" "<!--"))
                              (quit "Unbalanced XML DOCTYPE") )
                           (T (= "]" X))
                           (case X
                              ("\"" (from "\""))
                              ("'" (from "'"))
                              ("<!--" (from "-->")) ) ) )
                     (from ">") )
                  (unless In (_xml In)) ) )
            ("["
               (ifn
                  (=
                     '`(chop "[CDATA[")
                     (list (char) (char) (char) (char) (char) (char) (char)) )
                  (quit "XML CDATA expected")
                  # Collect characters in-process until the list ends with
                  # "]]>", then drop that terminator.  No child process is
                  # involved, so the input position cannot get out of sync.
                  (pack
                     (head -3
                        (make
                           (loop
                              (NIL (link (char)) (quit "Unbalanced XML CDATA"))
                              (T (= '`(chop "]]>") (tail 3 (made)))) ) ) ) ) ) )
            (T (quit "Unhandled XML tag")) ) )
      (T
         (let Tok (till " ^I^M^J/>" T)
            (use X
               (make
                  (link (intern (pack Tok)))
                  (let L
                     (make
                        # Attributes: name = quoted value, until "/" or ">"
                        (loop
                           (NIL (skip) (quit "Unexpected end of XML" Tok))
                           (T (member @ '("/" ">")))
                           (NIL (setq X (intern (pack (trim (till "="))))))
                           (char)
                           (skip)
                           (let C (char)
                              (unless (member C '("\"" "'"))
                                 (quit "XML attribute quote expected" X) )
                              (link (cons X (pack (xmlEsc (till C))))) )
                           (char) ) )
                     (if (= "/" (char))
                        # Empty element "<tag ../>": consume the ">"
                        (prog (char) (and L (link L)))
                        (link L)
                        # Content: child items and text up to "</Tok>"
                        (loop
                           (NIL (skip) (quit "Unexpected end of XML" Tok))
                           (T (and (= "<" (setq X (char))) (= "/" (peek)))
                              (char)
                              (unless (= Tok (till " ^I^M^J/>" T))
                                 (quit "Unbalanced XML" Tok) )
                              (skip)
                              (char) )
                           (if (= "<" X)
                              # Nested item; NIL (skipped comment etc.) is
                              # not added to the result
                              (when (_xml T "<")
                                 (link @) )
                              (link
                                 (pack
                                    (xmlEsc
                                       (trim (cons X (till "^M^J<"))) ) ) ) ) ) ) ) ) ) ) ) ) )

_xml should be called from the xml function as (_xml), just like the
previous _xml2 code.

I found a few XML files where the above code fails but this is due to:

1) The UTF-8 byte order mark is not understood.  The UTF-8 representation
   of the BOM is the byte sequence EF BB BF.  I am not sure how others
   handle this, but it probably should not be part of the parser.

2) Some "unusual" UTF-8 characters.  I am not sure why I get these
   failures.  However, this is quite a minor problem and might be a bug
   in the picolisp UTF-8 handling code?

   Example file:

00000000: 3c64 6f63 3ef0 9080 80f4 8fbf bd3c 2f64  <doc>........</d
00000010: 6f63 3e                                  oc>

   Any ideas?

Also, I think that a better way of checking whether a file is an XML
file is:

# (xml! 'F) -> flg
# Runs "xmlstarlet val -w -q" on file F with all output discarded and
# returns the result of 'call', i.e. T when the command exits successfully.
# F is handed to the shell as positional parameter $1 instead of being
# pasted into the command text, so names containing spaces or shell
# metacharacters reach xmlstarlet unchanged and cannot inject commands.
(de xml! (F)
   (call "sh" "-c"
      "xmlstarlet val -w -q \"$1\" 1>/dev/null 2>/dev/null"
      "sh"
      F ) )

or something like that, using a specialized validation program.

Another check might be an encoding check like:

# Read the output of the command "enca -L none F" and yield T only if none
# of the markers "UCS-2", "Unrecognized" or "non-text" occurs in it.
# NOTE(review): F is a free variable here, presumably the name of the file
# to check -- it must be bound by the surrounding code.
(in (list "enca" "-L" "none" F)
   (not (from "UCS-2" "Unrecognized" "non-text")))

I hope people will find this useful.

Thanks,

Tomas
-- 
UNSUBSCRIBE: mailto:[EMAIL PROTECTED]

Reply via email to