I'm using a custom 'xmlCharEncodingOutputFunc' named  
'xmlEscapeMinimalEntities', which is intended to only escape the  
minimum required characters, thus leaving the majority of UTF  
characters unescaped and human-readable.

I pass it to xmlSaveSetEscape(), and am thus able to save files with node
contents that are human-readable.  I also call xmlSaveSetAttrEscape()
at the same time; however, I've recently realized that attribute values
are still being escaped.  Apparently the encoding output function
isn't being used for attributes.  Sample code is provided below.

1. Is there an easier/better way to do this? :(
It's a real disappointment that libxml output of math, scientific  
symbols, foreign language, etc. becomes unreadable because it's a mess  
of escaped numeric values.  XML has a character set definition for a  
reason, I want to use it!
Also, I only know how to do this via xmlSaveToBuffer(), which provides
the xmlSaveCtxtPtr to then make the *SetEscape() calls on.  And worse,
versions earlier than 2.6.23 don't even have xmlSaveToBuffer
implemented, which means this solution isn't even portable (Mac OS X
10.5 *still* ships with 2.6.16 :()

2. Why isn't xmlSaveSetAttrEscape() working?
BTW, I realize I'll need to make a different xmlCharEncodingOutputFunc to
use with attributes, in order to escape quote characters... I wonder,
do I no longer need to escape < and > within attribute values?  (you
can't nest tags there, so are they still special?)

Thanks,
   -Ethan

----- Sample output: -----
<?xml version="1.0"?>
<testing>
   <contentTest>degree ° theta θ less &lt; quote "</contentTest>
   <attrTest prop="degree &#xB0; theta &#x3B8; less &lt; quote &quot;"/>
</testing>

----- Sample code: -----
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <string>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlsave.h>
#include <libxml/xmlversion.h>
#include <libxml/tree.h>

//! if true, the saved document will use automatic indenting and formatting
//! (saveFile() passes this on to libxml2 as the "format" option)
bool autoFormat=true;

// Forward declarations for the functions defined further down.

//! escaping callback: turns '<', '>' and '&' into entities, copies the rest
int xmlEscapeMinimalEntities(unsigned char* out, int *outlen, const xmlChar* in, int *inlen);

//! fills in the contents of the given (root) node with the sample data
void saveXML(xmlNode* node);

//! builds the sample document and writes it to filename; returns 0 on failure
unsigned int saveFile(const char* filename);

//! Writes the sample document to "testout.xml".
/*! @return 0 on success, 1 if saveFile() reported a failure (it returns 0 in that case) */
int main() {
    xmlInitParser();
    const unsigned int written = saveFile("testout.xml");
    xmlCleanupParser();
    // Surface a failed save through the exit status instead of always exiting 0.
    return (written == 0) ? 1 : 0;
}

//! Populates @a node with the sample data used to compare content and attribute escaping.
/*! The node is renamed to "testing" and receives two children: "contentTest",
 *  carrying the sample text as element content, and "attrTest", carrying the
 *  same text as the value of its "prop" attribute.
 *  @param node element to fill in; it becomes the document root in saveFile() */
void saveXML(xmlNode* node) {
    // libxml2 expects UTF-8, so the non-ASCII characters are spelled as explicit
    // UTF-8 byte escapes rather than relying on the encoding of this source file:
    //   \xC2\xB0 = U+00B0 (degree sign), \xCE\xB8 = U+03B8 (Greek small theta)
    static const char* const sampleText = "degree \xC2\xB0 theta \xCE\xB8 less < quote \"";

    xmlNodeSetName(node,(const xmlChar*)"testing");
    xmlNewChild(node,NULL,(const xmlChar*)"contentTest",(const xmlChar*)sampleText);
    xmlNode * attrTest = xmlNewChild(node,NULL,(const xmlChar*)"attrTest",NULL);
    // xmlNewChild returns NULL on failure; don't hand that to xmlSetProp
    if(attrTest!=NULL)
        xmlSetProp(attrTest,(const xmlChar*)"prop",(const xmlChar*)sampleText);
}

unsigned int saveFile(const char* filename) {
    xmlDoc* xmldocument=NULL;
    xmldocument=xmlNewDoc((const xmlChar*)"1.0");
    xmlNode* cur=xmlNewNode(NULL,(const xmlChar*)"");
    saveXML(cur);
    xmlDocSetRootElement(xmldocument,cur);
#if LIBXML_VERSION < 20623
    // versions prior to 2.6.23 don't have saveToBuffer implemented!
    // could use xmlSaveToFilename and fake the return size, but I'd  
rather be correct and
    // give up on un-escaping fancy unicode characters
    int size=xmlSaveFormatFile (filename, xmldocument, autoFormat);
    if(size==-1)
       cerr << "Error: XMLLoadSave::saveFile: xmlSaveFormatFile(\"" <<  
filename << "\",...) returned -1" << endl;
    return size==-1?0:size;
#else
    FILE* f = fopen(filename,"w");
    if(f==NULL) {
       std::cerr << "*** WARNING XMLLoadSave::saveFile: could not open  
file for saving \"" << filename << "\"" << std::endl;
       return 0;
    }
    // xmlSaveDoc doesn't properly return written size, so use buffers  
instead of xmlSaveToFilename:
    xmlBufferPtr xmlbuf = xmlBufferCreate();
    xmlSaveCtxtPtr ctxt = xmlSaveToBuffer(xmlbuf, NULL, (autoFormat ?  
XML_SAVE_FORMAT : 0));
    //xmlSaveCtxtPtr ctxt = xmlSaveToFilename(filename, NULL,  
(autoFormat ? XML_SAVE_FORMAT : 0));
    xmlSaveSetEscape(ctxt,xmlEscapeMinimalEntities);
    xmlSaveSetAttrEscape(ctxt,xmlEscapeMinimalEntities);
    size_t size = xmlSaveDoc(ctxt,xmldocument);
    xmlSaveClose(ctxt);
    ctxt=NULL;
    if(size==(size_t)-1) {
       std::cerr << "Error: XMLLoadSave::saveFile: xmlSaveDoc(\"" <<  
filename << "\",...) returned -1" << std::endl;
       fclose(f);
       return 0;
    }
    size=xmlBufferLength(xmlbuf);
    size_t wrote=fwrite(xmlBufferContent(xmlbuf), 
1,xmlBufferLength(xmlbuf),f);
    if(wrote!=size)
       std::cerr << "*** WARNING XMLLoadSave::saveFile: short write  
(wrote " << wrote << ", expected " << size << ")" << std::endl;
    int err=fclose(f);
    if(err!=0) {
       std::cerr << "*** WARNING XMLLoadSave::saveFile: error '" <<  
strerror(errno) << "' while closing " << filename << std::endl;
       return 0;
    }
    xmlBufferFree(xmlbuf);
    xmlbuf=NULL;
    return size;
#endif
}

int xmlEscapeMinimalEntities(unsigned char* out, int *outlen, const  
xmlChar* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend;
    int val;

    inend = in + (*inlen);

    while ((in < inend) && (out < outend)) {
       if (*in == '<') {
          if (outend - out < 4) break;
          *out++ = '&';
          *out++ = 'l';
          *out++ = 't';
          *out++ = ';';
          in++;
          continue;
       } else if (*in == '>') {
          if (outend - out < 4) break;
          *out++ = '&';
          *out++ = 'g';
          *out++ = 't';
          *out++ = ';';
          in++;
          continue;
       } else if (*in == '&') {
          if (outend - out < 5) break;
          *out++ = '&';
          *out++ = 'a';
          *out++ = 'm';
          *out++ = 'p';
          *out++ = ';';
          in++;
          continue;
       } else if (((*in >= 0x20) && (*in < 0x80)) ||
                (*in == '\n') || (*in == '\t')) {
          /*
           * default case, just copy !
           */
          *out++ = *in++;
          continue;
       } else if (*in >= 0x80) {
          /*
           * We assume we have UTF-8 input.
           */
          if (outend - out < 10) break;

          if (*in < 0xC0) {
             std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities  
encountered non-UTF8 data: " << *in << std::endl;
             in++;
             goto error;
          } else if (*in < 0xE0) {
             if (inend - in < 2) break;
             val = (in[0]) & 0x1F;
             val <<= 6;
             val |= (in[1]) & 0x3F;
             *out++ = *in++;
             *out++ = *in++;
          } else if (*in < 0xF0) {
             if (inend - in < 3) break;
             val = (in[0]) & 0x0F;
             val <<= 6;
             val |= (in[1]) & 0x3F;
             val <<= 6;
             val |= (in[2]) & 0x3F;
             *out++ = *in++;
             *out++ = *in++;
             *out++ = *in++;
          } else if (*in < 0xF8) {
             if (inend - in < 4) break;
             val = (in[0]) & 0x07;
             val <<= 6;
             val |= (in[1]) & 0x3F;
             val <<= 6;
             val |= (in[2]) & 0x3F;
             val <<= 6;
             val |= (in[3]) & 0x3F;
             *out++ = *in++;
             *out++ = *in++;
             *out++ = *in++;
             *out++ = *in++;
          } else {
             std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities  
encountered invalid UTF8 data " << *in << std::endl;
             in++;
             goto error;
          }
          if (!IS_CHAR(val)) {
             std::cerr << "XMLLoadSave::xmlEscapeMinimalEntities  
encountered unknown UTF8 data " << *in << std::endl;
             goto error;
          }

       } else if (IS_BYTE_CHAR(*in)) {
          if (outend - out < 6) break;
          *out++ = *in++;
       } else {
          xmlGenericError(xmlGenericErrorContext,"xmlEscapeEntities :  
char out of range\n");
          in++;
          goto error;
       }
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
error:
    *outlen = out - outstart;
    *inlen = in - base;
    return(-1);
}

_______________________________________________
xml mailing list, project page  http://xmlsoft.org/
[email protected]
http://mail.gnome.org/mailman/listinfo/xml

Reply via email to