[avro] branch master updated: AVRO-1256: C++ API compileJsonSchema ignores "doc" and custom attributes on a field/record (#345)

thiru Mon, 12 Nov 2018 04:57:51 -0800

This is an automated email from the ASF dual-hosted git repository.

thiru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git



The following commit(s) were added to refs/heads/master by this push:
     new 4a8e814   AVRO-1256: C++ API compileJsonSchema ignores "doc" and custom attributes on a field/record (#345)
4a8e814 is described below

commit 4a8e814327be1f18b203784778f8b17d7f9da8b0
Author: Aniket Mokashi <[email protected]>
AuthorDate: Mon Nov 12 04:57:39 2018 -0800

     AVRO-1256: C++ API compileJsonSchema ignores "doc" and custom attributes on a field/record (#345)
    
    * AVRO-1256. C++ API compileJsonSchema ignores doc and custom attributes on a field/record
    
    * minor code fixes
    
    * minor code fixes
    
    * more std::string fixes
    
    * Add escape_json function to print doc string
    
    * solve merge conflict
    
    * compactSchema changes
    
    * minor refactor
    
    * Fix tests
    
    * fix formatting
    
    * fix formatting
    
    * Fix nullptr and tests
---
 lang/c++/.gitignore            |   2 +
 lang/c++/api/Node.hh           |  10 +++-
 lang/c++/api/NodeImpl.hh       |  49 +++++++++++++++++-
 lang/c++/api/Schema.hh         |   8 ++-
 lang/c++/api/ValidSchema.hh    |   4 ++
 lang/c++/impl/Compiler.cc      | 114 +++++++++++++++++++++++++++++++----------
 lang/c++/impl/DataFile.cc      |   4 +-
 lang/c++/impl/NodeImpl.cc      |  31 +++++++++++
 lang/c++/impl/Schema.cc        |   9 ++++
 lang/c++/impl/ValidSchema.cc   |  78 +++++++++++++++++++++++++---
 lang/c++/jsonschemas/bigrecord |   2 +
 lang/c++/test/DataFileTests.cc |  65 ++++++++++++++++++++---
 lang/c++/test/SchemaTests.cc   | 103 ++++++++++++++++++++++++++-----------
 13 files changed, 400 insertions(+), 79 deletions(-)

diff --git a/lang/c++/.gitignore b/lang/c++/.gitignore
index 76f0125..4ac073b 100644
--- a/lang/c++/.gitignore
+++ b/lang/c++/.gitignore
@@ -3,3 +3,5 @@ build.mac/
 doc/
 test.avro
 test6.df
+test8.df
+test9.df
diff --git a/lang/c++/api/Node.hh b/lang/c++/api/Node.hh
index ebba375..4d54a5e 100644
--- a/lang/c++/api/Node.hh
+++ b/lang/c++/api/Node.hh
@@ -78,7 +78,7 @@ std::ostream& operator << (std::ostream& os, const Name& n) {
 /// objects.
 ///
 /// The Node object uses reference-counted pointers.  This is so that schemas
-/// may be reused in other other schemas, without needing to worry about memory
+/// may be reused in other schemas, without needing to worry about memory
 /// deallocation for nodes that are added to multiple schema parse trees.
 ///
 /// Node has minimal implementation, serving as an abstract base class for
@@ -117,6 +117,12 @@ class AVRO_DECL Node : private boost::noncopyable
     }
     virtual const Name &name() const = 0;
 
+    virtual const std::string &getDoc() const = 0;
+    void setDoc(const std::string &doc) {
+        checkLock();
+        doSetDoc(doc);
+    }
+
     void addLeaf(const NodePtr &newLeaf) {
         checkLock();
         doAddLeaf(newLeaf);
@@ -170,6 +176,8 @@ class AVRO_DECL Node : private boost::noncopyable
     }
 
     virtual void doSetName(const Name &name) = 0;
+    virtual void doSetDoc(const std::string &name) = 0;
+
     virtual void doAddLeaf(const NodePtr &newLeaf) = 0;
     virtual void doAddName(const std::string &name) = 0;
     virtual void doSetFixedSize(int size) = 0;
diff --git a/lang/c++/api/NodeImpl.hh b/lang/c++/api/NodeImpl.hh
index 0f32023..d4c7639 100644
--- a/lang/c++/api/NodeImpl.hh
+++ b/lang/c++/api/NodeImpl.hh
@@ -35,7 +35,7 @@
 namespace avro {
 
 /// Implementation details for Node.  NodeImpl represents all the avro types,
-/// whose properties are enabled are disabled by selecting concept classes.
+/// whose properties are enabled and disabled by selecting concept classes.
 
 template
 <
@@ -52,6 +52,7 @@ class NodeImpl : public Node
     NodeImpl(Type type) :
         Node(type),
         nameAttribute_(),
+        docAttribute_(),
         leafAttributes_(),
         leafNameAttributes_(),
         sizeAttribute_()
@@ -64,13 +65,30 @@ class NodeImpl : public Node
              const SizeConcept &size) :
         Node(type),
         nameAttribute_(name),
+        docAttribute_(),
         leafAttributes_(leaves),
         leafNameAttributes_(leafNames),
         sizeAttribute_(size)
     { }
 
+    // Ctor with "doc"
+    NodeImpl(Type type,
+             const NameConcept &name,
+             const concepts::SingleAttribute<std::string> &doc,
+             const LeavesConcept &leaves,
+             const LeafNamesConcept &leafNames,
+             const SizeConcept &size) :
+        Node(type),
+        nameAttribute_(name),
+        docAttribute_(doc),
+        leafAttributes_(leaves),
+        leafNameAttributes_(leafNames),
+        sizeAttribute_(size)
+    {}
+
     void swap(NodeImpl& impl) {
         std::swap(nameAttribute_, impl.nameAttribute_);
+        std::swap(docAttribute_, impl.docAttribute_);
         std::swap(leafAttributes_, impl.leafAttributes_);
         std::swap(leafNameAttributes_, impl.leafNameAttributes_);
         std::swap(sizeAttribute_, impl.sizeAttribute_);
@@ -78,6 +96,7 @@ class NodeImpl : public Node
     }
 
     bool hasName() const {
+        // e.g.: true for single and multiattributes, false for noattributes.
         return NameConcept::hasAttribute;
     }
 
@@ -89,6 +108,14 @@ class NodeImpl : public Node
         return nameAttribute_.get();
     }
 
+    void doSetDoc(const std::string &doc) {
+      docAttribute_.add(doc);
+    }
+
+    const std::string &getDoc() const {
+      return docAttribute_.get();
+    }
+
     void doAddLeaf(const NodePtr &newLeaf) {
         leafAttributes_.add(newLeaf);
     }
@@ -172,6 +199,10 @@ class NodeImpl : public Node
     }
 
     NameConcept nameAttribute_;
+
+    // Rem: NameConcept type is HasName (= SingleAttribute<Name>), we use std::string instead
+    concepts::SingleAttribute<std::string> docAttribute_; /** Doc used to 
compare schemas */
+
     LeavesConcept leafAttributes_;
     LeafNamesConcept leafNameAttributes_;
     SizeConcept sizeAttribute_;
@@ -181,6 +212,8 @@ class NodeImpl : public Node
 typedef concepts::NoAttribute<Name>     NoName;
 typedef concepts::SingleAttribute<Name> HasName;
 
+typedef concepts::SingleAttribute<std::string> HasDoc;
+
 typedef concepts::NoAttribute<NodePtr>      NoLeaves;
 typedef concepts::SingleAttribute<NodePtr>  SingleLeaf;
 typedef concepts::MultiAttribute<NodePtr>   MultiLeaves;
@@ -287,6 +320,20 @@ public:
         }
     }
 
+    NodeRecord(const HasName &name, const HasDoc &doc, const MultiLeaves 
&fields,
+        const LeafNames &fieldsNames,
+        const std::vector<GenericDatum> &dv) :
+        NodeImplRecord(AVRO_RECORD, name, doc, fields, fieldsNames, NoSize()),
+        defaultValues(dv) {
+        for (size_t i = 0; i < leafNameAttributes_.size(); ++i) {
+            if (!nameIndex_.add(leafNameAttributes_.get(i), i)) {
+                throw Exception(boost::format(
+                    "Cannot add duplicate name: %1%") %
+                    leafNameAttributes_.get(i));
+            }
+        }
+    }
+
     void swap(NodeRecord& r) {
         NodeImplRecord::swap(r);
         defaultValues.swap(r.defaultValues);
diff --git a/lang/c++/api/Schema.hh b/lang/c++/api/Schema.hh
index 8ce5f8d..646b95e 100644
--- a/lang/c++/api/Schema.hh
+++ b/lang/c++/api/Schema.hh
@@ -16,11 +16,12 @@
  * limitations under the License.
  */
 
-#ifndef avro_Schema_hh__ 
-#define avro_Schema_hh__ 
+#ifndef avro_Schema_hh__
+#define avro_Schema_hh__
 
 #include "Config.hh"
 #include "NodeImpl.hh"
+#include <string>
 
 /// \file
 ///
@@ -102,6 +103,9 @@ class AVRO_DECL RecordSchema : public Schema {
 public:
     RecordSchema(const std::string &name);
     void addField(const std::string &name, const Schema &fieldSchema);
+
+    std::string getDoc() const;
+    void setDoc(const std::string &);
 };
 
 class AVRO_DECL EnumSchema : public Schema {
diff --git a/lang/c++/api/ValidSchema.hh b/lang/c++/api/ValidSchema.hh
index 30eb33e..d0bbd4e 100644
--- a/lang/c++/api/ValidSchema.hh
+++ b/lang/c++/api/ValidSchema.hh
@@ -50,11 +50,15 @@ public:
     }
 
     void toJson(std::ostream &os) const;
+    std::string toJson(bool prettyPrint = true) const;
 
     void toFlatList(std::ostream &os) const;
 
   protected:
     NodePtr root_;
+
+  private:
+    static std::string compactSchema(const std::string &schema);
 };
 
 } // namespace avro
diff --git a/lang/c++/impl/Compiler.cc b/lang/c++/impl/Compiler.cc
index 725136d..bc0f3cd 100644
--- a/lang/c++/impl/Compiler.cc
+++ b/lang/c++/impl/Compiler.cc
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <boost/algorithm/string/replace.hpp>
 #include <sstream>
 
 #include "Compiler.hh"
@@ -42,7 +43,7 @@ typedef map<Name, NodePtr> SymbolTable;
 
 // #define DEBUG_VERBOSE
 
-static NodePtr makePrimitive(const std::string& t)
+static NodePtr makePrimitive(const string& t)
 {
     if (t == "null") {
         return NodePtr(new NodePrimitive(AVRO_NULL));
@@ -65,7 +66,7 @@ static NodePtr makePrimitive(const std::string& t)
     }
 }
 
-static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string& 
ns);
+static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string 
&ns);
 
 template <typename T>
 concepts::SingleAttribute<T> asSingleAttribute(const T& t)
@@ -75,17 +76,17 @@ concepts::SingleAttribute<T> asSingleAttribute(const T& t)
     return n;
 }
 
-static bool isFullName(const string& s)
+static bool isFullName(const string &s)
 {
     return s.find('.') != string::npos;
 }
 
-static Name getName(const string& name, const string& ns)
+static Name getName(const string &name, const string &ns)
 {
     return (isFullName(name)) ? Name(name) : Name(name, ns);
 }
 
-static NodePtr makeNode(const std::string& t, SymbolTable& st, const string& 
ns)
+static NodePtr makeNode(const string &t, SymbolTable &st, const string &ns)
 {
     NodePtr result = makePrimitive(t);
     if (result) {
@@ -100,8 +101,15 @@ static NodePtr makeNode(const std::string& t, SymbolTable& 
st, const string& ns)
     throw Exception(boost::format("Unknown type: %1%") % n.fullname());
 }
 
-const json::Object::const_iterator findField(const Entity& e,
-    const Object& m, const string& fieldName)
+/** Returns "true" if the field is in the container */
+// e.g.: can be false for non-mandatory fields
+bool containsField(const Object &m, const string &fieldName) {
+    Object::const_iterator it = m.find(fieldName);
+    return it != m.end();
+}
+
+const json::Object::const_iterator findField(const Entity &e,
+    const Object &m, const string &fieldName)
 {
     Object::const_iterator it = m.find(fieldName);
     if (it == m.end()) {
@@ -112,7 +120,7 @@ const json::Object::const_iterator findField(const Entity& 
e,
     }
 }
 
-template <typename T> void ensureType(const Entity& e, const string& name)
+template <typename T> void ensureType(const Entity &e, const string &name)
 {
     if (e.type() != json::type_traits<T>::type()) {
         throw Exception(boost::format("Json field \"%1%\" is not a %2%: %3%") %
@@ -120,8 +128,8 @@ template <typename T> void ensureType(const Entity& e, 
const string& name)
     }
 }
 
-const string& getStringField(const Entity& e, const Object& m,
-                             const string& fieldName)
+const string& getStringField(const Entity &e, const Object &m,
+                             const string &fieldName)
 {
     Object::const_iterator it = findField(e, m, fieldName);
     ensureType<string>(it->second, fieldName);
@@ -144,6 +152,19 @@ const int64_t getLongField(const Entity& e, const Object& 
m,
     return it->second.longValue();
 }
 
+// Unescape double quotes (") for de-serialization.  This method complements the
+// method NodeImpl::escape() which is used for serialization.
+static void unescape(string& s) {
+    boost::replace_all(s, "\\\"", "\"");
+}
+
+const string getDocField(const Entity& e, const Object& m)
+{
+    string doc = getStringField(e, m, "doc");
+    unescape(doc);
+    return doc;
+}
+
 struct Field {
     const string& name;
     const NodePtr schema;
@@ -162,7 +183,7 @@ static void assertType(const Entity& e, EntityType et)
     }
 }
 
-static vector<uint8_t> toBin(const std::string& s)
+static vector<uint8_t> toBin(const string& s)
 {
     vector<uint8_t> result(s.size());
     if (s.size() > 0) {
@@ -278,14 +299,18 @@ static Field makeField(const Entity& e, SymbolTable& st, 
const string& ns)
     Object::const_iterator it = findField(e, m, "type");
     map<string, Entity>::const_iterator it2 = m.find("default");
     NodePtr node = makeNode(it->second, st, ns);
+    if (containsField(m, "doc")) {
+        node->setDoc(getDocField(e, m));
+    }
     GenericDatum d = (it2 == m.end()) ? GenericDatum() :
         makeGenericDatum(node, it2->second, st);
     return Field(n, node, d);
 }
 
-static NodePtr makeRecordNode(const Entity& e,
-    const Name& name, const Object& m, SymbolTable& st, const string& ns)
-{
+// Extended makeRecordNode (with doc).
+static NodePtr makeRecordNode(const Entity& e, const Name& name,
+                              const string* doc, const Object& m,
+                              SymbolTable& st, const string& ns) {
     const Array& v = getArrayField(e, m, "fields");
     concepts::MultiAttribute<string> fieldNames;
     concepts::MultiAttribute<NodePtr> fieldValues;
@@ -297,8 +322,15 @@ static NodePtr makeRecordNode(const Entity& e,
         fieldValues.add(f.schema);
         defaultValues.push_back(f.defaultValue);
     }
-    return NodePtr(new NodeRecord(asSingleAttribute(name),
-        fieldValues, fieldNames, defaultValues));
+    NodeRecord* node;
+    if (doc == NULL) {
+        node = new NodeRecord(asSingleAttribute(name), fieldValues, fieldNames,
+                              defaultValues);
+    } else {
+        node = new NodeRecord(asSingleAttribute(name), asSingleAttribute(*doc),
+                              fieldValues, fieldNames, defaultValues);
+    }
+    return NodePtr(node);
 }
 
 static NodePtr makeEnumNode(const Entity& e,
@@ -313,7 +345,11 @@ static NodePtr makeEnumNode(const Entity& e,
         }
         symbols.add(it->stringValue());
     }
-    return NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
+    NodePtr node = NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
+    if (containsField(m, "doc")) {
+        node->setDoc(getDocField(e, m));
+    }
+    return node;
 }
 
 static NodePtr makeFixedNode(const Entity& e,
@@ -324,16 +360,24 @@ static NodePtr makeFixedNode(const Entity& e,
         throw Exception(boost::format("Size for fixed is not positive: %1%") %
             e.toString());
     }
-    return NodePtr(new NodeFixed(asSingleAttribute(name),
-        asSingleAttribute(v)));
+    NodePtr node =
+        NodePtr(new NodeFixed(asSingleAttribute(name), asSingleAttribute(v)));
+    if (containsField(m, "doc")) {
+        node->setDoc(getDocField(e, m));
+    }
+    return node;
 }
 
 static NodePtr makeArrayNode(const Entity& e, const Object& m,
     SymbolTable& st, const string& ns)
 {
     Object::const_iterator it = findField(e, m, "items");
-    return NodePtr(new NodeArray(asSingleAttribute(
-        makeNode(it->second, st, ns))));
+    NodePtr node = NodePtr(new NodeArray(
+        asSingleAttribute(makeNode(it->second, st, ns))));
+    if (containsField(m, "doc")) {
+        node->setDoc(getDocField(e, m));
+    }
+    return node;
 }
 
 static NodePtr makeMapNode(const Entity& e, const Object& m,
@@ -341,8 +385,12 @@ static NodePtr makeMapNode(const Entity& e, const Object& 
m,
 {
     Object::const_iterator it = findField(e, m, "values");
 
-    return NodePtr(new NodeMap(asSingleAttribute(
-        makeNode(it->second, st, ns))));
+    NodePtr node = NodePtr(new NodeMap(
+        asSingleAttribute(makeNode(it->second, st, ns))));
+    if (containsField(m, "doc")) {
+        node->setDoc(getDocField(e, m));
+    }
+    return node;
 }
 
 static Name getName(const Entity& e, const Object& m, const string& ns)
@@ -380,9 +428,19 @@ static NodePtr makeNode(const Entity& e, const Object& m,
         if (type == "record" || type == "error") {
             result = NodePtr(new NodeRecord());
             st[nm] = result;
-            NodePtr r = makeRecordNode(e, nm, m, st, nm.ns());
-            (boost::dynamic_pointer_cast<NodeRecord>(r))->swap(
-                *boost::dynamic_pointer_cast<NodeRecord>(result));
+            // Get field doc
+            if (containsField(m, "doc")) {
+                string doc = getDocField(e, m);
+
+                NodePtr r = makeRecordNode(e, nm, &doc, m, st, nm.ns());
+                (boost::dynamic_pointer_cast<NodeRecord>(r))->swap(
+                    *boost::dynamic_pointer_cast<NodeRecord>(result));
+            } else {  // No doc
+                NodePtr r =
+                    makeRecordNode(e, nm, NULL, m, st, nm.ns());
+                (boost::dynamic_pointer_cast<NodeRecord>(r))
+                    ->swap(*boost::dynamic_pointer_cast<NodeRecord>(result));
+            }
         } else {
             result = (type == "enum") ? makeEnumNode(e, nm, m) :
                 makeFixedNode(e, nm, m);
@@ -447,7 +505,7 @@ AVRO_DECL ValidSchema compileJsonSchemaFromString(const 
char* input)
         ::strlen(input));
 }
 
-AVRO_DECL ValidSchema compileJsonSchemaFromString(const std::string& input)
+AVRO_DECL ValidSchema compileJsonSchemaFromString(const string& input)
 {
     return compileJsonSchemaFromMemory(
         reinterpret_cast<const uint8_t*>(&input[0]), input.size());
@@ -468,7 +526,7 @@ AVRO_DECL void compileJsonSchema(std::istream &is, 
ValidSchema &schema)
     schema = compile(is);
 }
 
-AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, 
std::string &error)
+AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, string 
&error)
 {
     try {
         compileJsonSchema(is, schema);
diff --git a/lang/c++/impl/DataFile.cc b/lang/c++/impl/DataFile.cc
index a71860d..1777d53 100644
--- a/lang/c++/impl/DataFile.cc
+++ b/lang/c++/impl/DataFile.cc
@@ -121,7 +121,7 @@ void DataFileWriterBase::init(const ValidSchema &schema, 
size_t syncInterval, co
     } else {
       throw Exception(boost::format("Unknown codec: %1%") % codec);
     }
-    setMetadata(AVRO_SCHEMA_KEY, toString(schema));
+    setMetadata(AVRO_SCHEMA_KEY, schema.toJson(false));
 
     writeHeader();
     encoderPtr_->init(*buffer_);
@@ -296,7 +296,7 @@ void DataFileReaderBase::init()
 void DataFileReaderBase::init(const ValidSchema& readerSchema)
 {
     readerSchema_ = readerSchema;
-    dataDecoder_  = (toString(readerSchema_) != toString(dataSchema_)) ?
+    dataDecoder_  = (readerSchema_.toJson(true) != dataSchema_.toJson(true)) ?
         resolvingDecoder(dataSchema_, readerSchema_, binaryDecoder()) :
         binaryDecoder();
     readDataBlock();
diff --git a/lang/c++/impl/NodeImpl.cc b/lang/c++/impl/NodeImpl.cc
index 50b7fba..bdb05a0 100644
--- a/lang/c++/impl/NodeImpl.cc
+++ b/lang/c++/impl/NodeImpl.cc
@@ -17,6 +17,8 @@
  */
 
 
+#include <sstream>
+#include <iomanip>
 #include <boost/algorithm/string/replace.hpp>
 #include "NodeImpl.hh"
 
@@ -25,6 +27,7 @@ using std::string;
 namespace avro {
 
 namespace {
+
 // Escape string for serialization.
 string escape(const string &unescaped) {
   string s;
@@ -219,12 +222,20 @@ void
 NodePrimitive::printJson(std::ostream &os, int depth) const
 {
     os << '\"' << type() << '\"';
+    if (getDoc().size()) {
+        os << ",\n" << indent(depth) << "\"doc\": \""
+           << escape(getDoc()) << "\"";
+    }
 }
 
 void
 NodeSymbolic::printJson(std::ostream &os, int depth) const
 {
     os << '\"' << nameAttribute_.get() << '\"';
+    if (getDoc().size()) {
+        os << ",\n" << indent(depth) << "\"doc\": \""
+           << escape(getDoc()) << "\"";
+    }
 }
 
 static void printName(std::ostream& os, const Name& n, int depth)
@@ -241,6 +252,10 @@ NodeRecord::printJson(std::ostream &os, int depth) const
     os << "{\n";
     os << indent(++depth) << "\"type\": \"record\",\n";
     printName(os, nameAttribute_.get(), depth);
+    if (getDoc().size()) {
+        os << indent(depth) << "\"doc\": \""
+           << escape(getDoc()) << "\",\n";
+    }
     os << indent(depth) << "\"fields\": [";
 
     size_t fields = leafAttributes_.size();
@@ -430,6 +445,10 @@ NodeEnum::printJson(std::ostream &os, int depth) const
 {
     os << "{\n";
     os << indent(++depth) << "\"type\": \"enum\",\n";
+    if (getDoc().size()) {
+        os << indent(depth) << "\"doc\": \""
+           << escape(getDoc()) << "\",\n";
+    }
     printName(os, nameAttribute_.get(), depth);
     os << indent(depth) << "\"symbols\": [\n";
 
@@ -451,6 +470,10 @@ NodeArray::printJson(std::ostream &os, int depth) const
 {
     os << "{\n";
     os << indent(depth+1) << "\"type\": \"array\",\n";
+    if (getDoc().size()) {
+        os << indent(depth+1) << "\"doc\": \""
+           << escape(getDoc()) << "\",\n";
+    }
     os << indent(depth+1) <<  "\"items\": ";
     leafAttributes_.get()->printJson(os, depth+1);
     os << '\n';
@@ -462,6 +485,10 @@ NodeMap::printJson(std::ostream &os, int depth) const
 {
     os << "{\n";
     os << indent(depth+1) <<"\"type\": \"map\",\n";
+    if (getDoc().size()) {
+        os << indent(depth+1) << "\"doc\": \""
+           << escape(getDoc()) << "\",\n";
+    }
     os << indent(depth+1) << "\"values\": ";
     leafAttributes_.get(1)->printJson(os, depth+1);
     os << '\n';
@@ -490,6 +517,10 @@ NodeFixed::printJson(std::ostream &os, int depth) const
 {
     os << "{\n";
     os << indent(++depth) << "\"type\": \"fixed\",\n";
+    if (getDoc().size()) {
+        os << indent(depth) << "\"doc\": \""
+           << escape(getDoc()) << "\",\n";
+    }
     printName(os, nameAttribute_.get(), depth);
     os << indent(depth) << "\"size\": " << sizeAttribute_.get() << "\n";
     os << indent(--depth) << '}';
diff --git a/lang/c++/impl/Schema.cc b/lang/c++/impl/Schema.cc
index b5457ae..6676574 100644
--- a/lang/c++/impl/Schema.cc
+++ b/lang/c++/impl/Schema.cc
@@ -51,6 +51,15 @@ RecordSchema::addField(const std::string &name, const Schema 
&fieldSchema)
     node_->addLeaf(fieldSchema.root());
 }
 
+std::string RecordSchema::getDoc() const
+{
+    return node_->getDoc();
+}
+void RecordSchema::setDoc(const std::string& doc)
+{
+    node_->setDoc(doc);
+}
+
 EnumSchema::EnumSchema(const std::string &name) :
     Schema(new NodeEnum)
 {
diff --git a/lang/c++/impl/ValidSchema.cc b/lang/c++/impl/ValidSchema.cc
index bd28079..17ab994 100644
--- a/lang/c++/impl/ValidSchema.cc
+++ b/lang/c++/impl/ValidSchema.cc
@@ -24,16 +24,16 @@
 #include "Node.hh"
 
 using std::string;
+using std::ostringstream;
 using std::make_pair;
 using boost::format;
 using boost::shared_ptr;
 using boost::static_pointer_cast;
 
 namespace avro {
-
 typedef std::map<Name, NodePtr> SymbolMap;
 
-static bool validate(const NodePtr &node, SymbolMap &symbolMap) 
+static bool validate(const NodePtr &node, SymbolMap &symbolMap)
 {
     if (! node->isValid()) {
         throw Exception(format("Schema is invalid, due to bad node of type 
%1%")
@@ -77,7 +77,7 @@ static bool validate(const NodePtr &node, SymbolMap 
&symbolMap)
             // map (which could potentially create circular shared pointer
             // links that could not be easily freed), replace this node with a
             // symbolic link to the original one.
-            
+
             node->setLeafToSymbolic(i, symbolMap.find(leaf->name())->second);
         }
     }
@@ -101,7 +101,7 @@ ValidSchema::ValidSchema(const Schema &schema) : 
root_(schema.root())
     validate(root_);
 }
 
-ValidSchema::ValidSchema() : root_(NullSchema().root()) 
+ValidSchema::ValidSchema() : root_(NullSchema().root())
 {
     validate(root_);
 }
@@ -113,18 +113,80 @@ ValidSchema::setSchema(const Schema &schema)
     validate(root_);
 }
 
-void 
+void
 ValidSchema::toJson(std::ostream &os) const
-{ 
+{
     root_->printJson(os, 0);
     os << '\n';
 }
 
-void 
+string
+ValidSchema::toJson(bool prettyPrint) const
+{
+    ostringstream oss;
+    toJson(oss);
+    if (!prettyPrint) {
+        return compactSchema(oss.str());
+    }
+    return oss.str();
+}
+
+void
 ValidSchema::toFlatList(std::ostream &os) const
-{ 
+{
     root_->printBasicInfo(os);
 }
 
+/*
+ * compactSchema compacts and returns a formatted string representation
+ * of a ValidSchema object by removing the whitespaces outside of the quoted
+ * field names and values. It can handle the cases where the quoted value is
+ * in UTF-8 format. Note that this method is not responsible for validating
+ * the schema.
+ */
+string ValidSchema::compactSchema(const string& schema) {
+    bool insideQuote = false;
+    size_t newPos = 0;
+    string data(schema.data());
+
+    for (size_t currentPos = 0; currentPos < schema.size(); currentPos++) {
+        if (!insideQuote && std::isspace(data[currentPos])) {
+            // Skip the white spaces outside quotes.
+            continue;
+        }
+
+        if (data[currentPos] == '\"') {
+            // It is valid for a quote to be part of the value for some fields,
+            // e.g., the "doc" field.  In that case, the quote is expected to be
+            // escaped inside the schema.  Since the escape character '\\' could
+            // be escaped itself, we need to check whether there are an even
+            // number of consecutive slashes prior to the quote.
+            int leadingSlashes = 0;
+            for (int i = newPos - 1; i >= 0; i--) {
+                if (data[i] == '\\') {
+                    leadingSlashes++;
+                } else {
+                    break;
+                }
+            }
+            if (leadingSlashes % 2 == 0) {
+                // Found a real quote which identifies either the start or the
+                // end of a field name or value.
+                insideQuote = !insideQuote;
+            }
+        }
+        data[newPos++] = data[currentPos];
+    }
+
+    if (insideQuote) {
+        throw Exception("Schema is not well formed with mismatched quotes");
+    }
+
+    if (newPos < schema.size()) {
+        data.resize(newPos);
+    }
+    return data;
+}
+
 } // namespace avro
 
diff --git a/lang/c++/jsonschemas/bigrecord b/lang/c++/jsonschemas/bigrecord
index ba430a0..af8a5ad 100644
--- a/lang/c++/jsonschemas/bigrecord
+++ b/lang/c++/jsonschemas/bigrecord
@@ -1,9 +1,11 @@
 {
     "type": "record",
+    "doc": "Top level Doc.",
     "name": "RootRecord",
     "fields": [
         {
             "name": "mylong",
+            "doc": "mylong field doc.",
             "type": "long"
         },
         {
diff --git a/lang/c++/test/DataFileTests.cc b/lang/c++/test/DataFileTests.cc
index 8d8c7b0..bb2efc6 100644
--- a/lang/c++/test/DataFileTests.cc
+++ b/lang/c++/test/DataFileTests.cc
@@ -45,6 +45,7 @@ using boost::unit_test::test_suite;
 using avro::ValidSchema;
 using avro::GenericDatum;
 using avro::GenericRecord;
+using avro::NodePtr;
 
 const int count = 1000;
 
@@ -133,13 +134,29 @@ static const char dsch[] = "{\"type\": \"record\","
         "{\"name\":\"re\", \"type\":\"double\"},"
         "{\"name\":\"im\", \"type\":\"double\"}"
     "]}";
-static const char dblsch[] = "{\"type\": \"record\","
-    "\"name\":\"ComplexDouble\", \"fields\": ["
+static const char dblsch[] =
+    "{\"type\": \"record\","
+    "\"name\":\"ComplexDouble\", "
+    "\"doc\": \"\\\"Quoted_doc_string\\\"\", "
+    "\"fields\": ["
         "{\"name\":\"re\", \"type\":\"double\"}"
     "]}";
 static const char fsch[] = "{\"type\": \"fixed\","
     "\"name\":\"Fixed_32\", \"size\":4}";
-
+static const char ischWithDoc[] =
+    "{\"type\": \"record\","
+    "\"name\":\"ComplexInteger\", "
+    "\"doc\": \"record_doc\", "
+    "\"fields\": ["
+        "{\"name\":\"re1\", \"type\":\"long\", \"doc\": \"field_doc\"},"
+        "{\"name\":\"re2\", \"type\":\"long\"},"
+        "{\"name\":\"re3\", \"type\":\"long\", \"doc\": \"\"},"
+        "{\"name\":\"re4\", \"type\":\"long\", "
+        "\"doc\": \"A_\\\"quoted_doc\\\"\"},"
+        "{\"name\":\"re5\", \"type\":\"long\", \"doc\": \"doc with\nspaces\"},"
+        "{\"name\":\"re6\", \"type\":\"long\", "
+        "\"doc\": \"extra slashes\\\\\\\\\"}"
+    "]}";
 
 string toString(const ValidSchema& s)
 {
@@ -561,18 +578,42 @@ public:
 #endif
 
     void testSchemaReadWrite() {
-    uint32_t a=42;
-    {
+        uint32_t a=42;
+        {
             avro::DataFileWriter<uint32_t> df(filename, writerSchema);
-        df.write(a);
+            df.write(a);
         }
 
         {
-        avro::DataFileReader<uint32_t> df(filename);
-        uint32_t b;
+            avro::DataFileReader<uint32_t> df(filename);
+            uint32_t b;
             df.read(b);
             BOOST_CHECK_EQUAL(b, a);
+        }
     }
+
+    void testSchemaReadWriteWithDoc() {
+        uint32_t a=42;
+        {
+          avro::DataFileWriter<uint32_t> df(filename, writerSchema);
+          df.write(a);
+        }
+
+        {
+          avro::DataFileReader<uint32_t> df(filename);
+          uint32_t b;
+          df.read(b);
+          BOOST_CHECK_EQUAL(b, a);
+
+          const NodePtr& root = df.readerSchema().root();
+          BOOST_CHECK_EQUAL(root->getDoc(), "record_doc");
+          BOOST_CHECK_EQUAL(root->leafAt(0)->getDoc(), "field_doc");
+          BOOST_CHECK_EQUAL(root->leafAt(1)->getDoc(), "");
+          BOOST_CHECK_EQUAL(root->leafAt(2)->getDoc(), "");
+          BOOST_CHECK_EQUAL(root->leafAt(3)->getDoc(), "A_\"quoted_doc\"");
+          BOOST_CHECK_EQUAL(root->leafAt(4)->getDoc(), "doc with\nspaces");
+          BOOST_CHECK_EQUAL(root->leafAt(5)->getDoc(), "extra slashes\\\\");
+        }
     }
 };
 
@@ -677,6 +718,14 @@ init_unit_test_suite(int argc, char *argv[])
         ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testCleanup, t));
         boost::unit_test::framework::master_test_suite().add(ts);
     }
+    {
+        test_suite *ts = BOOST_TEST_SUITE("DataFile tests: test12.df");
+        shared_ptr<DataFileTest> t(new DataFileTest("test12.df", ischWithDoc, 
ischWithDoc));
+        ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testWrite, t));
+        
ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testSchemaReadWriteWithDoc, t));
+        ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testCleanup, t));
+        boost::unit_test::framework::master_test_suite().add(ts);
+    }
 
     return 0;
 }
diff --git a/lang/c++/test/SchemaTests.cc b/lang/c++/test/SchemaTests.cc
index 026c4d0..f6d6195 100644
--- a/lang/c++/test/SchemaTests.cc
+++ b/lang/c++/test/SchemaTests.cc
@@ -47,42 +47,42 @@ const char* basicSchemas[] = {
     "{ \"type\": \"string\" }",
 
     // Record
-    "{\"type\": \"record\",\"name\": \"Test\",\"fields\": []}",
-    "{\"type\": \"record\",\"name\": \"Test\",\"fields\": "
-        "[{\"name\": \"f\",\"type\": \"long\"}]}",
-    "{\"type\": \"record\",\"name\": \"Test\",\"fields\": "
-        "[{\"name\": \"f1\",\"type\": \"long\"},"
-        "{\"name\": \"f2\", \"type\": \"int\"}]}",
-    "{\"type\": \"error\",\"name\": \"Test\",\"fields\": "
-        "[{\"name\": \"f1\",\"type\": \"long\"},"
-        "{\"name\": \"f2\", \"type\": \"int\"}]}",
+    "{\"type\":\"record\",\"name\":\"Test\",\"doc\":\"Doc_string\",\"fields\":[]}",
+    "{\"type\":\"record\",\"name\":\"Test\",\"fields\":"
+        "[{\"name\":\"f\",\"type\":\"long\"}]}",
+    "{\"type\":\"record\",\"name\":\"Test\",\"fields\":"
+        "[{\"name\":\"f1\",\"type\":\"long\",\"doc\":\"field_doc\"},"
+        "{\"name\":\"f2\",\"type\":\"int\"}]}",
+    "{\"type\":\"error\",\"name\":\"Test\",\"fields\":"
+        "[{\"name\":\"f1\",\"type\":\"long\"},"
+        "{\"name\":\"f2\",\"type\":\"int\"}]}",
 
     // Recursive.
     "{\"type\":\"record\",\"name\":\"LongList\","
-        "\"fields\":[{\"name\":\"value\",\"type\":\"long\"},"
+        "\"fields\":[{\"name\":\"value\",\"type\":\"long\",\"doc\":\"recursive_doc\"},"
         "{\"name\":\"next\",\"type\":[\"LongList\",\"null\"]}]}",
     // Enum
-    "{\"type\": \"enum\", \"name\": \"Test\", \"symbols\": [\"A\", \"B\"]}",
+    "{\"type\":\"enum\",\"doc\":\"enum_doc\",\"name\":\"Test\",\"symbols\":[\"A\",\"B\"]}",
 
     // Array
-    "{\"type\": \"array\", \"items\": \"long\"}",
-    "{\"type\": \"array\",\"items\": {\"type\": \"enum\", "
-        "\"name\": \"Test\", \"symbols\": [\"A\", \"B\"]}}",
+    "{\"type\":\"array\",\"doc\":\"array_doc\",\"items\":\"long\"}",
+    "{\"type\":\"array\",\"items\":{\"type\":\"enum\","
+        "\"name\":\"Test\",\"symbols\":[\"A\",\"B\"]}}",
 
     // Map
-    "{\"type\": \"map\", \"values\": \"long\"}",
-    "{\"type\": \"map\",\"values\": {\"type\": \"enum\", "
-        "\"name\": \"Test\", \"symbols\": [\"A\", \"B\"]}}",
+    "{\"type\":\"map\",\"doc\":\"map_doc\",\"values\":\"long\"}",
+    "{\"type\":\"map\",\"values\":{\"type\":\"enum\", "
+        "\"name\":\"Test\",\"symbols\":[\"A\",\"B\"]}}",
 
     // Union
-    "[\"string\", \"null\", \"long\"]",
+    "[\"string\",\"null\",\"long\"]",
 
     // Fixed
-    "{ \"type\": \"fixed\", \"name\": \"Test\", \"size\": 1}",
-    "{\"type\": \"fixed\", \"name\": \"MyFixed\", "
-        "\"namespace\": \"org.apache.hadoop.avro\", \"size\": 1}",
-    "{ \"type\": \"fixed\", \"name\": \"Test\", \"size\": 1}",
-    "{ \"type\": \"fixed\", \"name\": \"Test\", \"size\": 1}",
+    "{\"type\":\"fixed\",\"doc\":\"fixed_doc\",\"name\":\"Test\",\"size\":1}",
+    "{\"type\":\"fixed\",\"name\":\"MyFixed\","
+        "\"namespace\":\"org.apache.hadoop.avro\",\"size\":1}",
+    "{\"type\":\"fixed\",\"name\":\"Test\",\"size\":1}",
+    "{\"type\":\"fixed\",\"name\":\"Test\",\"size\":1}",
 
     // Extra attributes (should be ignored)
     "{\"type\": \"null\", \"extra attribute\": \"should be ignored\"}",
@@ -135,7 +135,7 @@ const char* basicSchemaErrors[] = {
     // Duplicate type
     "[{\"type\": \"array\", \"items\": \"long\"}, "
         "{\"type\": \"array\", \"items\": \"string\"}]",
-        
+
     // Fixed
     // No size
     "{\"type\": \"fixed\", \"name\": \"Missing size\"}",
@@ -166,7 +166,7 @@ const char* roundTripSchemas[] = {
     "{\"type\":\"record\",\"name\":\"Test\",\"fields\":"
         "[{\"name\":\"f1\",\"type\":\"long\"},"
         "{\"name\":\"f2\",\"type\":\"int\"}]}",
-/* Avro-C++ cannot do a round-trip on error schemas. 
+/* Avro-C++ cannot do a round-trip on error schemas.
  * "{\"type\":\"error\",\"name\":\"Test\",\"fields\":"
  *       "[{\"name\":\"f1\",\"type\":\"long\"},"
  *       "{\"name\":\"f2\",\"type\":\"int\"}]}"
@@ -199,7 +199,32 @@ const char* roundTripSchemas[] = {
     "{\"type\":\"fixed\",\"name\":\"Test\",\"size\":1}"
 };
 
+const char* schemasToCompact[] = {
+    // Schema without any whitespace
+    "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[]}",
+
+    // Schema with whitespaces outside of field names/values only.
+    "{\"type\":   \"record\",\n   \n\"name\":\"Test\", \t\t\"fields\":[]}\n \n",
 
+    // Schema with whitespaces both inside and outside of field names/values.
+    "{\"type\":   \"record\",  \"name\":               \"ComplexInteger\"\n, "
+    "\"doc\": \"record_doc °C \u00f8 \x1f \\n \n \t\", "
+    "\"fields\": ["
+        "{\"name\":   \"re1\", \"type\":               \"long\", "
+        "\"doc\":   \"A \\\"quoted doc\\\"\"      },                 "
+        "{\"name\":  \"re2\", \"type\":   \"long\", \n\t"
+        "\"doc\": \"extra slashes\\\\\\\\\"}"
+    "]}"};
+
+const char* compactSchemas[] = {
+    "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[]}",
+    "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[]}",
+    "{\"type\":\"record\",\"name\":\"ComplexInteger\","
+    "\"doc\":\"record_doc °C \u00f8 \\u001f \\n \\n \\t\","
+    "\"fields\":["
+        "{\"name\":\"re1\",\"type\":\"long\",\"doc\":\"A \\\"quoted doc\\\"\"},"
+        "{\"name\":\"re2\",\"type\":\"long\",\"doc\":\"extra slashes\\\\\\\\\"}"
+    "]}"};
 
 static void testBasic(const char* schema)
 {
@@ -219,17 +244,36 @@ static void testCompile(const char* schema)
     compileJsonSchemaFromString(std::string(schema));
 }
 
-// Test that the JSON output from a valid schema matches the JSON that was 
+// Test that the JSON output from a valid schema matches the JSON that was
 // used to construct it, apart from whitespace changes.
 static void testRoundTrip(const char* schema)
 {
     BOOST_TEST_CHECKPOINT(schema);
-    avro::ValidSchema compiledSchema = compileJsonSchemaFromString(std::string(schema));
+    avro::ValidSchema compiledSchema =
+        compileJsonSchemaFromString(std::string(schema));
     std::ostringstream os;
     compiledSchema.toJson(os);
     std::string result = os.str();
     result.erase(std::remove_if(result.begin(), result.end(), ::isspace), result.end()); // Remove whitespace
     BOOST_CHECK(result == std::string(schema));
+    // Verify that the compact schema from toJson has the same content as the
+    // schema.
+    std::string result2 = compiledSchema.toJson(false);
+    BOOST_CHECK(result2 == std::string(schema));
+}
+
+static void testCompactSchemas()
+{
+  for (size_t i = 0; i < sizeof(schemasToCompact)/ sizeof(schemasToCompact[0]); i++)
+  {
+    const char* schema = schemasToCompact[i];
+    BOOST_TEST_CHECKPOINT(schema);
+    avro::ValidSchema compiledSchema =
+        compileJsonSchemaFromString(std::string(schema));
+
+    std::string result = compiledSchema.toJson(false);
+    BOOST_CHECK_EQUAL(result, compactSchemas[i]);
+  }
 }
 
 }
@@ -239,10 +283,10 @@ static void testRoundTrip(const char* schema)
 
 #define ADD_PARAM_TEST(ts, func, data) \
     ts->add(BOOST_PARAM_TEST_CASE(&func, data, ENDOF(data)))
-    
+
 
 boost::unit_test::test_suite*
-init_unit_test_suite(int argc, char* argv[]) 
+init_unit_test_suite(int argc, char* argv[])
 {
     using namespace boost::unit_test;
 
@@ -252,5 +296,6 @@ init_unit_test_suite(int argc, char* argv[])
         avro::schema::basicSchemaErrors);
     ADD_PARAM_TEST(ts, avro::schema::testCompile, avro::schema::basicSchemas);
     ADD_PARAM_TEST(ts, avro::schema::testRoundTrip, avro::schema::roundTripSchemas);
+    ts->add(BOOST_TEST_CASE(&avro::schema::testCompactSchemas));
     return ts;
 }

[avro] branch master updated: AVRO-1256: C++ API compileJsonSchema ignores "doc" and custom attributes on a field/record (#345)

Reply via email to