This is an automated email from the ASF dual-hosted git repository.
thiru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/master by this push:
new 4a8e814 AVRO-1256: C++ API compileJsonSchema ignores "doc" and
custom attributes on a field/record (#345)
4a8e814 is described below
commit 4a8e814327be1f18b203784778f8b17d7f9da8b0
Author: Aniket Mokashi <[email protected]>
AuthorDate: Mon Nov 12 04:57:39 2018 -0800
AVRO-1256: C++ API compileJsonSchema ignores "doc" and custom attributes
on a field/record (#345)
* AVRO-1256. C++ API compileJsonSchema ignores doc and custom attributes on
a field/record
* minor code fixes
* minor code fixes
* more std::string fixes
* Add escape_json function to print doc string
* solve merge conflict
* compactSchema changes
* minor refactor
* Fix tests
* fix formatting
* fix formatting
* Fix nullptr and tests
---
lang/c++/.gitignore | 2 +
lang/c++/api/Node.hh | 10 +++-
lang/c++/api/NodeImpl.hh | 49 +++++++++++++++++-
lang/c++/api/Schema.hh | 8 ++-
lang/c++/api/ValidSchema.hh | 4 ++
lang/c++/impl/Compiler.cc | 114 +++++++++++++++++++++++++++++++----------
lang/c++/impl/DataFile.cc | 4 +-
lang/c++/impl/NodeImpl.cc | 31 +++++++++++
lang/c++/impl/Schema.cc | 9 ++++
lang/c++/impl/ValidSchema.cc | 78 +++++++++++++++++++++++++---
lang/c++/jsonschemas/bigrecord | 2 +
lang/c++/test/DataFileTests.cc | 65 ++++++++++++++++++++---
lang/c++/test/SchemaTests.cc | 103 ++++++++++++++++++++++++++-----------
13 files changed, 400 insertions(+), 79 deletions(-)
diff --git a/lang/c++/.gitignore b/lang/c++/.gitignore
index 76f0125..4ac073b 100644
--- a/lang/c++/.gitignore
+++ b/lang/c++/.gitignore
@@ -3,3 +3,5 @@ build.mac/
doc/
test.avro
test6.df
+test8.df
+test9.df
diff --git a/lang/c++/api/Node.hh b/lang/c++/api/Node.hh
index ebba375..4d54a5e 100644
--- a/lang/c++/api/Node.hh
+++ b/lang/c++/api/Node.hh
@@ -78,7 +78,7 @@ std::ostream& operator << (std::ostream& os, const Name& n) {
/// objects.
///
/// The Node object uses reference-counted pointers. This is so that schemas
-/// may be reused in other other schemas, without needing to worry about memory
+/// may be reused in other schemas, without needing to worry about memory
/// deallocation for nodes that are added to multiple schema parse trees.
///
/// Node has minimal implementation, serving as an abstract base class for
@@ -117,6 +117,12 @@ class AVRO_DECL Node : private boost::noncopyable
}
virtual const Name &name() const = 0;
+ virtual const std::string &getDoc() const = 0;
+ void setDoc(const std::string &doc) {
+ checkLock();
+ doSetDoc(doc);
+ }
+
void addLeaf(const NodePtr &newLeaf) {
checkLock();
doAddLeaf(newLeaf);
@@ -170,6 +176,8 @@ class AVRO_DECL Node : private boost::noncopyable
}
virtual void doSetName(const Name &name) = 0;
+ virtual void doSetDoc(const std::string &name) = 0;
+
virtual void doAddLeaf(const NodePtr &newLeaf) = 0;
virtual void doAddName(const std::string &name) = 0;
virtual void doSetFixedSize(int size) = 0;
diff --git a/lang/c++/api/NodeImpl.hh b/lang/c++/api/NodeImpl.hh
index 0f32023..d4c7639 100644
--- a/lang/c++/api/NodeImpl.hh
+++ b/lang/c++/api/NodeImpl.hh
@@ -35,7 +35,7 @@
namespace avro {
/// Implementation details for Node. NodeImpl represents all the avro types,
-/// whose properties are enabled are disabled by selecting concept classes.
+/// whose properties are enabled and disabled by selecting concept classes.
template
<
@@ -52,6 +52,7 @@ class NodeImpl : public Node
NodeImpl(Type type) :
Node(type),
nameAttribute_(),
+ docAttribute_(),
leafAttributes_(),
leafNameAttributes_(),
sizeAttribute_()
@@ -64,13 +65,30 @@ class NodeImpl : public Node
const SizeConcept &size) :
Node(type),
nameAttribute_(name),
+ docAttribute_(),
leafAttributes_(leaves),
leafNameAttributes_(leafNames),
sizeAttribute_(size)
{ }
+ // Ctor with "doc"
+ NodeImpl(Type type,
+ const NameConcept &name,
+ const concepts::SingleAttribute<std::string> &doc,
+ const LeavesConcept &leaves,
+ const LeafNamesConcept &leafNames,
+ const SizeConcept &size) :
+ Node(type),
+ nameAttribute_(name),
+ docAttribute_(doc),
+ leafAttributes_(leaves),
+ leafNameAttributes_(leafNames),
+ sizeAttribute_(size)
+ {}
+
void swap(NodeImpl& impl) {
std::swap(nameAttribute_, impl.nameAttribute_);
+ std::swap(docAttribute_, impl.docAttribute_);
std::swap(leafAttributes_, impl.leafAttributes_);
std::swap(leafNameAttributes_, impl.leafNameAttributes_);
std::swap(sizeAttribute_, impl.sizeAttribute_);
@@ -78,6 +96,7 @@ class NodeImpl : public Node
}
bool hasName() const {
+ // e.g. true for the SingleAttribute and MultiAttribute concepts, false for NoAttribute.
return NameConcept::hasAttribute;
}
@@ -89,6 +108,14 @@ class NodeImpl : public Node
return nameAttribute_.get();
}
+ void doSetDoc(const std::string &doc) {
+ docAttribute_.add(doc);
+ }
+
+ const std::string &getDoc() const {
+ return docAttribute_.get();
+ }
+
void doAddLeaf(const NodePtr &newLeaf) {
leafAttributes_.add(newLeaf);
}
@@ -172,6 +199,10 @@ class NodeImpl : public Node
}
NameConcept nameAttribute_;
+
+ // Note: where the name uses HasName (= SingleAttribute<Name>), the doc uses std::string instead.
+ concepts::SingleAttribute<std::string> docAttribute_; /** Doc used to
compare schemas */
+
LeavesConcept leafAttributes_;
LeafNamesConcept leafNameAttributes_;
SizeConcept sizeAttribute_;
@@ -181,6 +212,8 @@ class NodeImpl : public Node
typedef concepts::NoAttribute<Name> NoName;
typedef concepts::SingleAttribute<Name> HasName;
+typedef concepts::SingleAttribute<std::string> HasDoc;
+
typedef concepts::NoAttribute<NodePtr> NoLeaves;
typedef concepts::SingleAttribute<NodePtr> SingleLeaf;
typedef concepts::MultiAttribute<NodePtr> MultiLeaves;
@@ -287,6 +320,20 @@ public:
}
}
+ NodeRecord(const HasName &name, const HasDoc &doc, const MultiLeaves
&fields,
+ const LeafNames &fieldsNames,
+ const std::vector<GenericDatum> &dv) :
+ NodeImplRecord(AVRO_RECORD, name, doc, fields, fieldsNames, NoSize()),
+ defaultValues(dv) {
+ for (size_t i = 0; i < leafNameAttributes_.size(); ++i) {
+ if (!nameIndex_.add(leafNameAttributes_.get(i), i)) {
+ throw Exception(boost::format(
+ "Cannot add duplicate name: %1%") %
+ leafNameAttributes_.get(i));
+ }
+ }
+ }
+
void swap(NodeRecord& r) {
NodeImplRecord::swap(r);
defaultValues.swap(r.defaultValues);
diff --git a/lang/c++/api/Schema.hh b/lang/c++/api/Schema.hh
index 8ce5f8d..646b95e 100644
--- a/lang/c++/api/Schema.hh
+++ b/lang/c++/api/Schema.hh
@@ -16,11 +16,12 @@
* limitations under the License.
*/
-#ifndef avro_Schema_hh__
-#define avro_Schema_hh__
+#ifndef avro_Schema_hh__
+#define avro_Schema_hh__
#include "Config.hh"
#include "NodeImpl.hh"
+#include <string>
/// \file
///
@@ -102,6 +103,9 @@ class AVRO_DECL RecordSchema : public Schema {
public:
RecordSchema(const std::string &name);
void addField(const std::string &name, const Schema &fieldSchema);
+
+ std::string getDoc() const;
+ void setDoc(const std::string &);
};
class AVRO_DECL EnumSchema : public Schema {
diff --git a/lang/c++/api/ValidSchema.hh b/lang/c++/api/ValidSchema.hh
index 30eb33e..d0bbd4e 100644
--- a/lang/c++/api/ValidSchema.hh
+++ b/lang/c++/api/ValidSchema.hh
@@ -50,11 +50,15 @@ public:
}
void toJson(std::ostream &os) const;
+ std::string toJson(bool prettyPrint = true) const;
void toFlatList(std::ostream &os) const;
protected:
NodePtr root_;
+
+ private:
+ static std::string compactSchema(const std::string &schema);
};
} // namespace avro
diff --git a/lang/c++/impl/Compiler.cc b/lang/c++/impl/Compiler.cc
index 725136d..bc0f3cd 100644
--- a/lang/c++/impl/Compiler.cc
+++ b/lang/c++/impl/Compiler.cc
@@ -15,6 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#include <boost/algorithm/string/replace.hpp>
#include <sstream>
#include "Compiler.hh"
@@ -42,7 +43,7 @@ typedef map<Name, NodePtr> SymbolTable;
// #define DEBUG_VERBOSE
-static NodePtr makePrimitive(const std::string& t)
+static NodePtr makePrimitive(const string& t)
{
if (t == "null") {
return NodePtr(new NodePrimitive(AVRO_NULL));
@@ -65,7 +66,7 @@ static NodePtr makePrimitive(const std::string& t)
}
}
-static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string&
ns);
+static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string
&ns);
template <typename T>
concepts::SingleAttribute<T> asSingleAttribute(const T& t)
@@ -75,17 +76,17 @@ concepts::SingleAttribute<T> asSingleAttribute(const T& t)
return n;
}
-static bool isFullName(const string& s)
+static bool isFullName(const string &s)
{
return s.find('.') != string::npos;
}
-static Name getName(const string& name, const string& ns)
+static Name getName(const string &name, const string &ns)
{
return (isFullName(name)) ? Name(name) : Name(name, ns);
}
-static NodePtr makeNode(const std::string& t, SymbolTable& st, const string&
ns)
+static NodePtr makeNode(const string &t, SymbolTable &st, const string &ns)
{
NodePtr result = makePrimitive(t);
if (result) {
@@ -100,8 +101,15 @@ static NodePtr makeNode(const std::string& t, SymbolTable&
st, const string& ns)
throw Exception(boost::format("Unknown type: %1%") % n.fullname());
}
-const json::Object::const_iterator findField(const Entity& e,
- const Object& m, const string& fieldName)
+/** Returns "true" if the field is in the container */
+// e.g.: can be false for non-mandatory fields
+bool containsField(const Object &m, const string &fieldName) {
+ Object::const_iterator it = m.find(fieldName);
+ return it != m.end();
+}
+
+const json::Object::const_iterator findField(const Entity &e,
+ const Object &m, const string &fieldName)
{
Object::const_iterator it = m.find(fieldName);
if (it == m.end()) {
@@ -112,7 +120,7 @@ const json::Object::const_iterator findField(const Entity&
e,
}
}
-template <typename T> void ensureType(const Entity& e, const string& name)
+template <typename T> void ensureType(const Entity &e, const string &name)
{
if (e.type() != json::type_traits<T>::type()) {
throw Exception(boost::format("Json field \"%1%\" is not a %2%: %3%") %
@@ -120,8 +128,8 @@ template <typename T> void ensureType(const Entity& e,
const string& name)
}
}
-const string& getStringField(const Entity& e, const Object& m,
- const string& fieldName)
+const string& getStringField(const Entity &e, const Object &m,
+ const string &fieldName)
{
Object::const_iterator it = findField(e, m, fieldName);
ensureType<string>(it->second, fieldName);
@@ -144,6 +152,19 @@ const int64_t getLongField(const Entity& e, const Object&
m,
return it->second.longValue();
}
+// Unescape double quotes (") for de-serialization. This function complements the
+// escape() helper in NodeImpl.cc, which is used for serialization.
+static void unescape(string& s) {
+ boost::replace_all(s, "\\\"", "\"");
+}
+
+const string getDocField(const Entity& e, const Object& m)
+{
+ string doc = getStringField(e, m, "doc");
+ unescape(doc);
+ return doc;
+}
+
struct Field {
const string& name;
const NodePtr schema;
@@ -162,7 +183,7 @@ static void assertType(const Entity& e, EntityType et)
}
}
-static vector<uint8_t> toBin(const std::string& s)
+static vector<uint8_t> toBin(const string& s)
{
vector<uint8_t> result(s.size());
if (s.size() > 0) {
@@ -278,14 +299,18 @@ static Field makeField(const Entity& e, SymbolTable& st,
const string& ns)
Object::const_iterator it = findField(e, m, "type");
map<string, Entity>::const_iterator it2 = m.find("default");
NodePtr node = makeNode(it->second, st, ns);
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
GenericDatum d = (it2 == m.end()) ? GenericDatum() :
makeGenericDatum(node, it2->second, st);
return Field(n, node, d);
}
-static NodePtr makeRecordNode(const Entity& e,
- const Name& name, const Object& m, SymbolTable& st, const string& ns)
-{
+// Extended makeRecordNode (with doc).
+static NodePtr makeRecordNode(const Entity& e, const Name& name,
+ const string* doc, const Object& m,
+ SymbolTable& st, const string& ns) {
const Array& v = getArrayField(e, m, "fields");
concepts::MultiAttribute<string> fieldNames;
concepts::MultiAttribute<NodePtr> fieldValues;
@@ -297,8 +322,15 @@ static NodePtr makeRecordNode(const Entity& e,
fieldValues.add(f.schema);
defaultValues.push_back(f.defaultValue);
}
- return NodePtr(new NodeRecord(asSingleAttribute(name),
- fieldValues, fieldNames, defaultValues));
+ NodeRecord* node;
+ if (doc == NULL) {
+ node = new NodeRecord(asSingleAttribute(name), fieldValues, fieldNames,
+ defaultValues);
+ } else {
+ node = new NodeRecord(asSingleAttribute(name), asSingleAttribute(*doc),
+ fieldValues, fieldNames, defaultValues);
+ }
+ return NodePtr(node);
}
static NodePtr makeEnumNode(const Entity& e,
@@ -313,7 +345,11 @@ static NodePtr makeEnumNode(const Entity& e,
}
symbols.add(it->stringValue());
}
- return NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
+ NodePtr node = NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
}
static NodePtr makeFixedNode(const Entity& e,
@@ -324,16 +360,24 @@ static NodePtr makeFixedNode(const Entity& e,
throw Exception(boost::format("Size for fixed is not positive: %1%") %
e.toString());
}
- return NodePtr(new NodeFixed(asSingleAttribute(name),
- asSingleAttribute(v)));
+ NodePtr node =
+ NodePtr(new NodeFixed(asSingleAttribute(name), asSingleAttribute(v)));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
}
static NodePtr makeArrayNode(const Entity& e, const Object& m,
SymbolTable& st, const string& ns)
{
Object::const_iterator it = findField(e, m, "items");
- return NodePtr(new NodeArray(asSingleAttribute(
- makeNode(it->second, st, ns))));
+ NodePtr node = NodePtr(new NodeArray(
+ asSingleAttribute(makeNode(it->second, st, ns))));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
}
static NodePtr makeMapNode(const Entity& e, const Object& m,
@@ -341,8 +385,12 @@ static NodePtr makeMapNode(const Entity& e, const Object&
m,
{
Object::const_iterator it = findField(e, m, "values");
- return NodePtr(new NodeMap(asSingleAttribute(
- makeNode(it->second, st, ns))));
+ NodePtr node = NodePtr(new NodeMap(
+ asSingleAttribute(makeNode(it->second, st, ns))));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
}
static Name getName(const Entity& e, const Object& m, const string& ns)
@@ -380,9 +428,19 @@ static NodePtr makeNode(const Entity& e, const Object& m,
if (type == "record" || type == "error") {
result = NodePtr(new NodeRecord());
st[nm] = result;
- NodePtr r = makeRecordNode(e, nm, m, st, nm.ns());
- (boost::dynamic_pointer_cast<NodeRecord>(r))->swap(
- *boost::dynamic_pointer_cast<NodeRecord>(result));
+ // Get the record's doc, if present
+ if (containsField(m, "doc")) {
+ string doc = getDocField(e, m);
+
+ NodePtr r = makeRecordNode(e, nm, &doc, m, st, nm.ns());
+ (boost::dynamic_pointer_cast<NodeRecord>(r))->swap(
+ *boost::dynamic_pointer_cast<NodeRecord>(result));
+ } else { // No doc
+ NodePtr r =
+ makeRecordNode(e, nm, NULL, m, st, nm.ns());
+ (boost::dynamic_pointer_cast<NodeRecord>(r))
+ ->swap(*boost::dynamic_pointer_cast<NodeRecord>(result));
+ }
} else {
result = (type == "enum") ? makeEnumNode(e, nm, m) :
makeFixedNode(e, nm, m);
@@ -447,7 +505,7 @@ AVRO_DECL ValidSchema compileJsonSchemaFromString(const
char* input)
::strlen(input));
}
-AVRO_DECL ValidSchema compileJsonSchemaFromString(const std::string& input)
+AVRO_DECL ValidSchema compileJsonSchemaFromString(const string& input)
{
return compileJsonSchemaFromMemory(
reinterpret_cast<const uint8_t*>(&input[0]), input.size());
@@ -468,7 +526,7 @@ AVRO_DECL void compileJsonSchema(std::istream &is,
ValidSchema &schema)
schema = compile(is);
}
-AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema,
std::string &error)
+AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, string
&error)
{
try {
compileJsonSchema(is, schema);
diff --git a/lang/c++/impl/DataFile.cc b/lang/c++/impl/DataFile.cc
index a71860d..1777d53 100644
--- a/lang/c++/impl/DataFile.cc
+++ b/lang/c++/impl/DataFile.cc
@@ -121,7 +121,7 @@ void DataFileWriterBase::init(const ValidSchema &schema,
size_t syncInterval, co
} else {
throw Exception(boost::format("Unknown codec: %1%") % codec);
}
- setMetadata(AVRO_SCHEMA_KEY, toString(schema));
+ setMetadata(AVRO_SCHEMA_KEY, schema.toJson(false));
writeHeader();
encoderPtr_->init(*buffer_);
@@ -296,7 +296,7 @@ void DataFileReaderBase::init()
void DataFileReaderBase::init(const ValidSchema& readerSchema)
{
readerSchema_ = readerSchema;
- dataDecoder_ = (toString(readerSchema_) != toString(dataSchema_)) ?
+ dataDecoder_ = (readerSchema_.toJson(true) != dataSchema_.toJson(true)) ?
resolvingDecoder(dataSchema_, readerSchema_, binaryDecoder()) :
binaryDecoder();
readDataBlock();
diff --git a/lang/c++/impl/NodeImpl.cc b/lang/c++/impl/NodeImpl.cc
index 50b7fba..bdb05a0 100644
--- a/lang/c++/impl/NodeImpl.cc
+++ b/lang/c++/impl/NodeImpl.cc
@@ -17,6 +17,8 @@
*/
+#include <sstream>
+#include <iomanip>
#include <boost/algorithm/string/replace.hpp>
#include "NodeImpl.hh"
@@ -25,6 +27,7 @@ using std::string;
namespace avro {
namespace {
+
// Escape string for serialization.
string escape(const string &unescaped) {
string s;
@@ -219,12 +222,20 @@ void
NodePrimitive::printJson(std::ostream &os, int depth) const
{
os << '\"' << type() << '\"';
+ if (getDoc().size()) {
+ os << ",\n" << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\"";
+ }
}
void
NodeSymbolic::printJson(std::ostream &os, int depth) const
{
os << '\"' << nameAttribute_.get() << '\"';
+ if (getDoc().size()) {
+ os << ",\n" << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\"";
+ }
}
static void printName(std::ostream& os, const Name& n, int depth)
@@ -241,6 +252,10 @@ NodeRecord::printJson(std::ostream &os, int depth) const
os << "{\n";
os << indent(++depth) << "\"type\": \"record\",\n";
printName(os, nameAttribute_.get(), depth);
+ if (getDoc().size()) {
+ os << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
os << indent(depth) << "\"fields\": [";
size_t fields = leafAttributes_.size();
@@ -430,6 +445,10 @@ NodeEnum::printJson(std::ostream &os, int depth) const
{
os << "{\n";
os << indent(++depth) << "\"type\": \"enum\",\n";
+ if (getDoc().size()) {
+ os << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
printName(os, nameAttribute_.get(), depth);
os << indent(depth) << "\"symbols\": [\n";
@@ -451,6 +470,10 @@ NodeArray::printJson(std::ostream &os, int depth) const
{
os << "{\n";
os << indent(depth+1) << "\"type\": \"array\",\n";
+ if (getDoc().size()) {
+ os << indent(depth+1) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
os << indent(depth+1) << "\"items\": ";
leafAttributes_.get()->printJson(os, depth+1);
os << '\n';
@@ -462,6 +485,10 @@ NodeMap::printJson(std::ostream &os, int depth) const
{
os << "{\n";
os << indent(depth+1) <<"\"type\": \"map\",\n";
+ if (getDoc().size()) {
+ os << indent(depth+1) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
os << indent(depth+1) << "\"values\": ";
leafAttributes_.get(1)->printJson(os, depth+1);
os << '\n';
@@ -490,6 +517,10 @@ NodeFixed::printJson(std::ostream &os, int depth) const
{
os << "{\n";
os << indent(++depth) << "\"type\": \"fixed\",\n";
+ if (getDoc().size()) {
+ os << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
printName(os, nameAttribute_.get(), depth);
os << indent(depth) << "\"size\": " << sizeAttribute_.get() << "\n";
os << indent(--depth) << '}';
diff --git a/lang/c++/impl/Schema.cc b/lang/c++/impl/Schema.cc
index b5457ae..6676574 100644
--- a/lang/c++/impl/Schema.cc
+++ b/lang/c++/impl/Schema.cc
@@ -51,6 +51,15 @@ RecordSchema::addField(const std::string &name, const Schema
&fieldSchema)
node_->addLeaf(fieldSchema.root());
}
+std::string RecordSchema::getDoc() const
+{
+ return node_->getDoc();
+}
+void RecordSchema::setDoc(const std::string& doc)
+{
+ node_->setDoc(doc);
+}
+
EnumSchema::EnumSchema(const std::string &name) :
Schema(new NodeEnum)
{
diff --git a/lang/c++/impl/ValidSchema.cc b/lang/c++/impl/ValidSchema.cc
index bd28079..17ab994 100644
--- a/lang/c++/impl/ValidSchema.cc
+++ b/lang/c++/impl/ValidSchema.cc
@@ -24,16 +24,16 @@
#include "Node.hh"
using std::string;
+using std::ostringstream;
using std::make_pair;
using boost::format;
using boost::shared_ptr;
using boost::static_pointer_cast;
namespace avro {
-
typedef std::map<Name, NodePtr> SymbolMap;
-static bool validate(const NodePtr &node, SymbolMap &symbolMap)
+static bool validate(const NodePtr &node, SymbolMap &symbolMap)
{
if (! node->isValid()) {
throw Exception(format("Schema is invalid, due to bad node of type
%1%")
@@ -77,7 +77,7 @@ static bool validate(const NodePtr &node, SymbolMap
&symbolMap)
// map (which could potentially create circular shared pointer
// links that could not be easily freed), replace this node with a
// symbolic link to the original one.
-
+
node->setLeafToSymbolic(i, symbolMap.find(leaf->name())->second);
}
}
@@ -101,7 +101,7 @@ ValidSchema::ValidSchema(const Schema &schema) :
root_(schema.root())
validate(root_);
}
-ValidSchema::ValidSchema() : root_(NullSchema().root())
+ValidSchema::ValidSchema() : root_(NullSchema().root())
{
validate(root_);
}
@@ -113,18 +113,80 @@ ValidSchema::setSchema(const Schema &schema)
validate(root_);
}
-void
+void
ValidSchema::toJson(std::ostream &os) const
-{
+{
root_->printJson(os, 0);
os << '\n';
}
-void
+string
+ValidSchema::toJson(bool prettyPrint) const
+{
+ ostringstream oss;
+ toJson(oss);
+ if (!prettyPrint) {
+ return compactSchema(oss.str());
+ }
+ return oss.str();
+}
+
+void
ValidSchema::toFlatList(std::ostream &os) const
-{
+{
root_->printBasicInfo(os);
}
+/*
+ * compactSchema compacts and returns a formatted string representation
+ * of a ValidSchema object by removing the whitespaces outside of the quoted
+ * field names and values. It can handle the cases where the quoted value is
+ * in UTF-8 format. Note that this method is not responsible for validating
+ * the schema.
+ */
+string ValidSchema::compactSchema(const string& schema) {
+ bool insideQuote = false;
+ size_t newPos = 0;
+ string data(schema.data());
+
+ for (size_t currentPos = 0; currentPos < schema.size(); currentPos++) {
+ if (!insideQuote && std::isspace(data[currentPos])) {
+ // Skip the white spaces outside quotes.
+ continue;
+ }
+
+ if (data[currentPos] == '\"') {
+ // It is valid for a quote to be part of the value for some fields,
+ // e.g., the "doc" field. In that case, the quote is expected to be
+ // escaped inside the schema. Since the escape character '\\' could
+ // be escaped itself, we need to check whether there is an even
+ // number of consecutive backslashes prior to the quote.
+ int leadingSlashes = 0;
+ for (int i = newPos - 1; i >= 0; i--) {
+ if (data[i] == '\\') {
+ leadingSlashes++;
+ } else {
+ break;
+ }
+ }
+ if (leadingSlashes % 2 == 0) {
+ // Found a real quote which identifies either the start or the
+ // end of a field name or value.
+ insideQuote = !insideQuote;
+ }
+ }
+ data[newPos++] = data[currentPos];
+ }
+
+ if (insideQuote) {
+ throw Exception("Schema is not well formed with mismatched quotes");
+ }
+
+ if (newPos < schema.size()) {
+ data.resize(newPos);
+ }
+ return data;
+}
+
} // namespace avro
diff --git a/lang/c++/jsonschemas/bigrecord b/lang/c++/jsonschemas/bigrecord
index ba430a0..af8a5ad 100644
--- a/lang/c++/jsonschemas/bigrecord
+++ b/lang/c++/jsonschemas/bigrecord
@@ -1,9 +1,11 @@
{
"type": "record",
+ "doc": "Top level Doc.",
"name": "RootRecord",
"fields": [
{
"name": "mylong",
+ "doc": "mylong field doc.",
"type": "long"
},
{
diff --git a/lang/c++/test/DataFileTests.cc b/lang/c++/test/DataFileTests.cc
index 8d8c7b0..bb2efc6 100644
--- a/lang/c++/test/DataFileTests.cc
+++ b/lang/c++/test/DataFileTests.cc
@@ -45,6 +45,7 @@ using boost::unit_test::test_suite;
using avro::ValidSchema;
using avro::GenericDatum;
using avro::GenericRecord;
+using avro::NodePtr;
const int count = 1000;
@@ -133,13 +134,29 @@ static const char dsch[] = "{\"type\": \"record\","
"{\"name\":\"re\", \"type\":\"double\"},"
"{\"name\":\"im\", \"type\":\"double\"}"
"]}";
-static const char dblsch[] = "{\"type\": \"record\","
- "\"name\":\"ComplexDouble\", \"fields\": ["
+static const char dblsch[] =
+ "{\"type\": \"record\","
+ "\"name\":\"ComplexDouble\", "
+ "\"doc\": \"\\\"Quoted_doc_string\\\"\", "
+ "\"fields\": ["
"{\"name\":\"re\", \"type\":\"double\"}"
"]}";
static const char fsch[] = "{\"type\": \"fixed\","
"\"name\":\"Fixed_32\", \"size\":4}";
-
+static const char ischWithDoc[] =
+ "{\"type\": \"record\","
+ "\"name\":\"ComplexInteger\", "
+ "\"doc\": \"record_doc\", "
+ "\"fields\": ["
+ "{\"name\":\"re1\", \"type\":\"long\", \"doc\": \"field_doc\"},"
+ "{\"name\":\"re2\", \"type\":\"long\"},"
+ "{\"name\":\"re3\", \"type\":\"long\", \"doc\": \"\"},"
+ "{\"name\":\"re4\", \"type\":\"long\", "
+ "\"doc\": \"A_\\\"quoted_doc\\\"\"},"
+ "{\"name\":\"re5\", \"type\":\"long\", \"doc\": \"doc with\nspaces\"},"
+ "{\"name\":\"re6\", \"type\":\"long\", "
+ "\"doc\": \"extra slashes\\\\\\\\\"}"
+ "]}";
string toString(const ValidSchema& s)
{
@@ -561,18 +578,42 @@ public:
#endif
void testSchemaReadWrite() {
- uint32_t a=42;
- {
+ uint32_t a=42;
+ {
avro::DataFileWriter<uint32_t> df(filename, writerSchema);
- df.write(a);
+ df.write(a);
}
{
- avro::DataFileReader<uint32_t> df(filename);
- uint32_t b;
+ avro::DataFileReader<uint32_t> df(filename);
+ uint32_t b;
df.read(b);
BOOST_CHECK_EQUAL(b, a);
+ }
}
+
+ void testSchemaReadWriteWithDoc() {
+ uint32_t a=42;
+ {
+ avro::DataFileWriter<uint32_t> df(filename, writerSchema);
+ df.write(a);
+ }
+
+ {
+ avro::DataFileReader<uint32_t> df(filename);
+ uint32_t b;
+ df.read(b);
+ BOOST_CHECK_EQUAL(b, a);
+
+ const NodePtr& root = df.readerSchema().root();
+ BOOST_CHECK_EQUAL(root->getDoc(), "record_doc");
+ BOOST_CHECK_EQUAL(root->leafAt(0)->getDoc(), "field_doc");
+ BOOST_CHECK_EQUAL(root->leafAt(1)->getDoc(), "");
+ BOOST_CHECK_EQUAL(root->leafAt(2)->getDoc(), "");
+ BOOST_CHECK_EQUAL(root->leafAt(3)->getDoc(), "A_\"quoted_doc\"");
+ BOOST_CHECK_EQUAL(root->leafAt(4)->getDoc(), "doc with\nspaces");
+ BOOST_CHECK_EQUAL(root->leafAt(5)->getDoc(), "extra slashes\\\\");
+ }
}
};
@@ -677,6 +718,14 @@ init_unit_test_suite(int argc, char *argv[])
ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testCleanup, t));
boost::unit_test::framework::master_test_suite().add(ts);
}
+ {
+ test_suite *ts = BOOST_TEST_SUITE("DataFile tests: test12.df");
+ shared_ptr<DataFileTest> t(new DataFileTest("test12.df", ischWithDoc,
ischWithDoc));
+ ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testWrite, t));
+
ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testSchemaReadWriteWithDoc, t));
+ ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testCleanup, t));
+ boost::unit_test::framework::master_test_suite().add(ts);
+ }
return 0;
}
diff --git a/lang/c++/test/SchemaTests.cc b/lang/c++/test/SchemaTests.cc
index 026c4d0..f6d6195 100644
--- a/lang/c++/test/SchemaTests.cc
+++ b/lang/c++/test/SchemaTests.cc
@@ -47,42 +47,42 @@ const char* basicSchemas[] = {
"{ \"type\": \"string\" }",
// Record
- "{\"type\": \"record\",\"name\": \"Test\",\"fields\": []}",
- "{\"type\": \"record\",\"name\": \"Test\",\"fields\": "
- "[{\"name\": \"f\",\"type\": \"long\"}]}",
- "{\"type\": \"record\",\"name\": \"Test\",\"fields\": "
- "[{\"name\": \"f1\",\"type\": \"long\"},"
- "{\"name\": \"f2\", \"type\": \"int\"}]}",
- "{\"type\": \"error\",\"name\": \"Test\",\"fields\": "
- "[{\"name\": \"f1\",\"type\": \"long\"},"
- "{\"name\": \"f2\", \"type\": \"int\"}]}",
+
"{\"type\":\"record\",\"name\":\"Test\",\"doc\":\"Doc_string\",\"fields\":[]}",
+ "{\"type\":\"record\",\"name\":\"Test\",\"fields\":"
+ "[{\"name\":\"f\",\"type\":\"long\"}]}",
+ "{\"type\":\"record\",\"name\":\"Test\",\"fields\":"
+ "[{\"name\":\"f1\",\"type\":\"long\",\"doc\":\"field_doc\"},"
+ "{\"name\":\"f2\",\"type\":\"int\"}]}",
+ "{\"type\":\"error\",\"name\":\"Test\",\"fields\":"
+ "[{\"name\":\"f1\",\"type\":\"long\"},"
+ "{\"name\":\"f2\",\"type\":\"int\"}]}",
// Recursive.
"{\"type\":\"record\",\"name\":\"LongList\","
- "\"fields\":[{\"name\":\"value\",\"type\":\"long\"},"
+
"\"fields\":[{\"name\":\"value\",\"type\":\"long\",\"doc\":\"recursive_doc\"},"
"{\"name\":\"next\",\"type\":[\"LongList\",\"null\"]}]}",
// Enum
- "{\"type\": \"enum\", \"name\": \"Test\", \"symbols\": [\"A\", \"B\"]}",
+
"{\"type\":\"enum\",\"doc\":\"enum_doc\",\"name\":\"Test\",\"symbols\":[\"A\",\"B\"]}",
// Array
- "{\"type\": \"array\", \"items\": \"long\"}",
- "{\"type\": \"array\",\"items\": {\"type\": \"enum\", "
- "\"name\": \"Test\", \"symbols\": [\"A\", \"B\"]}}",
+ "{\"type\":\"array\",\"doc\":\"array_doc\",\"items\":\"long\"}",
+ "{\"type\":\"array\",\"items\":{\"type\":\"enum\","
+ "\"name\":\"Test\",\"symbols\":[\"A\",\"B\"]}}",
// Map
- "{\"type\": \"map\", \"values\": \"long\"}",
- "{\"type\": \"map\",\"values\": {\"type\": \"enum\", "
- "\"name\": \"Test\", \"symbols\": [\"A\", \"B\"]}}",
+ "{\"type\":\"map\",\"doc\":\"map_doc\",\"values\":\"long\"}",
+ "{\"type\":\"map\",\"values\":{\"type\":\"enum\", "
+ "\"name\":\"Test\",\"symbols\":[\"A\",\"B\"]}}",
// Union
- "[\"string\", \"null\", \"long\"]",
+ "[\"string\",\"null\",\"long\"]",
// Fixed
- "{ \"type\": \"fixed\", \"name\": \"Test\", \"size\": 1}",
- "{\"type\": \"fixed\", \"name\": \"MyFixed\", "
- "\"namespace\": \"org.apache.hadoop.avro\", \"size\": 1}",
- "{ \"type\": \"fixed\", \"name\": \"Test\", \"size\": 1}",
- "{ \"type\": \"fixed\", \"name\": \"Test\", \"size\": 1}",
+ "{\"type\":\"fixed\",\"doc\":\"fixed_doc\",\"name\":\"Test\",\"size\":1}",
+ "{\"type\":\"fixed\",\"name\":\"MyFixed\","
+ "\"namespace\":\"org.apache.hadoop.avro\",\"size\":1}",
+ "{\"type\":\"fixed\",\"name\":\"Test\",\"size\":1}",
+ "{\"type\":\"fixed\",\"name\":\"Test\",\"size\":1}",
// Extra attributes (should be ignored)
"{\"type\": \"null\", \"extra attribute\": \"should be ignored\"}",
@@ -135,7 +135,7 @@ const char* basicSchemaErrors[] = {
// Duplicate type
"[{\"type\": \"array\", \"items\": \"long\"}, "
"{\"type\": \"array\", \"items\": \"string\"}]",
-
+
// Fixed
// No size
"{\"type\": \"fixed\", \"name\": \"Missing size\"}",
@@ -166,7 +166,7 @@ const char* roundTripSchemas[] = {
"{\"type\":\"record\",\"name\":\"Test\",\"fields\":"
"[{\"name\":\"f1\",\"type\":\"long\"},"
"{\"name\":\"f2\",\"type\":\"int\"}]}",
-/* Avro-C++ cannot do a round-trip on error schemas.
+/* Avro-C++ cannot do a round-trip on error schemas.
* "{\"type\":\"error\",\"name\":\"Test\",\"fields\":"
* "[{\"name\":\"f1\",\"type\":\"long\"},"
* "{\"name\":\"f2\",\"type\":\"int\"}]}"
@@ -199,7 +199,32 @@ const char* roundTripSchemas[] = {
"{\"type\":\"fixed\",\"name\":\"Test\",\"size\":1}"
};
+const char* schemasToCompact[] = {
+ // Schema without any whitespace
+ "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[]}",
+
+ // Schema with whitespaces outside of field names/values only.
+ "{\"type\": \"record\",\n \n\"name\":\"Test\", \t\t\"fields\":[]}\n
\n",
+ // Schema with whitespaces both inside and outside of field names/values.
+ "{\"type\": \"record\", \"name\": \"ComplexInteger\"\n, "
+ "\"doc\": \"record_doc °C \u00f8 \x1f \\n \n \t\", "
+ "\"fields\": ["
+ "{\"name\": \"re1\", \"type\": \"long\", "
+ "\"doc\": \"A \\\"quoted doc\\\"\" }, "
+ "{\"name\": \"re2\", \"type\": \"long\", \n\t"
+ "\"doc\": \"extra slashes\\\\\\\\\"}"
+ "]}"};
+
+const char* compactSchemas[] = {
+ "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[]}",
+ "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[]}",
+ "{\"type\":\"record\",\"name\":\"ComplexInteger\","
+ "\"doc\":\"record_doc °C \u00f8 \\u001f \\n \\n \\t\","
+ "\"fields\":["
+ "{\"name\":\"re1\",\"type\":\"long\",\"doc\":\"A \\\"quoted
doc\\\"\"},"
+ "{\"name\":\"re2\",\"type\":\"long\",\"doc\":\"extra
slashes\\\\\\\\\"}"
+ "]}"};
static void testBasic(const char* schema)
{
@@ -219,17 +244,36 @@ static void testCompile(const char* schema)
compileJsonSchemaFromString(std::string(schema));
}
-// Test that the JSON output from a valid schema matches the JSON that was
+// Test that the JSON output from a valid schema matches the JSON that was
// used to construct it, apart from whitespace changes.
static void testRoundTrip(const char* schema)
{
BOOST_TEST_CHECKPOINT(schema);
- avro::ValidSchema compiledSchema =
compileJsonSchemaFromString(std::string(schema));
+ avro::ValidSchema compiledSchema =
+ compileJsonSchemaFromString(std::string(schema));
std::ostringstream os;
compiledSchema.toJson(os);
std::string result = os.str();
result.erase(std::remove_if(result.begin(), result.end(), ::isspace),
result.end()); // Remove whitespace
BOOST_CHECK(result == std::string(schema));
+ // Verify that the compact schema from toJson has the same content as the
+ // schema.
+ std::string result2 = compiledSchema.toJson(false);
+ BOOST_CHECK(result2 == std::string(schema));
+}
+
+static void testCompactSchemas()
+{
+ for (size_t i = 0; i < sizeof(schemasToCompact)/
sizeof(schemasToCompact[0]); i++)
+ {
+ const char* schema = schemasToCompact[i];
+ BOOST_TEST_CHECKPOINT(schema);
+ avro::ValidSchema compiledSchema =
+ compileJsonSchemaFromString(std::string(schema));
+
+ std::string result = compiledSchema.toJson(false);
+ BOOST_CHECK_EQUAL(result, compactSchemas[i]);
+ }
}
}
@@ -239,10 +283,10 @@ static void testRoundTrip(const char* schema)
#define ADD_PARAM_TEST(ts, func, data) \
ts->add(BOOST_PARAM_TEST_CASE(&func, data, ENDOF(data)))
-
+
boost::unit_test::test_suite*
-init_unit_test_suite(int argc, char* argv[])
+init_unit_test_suite(int argc, char* argv[])
{
using namespace boost::unit_test;
@@ -252,5 +296,6 @@ init_unit_test_suite(int argc, char* argv[])
avro::schema::basicSchemaErrors);
ADD_PARAM_TEST(ts, avro::schema::testCompile, avro::schema::basicSchemas);
ADD_PARAM_TEST(ts, avro::schema::testRoundTrip,
avro::schema::roundTripSchemas);
+ ts->add(BOOST_TEST_CASE(&avro::schema::testCompactSchemas));
return ts;
}