Repository: parquet-cpp Updated Branches: refs/heads/master c6e069297 -> ebb45b1e7
PARQUET-545: Improve API to support decimal type This PR also 1) Improves scanner coverage. 2) Implements PARQUET-546 3) Adds tests for Types to string Author: Deepak Majeti <[email protected]> Closes #65 from majetideepak/PARQUET-545 and squashes the following commits: df28ccc [Deepak Majeti] removed little redundant code c5f6d85 [Deepak Majeti] improved schema type coverage 6a18c98 [Deepak Majeti] removed ColumnDescriptor Repetition API 6b450ab [Deepak Majeti] modified scanner test to remove redundant code 1eed0de [Deepak Majeti] Added tests for GroupNodes 0f0e559 [Deepak Majeti] added type to string tests cab06d8 [Deepak Majeti] Improve coverage for types test 6ce10ce [Deepak Majeti] Tests for PARQUET-545 a643bf4 [Deepak Majeti] Added PrimitiveNode Tests 656942f [Deepak Majeti] restored some necessary checks addf7f6 [Deepak Majeti] remove unnecessary checks 8a043c0 [Deepak Majeti] modified scanner and column descriptor a346ca5 [Deepak Majeti] PARQUET-546 bdbcdd7 [Deepak Majeti] Extend Column Descriptor to include Decimal scale and precision 938ded5 [Deepak Majeti] Improve Scanner Coverage Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/ebb45b1e Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/ebb45b1e Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/ebb45b1e Branch: refs/heads/master Commit: ebb45b1e7da49e4084602b10aee90dcdf7317768 Parents: c6e0692 Author: Deepak Majeti <[email protected]> Authored: Mon Feb 29 10:04:00 2016 -0800 Committer: Julien Le Dem <[email protected]> Committed: Mon Feb 29 10:04:00 2016 -0800 ---------------------------------------------------------------------- src/parquet/CMakeLists.txt | 1 + src/parquet/column/scanner-test.cc | 91 +++++++++++----- src/parquet/column/scanner.h | 18 ++-- src/parquet/encodings/encoding-test.cc | 4 +- src/parquet/schema/descriptor.cc | 11 +- src/parquet/schema/descriptor.h | 19 ++-- src/parquet/schema/schema-descriptor-test.cc | 6 +- src/parquet/schema/schema-types-test.cc | 69 ++++++++++-- src/parquet/schema/types.cc | 126 +++++++++++++++++++--- src/parquet/schema/types.h | 23 ++-- src/parquet/types-test.cc | 82 ++++++++++++++ src/parquet/types.h | 67 ++++++++++++ 12 files changed, 419 insertions(+), 98 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt index 97547ce..8a9b860 100644 --- a/src/parquet/CMakeLists.txt +++ b/src/parquet/CMakeLists.txt @@ -23,4 +23,5 @@ install(FILES DESTINATION include/parquet) ADD_PARQUET_TEST(public-api-test) +ADD_PARQUET_TEST(types-test) ADD_PARQUET_TEST(reader-test) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/column/scanner-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc index 785db08..fcaf65e 100644 --- a/src/parquet/column/scanner-test.cc +++ b/src/parquet/column/scanner-test.cc @@ -103,7 +103,7 @@ class TestFlatScanner : public ::testing::Test { TypedScanner<Type::type_num>* scanner = reinterpret_cast<TypedScanner<Type::type_num>* >(scanner_.get()); T val; - bool is_null; + bool is_null = false; int16_t def_level; int16_t rep_level; size_t j = 0; @@ -113,15 +113,15 @@ class TestFlatScanner : public ::testing::Test { if (!is_null) { ASSERT_EQ(values_[j++], val) << i <<"V"<< j; } - if (!d->is_required()) { + if (d->max_definition_level() > 0) { ASSERT_EQ(def_levels_[i], def_level) << i <<"D"<< j; } - if (d->is_repeated()) { + if (d->max_repetition_level() > 0) { ASSERT_EQ(rep_levels_[i], rep_level) << i <<"R"<< j; } } ASSERT_EQ(num_values_, j); - ASSERT_FALSE(scanner->HasNext()); + ASSERT_FALSE(scanner->Next(&val, &def_level, &rep_level, &is_null)); } void Clear() { @@ -140,21 +140,25 @@ class TestFlatScanner : public ::testing::Test { } void InitDescriptors(std::shared_ptr<ColumnDescriptor>& d1, - std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>& d3) { + std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>& d3, + int length) { NodePtr type; - type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num); + type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num, + LogicalType::NONE, length); d1.reset(new ColumnDescriptor(type, 0, 0)); - type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num); + type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num, + LogicalType::NONE, length); d2.reset(new ColumnDescriptor(type, 4, 0)); - type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num); + type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num, + LogicalType::NONE, length); d3.reset(new ColumnDescriptor(type, 4, 2)); } - void ExecuteAll(int num_pages, int num_levels, int batch_size) { + void ExecuteAll(int num_pages, int num_levels, int batch_size, int type_length) { std::shared_ptr<ColumnDescriptor> d1; std::shared_ptr<ColumnDescriptor> d2; std::shared_ptr<ColumnDescriptor> d3; - InitDescriptors(d1, d2, d3); + InitDescriptors(d1, d2, d3, type_length); // evaluate REQUIRED pages Execute(num_pages, num_levels, batch_size, d1.get()); // evaluate OPTIONAL pages @@ -203,42 +207,71 @@ void TestFlatScanner<FLBAType>::InitValues() { values_.data()); } -template<> -void TestFlatScanner<FLBAType>::InitDescriptors( - std::shared_ptr<ColumnDescriptor>& d1, std::shared_ptr<ColumnDescriptor>& d2, - std::shared_ptr<ColumnDescriptor>& d3) { - NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED, - FLBA_LENGTH, LogicalType::UTF8); - d1.reset(new ColumnDescriptor(type, 0, 0)); - type = schema::PrimitiveNode::MakeFLBA("c2", Repetition::OPTIONAL, - FLBA_LENGTH, LogicalType::UTF8); - d2.reset(new ColumnDescriptor(type, 4, 0)); - type = schema::PrimitiveNode::MakeFLBA("c3", Repetition::REPEATED, - FLBA_LENGTH, LogicalType::UTF8); - d3.reset(new ColumnDescriptor(type, 4, 2)); -} - typedef TestFlatScanner<FLBAType> TestFlatFLBAScanner; static int num_levels_per_page = 100; static int num_pages = 20; static int batch_size = 32; -TYPED_TEST_CASE(TestFlatScanner, ParquetTypes); +typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, + FloatType, DoubleType, ByteArrayType> TestTypes; + +typedef TestFlatScanner<FLBAType> TestFLBAFlatScanner; + +TYPED_TEST_CASE(TestFlatScanner, TestTypes); TYPED_TEST(TestFlatScanner, TestScanner) { - this->ExecuteAll(num_pages, num_levels_per_page, batch_size); + this->ExecuteAll(num_pages, num_levels_per_page, batch_size, 0); +} + +TEST_F(TestFLBAFlatScanner, TestScanner) { + this->ExecuteAll(num_pages, num_levels_per_page, batch_size, FLBA_LENGTH); } //PARQUET 502 TEST_F(TestFlatFLBAScanner, TestSmallBatch) { - NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED, - FLBA_LENGTH, LogicalType::UTF8); + NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); const ColumnDescriptor d(type, 0, 0); MakePages(&d, 1, 100); InitScanner(&d); CheckResults(1, &d); } +TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) { + NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); + const ColumnDescriptor d(type, 4, 0); + MakePages(&d, 1, 100); + InitScanner(&d); + TypedScanner<FLBAType::type_num>* scanner = + reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get()); + ASSERT_EQ(10, scanner->descr()->type_precision()); + ASSERT_EQ(2, scanner->descr()->type_scale()); + ASSERT_EQ(FLBA_LENGTH, scanner->descr()->type_length()); +} + +TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) { + NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); + const ColumnDescriptor d(type, 4, 0); + MakePages(&d, 1, 100); + InitScanner(&d); + TypedScanner<FLBAType::type_num>* scanner = + reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get()); + size_t j = 0; + scanner->SetBatchSize(batch_size); + std::stringstream ss_fail; + for (size_t i = 0; i < num_levels_; i++) { + std::stringstream ss; + scanner->PrintNext(ss, 17); + std::string result = ss.str(); + ASSERT_LE(17, result.size()) << i; + } + ASSERT_THROW(scanner->PrintNext(ss_fail, 17), ParquetException); +} + +//Test for GroupNode + } // namespace test } // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/column/scanner.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h index f3f5719..8569a94 100644 --- a/src/parquet/column/scanner.h +++ b/src/parquet/column/scanner.h @@ -45,8 +45,8 @@ class Scanner { values_buffered_(0), reader_(reader) { // TODO: don't allocate for required fields - def_levels_.resize(reader->descr()->is_optional() ? batch_size_ : 0); - rep_levels_.resize(reader->descr()->is_repeated() ? batch_size_ : 0); + def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0); + rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0); } virtual ~Scanner() {} @@ -114,10 +114,8 @@ class TypedScanner : public Scanner { return false; } } - *def_level = descr()->is_optional() ? - def_levels_[level_offset_] : descr()->max_definition_level(); - *rep_level = descr()->is_repeated() ? - rep_levels_[level_offset_] : descr()->max_repetition_level(); + *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0; + *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0; level_offset_++; return true; } @@ -172,10 +170,12 @@ class TypedScanner : public Scanner { virtual void PrintNext(std::ostream& out, int width) { T val; - bool is_null; - + bool is_null = false; char buffer[25]; - NextValue(&val, &is_null); + + if (!NextValue(&val, &is_null)) { + throw ParquetException("No more values buffered"); + } if (is_null) { std::string null_fmt = format_fwf<Type::BYTE_ARRAY>(width); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/encodings/encoding-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/encoding-test.cc b/src/parquet/encodings/encoding-test.cc index 10310ed..f060f96 100644 --- a/src/parquet/encodings/encoding-test.cc +++ b/src/parquet/encodings/encoding-test.cc @@ -134,8 +134,8 @@ std::shared_ptr<ColumnDescriptor> ExampleDescr() { template <> std::shared_ptr<ColumnDescriptor> ExampleDescr<FLBA>() { - auto node = schema::PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL, - flba_length, LogicalType::UTF8); + auto node = schema::PrimitiveNode::Make("name", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, flba_length, 10, 2); return std::make_shared<ColumnDescriptor>(node, 0, 0); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/descriptor.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc index 02b5060..b3fefee 100644 --- a/src/parquet/schema/descriptor.cc +++ b/src/parquet/schema/descriptor.cc @@ -84,10 +84,15 @@ const ColumnDescriptor* SchemaDescriptor::Column(size_t i) const { return &leaves_[i]; } +int ColumnDescriptor::type_scale() const { + return primitive_node_->decimal_metadata().scale; +} + +int ColumnDescriptor::type_precision() const { + return primitive_node_->decimal_metadata().precision; +} + int ColumnDescriptor::type_length() const { - if (primitive_node_->physical_type() != Type::FIXED_LEN_BYTE_ARRAY) { - throw ParquetException("Not a FIXED_LEN_BYTE_ARRAY"); - } return primitive_node_->type_length(); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/descriptor.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h index 62066ef..4c6f50d 100644 --- a/src/parquet/schema/descriptor.h +++ b/src/parquet/schema/descriptor.h @@ -24,7 +24,6 @@ #include <string> #include <unordered_map> #include <vector> - #include "parquet/schema/types.h" #include "parquet/types.h" @@ -54,23 +53,19 @@ class ColumnDescriptor { return primitive_node_->physical_type(); } - const std::string& name() const { - return primitive_node_->name(); + LogicalType::type logical_type() const { + return primitive_node_->logical_type(); } - bool is_required() const { - return max_definition_level_ == 0; + const std::string& name() const { + return primitive_node_->name(); } - bool is_optional() const { - return max_definition_level_ > 0; - } + int type_length() const; - bool is_repeated() const { - return max_repetition_level_ > 0; - } + int type_precision() const; - int type_length() const; + int type_scale() const; private: schema::NodePtr node_; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/schema-descriptor-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc index 3b1734b..519d968 100644 --- a/src/parquet/schema/schema-descriptor-test.cc +++ b/src/parquet/schema/schema-descriptor-test.cc @@ -46,11 +46,11 @@ TEST(TestColumnDescriptor, TestAttrs) { ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type()); - ASSERT_THROW(descr.type_length(), ParquetException); + ASSERT_EQ(-1, descr.type_length()); // Test FIXED_LEN_BYTE_ARRAY - node = PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL, - 12, LogicalType::UTF8); + node = PrimitiveNode::Make("name", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 12, 10, 4); descr = ColumnDescriptor(node, 4, 1); ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type()); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/schema-types-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc index cac7dc5..f512f0f 100644 --- a/src/parquet/schema/schema-types-test.cc +++ b/src/parquet/schema/schema-types-test.cc @@ -21,6 +21,7 @@ #include <string> #include <vector> +#include "parquet/exception.h" #include "parquet/schema/test-util.h" #include "parquet/schema/types.h" #include "parquet/thrift/parquet_types.h" @@ -130,15 +131,15 @@ TEST_F(TestPrimitiveNode, FromParquet) { parquet::Type::FIXED_LEN_BYTE_ARRAY); elt.__set_converted_type(ConvertedType::DECIMAL); elt.__set_type_length(6); - elt.__set_scale(12); - elt.__set_precision(2); + elt.__set_scale(2); + elt.__set_precision(12); Convert(&elt); ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type()); ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type()); ASSERT_EQ(6, prim_node_->type_length()); - ASSERT_EQ(12, prim_node_->decimal_metadata().scale); - ASSERT_EQ(2, prim_node_->decimal_metadata().precision); + ASSERT_EQ(2, prim_node_->decimal_metadata().scale); + ASSERT_EQ(12, prim_node_->decimal_metadata().precision); } TEST_F(TestPrimitiveNode, Equals) { @@ -154,17 +155,66 @@ TEST_F(TestPrimitiveNode, Equals) { ASSERT_FALSE(node1.Equals(&node4)); ASSERT_TRUE(node1.Equals(&node5)); - PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY); - flba1.SetTypeLength(12); + PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, 12, 4, 2); - PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY); + PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, 1, 4, 2); flba2.SetTypeLength(12); - PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY); + PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, 1, 4, 2); flba3.SetTypeLength(16); + PrimitiveNode flba4("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, 12, 4, 0); + + PrimitiveNode flba5("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, 12, 4, 0); + ASSERT_TRUE(flba1.Equals(&flba2)); ASSERT_FALSE(flba1.Equals(&flba3)); + ASSERT_FALSE(flba1.Equals(&flba4)); + ASSERT_FALSE(flba1.Equals(&flba5)); +} + +TEST_F(TestPrimitiveNode, PhysicalLogicalMapping) { + ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::INT32, LogicalType::INT_32)); + ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::BYTE_ARRAY, LogicalType::JSON)); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::INT32, LogicalType::JSON), ParquetException); + ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::INT64, LogicalType::TIMESTAMP_MILLIS)); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::INT32, LogicalType::INT_64), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::BYTE_ARRAY, LogicalType::INT_8), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::BYTE_ARRAY, LogicalType::INTERVAL), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM), ParquetException); + ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::BYTE_ARRAY, LogicalType::ENUM)); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 2, 4), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FLOAT, LogicalType::DECIMAL, 0, 2, 4), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 4, 0), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 0, 4), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 4, -1), ParquetException); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 2, 4), ParquetException); + ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 6, 4)); + ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 12)); + ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10), ParquetException); } // ---------------------------------------------------------------------- @@ -220,11 +270,14 @@ TEST_F(TestGroupNode, Equals) { // This is copied in the GroupNode ctor, so this is okay f2.push_back(Float("four", Repetition::OPTIONAL)); GroupNode group4("group", Repetition::REPEATED, f2); + GroupNode group5("group", Repetition::REPEATED, Fields1()); + ASSERT_TRUE(group1.Equals(&group1)); ASSERT_TRUE(group1.Equals(&group2)); ASSERT_FALSE(group1.Equals(&group3)); ASSERT_FALSE(group1.Equals(&group4)); + ASSERT_FALSE(group5.Equals(&group4)); } } // namespace schema http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/types.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc index fae7c84..b57fd37 100644 --- a/src/parquet/schema/types.cc +++ b/src/parquet/schema/types.cc @@ -40,17 +40,117 @@ bool Node::EqualsInternal(const Node* other) const { // ---------------------------------------------------------------------- // Primitive node +PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition, + Type::type type, LogicalType::type logical_type, + int length, int precision, int scale, int id) : + Node(Node::PRIMITIVE, name, repetition, logical_type, id), + physical_type_(type), type_length_(length) { + std::stringstream ss; + // Check if the physical and logical types match + // Mapping referred from Apache parquet-mr as on 2016-02-22 + switch (logical_type) { + case LogicalType::NONE: + // Logical type not set + // Clients should be able to read these values + decimal_metadata_.precision = precision; + decimal_metadata_.scale = scale; + break; + case LogicalType::UTF8: + case LogicalType::JSON: + case LogicalType::BSON: + if (type != Type::BYTE_ARRAY) { + ss << logical_type_to_string(logical_type); + ss << " can only annotate BYTE_ARRAY fields"; + throw ParquetException(ss.str()); + } + break; + case LogicalType::DECIMAL: + if ((type != Type::INT32) && + (type != Type::INT64) && + (type != Type::BYTE_ARRAY) && + (type != Type::FIXED_LEN_BYTE_ARRAY)) { + ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED"; + throw ParquetException(ss.str()); + } + if (precision <= 0) { + ss << "Invalid DECIMAL precision: " << precision; + throw ParquetException(ss.str()); + } + if (scale < 0) { + ss << "Invalid DECIMAL scale: " << scale; + throw ParquetException(ss.str()); + } + if (scale > precision) { + ss << "Invalid DECIMAL scale " << scale; + ss << " cannot be greater than precision " << precision; + throw ParquetException(ss.str()); + } + decimal_metadata_.precision = precision; + decimal_metadata_.scale = scale; + break; + case LogicalType::DATE: + case LogicalType::TIME_MILLIS: + case LogicalType::UINT_8: + case LogicalType::UINT_16: + case LogicalType::UINT_32: + case LogicalType::INT_8: + case LogicalType::INT_16: + case LogicalType::INT_32: + if (type != Type::INT32) { + ss << logical_type_to_string(logical_type); + ss << " can only annotate INT32"; + throw ParquetException(ss.str()); + } + break; + case LogicalType::TIMESTAMP_MILLIS: + case LogicalType::UINT_64: + case LogicalType::INT_64: + if (type != Type::INT64) { + ss << logical_type_to_string(logical_type); + ss << " can only annotate INT64"; + throw ParquetException(ss.str()); + } + break; + case LogicalType::INTERVAL: + if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) { + ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)"; + throw ParquetException(ss.str()); + } + break; + case LogicalType::ENUM: + if (type != Type::BYTE_ARRAY) { + ss << "ENUM can only annotate BYTE_ARRAY fields"; + throw ParquetException(ss.str()); + } + break; + default: + ss << logical_type_to_string(logical_type); + ss << " can not be applied to a primitive type"; + throw ParquetException(ss.str()); + } + if (type == Type::FIXED_LEN_BYTE_ARRAY) { + if (length <= 0) { + ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length; + throw ParquetException(ss.str()); + } + type_length_ = length; + } +} + bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const { - if (physical_type_ != other->physical_type_) { - return false; - } else if (logical_type_ == LogicalType::DECIMAL) { - // TODO(wesm): metadata - ParquetException::NYI("comparing decimals"); + bool is_equal = true; + if ((physical_type_ != other->physical_type_) || + (logical_type_ != other->logical_type_)) { return false; - } else if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { - return type_length_ == other->type_length_; } - return true; + if (logical_type_ == LogicalType::DECIMAL) { + is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) && + (decimal_metadata_.scale == other->decimal_metadata_.scale); + } + if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { + is_equal &= (type_length_ == other->type_length_); + } + return is_equal; } bool PrimitiveNode::Equals(const Node* other) const { @@ -134,14 +234,8 @@ std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element, std::unique_ptr<PrimitiveNode> result = std::unique_ptr<PrimitiveNode>( new PrimitiveNode(params.name, params.repetition, - FromThrift(element->type), params.logical_type, node_id)); - - if (element->type == parquet::Type::FIXED_LEN_BYTE_ARRAY) { - result->SetTypeLength(element->type_length); - if (params.logical_type == LogicalType::DECIMAL) { - result->SetDecimalMetadata(element->scale, element->precision); - } - } + FromThrift(element->type), params.logical_type, + element->type_length, element->precision, element->scale, node_id)); // Return as unique_ptr to the base type return std::unique_ptr<Node>(result.release()); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/types.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h index e76323f..2eaee8b 100644 --- a/src/parquet/schema/types.h +++ b/src/parquet/schema/types.h @@ -177,17 +177,10 @@ class PrimitiveNode : public Node { static inline NodePtr Make(const std::string& name, Repetition::type repetition, Type::type type, - LogicalType::type logical_type = LogicalType::NONE) { - return NodePtr(new PrimitiveNode(name, repetition, type, logical_type)); - } - - // Alternate constructor for FIXED_LEN_BYTE_ARRAY (FLBA) - static inline NodePtr MakeFLBA(const std::string& name, - Repetition::type repetition, int32_t type_length, - LogicalType::type logical_type = LogicalType::NONE) { - NodePtr result = Make(name, repetition, Type::FIXED_LEN_BYTE_ARRAY, logical_type); - static_cast<PrimitiveNode*>(result.get())->SetTypeLength(type_length); - return result; + LogicalType::type logical_type = LogicalType::NONE, + int length = -1, int precision = -1, int scale = -1) { + return NodePtr(new PrimitiveNode(name, repetition, type, logical_type, + length, precision, scale)); } virtual bool Equals(const Node* other) const; @@ -208,11 +201,8 @@ class PrimitiveNode : public Node { private: PrimitiveNode(const std::string& name, Repetition::type repetition, - Type::type type, - LogicalType::type logical_type = LogicalType::NONE, - int id = -1) : - Node(Node::PRIMITIVE, name, repetition, logical_type, id), - physical_type_(type) {} + Type::type type, LogicalType::type logical_type = LogicalType::NONE, + int length = -1, int precision = -1, int scale = -1, int id = -1); Type::type physical_type_; int32_t type_length_; @@ -234,6 +224,7 @@ class PrimitiveNode : public Node { FRIEND_TEST(TestPrimitiveNode, Attrs); FRIEND_TEST(TestPrimitiveNode, Equals); + FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping); FRIEND_TEST(TestPrimitiveNode, FromParquet); }; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/types-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc new file mode 100644 index 0000000..b1bfacd --- /dev/null +++ b/src/parquet/types-test.cc @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <string> + +#include "parquet/types.h" + +namespace parquet_cpp { + +TEST(TestTypeToString, PhysicalTypes) { + ASSERT_STREQ("BOOLEAN", type_to_string(Type::BOOLEAN).c_str()); + ASSERT_STREQ("INT32", type_to_string(Type::INT32).c_str()); + ASSERT_STREQ("INT64", type_to_string(Type::INT64).c_str()); + ASSERT_STREQ("INT96", type_to_string(Type::INT96).c_str()); + ASSERT_STREQ("FLOAT", type_to_string(Type::FLOAT).c_str()); + ASSERT_STREQ("DOUBLE", type_to_string(Type::DOUBLE).c_str()); + ASSERT_STREQ("BYTE_ARRAY", type_to_string(Type::BYTE_ARRAY).c_str()); + ASSERT_STREQ("FIXED_LEN_BYTE_ARRAY", + type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str()); +} + +TEST(TestLogicalTypeToString, LogicalTypes) { + ASSERT_STREQ("NONE", + logical_type_to_string(LogicalType::NONE).c_str()); + ASSERT_STREQ("UTF8", + logical_type_to_string(LogicalType::UTF8).c_str()); + ASSERT_STREQ("MAP_KEY_VALUE", + logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str()); + ASSERT_STREQ("LIST", + logical_type_to_string(LogicalType::LIST).c_str()); + ASSERT_STREQ("ENUM", + logical_type_to_string(LogicalType::ENUM).c_str()); + ASSERT_STREQ("DECIMAL", + logical_type_to_string(LogicalType::DECIMAL).c_str()); + ASSERT_STREQ("DATE", + logical_type_to_string(LogicalType::DATE).c_str()); + ASSERT_STREQ("TIME_MILLIS", + logical_type_to_string(LogicalType::TIME_MILLIS).c_str()); + ASSERT_STREQ("TIMESTAMP_MILLIS", + logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str()); + ASSERT_STREQ("UINT_8", + logical_type_to_string(LogicalType::UINT_8).c_str()); + ASSERT_STREQ("UINT_16", + logical_type_to_string(LogicalType::UINT_16).c_str()); + ASSERT_STREQ("UINT_32", + logical_type_to_string(LogicalType::UINT_32).c_str()); + ASSERT_STREQ("UINT_64", + logical_type_to_string(LogicalType::UINT_64).c_str()); + ASSERT_STREQ("INT_8", + logical_type_to_string(LogicalType::INT_8).c_str()); + ASSERT_STREQ("INT_16", + logical_type_to_string(LogicalType::INT_16).c_str()); + ASSERT_STREQ("INT_32", + logical_type_to_string(LogicalType::INT_32).c_str()); + ASSERT_STREQ("INT_64", + logical_type_to_string(LogicalType::INT_64).c_str()); + ASSERT_STREQ("JSON", + logical_type_to_string(LogicalType::JSON).c_str()); + ASSERT_STREQ("BSON", + logical_type_to_string(LogicalType::BSON).c_str()); + ASSERT_STREQ("INTERVAL", + logical_type_to_string(LogicalType::INTERVAL).c_str()); +} + + +} // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/types.h ---------------------------------------------------------------------- diff --git a/src/parquet/types.h b/src/parquet/types.h index f59f6a9..e29f11c 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -316,6 +316,73 @@ static inline std::string type_to_string(Type::type t) { } } +static inline std::string logical_type_to_string(LogicalType::type t) { + switch (t) { + case LogicalType::NONE: + return "NONE"; + break; + case LogicalType::UTF8: + return "UTF8"; + break; + case LogicalType::MAP_KEY_VALUE: + return "MAP_KEY_VALUE"; + break; + case LogicalType::LIST: + return "LIST"; + break; + case LogicalType::ENUM: + return "ENUM"; + break; + case LogicalType::DECIMAL: + return "DECIMAL"; + break; + case LogicalType::DATE: + return "DATE"; + break; + case LogicalType::TIME_MILLIS: + return "TIME_MILLIS"; + break; + case LogicalType::TIMESTAMP_MILLIS: + return "TIMESTAMP_MILLIS"; + break; + case LogicalType::UINT_8: + return "UINT_8"; + break; + case LogicalType::UINT_16: + return "UINT_16"; + break; + case LogicalType::UINT_32: + return "UINT_32"; + break; + case LogicalType::UINT_64: + return "UINT_64"; + break; + case LogicalType::INT_8: + return "INT_8"; + break; + case LogicalType::INT_16: + return "INT_16"; + break; + case LogicalType::INT_32: + return "INT_32"; + break; + case LogicalType::INT_64: + return "INT_64"; + break; + case LogicalType::JSON: + return "JSON"; + break; + case LogicalType::BSON: + return "BSON"; + break; + case LogicalType::INTERVAL: + return "INTERVAL"; + break; + default: + return "UNKNOWN"; + break; + } +} } // namespace parquet_cpp #endif // PARQUET_TYPES_H
