Repository: arrow Updated Branches: refs/heads/master 26140dca8 -> fdbc57941
ARROW-417: Add Equals implementation to compare ChunkedArrays Author: Uwe L. Korn <uw...@xhochy.com> Closes #259 from xhochy/ARROW-417 and squashes the following commits: ffc076a [Uwe L. Korn] Add interface for non-shared_ptr-based Equals 3686d6c [Uwe L. Korn] ARROW-415: C++: Add Equals implementation to compare Tables 54cbf54 [Uwe L. Korn] ARROW-416: C++: Add Equals implementation to compare Columns 21e73a0 [Uwe L. Korn] Make signed comparison explicit 8563cb2 [Uwe L. Korn] ARROW-417: Add Equals implementation to compare ChunkedArrays Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fdbc5794 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fdbc5794 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fdbc5794 Branch: refs/heads/master Commit: fdbc57941fd3615c71b3a61b409b63eb6a48a817 Parents: 26140dc Author: Uwe L. Korn <uw...@xhochy.com> Authored: Tue Jan 3 07:23:17 2017 -0500 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Tue Jan 3 07:23:17 2017 -0500 ---------------------------------------------------------------------- cpp/src/arrow/column-test.cc | 121 ++++++++++++++++++++++++++++++++++++-- cpp/src/arrow/column.cc | 51 ++++++++++++++++ cpp/src/arrow/column.h | 7 +++ cpp/src/arrow/table-test.cc | 44 ++++++++++---- cpp/src/arrow/table.cc | 17 ++++++ cpp/src/arrow/table.h | 3 + cpp/src/arrow/test-util.h | 2 +- 7 files changed, 228 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/column-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc index 9005245..1e722ed 100644 --- a/cpp/src/arrow/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -33,12 +33,92 @@ using std::vector; namespace arrow { -const auto INT32 = std::make_shared<Int32Type>(); +class TestChunkedArray : public TestBase { + protected: + virtual void Construct() { + one_ = std::make_shared<ChunkedArray>(arrays_one_); + another_ = std::make_shared<ChunkedArray>(arrays_another_); + } + + ArrayVector arrays_one_; + ArrayVector arrays_another_; + + std::shared_ptr<ChunkedArray> one_; + std::shared_ptr<ChunkedArray> another_; +}; + +TEST_F(TestChunkedArray, BasicEquals) { + std::vector<bool> null_bitmap(100, true); + std::vector<int32_t> data(100, 1); + std::shared_ptr<Array> array; + ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array); + arrays_one_.push_back(array); + arrays_another_.push_back(array); + + Construct(); + ASSERT_TRUE(one_->Equals(one_)); + ASSERT_FALSE(one_->Equals(nullptr)); + ASSERT_TRUE(one_->Equals(another_)); + ASSERT_TRUE(one_->Equals(*another_.get())); +} + +TEST_F(TestChunkedArray, EqualsDifferingTypes) { + std::vector<bool> null_bitmap(100, true); + std::vector<int32_t> data32(100, 1); + std::vector<int64_t> data64(100, 1); + std::shared_ptr<Array> array; + ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data32, &array); + arrays_one_.push_back(array); + ArrayFromVector<Int64Type, int64_t>(int64(), null_bitmap, data64, &array); + arrays_another_.push_back(array); + + Construct(); + ASSERT_FALSE(one_->Equals(another_)); + ASSERT_FALSE(one_->Equals(*another_.get())); +} + +TEST_F(TestChunkedArray, EqualsDifferingLengths) { + std::vector<bool> null_bitmap100(100, true); + std::vector<bool> null_bitmap101(101, true); + std::vector<int32_t> data100(100, 1); + std::vector<int32_t> data101(101, 1); + std::shared_ptr<Array> array; + ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap100, data100, &array); + arrays_one_.push_back(array); + ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap101, data101, &array); + arrays_another_.push_back(array); + + Construct(); + ASSERT_FALSE(one_->Equals(another_)); + ASSERT_FALSE(one_->Equals(*another_.get())); + + std::vector<bool> null_bitmap1(1, true); + std::vector<int32_t> data1(1, 1); + ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap1, data1, &array); + arrays_one_.push_back(array); -class TestColumn : public TestBase { + Construct(); + ASSERT_TRUE(one_->Equals(another_)); + ASSERT_TRUE(one_->Equals(*another_.get())); +} + +class TestColumn : public TestChunkedArray { protected: + void Construct() override { + TestChunkedArray::Construct(); + + one_col_ = std::make_shared<Column>(one_field_, one_); + another_col_ = std::make_shared<Column>(another_field_, another_); + } + std::shared_ptr<ChunkedArray> data_; std::unique_ptr<Column> column_; + + std::shared_ptr<Field> one_field_; + std::shared_ptr<Field> another_field_; + + std::shared_ptr<Column> one_col_; + std::shared_ptr<Column> another_col_; }; TEST_F(TestColumn, BasicAPI) { @@ -47,11 +127,11 @@ TEST_F(TestColumn, BasicAPI) { arrays.push_back(MakePrimitive<Int32Array>(100, 10)); arrays.push_back(MakePrimitive<Int32Array>(100, 20)); - auto field = std::make_shared<Field>("c0", INT32); + auto field = std::make_shared<Field>("c0", int32()); column_.reset(new Column(field, arrays)); ASSERT_EQ("c0", column_->name()); - ASSERT_TRUE(column_->type()->Equals(INT32)); + ASSERT_TRUE(column_->type()->Equals(int32())); ASSERT_EQ(300, column_->length()); ASSERT_EQ(30, column_->null_count()); ASSERT_EQ(3, column_->data()->num_chunks()); @@ -62,7 +142,7 @@ TEST_F(TestColumn, ChunksInhomogeneous) { arrays.push_back(MakePrimitive<Int32Array>(100)); arrays.push_back(MakePrimitive<Int32Array>(100, 10)); - auto field = std::make_shared<Field>("c0", INT32); + auto field = std::make_shared<Field>("c0", int32()); column_.reset(new Column(field, arrays)); ASSERT_OK(column_->ValidateData()); @@ -72,4 +152,35 @@ TEST_F(TestColumn, ChunksInhomogeneous) { ASSERT_RAISES(Invalid, column_->ValidateData()); } +TEST_F(TestColumn, Equals) { + std::vector<bool> null_bitmap(100, true); + std::vector<int32_t> data(100, 1); + std::shared_ptr<Array> array; + ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array); + arrays_one_.push_back(array); + arrays_another_.push_back(array); + + one_field_ = std::make_shared<Field>("column", int32()); + another_field_ = std::make_shared<Field>("column", int32()); + + Construct(); + ASSERT_TRUE(one_col_->Equals(one_col_)); + ASSERT_FALSE(one_col_->Equals(nullptr)); + ASSERT_TRUE(one_col_->Equals(another_col_)); + ASSERT_TRUE(one_col_->Equals(*another_col_.get())); + + // Field is different + another_field_ = std::make_shared<Field>("two", int32()); + Construct(); + ASSERT_FALSE(one_col_->Equals(another_col_)); + ASSERT_FALSE(one_col_->Equals(*another_col_.get())); + + // ChunkedArray is different + another_field_ = std::make_shared<Field>("column", int32()); + arrays_another_.push_back(array); + Construct(); + ASSERT_FALSE(one_col_->Equals(another_col_)); + ASSERT_FALSE(one_col_->Equals(*another_col_.get())); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/column.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/column.cc b/cpp/src/arrow/column.cc index 1d136e7..3e89956 100644 --- a/cpp/src/arrow/column.cc +++ b/cpp/src/arrow/column.cc @@ -35,6 +35,45 @@ ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { } } +bool ChunkedArray::Equals(const ChunkedArray& other) const { + if (length_ != other.length()) { return false; } + if (null_count_ != other.null_count()) { return false; } + + // Check contents of the underlying arrays. This checks for equality of + // the underlying data independently of the chunk size. + int this_chunk_idx = 0; + int32_t this_start_idx = 0; + int other_chunk_idx = 0; + int32_t other_start_idx = 0; + while (this_chunk_idx < static_cast<int32_t>(chunks_.size())) { + const std::shared_ptr<Array> this_array = chunks_[this_chunk_idx]; + const std::shared_ptr<Array> other_array = other.chunk(other_chunk_idx); + int32_t common_length = std::min( + this_array->length() - this_start_idx, other_array->length() - other_start_idx); + if (!this_array->RangeEquals(this_start_idx, this_start_idx + common_length, + other_start_idx, other_array)) { + return false; + } + + // If we have exhausted the current chunk, proceed to the next one individually. + if (this_start_idx + common_length == this_array->length()) { + this_chunk_idx++; + this_start_idx = 0; + } + if (other_start_idx + common_length == other_array->length()) { + other_chunk_idx++; + other_start_idx = 0; + } + } + return true; +} + +bool ChunkedArray::Equals(const std::shared_ptr<ChunkedArray>& other) const { + if (this == other.get()) { return true; } + if (!other) { return false; } + return Equals(*other.get()); +} + Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks) : field_(field) { data_ = std::make_shared<ChunkedArray>(chunks); @@ -49,6 +88,18 @@ Column::Column( const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data) : field_(field), data_(data) {} +bool Column::Equals(const Column& other) const { + if (!field_->Equals(other.field())) { return false; } + return data_->Equals(other.data()); +} + +bool Column::Equals(const std::shared_ptr<Column>& other) const { + if (this == other.get()) { return true; } + if (!other) { return false; } + + return Equals(*other.get()); +} + Status Column::ValidateData() { for (int i = 0; i < data_->num_chunks(); ++i) { std::shared_ptr<DataType> type = data_->chunk(i)->type(); http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/column.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h index 1caafec..f716473 100644 --- a/cpp/src/arrow/column.h +++ b/cpp/src/arrow/column.h @@ -48,6 +48,9 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; } + bool Equals(const ChunkedArray& other) const; + bool Equals(const std::shared_ptr<ChunkedArray>& other) const; + protected: ArrayVector chunks_; int64_t length_; @@ -78,6 +81,10 @@ class ARROW_EXPORT Column { // @returns: the column's data as a chunked logical array std::shared_ptr<ChunkedArray> data() const { return data_; } + + bool Equals(const Column& other) const; + bool Equals(const std::shared_ptr<Column>& other) const; + // Verify that the column's array data is consistent with the passed field's // metadata Status ValidateData(); http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/table-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index f62336d..734b941 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -34,16 +34,12 @@ using std::vector; namespace arrow { -const auto INT16 = std::make_shared<Int16Type>(); -const auto UINT8 = std::make_shared<UInt8Type>(); -const auto INT32 = std::make_shared<Int32Type>(); - class TestTable : public TestBase { public: void MakeExample1(int length) { - auto f0 = std::make_shared<Field>("f0", INT32); - auto f1 = std::make_shared<Field>("f1", UINT8); - auto f2 = std::make_shared<Field>("f2", INT16); + auto f0 = std::make_shared<Field>("f0", int32()); + auto f1 = std::make_shared<Field>("f1", uint8()); + auto f2 = std::make_shared<Field>("f2", int16()); vector<shared_ptr<Field>> fields = {f0, f1, f2}; schema_ = std::make_shared<Schema>(fields); @@ -55,7 +51,7 @@ class TestTable : public TestBase { } protected: - std::unique_ptr<Table> table_; + std::shared_ptr<Table> table_; shared_ptr<Schema> schema_; vector<std::shared_ptr<Column>> columns_; }; @@ -123,14 +119,40 @@ TEST_F(TestTable, InvalidColumns) { ASSERT_RAISES(Invalid, table_->ValidateColumns()); } +TEST_F(TestTable, Equals) { + int length = 100; + MakeExample1(length); + + std::string name = "data"; + table_.reset(new Table(name, schema_, columns_)); + + ASSERT_TRUE(table_->Equals(table_)); + ASSERT_FALSE(table_->Equals(nullptr)); + // Differing name + ASSERT_FALSE(table_->Equals(std::make_shared<Table>("other_name", schema_, columns_))); + // Differing schema + auto f0 = std::make_shared<Field>("f3", int32()); + auto f1 = std::make_shared<Field>("f4", uint8()); + auto f2 = std::make_shared<Field>("f5", int16()); + vector<shared_ptr<Field>> fields = {f0, f1, f2}; + auto other_schema = std::make_shared<Schema>(fields); + ASSERT_FALSE(table_->Equals(std::make_shared<Table>(name, other_schema, columns_))); + // Differing columns + std::vector<std::shared_ptr<Column>> other_columns = { + std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length, 10)), + std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length, 10)), + std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length, 10))}; + ASSERT_FALSE(table_->Equals(std::make_shared<Table>(name, schema_, other_columns))); +} + class TestRecordBatch : public TestBase {}; TEST_F(TestRecordBatch, Equals) { const int length = 10; - auto f0 = std::make_shared<Field>("f0", INT32); - auto f1 = std::make_shared<Field>("f1", UINT8); - auto f2 = std::make_shared<Field>("f2", INT16); + auto f0 = std::make_shared<Field>("f0", int32()); + auto f1 = std::make_shared<Field>("f1", uint8()); + auto f2 = std::make_shared<Field>("f2", int16()); vector<shared_ptr<Field>> fields = {f0, f1, f2}; auto schema = std::make_shared<Schema>(fields); http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/table.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 855d4ec..45f672e 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -77,6 +77,23 @@ Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema, const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows) : name_(name), schema_(schema), columns_(columns), num_rows_(num_rows) {} +bool Table::Equals(const Table& other) const { + if (name_ != other.name()) { return false; } + if (!schema_->Equals(other.schema())) { return false; } + if (static_cast<int64_t>(columns_.size()) != other.num_columns()) { return false; } + + for (size_t i = 0; i < columns_.size(); i++) { + if (!columns_[i]->Equals(other.column(i))) { return false; } + } + return true; +} + +bool Table::Equals(const std::shared_ptr<Table>& other) const { + if (this == other.get()) { return true; } + if (!other) { return false; } + return Equals(*other.get()); +} + Status Table::ValidateColumns() const { if (num_columns() != schema_->num_fields()) { return Status::Invalid("Number of columns did not match schema"); http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/table.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index bf5c39f..0f2418d 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -100,6 +100,9 @@ class ARROW_EXPORT Table { // @returns: the number of rows (the corresponding length of each column) int64_t num_rows() const { return num_rows_; } + bool Equals(const Table& other) const; + bool Equals(const std::shared_ptr<Table>& other) const; + // After construction, perform any checks to validate the input arguments Status ValidateColumns() const; http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/test-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index ce9327d..70e9333 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -81,7 +81,7 @@ class TestBase : public ::testing::Test { auto null_bitmap = std::make_shared<PoolBuffer>(pool_); EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); EXPECT_OK(null_bitmap->Resize(BitUtil::BytesForBits(length))); - return std::make_shared<ArrayType>(length, data, 10, null_bitmap); + return std::make_shared<ArrayType>(length, data, null_count, null_bitmap); } protected: