This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 35cd7ae ARROW-3474: [GLib] Extend gparquet API with get_schema and
read_column
35cd7ae is described below
commit 35cd7ae7ff15beea8138417c8c7f54708fd2f923
Author: Kouhei Sutou <[email protected]>
AuthorDate: Thu Oct 11 11:40:08 2018 +0900
ARROW-3474: [GLib] Extend gparquet API with get_schema and read_column
So that we can read individual columns without loading the whole Parquet
file into memory, we need to expose the GetSchema and ReadColumn functions of
parquet-cpp in the Parquet GLib API.
Originally submitted
[here](https://github.com/red-data-tools/parquet-glib/pull/2) before the
inclusion of parquet-glib to Arrow.
cc @kou
Author: Kouhei Sutou <[email protected]>
Author: Benoit Rostykus <[email protected]>
Closes #2736 from rostyboost/gparquet_augment_api and squashes the
following commits:
2c8adeb2 <Kouhei Sutou> Split test file
7b3e9726 <Kouhei Sutou> Add gparquet_arrow_file_reader_select_schema()
3006dce3 <Kouhei Sutou> Read schema internally
fe1fcb6b <Kouhei Sutou> Stop to create intermediate GArrow objects
05615dae <Kouhei Sutou> Use auto
d2a344e5 <Kouhei Sutou> Follow name change
55d866a4 <Kouhei Sutou> GArrowColumn: add missing reference increment
d9d329a3 <Benoit Rostykus> Space
53572a24 <Benoit Rostykus> Space
381db521 <Benoit Rostykus> Space
e7ef71f2 <Benoit Rostykus> Nits
b216d836 <Benoit Rostykus> Address PR comments, fix build and add test
b60dd9ba <Benoit Rostykus> Expose getSchema and ReadColumn functions
---
c_glib/arrow-glib/column.cpp | 129 +++++++++++++++++----
c_glib/parquet-glib/arrow-file-reader.cpp | 110 ++++++++++++++++++
c_glib/parquet-glib/arrow-file-reader.h | 14 +++
c_glib/test/parquet/test-arrow-file-reader.rb | 76 ++++++++++++
.../{test-arrow.rb => test-arrow-file-writer.rb} | 32 ++---
5 files changed, 318 insertions(+), 43 deletions(-)
diff --git a/c_glib/arrow-glib/column.cpp b/c_glib/arrow-glib/column.cpp
index 55d06ea..22c7ab8 100644
--- a/c_glib/arrow-glib/column.cpp
+++ b/c_glib/arrow-glib/column.cpp
@@ -39,49 +39,84 @@ G_BEGIN_DECLS
typedef struct GArrowColumnPrivate_ {
std::shared_ptr<arrow::Column> column;
+ GArrowField *field;
+ GArrowArray *array;
+ GArrowChunkedArray *chunked_array;
} GArrowColumnPrivate;
enum {
PROP_0,
- PROP_COLUMN
+ PROP_COLUMN,
+ PROP_FIELD,
+ PROP_ARRAY,
+ PROP_CHUNKED_ARRAY
};
G_DEFINE_TYPE_WITH_PRIVATE(GArrowColumn,
garrow_column,
G_TYPE_OBJECT)
-#define GARROW_COLUMN_GET_PRIVATE(obj) \
- (G_TYPE_INSTANCE_GET_PRIVATE((obj), \
- GARROW_TYPE_COLUMN, \
- GArrowColumnPrivate))
+#define GARROW_COLUMN_GET_PRIVATE(object) \
+ static_cast<GArrowColumnPrivate *>( \
+ garrow_column_get_instance_private( \
+ GARROW_COLUMN(object)))
static void
garrow_column_dispose(GObject *object)
{
- GArrowColumnPrivate *priv;
+ auto priv = GARROW_COLUMN_GET_PRIVATE(object);
- priv = GARROW_COLUMN_GET_PRIVATE(object);
+ if (priv->field) {
+ g_object_unref(priv->field);
+ priv->field = nullptr;
+ }
- priv->column = nullptr;
+ if (priv->array) {
+ g_object_unref(priv->array);
+ priv->array = nullptr;
+ }
+
+ if (priv->chunked_array) {
+ g_object_unref(priv->chunked_array);
+ priv->chunked_array = nullptr;
+ }
G_OBJECT_CLASS(garrow_column_parent_class)->dispose(object);
}
static void
+garrow_column_finalize(GObject *object)
+{
+ auto priv = GARROW_COLUMN_GET_PRIVATE(object);
+
+ priv->column = nullptr;
+
+ G_OBJECT_CLASS(garrow_column_parent_class)->finalize(object);
+}
+
+static void
garrow_column_set_property(GObject *object,
guint prop_id,
const GValue *value,
GParamSpec *pspec)
{
- GArrowColumnPrivate *priv;
-
- priv = GARROW_COLUMN_GET_PRIVATE(object);
+ auto priv = GARROW_COLUMN_GET_PRIVATE(object);
switch (prop_id) {
case PROP_COLUMN:
priv->column =
*static_cast<std::shared_ptr<arrow::Column>
*>(g_value_get_pointer(value));
break;
+ case PROP_FIELD:
+ priv->field = static_cast<GArrowField *>(g_value_dup_object(value));
+ break;
+ case PROP_ARRAY:
+ priv->array = static_cast<GArrowArray *>(g_value_dup_object(value));
+ break;
+ case PROP_CHUNKED_ARRAY:
+ priv->chunked_array =
+ static_cast<GArrowChunkedArray *>(g_value_dup_object(value));
+ break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
@@ -94,7 +129,18 @@ garrow_column_get_property(GObject *object,
GValue *value,
GParamSpec *pspec)
{
+ auto priv = GARROW_COLUMN_GET_PRIVATE(object);
+
switch (prop_id) {
+ case PROP_FIELD:
+ g_value_set_object(value, priv->field);
+ break;
+ case PROP_ARRAY:
+ g_value_set_object(value, priv->array);
+ break;
+ case PROP_CHUNKED_ARRAY:
+ g_value_set_object(value, priv->chunked_array);
+ break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
@@ -109,21 +155,44 @@ garrow_column_init(GArrowColumn *object)
static void
garrow_column_class_init(GArrowColumnClass *klass)
{
- GObjectClass *gobject_class;
- GParamSpec *spec;
-
- gobject_class = G_OBJECT_CLASS(klass);
+ auto gobject_class = G_OBJECT_CLASS(klass);
gobject_class->dispose = garrow_column_dispose;
+ gobject_class->finalize = garrow_column_finalize;
gobject_class->set_property = garrow_column_set_property;
gobject_class->get_property = garrow_column_get_property;
+ GParamSpec *spec;
spec = g_param_spec_pointer("column",
"Column",
"The raw std::shared<arrow::Column> *",
static_cast<GParamFlags>(G_PARAM_WRITABLE |
G_PARAM_CONSTRUCT_ONLY));
g_object_class_install_property(gobject_class, PROP_COLUMN, spec);
+
+ spec = g_param_spec_object("field",
+ "Field",
+ "The field of the column",
+ GARROW_TYPE_FIELD,
+ static_cast<GParamFlags>(G_PARAM_READWRITE |
+ G_PARAM_CONSTRUCT_ONLY));
+ g_object_class_install_property(gobject_class, PROP_FIELD, spec);
+
+ spec = g_param_spec_object("array",
+ "Array",
+ "The array of the column",
+ GARROW_TYPE_ARRAY,
+ static_cast<GParamFlags>(G_PARAM_READWRITE |
+ G_PARAM_CONSTRUCT_ONLY));
+ g_object_class_install_property(gobject_class, PROP_ARRAY, spec);
+
+ spec = g_param_spec_object("chunked-array",
+ "Chunked array",
+ "The chunked array of the column",
+ GARROW_TYPE_CHUNKED_ARRAY,
+ static_cast<GParamFlags>(G_PARAM_READWRITE |
+ G_PARAM_CONSTRUCT_ONLY));
+ g_object_class_install_property(gobject_class, PROP_CHUNKED_ARRAY, spec);
}
/**
@@ -140,7 +209,12 @@ garrow_column_new_array(GArrowField *field,
auto arrow_column =
std::make_shared<arrow::Column>(garrow_field_get_raw(field),
garrow_array_get_raw(array));
- return garrow_column_new_raw(&arrow_column);
+ auto column = GARROW_COLUMN(g_object_new(GARROW_TYPE_COLUMN,
+ "column", &arrow_column,
+ "field", field,
+ "array", array,
+ NULL));
+ return column;
}
/**
@@ -157,7 +231,12 @@ garrow_column_new_chunked_array(GArrowField *field,
auto arrow_column =
std::make_shared<arrow::Column>(garrow_field_get_raw(field),
garrow_chunked_array_get_raw(chunked_array));
- return garrow_column_new_raw(&arrow_column);
+ auto column = GARROW_COLUMN(g_object_new(GARROW_TYPE_COLUMN,
+ "column", &arrow_column,
+ "field", field,
+ "chunked-array", chunked_array,
+ NULL));
+ return column;
}
/**
@@ -233,9 +312,15 @@ garrow_column_get_n_nulls(GArrowColumn *column)
GArrowField *
garrow_column_get_field(GArrowColumn *column)
{
- const auto arrow_column = garrow_column_get_raw(column);
- auto arrow_field = arrow_column->field();
- return garrow_field_new_raw(&arrow_field);
+ auto priv = GARROW_COLUMN_GET_PRIVATE(column);
+ if (priv->field) {
+ g_object_ref(priv->field);
+ return priv->field;
+ } else {
+ const auto arrow_column = garrow_column_get_raw(column);
+ auto arrow_field = arrow_column->field();
+ return garrow_field_new_raw(&arrow_field);
+ }
}
/**
@@ -293,8 +378,6 @@ garrow_column_new_raw(std::shared_ptr<arrow::Column>
*arrow_column)
std::shared_ptr<arrow::Column>
garrow_column_get_raw(GArrowColumn *column)
{
- GArrowColumnPrivate *priv;
-
- priv = GARROW_COLUMN_GET_PRIVATE(column);
+ auto priv = GARROW_COLUMN_GET_PRIVATE(column);
return priv->column;
}
diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp
b/c_glib/parquet-glib/arrow-file-reader.cpp
index 07aa829..398e85b 100644
--- a/c_glib/parquet-glib/arrow-file-reader.cpp
+++ b/c_glib/parquet-glib/arrow-file-reader.cpp
@@ -25,6 +25,8 @@
#include <parquet-glib/arrow-file-reader.hpp>
+#include <parquet/file_reader.h>
+
G_BEGIN_DECLS
/**
@@ -214,6 +216,114 @@
gparquet_arrow_file_reader_read_table(GParquetArrowFileReader *reader,
}
/**
+ * gparquet_arrow_file_reader_get_schema:
+ * @reader: A #GParquetArrowFileReader.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable): The #GArrowSchema, or %NULL on error.
+ *
+ * Since: 0.12.0
+ */
+GArrowSchema *
+gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader,
+ GError **error)
+{
+ auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader);
+
+ const auto n_columns =
+ parquet_arrow_file_reader->parquet_reader()->metadata()->num_columns();
+ std::vector<int> indices(n_columns);
+ for (int i = 0; i < n_columns; ++i) {
+ indices[i] = i;
+ }
+
+ std::shared_ptr<arrow::Schema> arrow_schema;
+ auto status = parquet_arrow_file_reader->GetSchema(indices, &arrow_schema);
+ if (garrow_error_check(error,
+ status,
+ "[parquet][arrow][file-reader][get-schema]")) {
+ return garrow_schema_new_raw(&arrow_schema);
+ } else {
+ return NULL;
+ }
+}
+
+/**
+ * gparquet_arrow_file_reader_select_schema:
+ * @reader: A #GParquetArrowFileReader.
+ * @column_indexes: (array length=n_column_indexes):
+ * The array of column indexes to be selected
+ * @n_column_indexes: The length of `column_indexes`.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable): A selected #GArrowSchema.
+ *
+ * Since: 0.12.0
+ */
+GArrowSchema *
+gparquet_arrow_file_reader_select_schema(GParquetArrowFileReader *reader,
+ gint *column_indexes,
+ gsize n_column_indexes,
+ GError **error)
+{
+ auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader);
+
+ std::vector<int> indices(n_column_indexes);
+ for (gsize i = 0; i < n_column_indexes; ++i) {
+ indices[i] = column_indexes[i];
+ }
+
+ std::shared_ptr<arrow::Schema> arrow_schema;
+ auto status = parquet_arrow_file_reader->GetSchema(indices, &arrow_schema);
+ if (garrow_error_check(error,
+ status,
+ "[parquet][arrow][file-reader][select-schema]")) {
+ return garrow_schema_new_raw(&arrow_schema);
+ } else {
+ return NULL;
+ }
+}
+
+/**
+ * gparquet_arrow_file_reader_read_column:
+ * @reader: A #GParquetArrowFileReader.
+ * @column_index: Index integer of the column to be read.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable): A read #GArrowColumn.
+ *
+ * Since: 0.12.0
+ */
+GArrowColumn *
+gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
+ gint column_index,
+ GError **error)
+{
+ auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader);
+
+ std::vector<int> indices = {column_index};
+ std::shared_ptr<arrow::Schema> arrow_schema;
+ auto status = parquet_arrow_file_reader->GetSchema(indices, &arrow_schema);
+ if (!garrow_error_check(error,
+ status,
+
"[parquet][arrow][file-reader][read-column][get-schema]")) {
+ return NULL;
+ }
+
+ std::shared_ptr<arrow::Array> arrow_array;
+ status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_array);
+ if (!garrow_error_check(error,
+ status,
+ "[parquet][arrow][file-reader][read-column]")) {
+ return NULL;
+ }
+
+ auto arrow_field = arrow_schema->field(0);
+ auto arrow_column = std::make_shared<arrow::Column>(arrow_field,
arrow_array);
+ return garrow_column_new_raw(&arrow_column);
+}
+
+/**
* gparquet_arrow_file_reader_get_n_row_groups:
* @reader: A #GParquetArrowFileReader.
*
diff --git a/c_glib/parquet-glib/arrow-file-reader.h
b/c_glib/parquet-glib/arrow-file-reader.h
index 5aa2aa6..c251dcd 100644
--- a/c_glib/parquet-glib/arrow-file-reader.h
+++ b/c_glib/parquet-glib/arrow-file-reader.h
@@ -45,6 +45,20 @@ GArrowTable *
gparquet_arrow_file_reader_read_table(GParquetArrowFileReader *reader,
GError **error);
+GArrowSchema *
+gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader,
+ GError **error);
+GArrowSchema *
+gparquet_arrow_file_reader_select_schema(GParquetArrowFileReader *reader,
+ gint *column_indexes,
+ gsize n_column_indexes,
+ GError **error);
+
+GArrowColumn *
+gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
+ gint column_index,
+ GError **error);
+
gint
gparquet_arrow_file_reader_get_n_row_groups(GParquetArrowFileReader *reader);
diff --git a/c_glib/test/parquet/test-arrow-file-reader.rb
b/c_glib/test/parquet/test-arrow-file-reader.rb
new file mode 100644
index 0000000..9657454
--- /dev/null
+++ b/c_glib/test/parquet/test-arrow-file-reader.rb
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestParquetArrowFileReader < Test::Unit::TestCase
+ include Helper::Buildable
+
+ def setup
+ omit("Parquet is required") unless defined?(::Parquet)
+ @file = Tempfile.open(["data", ".parquet"])
+ @a_array = build_string_array(["foo", "bar"])
+ @b_array = build_int32_array([123, 456])
+ @table = build_table("a" => @a_array,
+ "b" => @b_array)
+ writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path)
+ chunk_size = 2
+ writer.write_table(@table, chunk_size)
+ writer.close
+ @reader = Parquet::ArrowFileReader.new(@file.path)
+ end
+
+ def test_schema
+ assert_equal(<<-SCHEMA.chomp, @reader.schema.to_s)
+a: string
+b: int32
+ SCHEMA
+ end
+
+ def test_select_schema
+ assert_equal(<<-SCHEMA.chomp, @reader.select_schema([0]).to_s)
+a: string
+ SCHEMA
+ assert_equal(<<-SCHEMA.chomp, @reader.select_schema([1]).to_s)
+b: int32
+ SCHEMA
+ assert_equal(<<-SCHEMA.chomp, @reader.select_schema([0, 1]).to_s)
+a: string
+b: int32
+ SCHEMA
+ end
+
+ def test_read_column
+ a = @reader.read_column(0)
+ assert_equal([
+ "a: string",
+ Arrow::ChunkedArray.new([@a_array]).to_s,
+ ],
+ [
+ a.field.to_s,
+ a.data.to_s,
+ ])
+
+ b = @reader.read_column(1)
+ assert_equal([
+ "b: int32",
+ Arrow::ChunkedArray.new([@b_array]).to_s,
+ ],
+ [
+ b.field.to_s,
+ b.data.to_s,
+ ])
+ end
+end
diff --git a/c_glib/test/parquet/test-arrow.rb
b/c_glib/test/parquet/test-arrow-file-writer.rb
similarity index 65%
rename from c_glib/test/parquet/test-arrow.rb
rename to c_glib/test/parquet/test-arrow-file-writer.rb
index de021df..d6c775e 100644
--- a/c_glib/test/parquet/test-arrow.rb
+++ b/c_glib/test/parquet/test-arrow-file-writer.rb
@@ -15,35 +15,27 @@
# specific language governing permissions and limitations
# under the License.
-class TestParquetArrow < Test::Unit::TestCase
+class TestParquetArrowFileWriter < Test::Unit::TestCase
include Helper::Buildable
def setup
omit("Parquet is required") unless defined?(::Parquet)
+ @file = Tempfile.open(["data", ".parquet"])
end
- def test_read_write
- tempfile = Tempfile.open(["data", ".parquet"])
-
- values = [true, nil, false, true]
+ def test_write
+ enabled_values = [true, nil, false, true]
+ table = build_table("enabled" => build_boolean_array(enabled_values))
chunk_size = 2
- field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
- schema = Arrow::Schema.new([field])
- writer = Parquet::ArrowFileWriter.new(schema, tempfile.path)
- begin
- columns = [
- Arrow::Column.new(field, build_boolean_array(values)),
- ]
- table = Arrow::Table.new(schema, columns)
- writer.write_table(table, chunk_size)
- ensure
- writer.close
- end
+ writer = Parquet::ArrowFileWriter.new(table.schema, @file.path)
+ writer.write_table(table, chunk_size)
+ writer.close
- reader = Parquet::ArrowFileReader.new(tempfile.path)
+ reader = Parquet::ArrowFileReader.new(@file.path)
reader.use_threads = true
- assert_equal(chunk_size, reader.n_row_groups)
+ assert_equal(enabled_values.length / chunk_size, reader.n_row_groups)
+ table = reader.read_table
table = reader.read_table
table_data = table.n_columns.times.collect do |i|
column = table.get_column(i)
@@ -59,6 +51,6 @@ class TestParquetArrow < Test::Unit::TestCase
end
[column.name, data]
end
- assert_equal([["enabled", values]], table_data)
+ assert_equal([["enabled", enabled_values]], table_data)
end
end