This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 35cd7ae  ARROW-3474: [GLib] Extend gparquet API with get_schema and 
read_column
35cd7ae is described below

commit 35cd7ae7ff15beea8138417c8c7f54708fd2f923
Author: Kouhei Sutou <[email protected]>
AuthorDate: Thu Oct 11 11:40:08 2018 +0900

    ARROW-3474: [GLib] Extend gparquet API with get_schema and read_column
    
    So we can read individual columns without loading the whole parquet file in 
memory, we need to surface the getSchema and ReadColumn functions of 
parquet-cpp to the parquet glib API.
    
    Originally submitted 
[here](https://github.com/red-data-tools/parquet-glib/pull/2) before the 
inclusion of parquet-glib to Arrow.
    
    cc @kou
    
    Author: Kouhei Sutou <[email protected]>
    Author: Benoit Rostykus <[email protected]>
    
    Closes #2736 from rostyboost/gparquet_augment_api and squashes the 
following commits:
    
    2c8adeb2 <Kouhei Sutou> Split test file
    7b3e9726 <Kouhei Sutou> Add gparquet_arrow_file_reader_select_schema()
    3006dce3 <Kouhei Sutou> Read schema internally
    fe1fcb6b <Kouhei Sutou> Stop to create intermediate GArrow objects
    05615dae <Kouhei Sutou> Use auto
    d2a344e5 <Kouhei Sutou> Follow name change
    55d866a4 <Kouhei Sutou> GArrowColumn: add missing reference increment
    d9d329a3 <Benoit Rostykus> Space
    53572a24 <Benoit Rostykus> Space
    381db521 <Benoit Rostykus> Space
    e7ef71f2 <Benoit Rostykus> Nits
    b216d836 <Benoit Rostykus> Address PR comments, fix build and add test
    b60dd9ba <Benoit Rostykus> Expose getSchema and ReadColumn functions
---
 c_glib/arrow-glib/column.cpp                       | 129 +++++++++++++++++----
 c_glib/parquet-glib/arrow-file-reader.cpp          | 110 ++++++++++++++++++
 c_glib/parquet-glib/arrow-file-reader.h            |  14 +++
 c_glib/test/parquet/test-arrow-file-reader.rb      |  76 ++++++++++++
 .../{test-arrow.rb => test-arrow-file-writer.rb}   |  32 ++---
 5 files changed, 318 insertions(+), 43 deletions(-)

diff --git a/c_glib/arrow-glib/column.cpp b/c_glib/arrow-glib/column.cpp
index 55d06ea..22c7ab8 100644
--- a/c_glib/arrow-glib/column.cpp
+++ b/c_glib/arrow-glib/column.cpp
@@ -39,49 +39,84 @@ G_BEGIN_DECLS
 
 typedef struct GArrowColumnPrivate_ {
   std::shared_ptr<arrow::Column> column;
+  GArrowField *field;
+  GArrowArray *array;
+  GArrowChunkedArray *chunked_array;
 } GArrowColumnPrivate;
 
 enum {
   PROP_0,
-  PROP_COLUMN
+  PROP_COLUMN,
+  PROP_FIELD,
+  PROP_ARRAY,
+  PROP_CHUNKED_ARRAY
 };
 
 G_DEFINE_TYPE_WITH_PRIVATE(GArrowColumn,
                            garrow_column,
                            G_TYPE_OBJECT)
 
-#define GARROW_COLUMN_GET_PRIVATE(obj)                  \
-  (G_TYPE_INSTANCE_GET_PRIVATE((obj),                   \
-                               GARROW_TYPE_COLUMN,      \
-                               GArrowColumnPrivate))
+#define GARROW_COLUMN_GET_PRIVATE(object)          \
+  static_cast<GArrowColumnPrivate *>(              \
+    garrow_column_get_instance_private(            \
+      GARROW_COLUMN(object)))
 
 static void
 garrow_column_dispose(GObject *object)
 {
-  GArrowColumnPrivate *priv;
+  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
 
-  priv = GARROW_COLUMN_GET_PRIVATE(object);
+  if (priv->field) {
+    g_object_unref(priv->field);
+    priv->field = nullptr;
+  }
 
-  priv->column = nullptr;
+  if (priv->array) {
+    g_object_unref(priv->array);
+    priv->array = nullptr;
+  }
+
+  if (priv->chunked_array) {
+    g_object_unref(priv->chunked_array);
+    priv->chunked_array = nullptr;
+  }
 
   G_OBJECT_CLASS(garrow_column_parent_class)->dispose(object);
 }
 
 static void
+garrow_column_finalize(GObject *object)
+{
+  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
+
+  priv->column = nullptr;
+
+  G_OBJECT_CLASS(garrow_column_parent_class)->finalize(object);
+}
+
+static void
 garrow_column_set_property(GObject *object,
                            guint prop_id,
                            const GValue *value,
                            GParamSpec *pspec)
 {
-  GArrowColumnPrivate *priv;
-
-  priv = GARROW_COLUMN_GET_PRIVATE(object);
+  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
 
   switch (prop_id) {
   case PROP_COLUMN:
     priv->column =
       *static_cast<std::shared_ptr<arrow::Column> 
*>(g_value_get_pointer(value));
     break;
+  case PROP_FIELD:
+    priv->field = static_cast<GArrowField *>(g_value_dup_object(value));
+    break;
+  case PROP_ARRAY:
+    priv->array = static_cast<GArrowArray *>(g_value_dup_object(value));
+    break;
+  case PROP_CHUNKED_ARRAY:
+    priv->chunked_array =
+      static_cast<GArrowChunkedArray *>(g_value_dup_object(value));
+    break;
   default:
     G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
     break;
@@ -94,7 +129,18 @@ garrow_column_get_property(GObject *object,
                            GValue *value,
                            GParamSpec *pspec)
 {
+  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
+
   switch (prop_id) {
+  case PROP_FIELD:
+    g_value_set_object(value, priv->field);
+    break;
+  case PROP_ARRAY:
+    g_value_set_object(value, priv->array);
+    break;
+  case PROP_CHUNKED_ARRAY:
+    g_value_set_object(value, priv->chunked_array);
+    break;
   default:
     G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
     break;
@@ -109,21 +155,44 @@ garrow_column_init(GArrowColumn *object)
 static void
 garrow_column_class_init(GArrowColumnClass *klass)
 {
-  GObjectClass *gobject_class;
-  GParamSpec *spec;
-
-  gobject_class = G_OBJECT_CLASS(klass);
+  auto gobject_class = G_OBJECT_CLASS(klass);
 
   gobject_class->dispose      = garrow_column_dispose;
+  gobject_class->finalize     = garrow_column_finalize;
   gobject_class->set_property = garrow_column_set_property;
   gobject_class->get_property = garrow_column_get_property;
 
+  GParamSpec *spec;
   spec = g_param_spec_pointer("column",
                               "Column",
                               "The raw std::shared<arrow::Column> *",
                               static_cast<GParamFlags>(G_PARAM_WRITABLE |
                                                        
G_PARAM_CONSTRUCT_ONLY));
   g_object_class_install_property(gobject_class, PROP_COLUMN, spec);
+
+  spec = g_param_spec_object("field",
+                             "Field",
+                             "The field of the column",
+                             GARROW_TYPE_FIELD,
+                             static_cast<GParamFlags>(G_PARAM_READWRITE |
+                                                      G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_FIELD, spec);
+
+  spec = g_param_spec_object("array",
+                             "Array",
+                             "The array of the column",
+                             GARROW_TYPE_ARRAY,
+                             static_cast<GParamFlags>(G_PARAM_READWRITE |
+                                                      G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_ARRAY, spec);
+
+  spec = g_param_spec_object("chunked-array",
+                             "Chunked array",
+                             "The chunked array of the column",
+                             GARROW_TYPE_CHUNKED_ARRAY,
+                             static_cast<GParamFlags>(G_PARAM_READWRITE |
+                                                      G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_CHUNKED_ARRAY, spec);
 }
 
 /**
@@ -140,7 +209,12 @@ garrow_column_new_array(GArrowField *field,
   auto arrow_column =
     std::make_shared<arrow::Column>(garrow_field_get_raw(field),
                                     garrow_array_get_raw(array));
-  return garrow_column_new_raw(&arrow_column);
+  auto column = GARROW_COLUMN(g_object_new(GARROW_TYPE_COLUMN,
+                                           "column", &arrow_column,
+                                           "field", field,
+                                           "array", array,
+                                           NULL));
+  return column;
 }
 
 /**
@@ -157,7 +231,12 @@ garrow_column_new_chunked_array(GArrowField *field,
   auto arrow_column =
     std::make_shared<arrow::Column>(garrow_field_get_raw(field),
                                     
garrow_chunked_array_get_raw(chunked_array));
-  return garrow_column_new_raw(&arrow_column);
+  auto column = GARROW_COLUMN(g_object_new(GARROW_TYPE_COLUMN,
+                                           "column", &arrow_column,
+                                           "field", field,
+                                           "chunked-array", chunked_array,
+                                           NULL));
+  return column;
 }
 
 /**
@@ -233,9 +312,15 @@ garrow_column_get_n_nulls(GArrowColumn *column)
 GArrowField *
 garrow_column_get_field(GArrowColumn *column)
 {
-  const auto arrow_column = garrow_column_get_raw(column);
-  auto arrow_field = arrow_column->field();
-  return garrow_field_new_raw(&arrow_field);
+  auto priv = GARROW_COLUMN_GET_PRIVATE(column);
+  if (priv->field) {
+    g_object_ref(priv->field);
+    return priv->field;
+  } else {
+    const auto arrow_column = garrow_column_get_raw(column);
+    auto arrow_field = arrow_column->field();
+    return garrow_field_new_raw(&arrow_field);
+  }
 }
 
 /**
@@ -293,8 +378,6 @@ garrow_column_new_raw(std::shared_ptr<arrow::Column> 
*arrow_column)
 std::shared_ptr<arrow::Column>
 garrow_column_get_raw(GArrowColumn *column)
 {
-  GArrowColumnPrivate *priv;
-
-  priv = GARROW_COLUMN_GET_PRIVATE(column);
+  auto priv = GARROW_COLUMN_GET_PRIVATE(column);
   return priv->column;
 }
diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp 
b/c_glib/parquet-glib/arrow-file-reader.cpp
index 07aa829..398e85b 100644
--- a/c_glib/parquet-glib/arrow-file-reader.cpp
+++ b/c_glib/parquet-glib/arrow-file-reader.cpp
@@ -25,6 +25,8 @@
 
 #include <parquet-glib/arrow-file-reader.hpp>
 
+#include <parquet/file_reader.h>
+
 G_BEGIN_DECLS
 
 /**
@@ -214,6 +216,114 @@ 
gparquet_arrow_file_reader_read_table(GParquetArrowFileReader *reader,
 }
 
 /**
+ * gparquet_arrow_file_reader_get_schema:
+ * @reader: A #GParquetArrowFileReader.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable): A got #GArrowSchema.
+ *
+ * Since: 0.12.0
+ */
+GArrowSchema *
+gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader,
+                                      GError **error)
+{
+  auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader);
+
+  const auto n_columns =
+    parquet_arrow_file_reader->parquet_reader()->metadata()->num_columns();
+  std::vector<int> indices(n_columns);
+  for (int i = 0; i < n_columns; ++i) {
+    indices[i] = i;
+  }
+
+  std::shared_ptr<arrow::Schema> arrow_schema;
+  auto status = parquet_arrow_file_reader->GetSchema(indices, &arrow_schema);
+  if (garrow_error_check(error,
+                         status,
+                         "[parquet][arrow][file-reader][get-schema]")) {
+    return garrow_schema_new_raw(&arrow_schema);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * gparquet_arrow_file_reader_select_schema:
+ * @reader: A #GParquetArrowFileReader.
+ * @column_indexes: (array length=n_column_indexes):
+ *   The array of column indexes to be selected
+ * @n_column_indexes: The length of `column_indexes`.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable): A selected #GArrowSchema.
+ *
+ * Since: 0.12.0
+ */
+GArrowSchema *
+gparquet_arrow_file_reader_select_schema(GParquetArrowFileReader *reader,
+                                         gint *column_indexes,
+                                         gsize n_column_indexes,
+                                         GError **error)
+{
+  auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader);
+
+  std::vector<int> indices(n_column_indexes);
+  for (gsize i = 0; i < n_column_indexes; ++i) {
+    indices[i] = column_indexes[i];
+  }
+
+  std::shared_ptr<arrow::Schema> arrow_schema;
+  auto status = parquet_arrow_file_reader->GetSchema(indices, &arrow_schema);
+  if (garrow_error_check(error,
+                         status,
+                         "[parquet][arrow][file-reader][select-schema]")) {
+    return garrow_schema_new_raw(&arrow_schema);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * gparquet_arrow_file_reader_read_column:
+ * @reader: A #GParquetArrowFileReader.
+ * @column_index: Index integer of the column to be read.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable): A read #GArrowColumn.
+ *
+ * Since: 0.12.0
+ */
+GArrowColumn *
+gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
+                                       gint column_index,
+                                       GError **error)
+{
+  auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader);
+
+  std::vector<int> indices = {column_index};
+  std::shared_ptr<arrow::Schema> arrow_schema;
+  auto status = parquet_arrow_file_reader->GetSchema(indices, &arrow_schema);
+  if (!garrow_error_check(error,
+                          status,
+                          
"[parquet][arrow][file-reader][read-column][get-schema]")) {
+    return NULL;
+  }
+
+  std::shared_ptr<arrow::Array> arrow_array;
+  status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_array);
+  if (!garrow_error_check(error,
+                          status,
+                          "[parquet][arrow][file-reader][read-column]")) {
+    return NULL;
+  }
+
+  auto arrow_field = arrow_schema->field(0);
+  auto arrow_column = std::make_shared<arrow::Column>(arrow_field, 
arrow_array);
+  return garrow_column_new_raw(&arrow_column);
+}
+
+/**
  * gparquet_arrow_file_reader_get_n_row_groups:
  * @reader: A #GParquetArrowFileReader.
  *
diff --git a/c_glib/parquet-glib/arrow-file-reader.h 
b/c_glib/parquet-glib/arrow-file-reader.h
index 5aa2aa6..c251dcd 100644
--- a/c_glib/parquet-glib/arrow-file-reader.h
+++ b/c_glib/parquet-glib/arrow-file-reader.h
@@ -45,6 +45,20 @@ GArrowTable *
 gparquet_arrow_file_reader_read_table(GParquetArrowFileReader *reader,
                                       GError **error);
 
+GArrowSchema *
+gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader,
+                                      GError **error);
+GArrowSchema *
+gparquet_arrow_file_reader_select_schema(GParquetArrowFileReader *reader,
+                                         gint *column_indexes,
+                                         gsize n_column_indexes,
+                                         GError **error);
+
+GArrowColumn *
+gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
+                                       gint column_index,
+                                       GError **error);
+
 gint
 gparquet_arrow_file_reader_get_n_row_groups(GParquetArrowFileReader *reader);
 
diff --git a/c_glib/test/parquet/test-arrow-file-reader.rb 
b/c_glib/test/parquet/test-arrow-file-reader.rb
new file mode 100644
index 0000000..9657454
--- /dev/null
+++ b/c_glib/test/parquet/test-arrow-file-reader.rb
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestParquetArrowFileReader < Test::Unit::TestCase
+  include Helper::Buildable
+
+  def setup
+    omit("Parquet is required") unless defined?(::Parquet)
+    @file = Tempfile.open(["data", ".parquet"])
+    @a_array = build_string_array(["foo", "bar"])
+    @b_array = build_int32_array([123, 456])
+    @table = build_table("a" => @a_array,
+                         "b" => @b_array)
+    writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path)
+    chunk_size = 2
+    writer.write_table(@table, chunk_size)
+    writer.close
+    @reader = Parquet::ArrowFileReader.new(@file.path)
+  end
+
+  def test_schema
+    assert_equal(<<-SCHEMA.chomp, @reader.schema.to_s)
+a: string
+b: int32
+    SCHEMA
+  end
+
+  def test_select_schema
+    assert_equal(<<-SCHEMA.chomp, @reader.select_schema([0]).to_s)
+a: string
+    SCHEMA
+    assert_equal(<<-SCHEMA.chomp, @reader.select_schema([1]).to_s)
+b: int32
+    SCHEMA
+    assert_equal(<<-SCHEMA.chomp, @reader.select_schema([0, 1]).to_s)
+a: string
+b: int32
+    SCHEMA
+  end
+
+  def test_read_column
+    a = @reader.read_column(0)
+    assert_equal([
+                   "a: string",
+                   Arrow::ChunkedArray.new([@a_array]).to_s,
+                 ],
+                 [
+                   a.field.to_s,
+                   a.data.to_s,
+                 ])
+
+    b = @reader.read_column(1)
+    assert_equal([
+                   "b: int32",
+                   Arrow::ChunkedArray.new([@b_array]).to_s,
+                 ],
+                 [
+                   b.field.to_s,
+                   b.data.to_s,
+                 ])
+  end
+end
diff --git a/c_glib/test/parquet/test-arrow.rb 
b/c_glib/test/parquet/test-arrow-file-writer.rb
similarity index 65%
rename from c_glib/test/parquet/test-arrow.rb
rename to c_glib/test/parquet/test-arrow-file-writer.rb
index de021df..d6c775e 100644
--- a/c_glib/test/parquet/test-arrow.rb
+++ b/c_glib/test/parquet/test-arrow-file-writer.rb
@@ -15,35 +15,27 @@
 # specific language governing permissions and limitations
 # under the License.
 
-class TestParquetArrow < Test::Unit::TestCase
+class TestParquetArrowFileWriter < Test::Unit::TestCase
   include Helper::Buildable
 
   def setup
     omit("Parquet is required") unless defined?(::Parquet)
+    @file = Tempfile.open(["data", ".parquet"])
   end
 
-  def test_read_write
-    tempfile = Tempfile.open(["data", ".parquet"])
-
-    values = [true, nil, false, true]
+  def test_write
+    enabled_values = [true, nil, false, true]
+    table = build_table("enabled" => build_boolean_array(enabled_values))
     chunk_size = 2
 
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    schema = Arrow::Schema.new([field])
-    writer = Parquet::ArrowFileWriter.new(schema, tempfile.path)
-    begin
-      columns = [
-        Arrow::Column.new(field, build_boolean_array(values)),
-      ]
-      table = Arrow::Table.new(schema, columns)
-      writer.write_table(table, chunk_size)
-    ensure
-      writer.close
-    end
+    writer = Parquet::ArrowFileWriter.new(table.schema, @file.path)
+    writer.write_table(table, chunk_size)
+    writer.close
 
-    reader = Parquet::ArrowFileReader.new(tempfile.path)
+    reader = Parquet::ArrowFileReader.new(@file.path)
     reader.use_threads = true
-    assert_equal(chunk_size, reader.n_row_groups)
+    assert_equal(enabled_values.length / chunk_size, reader.n_row_groups)
+    table = reader.read_table
     table = reader.read_table
     table_data = table.n_columns.times.collect do |i|
       column = table.get_column(i)
@@ -59,6 +51,6 @@ class TestParquetArrow < Test::Unit::TestCase
       end
       [column.name, data]
     end
-    assert_equal([["enabled", values]], table_data)
+    assert_equal([["enabled", enabled_values]], table_data)
   end
 end

Reply via email to