This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 59ec2b1  ARROW-2849: [Ruby] Arrow::Table#load supports ORC
59ec2b1 is described below

commit 59ec2b18b495a2b7cce2eaffd642872a8cc99ac3
Author: Kouhei Sutou <k...@clear-code.com>
AuthorDate: Sat Jul 14 19:52:48 2018 +0200

    ARROW-2849: [Ruby] Arrow::Table#load supports ORC
    
    ruby/red-arrow/test/fixture/TestOrcFile.test1.orc is copied from
    https://github.com/apache/orc/blob/master/examples/TestOrcFile.test1.orc .
    Its license is Apache License 2.0.
    
    Author: Kouhei Sutou <k...@clear-code.com>
    
    Closes #2265 from kou/ruby-add-orc-loader and squashes the following commits:
    
    89c6d94f <Kouhei Sutou>  Arrow::Table#load supports ORC
---
 ruby/red-arrow/lib/arrow/table-loader.rb          |  10 ++
 ruby/red-arrow/test/fixture/TestOrcFile.test1.orc | Bin 0 -> 1711 bytes
 ruby/red-arrow/test/test-orc.rb                   | 118 ++++++++++++++++++++++
 3 files changed, 128 insertions(+)

diff --git a/ruby/red-arrow/lib/arrow/table-loader.rb b/ruby/red-arrow/lib/arrow/table-loader.rb
index 985999e..a68b33f 100644
--- a/ruby/red-arrow/lib/arrow/table-loader.rb
+++ b/ruby/red-arrow/lib/arrow/table-loader.rb
@@ -111,6 +111,16 @@ module Arrow
       load_raw(input, reader)
     end
 
+    if Arrow.const_defined?(:ORCFileReader)
+      def load_as_orc(path)
+        input = MemoryMappedInputStream.new(path)
+        reader = ORCFileReader.new(input)
+        field_indexes = @options[:field_indexes]
+        reader.set_field_indexes(field_indexes) if field_indexes
+        reader.read_stripes
+      end
+    end
+
     def load_as_csv(path)
       options = @options.dup
       options.delete(:format)
diff --git a/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc b/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc
new file mode 100644
index 0000000..4fb0bef
Binary files /dev/null and b/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc differ
diff --git a/ruby/red-arrow/test/test-orc.rb b/ruby/red-arrow/test/test-orc.rb
new file mode 100644
index 0000000..d5237ad
--- /dev/null
+++ b/ruby/red-arrow/test/test-orc.rb
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ORCTest < Test::Unit::TestCase
+  include Helper::Fixture
+
+  def setup
+    omit("Require Apache Arrow ORC") unless Arrow.const_defined?(:ORCFileReader)
+    @orc_path = fixture_path("TestOrcFile.test1.orc")
+  end
+
+  sub_test_case("load") do
+    test("default") do
+      table = Arrow::Table.load(@orc_path)
+      dump = table.columns.collect do |column|
+        [
+          column.field.to_s,
+          column.data.chunks.collect(&:to_s),
+        ]
+      end
+      assert_equal([
+                     ["boolean1: bool", ["[false, true]"]],
+                     ["byte1: int8", ["[1, 100]"]],
+                     ["short1: int16", ["[1024, 2048]"]],
+                     ["int1: int32", ["[65536, 65536]"]],
+                     [
+                       "long1: int64",
+                       ["[9223372036854775807, 9223372036854775807]"],
+                     ],
+                     ["float1: float", ["[1, 2]"]],
+                     ["double1: double", ["[-15, -5]"]],
+                     ["bytes1: binary", ["[0001020304, ]"]],
+                     ["string1: string", ["[\"hi\", \"bye\"]"]],
+                     [
+                       "middle: " +
+                       "struct<list: " +
+                       "list<item: struct<int1: int32, string1: string>>>",
+                       [
+                         <<-STRUCT.chomp
+
+-- is_valid: all not null
+-- child 0 type: list<item: struct<int1: int32, string1: string>> values: 
+  -- is_valid: all not null
+  -- value_offsets: [0, 2, 4]
+  -- values: 
+    -- is_valid: all not null
+    -- child 0 type: int32 values: [1, 2, 1, 2]
+    -- child 1 type: string values: ["bye", "sigh", "bye", "sigh"]
+                          STRUCT
+                       ]
+                     ],
+                     [
+                       "list: list<item: struct<int1: int32, string1: string>>",
+                       [
+                         <<-LIST.chomp
+
+-- is_valid: all not null
+-- value_offsets: [0, 2, 5]
+-- values: 
+  -- is_valid: all not null
+  -- child 0 type: int32 values: [3, 4, 100000000, -100000, 1234]
+  -- child 1 type: string values: ["good", "bad", "cat", "in", "hat"]
+                         LIST
+                       ]
+                     ],
+                     [
+                       "map: list<item: " +
+                       "struct<key: string, value: " +
+                       "struct<int1: int32, string1: string>>>",
+                       [
+                         <<-MAP.chomp
+
+-- is_valid: all not null
+-- value_offsets: [0, 0, 2]
+-- values: 
+  -- is_valid: all not null
+  -- child 0 type: string values: ["chani", "mauddib"]
+  -- child 1 type: struct<int1: int32, string1: string> values: 
+    -- is_valid: all not null
+    -- child 0 type: int32 values: [5, 1]
+    -- child 1 type: string values: ["chani", "mauddib"]
+                         MAP
+                       ],
+                     ],
+                   ],
+                   dump)
+    end
+
+    test(":field_indexes") do
+      table = Arrow::Table.load(@orc_path, field_indexes: [1, 3])
+      dump = table.columns.collect do |column|
+        [
+          column.field.to_s,
+          column.data.chunks.collect(&:to_s),
+        ]
+      end
+      assert_equal([
+                     ["boolean1: bool", ["[false, true]"]],
+                     ["short1: int16", ["[1024, 2048]"]],
+                   ],
+                   dump)
+    end
+  end
+end

Reply via email to