This is an automated email from the ASF dual-hosted git repository. uwe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 59ec2b1 ARROW-2849: [Ruby] Arrow::Table#load supports ORC 59ec2b1 is described below commit 59ec2b18b495a2b7cce2eaffd642872a8cc99ac3 Author: Kouhei Sutou <k...@clear-code.com> AuthorDate: Sat Jul 14 19:52:48 2018 +0200 ARROW-2849: [Ruby] Arrow::Table#load supports ORC ruby/red-arrow/test/fixture/TestOrcFile.test1.orc is copied from https://github.com/apache/orc/blob/master/examples/TestOrcFile.test1.orc . Its license is Apache License 2.0. Author: Kouhei Sutou <k...@clear-code.com> Closes #2265 from kou/ruby-add-orc-loader and squashes the following commits: 89c6d94f <Kouhei Sutou> Arrow::Table#load supports ORC --- ruby/red-arrow/lib/arrow/table-loader.rb | 10 ++ ruby/red-arrow/test/fixture/TestOrcFile.test1.orc | Bin 0 -> 1711 bytes ruby/red-arrow/test/test-orc.rb | 118 ++++++++++++++++++++++ 3 files changed, 128 insertions(+) diff --git a/ruby/red-arrow/lib/arrow/table-loader.rb b/ruby/red-arrow/lib/arrow/table-loader.rb index 985999e..a68b33f 100644 --- a/ruby/red-arrow/lib/arrow/table-loader.rb +++ b/ruby/red-arrow/lib/arrow/table-loader.rb @@ -111,6 +111,16 @@ module Arrow load_raw(input, reader) end + if Arrow.const_defined?(:ORCFileReader) + def load_as_orc(path) + input = MemoryMappedInputStream.new(path) + reader = ORCFileReader.new(input) + field_indexes = @options[:field_indexes] + reader.set_field_indexes(field_indexes) if field_indexes + reader.read_stripes + end + end + def load_as_csv(path) options = @options.dup options.delete(:format) diff --git a/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc b/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc new file mode 100644 index 0000000..4fb0bef Binary files /dev/null and b/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc differ diff --git a/ruby/red-arrow/test/test-orc.rb b/ruby/red-arrow/test/test-orc.rb new file mode 100644 index 0000000..d5237ad --- /dev/null +++ b/ruby/red-arrow/test/test-orc.rb @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class ORCTest < Test::Unit::TestCase + include Helper::Fixture + + def setup + omit("Require Apache Arrow ORC") unless Arrow.const_defined?(:ORCFileReader) + @orc_path = fixture_path("TestOrcFile.test1.orc") + end + + sub_test_case("load") do + test("default") do + table = Arrow::Table.load(@orc_path) + dump = table.columns.collect do |column| + [ + column.field.to_s, + column.data.chunks.collect(&:to_s), + ] + end + assert_equal([ + ["boolean1: bool", ["[false, true]"]], + ["byte1: int8", ["[1, 100]"]], + ["short1: int16", ["[1024, 2048]"]], + ["int1: int32", ["[65536, 65536]"]], + [ + "long1: int64", + ["[9223372036854775807, 9223372036854775807]"], + ], + ["float1: float", ["[1, 2]"]], + ["double1: double", ["[-15, -5]"]], + ["bytes1: binary", ["[0001020304, ]"]], + ["string1: string", ["[\"hi\", \"bye\"]"]], + [ + "middle: " + + "struct<list: " + + "list<item: struct<int1: int32, string1: string>>>", + [ + <<-STRUCT.chomp + +-- is_valid: all not null +-- child 0 type: list<item: struct<int1: int32, string1: string>> values: + -- is_valid: all not null + -- value_offsets: [0, 2, 4] + -- values: + -- is_valid: all not null + -- child 0 type: int32 values: [1, 2, 1, 2] + -- child 1 type: string values: ["bye", "sigh", "bye", "sigh"] + STRUCT + ] + ], + [ + "list: list<item: struct<int1: int32, string1: string>>", + [ + <<-LIST.chomp + +-- is_valid: all not null +-- value_offsets: [0, 2, 5] +-- values: + -- is_valid: all not null + -- child 0 type: int32 values: [3, 4, 100000000, -100000, 1234] + -- child 1 type: string values: ["good", "bad", "cat", "in", "hat"] + LIST + ] + ], + [ + "map: list<item: " + + "struct<key: string, value: " + + "struct<int1: int32, string1: string>>>", + [ + <<-MAP.chomp + +-- is_valid: all not null +-- value_offsets: [0, 0, 2] +-- values: + -- is_valid: all not null + -- child 0 type: string values: ["chani", "mauddib"] + -- child 1 type: struct<int1: int32, string1: string> values: + -- is_valid: all not null + -- child 0 type: int32 values: [5, 1] + -- child 1 type: string values: ["chani", "mauddib"] + MAP + ], + ], + ], + dump) + end + + test(":field_indexes") do + table = Arrow::Table.load(@orc_path, field_indexes: [1, 3]) + dump = table.columns.collect do |column| + [ + column.field.to_s, + column.data.chunks.collect(&:to_s), + ] + end + assert_equal([ + ["boolean1: bool", ["[false, true]"]], + ["short1: int16", ["[1024, 2048]"]], + ], + dump) + end + end +end