Repository: arrow
Updated Branches:
  refs/heads/master 78619686f -> 8960a2ed4


ARROW-255: Finalize Dictionary representation

Author: Julien Le Dem <jul...@dremio.com>

Closes #119 from julienledem/arrow_255_dictionary and squashes the following 
commits:

316745d [Julien Le Dem] ARROW-255: fix typo and linter errors
e28a3c8 [Julien Le Dem] ARROW-255: review feedback
8c27943 [Julien Le Dem] ARROW-255: Finalize Dictionary representation


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8960a2ed
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8960a2ed
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8960a2ed

Branch: refs/heads/master
Commit: 8960a2ed4c0d400be32003beb183f150e019c4ec
Parents: 7861968
Author: Julien Le Dem <jul...@dremio.com>
Authored: Sat Aug 20 13:02:45 2016 -0700
Committer: Julien Le Dem <jul...@dremio.com>
Committed: Sat Aug 20 13:02:45 2016 -0700

----------------------------------------------------------------------
 cpp/src/arrow/ipc/metadata-internal.cc |  3 ++-
 cpp/src/arrow/type.h                   | 11 ++++++---
 format/Layout.md                       | 37 +++++++++++++++++++++++++++++
 format/Message.fbs                     |  6 ++++-
 4 files changed, 52 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/cpp/src/arrow/ipc/metadata-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc 
b/cpp/src/arrow/ipc/metadata-internal.cc
index 16ba20f..50db730 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -220,7 +220,8 @@ static Status FieldToFlatbuffer(
   auto fb_children = fbb.CreateVector(children);
 
   *offset = flatbuf::CreateField(
-      fbb, fb_name, field->nullable, type_enum, type_data, fb_children);
+      fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary,
+      fb_children);
 
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 4cb37fd..02677d5 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -144,8 +144,13 @@ struct ARROW_EXPORT Field {
   // Fields can be nullable
   bool nullable;
 
-  Field(const std::string& name, const TypePtr& type, bool nullable = true)
-      : name(name), type(type), nullable(nullable) {}
+  // optional dictionary id if the field is dictionary encoded
+  // 0 means it's not dictionary encoded
+  int64_t dictionary;
+
+  Field(const std::string& name, const TypePtr& type, bool nullable = true,
+        int64_t dictionary = 0)
+      : name(name), type(type), nullable(nullable), dictionary(dictionary) {}
 
   bool operator==(const Field& other) const { return this->Equals(other); }
 
@@ -154,7 +159,7 @@ struct ARROW_EXPORT Field {
   bool Equals(const Field& other) const {
     return (this == &other) ||
            (this->name == other.name && this->nullable == other.nullable &&
-               this->type->Equals(other.type.get()));
+            this->dictionary == dictionary && 
this->type->Equals(other.type.get()));
   }
 
   bool Equals(const std::shared_ptr<Field>& other) const { return 
Equals(*other.get()); }

http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/format/Layout.md
----------------------------------------------------------------------
diff --git a/format/Layout.md b/format/Layout.md
index 5eaefee..a953930 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -583,6 +583,43 @@ even if the null bitmap of the parent union array 
indicates the slot is
 null.  Additionally, a child array may have a non-null slot even if
 the types array indicates that a slot contains a different type at the 
index.
 
+## Dictionary encoding
+
+When a field is dictionary encoded, the values are represented by an array of 
Int32 representing the index of the value in the dictionary.
+The Dictionary is received as a DictionaryBatch whose id is referenced by a 
dictionary attribute defined in the metadata (Message.fbs) in the Field table.
+The dictionary has the same layout as the type of the field would dictate. 
Each entry in the dictionary can be accessed by its index in the 
DictionaryBatch.
+When a Schema references a Dictionary id, a DictionaryBatch for this id must 
be sent before any RecordBatch.
+
+As an example, you could have the following data:
+```
+type: List<String>
+
+[
+ ['a', 'b'],
+ ['a', 'b'],
+ ['a', 'b'],
+ ['c', 'd', 'e'],
+ ['c', 'd', 'e'],
+ ['c', 'd', 'e'],
+ ['c', 'd', 'e'],
+ ['a', 'b']
+]
+```
+In dictionary-encoded form, this could appear as:
+```
+data List<String> (dictionary-encoded, dictionary id i)
+indices: [0, 0, 0, 1, 1, 1, 1, 0]
+
+dictionary i
+
+type: List<String>
+
+[
+ ['a', 'b'],
+ ['c', 'd', 'e'],
+]
+```
+
 ## References
 
 Apache Drill Documentation - [Value Vectors][6] 

http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/format/Message.fbs
----------------------------------------------------------------------
diff --git a/format/Message.fbs b/format/Message.fbs
index 2928207..a78009b 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -104,6 +104,10 @@ table Field {
   name: string;
   nullable: bool;
   type: Type;
+  // present only if the field is dictionary encoded
+  // will point to a dictionary provided by a DictionaryBatch message
+  dictionary: long;
+  // children apply only to Nested data types like Struct, List and Union
   children: [Field];
 }
 
@@ -185,8 +189,8 @@ table RecordBatch {
 /// For sending dictionary encoding information. Any Field can be
 /// dictionary-encoded, but in this case none of its children may be
 /// dictionary-encoded.
+/// There is one dictionary batch per dictionary
 ///
-/// TODO(wesm): To be documented in more detail
 
 table DictionaryBatch {
   id: long;

Reply via email to