Repository: arrow Updated Branches: refs/heads/master 78619686f -> 8960a2ed4
ARROW-255: Finalize Dictionary representation Author: Julien Le Dem <jul...@dremio.com> Closes #119 from julienledem/arrow_255_dictionary and squashes the following commits: 316745d [Julien Le Dem] ARROW-255: fix typo and linter errors e28a3c8 [Julien Le Dem] ARROW-255: review feedback 8c27943 [Julien Le Dem] ARROW-255: Finalize Dictionary representation Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8960a2ed Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8960a2ed Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8960a2ed Branch: refs/heads/master Commit: 8960a2ed4c0d400be32003beb183f150e019c4ec Parents: 7861968 Author: Julien Le Dem <jul...@dremio.com> Authored: Sat Aug 20 13:02:45 2016 -0700 Committer: Julien Le Dem <jul...@dremio.com> Committed: Sat Aug 20 13:02:45 2016 -0700 ---------------------------------------------------------------------- cpp/src/arrow/ipc/metadata-internal.cc | 3 ++- cpp/src/arrow/type.h | 11 ++++++--- format/Layout.md | 37 +++++++++++++++++++++++++++++ format/Message.fbs | 6 ++++- 4 files changed, 52 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/cpp/src/arrow/ipc/metadata-internal.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 16ba20f..50db730 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -220,7 +220,8 @@ static Status FieldToFlatbuffer( auto fb_children = fbb.CreateVector(children); *offset = flatbuf::CreateField( - fbb, fb_name, field->nullable, type_enum, type_data, fb_children); + fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary, + fb_children); return Status::OK(); } 
http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4cb37fd..02677d5 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -144,8 +144,13 @@ struct ARROW_EXPORT Field { // Fields can be nullable bool nullable; - Field(const std::string& name, const TypePtr& type, bool nullable = true) - : name(name), type(type), nullable(nullable) {} + // optional dictionary id if the field is dictionary encoded + // 0 means it's not dictionary encoded + int64_t dictionary; + + Field(const std::string& name, const TypePtr& type, bool nullable = true, + int64_t dictionary = 0) + : name(name), type(type), nullable(nullable), dictionary(dictionary) {} bool operator==(const Field& other) const { return this->Equals(other); } @@ -154,7 +159,7 @@ struct ARROW_EXPORT Field { bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && this->nullable == other.nullable && - this->type->Equals(other.type.get())); + this->dictionary == dictionary && this->type->Equals(other.type.get())); } bool Equals(const std::shared_ptr<Field>& other) const { return Equals(*other.get()); } http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/format/Layout.md ---------------------------------------------------------------------- diff --git a/format/Layout.md b/format/Layout.md index 5eaefee..a953930 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -583,6 +583,43 @@ even if the null bitmap of the parent union array indicates the slot is null. Additionally, a child array may have a non-null slot even if the types array indicates that a slot contains a different type at the index. +## Dictionary encoding + +When a field is dictionary encoded, the values are represented by an array of Int32 representing the index of the value in the dictionary.
+The Dictionary is received as a DictionaryBatch whose id is referenced by a dictionary attribute defined in the metadata (Message.fbs) in the Field table. +The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatch. +When a Schema references a Dictionary id, it must send a DictionaryBatch for this id before any RecordBatch. + +As an example, you could have the following data: +``` +type: List<String> + +[ + ['a', 'b'], + ['a', 'b'], + ['a', 'b'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['a', 'b'] +] +``` +In dictionary-encoded form, this could appear as: +``` +data List<String> (dictionary-encoded, dictionary id i) +indices: [0, 0, 0, 1, 1, 1, 1, 0] + +dictionary i + +type: List<String> + +[ + ['a', 'b'], + ['c', 'd', 'e'], +] +``` + ## References Apache Drill Documentation - [Value Vectors][6] http://git-wip-us.apache.org/repos/asf/arrow/blob/8960a2ed/format/Message.fbs ---------------------------------------------------------------------- diff --git a/format/Message.fbs b/format/Message.fbs index 2928207..a78009b 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -104,6 +104,10 @@ table Field { name: string; nullable: bool; type: Type; + // present only if the field is dictionary encoded + // will point to a dictionary provided by a DictionaryBatch message + dictionary: long; + // children apply only to Nested data types like Struct, List and Union children: [Field]; } @@ -185,8 +189,8 @@ table RecordBatch { /// For sending dictionary encoding information. Any Field can be /// dictionary-encoded, but in this case none of its children may be /// dictionary-encoded. +/// There is one dictionary batch per dictionary /// -/// TODO(wesm): To be documented in more detail table DictionaryBatch { id: long;