This is an automated email from the ASF dual-hosted git repository.
planka pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 5d315227b Updated Site to include documentation for LazyFilters
5d315227b is described below
commit 5d315227bf4489458152f6353af8af59eba381fc
Author: Pavan Lanka <[email protected]>
AuthorDate: Tue Jun 14 09:11:07 2022 -0700
Updated Site to include documentation for LazyFilters
---
develop/design/index.html | 145 ++++++
develop/design/lazy_filter/index.html | 800 ++++++++++++++++++++++++++++++++++
develop/index.html | 3 +
3 files changed, 948 insertions(+)
diff --git a/develop/design/index.html b/develop/design/index.html
new file mode 100644
index 000000000..b2f5ff612
--- /dev/null
+++ b/develop/design/index.html
@@ -0,0 +1,145 @@
+<!DOCTYPE HTML>
+<html lang="en-US">
+<head>
+ <meta charset="UTF-8">
+ <title>Design</title>
+ <meta name="viewport" content="width=device-width,initial-scale=1">
+ <meta name="generator" content="Jekyll v3.8.6">
+ <link rel="stylesheet"
href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900">
+ <link rel="stylesheet" href="/css/screen.css">
+ <link rel="icon" type="image/x-icon" href="/favicon.ico">
+ <!--[if lt IE 9]>
+ <script src="/js/html5shiv.min.js"></script>
+ <script src="/js/respond.min.js"></script>
+ <![endif]-->
+</head>
+
+
+<body class="wrap">
+ <header role="banner">
+ <nav class="mobile-nav show-on-mobiles">
+ <ul>
+ <li class="">
+ <a href="/">Home</a>
+ </li>
+ <li class="">
+ <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+ <span class="hide-on-mobiles">Documentation</span></a>
+ </li>
+ <li class="">
+ <a href="/talks/">Talks</a>
+ </li>
+ <li class="">
+ <a href="/news/">News</a>
+ </li>
+ <li class="">
+ <a href="/help/">Help</a>
+ </li>
+ <li class="current">
+ <a href="/develop/">Develop</a>
+ </li>
+</ul>
+
+ </nav>
+ <div class="grid">
+ <div class="unit one-third center-on-mobiles">
+ <h1>
+ <a href="/">
+ <span class="sr-only">Apache ORC</span>
+ <img src="/img/logo.png" width="249" height="101" alt="ORC Logo">
+ </a>
+ </h1>
+ </div>
+ <nav class="main-nav unit two-thirds hide-on-mobiles">
+ <ul>
+ <li class="">
+ <a href="/">Home</a>
+ </li>
+ <li class="">
+ <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+ <span class="hide-on-mobiles">Documentation</span></a>
+ </li>
+ <li class="">
+ <a href="/talks/">Talks</a>
+ </li>
+ <li class="">
+ <a href="/news/">News</a>
+ </li>
+ <li class="">
+ <a href="/help/">Help</a>
+ </li>
+ <li class="current">
+ <a href="/develop/">Develop</a>
+ </li>
+</ul>
+
+ </nav>
+ </div>
+</header>
+
+
+ <section class="standalone">
+ <div class="grid">
+
+ <div class="unit whole">
+ <article>
+ <h1>Design</h1>
+ <ul>
+ <li><a href="lazy_filter">Lazy Filters</a></li>
+</ul>
+
+ </article>
+ </div>
+
+ <div class="clear"></div>
+
+ </div>
+</section>
+
+
+ <footer role="contentinfo">
+ <p>The contents of this website are © 2022
+ <a href="https://www.apache.org/">Apache Software Foundation</a>
+ under the terms of the <a
+ href="https://www.apache.org/licenses/LICENSE-2.0.html">
+ Apache License v2</a>. Apache ORC and its logo are trademarks
+ of the Apache Software Foundation.</p>
+</footer>
+
+ <script>
+ var anchorForId = function (id) {
+ var anchor = document.createElement("a");
+ anchor.className = "header-link";
+ anchor.href = "#" + id;
+ anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa
fa-link\"></i>";
+ anchor.title = "Permalink";
+ return anchor;
+ };
+
+ var linkifyAnchors = function (level, containingElement) {
+ var headers = containingElement.getElementsByTagName("h" + level);
+ for (var h = 0; h < headers.length; h++) {
+ var header = headers[h];
+
+ if (typeof header.id !== "undefined" && header.id !== "") {
+ header.appendChild(anchorForId(header.id));
+ }
+ }
+ };
+
+ document.onreadystatechange = function () {
+ if (this.readyState === "complete") {
+ var contentBlock = document.getElementsByClassName("docs")[0] ||
document.getElementsByClassName("news")[0];
+ if (!contentBlock) {
+ return;
+ }
+ for (var level = 1; level <= 6; level++) {
+ linkifyAnchors(level, contentBlock);
+ }
+ }
+ };
+</script>
+
+
+</body>
+</html>
diff --git a/develop/design/lazy_filter/index.html
b/develop/design/lazy_filter/index.html
new file mode 100644
index 000000000..a606d6842
--- /dev/null
+++ b/develop/design/lazy_filter/index.html
@@ -0,0 +1,800 @@
+<!DOCTYPE HTML>
+<html lang="en-US">
+<head>
+ <meta charset="UTF-8">
+ <title>Lazy Filter</title>
+ <meta name="viewport" content="width=device-width,initial-scale=1">
+ <meta name="generator" content="Jekyll v3.8.6">
+ <link rel="stylesheet"
href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900">
+ <link rel="stylesheet" href="/css/screen.css">
+ <link rel="icon" type="image/x-icon" href="/favicon.ico">
+ <!--[if lt IE 9]>
+ <script src="/js/html5shiv.min.js"></script>
+ <script src="/js/respond.min.js"></script>
+ <![endif]-->
+</head>
+
+
+<body class="wrap">
+ <header role="banner">
+ <nav class="mobile-nav show-on-mobiles">
+ <ul>
+ <li class="">
+ <a href="/">Home</a>
+ </li>
+ <li class="">
+ <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+ <span class="hide-on-mobiles">Documentation</span></a>
+ </li>
+ <li class="">
+ <a href="/talks/">Talks</a>
+ </li>
+ <li class="">
+ <a href="/news/">News</a>
+ </li>
+ <li class="">
+ <a href="/help/">Help</a>
+ </li>
+ <li class="current">
+ <a href="/develop/">Develop</a>
+ </li>
+</ul>
+
+ </nav>
+ <div class="grid">
+ <div class="unit one-third center-on-mobiles">
+ <h1>
+ <a href="/">
+ <span class="sr-only">Apache ORC</span>
+ <img src="/img/logo.png" width="249" height="101" alt="ORC Logo">
+ </a>
+ </h1>
+ </div>
+ <nav class="main-nav unit two-thirds hide-on-mobiles">
+ <ul>
+ <li class="">
+ <a href="/">Home</a>
+ </li>
+ <li class="">
+ <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+ <span class="hide-on-mobiles">Documentation</span></a>
+ </li>
+ <li class="">
+ <a href="/talks/">Talks</a>
+ </li>
+ <li class="">
+ <a href="/news/">News</a>
+ </li>
+ <li class="">
+ <a href="/help/">Help</a>
+ </li>
+ <li class="current">
+ <a href="/develop/">Develop</a>
+ </li>
+</ul>
+
+ </nav>
+ </div>
+</header>
+
+
+ <section class="standalone">
+ <div class="grid">
+
+ <div class="unit whole">
+ <article>
+ <h1>Lazy Filter</h1>
+ <ul>
+ <li><a href="#Background">Background</a></li>
+ <li><a href="#Design">Design</a>
+ <ul>
+ <li><a href="#SArgtoFilter">SArg to Filter</a></li>
+ <li><a href="#Read">Read</a></li>
+ </ul>
+ </li>
+ <li><a href="#Configuration">Configuration</a></li>
+ <li><a href="#Tests">Tests</a></li>
+ <li><a href="#Appendix">Appendix</a>
+ <ul>
+ <li><a href="#Benchmarks">Benchmarks</a>
+ <ul>
+ <li><a href="#RowvsVector">Row vs Vector</a></li>
+ <li><a href="#NormalizationvsCompact">Normalization vs
Compact</a></li>
+ <li><a href="#Summary">Summary</a></li>
+ </ul>
+ </li>
+ </ul>
+ </li>
+</ul>
+
+<h2 id="background-">Background <a id="Background"></a></h2>
+
+<p>This feature request started as a result of a needle in the haystack search
that is performed with the following
+characteristics:</p>
+
+<ul>
+ <li>The search fields are not part of partition, bucket or sort
specification.</li>
+ <li>The table is a very large table.</li>
+ <li>The result is very few rows compared to the scan size.</li>
+ <li>The search columns are a significant subset of selection columns in the
query.</li>
+</ul>
+
+<p>Initial analysis showed that we could have a significant benefit by lazily
reading the non-search columns only when we
+have a match. We explore the design and some benchmarks in subsequent
sections.</p>
+
+<h2 id="design-">Design <a id="Design"></a></h2>
+
+<p>This builds further on <a
href="https://issues.apache.org/jira/browse/ORC-577">ORC-577</a> which
currently only restricts deserialization for some selected data types
+but does not improve on IO.</p>
+
+<p>On a high level the design includes the following components:</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>┌──────────────┐ ┌────────────────────────┐
+│ │ │ Read │
+│ │ │ │
+│ │ │ ┌────────────┐ │
+│SArg to Filter│─────────▶│ │Read Filter │ │
+│ │ │ │ Columns │ │
+│ │ │ └────────────┘ │
+│ │ │ │ │
+└──────────────┘ │ ▼ │
+ │ ┌────────────┐ │
+ │ │Apply Filter│ │
+ │ └────────────┘ │
+ │ │ │
+ │ ▼ │
+ │ ┌────────────┐ │
+ │ │Read Select │ │
+ │ │ Columns │ │
+ │ └────────────┘ │
+ │ │
+ │ │
+ └────────────────────────┘
+</code></pre></div></div>
+
+<ul>
+ <li><strong>SArg to Filter</strong>: Converts Search Arguments passed down
into filters for efficient application during scans.</li>
+ <li><strong>Read</strong>: Performs the lazy read using the filters.
+ <ul>
+ <li><strong>Read Filter Columns</strong>: Read the filter columns from
the file.</li>
+ <li><strong>Apply Filter</strong>: Apply the filter on the read filter
columns.</li>
+ <li><strong>Read Select Columns</strong>: If filter selects at least a
row then read the remaining columns.</li>
+ </ul>
+ </li>
+</ul>
+
+<h3 id="sarg-to-filter-">SArg to Filter <a id="SArgtoFilter"></a></h3>
+
+<p>SArg to Filter converts the passed SArg into a filter. This enables
automatic compatibility with both Spark and Hive as
+they already push down Search Arguments down to ORC.</p>
+
+<p>The SArg is automatically converted into a <a
href="https://github.com/apache/orc/tree/main/java/core/src/java/org/apache/orc/impl/filter/VectorFilter.java">Vector
Filter</a>. Which is applied during the read process. Two
+filter types were evaluated:</p>
+
+<ul>
+ <li><a
href="https://github.com/apache/orc/tree/main/java/bench/core/src/java/org/apache/orc/impl/filter/RowFilter.java">Row
Filter</a> that evaluates each row across all the predicates once.</li>
+ <li><a
href="https://github.com/apache/orc/tree/main/java/core/src/java/org/apache/orc/impl/filter/VectorFilter.java">Vector
Filter</a> that evaluates each filter across the entire vector and adjusts the
subsequent evaluation.</li>
+</ul>
+
+<p>While a row based filter is easier to code, it is much <a
href="#RowvsVector">slower</a> to process. We also see a significant
+<a href="#RowvsVector">performance gain</a> in the absence of
normalization.</p>
+
+<p>The builder for search argument should allow skipping normalization during
the <a
href="https://github.com/apache/hive/blob/storage-branch-2.7/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java#L491">build</a>.
This has been added with
+<a href="https://issues.apache.org/jira/browse/HIVE-24458">HIVE-24458</a>.</p>
+
+<h3 id="read-">Read <a id="Read"></a></h3>
+
+<p>The read process has the following changes:</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre
class="highlight"><code> │
+ │
+ │
+┌────────────────────────▼────────────────────────┐
+│ ┏━━━━━━━━━━━━━━━━┓ │
+│ ┃Plan ++Search++ ┃ │
+│ ┃ Columns ┃ │
+│ ┗━━━━━━━━━━━━━━━━┛ │
+│ Read │Stripe │
+└────────────────────────┼────────────────────────┘
+ │
+ ▼
+
+
+ │
+ │
+┌────────────────────────▼────────────────────────┐
+│ ┏━━━━━━━━━━━━━━━━┓ │
+│ ┃Read ++Search++ ┃ │
+│ ┃ Columns ┃◀─────────┐ │
+│ ┗━━━━━━━━━━━━━━━━┛ │ │
+│ │ Size = 0 │
+│ ▼ │ │
+│ ┏━━━━━━━━━━━━━━━━┓ │ │
+│ ┃ Apply Filter ┃──────────┘ │
+│ ┗━━━━━━━━━━━━━━━━┛ │
+│ Size > 0 │
+│ │ │
+│ ▼ │
+│ ┏━━━━━━━━━━━━━━━━┓ │
+│ ┃ Plan Select ┃ │
+│ ┃ Columns ┃ │
+│ ┗━━━━━━━━━━━━━━━━┛ │
+│ │ │
+│ ▼ │
+│ ┏━━━━━━━━━━━━━━━━┓ │
+│ ┃ Read Select ┃ │
+│ ┃ Columns ┃ │
+│ ┗━━━━━━━━━━━━━━━━┛ │
+│ Next │Batch │
+└────────────────────────┼────────────────────────┘
+ │
+ ▼
+</code></pre></div></div>
+
+<p>The read process changes:</p>
+
+<ul>
+ <li><strong>Read Stripe</strong> used to plan the read of all (search +
select) columns. This is enhanced to plan and fetch only the
+search columns. The rest of the stripe planning process optimizations remain
unchanged e.g. partial read planning of
+the stripe based on RowGroup statistics.</li>
+ <li><strong>Next Batch</strong> identifies the processing that takes place
when <code class="highlighter-rouge">RecordReader.nextBatch</code> is invoked.
+ <ul>
+ <li><strong>Read Search Columns</strong> takes place instead of reading
all the selected columns. This is in sync with the planning
+that has taken place during <strong>Read Stripe</strong> where only the search
columns have been planned.</li>
+ <li><strong>Apply Filter</strong> on the batch that at this point only
includes search columns. Evaluate the result of the filter:
+ <ul>
+ <li><strong>Size = 0</strong> indicates all records have been
filtered out. Given this we proceed to the next batch of search
+columns.</li>
+ <li><strong>Size > 0</strong> indicates that at least one record
accepted by the filter. This record needs to be substantiated with
+other columns.</li>
+ </ul>
+ </li>
+ <li><strong>Plan Select Columns</strong> is invoked to perform read of
the select columns. The planning happens as follows:
+ <ul>
+ <li>Determine the current position of the read within the stripe and
plan the read for the select columns from this
+point forward to the end of the stripe.</li>
+ <li>The Read planning of select columns respects the row groups
filtered out as a result of the stripe planning.</li>
+ <li>Fetch the select columns using the above plan.</li>
+ </ul>
+ </li>
+ <li><strong>Read Select Columns</strong> into the vectorized row
batch</li>
+ <li>Return this batch.</li>
+ </ul>
+ </li>
+</ul>
+
+<p>The current implementation performs a single read for the select columns in
a stripe.</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>┌──────────────────────────────────────────────────┐
+│ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ │
+│ │RG0 │ │RG1 │ │RG2■│ │RG3 │ │RG4 │ │RG5■│ │RG6 │ │
+│ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ │
+│ Stripe │
+└──────────────────────────────────────────────────┘
+</code></pre></div></div>
+
+<p>The above diagram depicts a stripe with 7 Row Groups out of which
<strong>RG2</strong> and <strong>RG5</strong> are selected by the filter. The
+current implementation does the following:</p>
+
+<ul>
+ <li>Start the read planning process from the first match RG2</li>
+ <li>Read to the end of the stripe that includes RG6</li>
+ <li>Based on the above fetch skips RG0 and RG1 subject to compression block
boundaries</li>
+</ul>
+
+<p>The above logic could be enhanced to perform say <strong>2 or n</strong>
reads before reading to the end of stripe. The current
+implementation allows 0 reads before reading to the end of the stripe. The
value of <strong>n</strong> could be configurable but
+should avoid too many short reads.</p>
+
+<p>The read behavior changes as follows with multiple reads being allowed
within a stripe for select columns:</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>┌──────────────────────────────────────────────────┐
+│ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ │
+│ │ │ │ │ │■■■■│ │■■■■│ │■■■■│ │■■■■│ │■■■■│ │
+│ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ │
+│ Current implementation │
+└──────────────────────────────────────────────────┘
+┌──────────────────────────────────────────────────┐
+│ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ │
+│ │ │ │ │ │■■■■│ │ │ │ │ │■■■■│ │■■■■│ │
+│ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ │
+│ Allow 1 partial read │
+└──────────────────────────────────────────────────┘
+</code></pre></div></div>
+
+<p>The figure shows that we could read significantly fewer bytes by performing
an additional read before reading to the end
+of stripe. This shall be included as a subsequent enhancement to this
patch.</p>
+
+<h2 id="configuration-">Configuration <a id="Configuration"></a></h2>
+
+<p>The following configuration options are exposed that control the filter
behavior:</p>
+
+<table>
+ <thead>
+ <tr>
+ <th style="text-align: left">Property</th>
+ <th style="text-align: left">Type</th>
+ <th style="text-align: left">Default</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td style="text-align: left">orc.sarg.to.filter</td>
+ <td style="text-align: left">boolean</td>
+ <td style="text-align: left">false</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">orc.filter.use.selected</td>
+ <td style="text-align: left">boolean</td>
+ <td style="text-align: left">false</td>
+ </tr>
+ </tbody>
+</table>
+
+<ul>
+ <li><code class="highlighter-rouge">orc.sarg.to.filter</code> can be used to
turn off the SArg to filter conversion. This might be particularly relevant in
+cases where the filter is expensive and does not eliminate a lot of records.
This will not be relevant once we have
+the option to turn off the filters on the caller as they have been completely
implemented by the ORC layer.</li>
+ <li><code class="highlighter-rouge">orc.filter.use.selected</code> is an
important setting that if incorrectly enabled results in wrong output. A
boolean flag
+to determine if the selected vector is supported by the reading application.
If false, the output of the ORC reader
+must have the filter reapplied to avoid using unset values in the unselected
rows. If unsure please leave this as
+false.</li>
+</ul>
+
+<h2 id="tests-">Tests <a id="Tests"></a></h2>
+
+<p>We evaluated this patch against a search job with the following stats:</p>
+
+<ul>
+ <li>Table
+ <ul>
+ <li>Size: ~<strong>420 TB</strong></li>
+ <li>Data fields: ~<strong>120</strong></li>
+ <li>Partition fields: <strong>3</strong></li>
+ </ul>
+ </li>
+ <li>Scan
+ <ul>
+ <li>Search fields: 3 data fields with large (~ 1000 value) IN clauses
compounded by <strong>OR</strong>.</li>
+ <li>Select fields: 16 data fields (includes the 3 search fields), 1
partition field</li>
+ <li>Search:
+ <ul>
+ <li>Size: ~<strong>180 TB</strong></li>
+ <li>Records: <strong>3.99 T</strong></li>
+ </ul>
+ </li>
+ <li>Selected:
+ <ul>
+ <li>Size: ~<strong>100 MB</strong></li>
+ <li>Records: <strong>1 M</strong></li>
+ </ul>
+ </li>
+ </ul>
+ </li>
+</ul>
+
+<p>We have observed the following reductions:</p>
+
+<table>
+ <thead>
+ <tr>
+ <th style="text-align: left">Test</th>
+ <th style="text-align: right">IO Reduction %</th>
+ <th style="text-align: right">CPU Reduction %</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td style="text-align: left">SELECT 16 cols</td>
+ <td style="text-align: right">45</td>
+ <td style="text-align: right">47</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SELECT *</td>
+ <td style="text-align: right">70</td>
+ <td style="text-align: right">87</td>
+ </tr>
+ </tbody>
+</table>
+
+<ul>
+ <li>The savings are more significant as you increase the number of select
columns with respect to the search columns</li>
+ <li>When the filter selects most data, no significant penalty observed as a
result of 2 IO compared with a single IO
+ <ul>
+ <li>We do have a penalty as a result of the double filter application
both in ORC and in the calling engine.</li>
+ </ul>
+ </li>
+</ul>
+
+<h2 id="appendix-">Appendix <a id="Appendix"></a></h2>
+
+<h3 id="benchmarks-">Benchmarks <a id="Benchmarks"></a></h3>
+
+<h4 id="row-vs-vector-">Row vs Vector <a id="RowvsVector"></a></h4>
+
+<p>We start with a decision of using a Row filter vs a Vector filter. The Row
filter has the advantage of simpler code when
+compared with the Vector filter.</p>
+
+<div class="language-bash highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>java <span class="nt">-jar</span>
java/bench/core/target/orc-benchmarks-core-<span class="k">*</span><span
class="nt">-uber</span>.jar filter simple
+</code></pre></div></div>
+
+<table>
+ <thead>
+ <tr>
+ <th style="text-align: left">Benchmark</th>
+ <th style="text-align: right">(fInSize)</th>
+ <th style="text-align: left">(fType)</th>
+ <th style="text-align: left">Mode</th>
+ <th style="text-align: right">Cnt</th>
+ <th style="text-align: right">Score</th>
+ <th style="text-align: left">Error</th>
+ <th style="text-align: left">Units</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">4</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">38.207</td>
+ <td style="text-align: left">± 0.178</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">4</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">18.663</td>
+ <td style="text-align: left">± 0.117</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">8</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">50.694</td>
+ <td style="text-align: left">± 0.313</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">8</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">35.532</td>
+ <td style="text-align: left">± 0.190</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">16</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">52.443</td>
+ <td style="text-align: left">± 0.268</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">16</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">33.966</td>
+ <td style="text-align: left">± 0.204</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">32</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">68.504</td>
+ <td style="text-align: left">± 0.318</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">32</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">51.707</td>
+ <td style="text-align: left">± 0.302</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">256</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">88.348</td>
+ <td style="text-align: left">± 0.793</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">SimpleFilter</td>
+ <td style="text-align: right">256</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">72.602</td>
+ <td style="text-align: left">± 0.282</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ </tbody>
+</table>
+
+<p>Explanation:</p>
+
+<ul>
+ <li><strong>fInSize</strong> calls out the number of values in the IN
clause.</li>
+ <li><strong>fType</strong> calls out the whether the filter is a row based
filter, or a vector based filter.</li>
+</ul>
+
+<p>Observations:</p>
+
+<ul>
+ <li>The vector based filter is significantly faster than the row based
filter.
+ <ul>
+ <li>At best, vector was faster by <strong>51.15%</strong></li>
+ <li>At worst, vector was faster by <strong>17.82%</strong></li>
+ </ul>
+ </li>
+ <li>The performance of the filters is deteriorates with the increase of the
IN values, however even in this case the
+vector filter is much better than the row filter. The current <code
class="highlighter-rouge">IN</code> filter employs a binary search on an array
instead
+of a hash lookup.</li>
+</ul>
+
+<h4 id="normalization-vs-compact-">Normalization vs Compact <a
id="NormalizationvsCompact"></a></h4>
+
+<p>In this test we use a complex filter with both AND, and OR to understand
the impact of Conjunctive Normal Form on the
+filter performance. The Search Argument builder by default performs a CNF. The
advantage of the CNF would again be a
+simpler code base.</p>
+
+<div class="language-bash highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>java <span class="nt">-jar</span>
java/bench/core/target/orc-benchmarks-core-<span class="k">*</span><span
class="nt">-uber</span>.jar filter complex
+</code></pre></div></div>
+
+<table>
+ <thead>
+ <tr>
+ <th style="text-align: left">Benchmark</th>
+ <th style="text-align: right">(fSize)</th>
+ <th style="text-align: left">(fType)</th>
+ <th style="text-align: left">(normalize)</th>
+ <th style="text-align: left">Mode</th>
+ <th style="text-align: right">Cnt</th>
+ <th style="text-align: right">Score</th>
+ <th style="text-align: left">Error</th>
+ <th style="text-align: left">Units</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">2</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">true</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">91.922</td>
+ <td style="text-align: left">± 0.301</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">2</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">false</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">90.741</td>
+ <td style="text-align: left">± 0.556</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">2</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">true</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">61.137</td>
+ <td style="text-align: left">± 0.398</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">2</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">false</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">54.829</td>
+ <td style="text-align: left">± 0.431</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">4</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">true</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">284.956</td>
+ <td style="text-align: left">± 1.237</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">4</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">false</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">130.526</td>
+ <td style="text-align: left">± 0.767</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">4</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">true</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">242.387</td>
+ <td style="text-align: left">± 1.053</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">4</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">false</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">98.530</td>
+ <td style="text-align: left">± 0.423</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">8</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">true</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">8007.101</td>
+ <td style="text-align: left">± 54.912</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">8</td>
+ <td style="text-align: left">row</td>
+ <td style="text-align: left">false</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">234.943</td>
+ <td style="text-align: left">± 4.713</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">8</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">true</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">7013.758</td>
+ <td style="text-align: left">± 33.701</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ <tr>
+ <td style="text-align: left">ComplexFilter</td>
+ <td style="text-align: right">8</td>
+ <td style="text-align: left">vector</td>
+ <td style="text-align: left">false</td>
+ <td style="text-align: left">avgt</td>
+ <td style="text-align: right">20</td>
+ <td style="text-align: right">190.442</td>
+ <td style="text-align: left">± 0.881</td>
+ <td style="text-align: left">us/op</td>
+ </tr>
+ </tbody>
+</table>
+
+<p>Explanation:</p>
+
+<ul>
+ <li><strong>fSize</strong> identifies the size of the children in the OR
clause that will be normalized.</li>
+ <li><strong>normalize</strong> identifies whether normalize was carried out
on the Search Argument.</li>
+</ul>
+
+<p>Observations:</p>
+
+<ul>
+ <li>Vector filter is better than the row filter as demonstrated by the <a
href="#RowvsVector">Row vs Vector Test</a>.</li>
+ <li>Normalizing the search argument results in a significant performance
penalty given the explosion of the operator tree
+ <ul>
+ <li>In case where an AND includes 8 ORs, the compact version is faster
by <strong>97.29%</strong></li>
+ </ul>
+ </li>
+</ul>
+
+<h4 id="summary-">Summary <a id="Summary"></a></h4>
+
+<p>Based on the benchmarks we have the following conclusions:</p>
+
+<ul>
+ <li>Vector based filter is significantly better than a row based filter and
justifies the more complex code.</li>
+ <li>Compact filter is significantly faster than a normalized filter.</li>
+</ul>
+
+
+ </article>
+ </div>
+
+ <div class="clear"></div>
+
+ </div>
+</section>
+
+
+ <footer role="contentinfo">
+ <p>The contents of this website are © 2022
+ <a href="https://www.apache.org/">Apache Software Foundation</a>
+ under the terms of the <a
+ href="https://www.apache.org/licenses/LICENSE-2.0.html">
+ Apache License v2</a>. Apache ORC and its logo are trademarks
+ of the Apache Software Foundation.</p>
+</footer>
+
+ <script>
+ var anchorForId = function (id) {
+ var anchor = document.createElement("a");
+ anchor.className = "header-link";
+ anchor.href = "#" + id;
+ anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa
fa-link\"></i>";
+ anchor.title = "Permalink";
+ return anchor;
+ };
+
+ var linkifyAnchors = function (level, containingElement) {
+ var headers = containingElement.getElementsByTagName("h" + level);
+ for (var h = 0; h < headers.length; h++) {
+ var header = headers[h];
+
+ if (typeof header.id !== "undefined" && header.id !== "") {
+ header.appendChild(anchorForId(header.id));
+ }
+ }
+ };
+
+ document.onreadystatechange = function () {
+ if (this.readyState === "complete") {
+ var contentBlock = document.getElementsByClassName("docs")[0] ||
document.getElementsByClassName("news")[0];
+ if (!contentBlock) {
+ return;
+ }
+ for (var level = 1; level <= 6; level++) {
+ linkifyAnchors(level, contentBlock);
+ }
+ }
+ };
+</script>
+
+
+</body>
+</html>
diff --git a/develop/index.html b/develop/index.html
index 8edf7f005..6db99a372 100644
--- a/develop/index.html
+++ b/develop/index.html
@@ -122,6 +122,9 @@ with archive <a
href="https://mail-archives.apache.org/mod_mbox/orc-commits/">he
<p>Each code change requires a <a
href="https://issues.apache.org/jira/browse/ORC">jira</a> to track the
discussion of the change.</p>
+<h2 id="design">Design</h2>
+<p>Some code changes provide <a href="design">design/additional
documentation</a>.</p>
+
<h2 id="source-code">Source code</h2>
<p>ORC uses git for version control. Get the source code and configure it