http://git-wip-us.apache.org/repos/asf/arrow-site/blob/62ef7145/docs/latest/_modules/pyarrow/parquet.html ---------------------------------------------------------------------- diff --git a/docs/latest/_modules/pyarrow/parquet.html b/docs/latest/_modules/pyarrow/parquet.html new file mode 100644 index 0000000..70ad977 --- /dev/null +++ b/docs/latest/_modules/pyarrow/parquet.html @@ -0,0 +1,1543 @@ + + + +<!DOCTYPE html> +<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]--> +<head> + <meta charset="utf-8"> + + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + + <title>pyarrow.parquet — Apache Arrow v0.11.1.dev473+g6ed02454</title> + + + + + + + + + + + + + + + + <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" /> + <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" /> + <link rel="index" title="Index" href="../../genindex.html" /> + <link rel="search" title="Search" href="../../search.html" /> + + + <script src="../../_static/js/modernizr.min.js"></script> + +</head> + +<body class="wy-body-for-nav"> + + + <div class="wy-grid-for-nav"> + + + <nav data-toggle="wy-nav-shift" class="wy-nav-side"> + <div class="wy-side-scroll"> + <div class="wy-side-nav-search"> + + + + <a href="../../index.html" class="icon icon-home"> Apache Arrow + + + + </a> + + + + + <div class="version"> + 0.11.1.dev473+g6ed02454 + </div> + + + + +<div role="search"> + <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get"> + <input type="text" name="q" placeholder="Search docs" /> + <input type="hidden" name="check_keywords" value="yes" /> + <input type="hidden" name="area" value="default" /> + </form> +</div> + + + </div> + + <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> + + + + + + + <p class="caption"><span class="caption-text">Memory Format</span></p> 
+<ul> +<li class="toctree-l1"><a class="reference internal" href="../../format/README.html">Arrow specification documents</a></li> +<li class="toctree-l1"><a class="reference internal" href="../../format/Guidelines.html">Implementation guidelines</a></li> +<li class="toctree-l1"><a class="reference internal" href="../../format/Layout.html">Physical memory layout</a></li> +<li class="toctree-l1"><a class="reference internal" href="../../format/Metadata.html">Metadata: Logical types, schemas, data headers</a></li> +<li class="toctree-l1"><a class="reference internal" href="../../format/IPC.html">Interprocess messaging / communication (IPC)</a></li> +</ul> +<p class="caption"><span class="caption-text">Languages</span></p> +<ul> +<li class="toctree-l1"><a class="reference internal" href="../../cpp/index.html">C++ Implementation</a></li> +<li class="toctree-l1"><a class="reference internal" href="../../python/index.html">Python bindings</a></li> +</ul> + + + + </div> + </div> + </nav> + + <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> + + + <nav class="wy-nav-top" aria-label="top navigation"> + + <i data-toggle="wy-nav-top" class="fa fa-bars"></i> + <a href="../../index.html">Apache Arrow</a> + + </nav> + + + <div class="wy-nav-content"> + + <div class="rst-content"> + + + + + + + + + + + + + + + + + +<div role="navigation" aria-label="breadcrumbs navigation"> + + <ul class="wy-breadcrumbs"> + + <li><a href="../../index.html">Docs</a> »</li> + + <li><a href="../index.html">Module code</a> »</li> + + <li><a href="../pyarrow.html">pyarrow</a> »</li> + + <li>pyarrow.parquet</li> + + + <li class="wy-breadcrumbs-aside"> + + </li> + + </ul> + + + <hr/> +</div> + <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> + <div itemprop="articleBody"> + + <h1>Source code for pyarrow.parquet</h1><div class="highlight"><pre> +<span></span><span class="c1"># Licensed to the Apache Software Foundation (ASF) under one</span> 
+<span class="c1"># or more contributor license agreements. See the NOTICE file</span> +<span class="c1"># distributed with this work for additional information</span> +<span class="c1"># regarding copyright ownership. The ASF licenses this file</span> +<span class="c1"># to you under the Apache License, Version 2.0 (the</span> +<span class="c1"># "License"); you may not use this file except in compliance</span> +<span class="c1"># with the License. You may obtain a copy of the License at</span> +<span class="c1">#</span> +<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> +<span class="c1">#</span> +<span class="c1"># Unless required by applicable law or agreed to in writing,</span> +<span class="c1"># software distributed under the License is distributed on an</span> +<span class="c1"># "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY</span> +<span class="c1"># KIND, either express or implied. See the License for the</span> +<span class="c1"># specific language governing permissions and limitations</span> +<span class="c1"># under the License.</span> + +<span class="kn">from</span> <span class="nn">collections</span> <span class="k">import</span> <span class="n">defaultdict</span> +<span class="kn">from</span> <span class="nn">concurrent</span> <span class="k">import</span> <span class="n">futures</span> + +<span class="kn">from</span> <span class="nn">six.moves.urllib.parse</span> <span class="k">import</span> <span class="n">urlparse</span> +<span class="kn">import</span> <span class="nn">json</span> +<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> +<span class="kn">import</span> <span class="nn">os</span> +<span class="kn">import</span> <span class="nn">re</span> +<span class="kn">import</span> <span class="nn">six</span> + +<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span> +<span class="kn">import</span> 
<span class="nn">pyarrow.lib</span> <span class="k">as</span> <span class="nn">lib</span> +<span class="kn">import</span> <span class="nn">pyarrow._parquet</span> <span class="k">as</span> <span class="nn">_parquet</span> + +<span class="kn">from</span> <span class="nn">pyarrow._parquet</span> <span class="k">import</span> <span class="p">(</span><span class="n">ParquetReader</span><span class="p">,</span> <span class="n">RowGroupStatistics</span><span class="p">,</span> <span class="c1"># noqa</span> + <span class="n">FileMetaData</span><span class="p">,</span> <span class="n">RowGroupMetaData</span><span class="p">,</span> + <span class="n">ColumnChunkMetaData</span><span class="p">,</span> + <span class="n">ParquetSchema</span><span class="p">,</span> <span class="n">ColumnSchema</span><span class="p">)</span> +<span class="kn">from</span> <span class="nn">pyarrow.compat</span> <span class="k">import</span> <span class="n">guid</span> +<span class="kn">from</span> <span class="nn">pyarrow.filesystem</span> <span class="k">import</span> <span class="p">(</span><span class="n">LocalFileSystem</span><span class="p">,</span> <span class="n">_ensure_filesystem</span><span class="p">,</span> + <span class="n">get_filesystem_from_uri</span><span class="p">)</span> +<span class="kn">from</span> <span class="nn">pyarrow.util</span> <span class="k">import</span> <span class="n">_is_path_like</span><span class="p">,</span> <span class="n">_stringify_path</span> + +<span class="n">_URI_STRIP_SCHEMES</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'hdfs'</span><span class="p">,)</span> + + +<span class="k">def</span> <span class="nf">_parse_uri</span><span class="p">(</span><span class="n">path</span><span class="p">):</span> + <span class="n">path</span> <span class="o">=</span> <span class="n">_stringify_path</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> + <span class="n">parsed_uri</span> <span 
class="o">=</span> <span class="n">urlparse</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> + <span class="k">if</span> <span class="n">parsed_uri</span><span class="o">.</span><span class="n">scheme</span> <span class="ow">in</span> <span class="n">_URI_STRIP_SCHEMES</span><span class="p">:</span> + <span class="k">return</span> <span class="n">parsed_uri</span><span class="o">.</span><span class="n">path</span> + <span class="k">else</span><span class="p">:</span> + <span class="c1"># ARROW-4073: On Windows returning the path with the scheme</span> + <span class="c1"># stripped removes the drive letter, if any</span> + <span class="k">return</span> <span class="n">path</span> + + +<span class="k">def</span> <span class="nf">_get_filesystem_and_path</span><span class="p">(</span><span class="n">passed_filesystem</span><span class="p">,</span> <span class="n">path</span><span class="p">):</span> + <span class="k">if</span> <span class="n">passed_filesystem</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">return</span> <span class="n">get_filesystem_from_uri</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">passed_filesystem</span> <span class="o">=</span> <span class="n">_ensure_filesystem</span><span class="p">(</span><span class="n">passed_filesystem</span><span class="p">)</span> + <span class="n">parsed_path</span> <span class="o">=</span> <span class="n">_parse_uri</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> + <span class="k">return</span> <span class="n">passed_filesystem</span><span class="p">,</span> <span class="n">parsed_path</span> + + +<span class="k">def</span> <span class="nf">_check_contains_null</span><span class="p">(</span><span class="n">val</span><span class="p">):</span> + <span class="k">if</span> <span 
class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">six</span><span class="o">.</span><span class="n">binary_type</span><span class="p">):</span> + <span class="k">for</span> <span class="n">byte</span> <span class="ow">in</span> <span class="n">val</span><span class="p">:</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">byte</span><span class="p">,</span> <span class="n">six</span><span class="o">.</span><span class="n">binary_type</span><span class="p">):</span> + <span class="n">compare_to</span> <span class="o">=</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">compare_to</span> <span class="o">=</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">byte</span> <span class="o">==</span> <span class="n">compare_to</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">True</span> + <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">six</span><span class="o">.</span><span class="n">text_type</span><span class="p">):</span> + <span class="k">return</span> <span class="sa">u</span><span class="s1">'</span><span class="se">\x00</span><span class="s1">'</span> <span class="ow">in</span> <span class="n">val</span> + <span class="k">return</span> <span class="kc">False</span> + + +<span class="k">def</span> <span class="nf">_check_filters</span><span class="p">(</span><span class="n">filters</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Check if filters are well-formed.</span> +<span class="sd"> """</span> + <span class="k">if</span> <span class="n">filters</span> <span class="ow">is</span> <span class="ow">not</span> <span 
class="kc">None</span><span class="p">:</span> + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">filters</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="nb">any</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">filters</span><span class="p">):</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Malformed filters"</span><span class="p">)</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filters</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">],</span> <span class="n">six</span><span class="o">.</span><span class="n">string_types</span><span class="p">):</span> + <span class="c1"># We have encountered the situation where we have one nesting level</span> + <span class="c1"># too few:</span> + <span class="c1"># We have [(,,), ..] 
instead of [[(,,), ..]]</span> + <span class="n">filters</span> <span class="o">=</span> <span class="p">[</span><span class="n">filters</span><span class="p">]</span> + <span class="k">for</span> <span class="n">conjunction</span> <span class="ow">in</span> <span class="n">filters</span><span class="p">:</span> + <span class="k">for</span> <span class="n">col</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="n">conjunction</span><span class="p">:</span> + <span class="k">if</span> <span class="p">(</span> + <span class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> + <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="n">_check_contains_null</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">val</span><span class="p">)</span> + <span class="ow">or</span> <span class="n">_check_contains_null</span><span class="p">(</span><span class="n">val</span><span class="p">)</span> + <span class="p">):</span> + <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> + <span class="s2">"Null-terminated binary strings are not supported as"</span> + <span class="s2">" filter values."</span> + <span class="p">)</span> + <span class="k">return</span> <span class="n">filters</span> + +<span class="c1"># ----------------------------------------------------------------------</span> +<span class="c1"># Reading a single Parquet file</span> + + +<div class="viewcode-block" id="ParquetFile"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile">[docs]</a><span class="k">class</span> <span class="nc">ParquetFile</span><span 
class="p">(</span><span class="nb">object</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Reader interface for a single Parquet file</span> + +<span class="sd"> Parameters</span> +<span class="sd"> ----------</span> +<span class="sd"> source : str, pathlib.Path, pyarrow.NativeFile, or file-like object</span> +<span class="sd"> Readable source. For passing bytes or buffer-like file containing a</span> +<span class="sd"> Parquet file, use pyarrow.BufferReader</span> +<span class="sd"> metadata : ParquetFileMetadata, default None</span> +<span class="sd"> Use existing metadata object, rather than reading from file.</span> +<span class="sd"> common_metadata : ParquetFileMetadata, default None</span> +<span class="sd"> Will be used in reads for pandas schema metadata if not found in the</span> +<span class="sd"> main file's metadata, no other uses at the moment</span> +<span class="sd"> memory_map : boolean, default True</span> +<span class="sd"> If the source is a file path, use a memory map to read file, which can</span> +<span class="sd"> improve performance in some environments</span> +<span class="sd"> """</span> +<div class="viewcode-block" id="ParquetFile.__init__"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.__init__">[docs]</a> <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">source</span><span class="p">,</span> <span class="n">metadata</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">common_metadata</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> + <span class="n">memory_map</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">reader</span> <span class="o">=</span> <span 
class="n">ParquetReader</span><span class="p">()</span> + <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">use_memory_map</span><span class="o">=</span><span class="n">memory_map</span><span class="p">,</span> <span class="n">metadata</span><span class="o">=</span><span class="n">metadata</span><span class="p">)</span> + <span class="bp">self</span><span class="o">.</span><span class="n">common_metadata</span> <span class="o">=</span> <span class="n">common_metadata</span> + <span class="bp">self</span><span class="o">.</span><span class="n">_nested_paths_by_prefix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_build_nested_paths</span><span class="p">()</span></div> + + <span class="k">def</span> <span class="nf">_build_nested_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">paths</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">column_paths</span> + + <span class="n">result</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">_visit_piece</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">rest</span><span class="p">):</span> + <span class="n">result</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> + + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span 
class="n">rest</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> + <span class="n">nested_key</span> <span class="o">=</span> <span class="s1">'.'</span><span class="o">.</span><span class="n">join</span><span class="p">((</span><span class="n">key</span><span class="p">,</span> <span class="n">rest</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span> + <span class="n">_visit_piece</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">nested_key</span><span class="p">,</span> <span class="n">rest</span><span class="p">[</span><span class="mi">1</span><span class="p">:])</span> + + <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">path</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">paths</span><span class="p">):</span> + <span class="n">_visit_piece</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">path</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">path</span><span class="p">[</span><span class="mi">1</span><span class="p">:])</span> + + <span class="k">return</span> <span class="n">result</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">metadata</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">metadata</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">schema</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span><span 
class="o">.</span><span class="n">schema</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">num_row_groups</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">num_row_groups</span> + +<div class="viewcode-block" id="ParquetFile.read_row_group"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.read_row_group">[docs]</a> <span class="k">def</span> <span class="nf">read_row_group</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> + <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Read a single row group from a Parquet file</span> + +<span class="sd"> Parameters</span> +<span class="sd"> ----------</span> +<span class="sd"> columns: list</span> +<span class="sd"> If not None, only these columns will be read from the row group. A</span> +<span class="sd"> column name may be a prefix of a nested field, e.g. 
'a' will select</span> +<span class="sd"> 'a.b', 'a.c', and 'a.d.e'</span> +<span class="sd"> use_threads : boolean, default True</span> +<span class="sd"> Perform multi-threaded column reads</span> +<span class="sd"> use_pandas_metadata : boolean, default False</span> +<span class="sd"> If True and file has custom pandas schema metadata, ensure that</span> +<span class="sd"> index columns are also loaded</span> + +<span class="sd"> Returns</span> +<span class="sd"> -------</span> +<span class="sd"> pyarrow.table.Table</span> +<span class="sd"> Content of the row group as a table (of columns)</span> +<span class="sd"> """</span> + <span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span> + <span class="n">columns</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">read_row_group</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">column_indices</span><span class="o">=</span><span class="n">column_indices</span><span class="p">,</span> + <span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">)</span></div> + +<div class="viewcode-block" id="ParquetFile.read"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.read">[docs]</a> <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span 
class="kc">True</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Read a Table from Parquet format</span> + +<span class="sd"> Parameters</span> +<span class="sd"> ----------</span> +<span class="sd"> columns: list</span> +<span class="sd"> If not None, only these columns will be read from the file. A</span> +<span class="sd"> column name may be a prefix of a nested field, e.g. 'a' will select</span> +<span class="sd"> 'a.b', 'a.c', and 'a.d.e'</span> +<span class="sd"> use_threads : boolean, default True</span> +<span class="sd"> Perform multi-threaded column reads</span> +<span class="sd"> use_pandas_metadata : boolean, default False</span> +<span class="sd"> If True and file has custom pandas schema metadata, ensure that</span> +<span class="sd"> index columns are also loaded</span> + +<span class="sd"> Returns</span> +<span class="sd"> -------</span> +<span class="sd"> pyarrow.table.Table</span> +<span class="sd"> Content of the file as a table (of columns)</span> +<span class="sd"> """</span> + <span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span> + <span class="n">columns</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">read_all</span><span class="p">(</span><span class="n">column_indices</span><span class="o">=</span><span class="n">column_indices</span><span class="p">,</span> + <span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">)</span></div> + +<div 
class="viewcode-block" id="ParquetFile.scan_contents"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.scan_contents">[docs]</a> <span class="k">def</span> <span class="nf">scan_contents</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">batch_size</span><span class="o">=</span><span class="mi">65536</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Read contents of file with a single thread for indicated columns and</span> +<span class="sd"> batch size. Number of rows in file is returned. This function is used</span> +<span class="sd"> for benchmarking</span> + +<span class="sd"> Parameters</span> +<span class="sd"> ----------</span> +<span class="sd"> columns : list of integers, default None</span> +<span class="sd"> If None, scan all columns</span> +<span class="sd"> batch_size : int, default 64K</span> +<span class="sd"> Number of rows to read at a time internally</span> + +<span class="sd"> Returns</span> +<span class="sd"> -------</span> +<span class="sd"> num_rows : number of rows in file</span> +<span class="sd"> """</span> + <span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">scan_contents</span><span class="p">(</span><span class="n">column_indices</span><span class="p">,</span> + <span class="n">batch_size</span><span class="o">=</span><span class="n">batch_size</span><span class="p">)</span></div> + + <span class="k">def</span> <span class="nf">_get_column_indices</span><span 
class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">column_names</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> + <span class="k">if</span> <span class="n">column_names</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">None</span> + + <span class="n">indices</span> <span class="o">=</span> <span class="p">[]</span> + + <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">column_names</span><span class="p">:</span> + <span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_nested_paths_by_prefix</span><span class="p">:</span> + <span class="n">indices</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_nested_paths_by_prefix</span><span class="p">[</span><span class="n">name</span><span class="p">])</span> + + <span class="k">if</span> <span class="n">use_pandas_metadata</span><span class="p">:</span> + <span class="n">file_keyvalues</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">metadata</span> + <span class="n">common_keyvalues</span> <span class="o">=</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">common_metadata</span><span class="o">.</span><span class="n">metadata</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">common_metadata</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> + <span class="k">else</span> <span class="kc">None</span><span 
class="p">)</span> + + <span class="k">if</span> <span class="n">file_keyvalues</span> <span class="ow">and</span> <span class="sa">b</span><span class="s1">'pandas'</span> <span class="ow">in</span> <span class="n">file_keyvalues</span><span class="p">:</span> + <span class="n">index_columns</span> <span class="o">=</span> <span class="n">_get_pandas_index_columns</span><span class="p">(</span><span class="n">file_keyvalues</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">common_keyvalues</span> <span class="ow">and</span> <span class="sa">b</span><span class="s1">'pandas'</span> <span class="ow">in</span> <span class="n">common_keyvalues</span><span class="p">:</span> + <span class="n">index_columns</span> <span class="o">=</span> <span class="n">_get_pandas_index_columns</span><span class="p">(</span><span class="n">common_keyvalues</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">index_columns</span> <span class="o">=</span> <span class="p">[]</span> + + <span class="k">if</span> <span class="n">indices</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">index_columns</span><span class="p">:</span> + <span class="n">indices</span> <span class="o">+=</span> <span class="nb">map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">column_name_idx</span><span class="p">,</span> <span class="n">index_columns</span><span class="p">)</span> + + <span class="k">return</span> <span class="n">indices</span></div> + + +<span class="n">_SPARK_DISALLOWED_CHARS</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s1">'[ ,;</span><span class="si">{}</span><span class="s1">()</span><span class="se">\n\t</span><span 
class="s1">=]'</span><span class="p">)</span> + + +<span class="k">def</span> <span class="nf">_sanitized_spark_field_name</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">_SPARK_DISALLOWED_CHARS</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s1">'_'</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span> + + +<span class="k">def</span> <span class="nf">_sanitize_schema</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">flavor</span><span class="p">):</span> + <span class="k">if</span> <span class="s1">'spark'</span> <span class="ow">in</span> <span class="n">flavor</span><span class="p">:</span> + <span class="n">sanitized_fields</span> <span class="o">=</span> <span class="p">[]</span> + + <span class="n">schema_changed</span> <span class="o">=</span> <span class="kc">False</span> + + <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">schema</span><span class="p">:</span> + <span class="n">name</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">name</span> + <span class="n">sanitized_name</span> <span class="o">=</span> <span class="n">_sanitized_spark_field_name</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> + + <span class="k">if</span> <span class="n">sanitized_name</span> <span class="o">!=</span> <span class="n">name</span><span class="p">:</span> + <span class="n">schema_changed</span> <span class="o">=</span> <span class="kc">True</span> + <span class="n">sanitized_field</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="n">sanitized_name</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span 
class="n">type</span><span class="p">,</span> + <span class="n">field</span><span class="o">.</span><span class="n">nullable</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span class="n">metadata</span><span class="p">)</span> + <span class="n">sanitized_fields</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">sanitized_field</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">sanitized_fields</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">field</span><span class="p">)</span> + + <span class="n">new_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">sanitized_fields</span><span class="p">,</span> <span class="n">metadata</span><span class="o">=</span><span class="n">schema</span><span class="o">.</span><span class="n">metadata</span><span class="p">)</span> + <span class="k">return</span> <span class="n">new_schema</span><span class="p">,</span> <span class="n">schema_changed</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">schema</span><span class="p">,</span> <span class="kc">False</span> + + +<span class="k">def</span> <span class="nf">_sanitize_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">new_schema</span><span class="p">,</span> <span class="n">flavor</span><span class="p">):</span> + <span class="c1"># TODO: This will not handle prohibited characters in nested field names</span> + <span class="k">if</span> <span class="s1">'spark'</span> <span class="ow">in</span> <span class="n">flavor</span><span class="p">:</span> + <span class="n">column_data</span> <span class="o">=</span> <span class="p">[</span><span class="n">table</span><span 
class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">data</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">num_columns</span><span class="p">)]</span> + <span class="k">return</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">(</span><span class="n">column_data</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">new_schema</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">table</span> + + +<span class="n">_parquet_writer_arg_docs</span> <span class="o">=</span> <span class="s2">"""version : {"1.0", "2.0"}, default "1.0"</span> +<span class="s2"> The Parquet format version, defaults to 1.0</span> +<span class="s2">use_dictionary : bool or list</span> +<span class="s2"> Specify if we should use dictionary encoding in general or only for</span> +<span class="s2"> some columns.</span> +<span class="s2">use_deprecated_int96_timestamps : boolean, default None</span> +<span class="s2"> Write timestamps to INT96 Parquet format. Defaults to False unless enabled</span> +<span class="s2"> by flavor argument. This take priority over the coerce_timestamps option.</span> +<span class="s2">coerce_timestamps : string, default None</span> +<span class="s2"> Cast timestamps a particular resolution.</span> +<span class="s2"> Valid values: {None, 'ms', 'us'}</span> +<span class="s2">allow_truncated_timestamps : boolean, default False</span> +<span class="s2"> Allow loss of data when coercing timestamps to a particular</span> +<span class="s2"> resolution. E.g. 
if microsecond or nanosecond data is lost when coercing to</span> +<span class="s2"> 'ms', do not raise an exception</span> +<span class="s2">compression : str or dict</span> +<span class="s2"> Specify the compression codec, either on a general basis or per-column.</span> +<span class="s2"> Valid values: {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD'}</span> +<span class="s2">flavor : {'spark'}, default None</span> +<span class="s2"> Sanitize schema or set other compatibility options for compatibility"""</span> + + +<div class="viewcode-block" id="ParquetWriter"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter">[docs]</a><span class="k">class</span> <span class="nc">ParquetWriter</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> + + <span class="vm">__doc__</span> <span class="o">=</span> <span class="s2">"""</span> +<span class="s2">Class for incrementally building a Parquet file for Arrow tables</span> + +<span class="s2">Parameters</span> +<span class="s2">----------</span> +<span class="s2">where : path or file-like object</span> +<span class="s2">schema : arrow Schema</span> +<span class="si">{0}</span><span class="s2"></span> +<span class="s2">"""</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">_parquet_writer_arg_docs</span><span class="p">)</span> + +<div class="viewcode-block" id="ParquetWriter.__init__"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.__init__">[docs]</a> <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">where</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">flavor</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> + <span 
class="n">version</span><span class="o">=</span><span class="s1">'1.0'</span><span class="p">,</span> + <span class="n">use_dictionary</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> + <span class="n">compression</span><span class="o">=</span><span class="s1">'snappy'</span><span class="p">,</span> + <span class="n">use_deprecated_int96_timestamps</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> + <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">):</span> + <span class="k">if</span> <span class="n">use_deprecated_int96_timestamps</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="c1"># Use int96 timestamps for Spark</span> + <span class="k">if</span> <span class="n">flavor</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="s1">'spark'</span> <span class="ow">in</span> <span class="n">flavor</span><span class="p">:</span> + <span class="n">use_deprecated_int96_timestamps</span> <span class="o">=</span> <span class="kc">True</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">use_deprecated_int96_timestamps</span> <span class="o">=</span> <span class="kc">False</span> + + <span class="bp">self</span><span class="o">.</span><span class="n">flavor</span> <span class="o">=</span> <span class="n">flavor</span> + <span class="k">if</span> <span class="n">flavor</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">schema</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema_changed</span> <span class="o">=</span> <span class="n">_sanitize_schema</span><span class="p">(</span><span 
class="n">schema</span><span class="p">,</span> <span class="n">flavor</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="bp">self</span><span class="o">.</span><span class="n">schema_changed</span> <span class="o">=</span> <span class="kc">False</span> + + <span class="bp">self</span><span class="o">.</span><span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span> + <span class="bp">self</span><span class="o">.</span><span class="n">where</span> <span class="o">=</span> <span class="n">where</span> + + <span class="c1"># If we open a file using an implied filesystem, so it can be assured</span> + <span class="c1"># to be closed</span> + <span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span> <span class="o">=</span> <span class="kc">None</span> + + <span class="k">if</span> <span class="n">_is_path_like</span><span class="p">(</span><span class="n">where</span><span class="p">):</span> + <span class="n">fs</span><span class="p">,</span> <span class="n">path</span> <span class="o">=</span> <span class="n">_get_filesystem_and_path</span><span class="p">(</span><span class="n">filesystem</span><span class="p">,</span> <span class="n">where</span><span class="p">)</span> + <span class="n">sink</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="s1">'wb'</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">sink</span> <span class="o">=</span> <span class="n">where</span> + + <span class="bp">self</span><span class="o">.</span><span class="n">writer</span> <span class="o">=</span> <span class="n">_parquet</span><span class="o">.</span><span 
class="n">ParquetWriter</span><span class="p">(</span> + <span class="n">sink</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> + <span class="n">version</span><span class="o">=</span><span class="n">version</span><span class="p">,</span> + <span class="n">compression</span><span class="o">=</span><span class="n">compression</span><span class="p">,</span> + <span class="n">use_dictionary</span><span class="o">=</span><span class="n">use_dictionary</span><span class="p">,</span> + <span class="n">use_deprecated_int96_timestamps</span><span class="o">=</span><span class="n">use_deprecated_int96_timestamps</span><span class="p">,</span> + <span class="o">**</span><span class="n">options</span><span class="p">)</span> + <span class="bp">self</span><span class="o">.</span><span class="n">is_open</span> <span class="o">=</span> <span class="kc">True</span></div> + + <span class="k">def</span> <span class="nf">__del__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s1">'is_open'</span><span class="p">,</span> <span class="kc">False</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">close</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span> + + <span class="k">def</span> <span class="nf">__exit__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">close</span><span 
class="p">()</span> + <span class="c1"># return false since we want to propagate exceptions</span> + <span class="k">return</span> <span class="kc">False</span> + +<div class="viewcode-block" id="ParquetWriter.write_table"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_table">[docs]</a> <span class="k">def</span> <span class="nf">write_table</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">table</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema_changed</span><span class="p">:</span> + <span class="n">table</span> <span class="o">=</span> <span class="n">_sanitize_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">flavor</span><span class="p">)</span> + <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_open</span> + + <span class="k">if</span> <span class="ow">not</span> <span class="n">table</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span> <span class="n">check_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> + <span class="n">msg</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'Table schema does not match schema used to create file: '</span> + <span class="s1">'</span><span class="se">\n</span><span 
class="s1">table:</span><span class="se">\n</span><span class="si">{0!s}</span><span class="s1"> vs. </span><span class="se">\n</span><span class="s1">file:</span><span class="se">\n</span><span class="si">{1!s}</span><span class="s1">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span> + <span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="p">))</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span> + + <span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="n">row_group_size</span><span class="p">)</span></div> + +<div class="viewcode-block" id="ParquetWriter.close"><a class="viewcode-back" href="../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.close">[docs]</a> <span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_open</span><span class="p">:</span> + <span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">close</span><span class="p">()</span> + <span class="bp">self</span><span class="o">.</span><span class="n">is_open</span> <span class="o">=</span> <span class="kc">False</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span 
class="p">:</span> + <span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span><span class="o">.</span><span class="n">close</span><span class="p">()</span></div></div> + + +<span class="k">def</span> <span class="nf">_get_pandas_index_columns</span><span class="p">(</span><span class="n">keyvalues</span><span class="p">):</span> + <span class="k">return</span> <span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">keyvalues</span><span class="p">[</span><span class="sa">b</span><span class="s1">'pandas'</span><span class="p">]</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s1">'utf8'</span><span class="p">))</span> + <span class="p">[</span><span class="s1">'index_columns'</span><span class="p">])</span> + + +<span class="c1"># ----------------------------------------------------------------------</span> +<span class="c1"># Metadata container providing instructions about reading a single Parquet</span> +<span class="c1"># file, possibly part of a partitioned dataset</span> + + +<span class="k">class</span> <span class="nc">ParquetDatasetPiece</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> A single chunk of a potentially larger Parquet dataset to read. 
The</span> +<span class="sd"> arguments will indicate to read either a single row group or all row</span> +<span class="sd"> groups, and whether to add partition keys to the resulting pyarrow.Table</span> + +<span class="sd"> Parameters</span> +<span class="sd"> ----------</span> +<span class="sd"> path : str or pathlib.Path</span> +<span class="sd"> Path to file in the file system where this piece is located</span> +<span class="sd"> partition_keys : list of tuples</span> +<span class="sd"> [(column name, ordinal index)]</span> +<span class="sd"> row_group : int, default None</span> +<span class="sd"> Row group to load. By default, reads all row groups</span> +<span class="sd"> """</span> + + <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path</span><span class="p">,</span> <span class="n">row_group</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">partition_keys</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">_stringify_path</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> + <span class="bp">self</span><span class="o">.</span><span class="n">row_group</span> <span class="o">=</span> <span class="n">row_group</span> + <span class="bp">self</span><span class="o">.</span><span class="n">partition_keys</span> <span class="o">=</span> <span class="n">partition_keys</span> <span class="ow">or</span> <span class="p">[]</span> + + <span class="k">def</span> <span class="nf">__eq__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span 
class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">ParquetDatasetPiece</span><span class="p">):</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">return</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">path</span> <span class="ow">and</span> + <span class="bp">self</span><span class="o">.</span><span class="n">row_group</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">row_group</span> <span class="ow">and</span> + <span class="bp">self</span><span class="o">.</span><span class="n">partition_keys</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">partition_keys</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">__ne__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="ow">not</span> <span class="p">(</span><span class="bp">self</span> <span class="o">==</span> <span class="n">other</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="p">(</span><span class="s1">'</span><span class="si">{0}</span><span class="s1">(</span><span class="si">{1!r}</span><span class="s1">, row_group=</span><span class="si">{2!r}</span><span class="s1">, partition_keys=</span><span class="si">{3!r}</span><span class="s1">)'</span> + <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span><span class="o">.</span><span 
class="vm">__name__</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">path</span><span class="p">,</span> + <span class="bp">self</span><span class="o">.</span><span class="n">row_group</span><span class="p">,</span> + <span class="bp">self</span><span class="o">.</span><span class="n">partition_keys</span><span class="p">))</span> + + <span class="k">def</span> <span class="nf">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">result</span> <span class="o">=</span> <span class="s1">''</span> + + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">partition_keys</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> + <span class="n">partition_str</span> <span class="o">=</span> <span class="s1">', '</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="s1">'</span><span class="si">{0}</span><span class="s1">=</span><span class="si">{1}</span><span class="s1">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">index</span><span class="p">)</span> + <span class="k">for</span> <span class="n">name</span><span class="p">,</span> <span class="n">index</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">partition_keys</span><span class="p">)</span> + <span class="n">result</span> <span class="o">+=</span> <span class="s1">'partition[</span><span class="si">{0}</span><span class="s1">] '</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">partition_str</span><span class="p">)</span> + + <span class="n">result</span> <span class="o">+=</span> <span class="bp">self</span><span 
class="o">.</span><span class="n">path</span> + + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">row_group</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">result</span> <span class="o">+=</span> <span class="s1">' | row_group=</span><span class="si">{0}</span><span class="s1">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">row_group</span><span class="p">)</span> + + <span class="k">return</span> <span class="n">result</span> + + <span class="k">def</span> <span class="nf">get_metadata</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">open_file_func</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Given a function that can create an open ParquetFile object, return the</span> +<span class="sd"> file's metadata</span> +<span class="sd"> """</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_open</span><span class="p">(</span><span class="n">open_file_func</span><span class="p">)</span><span class="o">.</span><span class="n">metadata</span> + + <span class="k">def</span> <span class="nf">_open</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">open_file_func</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Returns instance of ParquetFile</span> +<span class="sd"> """</span> + <span class="n">reader</span> <span class="o">=</span> <span class="n">open_file_func</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">path</span><span class="p">)</span> + <span class="k">if</span> 
<span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">reader</span><span class="p">,</span> <span class="n">ParquetFile</span><span class="p">):</span> + <span class="n">reader</span> <span class="o">=</span> <span class="n">ParquetFile</span><span class="p">(</span><span class="n">reader</span><span class="p">)</span> + <span class="k">return</span> <span class="n">reader</span> + + <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">partitions</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> + <span class="n">open_file_func</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">file</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Read this piece as a pyarrow.Table</span> + +<span class="sd"> Parameters</span> +<span class="sd"> ----------</span> +<span class="sd"> columns : list of column names, default None</span> +<span class="sd"> use_threads : boolean, default True</span> +<span class="sd"> Perform multi-threaded column reads</span> +<span class="sd"> partitions : ParquetPartitions, default None</span> +<span class="sd"> open_file_func : function, default None</span> +<span class="sd"> A function that knows how to construct a ParquetFile object given</span> +<span class="sd"> the file path in this piece</span> +<span class="sd"> file : file-like object</span> +<span class="sd"> passed to ParquetFile</span> + +<span class="sd"> 
Returns</span> +<span class="sd"> -------</span> +<span class="sd"> table : pyarrow.Table</span> +<span class="sd"> """</span> + <span class="k">if</span> <span class="n">open_file_func</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">reader</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_open</span><span class="p">(</span><span class="n">open_file_func</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">file</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">reader</span> <span class="o">=</span> <span class="n">ParquetFile</span><span class="p">(</span><span class="n">file</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="c1"># try to read the local path</span> + <span class="n">reader</span> <span class="o">=</span> <span class="n">ParquetFile</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">path</span><span class="p">)</span> + + <span class="n">options</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> + <span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">,</span> + <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span> + + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">row_group</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">table</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span 
class="n">read_row_group</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">row_group</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">table</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span> + + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">partition_keys</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> + <span class="k">if</span> <span class="n">partitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">'Must pass partition sets'</span><span class="p">)</span> + + <span class="c1"># Here, the index is the categorical code of the partition where</span> + <span class="c1"># this piece is located. Suppose we had</span> + <span class="c1">#</span> + <span class="c1"># /foo=a/0.parq</span> + <span class="c1"># /foo=b/0.parq</span> + <span class="c1"># /foo=c/0.parq</span> + <span class="c1">#</span> + <span class="c1"># Then we assign a=0, b=1, c=2. And the resulting Table pieces will</span> + <span class="c1"># have a DictionaryArray column named foo having the constant index</span> + <span class="c1"># value as indicated. 
The distinct categories of the partition have</span> + <span class="c1"># been computed in the ParquetManifest</span> + <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">index</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">partition_keys</span><span class="p">):</span> + <span class="c1"># The partition code is the same for all values in this piece</span> + <span class="n">indices</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="n">index</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="s1">'i4'</span><span class="p">)</span><span class="o">.</span><span class="n">repeat</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">table</span><span class="p">))</span> + + <span class="c1"># This is set of all partition values, computed as part of the</span> + <span class="c1"># manifest, so ['a', 'b', 'c'] as in our example above.</span> + <span class="n">dictionary</span> <span class="o">=</span> <span class="n">partitions</span><span class="o">.</span><span class="n">levels</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">dictionary</span> + + <span class="n">arr</span> <span class="o">=</span> <span class="n">lib</span><span class="o">.</span><span class="n">DictionaryArray</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">(</span><span class="n">indices</span><span class="p">,</span> <span class="n">dictionary</span><span class="p">)</span> + <span class="n">col</span> <span class="o">=</span> <span class="n">lib</span><span 
class="o">.</span><span class="n">Column</span><span class="o">.</span><span class="n">from_array</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">arr</span><span class="p">)</span> + <span class="n">table</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">append_column</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> + + <span class="k">return</span> <span class="n">table</span> + + +<span class="k">class</span> <span class="nc">PartitionSet</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> + <span class="sd">"""A data structure for cataloguing the observed Parquet partitions at a</span> +<span class="sd"> particular level. So if we have</span> + +<span class="sd"> /foo=a/bar=0</span> +<span class="sd"> /foo=a/bar=1</span> +<span class="sd"> /foo=a/bar=2</span> +<span class="sd"> /foo=b/bar=0</span> +<span class="sd"> /foo=b/bar=1</span> +<span class="sd"> /foo=b/bar=2</span> + +<span class="sd"> Then we have two partition sets, one for foo, another for bar. 
As we visit</span> +<span class="sd"> levels of the partition hierarchy, a PartitionSet tracks the distinct</span> +<span class="sd"> values and assigns categorical codes to use when reading the pieces</span> +<span class="sd"> """</span> + + <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">keys</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> + <span class="bp">self</span><span class="o">.</span><span class="n">keys</span> <span class="o">=</span> <span class="n">keys</span> <span class="ow">or</span> <span class="p">[]</span> + <span class="bp">self</span><span class="o">.</span><span class="n">key_indices</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">k</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">keys</span><span class="p">)}</span> + <span class="bp">self</span><span class="o">.</span><span class="n">_dictionary</span> <span class="o">=</span> <span class="kc">None</span> + + <span class="k">def</span> <span class="nf">get_index</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Get the index of the partition value if it is known, otherwise assign</span> +<span class="sd"> one</span> +<span class="sd"> """</span> + <span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span 
class="o">.</span><span class="n">key_indices</span><span class="p">:</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">key_indices</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">index</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">key_indices</span><span class="p">)</span> + <span class="bp">self</span><span class="o">.</span><span class="n">keys</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> + <span class="bp">self</span><span class="o">.</span><span class="n">key_indices</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">index</span> + <span class="k">return</span> <span class="n">index</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">dictionary</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dictionary</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dictionary</span> + + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">keys</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">'No known partition keys'</span><span class="p">)</span> + + <span class="c1"># 
Only integer and string partition types are supported right now</span> + <span class="k">try</span><span class="p">:</span> + <span class="n">integer_keys</span> <span class="o">=</span> <span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">keys</span><span class="p">]</span> + <span class="n">dictionary</span> <span class="o">=</span> <span class="n">lib</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">integer_keys</span><span class="p">)</span> + <span class="k">except</span> <span class="ne">ValueError</span><span class="p">:</span> + <span class="n">dictionary</span> <span class="o">=</span> <span class="n">lib</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">keys</span><span class="p">)</span> + + <span class="bp">self</span><span class="o">.</span><span class="n">_dictionary</span> <span class="o">=</span> <span class="n">dictionary</span> + <span class="k">return</span> <span class="n">dictionary</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">is_sorted</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">keys</span><span class="p">)</span> <span class="o">==</span> <span class="nb">sorted</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">keys</span><span class="p">)</span> + + +<span class="k">class</span> <span class="nc">ParquetPartitions</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> + + 
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">levels</span> <span class="o">=</span> <span class="p">[]</span> + <span class="bp">self</span><span class="o">.</span><span class="n">partition_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">levels</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">levels</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> + + <span class="k">def</span> <span class="nf">get_index</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">level</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">key</span><span class="p">):</span> + <span class="sd">"""</span> +<span class="sd"> Record a partition value at a particular level, returning the distinct</span> +<span class="sd"> code for that value at that level. 
Example:</span> + +<span class="sd"> partitions.get_index(1, 'foo', 'a') returns 0</span> +<span class="sd"> partitions.get_index(1, 'foo', 'b') returns 1</span> +<span class="sd"> partitions.get_index(1, 'foo', 'c') returns 2</span> +<span class="sd"> partitions.get_index(1, 'foo', 'a') returns 0</span> + +<span class="sd"> Parameters</span> +<span class="sd"> ----------</span> +<span class="sd"> level : int</span> +<span class="sd"> The nesting level of the partition we are observing</span> +<span class="sd"> name : string</span> +<span class="sd"> The partition name</span> +<span class="sd"> key : string or int</span> +<span class="sd"> The partition value</span> +<span class="sd"> """</span> + <span class="k">if</span> <span class="n">level</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">levels</span><span class="p">):</span> + <span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">partition_names</span><span class="p">:</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">'</span><span class="si">{0}</span><span class="s1"> was the name of the partition in '</span> + <span class="s1">'another level'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> + + <span class="n">part_set</span> <span class="o">=</span> <span class="n">PartitionSet</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> + <span class="bp">self</span><span class="o">.</span><span class="n">levels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">part_set</span><span class="p">)</span> + <span class="bp">self</span><span class="o">.</span><span 
class="n">partition_names</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> + + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">levels</span><span class="p">[</span><span class="n">level</span><span class="p">]</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">filter_accepts_partition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">part_key</span><span class="p">,</span> <span class="nb">filter</span><span class="p">,</span> <span class="n">level</span><span class="p">):</span> + <span class="n">p_column</span><span class="p">,</span> <span class="n">p_value_index</span> <span class="o">=</span> <span class="n">part_key</span> + <span class="n">f_column</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">f_value</span> <span class="o">=</span> <span class="nb">filter</span> + <span class="k">if</span> <span class="n">p_column</span> <span class="o">!=</span> <span class="n">f_column</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">True</span> + + <span class="n">f_type</span> <span class="o">=</span> <span class="nb">type</span><span class="p">(</span><span class="n">f_value</span><span class="p">)</span> + + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">f_value</span><span class="p">,</span> <span class="nb">set</span><span class="p">):</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">f_value</span><span class="p">:</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Cannot use empty set as filter value"</span><span class="p">)</span> 
+ <span class="k">if</span> <span class="n">op</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">{</span><span class="s1">'in'</span><span class="p">,</span> <span class="s1">'not in'</span><span class="p">}:</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Op '</span><span class="si">%s</span><span class="s2">' not supported with set value"</span><span class="p">,</span> + <span class="n">op</span><span class="p">)</span> + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">([</span><span class="nb">type</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">f_value</span><span class="p">]))</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"All elements of set '</span><span class="si">%s</span><span class="s2">' must be of"</span> + <span class="s2">" same type"</span><span class="p">,</span> <span class="n">f_value</span><span class="p">)</span> + <span class="n">f_type</span> <span class="o">=</span> <span class="nb">type</span><span class="p">(</span><span class="nb">next</span><span class="p">(</span><span class="nb">iter</span><span class="p">(</span><span class="n">f_value</span><span class="p">)))</span> + + <span class="n">p_value</span> <span class="o">=</span> <span class="n">f_type</span><span class="p">((</span><span class="bp">self</span><span class="o">.</span><span class="n">levels</span><span class="p">[</span><span class="n">level</span><span class="p">]</span> + <span class="o">.</span><span class="n">dictionary</span><span class="p">[</span><span class="n">p_value_index</span><span class="p">]</span> + <span 
class="o">.</span><span class="n">as_py</span><span class="p">()))</span> + + <span class="k">if</span> <span class="n">op</span> <span class="o">==</span> <span class="s2">"="</span> <span class="ow">or</span> <span class="n">op</span> <span class="o">==</span> <span class="s2">"=="</span><span class="p">:</span> + <span class="k">return</span> <span class="n">p_value</span> <span class="o">==</span> <span class="n">f_value</span> + <span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s2">"!="</span><span class="p">:</span> + <span class="k">return</span> <span class="n">p_value</span> <span class="o">!=</span> <span class="n">f_value</span> + <span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s1">'&lt;'</span><span class="p">:</span> + <span class="k">return</span> <span class="n">p_value</span> <span class="o">&lt;</span> <span class="n">f_value</span> + <span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <spa
<TRUNCATED>