Repository: arrow Updated Branches: refs/heads/master 5739e04b3 -> d8d3d8435
ARROW-1022: [Python] Add multithreaded read option to read_feather Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #682 from wesm/ARROW-1022 and squashes the following commits: 8fd241e [Wes McKinney] Add multithreaded read option to read_feather Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d8d3d843 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d8d3d843 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d8d3d843 Branch: refs/heads/master Commit: d8d3d84354d827e45c8267cd05aecd2aa36cf60b Parents: 5739e04 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Sun May 14 17:23:26 2017 +0200 Committer: Uwe L. Korn <uw...@xhochy.com> Committed: Sun May 14 17:23:26 2017 +0200 ---------------------------------------------------------------------- python/pyarrow/feather.py | 10 ++++++---- python/pyarrow/tests/test_feather.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/d8d3d843/python/pyarrow/feather.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 3754aec..34783a7 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -37,7 +37,7 @@ class FeatherReader(ext.FeatherReader): self.source = source self.open(source) - def read(self, columns=None): + def read(self, columns=None, nthreads=1): if columns is not None: column_set = set(columns) else: @@ -53,7 +53,7 @@ class FeatherReader(ext.FeatherReader): names.append(name) table = Table.from_arrays(columns, names=names) - return table.to_pandas() + return table.to_pandas(nthreads=nthreads) class FeatherWriter(object): @@ -118,7 +118,7 @@ def write_feather(df, dest): raise -def read_feather(source, columns=None): +def read_feather(source, columns=None, nthreads=1): """ Read a pandas.DataFrame from Feather format @@ -128,10 +128,12 @@ def read_feather(source, columns=None): columns : sequence, optional Only read a specific set of columns. If not provided, all columns are read + nthreads : int, default 1 + Number of CPU threads to use when reading to pandas.DataFrame Returns ------- df : pandas.DataFrame """ reader = FeatherReader(source) - return reader.read(columns=columns) + return reader.read(columns=columns, nthreads=nthreads) http://git-wip-us.apache.org/repos/asf/arrow/blob/d8d3d843/python/pyarrow/tests/test_feather.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 69c32be..287e0da 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -61,7 +61,8 @@ class TestFeatherReader(unittest.TestCase): return counts def _check_pandas_roundtrip(self, df, expected=None, path=None, - columns=None, null_counts=None): + columns=None, null_counts=None, + nthreads=1): if path is None: path = random_path() @@ -70,7 +71,7 @@ class TestFeatherReader(unittest.TestCase): if not os.path.exists(path): raise Exception('file not written') - result = read_feather(path, columns) + result = read_feather(path, columns, nthreads=nthreads) if expected is None: expected = df @@ -293,6 +294,12 @@ class TestFeatherReader(unittest.TestCase): df = pd.DataFrame({'strings': [''] * 10}) self._check_pandas_roundtrip(df) + def test_multithreaded_read(self): + data = {'c{0}'.format(i): [''] * 10 + for i in range(100)} + df = pd.DataFrame(data) + self._check_pandas_roundtrip(df, nthreads=4) + def test_nan_as_null(self): # Create a nan that is not numpy.nan values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10)