Repository: spark Updated Branches: refs/heads/branch-2.1 2b85e059b -> fb81a412e
[SPARK-20214][ML] Make sure converted csc matrix has sorted indices ## What changes were proposed in this pull request? `_convert_to_vector` converts a scipy sparse matrix to csc matrix for initializing `SparseVector`. However, it doesn't guarantee the converted csc matrix has sorted indices and so a failure happens when you do something like that: from scipy.sparse import lil_matrix lil = lil_matrix((4, 1)) lil[1, 0] = 1 lil[3, 0] = 2 _convert_to_vector(lil.todok()) File "/home/jenkins/workspace/python/pyspark/mllib/linalg/__init__.py", line 78, in _convert_to_vector return SparseVector(l.shape[0], csc.indices, csc.data) File "/home/jenkins/workspace/python/pyspark/mllib/linalg/__init__.py", line 556, in __init__ % (self.indices[i], self.indices[i + 1])) TypeError: Indices 3 and 1 are not strictly increasing A simple test can confirm that `dok_matrix.tocsc()` won't guarantee sorted indices: >>> from scipy.sparse import lil_matrix >>> lil = lil_matrix((4, 1)) >>> lil[1, 0] = 1 >>> lil[3, 0] = 2 >>> dok = lil.todok() >>> csc = dok.tocsc() >>> csc.has_sorted_indices 0 >>> csc.indices array([3, 1], dtype=int32) I checked the source codes of scipy. The only way to guarantee it is `csc_matrix.tocsr()` and `csr_matrix.tocsc()`. ## How was this patch tested? Existing tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Liang-Chi Hsieh <vii...@gmail.com> Closes #17532 from viirya/make-sure-sorted-indices. (cherry picked from commit 12206058e8780e202c208b92774df3773eff36ae) Signed-off-by: Joseph K. Bradley <jos...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb81a412 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb81a412 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb81a412 Branch: refs/heads/branch-2.1 Commit: fb81a412eea1e60bd503cb5bb879ae468be24e56 Parents: 2b85e05 Author: Liang-Chi Hsieh <vii...@gmail.com> Authored: Wed Apr 5 17:46:44 2017 -0700 Committer: Joseph K. Bradley <jos...@databricks.com> Committed: Wed Apr 5 17:46:55 2017 -0700 ---------------------------------------------------------------------- python/pyspark/ml/linalg/__init__.py | 3 +++ python/pyspark/mllib/linalg/__init__.py | 3 +++ python/pyspark/mllib/tests.py | 11 +++++++++++ 3 files changed, 17 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/fb81a412/python/pyspark/ml/linalg/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index 1705c15..eed9946 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -72,7 +72,10 @@ def _convert_to_vector(l): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" + # Make sure the converted csc_matrix has sorted indices. csc = l.tocsc() + if not csc.has_sorted_indices: + csc.sort_indices() return SparseVector(l.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(l)) http://git-wip-us.apache.org/repos/asf/spark/blob/fb81a412/python/pyspark/mllib/linalg/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 031f22c..7b24b3c 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -74,7 +74,10 @@ def _convert_to_vector(l): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" + # Make sure the converted csc_matrix has sorted indices. csc = l.tocsc() + if not csc.has_sorted_indices: + csc.sort_indices() return SparseVector(l.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(l)) http://git-wip-us.apache.org/repos/asf/spark/blob/fb81a412/python/pyspark/mllib/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index c519883..523b3f1 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -853,6 +853,17 @@ class SciPyTests(MLlibTestCase): self.assertEqual(sv, serialize(lil.tocsr())) self.assertEqual(sv, serialize(lil.todok())) + def test_convert_to_vector(self): + from scipy.sparse import csc_matrix + # Create a CSC matrix with non-sorted indices + indptr = array([0, 2]) + indices = array([3, 1]) + data = array([2.0, 1.0]) + csc = csc_matrix((data, indices, indptr)) + self.assertFalse(csc.has_sorted_indices) + sv = SparseVector(4, {1: 1, 3: 2}) + self.assertEqual(sv, _convert_to_vector(csc)) + def test_dot(self): from scipy.sparse import lil_matrix lil = lil_matrix((4, 1)) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org