Github user BryanCutler commented on a diff in the pull request: https://github.com/apache/spark/pull/20777#discussion_r173335895 --- Diff: python/pyspark/ml/feature.py --- @@ -408,35 +408,86 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, """ Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`. - >>> df = spark.createDataFrame( + >>> df1 = spark.createDataFrame( ... [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])], ... ["label", "raw"]) - >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors") - >>> model = cv.fit(df) - >>> model.transform(df).show(truncate=False) + >>> cv1 = CountVectorizer(inputCol="raw", outputCol="vectors") + >>> model1 = cv1.fit(df1) + >>> model1.transform(df1).show(truncate=False) +-----+---------------+-------------------------+ |label|raw |vectors | +-----+---------------+-------------------------+ |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])| |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])| +-----+---------------+-------------------------+ ... - >>> sorted(model.vocabulary) == ['a', 'b', 'c'] + >>> sorted(model1.vocabulary) == ['a', 'b', 'c'] True >>> countVectorizerPath = temp_path + "/count-vectorizer" - >>> cv.save(countVectorizerPath) + >>> cv1.save(countVectorizerPath) >>> loadedCv = CountVectorizer.load(countVectorizerPath) - >>> loadedCv.getMinDF() == cv.getMinDF() + >>> loadedCv.getMinDF() == cv1.getMinDF() True - >>> loadedCv.getMinTF() == cv.getMinTF() + >>> loadedCv.getMinTF() == cv1.getMinTF() True - >>> loadedCv.getVocabSize() == cv.getVocabSize() + >>> loadedCv.getVocabSize() == cv1.getVocabSize() True >>> modelPath = temp_path + "/count-vectorizer-model" - >>> model.save(modelPath) + >>> model1.save(modelPath) >>> loadedModel = CountVectorizerModel.load(modelPath) - >>> loadedModel.vocabulary == model.vocabulary + >>> loadedModel.vocabulary == model1.vocabulary True + >>> df2 = spark.createDataFrame( + ... 
[(0, ["a", "b", "c", "d"]), (1, ["a", "b", "c",]),(2, ["a", "b"]),(3, ["a"]),], + ... ["label", "raw"]) + >>> cv2 = CountVectorizer(inputCol="raw", outputCol="vectors", maxDF=3) + >>> model2 = cv2.fit(df2) + >>> model2.transform(df2).show(truncate=False) + +-----+------------+-------------------------+ + |label|raw |vectors | + +-----+------------+-------------------------+ + |0 |[a, b, c, d]|(3,[0,1,2],[1.0,1.0,1.0])| + |1 |[a, b, c] |(3,[0,1],[1.0,1.0]) | + |2 |[a, b] |(3,[0],[1.0]) | + |3 |[a] |(3,[],[]) | + +-----+------------+-------------------------+ + ... + >>> cv3 = CountVectorizer(inputCol="raw", outputCol="vectors", maxDF=0.75) + >>> model3 = cv3.fit(df2) + >>> model3.transform(df2).show(truncate=False) + +-----+------------+-------------------------+ + |label|raw |vectors | + +-----+------------+-------------------------+ + |0 |[a, b, c, d]|(3,[0,1,2],[1.0,1.0,1.0])| + |1 |[a, b, c] |(3,[0,1],[1.0,1.0]) | + |2 |[a, b] |(3,[0],[1.0]) | + |3 |[a] |(3,[],[]) | + +-----+------------+-------------------------+ + ... + >>> cv4 = CountVectorizer(inputCol="raw", outputCol="vectors", minDF=2, maxDF=3) + >>> model4 = cv4.fit(df2) + >>> model4.transform(df2).show(truncate=False) + +-----+------------+-------------------+ + |label|raw |vectors | + +-----+------------+-------------------+ + |0 |[a, b, c, d]|(2,[0,1],[1.0,1.0])| + |1 |[a, b, c] |(2,[0,1],[1.0,1.0])| + |2 |[a, b] |(2,[0],[1.0]) | + |3 |[a] |(2,[],[]) | + +-----+------------+-------------------+ + ... + >>> cv5 = CountVectorizer(inputCol="raw", outputCol="vectors", minDF=0.5, maxDF=0.75) + >>> model5 = cv5.fit(df2) + >>> model5.transform(df2).show(truncate=False) + +-----+------------+-------------------+ + |label|raw |vectors | + +-----+------------+-------------------+ + |0 |[a, b, c, d]|(2,[0,1],[1.0,1.0])| + |1 |[a, b, c] |(2,[0,1],[1.0,1.0])| + |2 |[a, b] |(2,[0],[1.0]) | + |3 |[a] |(2,[],[]) | + +-----+------------+-------------------+ + ... 
--- End diff -- I think this is too much to put in as a doctest. Instead, can you just add a unit test in ml/tests.py? I think you just need 2 transforms: one with an integer value of `maxDF` greater than 1, and one with a fractional value. Also, I don't think your test data actually exercises the `maxDF` filtering.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org