Github user mpetruska commented on a diff in the pull request: https://github.com/apache/spark/pull/19659#discussion_r149696337 --- Diff: mllib/src/main/scala/org/apache/spark/ml/extensions/seq/package.scala --- @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.extensions + +import scala.annotation.tailrec +import scala.collection.immutable.Queue + +import org.apache.spark.annotation.Since + +package object seq { + + /** + * Calculates sliding windows over multiple length parameters simultaneously. + * @param x the input sequence over which sliding windows will be collected + * @param min the inclusive minimal length of the window to be collected + * @param max the inclusive maximal length of the window to be collected + * @return the collected windows + * + * @example {{{ + * multiSliding(1 to 5, min = 2, max = 4) == Seq( + * Seq(1, 2), Seq(1, 2, 3), Seq(1, 2, 3, 4), + * Seq(2, 3), Seq(2, 3, 4), Seq(2, 3, 4, 5), + * Seq(3, 4), Seq(3, 4, 5), + * Seq(4, 5) + * ) + * + * multiSliding(1 to 10, min = 2, max = 5) == Seq( + * Seq(1, 2), Seq(1, 2, 3), Seq(1, 2, 3, 4), Seq(1, 2, 3, 4, 5), + * Seq(2, 3), Seq(2, 3, 4), Seq(2, 3, 4, 5), Seq(2, 3, 4, 5, 6), + * Seq(3, 4), Seq(3, 4, 5), Seq(3, 4, 5, 6), Seq(3, 4, 5, 6, 7), + * Seq(4, 5), Seq(4, 5, 6), Seq(4, 5, 6, 7), Seq(4, 5, 6, 7, 8), + * Seq(5, 6), Seq(5, 6, 7), Seq(5, 6, 7, 8), Seq(5, 6, 7, 8, 9), + * Seq(6, 7), Seq(6, 7, 8), Seq(6, 7, 8, 9), Seq(6, 7, 8, 9, 10), + * Seq(7, 8), Seq(7, 8, 9), Seq(7, 8, 9, 10), + * Seq(8, 9), Seq(8, 9, 10), + * Seq(9, 10) + * ) + * }}} + */ + @Since("From which version?") + def multiSliding[A](x: Seq[A], min: Int, max: Int): Seq[Seq[A]] = { + type B = Seq[A] + + def addWindowsFromBuffer(acc: List[B], buffer: Queue[A]): List[B] = { + buffer.drop(min - 1).foldLeft((acc, buffer.take(min - 1))) { + case ((a, b), current) => + val newB = b.enqueue(current) + (newB :: a, newB) + }._1 + } + + @tailrec + def addWindowsFromFinalBuffer(acc: List[B], buffer: Queue[A]): List[B] = { + buffer.dequeueOption match { + case Some((_, tail)) => addWindowsFromFinalBuffer(addWindowsFromBuffer(acc, tail), tail) + case None => acc + } + } + + def calculateMultiSliding(): List[B] = { --- End diff -- Nope, unfortunately it's not the case: ```Scala scala> def slidingOriginal(x: Seq[String], n: Int): Seq[String] = { | x.iterator.sliding(n).withPartial(false).map(_.mkString(" ")).toSeq | } slidingOriginal: (x: Seq[String], n: Int)Seq[String] scala> def slidingNew(x: Seq[String], n: Int): Seq[String] = { | multiSliding(x, n, n).map(_.mkString(" ")) | } slidingNew: (x: Seq[String], n: Int)Seq[String] scala> time(slidingOriginal((1 to 10000).map(_.toString), 5)) res9: scala.concurrent.duration.FiniteDuration = 2427473 nanoseconds scala> time(slidingOriginal((1 to 10000).map(_.toString), 5)).toMillis res10: Long = 3 scala> time(slidingNew((1 to 10000).map(_.toString), 5)).toMillis res11: Long = 15 scala> time(slidingOriginal((1 to 100000).map(_.toString), 5)).toMillis res12: Long = 11 scala> time(slidingNew((1 to 100000).map(_.toString), 5)).toMillis res13: Long = 479 scala> time(slidingOriginal((1 to 10000).map(_.toString), 50)).toMillis res14: Long = 1 scala> time(slidingNew((1 to 10000).map(_.toString), 50)).toMillis res15: Long = 136 ``` Adding code to fall back to the original implementation if `n == maxN`...
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org