Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199286570
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingLimitExec.scala
---
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import java.util.concurrent.TimeUnit.NANOSECONDS
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
+import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
+import org.apache.spark.sql.execution.streaming.state.StateStoreOps
+import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
+import org.apache.spark.util.CompletionIterator
+
+/**
+ * A physical operator for executing a streaming limit, which makes sure no more than streamLimit
+ * rows are returned.
+ */
+case class StreamingLimitExec(
+    streamLimit: Long,
+    child: SparkPlan,
+    stateInfo: Option[StatefulOperatorStateInfo] = None)
+  extends UnaryExecNode with StateStoreWriter {
+
+  private val keySchema = StructType(Array(StructField("key", NullType)))
+  private val valueSchema = StructType(Array(StructField("value", LongType)))
+
+  override protected def doExecute(): RDD[InternalRow] = {
+    metrics // force lazy init at driver
+
+    child.execute().mapPartitionsWithStateStore(
+      getStateInfo,
+      keySchema,
+      valueSchema,
+      indexOrdinal = None,
+      sqlContext.sessionState,
+      Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
+      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
+      val numOutputRows = longMetric("numOutputRows")
+      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
+      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
+      val commitTimeMs = longMetric("commitTimeMs")
+      val updatesStartTimeNs = System.nanoTime
+
+      val startCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
+      var rowCount = startCount
+
+      val result = iter.filter { r =>
+        val x = rowCount < streamLimit
--- End diff --
Oh and we should be planning a `LocalLimit` before this and perhaps
`GlobalStreamingLimitExec` would be a better name to make the functionality
obvious.
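For illustration only, here is a rough sketch of the planning shape being suggested; the strategy object, the renamed operator, and the use of `LocalLimitExec` here are hypothetical, not code from this PR:

```scala
// Hypothetical sketch (assumes the usual org.apache.spark.sql.execution imports):
// plan a per-partition LocalLimitExec under the global streaming limit so each
// task stops early before rows reach the single stateful partition.
// `GlobalStreamingLimitExec` is the suggested rename of StreamingLimitExec.
object StreamingGlobalLimitStrategy extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Limit(IntegerLiteral(limit), child) if plan.isStreaming =>
      GlobalStreamingLimitExec(limit, LocalLimitExec(limit, planLater(child))) :: Nil
    case _ => Nil
  }
}
```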
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199286349
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingLimitExec.scala
---
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import java.util.concurrent.TimeUnit.NANOSECONDS
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
+import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
+import org.apache.spark.sql.execution.streaming.state.StateStoreOps
+import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
+import org.apache.spark.util.CompletionIterator
+
+/**
+ * A physical operator for executing a streaming limit, which makes sure no more than streamLimit
+ * rows are returned.
+ */
+case class StreamingLimitExec(
+    streamLimit: Long,
+    child: SparkPlan,
+    stateInfo: Option[StatefulOperatorStateInfo] = None)
+  extends UnaryExecNode with StateStoreWriter {
+
+  private val keySchema = StructType(Array(StructField("key", NullType)))
+  private val valueSchema = StructType(Array(StructField("value", LongType)))
+
+  override protected def doExecute(): RDD[InternalRow] = {
+    metrics // force lazy init at driver
+
+    child.execute().mapPartitionsWithStateStore(
+      getStateInfo,
+      keySchema,
+      valueSchema,
+      indexOrdinal = None,
+      sqlContext.sessionState,
+      Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
+      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
+      val numOutputRows = longMetric("numOutputRows")
+      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
+      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
+      val commitTimeMs = longMetric("commitTimeMs")
+      val updatesStartTimeNs = System.nanoTime
+
+      val startCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
+      var rowCount = startCount
+
+      val result = iter.filter { r =>
+        val x = rowCount < streamLimit
--- End diff --
I think it's okay due to `override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil`.
+1 to making sure there are tests with more than one partition, though.
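For illustration, such a multi-partition test could look roughly like this (assuming the usual MemoryStream/StreamTest harness; the assertion is left as a comment because the exact rows kept by the limit are not deterministic):

```scala
// Repartition the input so the AllTuples requirement (collapsing everything
// into a single partition) is actually exercised before the streaming limit.
val input = MemoryStream[Int]
val limited = input.toDF().repartition(7).limit(3)

testStream(limited)(
  AddData(input, 1 to 10: _*),
  ProcessAllAvailable()
  // then assert the sink received no more than 3 rows in total, regardless of
  // how the 10 inputs were spread across the 7 partitions
)
```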
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199279042
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingLimitExec.scala
---
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import java.util.concurrent.TimeUnit.NANOSECONDS
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
+import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
+import org.apache.spark.sql.execution.streaming.state.StateStoreOps
+import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
+import org.apache.spark.util.CompletionIterator
+
+/**
+ * A physical operator for executing a streaming limit, which makes sure no more than streamLimit
+ * rows are returned.
+ */
+case class StreamingLimitExec(
+    streamLimit: Long,
+    child: SparkPlan,
+    stateInfo: Option[StatefulOperatorStateInfo] = None)
+  extends UnaryExecNode with StateStoreWriter {
+
+  private val keySchema = StructType(Array(StructField("key", NullType)))
+  private val valueSchema = StructType(Array(StructField("value", LongType)))
+
+  override protected def doExecute(): RDD[InternalRow] = {
+    metrics // force lazy init at driver
--- End diff --
Existing: Do we really do this in every operator? Why isn't this the
responsibility of the parent class?
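Purely as a sketch of what "parent class responsibility" could mean here (invented trait name, not Spark code):

```scala
// Hypothetical stackable trait: touch the lazy `metrics` map on the driver once,
// so individual operators would not need their own
// "metrics // force lazy init at driver" line.
trait DriverSideMetricsInit extends SparkPlan {
  abstract override protected def doExecute(): RDD[InternalRow] = {
    metrics // materialize SQLMetrics on the driver before building the RDD
    super.doExecute()
  }
}
```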
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199280992
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala ---
@@ -615,7 +615,7 @@ class StreamSuite extends StreamTest {
// Get an existing checkpoint generated by Spark v2.1.
// v2.1 does not record # shuffle partitions in the offset metadata.
val resourceUri =
- this.getClass.getResource("/structured-streaming/checkpoint-version-2.1.0").toURI
+ this.getClass.getResource("/structured-streaming/checkpoint-version-2.1.0").toURI
--- End diff --
I'd undo these spurious changes.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199278369
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala ---
@@ -72,6 +72,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
      case Limit(IntegerLiteral(limit), Project(projectList, Sort(order, true, child)))
          if limit < conf.topKSortFallbackThreshold =>
        TakeOrderedAndProjectExec(limit, order, projectList, planLater(child)) :: Nil
+      case Limit(IntegerLiteral(limit), child) if plan.isStreaming =>
+        StreamingLimitExec(limit, planLater(child)) :: Nil
--- End diff --
I would create a different one, only to continue the pattern of isolating streaming-specific Strategies. You'll then need to inject your new Strategy in `IncrementalExecution`.
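Roughly what that could look like, as a hedged sketch; the strategy name and the exact `IncrementalExecution` hook are assumptions, not code from the PR:

```scala
// Keep the streaming limit rule out of the batch strategies and add it to the
// planner used by IncrementalExecution instead.
object StreamingLimitStrategy extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Limit(IntegerLiteral(limit), child) if plan.isStreaming =>
      StreamingLimitExec(limit, planLater(child)) :: Nil
    case _ => Nil
  }
}

// In IncrementalExecution, the streaming-only strategies would then gain one
// entry, roughly (assuming the existing extraPlanningStrategies hook):
//   override def extraPlanningStrategies: Seq[SparkStrategy] =
//     StreamingLimitStrategy +: super.extraPlanningStrategies
```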
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199278041
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
---
@@ -315,8 +315,10 @@ object UnsupportedOperationChecker {
      case GroupingSets(_, _, child, _) if child.isStreaming =>
        throwError("GroupingSets is not supported on streaming DataFrames/Datasets")
-      case GlobalLimit(_, _) | LocalLimit(_, _) if subPlan.children.forall(_.isStreaming) =>
-        throwError("Limits are not supported on streaming DataFrames/Datasets")
+      case GlobalLimit(_, _) | LocalLimit(_, _) if
+        subPlan.children.forall(_.isStreaming) && outputMode == InternalOutputModes.Update =>
--- End diff --
It is today (though, as we discussed, I think the query planner would be a better place if we were to rearchitect).
Style nit: line break at the highest syntactic level (i.e. before the `if`) and indent 4 spaces for a continuation like this (to distinguish the guard from the code executed when it matches).
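For what it's worth, the suggested layout would look something like this (the error message text is a placeholder):

```scala
case GlobalLimit(_, _) | LocalLimit(_, _)
    if subPlan.children.forall(_.isStreaming) && outputMode == InternalOutputModes.Update =>
  throwError("placeholder: limits are not supported on streaming Datasets in Update mode")
```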
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199278862
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingLimitExec.scala
---
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import java.util.concurrent.TimeUnit.NANOSECONDS
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
+import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
+import org.apache.spark.sql.execution.streaming.state.StateStoreOps
+import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
+import org.apache.spark.util.CompletionIterator
+
+/**
+ * A physical operator for executing a streaming limit, which makes sure no more than streamLimit
+ * rows are returned.
+ */
+case class StreamingLimitExec(
+    streamLimit: Long,
+    child: SparkPlan,
+    stateInfo: Option[StatefulOperatorStateInfo] = None)
+  extends UnaryExecNode with StateStoreWriter {
+
+  private val keySchema = StructType(Array(StructField("key", NullType)))
+  private val valueSchema = StructType(Array(StructField("value", LongType)))
+
+  override protected def doExecute(): RDD[InternalRow] = {
+    metrics // force lazy init at driver
+
+    child.execute().mapPartitionsWithStateStore(
+      getStateInfo,
+      keySchema,
+      valueSchema,
+      indexOrdinal = None,
+      sqlContext.sessionState,
+      Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
+      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
+      val numOutputRows = longMetric("numOutputRows")
+      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
+      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
+      val commitTimeMs = longMetric("commitTimeMs")
+      val updatesStartTimeNs = System.nanoTime
+
+      val startCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
+      var rowCount = startCount
+
+      val result = iter.filter { r =>
+        val x = rowCount < streamLimit
+        if (x) {
+          rowCount += 1
+        }
+        x
+      }
+
+      CompletionIterator[InternalRow, Iterator[InternalRow]](result, {
--- End diff --
Do you need these type parameters?
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/21662#discussion_r199279216
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingLimitExec.scala
---
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import java.util.concurrent.TimeUnit.NANOSECONDS
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
+import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
+import org.apache.spark.sql.execution.streaming.state.StateStoreOps
+import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
+import org.apache.spark.util.CompletionIterator
+
+/**
+ * A physical operator for executing a streaming limit, which makes sure no more than streamLimit
+ * rows are returned.
+ */
+case class StreamingLimitExec(
+    streamLimit: Long,
+    child: SparkPlan,
+    stateInfo: Option[StatefulOperatorStateInfo] = None)
+  extends UnaryExecNode with StateStoreWriter {
+
+  private val keySchema = StructType(Array(StructField("key", NullType)))
+  private val valueSchema = StructType(Array(StructField("value", LongType)))
+
+  override protected def doExecute(): RDD[InternalRow] = {
+    metrics // force lazy init at driver
+
+    child.execute().mapPartitionsWithStateStore(
+      getStateInfo,
+      keySchema,
+      valueSchema,
+      indexOrdinal = None,
+      sqlContext.sessionState,
+      Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
--- End diff --
Nit: I'd indent by 4 spaces above to distinguish these two blocks visually.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/20647#discussion_r170185948
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
---
@@ -77,31 +79,32 @@ class MicroBatchExecution(
sparkSession.sqlContext.conf.disabledV2StreamingMicroBatchReaders.split(",")
val _logicalPlan = analyzedPlan.transform {
-      case streamingRelation@StreamingRelation(dataSourceV1, sourceName, output) =>
-        toExecutionRelationMap.getOrElseUpdate(streamingRelation, {
+ case s @ StreamingRelation(dsV1, sourceName, output) =>
--- End diff --
If you are touching that specific code then it's fine to fix the style, but in general I tend to agree that including spurious changes makes the diff harder to read and the commit harder to backport.
I've even seen guidelines that specifically prohibit fixing style just to fix style, since it obfuscates the history.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/20647#discussion_r170115965
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
---
@@ -415,12 +418,14 @@ class MicroBatchExecution(
case v1: SerializedOffset => reader.deserializeOffset(v1.json)
case v2: OffsetV2 => v2
}
-          reader.setOffsetRange(
-            toJava(current),
-            Optional.of(availableV2))
+          reader.setOffsetRange(toJava(current), Optional.of(availableV2))
           logDebug(s"Retrieving data from $reader: $current -> $availableV2")
-          Some(reader ->
-            new StreamingDataSourceV2Relation(reader.readSchema().toAttributes, reader))
+ Some(reader -> StreamingDataSourceV2Relation(
--- End diff --
@rdblue There was a doc as part of this SPIP: https://issues.apache.org/jira/browse/SPARK-20928, but it has definitely evolved enough past that doc that we should update it and send it to the dev list again.
Things like the logical plan requirement in execution will likely be significantly easier to remove once we have a full V2 API and can remove the legacy internal API for streaming.
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20598
Yeah, this seems risky at RC5.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/20647#discussion_r169799465
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
---
@@ -107,17 +106,24 @@ case class DataSourceV2Relation(
}
/**
- * A specialization of DataSourceV2Relation with the streaming bit set to true. Otherwise identical
- * to the non-streaming relation.
+ * A specialization of [[DataSourceV2Relation]] with the streaming bit set to true.
+ *
+ * Note that, this plan has a mutable reader, so Spark won't apply operator push-down for this plan,
+ * to avoid making the plan mutable. We should consolidate this plan and [[DataSourceV2Relation]]
+ * after we figure out how to apply operator push-down for streaming data sources.
--- End diff --
@rdblue as well.
I agree that we really need to define the contract here, and personally I
would really benefit from seeing a life cycle diagram for all of the different
pieces of the API. @tdas and @jose-torres made one for just the write side of
streaming and it was super useful for me as someone at a distance that wants to
understand what was going on.
![screen shot 2018-02-21 at 2 18 55 pm](https://user-images.githubusercontent.com/527/36508744-65d25be0-1712-11e8-93c6-52515f8b50e9.png)
Something like this diagram that also covered when things are resolved,
when pushdown happens, and that shows the differences between read/write,
microbatch, batch and continuous would be awesome.
Regarding the actual question, I'm not a huge fan of option 2 as it still seems like an implicit contract with this mutable object (assuming I understand the proposal correctly). Option 1 at least means that we could say, "whenever it's time to do pushdown: call `reset()`, do pushdown in some defined order, then call `createX()`. It is invalid to do more pushdown after `createX()` has been called".
Even better than a `reset()` might be a `cleanClone()` method that gives
you a fresh copy. As I said above, I don't really understand the lifecycle of
the API, but given how we reuse query plan fragments I'm really nervous about
mutable objects that are embedded in operators.
I also agree with @jose-torres's point that this mechanism looks like action at a distance, but the `reset()` contract at least localizes it to some degree, and I don't have a better suggestion for a way to support evolvable pushdown.
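Purely to make the shape of that contract concrete, here is an illustrative-only sketch; `SupportsPushDown`, `cleanClone()` and the method set are invented for this example and are not the DataSourceV2 API:

```scala
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.sources.v2.reader._
import org.apache.spark.sql.types.StructType

// Invented interface to illustrate the discussed contract: pushdown happens
// between cleanClone()/reset() and the final create call, and is invalid after.
trait SupportsPushDown {
  /** Returns a fresh, pushdown-free copy, so reused plan fragments stay effectively immutable. */
  def cleanClone(): SupportsPushDown

  /** Pushdown calls happen here, in a planner-defined order; returns the filters left unhandled. */
  def pushFilters(filters: Seq[Filter]): Seq[Filter]
  def pruneColumns(requiredSchema: StructType): Unit

  /** After this is called, further pushdown is invalid. */
  def createReader(): DataSourceV2Reader
}
```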
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20511
Unfortunately, dependency changes are not typically allowed in patch
releases.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/20598#discussion_r168011671
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
---
@@ -431,7 +431,11 @@ class MicroBatchExecution(
s"Invalid batch: ${Utils.truncatedString(output, ",")} != " +
s"${Utils.truncatedString(dataPlan.output, ",")}")
replacements ++= output.zip(dataPlan.output)
--- End diff --
I think this is no longer used.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/20598#discussion_r168011940
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
---
@@ -440,8 +444,6 @@ class MicroBatchExecution(
// Rewire the plan to use the new attributes that were returned by the
source.
val replacementMap = AttributeMap(replacements)
--- End diff --
Same
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20511
Sorry if I'm missing some context here, but our typical process this late
in the release (we are over a month since the branch was cut) would be to
disable any new features that still have regressions. I'm generally not okay
with any dependency changes this far into QA unless there is no other option.
Should we be considering turning this off so that we can GA Spark 2.3 soon?
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20387
Regarding `computeStats`, the logical plan seems like it might not be the right place. As we move towards more CBO, it seems like we are going to need to pick physical operators before we can really reason about the cost of a subplan. With the caveat that I haven't thought hard about this, I'd be supportive of moving these kinds of metrics to the physical plan. +1 that we need to be able to consider pushdown when producing stats either way.
On the second point, I don't think I understand DataSourceV2 enough yet to
know the answer, but you ask a lot of questions that I think need to be defined
as part of the API (if we haven't already). What is the contract for ordering
and interactions between different types of pushdown? Is it valid to pushdown
in pieces or will we only call the method once? (sorry if this is written down
and I've just missed it).
My gut feeling is that we don't really want to fuse incrementally. It seems hard to reason about correctness and interactions between different things that have been pushed. As I hinted at before, I think it's most natural to split the concerns of pushdown within a query plan and fusing of operators. But maybe this is limited in some way I don't realize.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/20387#discussion_r165751660
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
---
@@ -17,15 +17,149 @@
package org.apache.spark.sql.execution.datasources.v2
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import java.util.UUID
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+
+import org.apache.spark.sql.{AnalysisException, SaveMode}
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
+import org.apache.spark.sql.execution.datasources.DataSourceStrategy
+import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
+import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options, ReadSupport, ReadSupportWithSchema, WriteSupport}
 import org.apache.spark.sql.sources.v2.reader._
+import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer
+import org.apache.spark.sql.types.StructType
 case class DataSourceV2Relation(
-    fullOutput: Seq[AttributeReference],
-    reader: DataSourceV2Reader) extends LeafNode with DataSourceReaderHolder {
+    source: DataSourceV2,
+    options: Map[String, String],
+    path: Option[String] = None,
+    table: Option[TableIdentifier] = None,
+    projection: Option[Seq[AttributeReference]] = None,
+    filters: Option[Seq[Expression]] = None,
--- End diff --
I like this pattern. I think it is important that the arguments to a query
plan node are comprehensive so that it is easy to understand what is going on
in the output of `explain()`.
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20085
This blocks better support for encoders on spark-avro, and seems safe, so I'd really like to include it if possible.
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20219
This seems reasonable to me. You can already do `sql("SELECT null")`
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20010
/cc @cloud-fan @sameeragarwal
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/20085
/cc @cloud-fan @sameeragarwal
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/16578
I agree that this PR needs to be allocated more review bandwidth, and it is
unfortunate that it has been blocked on that. However, I am -1 on merging a
change this large after the branch cut.
---
GitHub user marmbrus opened a pull request:
https://github.com/apache/spark/pull/20048
[SPARK-22862] Docs on lazy elimination of columns missing from an encoder
This behavior has confused some users, so let's clarify it.
You can merge this pull request into a Git repository by running:
$ git pull https://github.com/marmbrus/spark datasetAsDocs
Alternatively you can review and apply these changes as the patch at:
https://github.com/apache/spark/pull/20048.patch
To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:
This closes #20048
commit 2f4e7b04f8d62d73225f00c12a95136c556a15d7
Author: Michael Armbrust
Date: 2017-12-21T19:48:46Z
[SPARK-22862] Docs on lazy elimination of columns missing from an encoder
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/19925
I would probably do ... streaming.reader/writer if we are going to
namespace it.
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/19771
LGTM
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/19461
LGTM
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/19314
LGTM!
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/19240
LGTM
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/19240#discussion_r139042816
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
---
@@ -2256,7 +2256,10 @@ object CleanupAliases extends Rule[LogicalPlan] {
def trimNonTopLevelAliases(e: Expression): Expression = e match {
case a: Alias =>
- a.withNewChildren(trimAliases(a.child) :: Nil)
--- End diff --
Is there anywhere else in the code base where we call `withNewChildren` on
an alias? Should we just override that?
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/19080#discussion_r136213945
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
---
@@ -30,18 +30,43 @@ import org.apache.spark.sql.types.{DataType, IntegerType}
 *  - Intra-partition ordering of data: In this case the distribution describes guarantees made
 *    about how tuples are distributed within a single partition.
 */
-sealed trait Distribution
+sealed trait Distribution {
+  /**
+   * The required number of partitions for this distribution. If it's None, then any number of
+   * partitions is allowed for this distribution.
+   */
+  def requiredNumPartitions: Option[Int]
+
+  /**
+   * Creates a default partitioning for this distribution, which can satisfy this distribution while
+   * matching the given number of partitions.
+   */
+  def createPartitioning(numPartitions: Int): Partitioning
+}
 /**
 * Represents a distribution where no promises are made about co-location of data.
 */
-case object UnspecifiedDistribution extends Distribution
+case object UnspecifiedDistribution extends Distribution {
+  override def requiredNumPartitions: Option[Int] = None
+
+  override def createPartitioning(numPartitions: Int): Partitioning = {
+    throw new IllegalStateException("UnspecifiedDistribution does not have default partitioning.")
+  }
+}
 /**
 * Represents a distribution that only has a single partition and all tuples of the dataset
 * are co-located.
 */
-case object AllTuples extends Distribution
+case object AllTuples extends Distribution {
--- End diff --
It seems we should be moving towards CBO, not away from it.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/19080#discussion_r136213879
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
---
@@ -30,18 +30,43 @@ import org.apache.spark.sql.types.{DataType, IntegerType}
 *  - Intra-partition ordering of data: In this case the distribution describes guarantees made
 *    about how tuples are distributed within a single partition.
 */
-sealed trait Distribution
+sealed trait Distribution {
+  /**
+   * The required number of partitions for this distribution. If it's None, then any number of
+   * partitions is allowed for this distribution.
+   */
+  def requiredNumPartitions: Option[Int]
+
+  /**
+   * Creates a default partitioning for this distribution, which can satisfy this distribution while
+   * matching the given number of partitions.
+   */
+  def createPartitioning(numPartitions: Int): Partitioning
+}
 /**
 * Represents a distribution where no promises are made about co-location of data.
 */
-case object UnspecifiedDistribution extends Distribution
+case object UnspecifiedDistribution extends Distribution {
+  override def requiredNumPartitions: Option[Int] = None
+
+  override def createPartitioning(numPartitions: Int): Partitioning = {
+    throw new IllegalStateException("UnspecifiedDistribution does not have default partitioning.")
+  }
+}
 /**
 * Represents a distribution that only has a single partition and all tuples of the dataset
 * are co-located.
 */
-case object AllTuples extends Distribution
+case object AllTuples extends Distribution {
--- End diff --
Sure you can make that argument (I'm not sure I buy it), but then why does
this PR still have two different concepts?
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/19080#discussion_r136209980
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
---
@@ -30,18 +30,43 @@ import org.apache.spark.sql.types.{DataType, IntegerType}
 *  - Intra-partition ordering of data: In this case the distribution describes guarantees made
 *    about how tuples are distributed within a single partition.
 */
-sealed trait Distribution
+sealed trait Distribution {
+  /**
+   * The required number of partitions for this distribution. If it's None, then any number of
+   * partitions is allowed for this distribution.
+   */
+  def requiredNumPartitions: Option[Int]
+
+  /**
+   * Creates a default partitioning for this distribution, which can satisfy this distribution while
+   * matching the given number of partitions.
+   */
+  def createPartitioning(numPartitions: Int): Partitioning
+}
 /**
 * Represents a distribution where no promises are made about co-location of data.
 */
-case object UnspecifiedDistribution extends Distribution
+case object UnspecifiedDistribution extends Distribution {
+  override def requiredNumPartitions: Option[Int] = None
+
+  override def createPartitioning(numPartitions: Int): Partitioning = {
+    throw new IllegalStateException("UnspecifiedDistribution does not have default partitioning.")
+  }
+}
 /**
 * Represents a distribution that only has a single partition and all tuples of the dataset
 * are co-located.
 */
-case object AllTuples extends Distribution
+case object AllTuples extends Distribution {
--- End diff --
So I spoke with @sameeragarwal about this a little. The whole point here
was to have a logical / physical separation. `AllTuples` could be `SingleNode`
or it could be `Broadcast`. All the operation wants to know is that it's seeing all of them, and it shouldn't care about how that is being accomplished.
Now, since the first version, we have started to deviate from that. I'm
not sure if this is still the right thing to do, but I wanted to give a little
context.
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/18923#discussion_r132801760
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala
---
@@ -49,7 +49,7 @@ class ConsoleSink(options: Map[String, String]) extends Sink with Logging {
     println("---")
     // scalastyle:off println
     data.sparkSession.createDataFrame(
-      data.sparkSession.sparkContext.parallelize(data.collect()), data.schema)
--- End diff --
I don't think this means we can't do anything. I just think that we need to fix the query plan and call take without changing the plan. It's kind of a hack, but it would work until we make the planner smarter.
I think something like `data.queryExecution.executedPlan.executeTake(...)` would be safe.
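A minimal sketch of that idea, assuming `data` is the batch DataFrame handed to `ConsoleSink.addBatch` and `numRowsToShow` is an illustrative option:

```scala
// Take rows from the physical plan that was already produced for this batch,
// instead of data.collect() + createDataFrame(), so the incremental plan is
// not re-planned.
val numRowsToShow = 20 // illustrative default
val taken: Array[InternalRow] =
  data.queryExecution.executedPlan.executeTake(numRowsToShow)
// Printing would then format `taken` against data.schema rather than calling
// .show() on a brand-new DataFrame.
```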
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/18923#discussion_r132792708
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala
---
@@ -49,7 +49,7 @@ class ConsoleSink(options: Map[String, String]) extends Sink with Logging {
     println("---")
     // scalastyle:off println
     data.sparkSession.createDataFrame(
-      data.sparkSession.sparkContext.parallelize(data.collect()), data.schema)
--- End diff --
This might not be safe, unfortunately. Anything that changes the query plan
might cause it to get replanned (and we don't do this correctly).
Can you make sure that there is a test case that uses the console sink and
does aggregation across more than one batch?
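A rough sketch of the kind of test being asked for, using the usual StreamTest-style harness; the console-sink wiring and the expected counts are illustrative only:

```scala
// Run an aggregation through at least two batches so a replanning bug in the
// sink would surface as lost state on the second batch.
val input = MemoryStream[Int]
val counts = input.toDF().groupBy().count()

testStream(counts, OutputMode.Complete())(
  AddData(input, 1, 2, 3),
  ProcessAllAvailable(), // batch 1: count should be 3
  AddData(input, 4, 5),
  ProcessAllAvailable()  // batch 2: count should be 5, i.e. state was kept
)
// A console-sink variant would instead start the query with .format("console")
// and check the printed output of the second batch.
```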
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/18925#discussion_r132791950
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
---
@@ -779,10 +780,16 @@ case object OneRowRelation extends LeafNode {
}
/** A logical plan for `dropDuplicates`. */
+case object Deduplicate {
+  def apply(keys: Seq[Attribute], child: LogicalPlan): Deduplicate = {
+    Deduplicate(keys, child, child.outputMode)
+  }
+}
+
 case class Deduplicate(
     keys: Seq[Attribute],
     child: LogicalPlan,
-    streaming: Boolean) extends UnaryNode {
+    originalOutputMode: OutputMode) extends UnaryNode {
--- End diff --
Can we drop this? Can it just preserve the output mode of its child?
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/18925#discussion_r132791711
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
---
@@ -1185,7 +1186,7 @@ object ReplaceDistinctWithAggregate extends Rule[LogicalPlan] {
  */
 object ReplaceDeduplicateWithAggregate extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-    case Deduplicate(keys, child, streaming) if !streaming =>
+    case d @ Deduplicate(keys, child, _) if d.originalOutputMode != OutputMode.Append() =>
--- End diff --
existing: It's kind of odd that this decision is made in the optimizer and
not the query planner. I think our aggregate operator is actually worse than
the specialized deduplication operator (since the specialized one is
non-blocking).
It doesn't have to be in this PR, but we should probably move this to the
planner eventually.
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18925
ok to test
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18925
add to whitelist
---
Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/18828#discussion_r131250896
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
---
@@ -181,17 +181,38 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT
   override def innerChildren: Seq[QueryPlan[_]] = subqueries
   /**
+   * A private mutable variable to indicate whether this plan is the result of canonicalization.
+   * This is used solely for making sure we wouldn't execute a canonicalized plan.
+ * See [[canonicalized]] on how this is set.
+ */
+ @transient
--- End diff --
I guess plans are already not valid on executors, but why `@transient`?
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18822
LGTM
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18803
LGTM
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18780
I'm in favor of fairly aggressive closing of inactive pull requests. The
cost of reopening them is small, and the benefits of having a clear view of
what is in progress are large. I think as long as the notification is clear and
polite, it won't discourage contribution.
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18658
That seems reasonable. I'm kind of pro-truncation for very, very large code. Even though it's not great to have something truncated, outputting GBs of logs is also pretty bad for downstream consumers.
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18658
I don't have super strong opinions here, but in my experience it's not always easy to get users to rerun a failed query with a different logging level. Have we considered truncating or special-casing the 64k limitation instead?
---
Github user marmbrus commented on the issue:
https://github.com/apache/spark/pull/18638
LGTM
---
Repository: spark-website
Updated Branches:
refs/heads/asf-site 5ddf243fd -> f57f0702b
Add pyspark instructions
Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/f57f0702
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/f57f0702
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/f57f0702
Branch: refs/heads/asf-site
Commit: f57f0702b293bed20111de50f62eaea5026175a4
Parents: 5ddf243
Author: Michael Armbrust
Authored: Thu Jul 13 20:17:23 2017 +
Committer: Michael Armbrust
Committed: Thu Jul 13 20:17:23 2017 +
--
downloads.md | 3 +++
releases/_posts/2017-07-11-spark-release-2-2-0.md | 1 +
site/downloads.html | 3 +++
site/releases/spark-release-2-2-0.html | 2 ++
site/sitemap.xml | 12 ++--
5 files changed, 15 insertions(+), 6 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f57f0702/downloads.md
--
diff --git a/downloads.md b/downloads.md
index 2f1f50c..bf96c5e 100644
--- a/downloads.md
+++ b/downloads.md
@@ -53,6 +53,9 @@ Spark artifacts are [hosted in Maven Central](https://search.maven.org/#search%7
artifactId: spark-core_2.11
version: 2.2.0
+### Installing with PyPi
+https://pypi.python.org/pypi/pyspark";>PySpark is now available in
pypi. To install just run `pip install pyspark`.
+
### Spark Source Code Management
If you are interested in working with the newest under-development code or
contributing to Apache Spark development, you can also check out the master
branch from Git:
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f57f0702/releases/_posts/2017-07-11-spark-release-2-2-0.md
--
diff --git a/releases/_posts/2017-07-11-spark-release-2-2-0.md
b/releases/_posts/2017-07-11-spark-release-2-2-0.md
index b630c75..2138eb3 100644
--- a/releases/_posts/2017-07-11-spark-release-2-2-0.md
+++ b/releases/_posts/2017-07-11-spark-release-2-2-0.md
@@ -14,6 +14,7 @@ meta:
Apache Spark 2.2.0 is the third release on the 2.x line. This release removes
the experimental tag from Structured Streaming. In addition, this release
focuses more on usability, stability, and polish, resolving over 1100 tickets.
+Additionally, we are excited to announce that <a href="https://pypi.python.org/pypi/pyspark">PySpark</a> is now available in pypi. To install just run `pip install pyspark`.
To download Apache Spark 2.2.0, visit the downloads page. You can consult JIRA
for the [detailed
changes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12315420&version=12338275).
We have curated a list of high level changes here, grouped by major modules.
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f57f0702/site/downloads.html
--
diff --git a/site/downloads.html b/site/downloads.html
index 35e8920..5d3653d 100644
--- a/site/downloads.html
+++ b/site/downloads.html
@@ -249,6 +249,9 @@ artifactId: spark-core_2.11
version: 2.2.0
+Installing with PyPi
+https://pypi.python.org/pypi/pyspark";>PySpark is now available
in pypi. To install just run pip install pyspark.
+
Spark Source Code Management
If you are interested in working with the newest under-development code or
contributing to Apache Spark development, you can also check out the master
branch from Git:
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f57f0702/site/releases/spark-release-2-2-0.html
--
diff --git a/site/releases/spark-release-2-2-0.html
b/site/releases/spark-release-2-2-0.html
index fc43088..56f8d21 100644
--- a/site/releases/spark-release-2-2-0.html
+++ b/site/releases/spark-release-2-2-0.html
@@ -199,6 +199,8 @@
Apache Spark 2.2.0 is the third release on the 2.x line. This release
removes the experimental tag from Structured Streaming. In addition, this
release focuses more on usability, stability, and polish, resolving over 1100
tickets.
+Additionally, we are excited to announce that <a href="https://pypi.python.org/pypi/pyspark">PySpark</a> is now available in pypi. To install just run pip install pyspark.
+
To download Apache Spark 2.2.0, visit the downloads page. You can consult JIRA for the <a href="https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12315420&version=12338275">detailed changes</a>. We have curated a list of high level changes here, grouped by major modules.
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f57f0702/site/sitemap.xml
--
Repository: spark-website
Updated Branches:
refs/heads/asf-site 2fac17731 -> 40f588bb5
Fix 2.2.0 contributor list
Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/40f588bb
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/40f588bb
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/40f588bb
Branch: refs/heads/asf-site
Commit: 40f588bb525e21e457c5d839937350a5c18172c4
Parents: 2fac177
Author: Michael Armbrust
Authored: Wed Jul 12 22:46:30 2017 +
Committer: Michael Armbrust
Committed: Wed Jul 12 15:48:01 2017 -0700
--
releases/_posts/2017-07-11-spark-release-2-2-0.md | 2 +-
site/releases/spark-release-2-2-0.html| 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark-website/blob/40f588bb/releases/_posts/2017-07-11-spark-release-2-2-0.md
--
diff --git a/releases/_posts/2017-07-11-spark-release-2-2-0.md
b/releases/_posts/2017-07-11-spark-release-2-2-0.md
index 37d3638..52ae28f 100644
--- a/releases/_posts/2017-07-11-spark-release-2-2-0.md
+++ b/releases/_posts/2017-07-11-spark-release-2-2-0.md
@@ -148,4 +148,4 @@ The main focus of SparkR in the 2.2.0 release was adding
extensive support for e
### Credits
Last but not least, this release would not have been possible without the
following contributors:
-ALeksander Eskilson, Aaditya Ramesh, Adam Roberts, Adrian Petrescu, Ahmed
Mahran, Alex Bozarth, Alexander Shorin, Alexander Ulanov, Andrew Duffy, Andrew
Mills, Andrew Ray, Angus Gerry, Anthony Truchet, Anton Okolnychyi, Artur
Sukhenko, Bartek Wisniewski, Bijay Pathak, Bill Chambers, Bjarne Fruergaard,
Brian Cho, Bryan Cutler, Burak Yavuz, Cen Yu Hai, Charles Allen, Cheng Lian,
Chie Hayashida, Christian Kadner, Clark Fitzgerald, Cody Koeninger, Daniel
Darabos, Daoyuan Wang, David Navas, Davies Liu, Denny Lee, Devaraj K, Dhruve
Ashar, Dilip Biswal, Ding Ding, Dmitriy Sokolov, Dongjoon Hyun, Drew Robb,
Ekasit Kijsipongse, Eren Avsarogullari, Ergin Seyfe, Eric Liang, Erik
O'Shaughnessy, Eyal Farago, Felix Cheung, Ferdinand Xu, Fred Reiss, Fu Xing,
Gabriel Huang, Gaetan Semet, Gang Wu, Gayathri Murali, Gu Huiqin Alice,
Guoqiang Li, Gurvinder Singh, Hao Ren, Herman Van Hovell, Hiroshi Inoue, Holden
Karau, Hossein Falaki, Huang Zhaowei, Huaxin Gao, Hyukjin Kwon, Imran Rashid,
Jacek Laskows
ki, Jagadeesan A S, Jakob Odersky, Jason White, Jeff Zhang, Jianfei Wang,
Jiang Xingbo, Jie Huang, Jie Xiong, Jisoo Kim, John Muller, Jose Hiram Soltren,
Joseph K. Bradley, Josh Rosen, Jun Kim, Junyang Qian, Justin Pihony, Kapil
Singh, Kay Ousterhout, Kazuaki Ishizaki, Kevin Grealish, Kevin McHale, Kishor
Patil, Koert Kuipers, Kousuke Saruta, Krishna Kalyan, Liang Ke, Liang-Chi
Hsieh, Lianhui Wang, Linbo Jin, Liwei Lin, Luciano Resende, Maciej Brynski,
Maciej Szymkiewicz, Mahmoud Rawas, Manoj Kumar, Marcelo Vanzin, Mariusz
Strzelecki, Mark Grover, Maxime Rihouey, Miao Wang, Michael Allman, Michael
Armbrust, Michael Gummelt, Michal Senkyr, Michal Wesolowski, Mikael Staldal,
Mike Ihbe, Mitesh Patel, Nan Zhu, Nattavut Sutyanyong, Nic Eggert, Nicholas
Chammas, Nick Lavers, Nick Pentreath, Nicolas Fraison, Noritaka Sekiyama, Peng
Meng, Peng, Meng, Pete Robbins, Peter Ableda, Peter Lee, Philipp Hoffmann,
Prashant Sharma, Prince J Wesley, Priyanka Garg, Qian Huang, Qifan Pu, Rajesh
Balamoh
an, Reynold Xin, Robert Kruszewski, Russell Spitzer, Ryan Blue, Saisai Shao,
Sameer Agarwal, Sami Jaktholm, Sandeep Purohit, Sandeep Singh, Satendra Kumar,
Sean Owen, Sean Zhong, Seth Hendrickson, Sharkd Tu, Shen Hong, Shivansh
Srivastava, Shivaram Venkataraman, Shixiong Zhu, Shuai Lin, Shubham Chopra,
Sital Kedia, Song Jun, Srinath Shankar, Stavros Kontopoulos, Stefan Schulze,
Steve Loughran, Suman Somasundar, Sun Dapeng, Sun Rui, Sunitha Kambhampati,
Suresh Thalamati, Susan X. Huynh, Sylvain Zimmer, Takeshi YAMAMURO, Takuya
UESHIN, Tao LI, Tao Lin, Tao Wang, Tarun Kumar, Tathagata Das, Tejas Patil,
Thomas Graves, Timothy Chen, Timothy Hunter, Tom Graves, Tom Magrino, Tommy YU,
Tyson Condie, Uncle Gen, Vinayak Joshi, Vincent Xie, Wang Fei, Wang Lei, Wang
Tao, Wayne Zhang, Weichen Xu, Weiluo (David) Ren, Weiqing Yang, Wenchen Fan,
Wesley Tang, William Benton, Wojciech Szymanski, Xiangrui Meng, Xianyang Liu,
Xiao Li, Xin Ren, Xin Wu, Xing SHI, Xusen Yin, Yadong Qi, Yanbo Liang, Yang
Wang, Yangyang Liu, Yin Huai, Yu Peng, Yucai Yu, Yuhao Yang, Yuming Wang, Yun
Ni, Yves Raimond, Zhan Zhang, Zheng RuiFeng, Zhenhua Wang, pkch, tone-zhang,
yimuxi
\ No newline at end of file
+ALeksander Eskilson, Aaditya Ramesh, Adam Budde, Adam Roberts, Adrian Ionescu,
Ala Luszczak, Alex Bozarth, Andrew Ray, Anirudh Ramanathan, Anthony Truch
Repository: spark-website
Updated Branches:
refs/heads/asf-site ee654d1f3 -> f2d5d2a68
Fix downloads page
Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/f2d5d2a6
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/f2d5d2a6
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/f2d5d2a6
Branch: refs/heads/asf-site
Commit: f2d5d2a6800e479396a21db2e349db6008df84ec
Parents: ee654d1
Author: Michael Armbrust
Authored: Tue Jul 11 22:47:12 2017 +
Committer: Michael Armbrust
Committed: Tue Jul 11 22:47:12 2017 +
--
downloads.md| 6 +++---
site/downloads.html | 6 +++---
2 files changed, 6 insertions(+), 6 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f2d5d2a6/downloads.md
--
diff --git a/downloads.md b/downloads.md
index 16d6062..2f1f50c 100644
--- a/downloads.md
+++ b/downloads.md
@@ -51,7 +51,7 @@ Spark artifacts are [hosted in Maven
Central](https://search.maven.org/#search%7
groupId: org.apache.spark
artifactId: spark-core_2.11
-version: 2.1.1
+version: 2.2.0
### Spark Source Code Management
If you are interested in working with the newest under-development code or
contributing to Apache Spark development, you can also check out the master
branch from Git:
@@ -59,8 +59,8 @@ If you are interested in working with the newest
under-development code or contr
# Master development branch
git clone git://github.com/apache/spark.git
-# 2.1 maintenance branch with stability fixes on top of Spark 2.1.0
-git clone git://github.com/apache/spark.git -b branch-2.1
+# 2.2 maintenance branch with stability fixes on top of Spark 2.2.0
+git clone git://github.com/apache/spark.git -b branch-2.2
Once you've downloaded Spark, you can find instructions for installing and
building it on the documentation
page.
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f2d5d2a6/site/downloads.html
--
diff --git a/site/downloads.html b/site/downloads.html
index 93bbcdc..35e8920 100644
--- a/site/downloads.html
+++ b/site/downloads.html
@@ -246,7 +246,7 @@ You can select and download it above.
groupId: org.apache.spark
artifactId: spark-core_2.11
-version: 2.1.1
+version: 2.2.0
Spark Source Code Management
@@ -255,8 +255,8 @@ version: 2.1.1
# Master development branch
git clone git://github.com/apache/spark.git
-# 2.1 maintenance branch with stability fixes on top of Spark 2.1.0
-git clone git://github.com/apache/spark.git -b branch-2.1
+# 2.2 maintenance branch with stability fixes on top of Spark 2.2.0
+git clone git://github.com/apache/spark.git -b branch-2.2
Once you’ve downloaded Spark, you can find instructions for
installing and building it on the documentation
page.
-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/nafunctions.html
--
diff --git a/site/docs/2.2.0/api/R/nafunctions.html
b/site/docs/2.2.0/api/R/nafunctions.html
new file mode 100644
index 000..aaf39cb
--- /dev/null
+++ b/site/docs/2.2.0/api/R/nafunctions.html
@@ -0,0 +1,349 @@
+
+R: A set of SparkDataFrame functions working with NA
values
+
+
+
+
+
+dropna
{SparkR}R Documentation
+
+A set of SparkDataFrame functions working with NA values
+
+Description
+
+dropna, na.omit - Returns a new SparkDataFrame omitting rows with null
values.
+
+fillna - Replace null values.
+
+
+
+Usage
+
+
+## S4 method for signature 'SparkDataFrame'
+dropna(x, how = c("any", "all"),
+ minNonNulls = NULL, cols = NULL)
+
+## S4 method for signature 'SparkDataFrame'
+na.omit(object, how = c("any", "all"),
+ minNonNulls = NULL, cols = NULL)
+
+## S4 method for signature 'SparkDataFrame'
+fillna(x, value, cols = NULL)
+
+dropna(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL)
+
+na.omit(object, ...)
+
+fillna(x, value, cols = NULL)
+
+
+
+Arguments
+
+
+x
+
+a SparkDataFrame.
+
+how
+
+"any" or "all".
+if "any", drop a row if it contains any nulls.
+if "all", drop a row only if all its values are null.
+if minNonNulls is specified, how is ignored.
+
+minNonNulls
+
+if specified, drop rows that have less than
+minNonNulls non-null values.
+This overwrites the how parameter.
+
+cols
+
+optional list of column names to consider. In fillna,
+columns specified in cols that do not have matching data
+type are ignored. For example, if value is a character, and
+subset contains a non-character column, then the non-character
+column is simply ignored.
+
+object
+
+a SparkDataFrame.
+
+value
+
+value to replace null values with.
+Should be an integer, numeric, character or named list.
+If the value is a named list, then cols is ignored and
+value must be a mapping from column name (character) to
+replacement value. The replacement value must be an
+integer, numeric or character.
+
+...
+
+further arguments to be passed to or from other methods.
+
+
+
+
+Value
+
+A SparkDataFrame.
+
+
+
+Note
+
+dropna since 1.4.0
+
+na.omit since 1.5.0
+
+fillna since 1.4.0
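For readers of the Python API, a hedged sketch of the analogous calls (DataFrame.dropna / DataFrame.fillna, where `thresh` plays the role of `minNonNulls`); the SparkR signatures are as documented above, and the column names here are illustrative only.

```python
# Hedged sketch of the analogous null-handling calls in the Python API.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, None), (2, "b"), (None, None)], ["age", "name"])

df.dropna(how="any").show()                      # drop rows containing any null
df.dropna(thresh=1).show()                       # keep rows with at least 1 non-null value
df.fillna({"age": 0, "name": "unknown"}).show()  # per-column replacement values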
+
+
+
+See Also
+
+Other SparkDataFrame functions: $,
+$,SparkDataFrame-method, $<-,
+$<-,SparkDataFrame-method,
+select, select,
+select,SparkDataFrame,Column-method,
+select,SparkDataFrame,character-method,
+select,SparkDataFrame,list-method;
+SparkDataFrame-class; [,
+[,SparkDataFrame-method, [[,
+[[,SparkDataFrame,numericOrcharacter-method,
+[[<-,
+[[<-,SparkDataFrame,numericOrcharacter-method,
+subset, subset,
+subset,SparkDataFrame-method;
+agg, agg, agg,
+agg,GroupedData-method,
+agg,SparkDataFrame-method,
+summarize, summarize,
+summarize,
+summarize,GroupedData-method,
+summarize,SparkDataFrame-method;
+arrange, arrange,
+arrange,
+arrange,SparkDataFrame,Column-method,
+arrange,SparkDataFrame,character-method,
+orderBy,SparkDataFrame,characterOrColumn-method;
+as.data.frame,
+as.data.frame,SparkDataFrame-method;
+attach,
+attach,SparkDataFrame-method;
+cache, cache,
+cache,SparkDataFrame-method;
+checkpoint, checkpoint,
+checkpoint,SparkDataFrame-method;
+coalesce, coalesce,
+coalesce,
+coalesce,Column-method,
+coalesce,SparkDataFrame-method;
+collect, collect,
+collect,SparkDataFrame-method;
+colnames, colnames,
+colnames,SparkDataFrame-method,
+colnames<-, colnames<-,
+colnames<-,SparkDataFrame-method,
+columns, columns,
+columns,SparkDataFrame-method,
+names,
+names,SparkDataFrame-method,
+names<-,
+names<-,SparkDataFrame-method;
+coltypes, coltypes,
+coltypes,SparkDataFrame-method,
+coltypes<-, coltypes<-,
+coltypes<-,SparkDataFrame,character-method;
+count,SparkDataFrame-method,
+nrow, nrow,
+nrow,SparkDataFrame-method;
+createOrReplaceTempView,
+createOrReplaceTempView,
+createOrReplaceTempView,SparkDataFrame,character-method;
+crossJoin,
+crossJoin,SparkDataFrame,SparkDataFrame-method;
+dapplyCollect, dapplyCollect,
+dapplyCollect,SparkDataFrame,function-method;
+dapply, dapply,
+dapply,SparkDataFrame,function,structType-method;
+describe, describe,
+describe,
+describe,SparkDataFrame,ANY-method,
+describe,SparkDataFrame,character-method,
+describe,SparkDataFrame-method,
+summary, summary,
+summary,SparkDataFrame-method;
+dim,
+dim,SparkDataFrame-method;
+distinct, distinct,
+distinct,SparkDataFrame-method,
+unique,
+unique,SparkDataFrame-method;
+dropDuplicates,
+dropDuplicates,
+dropDuplicates,SparkDataFrame-method;
+drop, drop,
+drop, drop,ANY-method,
+drop,SparkDataFrame-method;
+dtypes, dtypes,
+dtypes,SparkDataFrame-method;
+except, except,
+except,SparkDataFrame,SparkD
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/listColumns.html
--
diff --git a/site/docs/2.2.0/api/R/listColumns.html
b/site/docs/2.2.0/api/R/listColumns.html
new file mode 100644
index 000..0c486a7
--- /dev/null
+++ b/site/docs/2.2.0/api/R/listColumns.html
@@ -0,0 +1,67 @@
+
+R: Returns a list of columns for the given table/view in
the...
+
+
+
+
+
+listColumns {SparkR}R
Documentation
+
+Returns a list of columns for the given table/view in the specified
database
+
+Description
+
+Returns a list of columns for the given table/view in the specified
database.
+
+
+
+Usage
+
+
+listColumns(tableName, databaseName = NULL)
+
+
+
+Arguments
+
+
+tableName
+
+the qualified or unqualified name that designates a table/view. If no
database
+identifier is provided, it refers to a table/view in the current database.
+If databaseName parameter is specified, this must be an
unqualified name.
+
+databaseName
+
+(optional) name of the database
+
+
+
+
+Value
+
+a SparkDataFrame of the list of column descriptions.
+
+
+
+Note
+
+since 2.2.0
+
+
+
+Examples
+
+## Not run:
+##D sparkR.session()
+##D listColumns("mytable")
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/listDatabases.html
--
diff --git a/site/docs/2.2.0/api/R/listDatabases.html
b/site/docs/2.2.0/api/R/listDatabases.html
new file mode 100644
index 000..e903961
--- /dev/null
+++ b/site/docs/2.2.0/api/R/listDatabases.html
@@ -0,0 +1,51 @@
+
+R: Returns a list of databases available
+
+
+
+
+
+listDatabases {SparkR}R
Documentation
+
+Returns a list of databases available
+
+Description
+
+Returns a list of databases available.
+
+
+
+Usage
+
+
+listDatabases()
+
+
+
+Value
+
+a SparkDataFrame of the list of databases.
+
+
+
+Note
+
+since 2.2.0
+
+
+
+Examples
+
+## Not run:
+##D sparkR.session()
+##D listDatabases()
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/listFunctions.html
--
diff --git a/site/docs/2.2.0/api/R/listFunctions.html
b/site/docs/2.2.0/api/R/listFunctions.html
new file mode 100644
index 000..85ae31b
--- /dev/null
+++ b/site/docs/2.2.0/api/R/listFunctions.html
@@ -0,0 +1,62 @@
+
+R: Returns a list of functions registered in the
specified...
+
+
+
+
+
+listFunctions {SparkR}R
Documentation
+
+Returns a list of functions registered in the specified database
+
+Description
+
+Returns a list of functions registered in the specified database.
+This includes all temporary functions.
+
+
+
+Usage
+
+
+listFunctions(databaseName = NULL)
+
+
+
+Arguments
+
+
+databaseName
+
+(optional) name of the database
+
+
+
+
+Value
+
+a SparkDataFrame of the list of function descriptions.
+
+
+
+Note
+
+since 2.2.0
+
+
+
+Examples
+
+## Not run:
+##D sparkR.session()
+##D listFunctions()
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
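The preceding catalog pages (listColumns, listDatabases, listFunctions) have direct counterparts on spark.catalog in the Python API; a hedged sketch follows, with "mytable" a purely illustrative temporary view name.

```python
# Hedged sketch of the corresponding catalog calls in the Python API (spark.catalog.*).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

print([db.name for db in spark.catalog.listDatabases()])       # available databases
print([f.name for f in spark.catalog.listFunctions()][:5])     # registered functions, incl. temporary ones

spark.range(3).createOrReplaceTempView("mytable")
print([c.name for c in spark.catalog.listColumns("mytable")])  # column descriptions for the view
```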
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/listTables.html
--
diff --git a/site/docs/2.2.0/api/R/listTables.html
b/site/docs/2.2.0/api/R/listTables.html
new file mode 100644
index 000..a418a55
--- /dev/null
+++ b/site/docs/2.2.0/api/R/listTables.html
@@ -0,0 +1,69 @@
+
+R: Returns a list of tables or views in the specified
database
+
+
+
+
+
+listTables
{SparkR}R Documentation
+
+Returns a list of tables or views in the specified database
+
+Description
+
+Returns a list of tables or views in the specified database.
+This includes a
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/crosstab.html
--
diff --git a/site/docs/2.2.0/api/R/crosstab.html
b/site/docs/2.2.0/api/R/crosstab.html
new file mode 100644
index 000..e944279
--- /dev/null
+++ b/site/docs/2.2.0/api/R/crosstab.html
@@ -0,0 +1,94 @@
+
+R: Computes a pair-wise frequency table of the given
columns
+
+
+
+
+
+crosstab
{SparkR}R Documentation
+
+Computes a pair-wise frequency table of the given columns
+
+Description
+
+Computes a pair-wise frequency table of the given columns. Also known as a
contingency
+table. The number of distinct values for each column should be less than 1e4.
At most 1e6
+non-zero pair frequencies will be returned.
+
+
+
+Usage
+
+
+## S4 method for signature 'SparkDataFrame,character,character'
+crosstab(x, col1, col2)
+
+
+
+Arguments
+
+
+x
+
+a SparkDataFrame
+
+col1
+
+name of the first column. Distinct items will make the first item of each
row.
+
+col2
+
+name of the second column. Distinct items will make the column names of the
output.
+
+
+
+
+Value
+
+a local R data.frame representing the contingency table. The first column
of each row
+will be the distinct values of col1 and the column names will be
the distinct values
+of col2. The name of the first column will be
"col1_col2". Pairs
+that have no occurrences will have zero as their counts.
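A hedged illustration of the same contingency-table layout through the Python API (DataFrame.crosstab); the data below are made up.

```python
# Hedged sketch: DataFrame.crosstab produces the layout described above,
# i.e. a first column named "col1_col2" and zero counts for absent pairs.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Doctor", "M"), ("Doctor", "F"), ("Nurse", "F")], ["title", "gender"])

ct = df.crosstab("title", "gender")
ct.show()   # first column is named "title_gender"; absent pairs get a count of 0
```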
+
+
+
+Note
+
+crosstab since 1.5.0
+
+
+
+See Also
+
+Other stat functions: approxQuantile,
+approxQuantile,SparkDataFrame,character,numeric,numeric-method;
+corr, corr,
+corr, corr,Column-method,
+corr,SparkDataFrame-method;
+cov, cov, cov,
+cov,SparkDataFrame-method,
+cov,characterOrColumn-method,
+covar_samp, covar_samp,
+covar_samp,characterOrColumn,characterOrColumn-method;
+freqItems,
+freqItems,SparkDataFrame,character-method;
+sampleBy, sampleBy,
+sampleBy,SparkDataFrame,character,list,numeric-method
+
+
+
+Examples
+
+## Not run:
+##D df <- read.json("/path/to/file.json")
+##D ct <- crosstab(df, "title", "gender")
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/cume_dist.html
--
diff --git a/site/docs/2.2.0/api/R/cume_dist.html
b/site/docs/2.2.0/api/R/cume_dist.html
new file mode 100644
index 000..328243d
--- /dev/null
+++ b/site/docs/2.2.0/api/R/cume_dist.html
@@ -0,0 +1,90 @@
+
+R: cume_dist
+
+
+
+
+
+cume_dist
{SparkR}R Documentation
+
+cume_dist
+
+Description
+
+Window function: returns the cumulative distribution of values within a
window partition,
+i.e. the fraction of rows that are below the current row.
+
+
+
+Usage
+
+
+## S4 method for signature 'missing'
+cume_dist()
+
+cume_dist(x = "missing")
+
+
+
+Arguments
+
+
+x
+
+empty. Should be used with no argument.
+
+
+
+
+Details
+
+N = total number of rows in the partition
+cume_dist(x) = number of values before (and including) x / N
+
+This is equivalent to the CUME_DIST function in SQL.
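A small worked example makes the formula above concrete: with a single partition holding hp values 100, 150, 200, cume_dist yields 1/3, 2/3 and 3/3. A hedged sketch through the Python API (dataset and column names are illustrative):

```python
# Hedged sketch of cume_dist over a window in the Python API.
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 100), (1, 150), (1, 200)], ["am", "hp"])

w = Window.partitionBy("am").orderBy("hp")
df.select("am", "hp", F.cume_dist().over(w).alias("cume_dist")).show()
# expected fractions: 0.333..., 0.666..., 1.0
```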
+
+
+
+Note
+
+cume_dist since 1.6.0
+
+
+
+See Also
+
+Other window_funcs: dense_rank,
+dense_rank,
+dense_rank,missing-method;
+lag, lag,
+lag,characterOrColumn-method;
+lead, lead,
+lead,characterOrColumn,numeric-method;
+ntile, ntile,
+ntile,numeric-method;
+percent_rank, percent_rank,
+percent_rank,missing-method;
+rank, rank,
+rank, rank,ANY-method,
+rank,missing-method;
+row_number, row_number,
+row_number,missing-method
+
+
+
+Examples
+
+## Not run:
+##D df <- createDataFrame(mtcars)
+##D ws <- orderBy(windowPartitionBy("am"), "hp")
+##D out <- select(df, over(cume_dist(), ws), df$hp, df$am)
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/currentDatabase.html
--
diff --git a/site/docs/2.2.0/api/R/currentDatabase.html
b/site/docs/2.2.0/api/R/currentDatabase.html
new file mode 100644
index 000..0b9d463
--- /dev/null
+++ b/site/docs/2.2.0/api/R/currentDatabase.html
@@ -0,0 +1,51 @@
+
+R: Returns the current default database
+
+
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/tableNames.html
--
diff --git a/site/docs/2.2.0/api/R/tableNames.html
b/site/docs/2.2.0/api/R/tableNames.html
new file mode 100644
index 000..3d0c8ea
--- /dev/null
+++ b/site/docs/2.2.0/api/R/tableNames.html
@@ -0,0 +1,62 @@
+
+R: Table Names
+
+
+
+
+
+tableNames
{SparkR}R Documentation
+
+Table Names
+
+Description
+
+Returns the names of tables in the given database as an array.
+
+
+
+Usage
+
+
+## Default S3 method:
+tableNames(databaseName = NULL)
+
+
+
+Arguments
+
+
+databaseName
+
+(optional) name of the database
+
+
+
+
+Value
+
+a list of table names
+
+
+
+Note
+
+tableNames since 1.4.0
+
+
+
+Examples
+
+## Not run:
+##D sparkR.session()
+##D tableNames("hive")
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/tableToDF.html
--
diff --git a/site/docs/2.2.0/api/R/tableToDF.html
b/site/docs/2.2.0/api/R/tableToDF.html
new file mode 100644
index 000..086636f
--- /dev/null
+++ b/site/docs/2.2.0/api/R/tableToDF.html
@@ -0,0 +1,68 @@
+
+R: Create a SparkDataFrame from a SparkSQL table or
view
+
+
+
+
+
+tableToDF
{SparkR}R Documentation
+
+Create a SparkDataFrame from a SparkSQL table or view
+
+Description
+
+Returns the specified table or view as a SparkDataFrame. The table or view
must already exist or
+have already been registered in the SparkSession.
+
+
+
+Usage
+
+
+tableToDF(tableName)
+
+
+
+Arguments
+
+
+tableName
+
+the qualified or unqualified name that designates a table or view. If a
database
+is specified, it identifies the table/view from the database.
+Otherwise, it first attempts to find a temporary view with the given name
+and then match the table/view from the current database.
+
+
+
+
+Value
+
+SparkDataFrame
+
+
+
+Note
+
+tableToDF since 2.0.0
+
+
+
+Examples
+
+## Not run:
+##D sparkR.session()
+##D path <- "path/to/file.json"
+##D df <- read.json(path)
+##D createOrReplaceTempView(df, "table")
+##D new_df <- tableToDF("table")
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
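For reference, the Python-side equivalent of tableToDF is SparkSession.table, which follows the same resolution order (temporary view first, then the current database); a hedged sketch with a hypothetical view name:

```python
# Hedged sketch: spark.table() returns a DataFrame for an existing table or view.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.range(3).createOrReplaceTempView("people")   # "people" is a hypothetical view name
new_df = spark.table("people")
new_df.show()
```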
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/tables.html
--
diff --git a/site/docs/2.2.0/api/R/tables.html
b/site/docs/2.2.0/api/R/tables.html
new file mode 100644
index 000..555027e
--- /dev/null
+++ b/site/docs/2.2.0/api/R/tables.html
@@ -0,0 +1,68 @@
+
+R: Tables
+
+
+
+
+
+tables
{SparkR}R Documentation
+
+Tables
+
+Description
+
+Returns a SparkDataFrame containing names of tables in the given database.
+
+
+
+Usage
+
+
+## Default S3 method:
+tables(databaseName = NULL)
+
+
+
+Arguments
+
+
+databaseName
+
+(optional) name of the database
+
+
+
+
+Value
+
+a SparkDataFrame
+
+
+
+Note
+
+tables since 1.4.0
+
+
+
+See Also
+
+listTables
+
+
+
+Examples
+
+## Not run:
+##D sparkR.session()
+##D tables("hive")
+## End(Not run)
+
+
+
+[Package SparkR version 2.2.0 Index]
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/take.html
--
diff --git a/site/docs/2.2.0/api/R/take.html b/site/docs/2.2.0/api/R/take.html
new file mode 100644
index 000..e9af127
--- /dev/null
+++ b/site/docs/2.2.0/api/R/take.html
@@ -0,0 +1,288 @@
+
+R: Take the first NUM rows of a SparkDataFrame and return
the...
+
+
+
+
+
+take
{SparkR}R Documentation
+
+Take the first NUM rows of a SparkDataFrame and return the results as an R
data.frame
+
+Description
+
+Take the first NUM rows of a SparkDataFrame and return the results as an R
data.frame
+
+
+
+Usage
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/spark.lda.html
--
diff --git a/site/docs/2.2.0/api/R/spark.lda.html
b/site/docs/2.2.0/api/R/spark.lda.html
new file mode 100644
index 000..aa1cfa8
--- /dev/null
+++ b/site/docs/2.2.0/api/R/spark.lda.html
@@ -0,0 +1,247 @@
+
+R: Latent Dirichlet Allocation
+
+
+
+
+
+spark.lda
{SparkR}R Documentation
+
+Latent Dirichlet Allocation
+
+Description
+
+spark.lda fits a Latent Dirichlet Allocation model on a
SparkDataFrame. Users can call
+summary to get a summary of the fitted LDA model,
spark.posterior to compute
+posterior probabilities on new data, spark.perplexity to compute
log perplexity on new
+data and write.ml/read.ml to save/load fitted models.
+
+
+
+Usage
+
+
+spark.lda(data, ...)
+
+spark.posterior(object, newData)
+
+spark.perplexity(object, data)
+
+## S4 method for signature 'SparkDataFrame'
+spark.lda(data, features = "features", k = 10,
+ maxIter = 20, optimizer = c("online", "em"), subsamplingRate = 0.05,
+ topicConcentration = -1, docConcentration = -1,
+ customizedStopWords = "", maxVocabSize = bitwShiftL(1, 18))
+
+## S4 method for signature 'LDAModel'
+summary(object, maxTermsPerTopic)
+
+## S4 method for signature 'LDAModel,SparkDataFrame'
+spark.perplexity(object, data)
+
+## S4 method for signature 'LDAModel,SparkDataFrame'
+spark.posterior(object, newData)
+
+## S4 method for signature 'LDAModel,character'
+write.ml(object, path, overwrite = FALSE)
+
+
+
+Arguments
+
+
+data
+
+A SparkDataFrame for training.
+
+...
+
+additional argument(s) passed to the method.
+
+object
+
+A Latent Dirichlet Allocation model fitted by spark.lda.
+
+newData
+
+A SparkDataFrame for testing.
+
+features
+
+Features column name. Either libSVM-format column or character-format
column is
+valid.
+
+k
+
+Number of topics.
+
+maxIter
+
+Maximum iterations.
+
+optimizer
+
+Optimizer to train an LDA model, "online" or "em",
default is "online".
+
+subsamplingRate
+
+(For online optimizer) Fraction of the corpus to be sampled and used in
+each iteration of mini-batch gradient descent, in range (0, 1].
+
+topicConcentration
+
+concentration parameter (commonly named beta or
eta) for
+the prior placed on topic distributions over terms, default -1 to set
automatically on the
+Spark side. Use summary to retrieve the effective
topicConcentration. Only 1-size
+numeric is accepted.
+
+docConcentration
+
+concentration parameter (commonly named alpha) for the
+prior placed on documents distributions over topics (theta),
default -1 to set
+automatically on the Spark side. Use summary to retrieve the
effective
+docConcentration. Only 1-size or k-size numeric is accepted.
+
+customizedStopWords
+
+stopwords that need to be removed from the given corpus. Ignore the
+parameter if libSVM-format column is used as the features column.
+
+maxVocabSize
+
+maximum vocabulary size, default 1 << 18
+
+maxTermsPerTopic
+
+Maximum number of terms to collect for each topic. Default value of 10.
+
+path
+
+The directory where the model is saved.
+
+overwrite
+
+Overwrites or not if the output path already exists. Default is FALSE
+which means throw exception if the output path exists.
+
+
+
+
+Value
+
+spark.lda returns a fitted Latent Dirichlet Allocation model.
+
+summary returns summary information of the fitted model, which
is a list.
+The list includes
+
+
+docConcentration
+
+concentration parameter commonly named alpha for
+the prior placed on documents distributions over topics theta
+
+topicConcentration
+
+concentration parameter commonly named beta or
+eta for the prior placed on topic distributions over terms
+
+logLikelihood
+
+log likelihood of the entire corpus
+
+logPerplexity
+
+log perplexity
+
+isDistributed
+
+TRUE for distributed model while FALSE for local model
+
+vocabSize
+
+number of terms in the corpus
+
+topics
+
+top 10 terms and their weights of all topics
+
+vocabulary
+
+whole terms of the training corpus, NULL if libsvm format file
+used as training set
+
+trainingLogLikelihood
+
+Log likelihood of the observed tokens in the training set,
+given the current parameter estimates:
+log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)
+It is only for distributed LDA model (i.e., optimizer = "em")
+
+logPrior
+
+Log probability of the current parameter estimate:
+log P(topics, topic distributions for docs | Dirichlet hyperparameters)
+It is only for distributed LDA model (i.e., optimizer = "em")
+
+
+spark.perplexity returns the log perplexity of given
SparkDataFrame, or the log
+perplexity of the training data if missing argument "dat
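The spark.lda page above maps onto pyspark.ml.clustering.LDA on the Python side; a hedged sketch under that assumption, with toy count vectors and parameter values that simply mirror the k / maxIter / optimizer arguments documented here:

```python
# Hedged sketch of fitting LDA through the Python ML API; data are illustrative.
from pyspark.ml.clustering import LDA
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(0, Vectors.dense([1.0, 0.0, 3.0])), (1, Vectors.dense([0.0, 2.0, 1.0]))],
    ["id", "features"])

lda = LDA(k=2, maxIter=20, optimizer="online")
model = lda.fit(df)

print(model.logLikelihood(df), model.logPerplexity(df))
model.describeTopics(maxTermsPerTopic=3).show()   # top terms and weights per topic
```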
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/00Index.html
--
diff --git a/site/docs/2.2.0/api/R/00Index.html
b/site/docs/2.2.0/api/R/00Index.html
new file mode 100644
index 000..7b199db
--- /dev/null
+++ b/site/docs/2.2.0/api/R/00Index.html
@@ -0,0 +1,1714 @@
+
+R: R Frontend for Apache Spark
+
+
+
+ R Frontend for Apache Spark
+Documentation for package ‘SparkR’ version 2.2.0
+
+DESCRIPTION file.
+
+
+Help Pages
+
+
+
+
+
+
+-- A --
+
+
+abs
+abs
+abs-method
+abs
+acos
+acos
+acos-method
+acos
+add_months
+add_months
+add_months-method
+add_months
+AFTSurvivalRegressionModel-class
+S4 class that represents a AFTSurvivalRegressionModel
+agg
+Summarize data across columns
+agg-method
+Summarize data across columns
+alias
+alias
+alias-method
+alias
+ALSModel-class
+S4 class that represents an ALSModel
+approxCountDistinct
+Returns the approximate number of distinct items in a group
+approxCountDistinct-method
+Returns the approximate number of distinct items in a group
+approxQuantile
+Calculates the approximate quantiles of numerical columns of a
SparkDataFrame
+approxQuantile-method
+Calculates the approximate quantiles of numerical columns of a
SparkDataFrame
+arrange
+Arrange Rows by Variables
+arrange-method
+Arrange Rows by Variables
+array_contains
+array_contains
+array_contains-method
+array_contains
+as.data.frame
+Download data from a SparkDataFrame into a R data.frame
+as.data.frame-method
+Download data from a SparkDataFrame into a R data.frame
+as.DataFrame
+Create a SparkDataFrame
+as.DataFrame.default
+Create a SparkDataFrame
+asc
+A set of operations working with SparkDataFrame columns
+ascii
+ascii
+ascii-method
+ascii
+asin
+asin
+asin-method
+asin
+associationRules-method
+FP-growth
+atan
+atan
+atan-method
+atan
+atan2
+atan2
+atan2-method
+atan2
+attach
+Attach SparkDataFrame to R search path
+attach-method
+Attach SparkDataFrame to R search path
+avg
+avg
+avg-method
+avg
+awaitTermination
+awaitTermination
+awaitTermination-method
+awaitTermination
+
+
+-- B --
+
+
+base64
+base64
+base64-method
+base64
+between
+between
+between-method
+between
+bin
+bin
+bin-method
+bin
+BisectingKMeansModel-class
+S4 class that represents a BisectingKMeansModel
+bitwiseNOT
+bitwiseNOT
+bitwiseNOT-method
+bitwiseNOT
+bround
+bround
+bround-method
+bround
+
+
+-- C --
+
+
+cache
+Cache
+cache-method
+Cache
+cacheTable
+Cache Table
+cacheTable.default
+Cache Table
+cancelJobGroup
+Cancel active jobs for the specified group
+cancelJobGroup.default
+Cancel active jobs for the specified group
+cast
+Casts the column to a different data type.
+cast-method
+Casts the column to a different data type.
+cbrt
+cbrt
+cbrt-method
+cbrt
+ceil
+Computes the ceiling of the given value
+ceil-method
+Computes the ceiling of the given value
+ceiling
+Computes the ceiling of the given value
+ceiling-method
+Computes the ceiling of the given value
+checkpoint
+checkpoint
+checkpoint-method
+checkpoint
+clearCache
+Clear Cache
+clearCache.default
+Clear Cache
+clearJobGroup
+Clear current job group ID and its description
+clearJobGroup.default
+Clear current job group ID and its description
+coalesce
+Coalesce
+coalesce-method
+Coalesce
+collect
+Collects all the elements of a SparkDataFrame and coerces them into an R
data.frame.
+collect-method
+Collects all the elements of a SparkDataFrame and coerces them into an R
data.frame.
+colnames
+Column Names of SparkDataFrame
+colnames-method
+Column Names of SparkDataFrame
+colnames<-
+Column Names of SparkDataFrame
+colnames<--method
+Column Names of SparkDataFrame
+coltypes
+coltypes
+coltypes-method
+coltypes
+coltypes<-
+coltypes
+coltypes<--method
+coltypes
+column
+S4 class that represents a SparkDataFrame column
+Column-class
+S4 class that represents a SparkDataFrame column
+column-method
+S4 class that represents a SparkDataFrame column
+columnfunctions
+A set of operations working with SparkDataFrame columns
+columns
+Column Names of SparkDataFrame
+columns-method
+Column Names of SparkDataFrame
+concat
+concat
+concat-method
+concat
+concat_ws
+concat_ws
+concat_ws-method
+concat_ws
+contains
+A set of operations working with SparkDataFrame columns
+conv
+conv
+conv-method
+conv
+corr
+corr
+corr-method
+corr
+cos
+cos
+cos-method
+cos
+cosh
+cosh
+cosh-method
+cosh
+count
+Returns the number of items in a group
+count-method
+Returns the number of items in a group
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/README.md
--
diff --git a/site/docs/2.2.0/README.md b/site/docs/2.2.0/README.md
new file mode 100644
index 000..90e10a1
--- /dev/null
+++ b/site/docs/2.2.0/README.md
@@ -0,0 +1,73 @@
+Welcome to the Spark documentation!
+
+This readme will walk you through navigating and building the Spark
documentation, which is included
+here with the Spark source code. You can also find documentation specific to
release versions of
+Spark at http://spark.apache.org/documentation.html.
+
+Read on to learn more about viewing documentation in plain text (i.e.,
markdown) or building the
+documentation yourself. Why build it yourself? So that you have the docs that
corresponds to
+whichever version of Spark you currently have checked out of revision control.
+
+## Prerequisites
+The Spark documentation build uses a number of tools to build HTML docs and
API docs in Scala,
+Python and R.
+
+You need to have
[Ruby](https://www.ruby-lang.org/en/documentation/installation/) and
+[Python](https://docs.python.org/2/using/unix.html#getting-and-installing-the-latest-version-of-python)
+installed. Also install the following libraries:
+```sh
+$ sudo gem install jekyll jekyll-redirect-from pygments.rb
+$ sudo pip install Pygments
+# Following is needed only for generating API docs
+$ sudo pip install sphinx pypandoc
+$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "roxygen2",
"testthat", "rmarkdown"), repos="http://cran.stat.ucla.edu/";)'
+```
+(Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to
replace gem with gem2.0)
+
+## Generating the Documentation HTML
+
+We include the Spark documentation as part of the source (as opposed to using
a hosted wiki, such as
+the github wiki, as the definitive documentation) to enable the documentation
to evolve along with
+the source code and be captured by revision control (currently git). This way
the code automatically
+includes the version of the documentation that is relevant regardless of which
version or release
+you have checked out or downloaded.
+
+In this directory you will find text files formatted using Markdown, with an
".md" suffix. You can
+read those text files directly if you want. Start with index.md.
+
+Execute `jekyll build` from the `docs/` directory to compile the site.
Compiling the site with
+Jekyll will create a directory called `_site` containing index.html as well as
the rest of the
+compiled files.
+
+$ cd docs
+$ jekyll build
+
+You can modify the default Jekyll build as follows:
+```sh
+# Skip generating API docs (which takes a while)
+$ SKIP_API=1 jekyll build
+
+# Serve content locally on port 4000
+$ jekyll serve --watch
+
+# Build the site with extra features used on the live page
+$ PRODUCTION=1 jekyll build
+```
+
+## API Docs (Scaladoc, Sphinx, roxygen2)
+
+You can build just the Spark scaladoc by running `build/sbt unidoc` from the
SPARK_PROJECT_ROOT directory.
+
+Similarly, you can build just the PySpark docs by running `make html` from the
+SPARK_PROJECT_ROOT/python/docs directory. Documentation is only generated for
classes that are listed as
+public in `__init__.py`. The SparkR docs can be built by running
SPARK_PROJECT_ROOT/R/create-docs.sh.
+
+When you run `jekyll` in the `docs` directory, it will also copy over the
scaladoc for the various
+Spark subprojects into the `docs` directory (and then also into the `_site`
directory). We use a
+jekyll plugin to run `build/sbt unidoc` before building the site so if you
haven't run it (recently) it
+may take some time as it generates all of the scaladoc. The jekyll plugin
also generates the
+PySpark docs using [Sphinx](http://sphinx-doc.org/).
+
+NOTE: To skip the step of building and copying over the Scala, Python, R API
docs, run `SKIP_API=1
+jekyll`. In addition, `SKIP_SCALADOC=1`, `SKIP_PYTHONDOC=1`, and `SKIP_RDOC=1`
can be used to skip a single
+step of the corresponding language.
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api.html
--
diff --git a/site/docs/2.2.0/api.html b/site/docs/2.2.0/api.html
new file mode 100644
index 000..835f32d
--- /dev/null
+++ b/site/docs/2.2.0/api.html
@@ -0,0 +1,178 @@
+
+
+
+
+
+
+
+
+
+Spark API Documentation - Spark 2.2.0 Documentation
+
+
+
+
+
+
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/R/merge.html
--
diff --git a/site/docs/2.2.0/api/R/merge.html b/site/docs/2.2.0/api/R/merge.html
new file mode 100644
index 000..0bc3629
--- /dev/null
+++ b/site/docs/2.2.0/api/R/merge.html
@@ -0,0 +1,354 @@
+
+R: Merges two data frames
+
+
+
+
+
+merge
{SparkR}R Documentation
+
+Merges two data frames
+
+Description
+
+Merges two data frames
+
+
+
+Usage
+
+
+## S4 method for signature 'SparkDataFrame,SparkDataFrame'
+merge(x, y, by = intersect(names(x),
+ names(y)), by.x = by, by.y = by, all = FALSE, all.x = all,
+ all.y = all, sort = TRUE, suffixes = c("_x", "_y"), ...)
+
+merge(x, y, ...)
+
+
+
+Arguments
+
+
+x
+
+the first data frame to be joined.
+
+y
+
+the second data frame to be joined.
+
+by
+
+a character vector specifying the join columns. If by is not
+specified, the common column names in x and y will
be used.
+If by or both by.x and by.y are explicitly set to NULL or of length 0, the
Cartesian
+Product of x and y will be returned.
+
+by.x
+
+a character vector specifying the joining columns for x.
+
+by.y
+
+a character vector specifying the joining columns for y.
+
+all
+
+a boolean value setting all.x and all.y
+if any of them are unset.
+
+all.x
+
+a boolean value indicating whether all the rows in x should
+be included in the join.
+
+all.y
+
+a boolean value indicating whether all the rows in y should
+be included in the join.
+
+sort
+
+a logical argument indicating whether the resulting columns should be
sorted.
+
+suffixes
+
+a string vector of length 2 used to make colnames of
+x and y unique.
+The first element is appended to each colname of x.
+The second element is appended to each colname of y.
+
+...
+
+additional argument(s) passed to the method.
+
+
+
+
+Details
+
+If all.x and all.y are set to FALSE, a natural join will be returned. If
+all.x is set to TRUE and all.y is set to FALSE, a left outer join will
+be returned. If all.x is set to FALSE and all.y is set to TRUE, a right
+outer join will be returned. If all.x and all.y are set to TRUE, a full
+outer join will be returned.
+
+
+
+Note
+
+merge since 1.5.0
+
+
+
+See Also
+
+join crossJoin
+
+Other SparkDataFrame functions: $,
+$,SparkDataFrame-method, $<-,
+$<-,SparkDataFrame-method,
+select, select,
+select,SparkDataFrame,Column-method,
+select,SparkDataFrame,character-method,
+select,SparkDataFrame,list-method;
+SparkDataFrame-class; [,
+[,SparkDataFrame-method, [[,
+[[,SparkDataFrame,numericOrcharacter-method,
+[[<-,
+[[<-,SparkDataFrame,numericOrcharacter-method,
+subset, subset,
+subset,SparkDataFrame-method;
+agg, agg, agg,
+agg,GroupedData-method,
+agg,SparkDataFrame-method,
+summarize, summarize,
+summarize,
+summarize,GroupedData-method,
+summarize,SparkDataFrame-method;
+arrange, arrange,
+arrange,
+arrange,SparkDataFrame,Column-method,
+arrange,SparkDataFrame,character-method,
+orderBy,SparkDataFrame,characterOrColumn-method;
+as.data.frame,
+as.data.frame,SparkDataFrame-method;
+attach,
+attach,SparkDataFrame-method;
+cache, cache,
+cache,SparkDataFrame-method;
+checkpoint, checkpoint,
+checkpoint,SparkDataFrame-method;
+coalesce, coalesce,
+coalesce,
+coalesce,Column-method,
+coalesce,SparkDataFrame-method;
+collect, collect,
+collect,SparkDataFrame-method;
+colnames, colnames,
+colnames,SparkDataFrame-method,
+colnames<-, colnames<-,
+colnames<-,SparkDataFrame-method,
+columns, columns,
+columns,SparkDataFrame-method,
+names,
+names,SparkDataFrame-method,
+names<-,
+names<-,SparkDataFrame-method;
+coltypes, coltypes,
+coltypes,SparkDataFrame-method,
+coltypes<-, coltypes<-,
+coltypes<-,SparkDataFrame,character-method;
+count,SparkDataFrame-method,
+nrow, nrow,
+nrow,SparkDataFrame-method;
+createOrReplaceTempView,
+createOrReplaceTempView,
+createOrReplaceTempView,SparkDataFrame,character-method;
+crossJoin,
+crossJoin,SparkDataFrame,SparkDataFrame-method;
+dapplyCollect, dapplyCollect,
+dapplyCollect,SparkDataFrame,function-method;
+dapply, dapply,
+dapply,SparkDataFrame,function,structType-method;
+describe, describe,
+describe,
+describe,SparkDataFrame,ANY-method,
+describe,SparkDataFrame,character-method,
+describe,SparkDataFrame-method,
+summary, summary,
+summary,SparkDataFrame-method;
+dim,
+dim,SparkDataFrame-method;
+distinct, distinct,
+distinct,SparkDataFrame-method,
+unique,
+unique,SparkDataFrame-method;
+dropDuplicates,
+dropDuplicates,
+dropDuplicates,SparkDataFrame-method;
+dropna, dropna,
+dropna,SparkDataFrame-method,
+fillna, fillna,
+fillna,SparkDataFrame-method,
+na.omit, na.omit,
+na.omit,SparkDataFrame-method;
+drop, drop,
+drop, drop,ANY-method
Update for 2.2.0
Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/76d1abf1
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/76d1abf1
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/76d1abf1
Branch: refs/heads/asf-site
Commit: 76d1abf1fc7be6d4455958b0ce3e35b240a21342
Parents: f7ec115
Author: Michael Armbrust
Authored: Mon Jul 10 16:48:58 2017 -0700
Committer: Michael Armbrust
Committed: Tue Jul 11 19:24:06 2017 +
--
_layouts/global.html| 2 +-
js/downloads.js | 1 +
news/_posts/2017-07-11-spark-2-2-0-released.md | 14 ++
.../_posts/2017-07-11-spark-release-2-2-0.md| 151 +++
site/docs/latest| 2 +-
5 files changed, 168 insertions(+), 2 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/spark-website/blob/76d1abf1/_layouts/global.html
--
diff --git a/_layouts/global.html b/_layouts/global.html
index bcd83ad..778a841 100644
--- a/_layouts/global.html
+++ b/_layouts/global.html
@@ -121,7 +121,7 @@
Documentation
- Latest Release (Spark
2.1.1)
+ Latest Release (Spark
2.2.0)
Older Versions and
Other Resources
Frequently Asked
Questions
http://git-wip-us.apache.org/repos/asf/spark-website/blob/76d1abf1/js/downloads.js
--
diff --git a/js/downloads.js b/js/downloads.js
index d308389..d2b43a7 100644
--- a/js/downloads.js
+++ b/js/downloads.js
@@ -28,6 +28,7 @@ var packagesV8 = [hadoop2p7, hadoop2p6, hadoopFree, sources]
//addRelease("2.2.0", new Date("x/x/2017"), packagesV8, true);
+addRelease("2.2.0", new Date("07/11/2017"), packagesV8, true);
addRelease("2.1.1", new Date("05/02/2017"), packagesV7, true);
addRelease("2.1.0", new Date("12/28/2016"), packagesV7, true);
addRelease("2.0.2", new Date("11/14/2016"), packagesV7, true);
http://git-wip-us.apache.org/repos/asf/spark-website/blob/76d1abf1/news/_posts/2017-07-11-spark-2-2-0-released.md
--
diff --git a/news/_posts/2017-07-11-spark-2-2-0-released.md
b/news/_posts/2017-07-11-spark-2-2-0-released.md
new file mode 100644
index 000..6cb11f1
--- /dev/null
+++ b/news/_posts/2017-07-11-spark-2-2-0-released.md
@@ -0,0 +1,14 @@
+---
+layout: post
+title: Spark 2.2.0 released
+categories:
+- News
+tags: []
+status: publish
+type: post
+published: true
+meta:
+ _edit_last: '4'
+ _wpas_done_all: '1'
+---
+We are happy to announce the availability of Spark 2.2.0! Visit the release notes to read about the new features, or download the release today.
http://git-wip-us.apache.org/repos/asf/spark-website/blob/76d1abf1/releases/_posts/2017-07-11-spark-release-2-2-0.md
--
diff --git a/releases/_posts/2017-07-11-spark-release-2-2-0.md
b/releases/_posts/2017-07-11-spark-release-2-2-0.md
new file mode 100644
index 000..8027d8a
--- /dev/null
+++ b/releases/_posts/2017-07-11-spark-release-2-2-0.md
@@ -0,0 +1,151 @@
+---
+layout: post
+title: Spark Release 2.2.0
+categories: []
+tags: []
+status: publish
+type: post
+published: true
+meta:
+ _edit_last: '4'
+ _wpas_done_all: '1'
+---
+
+
+Apache Spark 2.2.0 is the third release on the 2.x line. This release removes
the experimental tag from Structured Streaming. In addition, this release
focuses more on usability, stability, and polish, resolving over 1100 tickets.
+
+
+To download Apache Spark 2.2.0, visit the downloads page. You can consult JIRA
for the [detailed
changes](https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12315420&version=12338275).
We have curated a list of high level changes here, grouped by major modules.
+
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+
+### Core and Spark SQL
+
+ - **API updates**
+ - SPARK-19107: Support creating hive table with DataFrameWriter and Catalog
+ - SPARK-13721: Add support for LATERAL VIEW OUTER explode()
+ - SPARK-18885: Unify CREATE TABLE syntax for data source and hive serde
tables
+ - SPARK-16475: Added Broadcast Hints BROADCAST, BROADCASTJOIN, and MAPJOIN,
for SQL Queries
+ - SPARK-18350: Support session local timezone
+ - SPARK-19261: Support ALTER TABLE table_name ADD COLUMNS
+ - SPARK-20420: Add events to the external catalog
+ - SPARK-18127: Add hooks and extension points to Spark
+ - SPARK-20576: Support generic hint function in Dataset/DataFrame
+ - SPARK-17203: Data source options should always be case insensitive
+ - SPARK-19139: AES-ba
http://git-wip-us.apache.org/repos/asf/spark-website/blob/f7ec1155/site/docs/2.2.0/api/java/org/apache/spark/Accumulable.html
--
diff --git a/site/docs/2.2.0/api/java/org/apache/spark/Accumulable.html
b/site/docs/2.2.0/api/java/org/apache/spark/Accumulable.html
new file mode 100644
index 000..2bd3cfa
--- /dev/null
+++ b/site/docs/2.2.0/api/java/org/apache/spark/Accumulable.html
@@ -0,0 +1,489 @@
+
+
+
+
+Accumulable (Spark 2.2.0 JavaDoc)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.spark
+Class Accumulable
+
+
+
+Object
+
+
+org.apache.spark.Accumulable
+
+
+
+
+
+
+
+All Implemented Interfaces:
+java.io.Serializable
+
+
+Direct Known Subclasses:
+Accumulator
+
+
+Deprecated.
+use AccumulatorV2. Since
2.0.0.
+
+
+public class Accumulable
+extends Object
+implements java.io.Serializable
+A data type that can be accumulated, i.e. has a commutative
and associative "add" operation,
+ but where the result type, R, may be different from the element
type being added, T.
+
+ You must define how to add data, and how to merge two of these together. For
some data types,
+ such as a counter, these might be the same operation. In that case, you can
use the simpler
+ Accumulator. They won't always be the same,
though -- e.g., imagine you are
+ accumulating a set. You will add items to the set, and you will union two
sets together.
+
+ Operations are not thread-safe.
+
+ param: id ID of this accumulator; for internal use only.
+ param: initialValue initial value of accumulator
+ param: param helper object defining how to add elements of type
R and T
+ param: name human-readable name for use in Spark's web UI
+ param: countFailedValues whether to accumulate values from failed tasks.
This is set to true
+ for system and time metrics like serialization time
or bytes spilled,
+ and false for things with absolute values like
number of input rows.
+ This should be used for internal metrics only.
+
+See Also:
+Serialized
Form
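Since the class above is deprecated, a hedged sketch of the same "accumulate on executors, read on the driver" pattern through the Python API, which exposes SparkContext.accumulator for the simple counter-like case described here (the counting logic is illustrative):

```python
# Hedged sketch of the accumulator pattern in PySpark.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

bad_records = sc.accumulator(0)          # added to on executors, read on the driver

def check(x):
    if x % 3 == 0:                       # illustrative "bad record" condition
        bad_records.add(1)

sc.parallelize(range(10)).foreach(check)
print(bad_records.value)                 # value is only reliable on the driver
```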
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+Constructors
+
+Constructor and Description
+
+
+Accumulable(R initialValue,
+ AccumulableParam param)
+Deprecated.
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+All Methods Instance Methods Concrete Methods Deprecated Methods
+
+Modifier and Type
+Method and Description
+
+
+void
+add(T term)
+Deprecated.
+Add more data to this accumulator / accumulable
+
+
+
+long
+id()
+Deprecated.
+
+
+
+R
+localValue()
+Deprecated.
+Get the current value of this accumulator from within a
task.
+
+
+
+void
+merge(R term)
+Deprecated.
+Merge two accumulable objects together
+
+
+
+scala.Option
+name()
+Deprecated.
+
+
+
+void
+setValue(R newValue)
+Deprecated.
+Set the accumulator's value.
+
+
+
+String
+toString()
+Deprecated.
+
+
+
+R
+value()
+Deprecated.
+Access the accumulator's current value; only allowed on
driver.
+
+
+
+R
+zero()
+Deprecated.
+
+
+
+
+
+
+
+Methods inherited from class Object
+equals, getClass, hashCode, notify, notifyAll, wait, wait,
wait
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+
+
+
+Accumulable
+public Accumulable(R initialValue,
+ AccumulableParam param)
+Deprecated.
+
+
+
+
+
+
+
+
+
+Method Detail
+
+
+
+
+
+id
+public long id()
+Deprecated.
+
+
+
+
+
+
+
+name
+public scala.Option name()
+Deprecated.
+
+
+
+
+
+
+
+zero
+public R zero()
+Deprecated.
+
+
+
+
+
+
+
+
+
+add
+public void add(T term)
+Deprecated.
+Add more data to this accumulator / accumulable
+
+Parameters:
+term - the data to add
+
+
+
+
+
+
+
+
+
+
+merge
+public void merge(R term)
+Deprecated.
+Merge two accumulable objects together
+
+ Normally, a user will not want to use this version, but will instead ca