[GitHub] spark pull request #14119: [SPARK-16303][DOCS][EXAMPLES][WIP] Updated SQL pr...

liancheng Mon, 11 Jul 2016 07:07:02 -0700

Github user liancheng commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14119#discussion_r70263602
  
    --- Diff: examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSqlExample.java ---
    @@ -0,0 +1,280 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.spark.examples.sql;
    +
    +// $example on:programmatic_schema$
    +import java.util.ArrayList;
    +import java.util.List;
    +// $example off:programmatic_schema$
    +// $example on:create_ds$
    +import java.util.Arrays;
    +import java.util.Collections;
    +import java.io.Serializable;
    +// $example off:create_ds$
    +
    +// $example on:schema_inferring$
    +// $example on:programmatic_schema$
    +import org.apache.spark.api.java.JavaRDD;
    +import org.apache.spark.api.java.function.Function;
    +// $example off:programmatic_schema$
    +// $example on:create_ds$
    +import org.apache.spark.api.java.function.MapFunction;
    +// $example on:create_df$
    +// $example on:run_sql$
    +// $example on:programmatic_schema$
    +import org.apache.spark.sql.Dataset;
    +import org.apache.spark.sql.Row;
    +// $example off:programmatic_schema$
    +// $example off:create_df$
    +// $example off:run_sql$
    +import org.apache.spark.sql.Encoder;
    +import org.apache.spark.sql.Encoders;
    +// $example off:create_ds$
    +// $example off:schema_inferring$
    +import org.apache.spark.sql.RowFactory;
    +// $example on:init_session$
    +import org.apache.spark.sql.SparkSession;
    +// $example off:init_session$
    +// $example on:programmatic_schema$
    +import org.apache.spark.sql.types.DataTypes;
    +import org.apache.spark.sql.types.StructField;
    +import org.apache.spark.sql.types.StructType;
    +// $example off:programmatic_schema$
    +
    +public class JavaSparkSqlExample {
    +  // $example on:create_ds$
    +  public static class Person implements Serializable {
    +    private String name;
    +    private int age;
    +
    +    public String getName() {
    +      return name;
    +    }
    +
    +    public void setName(String name) {
    +      this.name = name;
    +    }
    +
    +    public int getAge() {
    +      return age;
    +    }
    +
    +    public void setAge(int age) {
    +      this.age = age;
    +    }
    +  }
    +  // $example off:create_ds$
    +
    +  public static void main(String[] args) {
    +    // $example on:init_session$
    +    SparkSession spark = SparkSession
    +        .builder()
    +        .appName("Java Spark SQL Example")
    +        .config("spark.some.config.option", "some-value")
    +        .getOrCreate();
    +    // $example off:init_session$
    +
    +    runBasicDataFrameExample(spark);
    +    runDatasetCreationExample(spark);
    +    runInferSchemaExample(spark);
    +    runProgrammaticSchemaExample(spark);
    +
    +    spark.stop();
    +  }
    +
    +  private static void runBasicDataFrameExample(SparkSession spark) {
    +    // $example on:create_df$
    +    Dataset<Row> df = spark.read().json("examples/src/main/resources/people.json");
    +
    +    // Displays the content of the DataFrame to stdout
    +    df.show();
    +    // age  name
    +    // null Michael
    +    // 30   Andy
    +    // 19   Justin
    +    // $example off:create_df$
    +
    +    // $example on:untyped_ops$
    +    // Print the schema in a tree format
    +    df.printSchema();
    +    // root
    +    // |-- age: long (nullable = true)
    +    // |-- name: string (nullable = true)
    +
    +    // Select only the "name" column
    +    df.select("name").show();
    +    // name
    +    // Michael
    +    // Andy
    +    // Justin
    +
    +    // Select everybody, but increment the age by 1
    +    df.select(df.col("name"), df.col("age").plus(1)).show();
    +    // name    (age + 1)
    +    // Michael null
    +    // Andy    31
    +    // Justin  20
    +
    +    // Select people older than 21
    +    df.filter(df.col("age").gt(21)).show();
    +    // age name
    +    // 30  Andy
    +
    +    // Count people by age
    +    df.groupBy("age").count().show();
    +    // age  count
    +    // null 1
    +    // 19   1
    +    // 30   1
    +    // $example off:untyped_ops$
    +
    +    // $example on:run_sql$
    +    // Register the DataFrame as a SQL temporary view
    +    df.createOrReplaceTempView("people");
    +
    +    Dataset<Row> sqlDF = spark.sql("SELECT * FROM people");
    +    sqlDF.show();
    +    // $example off:run_sql$
    +  }
    +
    +  private static void runDatasetCreationExample(SparkSession spark) {
    +    // $example on:create_ds$
    +    // Create an instance of a Bean class
    +    Person person = new Person();
    +    person.setName("Andy");
    +    person.setAge(32);
    +
    +    // Encoders are created for Java beans
    +    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    +    Dataset<Person> javaBeanDS = spark.createDataset(
    +        Collections.singletonList(person),
    +        personEncoder
    +    );
    +    javaBeanDS.show();
    +
    +    // Encoders for most common types are provided in class Encoders
    +    Encoder<Integer> integerEncoder = Encoders.INT();
    +    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    +    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    +      @Override
    +      public Integer call(Integer value) throws Exception {
    +        return value + 1;
    +      }
    +    }, integerEncoder);
    +    transformedDS.collect(); // Returns [2, 3, 4]
    +
    +    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    +    String path = "examples/src/main/resources/people.json";
    +    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    +    peopleDS.show();
    +    // $example off:create_ds$
    +  }
    +
    +  private static void runInferSchemaExample(SparkSession spark) {
    +    // $example on:schema_inferring$
    +    // Create an RDD of Person objects from a text file
    +    JavaRDD<Person> peopleRDD = spark.read()
    +        .textFile("examples/src/main/resources/people.txt")
    +        .javaRDD()
    +        .map(new Function<String, Person>() {
    +          @Override
    +          public Person call(String line) throws Exception {
    +            String[] parts = line.split(",");
    +            Person person = new Person();
    +            person.setName(parts[0]);
    +            person.setAge(Integer.parseInt(parts[1].trim()));
    +            return person;
    +          }
    +        });
    +
    +    // Apply a schema to an RDD of JavaBeans to get a DataFrame
    +    Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
    +    // Register the DataFrame as a temporary view
    +    peopleDF.createOrReplaceTempView("people");
    +
    +    // SQL statements can be run by using the sql methods provided by spark
    +    Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
    +
    +    // The columns of a row in the result can be accessed by field index
    +    Encoder<String> stringEncoder = Encoders.STRING();
    +    Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(new MapFunction<Row, String>() {
    +      @Override
    +      public String call(Row row) throws Exception {
    +        return "Name: " + row.getString(0);
    +      }
    +    }, stringEncoder);
    +    teenagerNamesByIndexDF.show();
    +
    +    // or by field name
    +    Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(new MapFunction<Row, String>() {
    +      @Override
    +      public String call(Row row) throws Exception {
    +        return "Name: " + row.<String>getAs("name");
    +      }
    +    }, stringEncoder);
    +    teenagerNamesByFieldDF.show();
    +    // $example off:schema_inferring$
    +  }
    +
    +  private static void runProgrammaticSchemaExample(SparkSession spark) {
    +    // $example on:programmatic_schema$
    +    // Create an RDD
    +    JavaRDD<String> peopleRDD = spark.sparkContext()
    +        .textFile("examples/src/main/resources/people.txt", 1)
    +        .toJavaRDD();
    --- End diff --
    
    Nit: Use 2-space indentation here.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #14119: [SPARK-16303][DOCS][EXAMPLES][WIP] Updated SQL pr...

Reply via email to