[GitHub] spark pull request: [SPARK-14070] [SQL] Use ORC data source for SQ...

tejasapatil Wed, 23 Mar 2016 14:28:22 -0700

Github user tejasapatil commented on a diff in the pull request:

    https://github.com/apache/spark/pull/11891#discussion_r57240256
  
    --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala ---
    @@ -597,6 +619,107 @@ private[hive] class HiveMetastoreCatalog(val client: HiveClient, hive: HiveConte
         }
       }
     
    +  private def convertToOrcRelation(metastoreRelation: MetastoreRelation): 
LogicalRelation = {
    +    val metastoreSchema = 
StructType.fromAttributes(metastoreRelation.output)
    +
    +    val tableIdentifier =
    +      QualifiedTableName(metastoreRelation.databaseName, 
metastoreRelation.tableName)
    +
    +    val orcOptions = Map[String, String]()
    +
    +    val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
    +      val partitionSchema = 
StructType.fromAttributes(metastoreRelation.partitionKeys)
    +      val partitionColumnDataTypes = partitionSchema.map(_.dataType)
    +      // We're converting the entire table into OrcRelation, so predicates 
to Hive metastore
    +      // are empty.
    +      val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
    +        val location = p.getLocation
    +        val values = 
InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map {
    +          case (rawValue, dataType) => Cast(Literal(rawValue), 
dataType).eval(null)
    +        })
    +        PartitionDirectory(values, location)
    +      }
    +      val partitionSpec = PartitionSpec(partitionSchema, partitions)
    +
    +      val cached = getCached(
    +        tableIdentifier,
    +        metastoreRelation,
    +        metastoreSchema,
    +        classOf[OrcDefaultSource],
    +        Some(partitionSpec))
    +
    +      val orcRelation = cached.getOrElse {
    +        val paths = new 
Path(metastoreRelation.table.storage.locationUri.get) :: Nil
    +        val fileCatalog = new MetaStoreFileCatalog(hive, paths, 
partitionSpec)
    +        val format = new OrcDefaultSource()
    +        val inferredSchema = format.inferSchema(hive, orcOptions, 
fileCatalog.allFiles()).get
    +
    +        val relation = HadoopFsRelation(
    +          sqlContext = hive,
    +          location = fileCatalog,
    +          partitionSchema = partitionSchema,
    +          dataSchema = inferredSchema,
    +          bucketSpec = None, // We don't support hive bucketed tables, 
only ones we write out.
    +          fileFormat = new OrcDefaultSource(),
    +          options = orcOptions)
    +
    +        val created = LogicalRelation(relation)
    +        cachedDataSourceTables.put(tableIdentifier, created)
    +        created
    +      }
    +
    +      orcRelation
    +    } else {
    +      val paths = 
Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
    +
    +      val cached = getCached(tableIdentifier,
    +        metastoreRelation,
    +        metastoreSchema,
    +        classOf[OrcDefaultSource],
    +        None)
    +      val orcRelation = cached.getOrElse {
    +        val created =
    +          LogicalRelation(
    +            DataSource(
    +              sqlContext = hive,
    +              paths = paths,
    +              userSpecifiedSchema = Some(metastoreRelation.schema),
    +              options = orcOptions,
    +              className = "orc").resolveRelation())
    +
    +        cachedDataSourceTables.put(tableIdentifier, created)
    +        created
    +      }
    +
    +      orcRelation
    +    }
    +    result.copy(expectedOutputAttributes = Some(metastoreRelation.output))
    +  }
    +
    +  /**
    +    * When scanning Metastore ORC tables, convert them to ORC data source 
relations
    +    * for better performance.
    +    */
    +  object OrcConversions extends Rule[LogicalPlan] {
    +    private def isConvertMetastoreOrc(relation: MetastoreRelation): 
Boolean = {
    --- End diff --
    
    done



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request: [SPARK-14070] [SQL] Use ORC data source for SQ...

Reply via email to