Here is the hive-site.xml <?xml version="1.0"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration> <!-- Hive Execution Parameters --> <property> <name>hive.metastore.local</name> <value>false</value> <description>controls whether to connect to remote metastore server or open a new metastore server in Hive Client JVM</description> </property> <property> <name>hive.metastore.uris</name> <value>thrift://*****************:2513</value> <description>Remote location of the metastore server</description> </property> <property> <name>hive.metastore.warehouse.dir</name> <value>/projects/hcatalog-warehouse</value> <description>location of default database for the warehouse</description> </property> <property> <name>hive.metastore.sasl.enabled</name> <value>true</value> <description>If true, the metastore thrift interface will be secured with SASL. Clients must authenticate with Kerberos.</description> </property> <property> <name>hive.metastore.kerberos.principal</name> <value>hcat/*****.com@****.COM</value> <description>The service principal for the metastore thrift server. The special string _HOST will be replaced automatically with the correct host name.</description> </property> <property> <name>hive.metastore.client.socket.timeout</name> <value>200</value> <description>MetaStore Client socket timeout in seconds</description> </property> <property> <name>hive.exec.mode.local.auto</name> <value>false</value> <description>Let hive determine whether to run in local mode automatically</description> </property> <property> <name>hive.hadoop.supports.splittable.combineinputformat</name> <value>true</value> <description>Hive internal, should be set to true as MAPREDUCE-1597 is present in Hadoop</description> </property> <property> <name>hive.exec.scratchdir</name> <value>/tmp</value> <description>HDFS Scratch space for Hive jobs</description> </property> <property> <name>hive.querylog.location</name> <value>${user.home}/hivelogs</value> <description>Local Directory where structured hive query logs are created. One file per session is created in this directory. 
If this variable is set to an empty string, the structured log will not be created.</description> </property> <property> <name>mapreduce.job.queuename</name> <value>default</value> <description>Set a default queue name for execution of the Hive queries</description> </property> <property> <name>hadoop.clientside.fs.operations</name> <value>true</value> <description>FS operations related to DDL operations are owned by Hive client</description> </property> <property> <name>hive.exec.compress.output</name> <value>true</value> <description> This controls whether the final outputs of a query (to a local/hdfs file or a hive table) is compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* </description> </property> <property> <name>hive.exec.compress.intermediate</name> <value>true</value> <description> This controls whether intermediate files produced by hive between multiple map-reduce jobs are compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* </description> </property> <property> <name>hive.auto.convert.join</name> <value>false</value> <description> Whether Hive enables the optimization of converting a common join into a mapjoin based on the input file size </description> </property> <property> <name>hive.optimize.partition.prune.metadata</name> <value>true</value> <description>This controls whether metadata optimizations are applied during partition pruning</description> </property> <property> <name>hive.mapred.mode</name> <value>nonstrict</value> <description>The mode in which the hive operations are being performed. In strict mode, some risky queries are not allowed to run</description> </property> <property> <name>io.seqfile.compression.type</name> <value>BLOCK</value> <description>Determines how the compression is performed. 
Can take NONE, RECORD or BLOCK</description> </property> <property> <name>hive.input.format</name> <value>org.apache.hadoop.hive.ql.io.CombineHiveInputFormat</value> <description>Determines the input format. Can take org.apache.hadoop.hive.ql.io.HiveInputFormat or org.apache.hadoop.hive.ql.io.CombineHiveInputFormat (default)</description> </property> <property> <name>mapreduce.input.fileinputformat.split.minsize</name> <value>134217728</value> <description>Size of the minimum split for CombineFileInputFormat (128MB recommended)</description> </property> <property> <name>mapreduce.input.fileinputformat.split.maxsize</name> <value>1073741824</value> <description>Size of maximum split for CombineFileInputFormat (1GB recommended)</description> </property> <property> <name>mapreduce.input.fileinputformat.split.minsize.per.rack</name> <value>134217728</value> <description>Size of minimum split size per rack (128MB recommended)</description> </property> <property> <name>mapreduce.input.fileinputformat.split.minsize.per.node</name> <value>134217728</value> <description>Size of minimum split size per node (128MB recommended)</description> </property> <property> <name>hive.exec.orc.default.stripe.size</name> <value>33554432</value> <description>Size in bytes of a stripe in the ORC file (half the block size recommended)</description> </property> <property> <name>hive.exec.orc.default.compress</name> <value>ZLIB</value> <description>The compression method used in an ORC stripe (NONE, SNAPPY, LZO, ZLIB (default))</description> </property> <property> <name>hive.exec.orc.default.row.index.stride</name> <value>10000</value> <description>Size in number of rows of the index stride (10000 recommended)</description> </property> <property> <name>hive.exec.orc.default.buffer.size</name> <value>262144</value> <description>Size in bytes of the ORC buffer (256K recommended)</description> </property> <property> <name>hive.exec.orc.default.block.padding</name> <value>true</value> 
<description>Enable block padding (true recommended)</description> </property> <property> <name>hive.exec.show.job.failure.debug.info</name> <value>false</value> <description>If a job fails, whether to provide a link in the CLI to the task with the most failures, along with debugging hints if applicable.</description> </property> <property> <name>hive.ppd.remove.duplicatefilters</name> <value>true</value> <description>Disable HIVE-1538 as it caused regression HIVE-2791 and HIVE-2344. HIVE-2791 is not available in Hive 0.8 and hence should be disabled</description> </property> <property> <name>hive.security.authorization.enabled</name> <value>false</value> <description>Perform authorization checks on the client</description> </property> <!-- <property> <name>hive.security.authorization.manager</name> <value>org.apache.hcatalog.security.HdfsAuthorizationProvider</value> <description>Authorization provider that performs authorization checks on the client</description> </property> --> <property> <name>hive.metastore.schema.verification</name> <value>false</value> </property> <property> <name>hcatalog.hive.client.cache.expiry.time</name> <value>120</value> <description>Time in seconds after a hive client is created that it should be removed from the client cache</description> </property> <property> <name>hive.limit.query.max.table.partition</name> <value>125000</value> <description>The maximum number of partitions a table can scan</description> </property> <property> <name>mapred.dfsclient.parallelism.max</name> <value>5</value> <description>Max threads to perform getInputSummary during job submission</description> </property> <property> <name>hive.stats.autogather</name> <value>false</value> <description>A flag to gather statistics automatically during the INSERT OVERWRITE command.</description> </property> </configuration> On Tue, Oct 7, 2014 at 6:16 PM, Cheng Lian <lian.cs....@gmail.com> wrote: > So it seems that the classpath issue has been resolved. 
The instantiation > failure should be related to your hive-site.xml. Would you mind to create a > public gist for your hive-site.xml? > > > On 10/8/14 4:34 AM, Li HM wrote: >> >> Thanks Cheng. >> >> Here is the error message after a fresh build. >> >> $ mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.5.0 -Phive -DskipTests >> clean package >> [INFO] >> ------------------------------------------------------------------------ >> [INFO] Reactor Summary: >> [INFO] >> [INFO] Spark Project Parent POM .......................... SUCCESS >> [19.117s] >> [INFO] Spark Project Core ................................ SUCCESS >> [11:24.009s] >> [INFO] Spark Project Bagel ............................... SUCCESS >> [1:09.498s] >> [INFO] Spark Project GraphX .............................. SUCCESS >> [3:41.113s] >> [INFO] Spark Project Streaming ........................... SUCCESS >> [4:25.378s] >> [INFO] Spark Project ML Library .......................... SUCCESS >> [5:43.323s] >> [INFO] Spark Project Tools ............................... SUCCESS >> [44.647s] >> [INFO] Spark Project Catalyst ............................ SUCCESS >> [4:48.658s] >> [INFO] Spark Project SQL ................................. SUCCESS >> [4:56.966s] >> [INFO] Spark Project Hive ................................ SUCCESS >> [3:45.269s] >> [INFO] Spark Project REPL ................................ SUCCESS >> [2:11.617s] >> [INFO] Spark Project YARN Parent POM ..................... SUCCESS >> [6.723s] >> [INFO] Spark Project YARN Stable API ..................... SUCCESS >> [2:20.860s] >> [INFO] Spark Project Hive Thrift Server .................. SUCCESS >> [1:15.231s] >> [INFO] Spark Project Assembly ............................ SUCCESS >> [1:41.245s] >> [INFO] Spark Project External Twitter .................... SUCCESS >> [50.839s] >> [INFO] Spark Project External Kafka ...................... SUCCESS >> [1:15.888s] >> [INFO] Spark Project External Flume Sink ................. 
SUCCESS >> [57.807s] >> [INFO] Spark Project External Flume ...................... SUCCESS >> [1:26.589s] >> [INFO] Spark Project External ZeroMQ ..................... SUCCESS >> [54.361s] >> [INFO] Spark Project External MQTT ....................... SUCCESS >> [53.901s] >> [INFO] Spark Project Examples ............................ SUCCESS >> [2:39.407s] >> [INFO] >> ------------------------------------------------------------------------ >> [INFO] BUILD SUCCESS >> [INFO] >> ------------------------------------------------------------------------ >> >> spark-sql> use mydb; >> FAILED: Execution Error, return code 1 from >> org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: >> Unable to instantiate >> org.apache.hadoop.hive.metastore.HiveMetaStoreClient >> org.apache.spark.sql.execution.QueryExecutionException: FAILED: >> Execution Error, return code 1 from >> org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: >> Unable to instantiate >> org.apache.hadoop.hive.metastore.HiveMetaStoreClient >> at org.apache.spark.sql.hive.HiveContext.runHive(HiveContext.scala:302) >> at org.apache.spark.sql.hive.HiveContext.runSqlHive(HiveContext.scala:272) >> at >> org.apache.spark.sql.hive.execution.NativeCommand.sideEffectResult$lzycompute(NativeCommand.scala:35) >> at >> org.apache.spark.sql.hive.execution.NativeCommand.sideEffectResult(NativeCommand.scala:35) >> at >> org.apache.spark.sql.hive.execution.NativeCommand.execute(NativeCommand.scala:38) >> at >> org.apache.spark.sql.hive.HiveContext$QueryExecution.toRdd$lzycompute(HiveContext.scala:360) >> at >> org.apache.spark.sql.hive.HiveContext$QueryExecution.toRdd(HiveContext.scala:360) >> at org.apache.spark.sql.SchemaRDDLike$class.$init$(SchemaRDDLike.scala:58) >> at org.apache.spark.sql.SchemaRDD.<init>(SchemaRDD.scala:103) >> at org.apache.spark.sql.hive.HiveContext.sql(HiveContext.scala:98) >> at >> org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:58) >> at >> 
org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:291) >> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:413) >> at >> org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:226) >> at >> org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala) >> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) >> at >> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) >> at >> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) >> at java.lang.reflect.Method.invoke(Method.java:601) >> at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:328) >> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75) >> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) >> >> On Tue, Oct 7, 2014 at 6:19 AM, Cheng Lian <lian.cs....@gmail.com> wrote: >>> >>> The build command should be correct. What exact error did you encounter >>> when >>> trying Spark 1.1 + Hive 0.12 + Hadoop 2.5.0? >>> >>> > --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org