You can download Spark and compile it against your existing Hadoop version.

Here's a quick start on the available cluster manager types:
https://spark.apache.org/docs/latest/cluster-overview.html#cluster-manager-types

You can also read a bit here (note that the version it covers is quite old):
http://docs.sigmoidanalytics.com/index.php/Installing_Spark_andSetting_Up_Your_Cluster
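
For the cluster setup part, here's a minimal sketch you can run to verify the cluster is reachable before wiring in HBase. The master URL (spark://master:7077), the app name and the class name are placeholders, so replace them with your own:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ClusterSmokeTest {

	public static void main(String[] args) {
		// "spark://master:7077" is a placeholder standalone-master URL;
		// plain "local" also works for a single-machine test.
		SparkConf conf = new SparkConf()
				.setMaster("spark://master:7077")
				.setAppName("ClusterSmokeTest");

		JavaSparkContext sc = new JavaSparkContext(conf);

		// Trivial job to confirm that the executors are up and reachable.
		JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
		System.out.println("Count => " + nums.count());

		sc.stop();
	}
}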

Attached is a piece of code (using the Spark Java API) that connects to HBase.



Thanks
Best Regards


On Thu, Aug 7, 2014 at 1:48 PM, Deepa Jayaveer <deepa.jayav...@tcs.com>
wrote:

> Hi
> I read your white paper about " ". We want to do a proof of concept on
> Spark with HBase. Not many documents are available on setting up a Spark
> cluster in a Hadoop 2 environment. If you have any, could you please give
> us some reference URLs? Also, a sample program to connect to HBase using
> the Spark Java API would help.
>
> Thanks
> Deepa
>
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.rdd.NewHadoopRDD;

import com.google.common.collect.Lists;

import scala.Tuple2;
public class SparkHBaseMain {

	
	@SuppressWarnings("deprecation")
	public static void main(String[] arg){
		
		try{
			
			// Jars shipped to the executors; adjust these paths to your own Spark and HBase installation.
			List<String> jars = Lists.newArrayList("/home/akhld/Desktop/tools/spark-9/jars/spark-assembly-0.9.0-incubating-hadoop2.3.0-mr1-cdh5.0.0.jar",
					"/home/akhld/Downloads/sparkhbasecode/hbase-server-0.96.0-hadoop2.jar",
					"/home/akhld/Downloads/sparkhbasecode/hbase-protocol-0.96.0-hadoop2.jar",
					"/home/akhld/Downloads/sparkhbasecode/hbase-hadoop2-compat-0.96.0-hadoop2.jar",
					"/home/akhld/Downloads/sparkhbasecode/hbase-common-0.96.0-hadoop2.jar",
					"/home/akhld/Downloads/sparkhbasecode/hbase-client-0.96.0-hadoop2.jar",
					"/home/akhld/Downloads/sparkhbasecode/htrace-core-2.02.jar");

			// Driver-side Spark configuration; "local" runs everything in a single JVM for testing.
			SparkConf spconf = new SparkConf();
			spconf.setMaster("local");
			spconf.setAppName("SparkHBase");
			spconf.setSparkHome("/home/akhld/Desktop/tools/spark-9");
			spconf.setJars(jars.toArray(new String[jars.size()]));
			spconf.set("spark.executor.memory", "1g");

			final JavaSparkContext sc = new JavaSparkContext(spconf);
						
			// HBase configuration: load hbase-site.xml from disk and tell TableInputFormat which table to scan.
			Configuration conf = HBaseConfiguration.create();
			conf.addResource(new Path("/home/akhld/Downloads/sparkhbasecode/hbase-site.xml"));
			conf.set(TableInputFormat.INPUT_TABLE, "blogposts");
			
						
			// Build an RDD over the HBase table via TableInputFormat; each record is a (row key, Result) pair.
			NewHadoopRDD<ImmutableBytesWritable, Result> rdd =
					new NewHadoopRDD<ImmutableBytesWritable, Result>(JavaSparkContext.toSparkContext(sc),
							TableInputFormat.class, ImmutableBytesWritable.class, Result.class, conf);
			
			JavaRDD<Tuple2<ImmutableBytesWritable, Result>> jrdd = rdd.toJavaRDD();
						
			// Extract the post:title values from every row, then force evaluation with count().
			ForEachFunction f = new ForEachFunction();
			JavaRDD<Iterator<String>> retrdd = jrdd.map(f);
			System.out.println("Count => " + retrdd.count());
			
		}catch(Exception e){
			
			e.printStackTrace();
			System.out.println("Craaaashed : " + e);
			
		}
		
	}
	
	@SuppressWarnings("serial")
    private static class ForEachFunction extends Function<Tuple2<ImmutableBytesWritable, Result>, Iterator<String>>{
           	public Iterator<String> call(Tuple2<ImmutableBytesWritable, Result> test) {
           		Result tmp = (Result) test._2;
				List<KeyValue> kvl = tmp.getColumn("post".getBytes(), "title".getBytes());
				for(KeyValue kl:kvl){
					String sb = new String(kl.getValue());
					System.out.println("Value :" + sb);
				}
           		return null;
            }

     }


}
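
If you want the title values back in the driver instead of just printed on the executors, you can map each Result straight to a String and collect it. A small sketch that plugs into the main method above (same "blogposts" table and post:title column as in the attached code; note that collect() pulls everything to the driver, so only use it for small result sets):

			// Map each (row key, Result) pair directly to the post:title value as a String.
			JavaRDD<String> titles = jrdd.map(
					new Function<Tuple2<ImmutableBytesWritable, Result>, String>() {
						public String call(Tuple2<ImmutableBytesWritable, Result> row) {
							// getValue returns the latest cell for the given family/qualifier, or null if absent.
							byte[] value = row._2.getValue("post".getBytes(), "title".getBytes());
							return value == null ? "" : new String(value);
						}
					});

			for (String title : titles.collect()) {
				System.out.println("Title => " + title);
			}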