[5/5] bigtop git commit: Add BPS Spark driver for new data generator. Re-organize BPS into MapReduce and Spark versions.

jay Wed, 19 Nov 2014 15:44:31 -0800

Add BPS Spark driver for new data generator. Re-organize BPS into MapReduce and 
Spark versions.


[BIGTOP-1366][BigPetStore] Fix bigpetstore-mapreduce/build.gradle pom.xml path

[BIGTOP-1366][BigPetStore] Refactor SparkDriver into more easily tested 
functions

[BIGTOP-1366][BigPetStore] Add unit test for Spark Driver

[BIGTOP-1366][BigPetStore] Update BPS Spark README to mention tests

Signed-off-by: jayunit100 <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/6ec6cebf
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/6ec6cebf
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/6ec6cebf

Branch: refs/heads/master
Commit: 6ec6cebfc56f05520e477768441eaecdc505ede8
Parents: 801bade
Author: RJ Nowling <[email protected]>
Authored: Fri Nov 14 14:42:13 2014 -0500
Committer: jayunit100 <[email protected]>
Committed: Wed Nov 19 18:43:04 2014 -0500

----------------------------------------------------------------------
 bigtop-bigpetstore/BPS_analytics.pig            |  79 ---
 bigtop-bigpetstore/README.md                    | 214 +------
 bigtop-bigpetstore/arch.dot                     |  41 --
 .../bigpetstore-mapreduce/BPS_analytics.pig     |  79 +++
 .../bigpetstore-mapreduce/README.md             | 201 +++++++
 .../bigpetstore-mapreduce/arch.dot              |  41 ++
 .../bigpetstore-mapreduce/build.gradle          | 292 ++++++++++
 .../bigpetstore-mapreduce/pom.xml               | 584 +++++++++++++++++++
 .../bigpetstore-mapreduce/settings.gradle       |  18 +
 .../bigtop/bigpetstore/BigPetStoreMahoutIT.java |  73 +++
 .../bigtop/bigpetstore/BigPetStorePigIT.java    | 100 ++++
 .../org/apache/bigtop/bigpetstore/ITUtils.java  | 168 ++++++
 .../contract/PetStoreStatistics.java            |  34 ++
 .../bigtop/bigpetstore/etl/CrunchETL.java       | 142 +++++
 .../apache/bigtop/bigpetstore/etl/LineItem.java | 112 ++++
 .../bigtop/bigpetstore/etl/PigCSVCleaner.java   | 156 +++++
 .../bigpetstore/generator/BPSGenerator.java     | 108 ++++
 .../generator/CustomerGenerator.scala           |  97 +++
 .../generator/PetStoreTransaction.java          |  32 +
 .../PetStoreTransactionInputSplit.java          |  73 +++
 .../PetStoreTransactionsInputFormat.java        | 139 +++++
 .../bigpetstore/generator/util/Product.java     |  80 +++
 .../bigpetstore/generator/util/ProductType.java |  46 ++
 .../bigpetstore/generator/util/State.java       |  43 ++
 .../bigpetstore/recommend/ItemRecommender.scala | 121 ++++
 .../bigpetstore/util/BigPetStoreConstants.java  |  41 ++
 .../bigtop/bigpetstore/util/DeveloperTools.java |  58 ++
 .../bigpetstore/util/NumericalIdUtils.java      |  48 ++
 .../util/PetStoreParseFunctions.java            |  55 ++
 .../bigtop/bigpetstore/util/StringUtils.java    |  53 ++
 .../bigpetstore/generator/DataForger.scala      | 280 +++++++++
 .../generator/TransactionIteratorFactory.scala  | 106 ++++
 .../bigtop/bigpetstore/docs/TestDocs.java       |  37 ++
 .../generator/TestNumericalIdUtils.java         |  35 ++
 .../TestPetStoreTransactionGeneratorJob.java    | 104 ++++
 .../src/test/resources/log4j.properties         |  47 ++
 .../bigtop/bigpetstore/ScalaTestSample.scala    |  35 ++
 bigtop-bigpetstore/bigpetstore-spark/README.md  |  43 ++
 .../bigpetstore-spark/build.gradle              | 137 +++++
 .../spark/generator/SparkDriver.scala           | 244 ++++++++
 .../spark/generator/SparkDriverSuite.scala      |  60 ++
 bigtop-bigpetstore/build.gradle                 | 292 ----------
 bigtop-bigpetstore/pom.xml                      | 584 -------------------
 bigtop-bigpetstore/settings.gradle              |  18 -
 .../bigtop/bigpetstore/BigPetStoreMahoutIT.java |  73 ---
 .../bigtop/bigpetstore/BigPetStorePigIT.java    | 100 ----
 .../org/apache/bigtop/bigpetstore/ITUtils.java  | 168 ------
 .../contract/PetStoreStatistics.java            |  34 --
 .../bigtop/bigpetstore/etl/CrunchETL.java       | 142 -----
 .../apache/bigtop/bigpetstore/etl/LineItem.java | 112 ----
 .../bigtop/bigpetstore/etl/PigCSVCleaner.java   | 156 -----
 .../bigpetstore/generator/BPSGenerator.java     | 108 ----
 .../generator/CustomerGenerator.scala           |  97 ---
 .../generator/PetStoreTransaction.java          |  32 -
 .../PetStoreTransactionInputSplit.java          |  73 ---
 .../PetStoreTransactionsInputFormat.java        | 139 -----
 .../bigpetstore/generator/util/Product.java     |  80 ---
 .../bigpetstore/generator/util/ProductType.java |  46 --
 .../bigpetstore/generator/util/State.java       |  43 --
 .../bigpetstore/recommend/ItemRecommender.scala | 121 ----
 .../bigpetstore/util/BigPetStoreConstants.java  |  41 --
 .../bigtop/bigpetstore/util/DeveloperTools.java |  58 --
 .../bigpetstore/util/NumericalIdUtils.java      |  48 --
 .../util/PetStoreParseFunctions.java            |  55 --
 .../bigtop/bigpetstore/util/StringUtils.java    |  53 --
 .../bigpetstore/generator/DataForger.scala      | 280 ---------
 .../generator/TransactionIteratorFactory.scala  | 106 ----
 .../bigtop/bigpetstore/docs/TestDocs.java       |  37 --
 .../generator/TestNumericalIdUtils.java         |  35 --
 .../TestPetStoreTransactionGeneratorJob.java    | 104 ----
 .../src/test/resources/log4j.properties         |  47 --
 .../bigtop/bigpetstore/ScalaTestSample.scala    |  35 --
 72 files changed, 4145 insertions(+), 3628 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/BPS_analytics.pig
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/BPS_analytics.pig 
b/bigtop-bigpetstore/BPS_analytics.pig
deleted file mode 100755
index 23e3749..0000000
--- a/bigtop-bigpetstore/BPS_analytics.pig
+++ /dev/null
@@ -1,79 +0,0 @@
-----------------------------------------------------------------------------
--- Licensed to the Apache Software Foundation (ASF) under one or more
--- contributor license agreements.  See the NOTICE file distributed with
--- this work for additional information regarding copyright ownership.
--- The ASF licenses this file to You under the Apache License, Version 2.0
--- (the "License"); you may not use this file except in compliance with
--- the License.  You may obtain a copy of the License at
--- http://www.apache.org/licenses/LICENSE-2.0
--- 
--- Unless required by applicable law or agreed to in writing, software
--- distributed under the License is distributed on an "AS IS" BASIS,
--- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--- See the License for the specific language governing permissions and
--- limitations under the License.
------------------------------------------------------------------------------
-
--- This is the analytics script that BigPetStore uses as an example for 
--- demos of how to do ad-hoc analytics on the cleaned transaction data.
--- It is used in conjunction with the big pet store web app, soon to be 
--- added to apache bigtop (As of 4/12/2014, the
--- corresponding web app to consume this scripts output is 
--- in jayunit100.github.io/bigpetstore).
-
--- invoke with two arguments, the input file , and the output file. -input 
/bps/gen -output /bps/analytics
-
--- FYI...
--- If you run into errors, you can see them in
--- 
./target/failsafe-reports/TEST-org.bigtop.bigpetstore.integration.BigPetStorePigIT.xml
-
--- First , we load data in from a file, as tuples.
--- in pig, relations like tables in a relational database
--- so each relation is just a bunch of tuples.
--- in this case csvdata will be a relation,
--- where each tuple is a single petstore transaction.
-csvdata =
-    LOAD '$input' using PigStorage()
-        AS (
-          dump:chararray,
-          state:chararray,
-          transaction:int,
-          custId:long,
-          fname:chararray,
-          lname:chararray,
-          productId:int,
-          product:chararray,
-          price:float,
-          date:chararray);
-
--- RESULT:
--- (BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 
EST 1969)
--- ...
-
--- Okay! Now lets group our data so we can do some stats.
--- lets create a new relation,
--- where each tuple will contain all transactions for a product in a state.
-
-state_product = group csvdata by ( state, product ) ;
-
--- RESULT
--- ((storeCode_AK,dog-food) , 
{(BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 EST 
1969)}) --
--- ...
-
-
--- Okay now lets make some summary stats so that the boss man can
--- decide which products are hottest in which states.
-
--- Note that for the "groups", we tease out each individual field here for 
formatting with
--- the BigPetStore visualization app.
-summary1 = FOREACH state_product generate STRSPLIT(group.state,'_').$1 as sp, 
group.product, COUNT($1);
-
-
--- Okay, the stats look like this.  Lets clean them up.
--- (storeCode_AK,cat-food)      2530
--- (storeCode_AK,dog-food)      2540
--- (storeCode_AK,fuzzy-collar)     2495
-
-dump summary1;
-
-store summary1 into '$output';

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/README.md
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/README.md b/bigtop-bigpetstore/README.md
index c806d57..6f04e8f 100644
--- a/bigtop-bigpetstore/README.md
+++ b/bigtop-bigpetstore/README.md
@@ -1,201 +1,33 @@
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements. See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to You under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-(See accompanying source code for licensing information)
-
 BigPetStore
 ============
 
-Apache Bigtop/Hadoop Ecosystem Demo
------------------------------------
-This software is created to demonstrate Apache Bigtop for processing
-big data sets.
-
-Architecture
-------------
-The application consists of the following modules
-
-* generator: generates raw data on the dfs
-* recommendations: Apache Mahout demo code for generating recommendations by 
anaylyzing the transaction records. This feature can be tracked at this [`JIRA` 
issue](https://issues.apache.org/jira/browse/BIGTOP-1272)
-* Pig: demo code for processing the data using Apache Pig
-* Hive: demo code for processing the data using Apache Hive. This part is not 
complete yet. We are working on it. You can track it using this [`JIRA` 
issue](https://issues.apache.org/jira/browse/BIGTOP-1270)
-* Crunch: demo code for processing the data using Apache Crunch
-
-Build Instructions
-------------------
-
-You'll need to have version 2.0 of  
[`gradle`](http://www.gradle.org/downloads) installed and set-up correctly in 
order to follow along these instructions.
-We could have used the 
[`gradle-wrapper`](http://www.gradle.org/docs/current/userguide/gradle_wrapper.html)
 to avoid having to install `gradle`, but the `bigtop` project includes all 
`gradle*` directories in `.gitignore`. So, that's not going to work.
-
-### Build the JAR
-
-`gradle clean build` will build the bigpetstore `jar`. The `jar` will be 
located in the `build\libs` directory.
-
-### Run Intergration Tests With
-  * Pig profile: `gradle clean integrationTest -P ITProfile=pig`
-  * Mahout Profile: `gradle clean integrationTest -P ITProfile=mahout`
-  * Crunch profile: Not Implemented Yet
-  * Hive profile: Not implemented yet.
-
-If you don't specify any profile-name, or if you specify an invalid-name for 
the `integrationTest` task, no integration tests will be run.
-
-*Note:* At this stage, only the `Pig` and `Mahout` profiles are working. Will 
continue to update this area as further work is completed.
-
-For Eclipse Users
------------------
-
-1. Run `gradle eclipse` to create an eclipse project.
-2. Import the project into eclipse.
-
-*Note* whenever you modify the dependencies, you will need to run the `gradle 
eclipse` again. Refresh the project after doing so. You'd also need to have the 
`scala` plugin installed. Also, having a `gradle` plugin would be quite useful 
as well, for ex. when you want to update dependencies.
-
-High level summary
-------------------
-
-The bigpetstore project exemplifies the hadoop ecosystem for newcomers, and 
also for benchmarking and
-comparing functional space of tools.
-
-The end goal is to run many different implementations of each phase
-using different tools, thus exemplifying overlap of tools in the hadoop 
ecosystem, and allowing people to benchmark/compare tools
-using a common framework and easily understood use case
-
-
-How it works (To Do)
---------------------
-
-### Phase 1: Generating pet store data:
-
-The first step is to generate a raw data set.  This is done by the 
"GeneratePetStoreTransactionsInputFormat":
-
-The first MapReduce job in the pipeline runs a simple job which takes this 
input format and forwards
-its output.  The result is a list of "transactions".  Each transaction is a 
tuple of the format
-
-  *{state,name,date,price,product}.*
-
-### Phase 2: Processing the data
-
-The next phase of the application processes the data to create basic 
aggregations. For example with both pig and hive these could easily include
-
-- *Number of transactions by state* or
-- *Most valuable customer by state* or
-- *Most popular items by state*
-
-
-### Phase 3: Clustering the states by all fields
-
-  Now, say we want to cluster the states, so as to put different states into 
different buying categories
-  for our marketing team to deal with differently.
-
-### Phase 4: Visualizing the Data in D3.
-
- - try it [on the gh-pages branch](http://jayunit100.github.io/bigpetstore/)
-
-
-Running on a hadoop cluster
----------------------------
-
-*Note:* For running the code using the `hadoop jar` command instead of the 
`gradle` tasks, you will need to set the classpath appropriately. The 
discussion after [this comment][jira-mahout] in JIRA could also be useful apart 
from these instructions.
-
-### Build the fat-jar
-
-We are going to use a fat-jar in order to avoid specifying the entire 
classpath ourselves.
-
-The fat-jar is required when we are running the application on a hadoop 
cluster. The other way would be to specify all the dependencies (including the 
transitive ones) manually while running the hadoop job. Fat-jars make it easier 
to bundle almost all the dependencies inside the distribution jar itself.
-
-```
-gradle clean shadowJar -Pfor-cluster
-```
-
-This command will build the fat-jar with all the dependencies bundled in 
except the hadoop, mahout and pig dependencies, which we'll specify using 
`-libjars` option while running the hadoop job. These dependencies are excluded 
to avoid conflicts with the jars provided by hadoop itself.
-
-The generated jar will be inside the `build/libs` dir, with name like 
`BigPetStore-x.x.x-SNAPSHOT-all.jar`. For the remainig discussion I'll refer to 
this jar by `bps.jar`.
-
-### Get the mahout and pig jars
-
-You'll need both mahout and pig jars with the hadoop classes excluded. 
Commonly, you can find both of these in their respective distros. The required 
pig jar is generally named like `pig-x.x.x-withouthadoop.jar` and the mahout 
jar would be named like `mahout-core-job.jar`. If you want, you can build those 
yourself by following the instructions in [this JIRA comment][jira-mahout]]. 
For the remaining discussion, I am going to refer to these two jars by 
`pig-withouthadoop.jar` and `mahout-core-job.jar`.
-
-### Setup the classpath for hadoop nodes in the cluster
-
-```
-export 
JARS="/usr/lib/pig/pig-withouthadoop.jar,/usr/lib/mahout/mahout-core-job.jar"
-```
-
-We also need these jars to be present on the client side to kick-off the jobs. 
Reusing the `JARS` variable to put the same jars on the client classpath.
-
-```
-export HADOOP_CLASSPATH=`echo $JARS | sed s/,/:/g`
-```
-
-### Generate the data
-
-```
-hadoop jar bps.jar org.apache.bigtop.bigpetstore.generator.BPSGenerator 
1000000 bigpetstore/gen
-```
-
-### Clean with pig
-
-```
-hadoop jar bps.jar org.apache.bigtop.bigpetstore.etl.PigCSVCleaner -libjars 
$JARS bigpetstore/gen/ bigpetstore/ custom_pigscript.pig
-```
-
-### Analyze and generate recommendations with mahout
-
-```
-hadoop jar bps.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender 
-libjars $JARS  bigpetstore/pig/Mahout bigpetstore/Mahout/AlsFactorization 
bigpetstore/Mahout/AlsRecommendations
-```
-
-
-... (will add more steps as we add more phases to the workflow) ...
-
-
-Example of running in EMR
---------------------------
-- Put the jar in s3.  Right now there is a copy of it at the url below.
-
-- Download the elastic-mapreduce ruby shell script.
-create your "credentials.json" file.
-
-Now run this to generate 1,000,000 pet store transactions:
-
-./elastic-mapreduce --create --jar s3://bigpetstore/bigpetstore.jar \
---main-class org.apache.bigtop.bigpetstore.generator.BPSGenerator \
---num-instances 10  \
---arg 1000000 \
---arg s3://bigpetstore/data/generated \
---hadoop-version "2.2.0"  \
---master-instance-type m1.medium \
---slave-instance-type m1.medium
+BigPetStore is a family of example applications for the Hadoop/Spark
+ecosystems. BigPetStore generates and analyzes synthetic transaction data for
+a fictional chain of petstores.
 
-...Now lets clean the data with pig...
+BigPetStore has the following aims:
 
-Replace the above "main-class", and "--arg" options with
---main-class org.apache.bigtop.bigpetstore.etl.PigCSVCleaner
---arg s3://bigpetstore/data/generated
---arg s3://bigpetstore/data/pig_out
-(optional, you can send a script referencing the cleaned $input path to do some
-custom analytics, see the BPS_Analytics.pig script and companion
-http://jayunit100.github.io/bigpetstore) as an example).
---arg s3://path_to_custom_analytics_script.pig
+* Serve as a demo application to showcase capabilities of the BigTop 
distribution
+* Perform integration testing for BigTop's components
+* Serve as a template for building / packaging Hadoop/Spark applications
+* Provide scalable generation of complex synthetic data
+* Examples for using and integrating components such as Pig, Hive, Spark SQL, 
etc.
+* Examples of how to perform popular analytics tasks
 
-(note about pig: We support custom pig scripts.... for EMR, custom pig scripts 
will need to point to a
-local path, so youll have to put that script on the machine as part
-of EMR setup w/ a custom script).
+BigPetStore has the following components to date:
 
-...
+* Gradle build systems supporting Java, Scala, and Groovy
+* Data generators
+* Analytics
+  * ETL
+  * Item Recommenders
 
-And so on.
+The BigPetStore application was originally developed for MapReduce and 
associated
+components such as Pig, Hive, Mahout, Crunch, etc. With the increasing 
popularity
+and importance of Spark, BigPetStore has been expanded to support Spark.  To 
support
+the use case of deploying to pure MapReduce or Spark environments, we've 
elected to
+separate the MapReduce and Spark support into separate applications. You can 
find the
+two applications, along with further documentation, under 
`bigpetstore-mapreduce` and
+`bigpetstore-spark`, respectively.
 
 
-[jira-mahout]: 
https://issues.apache.org/jira/browse/BIGTOP-1272?focusedCommentId=14076023&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-1407602

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/arch.dot
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/arch.dot b/bigtop-bigpetstore/arch.dot
deleted file mode 100644
index 7d17c5a..0000000
--- a/bigtop-bigpetstore/arch.dot
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-digraph bigpetstore {
-
-   node [shape=record];
-
-
-   BPSAnalytics [label="BPSAnalytics.pig" ,style="rounded, filled", 
shape=diamond];
-   CUSTOMER_PAGE [label="CUSTOMER_PAGE|json|CUSTOMER_PAGE/part*"];
-   DIRTY_CSV [label="DIRTY_CSV|fname   lname -prod , price 
,prod,..|generated/part*"];
-   CSV 
[label="CSV|fname,lname,prod,price,date,xcoord,ycoord,...|cleaned/part*"];
-   MAHOUT_VIEW_INPUT [label="MAHOUT_VIEW  |  (user-id) 10001  (product-id) 203 
 (implicit-rating) 1 |  cleaned/Mahout/part*" ];
-   MAHOUT_ALS [label="Parallel ALS Recommender output  | (user-id) 10001  
[(product-id) 201: (recommendation-strength 0-1)0.546] | 
Mahout/AlsRecommendations/part*" ];
-
-   Generate -> DIRTY_CSV [label="hadoop jar bigpetstore.jar 
org.bigtop.bigpetstore.generator.BPSGenerator 100 bps/generated/"] ;
-   DIRTY_CSV -> pig [label="hadoop jar bigpetstore.jar 
org.bigtop.bigpetstore.etl.PigCSVCleaner bps/generated/ bps/cleaned/ "];
-
-   pig -> CSV [label="pig query to clean up generated transaction records"];
-   pig -> MAHOUT_VIEW_INPUT [label="pig query to produce mahout input format"];
-
-   MAHOUT_VIEW_INPUT -> ParallelALSFactorizationJob [label="hadoop jar 
bigpetstore.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender 
cleaned/Mahout Mahout/AlsFactorization Mahout/AlsRecommendations"];
-   ParallelALSFactorizationJob -> "Mahout RecommenderJob"
-   "Mahout RecommenderJob" -> MAHOUT_ALS
-
-   CSV -> BPSAnalytics;
-   BPSAnalytics  -> pig_job2;
-   pig_job2  -> CUSTOMER_PAGE [label=""];
-}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig 
b/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig
new file mode 100755
index 0000000..8516a7d
--- /dev/null
+++ b/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig
@@ -0,0 +1,79 @@
+----------------------------------------------------------------------------
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements.  See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License.  You may obtain a copy of the License at
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+-----------------------------------------------------------------------------
+
+-- This is the analytics script that BigPetStore uses as an example for
+-- demos of how to do ad-hoc analytics on the cleaned transaction data.
+-- It is used in conjunction with the big pet store web app, soon to be
+-- added to apache bigtop (As of 4/12/2014, the
+-- corresponding web app to consume this scripts output is
+-- in jayunit100.github.io/bigpetstore).
+
+-- invoke with two arguments, the input file , and the output file. -input 
/bps/gen -output /bps/analytics
+
+-- FYI...
+-- If you run into errors, you can see them in
+-- 
./target/failsafe-reports/TEST-org.bigtop.bigpetstore.integration.BigPetStorePigIT.xml
+
+-- First , we load data in from a file, as tuples.
+-- in pig, relations are like tables in a relational database
+-- so each relation is just a bunch of tuples.
+-- in this case csvdata will be a relation,
+-- where each tuple is a single petstore transaction.
+csvdata =
+    LOAD '$input' using PigStorage()
+        AS (
+          dump:chararray,
+          state:chararray,
+          transaction:int,
+          custId:long,
+          fname:chararray,
+          lname:chararray,
+          productId:int,
+          product:chararray,
+          price:float,
+          date:chararray);
+
+-- RESULT:
+-- (BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 
EST 1969)
+-- ...
+
+-- Okay! Now lets group our data so we can do some stats.
+-- lets create a new relation,
+-- where each tuple will contain all transactions for a product in a state.
+
+state_product = group csvdata by ( state, product ) ;
+
+-- RESULT
+-- ((storeCode_AK,dog-food) , 
{(BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 EST 
1969)}) --
+-- ...
+
+
+-- Okay now lets make some summary stats so that the boss man can
+-- decide which products are hottest in which states.
+
+-- Note that for the "groups", we tease out each individual field here for 
formatting with
+-- the BigPetStore visualization app.
+summary1 = FOREACH state_product generate STRSPLIT(group.state,'_').$1 as sp, 
group.product, COUNT($1);
+
+
+-- Okay, the stats look like this.  Lets clean them up.
+-- (storeCode_AK,cat-food)      2530
+-- (storeCode_AK,dog-food)      2540
+-- (storeCode_AK,fuzzy-collar)     2495
+
+dump summary1;
+
+store summary1 into '$output';

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/README.md
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/README.md 
b/bigtop-bigpetstore/bigpetstore-mapreduce/README.md
new file mode 100644
index 0000000..c806d57
--- /dev/null
+++ b/bigtop-bigpetstore/bigpetstore-mapreduce/README.md
@@ -0,0 +1,201 @@
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+(See accompanying source code for licensing information)
+
+BigPetStore
+============
+
+Apache Bigtop/Hadoop Ecosystem Demo
+-----------------------------------
+This software is created to demonstrate Apache Bigtop for processing
+big data sets.
+
+Architecture
+------------
+The application consists of the following modules
+
+* generator: generates raw data on the dfs
+* recommendations: Apache Mahout demo code for generating recommendations by 
analyzing the transaction records. This feature can be tracked at this [`JIRA` 
issue](https://issues.apache.org/jira/browse/BIGTOP-1272)
+* Pig: demo code for processing the data using Apache Pig
+* Hive: demo code for processing the data using Apache Hive. This part is not 
complete yet. We are working on it. You can track it using this [`JIRA` 
issue](https://issues.apache.org/jira/browse/BIGTOP-1270)
+* Crunch: demo code for processing the data using Apache Crunch
+
+Build Instructions
+------------------
+
+You'll need to have version 2.0 of  
[`gradle`](http://www.gradle.org/downloads) installed and set-up correctly in 
order to follow along these instructions.
+We could have used the 
[`gradle-wrapper`](http://www.gradle.org/docs/current/userguide/gradle_wrapper.html)
 to avoid having to install `gradle`, but the `bigtop` project includes all 
`gradle*` directories in `.gitignore`. So, that's not going to work.
+
+### Build the JAR
+
+`gradle clean build` will build the bigpetstore `jar`. The `jar` will be 
located in the `build/libs` directory.
+
+### Run Integration Tests With
+  * Pig profile: `gradle clean integrationTest -P ITProfile=pig`
+  * Mahout Profile: `gradle clean integrationTest -P ITProfile=mahout`
+  * Crunch profile: Not Implemented Yet
+  * Hive profile: Not implemented yet.
+
+If you don't specify any profile-name, or if you specify an invalid-name for 
the `integrationTest` task, no integration tests will be run.
+
+*Note:* At this stage, only the `Pig` and `Mahout` profiles are working. Will 
continue to update this area as further work is completed.
+
+For Eclipse Users
+-----------------
+
+1. Run `gradle eclipse` to create an eclipse project.
+2. Import the project into eclipse.
+
+*Note* whenever you modify the dependencies, you will need to run the `gradle 
eclipse` again. Refresh the project after doing so. You'd also need to have the 
`scala` plugin installed. Also, having a `gradle` plugin would be quite useful 
as well, for ex. when you want to update dependencies.
+
+High level summary
+------------------
+
+The bigpetstore project exemplifies the hadoop ecosystem for newcomers, and 
also for benchmarking and
+comparing functional space of tools.
+
+The end goal is to run many different implementations of each phase
+using different tools, thus exemplifying overlap of tools in the hadoop 
ecosystem, and allowing people to benchmark/compare tools
+using a common framework and easily understood use case
+
+
+How it works (To Do)
+--------------------
+
+### Phase 1: Generating pet store data:
+
+The first step is to generate a raw data set.  This is done by the 
"GeneratePetStoreTransactionsInputFormat":
+
+The first MapReduce job in the pipeline runs a simple job which takes this 
input format and forwards
+its output.  The result is a list of "transactions".  Each transaction is a 
tuple of the format
+
+  *{state,name,date,price,product}.*
+
+### Phase 2: Processing the data
+
+The next phase of the application processes the data to create basic 
aggregations. For example with both pig and hive these could easily include
+
+- *Number of transactions by state* or
+- *Most valuable customer by state* or
+- *Most popular items by state*
+
+
+### Phase 3: Clustering the states by all fields
+
+  Now, say we want to cluster the states, so as to put different states into 
different buying categories
+  for our marketing team to deal with differently.
+
+### Phase 4: Visualizing the Data in D3.
+
+ - try it [on the gh-pages branch](http://jayunit100.github.io/bigpetstore/)
+
+
+Running on a hadoop cluster
+---------------------------
+
+*Note:* For running the code using the `hadoop jar` command instead of the 
`gradle` tasks, you will need to set the classpath appropriately. The 
discussion after [this comment][jira-mahout] in JIRA could also be useful apart 
from these instructions.
+
+### Build the fat-jar
+
+We are going to use a fat-jar in order to avoid specifying the entire 
classpath ourselves.
+
+The fat-jar is required when we are running the application on a hadoop 
cluster. The other way would be to specify all the dependencies (including the 
transitive ones) manually while running the hadoop job. Fat-jars make it easier 
to bundle almost all the dependencies inside the distribution jar itself.
+
+```
+gradle clean shadowJar -Pfor-cluster
+```
+
+This command will build the fat-jar with all the dependencies bundled in 
except the hadoop, mahout and pig dependencies, which we'll specify using 
`-libjars` option while running the hadoop job. These dependencies are excluded 
to avoid conflicts with the jars provided by hadoop itself.
+
+The generated jar will be inside the `build/libs` dir, with name like 
`BigPetStore-x.x.x-SNAPSHOT-all.jar`. For the remaining discussion I'll refer to 
this jar by `bps.jar`.
+
+### Get the mahout and pig jars
+
+You'll need both mahout and pig jars with the hadoop classes excluded. 
Commonly, you can find both of these in their respective distros. The required 
pig jar is generally named like `pig-x.x.x-withouthadoop.jar` and the mahout 
jar would be named like `mahout-core-job.jar`. If you want, you can build those 
yourself by following the instructions in [this JIRA comment][jira-mahout]. 
For the remaining discussion, I am going to refer to these two jars by 
`pig-withouthadoop.jar` and `mahout-core-job.jar`.
+
+### Setup the classpath for hadoop nodes in the cluster
+
+```
+export 
JARS="/usr/lib/pig/pig-withouthadoop.jar,/usr/lib/mahout/mahout-core-job.jar"
+```
+
+We also need these jars to be present on the client side to kick-off the jobs. 
Reusing the `JARS` variable to put the same jars on the client classpath.
+
+```
+export HADOOP_CLASSPATH=`echo $JARS | sed s/,/:/g`
+```
+
+### Generate the data
+
+```
+hadoop jar bps.jar org.apache.bigtop.bigpetstore.generator.BPSGenerator 
1000000 bigpetstore/gen
+```
+
+### Clean with pig
+
+```
+hadoop jar bps.jar org.apache.bigtop.bigpetstore.etl.PigCSVCleaner -libjars 
$JARS bigpetstore/gen/ bigpetstore/ custom_pigscript.pig
+```
+
+### Analyze and generate recommendations with mahout
+
+```
+hadoop jar bps.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender 
-libjars $JARS  bigpetstore/pig/Mahout bigpetstore/Mahout/AlsFactorization 
bigpetstore/Mahout/AlsRecommendations
+```
+
+
+... (will add more steps as we add more phases to the workflow) ...
+
+
+Example of running in EMR
+--------------------------
+- Put the jar in s3.  Right now there is a copy of it at the url below.
+
+- Download the elastic-mapreduce ruby shell script.
+create your "credentials.json" file.
+
+Now run this to generate 1,000,000 pet store transactions:
+
+./elastic-mapreduce --create --jar s3://bigpetstore/bigpetstore.jar \
+--main-class org.apache.bigtop.bigpetstore.generator.BPSGenerator \
+--num-instances 10  \
+--arg 1000000 \
+--arg s3://bigpetstore/data/generated \
+--hadoop-version "2.2.0"  \
+--master-instance-type m1.medium \
+--slave-instance-type m1.medium
+
+...Now let's clean the data with pig...
+
+Replace the above "main-class", and "--arg" options with
+--main-class org.apache.bigtop.bigpetstore.etl.PigCSVCleaner
+--arg s3://bigpetstore/data/generated
+--arg s3://bigpetstore/data/pig_out
+(optional, you can send a script referencing the cleaned $input path to do some
+custom analytics, see the BPS_Analytics.pig script and companion
+http://jayunit100.github.io/bigpetstore as an example).
+--arg s3://path_to_custom_analytics_script.pig
+
+(note about pig: We support custom pig scripts.... for EMR, custom pig scripts 
will need to point to a
+local path, so you'll have to put that script on the machine as part
+of EMR setup w/ a custom script).
+
+...
+
+And so on.
+
+
+[jira-mahout]: 
https://issues.apache.org/jira/browse/BIGTOP-1272?focusedCommentId=14076023&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-14076023

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot 
b/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot
new file mode 100644
index 0000000..7d17c5a
--- /dev/null
+++ b/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot
@@ -0,0 +1,41 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+digraph bigpetstore {
+
+   node [shape=record];
+
+
+   BPSAnalytics [label="BPSAnalytics.pig" ,style="rounded, filled", 
shape=diamond];
+   CUSTOMER_PAGE [label="CUSTOMER_PAGE|json|CUSTOMER_PAGE/part*"];
+   DIRTY_CSV [label="DIRTY_CSV|fname   lname -prod , price 
,prod,..|generated/part*"];
+   CSV 
[label="CSV|fname,lname,prod,price,date,xcoord,ycoord,...|cleaned/part*"];
+   MAHOUT_VIEW_INPUT [label="MAHOUT_VIEW  |  (user-id) 10001  (product-id) 203 
 (implicit-rating) 1 |  cleaned/Mahout/part*" ];
+   MAHOUT_ALS [label="Parallel ALS Recommender output  | (user-id) 10001  
[(product-id) 201: (recommendation-strength 0-1)0.546] | 
Mahout/AlsRecommendations/part*" ];
+
+   Generate -> DIRTY_CSV [label="hadoop jar bigpetstore.jar 
org.bigtop.bigpetstore.generator.BPSGenerator 100 bps/generated/"] ;
+   DIRTY_CSV -> pig [label="hadoop jar bigpetstore.jar 
org.bigtop.bigpetstore.etl.PigCSVCleaner bps/generated/ bps/cleaned/ "];
+
+   pig -> CSV [label="pig query to clean up generated transaction records"];
+   pig -> MAHOUT_VIEW_INPUT [label="pig query to produce mahout input format"];
+
+   MAHOUT_VIEW_INPUT -> ParallelALSFactorizationJob [label="hadoop jar 
bigpetstore.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender 
cleaned/Mahout Mahout/AlsFactorization Mahout/AlsRecommendations"];
+   ParallelALSFactorizationJob -> "Mahout RecommenderJob"
+   "Mahout RecommenderJob" -> MAHOUT_ALS
+
+   CSV -> BPSAnalytics;
+   BPSAnalytics  -> pig_job2;
+   pig_job2  -> CUSTOMER_PAGE [label=""];
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle 
b/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle
new file mode 100644
index 0000000..c80672c
--- /dev/null
+++ b/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+apply plugin: "java"
+apply plugin: "eclipse"
+// TODO add idea module config.
+apply plugin: "idea"
+apply plugin: "scala"
+apply plugin: 'com.github.johnrengelman.shadow'
+
+buildscript {
+  repositories { jcenter() }
+  dependencies {
+    classpath 'com.github.jengelman.gradle.plugins:shadow:1.0.2'
+  }
+}
+
+// Read the groupId and version properties from the "parent" bigtop project.
+// It would be better if there was some better way of doing this. However,
+// at this point, we have to do this (or some variation thereof) since gradle
+// projects can't have maven projects as parents (AFAIK. If there is a way to 
do it,
+// it doesn't seem to be well-documented).
+def setProjectProperties() {
+    Node xml = new XmlParser().parse("../../pom.xml")
+    group = xml.groupId.first().value().first()
+    version = xml.version.first().value().first()
+}
+
+setProjectProperties()
+description = """"""
+
+// We are using 1.7 as gradle can't play well when java 8 and scala are 
combined.
+// There is an open issue here: http://issues.gradle.org/browse/GRADLE-3023
+// There is talk of this being resolved in the next version of gradle. Till 
then,
+// we are stuck with java 7. But we do have scala if we want more syntactic 
sugar.
+sourceCompatibility = 1.7
+targetCompatibility = 1.7
+
+// Specify any additional project properties.
+ext {
+    slf4jVersion = "1.7.5"
+    guavaVersion = "15.0"
+    datanucleusVersion = "3.2.2"
+    datanucleusJpaVersion = "3.2.1"
+    bonecpVersion = "0.8.0.RELEASE"
+    derbyVersion = "10.10.1.1"
+
+    // from horton-works repo. They compile mahout-core against hadoop2.x. 
This
+    // mahout is compiled against 2.4.0
+    hadoopVersion = "2.4.0.2.1.2.0-402"
+    mahoutVersion = "0.9.0.2.1.2.0-402"
+}
+
+repositories {
+    mavenCentral()
+    maven {
+        url "http://repo.hortonworks.com/content/repositories/releases/";
+    }
+}
+
+tasks.withType(AbstractCompile) {
+    options.encoding = 'UTF-8'
+    options.compilerArgs << "-Xlint:all"
+}
+
+tasks.withType(ScalaCompile) {
+    // Enables incremental compilation.
+    // 
http://www.gradle.org/docs/current/userguide/userguide_single.html#N12F78
+    scalaCompileOptions.useAnt = false
+}
+
+tasks.withType(Test) {
+    testLogging {
+        // Uncomment this if you want to see the console output from the tests.
+        // showStandardStreams = true
+        events "passed", "skipped", "failed"
+        // show standard out and standard error of the test JVM(s) on the 
console
+        //showStandardStreams = true
+    }
+}
+
+test {
+    exclude "**/*TestPig.java", "**/*TestHiveEmbedded.java", 
"**/*TestCrunch.java", "**/*TestPetStoreTransactionGeneratorJob.java"
+}
+
+// Create a separate source-set for the src/integrationTest set of classes. 
The convention here
+// is that gradle will look for a directory with the same name as that of the 
specified source-set
+// under the 'src' directory. So, in this case, it will look for a directory 
named 'src/integrationTest'
+// since the name of the source-set is 'integrationTest'
+sourceSets {
+    main {
+        java.srcDirs = [];
+        scala.srcDirs = ["src/main/scala", "src/main/java"]
+    }
+    // The main and test source-sets are configured by both java and scala 
plugins. They contain
+    // all the src/main and src/test classes. The following statements make 
all of those classes
+    // available on the classpath for the integration-tests, for both java and 
scala.
+    integrationTest {
+        java {
+            compileClasspath += main.output + test.output
+            runtimeClasspath += main.output + test.output
+        }
+        scala {
+            compileClasspath += main.output + test.output
+            runtimeClasspath += main.output + test.output
+        }
+    }
+}
+
+// Creating a source-set automatically adds a couple of corresponding 
configurations (when java/scala
+// plugins are applied). The convention for these configurations is 
<sourceSetName>Compile and
+// <sourceSetName>Runtime. The following statements declare that all the 
dependencies from the
+// testCompile configuration will now be available for integrationTestCompile, 
and all the
+// dependencies (and other configuration that we might have provided) for 
testRuntime will be
+// available for integrationTestRuntime. For ex. the testCompile configuration 
has a dependency on
+// jUnit and scalatest. This makes them available for the integration tests as 
well.
+configurations {
+    integrationTestCompile {
+        extendsFrom testCompile
+    }
+
+    integrationTestRuntime {
+        extendsFrom integrationTestCompile, testRuntime
+    }
+}
+
+// To see the API that is being used here, consult the following docs
+// 
http://www.gradle.org/docs/current/dsl/org.gradle.api.artifacts.ResolutionStrategy.html
+def updateDependencyVersion(dependencyDetails, dependencyString) {
+    def parts = dependencyString.split(':')
+    def group = parts[0]
+    def name = parts[1]
+    def version = parts[2]
+    if (dependencyDetails.requested.group == group
+            && dependencyDetails.requested.name == name) {
+        dependencyDetails.useVersion version
+    }
+}
+
+def setupPigIntegrationTestDependencyVersions(dependencyResolveDetails) {
+    // This is the way we override the dependencies.
+    updateDependencyVersion dependencyResolveDetails, "joda-time:joda-time:2.2"
+}
+
+def setupCrunchIntegrationTestDependencyVersions(dependencyResolveDetails) {
+    // Specify any dependencies that you want to override for crunch 
integration tests.
+}
+
+def setupMahoutIntegrationTestDependencyVersions(dependencyResolveDetails) {
+    // Specify any dependencies that you want to override for mahout 
integration tests.
+}
+
+
+task integrationTest(type: Test, dependsOn: test) {
+
+    testClassesDir = sourceSets.integrationTest.output.classesDir
+    classpath = sourceSets.integrationTest.runtimeClasspath
+
+    if(!project.hasProperty('ITProfile')) {
+        // skip integration-tests if no profile has been specified.
+        integrationTest.onlyIf { false }
+        return;
+    }
+
+    def patternsToInclude
+    def dependencyConfigClosure
+    def skipDependencyUpdates = false
+    // Select the pattern for test classes that should be executed, and the 
dependency
+    // configuration function to be called based on the profile name specified 
at the command line.
+    switch (project.ITProfile) {
+        case "pig":
+            patternsToInclude = "*PigIT*"
+            dependencyConfigClosure = { 
setupPigIntegrationTestDependencyVersions(it) }
+            break
+        case "crunch":
+            patternsToInclude = "*CrunchIT*"
+            dependencyConfigClosure = { 
setupCrunchIntegrationTestDependencyVersions(it) }
+            break
+        case "mahout":
+            patternsToInclude = "*MahoutIT*"
+            dependencyConfigClosure = { 
setupMahoutIntegrationTestDependencyVersions(it) }
+            break
+        // skip integration-tests if the passed in profile-name is not valid
+        default: integrationTest.onlyIf { false }; return
+    }
+
+
+    filter { includeTestsMatching patternsToInclude }
+
+    // This is the standard way gradle allows overriding each specific 
dependency.
+    // see: 
http://www.gradle.org/docs/current/dsl/org.gradle.api.artifacts.ResolutionStrategy.html
+    project.configurations.all {
+        resolutionStrategy {
+            eachDependency {
+                dependencyConfigClosure(it)
+            }
+        }
+    }
+}
+
+dependencies {
+    compile "org.kohsuke:graphviz-api:1.0"
+    compile "org.apache.crunch:crunch-core:0.9.0-hadoop2"
+    compile "com.jolbox:bonecp:${project.bonecpVersion}"
+    compile "org.apache.derby:derby:${project.derbyVersion}"
+    compile "com.google.guava:guava:${project.guavaVersion}"
+    compile "commons-lang:commons-lang:2.6"
+    compile "joda-time:joda-time:2.3"
+    compile "org.apache.commons:commons-lang3:3.1"
+    compile "com.google.protobuf:protobuf-java:2.5.0"
+    compile "commons-logging:commons-logging:1.1.3"
+    compile "com.thoughtworks.xstream:xstream:+"
+    compile "org.apache.lucene:lucene-core:+"
+    compile "org.apache.lucene:lucene-analyzers-common:+"
+    compile "org.apache.solr:solr-commons-csv:3.5.0"
+
+    compile group: "org.apache.pig", name: "pig", version: "0.12.0", 
classifier:"h2"
+    compile "org.slf4j:slf4j-api:${project.slf4jVersion}"
+    compile "log4j:log4j:1.2.12"
+    compile "org.slf4j:slf4j-log4j12:${project.slf4jVersion}"
+    compile "org.datanucleus:datanucleus-core:${project.datanucleusVersion}"
+    compile 
"org.datanucleus:datanucleus-rdbms:${project.datanucleusJpaVersion}"
+    compile 
"org.datanucleus:datanucleus-api-jdo:${project.datanucleusJpaVersion}"
+    compile 
"org.datanucleus:datanucleus-accessplatform-jdo-rdbms:${project.datanucleusJpaVersion}"
+    compile group: "org.apache.mrunit", name: "mrunit", version: "1.0.0", 
classifier:"hadoop2"
+
+    compile "org.jfairy:jfairy:0.2.4"
+
+    // from horton-works repo. They compile mahout-core against hadoop2.x
+    compile "org.apache.hadoop:hadoop-client:${hadoopVersion}"
+    compile "org.apache.mahout:mahout-core:${mahoutVersion}"
+
+    compile 'org.scala-lang:scala-library:2.11.0'
+
+    testCompile "junit:junit:4.11"
+    testCompile "org.hamcrest:hamcrest-all:1.3"
+    testCompile "org.scalatest:scalatest_2.11:2.1.7"
+}
+
+configurations {
+    hadoopClusterRuntime {
+           // extendsFrom integrationTestRuntime
+           if(project.hasProperty('for-cluster')) {
+                   excludeRules += [getGroup: { 'org.apache.crunch' }, 
getModule: { 'crunch-core' } ] as ExcludeRule
+                   excludeRules += [getGroup: { 'org.apache.pig' }, getModule: 
{ 'pig' } ] as ExcludeRule
+                   excludeRules += [getGroup: { 'org.apache.mahout' }, 
getModule: { 'mahout-core' } ] as ExcludeRule
+                   excludeRules += [getGroup: { 'org.apache.hadoop' }, 
getModule: { 'hadoop-client' } ] as ExcludeRule
+               }
+    }
+}
+
+task listJars << {
+    configurations.shadow.each { println it.name }
+}
+
+def copyDependencyJarsForHadoopCluster() {
+    copy {
+        from configurations.hadoopClusterRuntime
+        into 'build/libs'
+    }
+}
+
+build {
+    doLast {
+        copyDependencyJarsForHadoopCluster()
+    }
+}
+
+eclipse {
+    classpath {
+        // Add the dependencies and the src dirs for the integrationTest 
source-set to the
+        // .classpath file that will be generated by the eclipse plugin.
+        plusConfigurations += [configurations.integrationTestCompile]
+        // Comment out the following two lines if you want to generate an 
eclipse project quickly.
+        downloadSources = true
+        downloadJavadoc = false
+    }
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml 
b/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml
new file mode 100644
index 0000000..a5fc979
--- /dev/null
+++ b/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml
@@ -0,0 +1,584 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+       Licensed to the Apache Software Foundation (ASF) under one or more
+       contributor license agreements. See the NOTICE file distributed with
+       this work for additional information regarding copyright ownership.
+       The ASF licenses this file to You under the Apache License, Version 2.0
+       (the "License"); you may not use this file except in compliance with
+       the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+       Unless required by applicable law or agreed to in writing, software
+       distributed under the License is distributed on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+       See the License for the specific language governing permissions and
+       limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+       <modelVersion>4.0.0</modelVersion>
+       <groupId>org.apache.bigtop</groupId>
+       <artifactId>BigPetStore</artifactId>
+       <version>0.9.0-SNAPSHOT</version>
+       <properties>
+               
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+               
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+               
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+               
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+               <slf4j.version>1.7.5</slf4j.version>
+               <guava.version>15.0</guava.version>
+               <hadoop.version>2.2.0</hadoop.version>
+               <hive.version>0.12.0</hive.version>
+               <datanucleus.version>3.2.2</datanucleus.version>
+               <datanucleus.jpa.version>3.2.1</datanucleus.jpa.version>
+               <bonecp.version>0.9.0-SNAPSHOT.RELEASE</bonecp.version>
+               <derby.version>10.10.1.1</derby.version>
+               <plugin.surefire.version>2.17</plugin.surefire.version>
+       </properties>
+
+       <dependencies>
+               <dependency>
+                       <groupId>org.kohsuke</groupId>
+                       <artifactId>graphviz-api</artifactId>
+                       <version>1.0</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.apache.crunch</groupId>
+                       <artifactId>crunch-core</artifactId>
+                       <version>0.9.0-hadoop2</version>
+               </dependency>
+
+               <!-- misc deps -->
+               <dependency>
+                       <groupId>com.jolbox</groupId>
+                       <artifactId>bonecp</artifactId>
+                       <version>${bonecp.version}</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.apache.derby</groupId>
+                       <artifactId>derby</artifactId>
+                       <version>${derby.version}</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>com.google.guava</groupId>
+                       <artifactId>guava</artifactId>
+                       <version>${guava.version}</version>
+               </dependency>
+
+               <!-- From pig profile -->
+               <dependency>
+                       <groupId>commons-lang</groupId>
+                       <artifactId>commons-lang</artifactId>
+                       <version>2.6</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>joda-time</groupId>
+                       <artifactId>joda-time</artifactId>
+                       <version>2.3</version>
+               </dependency>
+               <!-- end pig profile -->
+               <!-- From hive profile -->
+               <dependency>
+                       <groupId>org.apache.commons</groupId>
+                       <artifactId>commons-lang3</artifactId>
+                       <version>3.1</version>
+               </dependency>
+               <!-- end hive profile -->
+               <!-- From Crunch profile -->
+               <dependency>
+                       <groupId>com.google.protobuf</groupId>
+                       <artifactId>protobuf-java</artifactId>
+                       <version>2.5.0</version>
+               </dependency>
+               <!-- end crunch profile -->
+               <!-- From Mahout profile -->
+               <dependency>
+                       <groupId>commons-logging</groupId>
+                       <artifactId>commons-logging</artifactId>
+                       <version>1.1.3</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.mahout</groupId>
+                       <artifactId>mahout-math</artifactId>
+                       <version>0.9</version>
+               </dependency>
+               <dependency>
+                       <groupId>com.thoughtworks.xstream</groupId>
+                       <artifactId>xstream</artifactId>
+                       <version>LATEST</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.lucene</groupId>
+                       <artifactId>lucene-core</artifactId>
+                       <version>LATEST</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.lucene</groupId>
+                       <artifactId>lucene-analyzers-common</artifactId>
+                       <version>LATEST</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.mahout.commons</groupId>
+                       <artifactId>commons-cli</artifactId>
+                       <version>LATEST</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.commons</groupId>
+                       <artifactId>commons-math3</artifactId>
+                       <version>LATEST</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.solr</groupId>
+                       <artifactId>solr-commons-csv</artifactId>
+                       <version>3.5.0</version>
+               </dependency>
+               <!-- end Mahout profile -->
+
+               <!-- TODO ask question about this comment -->
+               <!-- We keep this at top level so that mvn eclipse:eclipse 
creates a nice
+                       tidy project, but its a little messy. later we'll 
create a profile for eclipse
+                       and move this (and other deps) into profiles as needed. 
Important: Remove
+                       this dependency when running hive integration tests... 
-->
+               <dependency>
+                       <groupId>org.apache.hadoop</groupId>
+                       <artifactId>hadoop-client</artifactId>
+                       <version>${hadoop.version}</version>
+               </dependency>
+               <!-- TODO ask question about this comment -->
+               <!-- mahout deps : may need to turn these on/off when testing 
mahout locally -->
+               <!-- For testing on my machine, I created a bigpetstore mahout 
jar which
+                       is compiled for 2.2.0 . Or substitute this with the 
standard apache mahout-core
+                       but not sure if it will work. -->
+               <dependency>
+                       <groupId>org.apache.mahout</groupId>
+                       <artifactId>mahout-core</artifactId>
+                       <version>0.8</version>
+               </dependency>
+               <!-- pig deps -->
+               <dependency>
+                       <groupId>org.apache.pig</groupId>
+                       <artifactId>pig</artifactId>
+                       <classifier>h2</classifier>
+                       <version>0.12.0</version>
+               </dependency>
+
+               <!--logging -->
+
+               <dependency>
+                       <groupId>org.slf4j</groupId>
+                       <artifactId>slf4j-api</artifactId>
+                       <version>${slf4j.version}</version>
+               </dependency>
+               <dependency>
+                       <groupId>log4j</groupId>
+                       <artifactId>log4j</artifactId>
+                       <version>1.2.12</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.slf4j</groupId>
+                       <artifactId>slf4j-log4j12</artifactId>
+                       <version>${slf4j.version}</version>
+               </dependency>
+               <!-- hive -->
+               <dependency>
+                       <groupId>org.apache.hive</groupId>
+                       <artifactId>hive-common</artifactId>
+                       <version>${hive.version}</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hive</groupId>
+                       <artifactId>hive-serde</artifactId>
+                       <version>${hive.version}</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hive</groupId>
+                       <artifactId>hive-jdbc</artifactId>
+                       <version>${hive.version}</version>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hive</groupId>
+                       <artifactId>hive-contrib</artifactId>
+                       <version>${hive.version}</version>
+               </dependency>
+
+               <!-- datanucleus -->
+               <dependency>
+                       <groupId>org.datanucleus</groupId>
+                       <artifactId>datanucleus-core</artifactId>
+                       <version>${datanucleus.version}</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.datanucleus</groupId>
+                       <artifactId>datanucleus-rdbms</artifactId>
+                       <version>${datanucleus.jpa.version}</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.datanucleus</groupId>
+                       <artifactId>datanucleus-api-jdo</artifactId>
+                       <version>${datanucleus.jpa.version}</version>
+               </dependency>
+
+               <!-- TODO eliminate this pom dependency -->
+               <dependency>
+                       <groupId>org.datanucleus</groupId>
+                       
<artifactId>datanucleus-accessplatform-jdo-rdbms</artifactId>
+                       <version>${datanucleus.jpa.version}</version>
+                       <type>pom</type>
+               </dependency>
+
+               <!-- Unit test artifacts -->
+               <dependency>
+                       <groupId>junit</groupId>
+                       <artifactId>junit</artifactId>
+                       <version>4.11</version>
+                       <scope>test</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.hamcrest</groupId>
+                       <artifactId>hamcrest-all</artifactId>
+                       <version>1.3</version>
+                       <scope>test</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.mrunit</groupId>
+                       <artifactId>mrunit</artifactId>
+                       <version>1.0.0</version>
+                       <classifier>hadoop2</classifier>
+               </dependency>
+       </dependencies>
+
+       <build>
+               <extensions>
+                       <extension>
+                               <groupId>org.springframework.build.aws</groupId>
+                               
<artifactId>org.springframework.build.aws.maven</artifactId>
+                               <version>3.0.0.RELEASE</version>
+                       </extension>
+               </extensions>
+               <finalName>bigpetstore-${project.version}</finalName>
+               <plugins>
+                       <plugin>
+                               <groupId>org.apache.maven.plugins</groupId>
+                               <artifactId>maven-release-plugin</artifactId>
+                               <version>2.5</version>
+                       </plugin>
+                       <plugin>
+                               <groupId>org.apache.maven.plugins</groupId>
+                               <artifactId>maven-eclipse-plugin</artifactId>
+                               <version>2.9</version>
+                               <configuration>
+                                       <downloadSources>true</downloadSources>
+                                       
<downloadJavadocs>true</downloadJavadocs>
+                               </configuration>
+                       </plugin>
+
+                       <plugin>
+                               <groupId>org.apache.maven.plugins</groupId>
+                               <artifactId>maven-compiler-plugin</artifactId>
+                               
<version>${maven-compiler-plugin.version}</version>
+                               <configuration>
+                                       <source>1.8</source>
+                                       <target>1.8</target>
+                               </configuration>
+                       </plugin>
+                       <plugin>
+                               <groupId>org.apache.maven.plugins</groupId>
+                               <artifactId>maven-jar-plugin</artifactId>
+                               <version>2.4</version>
+                               <configuration>
+                                       
<outputDirectory>${basedir}/target</outputDirectory>
+                               </configuration>
+                       </plugin>
+                       <plugin>
+                               <groupId>org.apache.maven.plugins</groupId>
+                               <artifactId>maven-surefire-plugin</artifactId>
+                               <version>${plugin.surefire.version}</version>
+                               <configuration>
+                                       <excludes>
+                                               
<exclude>**/*TestPig.java</exclude>
+                                               
<exclude>**/*TestHiveEmbedded.java</exclude>
+                                               
<exclude>**/*TestCrunch.java</exclude>
+                                       </excludes>
+                               </configuration>
+                       </plugin>
+               </plugins>
+       </build>
+
+       <profiles>
+               <profile>
+                       <id>pig</id>
+                       <build>
+                               <plugins>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-surefire-plugin</artifactId>
+                                               
<version>${plugin.surefire.version}</version>
+                                               <configuration>
+                                                       <excludes>
+                                                               
<exclude>**/*TestPig.java</exclude>
+                                                               
<exclude>**/*TestHiveEmbedded.java</exclude>
+                                                               
<exclude>**/*TestCrunch.java</exclude>
+                                                               
<exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+                                                       </excludes>
+
+                                               </configuration>
+                                       </plugin>
+                                       <!-- Registers the integration-test sources as an additional test
+                                               source root. The path must match the directory that actually
+                                               holds the *IT classes (src/integrationTest/java). -->
+                                       <plugin>
+                                               <groupId>org.codehaus.mojo</groupId>
+                                               <artifactId>build-helper-maven-plugin</artifactId>
+                                               <version>1.5</version>
+                                               <executions>
+                                                       <execution>
+                                                               <id>add-test-source</id>
+                                                               <phase>generate-test-sources</phase>
+                                                               <goals>
+                                                                       <goal>add-test-source</goal>
+                                                               </goals>
+                                                               <configuration>
+                                                                       <sources>
+                                                                               <source>src/integrationTest/java</source>
+                                                                       </sources>
+                                                               </configuration>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-failsafe-plugin</artifactId>
+                                               <version>2.12</version>
+
+                                               <configuration>
+                                                       
<argLine>-Xmx1g</argLine>
+                                                       <excludes>
+                                                               
<exclude>**/*BigPetStoreMahoutIT.java</exclude>
+                                                               
<exclude>**/*BigPetStoreHiveIT.java</exclude>
+                                                               
<exclude>**/*BigPetStoreCrunchIT.java</exclude>
+                                                       </excludes>
+                                               </configuration>
+                                               <executions>
+                                                       <!-- States that both 
integration-test and verify goals of the Failsafe
+                                                               Maven plugin 
are executed. -->
+                                                       <execution>
+                                                               
<id>integration-tests</id>
+                                                               <goals>
+                                                                       
<goal>integration-test</goal>
+                                                                       
<goal>verify</goal>
+                                                               </goals>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                               </plugins>
+                       </build>
+               </profile>
+
+               <profile>
+                       <id>hive</id>
+                       <build>
+                               <plugins>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-surefire-plugin</artifactId>
+                                               
<version>${plugin.surefire.version}</version>
+                                               <configuration>
+                                                       <excludes>
+                                                               
<exclude>**/*TestPig.java</exclude>
+                                                               
<exclude>**/*TestHiveEmbedded.java</exclude>
+                                                               
<exclude>**/*TestCrunch.java</exclude>
+                                                               
<exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+                                                       </excludes>
+                                               </configuration>
+                                       </plugin>
+                                       <!-- Registers the integration-test sources as an additional test
+                                               source root. The path must match the directory that actually
+                                               holds the *IT classes (src/integrationTest/java). -->
+                                       <plugin>
+                                               <groupId>org.codehaus.mojo</groupId>
+                                               <artifactId>build-helper-maven-plugin</artifactId>
+                                               <version>1.5</version>
+                                               <executions>
+                                                       <execution>
+                                                               <id>add-test-source</id>
+                                                               <phase>generate-test-sources</phase>
+                                                               <goals>
+                                                                       <goal>add-test-source</goal>
+                                                               </goals>
+                                                               <configuration>
+                                                                       <sources>
+                                                                               <source>src/integrationTest/java</source>
+                                                                       </sources>
+                                                               </configuration>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-failsafe-plugin</artifactId>
+                                               <version>2.12</version>
+                                               <configuration>
+                                                       <excludes>
+                                                               
<exclude>**/*BigPetStoreMahoutIT.java</exclude>
+                                                               
<exclude>**/*BigPetStorePigIT.java</exclude>
+                                                               
<exclude>**/*BigPetStoreCrunchIT.java</exclude>
+                                                       </excludes>
+                                               </configuration>
+                                               <executions>
+                                                       <!-- States that both 
integration-test and verify goals of the Failsafe
+                                                               Maven plugin 
are executed. -->
+                                                       <execution>
+                                                               
<id>integration-tests</id>
+                                                               <goals>
+                                                                       
<goal>integration-test</goal>
+                                                                       
<goal>verify</goal>
+                                                               </goals>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                               </plugins>
+                       </build>
+                       <dependencies>
+                               <!-- hadoop -->
+                               <!-- TODO is this version change required? 
Version 2.2.0 is provided
+                                       by hadoop-client dependency. Shouldn't 
we have the same versions for the
+                                       related dependencies? -->
+                               <dependency>
+                                       <groupId>org.apache.hadoop</groupId>
+                                       
<artifactId>hadoop-mapreduce-client-app</artifactId>
+                                       <version>2.3.0</version>
+                               </dependency>
+                       </dependencies>
+               </profile>
+               <profile>
+                       <id>crunch</id>
+                       <build>
+                               <plugins>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-surefire-plugin</artifactId>
+                                               
<version>${plugin.surefire.version}</version>
+                                               <configuration>
+                                                       <excludes>
+                                                               
<exclude>**/*TestPig.java</exclude>
+                                                               
<exclude>**/*TestHiveEmbedded.java</exclude>
+                                                               
<exclude>**/*TestCrunch.java</exclude>
+                                                               
<exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+                                                       </excludes>
+                                               </configuration>
+                                       </plugin>
+                                       <!-- Registers the integration-test sources as an additional test
+                                               source root. The path must match the directory that actually
+                                               holds the *IT classes (src/integrationTest/java). -->
+                                       <plugin>
+                                               <groupId>org.codehaus.mojo</groupId>
+                                               <artifactId>build-helper-maven-plugin</artifactId>
+                                               <version>1.5</version>
+                                               <executions>
+                                                       <execution>
+                                                               <id>add-test-source</id>
+                                                               <phase>generate-test-sources</phase>
+                                                               <goals>
+                                                                       <goal>add-test-source</goal>
+                                                               </goals>
+                                                               <configuration>
+                                                                       <sources>
+                                                                               <source>src/integrationTest/java</source>
+                                                                       </sources>
+                                                               </configuration>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-failsafe-plugin</artifactId>
+                                               <version>2.12</version>
+                                               <configuration>
+                                                       <excludes>
+                                                               
<exclude>**/*BigPetStorePigIT.java</exclude>
+                                                               
<exclude>**/*BigPetStoreHiveIT.java</exclude>
+                                                               
<exclude>**/*BigPetStoreMahoutIT.java</exclude>
+                                                       </excludes>
+                                               </configuration>
+                                               <executions>
+                                                       <!-- States that both 
integration-test and verify goals of the Failsafe
+                                                               Maven plugin 
are executed. -->
+                                                       <execution>
+                                                               
<id>integration-tests</id>
+                                                               <goals>
+                                                                       
<goal>integration-test</goal>
+                                                                       
<goal>verify</goal>
+                                                               </goals>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                               </plugins>
+                       </build>
+               </profile>
+               <profile>
+                       <id>mahout</id>
+                       <!-- TODO this property is not being used anywhere. 
It's not even automatically
+                               detectable. Remove? Or do something that the 
name suggests? -->
+                       <properties>
+                               <skip.unit.tests>true</skip.unit.tests>
+                       </properties>
+                       <build>
+                               <plugins>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-surefire-plugin</artifactId>
+                                               
<version>${plugin.surefire.version}</version>
+                                               <configuration>
+                                                       <excludes>
+                                                               
<exclude>**/*TestPig.java</exclude>
+                                                               
<exclude>**/*TestHiveEmbedded.java</exclude>
+                                                               
<exclude>**/*TestCrunch.java</exclude>
+                                                               
<exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+                                                       </excludes>
+                                               </configuration>
+                                       </plugin>
+                                       <!-- Registers the integration-test sources as an additional test
+                                               source root. The path must match the directory that actually
+                                               holds the *IT classes (src/integrationTest/java). -->
+                                       <plugin>
+                                               <groupId>org.codehaus.mojo</groupId>
+                                               <artifactId>build-helper-maven-plugin</artifactId>
+                                               <version>1.5</version>
+                                               <executions>
+                                                       <execution>
+                                                               <id>add-test-source</id>
+                                                               <phase>generate-test-sources</phase>
+                                                               <goals>
+                                                                       <goal>add-test-source</goal>
+                                                               </goals>
+                                                               <configuration>
+                                                                       <sources>
+                                                                               <source>src/integrationTest/java</source>
+                                                                       </sources>
+                                                               </configuration>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                                       <plugin>
+                                               
<groupId>org.apache.maven.plugins</groupId>
+                                               
<artifactId>maven-failsafe-plugin</artifactId>
+                                               <version>2.12</version>
+                                               <configuration>
+                                                       <excludes>
+                                                               
<exclude>**/*BigPetStorePigIT.java</exclude>
+                                                               
<exclude>**/*BigPetStoreCrunchIT.java</exclude>
+                                                               
<exclude>**/*BigPetStoreHiveIT.java</exclude>
+                                                       </excludes>
+                                               </configuration>
+                                               <executions>
+                                                       <!-- States that both 
integration-test and verify goals of the Failsafe
+                                                               Maven plugin 
are executed. -->
+                                                       <execution>
+                                                               
<id>integration-tests</id>
+                                                               <goals>
+                                                                       
<goal>integration-test</goal>
+                                                                       
<goal>verify</goal>
+                                                               </goals>
+                                                       </execution>
+                                               </executions>
+                                       </plugin>
+                               </plugins>
+                       </build>
+               </profile>
+       </profiles>
+</project>

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle 
b/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle
new file mode 100644
index 0000000..53d74f2
--- /dev/null
+++ b/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle
@@ -0,0 +1,18 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Name of the root project for this Gradle build.
+rootProject.name = 'BigPetStore'

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java
----------------------------------------------------------------------
diff --git 
a/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java
 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java
new file mode 100644
index 0000000..b07c5a0
--- /dev/null
+++ 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore;
+
+import static org.apache.bigtop.bigpetstore.ITUtils.createTestOutputPath;
+import static org.apache.bigtop.bigpetstore.ITUtils.setup;
+
+import java.util.regex.Pattern;
+
+import org.apache.bigtop.bigpetstore.recommend.ItemRecommender;
+import 
org.apache.bigtop.bigpetstore.util.BigPetStoreConstants.OUTPUTS.MahoutPaths;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.base.Predicate;
+
+/**
+ * Integration test that runs {@link ItemRecommender} against the Mahout input
+ * directory nested under the cleaned Pig output, then checks the format of the
+ * lines found in the recommendations directory.
+ */
+public class BigPetStoreMahoutIT {
+
+  /** Mahout input directory, located under the cleaned Pig output. */
+  public static final Path INPUT_DIR_PATH =
+      new Path(ITUtils.BPS_TEST_PIG_CLEANED, MahoutPaths.Mahout.name());
+
+  /** String form of {@link #INPUT_DIR_PATH}. */
+  public static final String INPUT_DIR_PATH_STR = INPUT_DIR_PATH.toString();
+
+  /** Root of this test's output; deleted recursively before every test. */
+  private static final Path MAHOUT_OUTPUT_DIR =
+      createTestOutputPath(MahoutPaths.Mahout.name());
+
+  /** ALS factorization directory passed to the recommender. */
+  private static final Path ALS_FACTORIZATION_OUTPUT_DIR =
+      createTestOutputPath(MahoutPaths.Mahout.name(), MahoutPaths.AlsFactorization.name());
+
+  /** ALS recommendations directory passed to the recommender and verified afterwards. */
+  private static final Path ALS_RECOMMENDATIONS_DIR =
+      createTestOutputPath(MahoutPaths.Mahout.name(), MahoutPaths.AlsRecommendations.name());
+
+  /**
+   * Accepts a line made of digits, one whitespace character, and a single
+   * bracketed {@code digits:digits.digits} pair, e.g. {@code 12 [34:5.6]}.
+   */
+  private static final Predicate<String> TEST_OUTPUT_FORMAT = new Predicate<String>() {
+    private final Pattern linePattern =
+        Pattern.compile("^\\d+\\s\\[\\d+:\\d+\\.\\d+\\]$");
+
+    @Override
+    public boolean apply(String input) {
+      return linePattern.matcher(input).matches();
+    }
+  };
+
+  private ItemRecommender itemRecommender;
+
+  /**
+   * Runs the shared integration-test setup, removes output left over from an
+   * earlier run and builds the recommender under test.
+   *
+   * @throws Throwable whatever {@code ITUtils.setup()} throws; any exception
+   *         raised while cleaning or constructing is rethrown wrapped in a
+   *         {@link RuntimeException}
+   */
+  @Before
+  public void setupTest() throws Throwable {
+    setup();
+    try {
+      // Clear previous output so the recommender starts from an empty directory.
+      FileSystem.get(new Configuration()).delete(MAHOUT_OUTPUT_DIR, true);
+
+      String factorizationDir = ALS_FACTORIZATION_OUTPUT_DIR.toString();
+      String recommendationsDir = ALS_RECOMMENDATIONS_DIR.toString();
+      itemRecommender =
+          new ItemRecommender(INPUT_DIR_PATH_STR, factorizationDir, recommendationsDir);
+    } catch (Exception cause) {
+      throw new RuntimeException(cause);
+    }
+  }
+
+  /**
+   * Runs the recommender and verifies the recommendations directory against
+   * {@link #TEST_OUTPUT_FORMAT}.
+   */
+  @Test
+  public void testPetStorePipeline() throws Exception {
+    itemRecommender.recommend();
+    ITUtils.assertOutput(ALS_RECOMMENDATIONS_DIR, TEST_OUTPUT_FORMAT);
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java
----------------------------------------------------------------------
diff --git 
a/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java
 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java
new file mode 100644
index 0000000..78d5c6b
--- /dev/null
+++ 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore;
+
+import static org.apache.bigtop.bigpetstore.ITUtils.BPS_TEST_GENERATED;
+import static org.apache.bigtop.bigpetstore.ITUtils.BPS_TEST_PIG_CLEANED;
+import static org.apache.bigtop.bigpetstore.ITUtils.fs;
+
+import java.io.File;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.bigtop.bigpetstore.etl.PigCSVCleaner;
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.pig.ExecType;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.ImmutableMap;
+
+/**
+ * This is the main integration test for pig. Like all BPS integration tests,
+ * it is designed to simulate exactly what will happen on the actual cluster,
+ * except with a small amount of records.
+ *
+ * In addition to cleaning the dataset, it also runs the BPS_analytics.pig
+ * script which BigPetStore ships with.
+ */
+public class BigPetStorePigIT {
+
+       final static Logger log = LoggerFactory.getLogger(BigPetStorePigIT.class);
+
+       /**
+        * An extra unsupported code path that we have so people can do ad hoc
+        * analytics on pig data after it is cleaned. Rooted at the shared
+        * integration-test output directory.
+        */
+       public static final Path BPS_TEST_PIG_COUNT_PRODUCTS = fs
+                       .makeQualified(new Path(ITUtils.TEST_OUTPUT_DIR,
+                                       BigPetStoreConstants.OUTPUTS.pig_ad_hoc_script.name() + "0"));
+
+       /** Pig script to run, resolved relative to the working directory. */
+       static final File PIG_SCRIPT = new File("BPS_analytics.pig");
+
+       static {
+               // Fail at class-load time if the script is missing, rather than midway through a run.
+               if (!PIG_SCRIPT.exists()) {
+                       throw new RuntimeException("Couldnt find pig script at " + PIG_SCRIPT.getAbsolutePath());
+               }
+       }
+
+       /**
+        * Runs the shared integration-test setup and deletes the output
+        * directories of a previous run.
+        *
+        * @throws Throwable whatever {@code ITUtils.setup()} throws; any
+        *         exception raised while deleting is rethrown wrapped in a
+        *         {@link RuntimeException}
+        */
+       @Before
+       public void setupTest() throws Throwable {
+               ITUtils.setup();
+               try {
+                       // Resolve the file system once and reuse it for both deletes.
+                       FileSystem fileSystem = FileSystem.get(new Configuration());
+                       fileSystem.delete(BPS_TEST_PIG_CLEANED, true);
+                       fileSystem.delete(BPS_TEST_PIG_COUNT_PRODUCTS, true);
+               } catch (Exception e) {
+                       throw new RuntimeException(e);
+               }
+       }
+
+       /** Output directories to verify, each paired with the predicate used to check it. */
+       static final Map<Path, Predicate<String>> TESTS = ImmutableMap.of(
+               // Test of the main output
+               BPS_TEST_PIG_CLEANED, ITUtils.VERIFICATION_PERDICATE,
+               // Example of how to count products after doing basic pig data cleanup
+               BPS_TEST_PIG_COUNT_PRODUCTS, ITUtils.VERIFICATION_PERDICATE,
+               // Test the output that is to be used as an input for Mahout.
+               BigPetStoreMahoutIT.INPUT_DIR_PATH, ITUtils.VERIFICATION_PERDICATE
+       );
+
+       /**
+        * Runs the pig step over the generated data, then verifies every
+        * directory listed in {@link #TESTS}.
+        */
+       @Test
+       public void testPetStoreCorePipeline() throws Exception {
+               runPig(BPS_TEST_GENERATED, BPS_TEST_PIG_CLEANED, PIG_SCRIPT);
+               for (Entry<Path, Predicate<String>> e : TESTS.entrySet()) {
+                       ITUtils.assertOutput(e.getKey(), e.getValue());
+               }
+       }
+
+       /**
+        * Constructs a {@link PigCSVCleaner} in local execution mode for the
+        * given input, output and script. The instance is discarded, so the
+        * constructor is presumably what performs the work (confirm against
+        * PigCSVCleaner).
+        */
+       private void runPig(Path input, Path output, File pigscript)
+                       throws Exception {
+               new PigCSVCleaner(input, output, ExecType.LOCAL, pigscript);
+       }
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java
----------------------------------------------------------------------
diff --git 
a/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java
 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java
new file mode 100644
index 0000000..fd53dc1
--- /dev/null
+++ 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.InetAddress;
+import java.nio.charset.Charset;
+import java.util.List;
+
+import org.apache.bigtop.bigpetstore.generator.BPSGenerator;
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.junit.Assert;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Predicate;
+import com.google.common.io.Files;
+
+public class ITUtils {
+  public static final Path TEST_OUTPUT_DIR = new Path("bps_integration_");
+
+  public static Predicate<String> VERIFICATION_PERDICATE = new 
Predicate<String>() {
+    @Override
+    public boolean apply(String input) {
+      return true;
+    }
+  };
+
+       static final Logger log = LoggerFactory.getLogger(ITUtils.class);
+
+       static FileSystem fs;
+       static {
+               try {
+                       fs = FileSystem.getLocal(new Configuration());
+               } catch (Throwable e) {
+                       String cpath = (String) 
System.getProperties().get("java.class.path");
+                       String msg = "";
+                       for (String cp : cpath.split(":")) {
+                               if (cp.contains("hadoop")) {
+                                       msg += cp.replaceAll("hadoop", 
"**HADOOP**") + "\n";
+                               }
+                       }
+                       throw new RuntimeException("Major error:  Probably 
issue.   "
+                               + "Check hadoop version?  " + e.getMessage()
+                               + " .... check these classpath elements:" + 
msg);
+               }
+       }
+
+       public static final Path BPS_TEST_GENERATED =
+               
createTestOutputPath(BigPetStoreConstants.OUTPUTS.generated.name());
+       public static final Path BPS_TEST_PIG_CLEANED =
+               createTestOutputPath 
(BigPetStoreConstants.OUTPUTS.cleaned.name());
+
+       public static Path createTestOutputPath(String... pathParts) {
+         Path path = TEST_OUTPUT_DIR;
+         for(String pathPart: pathParts) {
+           path = new Path(path, pathPart);
+         }
+         return path;
+       }
+
+       /**
+        * Some simple checks to make sure that unit tests in local FS. these 
arent
+        * designed to be run against a distribtued system.
+        */
+       public static void checkConf(Configuration conf) throws Exception {
+               if (conf.get("mapreduce.jobtracker.address") == null) {
+                       log.warn("Missing 
mapreduce.jobtracker.address???????!!!! " + "This can be the case in hive tests 
which use special "
+                                       + "configurations, but we should fix it 
sometime.");
+                       return;
+               }
+               if (!conf.get("mapreduce.jobtracker.address").equals("local")) {
+                       throw new RuntimeException("ERROR: bad conf : " + 
"mapreduce.jobtracker.address");
+               }
+               if 
(!conf.get("fs.AbstractFileSystem.file.impl").contains("Local")) {
+                       throw new RuntimeException("ERROR: bad conf : " + 
"mapreduce.jobtracker.address");
+               }
+               try {
+                       InetAddress addr = java.net.InetAddress.getLocalHost();
+                       System.out.println("Localhost = hn=" + 
addr.getHostName() + " / ha=" + addr.getHostAddress());
+               } catch (Throwable e) {
+                       throw new RuntimeException(" ERROR : Hadoop wont work 
at all  on this machine yet"
+                                       + "...I can't get / resolve localhost ! 
Check java version/ " + "/etc/hosts / DNS or other networking related issues on 
your box"
+                                       + e.getMessage());
+               }
+       }
+
+       /**
+        * Creates a generated input data set in
+        *
+        * test_data_directory/generated. i.e.
+        * test_data_directory/generated/part-r-00000
+        */
+       public static void setup() throws Throwable {
+               Configuration conf = new Configuration();
+
+               // debugging for Jeff and others in local fs that won't build
+               checkConf(conf);
+
+               conf.setInt(BPSGenerator.props.bigpetstore_records.name(), 
BPSGenerator.DEFAULT_NUM_RECORDS);
+
+               if (FileSystem.getLocal(conf).exists(BPS_TEST_GENERATED)) {
+                       return;
+               }
+
+               Job createInput = 
BPSGenerator.getCreateTransactionRecordsJob(BPS_TEST_GENERATED, conf);
+               createInput.waitForCompletion(true);
+
+               Path outputfile = new Path(BPS_TEST_GENERATED, "part-r-00000");
+               List<String> lines = 
Files.readLines(FileSystem.getLocal(conf).pathToFile(outputfile), 
Charset.defaultCharset());
+               log.info("output : " + 
FileSystem.getLocal(conf).pathToFile(outputfile));
+               for (String l : lines) {
+                       System.out.println(l);
+               }
+       }
+
+
+       // A functions that logs the output file as a verification test
+       public static void assertOutput(Path base, Predicate<String> validator) 
throws Exception {
+         FileSystem fs = FileSystem.getLocal(new Configuration());
+
+         FileStatus[] files = fs.listStatus(base);
+         // print out all the files.
+         for (FileStatus stat : files) {
+           System.out.println(stat.getPath() + "  " + stat.getLen());
+         }
+
+         /**
+          * Support map OR reduce outputs
+          */
+         Path partm = new Path(base, "part-m-00000");
+         Path partr = new Path(base, "part-r-00000");
+         Path p = fs.exists(partm) ? partm : partr;
+
+         /**
+          * Now we read through the file and validate its contents.
+          */
+         BufferedReader r = new BufferedReader(new 
InputStreamReader(fs.open(p)));
+
+         // line:{"product":"big chew toy","count":3}
+         while (r.ready()) {
+           String line = r.readLine();
+           log.info("line:" + line);
+           // System.out.println("line:"+line);
+           Assert.assertTrue("validationg line : " + line, 
validator.apply(line));
+         }
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java
----------------------------------------------------------------------
diff --git 
a/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java
 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java
new file mode 100755
index 0000000..158f875
--- /dev/null
+++ 
b/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.bigtop.bigpetstore.contract;
+
+import java.util.Map;
+
+/**
+ * This is the contract for the web site. This object is created by each ETL
+ * tool : Summary stats.
+ */
+public abstract class PetStoreStatistics {
+
+    /**
+     * Returns a numeric value per String key.
+     *
+     * NOTE(review): judging by the name, keys are presumably states and values
+     * transaction counts -- confirm against the implementations.
+     */
+    public abstract Map<String, ? extends Number> numberOfTransactionsByState()
+            throws Exception;
+
+    /**
+     * Returns a numeric value per String key.
+     *
+     * NOTE(review): judging by the name, keys are presumably product names and
+     * values product counts -- confirm against the implementations.
+     */
+    public abstract Map<String, ? extends Number> numberOfProductsByProduct()
+            throws Exception;
+
+}
\ No newline at end of file

[5/5] bigtop git commit: Add BPS Spark driver for new data generator. Re-organize BPS into MapReduce and Spark versions.

Reply via email to