Add BPS Spark driver for new data generator. Re-organize BPS into MapReduce and Spark versions.
[BIGTOP-1366][BigPetStore] Fix bigpetstore-mapreduce/build.gradle pom.xml path [BIGTOP-1366][BigPetStore] Refactor SparkDriver into more easily tested functions [BIGTOP-1366][BigPetStore] Add unit test for Spark Driver [BIGTOP-1366][BigPetStore] Update BPS Spark README to mention tests Signed-off-by: jayunit100 <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/6ec6cebf Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/6ec6cebf Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/6ec6cebf Branch: refs/heads/master Commit: 6ec6cebfc56f05520e477768441eaecdc505ede8 Parents: 801bade Author: RJ Nowling <[email protected]> Authored: Fri Nov 14 14:42:13 2014 -0500 Committer: jayunit100 <[email protected]> Committed: Wed Nov 19 18:43:04 2014 -0500 ---------------------------------------------------------------------- bigtop-bigpetstore/BPS_analytics.pig | 79 --- bigtop-bigpetstore/README.md | 214 +------ bigtop-bigpetstore/arch.dot | 41 -- .../bigpetstore-mapreduce/BPS_analytics.pig | 79 +++ .../bigpetstore-mapreduce/README.md | 201 +++++++ .../bigpetstore-mapreduce/arch.dot | 41 ++ .../bigpetstore-mapreduce/build.gradle | 292 ++++++++++ .../bigpetstore-mapreduce/pom.xml | 584 +++++++++++++++++++ .../bigpetstore-mapreduce/settings.gradle | 18 + .../bigtop/bigpetstore/BigPetStoreMahoutIT.java | 73 +++ .../bigtop/bigpetstore/BigPetStorePigIT.java | 100 ++++ .../org/apache/bigtop/bigpetstore/ITUtils.java | 168 ++++++ .../contract/PetStoreStatistics.java | 34 ++ .../bigtop/bigpetstore/etl/CrunchETL.java | 142 +++++ .../apache/bigtop/bigpetstore/etl/LineItem.java | 112 ++++ .../bigtop/bigpetstore/etl/PigCSVCleaner.java | 156 +++++ .../bigpetstore/generator/BPSGenerator.java | 108 ++++ .../generator/CustomerGenerator.scala | 97 +++ .../generator/PetStoreTransaction.java | 32 + .../PetStoreTransactionInputSplit.java | 73 +++ .../PetStoreTransactionsInputFormat.java | 139 +++++ 
.../bigpetstore/generator/util/Product.java | 80 +++ .../bigpetstore/generator/util/ProductType.java | 46 ++ .../bigpetstore/generator/util/State.java | 43 ++ .../bigpetstore/recommend/ItemRecommender.scala | 121 ++++ .../bigpetstore/util/BigPetStoreConstants.java | 41 ++ .../bigtop/bigpetstore/util/DeveloperTools.java | 58 ++ .../bigpetstore/util/NumericalIdUtils.java | 48 ++ .../util/PetStoreParseFunctions.java | 55 ++ .../bigtop/bigpetstore/util/StringUtils.java | 53 ++ .../bigpetstore/generator/DataForger.scala | 280 +++++++++ .../generator/TransactionIteratorFactory.scala | 106 ++++ .../bigtop/bigpetstore/docs/TestDocs.java | 37 ++ .../generator/TestNumericalIdUtils.java | 35 ++ .../TestPetStoreTransactionGeneratorJob.java | 104 ++++ .../src/test/resources/log4j.properties | 47 ++ .../bigtop/bigpetstore/ScalaTestSample.scala | 35 ++ bigtop-bigpetstore/bigpetstore-spark/README.md | 43 ++ .../bigpetstore-spark/build.gradle | 137 +++++ .../spark/generator/SparkDriver.scala | 244 ++++++++ .../spark/generator/SparkDriverSuite.scala | 60 ++ bigtop-bigpetstore/build.gradle | 292 ---------- bigtop-bigpetstore/pom.xml | 584 ------------------- bigtop-bigpetstore/settings.gradle | 18 - .../bigtop/bigpetstore/BigPetStoreMahoutIT.java | 73 --- .../bigtop/bigpetstore/BigPetStorePigIT.java | 100 ---- .../org/apache/bigtop/bigpetstore/ITUtils.java | 168 ------ .../contract/PetStoreStatistics.java | 34 -- .../bigtop/bigpetstore/etl/CrunchETL.java | 142 ----- .../apache/bigtop/bigpetstore/etl/LineItem.java | 112 ---- .../bigtop/bigpetstore/etl/PigCSVCleaner.java | 156 ----- .../bigpetstore/generator/BPSGenerator.java | 108 ---- .../generator/CustomerGenerator.scala | 97 --- .../generator/PetStoreTransaction.java | 32 - .../PetStoreTransactionInputSplit.java | 73 --- .../PetStoreTransactionsInputFormat.java | 139 ----- .../bigpetstore/generator/util/Product.java | 80 --- .../bigpetstore/generator/util/ProductType.java | 46 -- .../bigpetstore/generator/util/State.java | 43 -- 
.../bigpetstore/recommend/ItemRecommender.scala | 121 ---- .../bigpetstore/util/BigPetStoreConstants.java | 41 -- .../bigtop/bigpetstore/util/DeveloperTools.java | 58 -- .../bigpetstore/util/NumericalIdUtils.java | 48 -- .../util/PetStoreParseFunctions.java | 55 -- .../bigtop/bigpetstore/util/StringUtils.java | 53 -- .../bigpetstore/generator/DataForger.scala | 280 --------- .../generator/TransactionIteratorFactory.scala | 106 ---- .../bigtop/bigpetstore/docs/TestDocs.java | 37 -- .../generator/TestNumericalIdUtils.java | 35 -- .../TestPetStoreTransactionGeneratorJob.java | 104 ---- .../src/test/resources/log4j.properties | 47 -- .../bigtop/bigpetstore/ScalaTestSample.scala | 35 -- 72 files changed, 4145 insertions(+), 3628 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/BPS_analytics.pig ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/BPS_analytics.pig b/bigtop-bigpetstore/BPS_analytics.pig deleted file mode 100755 index 23e3749..0000000 --- a/bigtop-bigpetstore/BPS_analytics.pig +++ /dev/null @@ -1,79 +0,0 @@ ----------------------------------------------------------------------------- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with --- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 --- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, software --- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
--- See the License for the specific language governing permissions and --- limitations under the License. ------------------------------------------------------------------------------ - --- This is the analytics script that BigPetStore uses as an example for --- demos of how to do ad-hoc analytics on the cleaned transaction data. --- It is used in conjunction with the big pet store web app, soon to be --- added to apache bigtop (As of 4/12/2014, the --- corresponding web app to consume this scripts output is --- in jayunit100.github.io/bigpetstore). - --- invoke with two arguments, the input file , and the output file. -input /bps/gen -output /bps/analytics - --- FYI... --- If you run into errors, you can see them in --- ./target/failsafe-reports/TEST-org.bigtop.bigpetstore.integration.BigPetStorePigIT.xml - --- First , we load data in from a file, as tuples. --- in pig, relations like tables in a relational database --- so each relation is just a bunch of tuples. --- in this case csvdata will be a relation, --- where each tuple is a single petstore transaction. -csvdata = - LOAD '$input' using PigStorage() - AS ( - dump:chararray, - state:chararray, - transaction:int, - custId:long, - fname:chararray, - lname:chararray, - productId:int, - product:chararray, - price:float, - date:chararray); - --- RESULT: --- (BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 EST 1969) --- ... - --- Okay! Now lets group our data so we can do some stats. --- lets create a new relation, --- where each tuple will contain all transactions for a product in a state. - -state_product = group csvdata by ( state, product ) ; - --- RESULT --- ((storeCode_AK,dog-food) , {(BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 EST 1969)}) -- --- ... - - --- Okay now lets make some summary stats so that the boss man can --- decide which products are hottest in which states. 
- --- Note that for the "groups", we tease out each individual field here for formatting with --- the BigPetStore visualization app. -summary1 = FOREACH state_product generate STRSPLIT(group.state,'_').$1 as sp, group.product, COUNT($1); - - --- Okay, the stats look like this. Lets clean them up. --- (storeCode_AK,cat-food) 2530 --- (storeCode_AK,dog-food) 2540 --- (storeCode_AK,fuzzy-collar) 2495 - -dump summary1; - -store summary1 into '$output'; http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/README.md ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/README.md b/bigtop-bigpetstore/README.md index c806d57..6f04e8f 100644 --- a/bigtop-bigpetstore/README.md +++ b/bigtop-bigpetstore/README.md @@ -1,201 +1,33 @@ -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. See the NOTICE file distributed with -this work for additional information regarding copyright ownership. -The ASF licenses this file to You under the Apache License, Version 2.0 -(the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -(See accompanying source code for licensing information) - BigPetStore ============ -Apache Bigtop/Hadoop Ecosystem Demo ------------------------------------ -This software is created to demonstrate Apache Bigtop for processing -big data sets. 
- -Architecture ------------- -The application consists of the following modules - -* generator: generates raw data on the dfs -* recommendations: Apache Mahout demo code for generating recommendations by anaylyzing the transaction records. This feature can be tracked at this [`JIRA` issue](https://issues.apache.org/jira/browse/BIGTOP-1272) -* Pig: demo code for processing the data using Apache Pig -* Hive: demo code for processing the data using Apache Hive. This part is not complete yet. We are working on it. You can track it using this [`JIRA` issue](https://issues.apache.org/jira/browse/BIGTOP-1270) -* Crunch: demo code for processing the data using Apache Crunch - -Build Instructions ------------------- - -You'll need to have version 2.0 of [`gradle`](http://www.gradle.org/downloads) installed and set-up correctly in order to follow along these instructions. -We could have used the [`gradle-wrapper`](http://www.gradle.org/docs/current/userguide/gradle_wrapper.html) to avoid having to install `gradle`, but the `bigtop` project includes all `gradle*` directories in `.gitignore`. So, that's not going to work. - -### Build the JAR - -`gradle clean build` will build the bigpetstore `jar`. The `jar` will be located in the `build\libs` directory. - -### Run Intergration Tests With - * Pig profile: `gradle clean integrationTest -P ITProfile=pig` - * Mahout Profile: `gradle clean integrationTest -P ITProfile=mahout` - * Crunch profile: Not Implemented Yet - * Hive profile: Not implemented yet. - -If you don't specify any profile-name, or if you specify an invalid-name for the `integrationTest` task, no integration tests will be run. - -*Note:* At this stage, only the `Pig` and `Mahout` profiles are working. Will continue to update this area as further work is completed. - -For Eclipse Users ------------------ - -1. Run `gradle eclipse` to create an eclipse project. -2. Import the project into eclipse. 
- -*Note* whenever you modify the dependencies, you will need to run the `gradle eclipse` again. Refresh the project after doing so. You'd also need to have the `scala` plugin installed. Also, having a `gradle` plugin would be quite useful as well, for ex. when you want to update dependencies. - -High level summary ------------------- - -The bigpetstore project exemplifies the hadoop ecosystem for newcomers, and also for benchmarking and -comparing functional space of tools. - -The end goal is to run many different implementations of each phase -using different tools, thus exemplifying overlap of tools in the hadoop ecosystem, and allowing people to benchmark/compare tools -using a common framework and easily understood use case - - -How it works (To Do) --------------------- - -### Phase 1: Generating pet store data: - -The first step is to generate a raw data set. This is done by the "GeneratePetStoreTransactionsInputFormat": - -The first MapReduce job in the pipeline runs a simple job which takes this input format and forwards -its output. The result is a list of "transactions". Each transaction is a tuple of the format - - *{state,name,date,price,product}.* - -### Phase 2: Processing the data - -The next phase of the application processes the data to create basic aggregations. For example with both pig and hive these could easily include - -- *Number of transactions by state* or -- *Most valuable customer by state* or -- *Most popular items by state* - - -### Phase 3: Clustering the states by all fields - - Now, say we want to cluster the states, so as to put different states into different buying categories - for our marketing team to deal with differently. - -### Phase 4: Visualizing the Data in D3. 
- - - try it [on the gh-pages branch](http://jayunit100.github.io/bigpetstore/) - - -Running on a hadoop cluster ---------------------------- - -*Note:* For running the code using the `hadoop jar` command instead of the `gradle` tasks, you will need to set the classpath appropriately. The discussion after [this comment][jira-mahout] in JIRA could also be useful apart from these instructions. - -### Build the fat-jar - -We are going to use a fat-jar in order to avoid specifying the entire classpath ourselves. - -The fat-jar is required when we are running the application on a hadoop cluster. The other way would be to specify all the dependencies (including the transitive ones) manually while running the hadoop job. Fat-jars make it easier to bundle almost all the dependencies inside the distribution jar itself. - -``` -gradle clean shadowJar -Pfor-cluster -``` - -This command will build the fat-jar with all the dependencies bundled in except the hadoop, mahout and pig dependencies, which we'll specify using `-libjars` option while running the hadoop job. These dependencies are excluded to avoid conflicts with the jars provided by hadoop itself. - -The generated jar will be inside the `build/libs` dir, with name like `BigPetStore-x.x.x-SNAPSHOT-all.jar`. For the remainig discussion I'll refer to this jar by `bps.jar`. - -### Get the mahout and pig jars - -You'll need both mahout and pig jars with the hadoop classes excluded. Commonly, you can find both of these in their respective distros. The required pig jar is generally named like `pig-x.x.x-withouthadoop.jar` and the mahout jar would be named like `mahout-core-job.jar`. If you want, you can build those yourself by following the instructions in [this JIRA comment][jira-mahout]]. For the remaining discussion, I am going to refer to these two jars by `pig-withouthadoop.jar` and `mahout-core-job.jar`. 
- -### Setup the classpath for hadoop nodes in the cluster - -``` -export JARS="/usr/lib/pig/pig-withouthadoop.jar,/usr/lib/mahout/mahout-core-job.jar" -``` - -We also need these jars to be present on the client side to kick-off the jobs. Reusing the `JARS` variable to put the same jars on the client classpath. - -``` -export HADOOP_CLASSPATH=`echo $JARS | sed s/,/:/g` -``` - -### Generate the data - -``` -hadoop jar bps.jar org.apache.bigtop.bigpetstore.generator.BPSGenerator 1000000 bigpetstore/gen -``` - -### Clean with pig - -``` -hadoop jar bps.jar org.apache.bigtop.bigpetstore.etl.PigCSVCleaner -libjars $JARS bigpetstore/gen/ bigpetstore/ custom_pigscript.pig -``` - -### Analyze and generate recommendations with mahout - -``` -hadoop jar bps.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender -libjars $JARS bigpetstore/pig/Mahout bigpetstore/Mahout/AlsFactorization bigpetstore/Mahout/AlsRecommendations -``` - - -... (will add more steps as we add more phases to the workflow) ... - - -Example of running in EMR --------------------------- -- Put the jar in s3. Right now there is a copy of it at the url below. - -- Download the elastic-mapreduce ruby shell script. -create your "credentials.json" file. - -Now run this to generate 1,000,000 pet store transactions: - -./elastic-mapreduce --create --jar s3://bigpetstore/bigpetstore.jar \ ---main-class org.apache.bigtop.bigpetstore.generator.BPSGenerator \ ---num-instances 10 \ ---arg 1000000 \ ---arg s3://bigpetstore/data/generated \ ---hadoop-version "2.2.0" \ ---master-instance-type m1.medium \ ---slave-instance-type m1.medium +BigPetStore is a family of example applications for the Hadoop/Spark +ecosystems. BigPetStore generates and analyzes synthetic transaction data for +a fictional chain of petstores. -...Now lets clean the data with pig... 
+BigPetStore has the following aims: -Replace the above "main-class", and "--arg" options with ---main-class org.apache.bigtop.bigpetstore.etl.PigCSVCleaner ---arg s3://bigpetstore/data/generated ---arg s3://bigpetstore/data/pig_out -(optional, you can send a script referencing the cleaned $input path to do some -custom analytics, see the BPS_Analytics.pig script and companion -http://jayunit100.github.io/bigpetstore) as an example). ---arg s3://path_to_custom_analytics_script.pig +* Serve as a demo application to showcase capabilities of the BigTop distribution +* Perform integration testing for BigTop's components +* Serve as a template for building / packaging Hadoop/Spark applications +* Provide scalable generation of complex synthetic data +* Examples for using and integrating components such as Pig, Hive, Spark SQL, etc. +* Examples of how to perform popular analytics tasks -(note about pig: We support custom pig scripts.... for EMR, custom pig scripts will need to point to a -local path, so youll have to put that script on the machine as part -of EMR setup w/ a custom script). +BigPetStore has the following components to date: -... +* Gradle build systems supporting Java, Scala, and Groovy +* Data generators +* Analytics + * ETL + * Item Recommenders -And so on. +The BigPetStore application was originally developed for MapReduce and associated +components such as Pig, Hive, Mahout, Crunch, etc. With the increasing popularity +and importance of Spark, BigPetStore has been expanded to support Spark. To support +the use case of deploying to pure MapReduce or Spark environments, we've elected to +separate the MapReduce and Spark support into separate applications. You can find the +two applications, along with further documentation, under `bigpetstore-mapreduce` and +`bigpetstore-spark`, respectively. 
-[jira-mahout]: https://issues.apache.org/jira/browse/BIGTOP-1272?focusedCommentId=14076023&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-1407602 http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/arch.dot ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/arch.dot b/bigtop-bigpetstore/arch.dot deleted file mode 100644 index 7d17c5a..0000000 --- a/bigtop-bigpetstore/arch.dot +++ /dev/null @@ -1,41 +0,0 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -digraph bigpetstore { - - node [shape=record]; - - - BPSAnalytics [label="BPSAnalytics.pig" ,style="rounded, filled", shape=diamond]; - CUSTOMER_PAGE [label="CUSTOMER_PAGE|json|CUSTOMER_PAGE/part*"]; - DIRTY_CSV [label="DIRTY_CSV|fname lname -prod , price ,prod,..|generated/part*"]; - CSV [label="CSV|fname,lname,prod,price,date,xcoord,ycoord,...|cleaned/part*"]; - MAHOUT_VIEW_INPUT [label="MAHOUT_VIEW | (user-id) 10001 (product-id) 203 (implicit-rating) 1 | cleaned/Mahout/part*" ]; - MAHOUT_ALS [label="Parallel ALS Recommender output | (user-id) 10001 [(product-id) 201: (recommendation-strength 0-1)0.546] | Mahout/AlsRecommendations/part*" ]; - - Generate -> DIRTY_CSV [label="hadoop jar bigpetstore.jar org.bigtop.bigpetstore.generator.BPSGenerator 100 bps/generated/"] ; - DIRTY_CSV -> pig [label="hadoop jar bigpetstore.jar org.bigtop.bigpetstore.etl.PigCSVCleaner bps/generated/ bps/cleaned/ "]; - - pig -> CSV [label="pig query to clean up generated transaction records"]; - pig -> MAHOUT_VIEW_INPUT [label="pig query to produce mahout input format"]; - - MAHOUT_VIEW_INPUT -> ParallelALSFactorizationJob [label="hadoop jar bigpetstore.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender cleaned/Mahout Mahout/AlsFactorization Mahout/AlsRecommendations"]; - ParallelALSFactorizationJob -> "Mahout RecommenderJob" - "Mahout RecommenderJob" -> MAHOUT_ALS - - CSV -> BPSAnalytics; - BPSAnalytics -> pig_job2; - pig_job2 -> CUSTOMER_PAGE [label=""]; -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig b/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig new file mode 100755 index 0000000..8516a7d --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/BPS_analytics.pig @@ -0,0 +1,79 @@ 
+---------------------------------------------------------------------------- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +----------------------------------------------------------------------------- + +-- This is the analytics script that BigPetStore uses as an example for +-- demos of how to do ad-hoc analytics on the cleaned transaction data. +-- It is used in conjunction with the big pet store web app, soon to be +-- added to apache bigtop (As of 4/12/2014, the +-- corresponding web app to consume this scripts output is +-- in jayunit100.github.io/bigpetstore). + +-- invoke with two arguments, the input file , and the output file. -input /bps/gen -output /bps/analytics + +-- FYI... +-- If you run into errors, you can see them in +-- ./target/failsafe-reports/TEST-org.bigtop.bigpetstore.integration.BigPetStorePigIT.xml + +-- First , we load data in from a file, as tuples. +-- in pig, relations like tables in a relational database +-- so each relation is just a bunch of tuples. +-- in this case csvdata will be a relation, +-- where each tuple is a single petstore transaction. 
+csvdata = + LOAD '$input' using PigStorage() + AS ( + dump:chararray, + state:chararray, + transaction:int, + custId:long, + fname:chararray, + lname:chararray, + productId:int, + product:chararray, + price:float, + date:chararray); + +-- RESULT: +-- (BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 EST 1969) +-- ... + +-- Okay! Now lets group our data so we can do some stats. +-- lets create a new relation, +-- where each tuple will contain all transactions for a product in a state. + +state_product = group csvdata by ( state, product ) ; + +-- RESULT +-- ((storeCode_AK,dog-food) , {(BigPetStore,storeCode_AK,1,11,jay,guy,3,dog-food,10.5,Thu Dec 18 12:17:10 EST 1969)}) -- +-- ... + + +-- Okay now lets make some summary stats so that the boss man can +-- decide which products are hottest in which states. + +-- Note that for the "groups", we tease out each individual field here for formatting with +-- the BigPetStore visualization app. +summary1 = FOREACH state_product generate STRSPLIT(group.state,'_').$1 as sp, group.product, COUNT($1); + + +-- Okay, the stats look like this. Lets clean them up. +-- (storeCode_AK,cat-food) 2530 +-- (storeCode_AK,dog-food) 2540 +-- (storeCode_AK,fuzzy-collar) 2495 + +dump summary1; + +store summary1 into '$output'; http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/README.md ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/README.md b/bigtop-bigpetstore/bigpetstore-mapreduce/README.md new file mode 100644 index 0000000..c806d57 --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/README.md @@ -0,0 +1,201 @@ +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. 
+The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +(See accompanying source code for licensing information) + +BigPetStore +============ + +Apache Bigtop/Hadoop Ecosystem Demo +----------------------------------- +This software is created to demonstrate Apache Bigtop for processing +big data sets. + +Architecture +------------ +The application consists of the following modules + +* generator: generates raw data on the dfs +* recommendations: Apache Mahout demo code for generating recommendations by anaylyzing the transaction records. This feature can be tracked at this [`JIRA` issue](https://issues.apache.org/jira/browse/BIGTOP-1272) +* Pig: demo code for processing the data using Apache Pig +* Hive: demo code for processing the data using Apache Hive. This part is not complete yet. We are working on it. You can track it using this [`JIRA` issue](https://issues.apache.org/jira/browse/BIGTOP-1270) +* Crunch: demo code for processing the data using Apache Crunch + +Build Instructions +------------------ + +You'll need to have version 2.0 of [`gradle`](http://www.gradle.org/downloads) installed and set-up correctly in order to follow along these instructions. +We could have used the [`gradle-wrapper`](http://www.gradle.org/docs/current/userguide/gradle_wrapper.html) to avoid having to install `gradle`, but the `bigtop` project includes all `gradle*` directories in `.gitignore`. So, that's not going to work. 
+ +### Build the JAR + +`gradle clean build` will build the bigpetstore `jar`. The `jar` will be located in the `build\libs` directory. + +### Run Intergration Tests With + * Pig profile: `gradle clean integrationTest -P ITProfile=pig` + * Mahout Profile: `gradle clean integrationTest -P ITProfile=mahout` + * Crunch profile: Not Implemented Yet + * Hive profile: Not implemented yet. + +If you don't specify any profile-name, or if you specify an invalid-name for the `integrationTest` task, no integration tests will be run. + +*Note:* At this stage, only the `Pig` and `Mahout` profiles are working. Will continue to update this area as further work is completed. + +For Eclipse Users +----------------- + +1. Run `gradle eclipse` to create an eclipse project. +2. Import the project into eclipse. + +*Note* whenever you modify the dependencies, you will need to run the `gradle eclipse` again. Refresh the project after doing so. You'd also need to have the `scala` plugin installed. Also, having a `gradle` plugin would be quite useful as well, for ex. when you want to update dependencies. + +High level summary +------------------ + +The bigpetstore project exemplifies the hadoop ecosystem for newcomers, and also for benchmarking and +comparing functional space of tools. + +The end goal is to run many different implementations of each phase +using different tools, thus exemplifying overlap of tools in the hadoop ecosystem, and allowing people to benchmark/compare tools +using a common framework and easily understood use case + + +How it works (To Do) +-------------------- + +### Phase 1: Generating pet store data: + +The first step is to generate a raw data set. This is done by the "GeneratePetStoreTransactionsInputFormat": + +The first MapReduce job in the pipeline runs a simple job which takes this input format and forwards +its output. The result is a list of "transactions". 
Each transaction is a tuple of the format + + *{state,name,date,price,product}.* + +### Phase 2: Processing the data + +The next phase of the application processes the data to create basic aggregations. For example with both pig and hive these could easily include + +- *Number of transactions by state* or +- *Most valuable customer by state* or +- *Most popular items by state* + + +### Phase 3: Clustering the states by all fields + + Now, say we want to cluster the states, so as to put different states into different buying categories + for our marketing team to deal with differently. + +### Phase 4: Visualizing the Data in D3. + + - try it [on the gh-pages branch](http://jayunit100.github.io/bigpetstore/) + + +Running on a hadoop cluster +--------------------------- + +*Note:* For running the code using the `hadoop jar` command instead of the `gradle` tasks, you will need to set the classpath appropriately. The discussion after [this comment][jira-mahout] in JIRA could also be useful apart from these instructions. + +### Build the fat-jar + +We are going to use a fat-jar in order to avoid specifying the entire classpath ourselves. + +The fat-jar is required when we are running the application on a hadoop cluster. The other way would be to specify all the dependencies (including the transitive ones) manually while running the hadoop job. Fat-jars make it easier to bundle almost all the dependencies inside the distribution jar itself. + +``` +gradle clean shadowJar -Pfor-cluster +``` + +This command will build the fat-jar with all the dependencies bundled in except the hadoop, mahout and pig dependencies, which we'll specify using `-libjars` option while running the hadoop job. These dependencies are excluded to avoid conflicts with the jars provided by hadoop itself. + +The generated jar will be inside the `build/libs` dir, with name like `BigPetStore-x.x.x-SNAPSHOT-all.jar`. For the remainig discussion I'll refer to this jar by `bps.jar`. 
+ +### Get the mahout and pig jars + +You'll need both mahout and pig jars with the hadoop classes excluded. Commonly, you can find both of these in their respective distros. The required pig jar is generally named like `pig-x.x.x-withouthadoop.jar` and the mahout jar would be named like `mahout-core-job.jar`. If you want, you can build those yourself by following the instructions in [this JIRA comment][jira-mahout]]. For the remaining discussion, I am going to refer to these two jars by `pig-withouthadoop.jar` and `mahout-core-job.jar`. + +### Setup the classpath for hadoop nodes in the cluster + +``` +export JARS="/usr/lib/pig/pig-withouthadoop.jar,/usr/lib/mahout/mahout-core-job.jar" +``` + +We also need these jars to be present on the client side to kick-off the jobs. Reusing the `JARS` variable to put the same jars on the client classpath. + +``` +export HADOOP_CLASSPATH=`echo $JARS | sed s/,/:/g` +``` + +### Generate the data + +``` +hadoop jar bps.jar org.apache.bigtop.bigpetstore.generator.BPSGenerator 1000000 bigpetstore/gen +``` + +### Clean with pig + +``` +hadoop jar bps.jar org.apache.bigtop.bigpetstore.etl.PigCSVCleaner -libjars $JARS bigpetstore/gen/ bigpetstore/ custom_pigscript.pig +``` + +### Analyze and generate recommendations with mahout + +``` +hadoop jar bps.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender -libjars $JARS bigpetstore/pig/Mahout bigpetstore/Mahout/AlsFactorization bigpetstore/Mahout/AlsRecommendations +``` + + +... (will add more steps as we add more phases to the workflow) ... + + +Example of running in EMR +-------------------------- +- Put the jar in s3. Right now there is a copy of it at the url below. + +- Download the elastic-mapreduce ruby shell script. +create your "credentials.json" file. 
+ +Now run this to generate 1,000,000 pet store transactions: + +./elastic-mapreduce --create --jar s3://bigpetstore/bigpetstore.jar \ +--main-class org.apache.bigtop.bigpetstore.generator.BPSGenerator \ +--num-instances 10 \ +--arg 1000000 \ +--arg s3://bigpetstore/data/generated \ +--hadoop-version "2.2.0" \ +--master-instance-type m1.medium \ +--slave-instance-type m1.medium + +...Now let's clean the data with pig... + +Replace the above "--main-class" and "--arg" options with +--main-class org.apache.bigtop.bigpetstore.etl.PigCSVCleaner +--arg s3://bigpetstore/data/generated +--arg s3://bigpetstore/data/pig_out +(optional: you can send a script referencing the cleaned $input path to do some +custom analytics; see the BPS_analytics.pig script and companion +http://jayunit100.github.io/bigpetstore as an example). +--arg s3://path_to_custom_analytics_script.pig + +(Note about pig: we support custom pig scripts. For EMR, custom pig scripts will need to point to a +local path, so you'll have to put that script on the machine as part +of EMR setup w/ a custom script.) + +... + +And so on. + + +[jira-mahout]: https://issues.apache.org/jira/browse/BIGTOP-1272?focusedCommentId=14076023&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-14076023 http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot b/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot new file mode 100644 index 0000000..7d17c5a --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/arch.dot @@ -0,0 +1,41 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. 
+* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +digraph bigpetstore { + + node [shape=record]; + + + BPSAnalytics [label="BPSAnalytics.pig" ,style="rounded, filled", shape=diamond]; + CUSTOMER_PAGE [label="CUSTOMER_PAGE|json|CUSTOMER_PAGE/part*"]; + DIRTY_CSV [label="DIRTY_CSV|fname lname -prod , price ,prod,..|generated/part*"]; + CSV [label="CSV|fname,lname,prod,price,date,xcoord,ycoord,...|cleaned/part*"]; + MAHOUT_VIEW_INPUT [label="MAHOUT_VIEW | (user-id) 10001 (product-id) 203 (implicit-rating) 1 | cleaned/Mahout/part*" ]; + MAHOUT_ALS [label="Parallel ALS Recommender output | (user-id) 10001 [(product-id) 201: (recommendation-strength 0-1)0.546] | Mahout/AlsRecommendations/part*" ]; + + Generate -> DIRTY_CSV [label="hadoop jar bigpetstore.jar org.bigtop.bigpetstore.generator.BPSGenerator 100 bps/generated/"] ; + DIRTY_CSV -> pig [label="hadoop jar bigpetstore.jar org.bigtop.bigpetstore.etl.PigCSVCleaner bps/generated/ bps/cleaned/ "]; + + pig -> CSV [label="pig query to clean up generated transaction records"]; + pig -> MAHOUT_VIEW_INPUT [label="pig query to produce mahout input format"]; + + MAHOUT_VIEW_INPUT -> ParallelALSFactorizationJob [label="hadoop jar bigpetstore.jar org.apache.bigtop.bigpetstore.recommend.ItemRecommender cleaned/Mahout Mahout/AlsFactorization Mahout/AlsRecommendations"]; + ParallelALSFactorizationJob -> "Mahout RecommenderJob" + "Mahout RecommenderJob" -> MAHOUT_ALS + + CSV -> BPSAnalytics; + BPSAnalytics -> 
pig_job2; + pig_job2 -> CUSTOMER_PAGE [label=""]; +} http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle b/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle new file mode 100644 index 0000000..c80672c --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/build.gradle @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +apply plugin: "java" +apply plugin: "eclipse" +// TODO add idea module config. +apply plugin: "idea" +apply plugin: "scala" +apply plugin: 'com.github.johnrengelman.shadow' + +buildscript { + repositories { jcenter() } + dependencies { + classpath 'com.github.jengelman.gradle.plugins:shadow:1.0.2' + } +} + +// Read the groupId and version properties from the "parent" bigtop project. +// It would be better if there was some better way of doing this. Howvever, +// at this point, we have to do this (or some variation thereof) since gradle +// projects can't have maven projects as parents (AFAIK. If there is a way to do it, +// it doesn't seem to be well-documented). 
+def setProjectProperties() { + Node xml = new XmlParser().parse("../../pom.xml") + group = xml.groupId.first().value().first() + version = xml.version.first().value().first() +} + +setProjectProperties() +description = """""" + +// We are using 1.7 as gradle can't play well when java 8 and scala are combined. +// There is an open issue here: http://issues.gradle.org/browse/GRADLE-3023 +// There is talk of this being resolved in the next version of gradle. Till then, +// we are stuck with java 7. But we do have scala if we want more syntactic sugar. +sourceCompatibility = 1.7 +targetCompatibility = 1.7 + +// Specify any additional project properties. +ext { + slf4jVersion = "1.7.5" + guavaVersion = "15.0" + datanucleusVersion = "3.2.2" + datanucleusJpaVersion = "3.2.1" + bonecpVersion = "0.8.0.RELEASE" + derbyVersion = "10.10.1.1" + + // from horton-works repo. They compile mahout-core against hadoop2.x. These + // mahout is compiled against 2.4.0 + hadoopVersion = "2.4.0.2.1.2.0-402" + mahoutVersion = "0.9.0.2.1.2.0-402" +} + +repositories { + mavenCentral() + maven { + url "http://repo.hortonworks.com/content/repositories/releases/" + } +} + +tasks.withType(AbstractCompile) { + options.encoding = 'UTF-8' + options.compilerArgs << "-Xlint:all" +} + +tasks.withType(ScalaCompile) { + // Enables incremental compilation. + // http://www.gradle.org/docs/current/userguide/userguide_single.html#N12F78 + scalaCompileOptions.useAnt = false +} + +tasks.withType(Test) { + testLogging { + // Uncomment this if you want to see the console output from the tests. + // showStandardStreams = true + events "passed", "skipped", "failed" + // show standard out and standard error of the test JVM(s) on the console + //showStandardStreams = true + } +} + +test { + exclude "**/*TestPig.java", "**/*TestHiveEmbedded.java", "**/*TestCrunch.java", "**/*TestPetStoreTransactionGeneratorJob.java" +} + +// Create a separate source-set for the src/integrationTest set of classes. 
The convention here +// is that gradle will look for a directory with the same name as that of the specified source-set +// under the 'src' directory. So, in this case, it will look for a directory named 'src/integrationTest' +// since the name of the source-set is 'integrationTest' +sourceSets { + main { + java.srcDirs = []; + scala.srcDirs = ["src/main/scala", "src/main/java"] + } + // The main and test source-sets are configured by both java and scala plugins. They contain + // all the src/main and src/test classes. The following statements make all of those classes + // available on the classpath for the integration-tests, for both java and scala. + integrationTest { + java { + compileClasspath += main.output + test.output + runtimeClasspath += main.output + test.output + } + scala { + compileClasspath += main.output + test.output + runtimeClasspath += main.output + test.output + } + } +} + +// Creating a source-set automatically add a couple of corresponding configurations (when java/scala +// plugins are applied). The convention for these configurations is <sourceSetName>Compile and +// <sourceSetName>Runtime. The following statements declare that all the dependencies from the +// testCompile configuration will now be available for integrationTestCompile, and all the +// dependencies (and other configuration that we might have provided) for testRuntime will be +// available for integrationTestRuntime. For ex. the testCompile configuration has a dependency on +// jUnit and scalatest. This makes them available for the integration tests as well. 
+configurations { + integrationTestCompile { + extendsFrom testCompile + } + + integrationTestRuntime { + extendsFrom integrationTestCompile, testRuntime + } +} + +// To see the API that is being used here, consult the following docs +// http://www.gradle.org/docs/current/dsl/org.gradle.api.artifacts.ResolutionStrategy.html +def updateDependencyVersion(dependencyDetails, dependencyString) { + def parts = dependencyString.split(':') + def group = parts[0] + def name = parts[1] + def version = parts[2] + if (dependencyDetails.requested.group == group + && dependencyDetails.requested.name == name) { + dependencyDetails.useVersion version + } +} + +def setupPigIntegrationTestDependencyVersions(dependencyResolveDetails) { + // This is the way we override the dependencies. + updateDependencyVersion dependencyResolveDetails, "joda-time:joda-time:2.2" +} + +def setupCrunchIntegrationTestDependencyVersions(dependencyResolveDetails) { + // Specify any dependencies that you want to override for crunch integration tests. +} + +def setupMahoutIntegrationTestDependencyVersions(dependencyResolveDetails) { + // Specify any dependencies that you want to override for mahout integration tests. +} + + +task integrationTest(type: Test, dependsOn: test) { + + testClassesDir = sourceSets.integrationTest.output.classesDir + classpath = sourceSets.integrationTest.runtimeClasspath + + if(!project.hasProperty('ITProfile')) { + // skip integration-tests if no profile has been specified. + integrationTest.onlyIf { false } + return; + } + + def patternsToInclude + def dependencyConfigClosure + def skipDependencyUpdates = false + // Select the pattern for test classes that should be executed, and the dependency + // configuration function to be called based on the profile name specified at the command line. 
+ switch (project.ITProfile) { + case "pig": + patternsToInclude = "*PigIT*" + dependencyConfigClosure = { setupPigIntegrationTestDependencyVersions(it) } + break + case "crunch": + patternsToInclude = "*CrunchIT*" + dependencyConfigClosure = { setupCrunchIntegrationTestDependencyVersions(it) } + break + case "mahout": + patternsToInclude = "*MahoutIT*" + dependencyConfigClosure = { setupMahoutIntegrationTestDependencyVersions(it) } + break + // skip integration-tests if the passed in profile-name is not valid + default: integrationTest.onlyIf { false }; return + } + + + filter { includeTestsMatching patternsToInclude } + + // This is the standard way gradle allows overriding each specific dependency. + // see: http://www.gradle.org/docs/current/dsl/org.gradle.api.artifacts.ResolutionStrategy.html + project.configurations.all { + resolutionStrategy { + eachDependency { + dependencyConfigClosure(it) + } + } + } +} + +dependencies { + compile "org.kohsuke:graphviz-api:1.0" + compile "org.apache.crunch:crunch-core:0.9.0-hadoop2" + compile "com.jolbox:bonecp:${project.bonecpVersion}" + compile "org.apache.derby:derby:${project.derbyVersion}" + compile "com.google.guava:guava:${project.guavaVersion}" + compile "commons-lang:commons-lang:2.6" + compile "joda-time:joda-time:2.3" + compile "org.apache.commons:commons-lang3:3.1" + compile "com.google.protobuf:protobuf-java:2.5.0" + compile "commons-logging:commons-logging:1.1.3" + compile "com.thoughtworks.xstream:xstream:+" + compile "org.apache.lucene:lucene-core:+" + compile "org.apache.lucene:lucene-analyzers-common:+" + compile "org.apache.solr:solr-commons-csv:3.5.0" + + compile group: "org.apache.pig", name: "pig", version: "0.12.0", classifier:"h2" + compile "org.slf4j:slf4j-api:${project.slf4jVersion}" + compile "log4j:log4j:1.2.12" + compile "org.slf4j:slf4j-log4j12:${project.slf4jVersion}" + compile "org.datanucleus:datanucleus-core:${project.datanucleusVersion}" + compile 
"org.datanucleus:datanucleus-rdbms:${project.datanucleusJpaVersion}" + compile "org.datanucleus:datanucleus-api-jdo:${project.datanucleusJpaVersion}" + compile "org.datanucleus:datanucleus-accessplatform-jdo-rdbms:${project.datanucleusJpaVersion}" + compile group: "org.apache.mrunit", name: "mrunit", version: "1.0.0", classifier:"hadoop2" + + compile "org.jfairy:jfairy:0.2.4" + + // from horton-works repo. They compile mahout-core against hadoop2.x + compile "org.apache.hadoop:hadoop-client:${hadoopVersion}" + compile "org.apache.mahout:mahout-core:${mahoutVersion}" + + compile 'org.scala-lang:scala-library:2.11.0' + + testCompile "junit:junit:4.11" + testCompile "org.hamcrest:hamcrest-all:1.3" + testCompile "org.scalatest:scalatest_2.11:2.1.7" +} + +configurations { + hadoopClusterRuntime { + // extendsFrom integrationTestRuntime + if(project.hasProperty('for-cluster')) { + excludeRules += [getGroup: { 'org.apache.crunch' }, getModule: { 'crunch-core' } ] as ExcludeRule + excludeRules += [getGroup: { 'org.apache.pig' }, getModule: { 'pig' } ] as ExcludeRule + excludeRules += [getGroup: { 'org.apache.mahout' }, getModule: { 'mahout-core' } ] as ExcludeRule + excludeRules += [getGroup: { 'org.apache.hadoop' }, getModule: { 'hadoop-client' } ] as ExcludeRule + } + } +} + +task listJars << { + configurations.shadow.each { println it.name } +} + +def copyDependencyJarsForHadoopCluster() { + copy { + from configurations.hadoopClusterRuntime + into 'build/libs' + } +} + +build { + doLast { + copyDependencyJarsForHadoopCluster() + } +} + +eclipse { + classpath { + // Add the dependencies and the src dirs for the integrationTest source-set to the + // .classpath file that will be generated by the eclipse plugin. + plusConfigurations += [configurations.integrationTestCompile] + // Comment out the following two lines if you want to generate an eclipse project quickly. 
+ downloadSources = true + downloadJavadoc = false + } +} http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml b/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml new file mode 100644 index 0000000..a5fc979 --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/pom.xml @@ -0,0 +1,584 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <groupId>org.apache.bigtop</groupId> + <artifactId>BigPetStore</artifactId> + <version>0.9.0-SNAPSHOT</version> + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> + <slf4j.version>1.7.5</slf4j.version> + <guava.version>15.0</guava.version> + <hadoop.version>2.2.0</hadoop.version> + <hive.version>0.12.0</hive.version> + <datanucleus.version>3.2.2</datanucleus.version> + <datanucleus.jpa.version>3.2.1</datanucleus.jpa.version> + <bonecp.version>0.9.0-SNAPSHOT.RELEASE</bonecp.version> + <derby.version>10.10.1.1</derby.version> + <plugin.surefire.version>2.17</plugin.surefire.version> + </properties> + + <dependencies> + <dependency> + <groupId>org.kohsuke</groupId> + <artifactId>graphviz-api</artifactId> + <version>1.0</version> + </dependency> + + <dependency> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch-core</artifactId> + <version>0.9.0-hadoop2</version> + </dependency> + + <!-- misc deps --> + <dependency> + <groupId>com.jolbox</groupId> + <artifactId>bonecp</artifactId> + <version>${bonecp.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.derby</groupId> + <artifactId>derby</artifactId> + <version>${derby.version}</version> + </dependency> + + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>${guava.version}</version> + </dependency> + + <!-- From pig profile --> + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + <version>2.6</version> + 
</dependency> + + <dependency> + <groupId>joda-time</groupId> + <artifactId>joda-time</artifactId> + <version>2.3</version> + </dependency> + <!-- end pig profile --> + <!-- From hive profile --> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + <version>3.1</version> + </dependency> + <!-- end hive profile --> + <!-- From Crunch profile --> + <dependency> + <groupId>com.google.protobuf</groupId> + <artifactId>protobuf-java</artifactId> + <version>2.5.0</version> + </dependency> + <!-- end crunch profile --> + <!-- From Mahout profile --> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>1.1.3</version> + </dependency> + <dependency> + <groupId>org.apache.mahout</groupId> + <artifactId>mahout-math</artifactId> + <version>0.9</version> + </dependency> + <dependency> + <groupId>com.thoughtworks.xstream</groupId> + <artifactId>xstream</artifactId> + <version>LATEST</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>LATEST</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>LATEST</version> + </dependency> + <dependency> + <groupId>org.apache.mahout.commons</groupId> + <artifactId>commons-cli</artifactId> + <version>LATEST</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>LATEST</version> + </dependency> + <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-commons-csv</artifactId> + <version>3.5.0</version> + </dependency> + <!-- end Mahout profile --> + + <!-- TODO ask question about this comment --> + <!-- We keep this at top level so that mvn eclipse:eclipse creates a nice + tidy project, but its a little messy. 
later we'll create a profile for eclipse + and move this (and other deps) into profiles as needed. Important: Remove + this dependency when running hive integration tests... --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <version>${hadoop.version}</version> + </dependency> + <!-- TODO ask question about this comment --> + <!-- mahout deps : may need to turn these on/off when testing mahout locally --> + <!-- For testing on my machine, I created a bigpetstore mahout jar which + is compiled for 2.2.0 . Or substitute this with the standard apache mahout-core + but not sure if it will work. --> + <dependency> + <groupId>org.apache.mahout</groupId> + <artifactId>mahout-core</artifactId> + <version>0.8</version> + </dependency> + <!-- pig deps --> + <dependency> + <groupId>org.apache.pig</groupId> + <artifactId>pig</artifactId> + <classifier>h2</classifier> + <version>0.12.0</version> + </dependency> + + <!--logging --> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + <version>${slf4j.version}</version> + </dependency> + <dependency> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + <version>1.2.12</version> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <version>${slf4j.version}</version> + </dependency> + <!-- hive --> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-common</artifactId> + <version>${hive.version}</version> + </dependency> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-serde</artifactId> + <version>${hive.version}</version> + </dependency> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-jdbc</artifactId> + <version>${hive.version}</version> + </dependency> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-contrib</artifactId> + <version>${hive.version}</version> + </dependency> + + <!-- datanucleus --> + 
<dependency> + <groupId>org.datanucleus</groupId> + <artifactId>datanucleus-core</artifactId> + <version>${datanucleus.version}</version> + </dependency> + + <dependency> + <groupId>org.datanucleus</groupId> + <artifactId>datanucleus-rdbms</artifactId> + <version>${datanucleus.jpa.version}</version> + </dependency> + + <dependency> + <groupId>org.datanucleus</groupId> + <artifactId>datanucleus-api-jdo</artifactId> + <version>${datanucleus.jpa.version}</version> + </dependency> + + <!-- TODO eliminate this pom dependency --> + <dependency> + <groupId>org.datanucleus</groupId> + <artifactId>datanucleus-accessplatform-jdo-rdbms</artifactId> + <version>${datanucleus.jpa.version}</version> + <type>pom</type> + </dependency> + + <!-- Unit test artifacts --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.11</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.hamcrest</groupId> + <artifactId>hamcrest-all</artifactId> + <version>1.3</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.mrunit</groupId> + <artifactId>mrunit</artifactId> + <version>1.0.0</version> + <classifier>hadoop2</classifier> + </dependency> + </dependencies> + + <build> + <extensions> + <extension> + <groupId>org.springframework.build.aws</groupId> + <artifactId>org.springframework.build.aws.maven</artifactId> + <version>3.0.0.RELEASE</version> + </extension> + </extensions> + <finalName>bigpetstore-${project.version}</finalName> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-release-plugin</artifactId> + <version>2.5</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-eclipse-plugin</artifactId> + <version>2.9</version> + <configuration> + <downloadSources>true</downloadSources> + <downloadJavadocs>true</downloadJavadocs> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + 
<artifactId>maven-compiler-plugin</artifactId> + <version>${maven-compiler-plugin.version}</version> + <configuration> + <source>1.8</source> + <target>1.8</target> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>2.4</version> + <configuration> + <outputDirectory>${basedir}/target</outputDirectory> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>${plugin.surefire.version}</version> + <configuration> + <excludes> + <exclude>**/*TestPig.java</exclude> + <exclude>**/*TestHiveEmbedded.java</exclude> + <exclude>**/*TestCrunch.java</exclude> + </excludes> + </configuration> + </plugin> + </plugins> + </build> + + <profiles> + <profile> + <id>pig</id> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>${plugin.surefire.version}</version> + <configuration> + <excludes> + <exclude>**/*TestPig.java</exclude> + <exclude>**/*TestHiveEmbedded.java</exclude> + <exclude>**/*TestCrunch.java</exclude> + <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude> + </excludes> + + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + <version>1.5</version> + <executions> + <execution> + <id>add-test-source</id> + <phase>generate-test-sources</phase> + <goals> + <goal>add-test-source</goal> + </goals> + <configuration> + <sources> + <source>src/integration/java</source> + </sources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.12</version> + + <configuration> + <argLine>-Xmx1g</argLine> + <excludes> + <exclude>**/*BigPetStoreMahoutIT.java</exclude> + 
<exclude>**/*BigPetStoreHiveIT.java</exclude> + <exclude>**/*BigPetStoreCrunchIT.java</exclude> + </excludes> + </configuration> + <executions> + <!-- States that both integration-test and verify goals of the Failsafe + Maven plugin are executed. --> + <execution> + <id>integration-tests</id> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + + <profile> + <id>hive</id> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>${plugin.surefire.version}</version> + <configuration> + <excludes> + <exclude>**/*TestPig.java</exclude> + <exclude>**/*TestHiveEmbedded.java</exclude> + <exclude>**/*TestCrunch.java</exclude> + <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude> + </excludes> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + <version>1.5</version> + <executions> + <execution> + <id>add-test-source</id> + <phase>generate-test-sources</phase> + <goals> + <goal>add-test-source</goal> + </goals> + <configuration> + <sources> + <source>src/integration/java</source> + </sources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.12</version> + <configuration> + <excludes> + <exclude>**/*BigPetStoreMahoutIT.java</exclude> + <exclude>**/*BigPetStorePigIT.java</exclude> + <exclude>**/*BigPetStoreCrunchIT.java</exclude> + </excludes> + </configuration> + <executions> + <!-- States that both integration-test and verify goals of the Failsafe + Maven plugin are executed. 
--> + <execution> + <id>integration-tests</id> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + <dependencies> + <!-- hadoop --> + <!-- TODO is this version change required? Version 2.2.0 is provided + by hadoop-client dependency. Shouldn't we have the same versions for the + related dependencies? --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-app</artifactId> + <version>2.3.0</version> + </dependency> + </dependencies> + </profile> + <profile> + <id>crunch</id> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>${plugin.surefire.version}</version> + <configuration> + <excludes> + <exclude>**/*TestPig.java</exclude> + <exclude>**/*TestHiveEmbedded.java</exclude> + <exclude>**/*TestCrunch.java</exclude> + <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude> + </excludes> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + <version>1.5</version> + <executions> + <execution> + <id>add-test-source</id> + <phase>generate-test-sources</phase> + <goals> + <goal>add-test-source</goal> + </goals> + <configuration> + <sources> + <source>src/integration/java</source> + </sources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.12</version> + <configuration> + <excludes> + <exclude>**/*BigPetStorePigIT.java</exclude> + <exclude>**/*BigPetStoreHiveIT.java</exclude> + <exclude>**/*BigPetStoreMahoutIT.java</exclude> + </excludes> + </configuration> + <executions> + <!-- States that both integration-test and verify goals of the Failsafe + Maven plugin are executed. 
--> + <execution> + <id>integration-tests</id> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + <profile> + <id>mahout</id> + <!-- TODO this property is not being used anywhere. It's not even automatically + detectable. Remove? Or do something that the name suggests? --> + <properties> + <skip.unit.tests>true</skip.unit.tests> + </properties> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>${plugin.surefire.version}</version> + <configuration> + <excludes> + <exclude>**/*TestPig.java</exclude> + <exclude>**/*TestHiveEmbedded.java</exclude> + <exclude>**/*TestCrunch.java</exclude> + <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude> + </excludes> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + <version>1.5</version> + <executions> + <execution> + <id>add-test-source</id> + <phase>generate-test-sources</phase> + <goals> + <goal>add-test-source</goal> + </goals> + <configuration> + <sources> + <source>src/integration/java</source> + </sources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.12</version> + <configuration> + <excludes> + <exclude>**/*BigPetStorePigIT.java</exclude> + <exclude>**/*BigPetStoreCrunchIT.java</exclude> + <exclude>**/*BigPetStoreHiveIT.java</exclude> + </excludes> + </configuration> + <executions> + <!-- States that both integration-test and verify goals of the Failsafe + Maven plugin are executed. 
--> + <execution> + <id>integration-tests</id> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + </profiles> +</project> http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle b/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle new file mode 100644 index 0000000..53d74f2 --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/settings.gradle @@ -0,0 +1,18 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +rootProject.name = 'BigPetStore' http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java new file mode 100644 index 0000000..b07c5a0 --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.bigtop.bigpetstore; + +import static org.apache.bigtop.bigpetstore.ITUtils.createTestOutputPath; +import static org.apache.bigtop.bigpetstore.ITUtils.setup; + +import java.util.regex.Pattern; + +import org.apache.bigtop.bigpetstore.recommend.ItemRecommender; +import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants.OUTPUTS.MahoutPaths; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.Before; +import org.junit.Test; + +import com.google.common.base.Predicate; + +public class BigPetStoreMahoutIT { + + public static final Path INPUT_DIR_PATH = + new Path(ITUtils.BPS_TEST_PIG_CLEANED, MahoutPaths.Mahout.name()); + public static final String INPUT_DIR_PATH_STR = INPUT_DIR_PATH.toString(); + private static final Path MAHOUT_OUTPUT_DIR = createTestOutputPath(MahoutPaths.Mahout.name()); + private static final Path ALS_FACTORIZATION_OUTPUT_DIR = + createTestOutputPath(MahoutPaths.Mahout.name(), MahoutPaths.AlsFactorization.name()); + private static final Path ALS_RECOMMENDATIONS_DIR = + createTestOutputPath(MahoutPaths.Mahout.name(), MahoutPaths.AlsRecommendations.name()); + + private ItemRecommender itemRecommender; + + @Before + public void setupTest() throws Throwable { + setup(); + try { + FileSystem fs = FileSystem.get(new Configuration()); + fs.delete(MAHOUT_OUTPUT_DIR, true); + itemRecommender = new ItemRecommender(INPUT_DIR_PATH_STR, ALS_FACTORIZATION_OUTPUT_DIR.toString(), + ALS_RECOMMENDATIONS_DIR.toString()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private static final Predicate<String> TEST_OUTPUT_FORMAT = new Predicate<String>() { + private final Pattern p = Pattern.compile("^\\d+\\s\\[\\d+:\\d+\\.\\d+\\]$"); + @Override + public boolean apply(String input) { + return p.matcher(input).matches(); + } + }; + + @Test + public void testPetStorePipeline() throws Exception { + itemRecommender.recommend(); + 
ITUtils.assertOutput(ALS_RECOMMENDATIONS_DIR, TEST_OUTPUT_FORMAT); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java new file mode 100644 index 0000000..78d5c6b --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.bigtop.bigpetstore; + +import static org.apache.bigtop.bigpetstore.ITUtils.BPS_TEST_GENERATED; +import static org.apache.bigtop.bigpetstore.ITUtils.BPS_TEST_PIG_CLEANED; +import static org.apache.bigtop.bigpetstore.ITUtils.fs; + +import java.io.File; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.bigtop.bigpetstore.etl.PigCSVCleaner; +import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.pig.ExecType; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Predicate; +import com.google.common.collect.ImmutableMap; + +/** + * This is the main integration test for pig. Like all BPS integration tests, it + * is designed to simulate exactly what will happen on the actual cluster, + * except with a small amount of records. + * + * In addition to cleaning the dataset, it also runs the BPS_analytics.pig + * script which BigPetStore ships with. + */ +public class BigPetStorePigIT { + + final static Logger log = LoggerFactory.getLogger(BigPetStorePigIT.class); + + /** + * An extra unsupported code path that we have so people can do ad hoc + * analytics on pig data after it is cleaned. 
+ */ + public static final Path BPS_TEST_PIG_COUNT_PRODUCTS = fs + .makeQualified(new Path("bps_integration_", + BigPetStoreConstants.OUTPUTS.pig_ad_hoc_script.name() + "0")); + + static final File PIG_SCRIPT = new File("BPS_analytics.pig"); + + static { + if (!PIG_SCRIPT.exists()) { + throw new RuntimeException("Couldnt find pig script at " + PIG_SCRIPT.getAbsolutePath()); + } + } + + @Before + public void setupTest() throws Throwable { + ITUtils.setup(); + try { + FileSystem.get(new Configuration()).delete(BPS_TEST_PIG_CLEANED, true); + FileSystem.get(new Configuration()).delete(BPS_TEST_PIG_COUNT_PRODUCTS, true); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + static Map<Path, Predicate<String>> TESTS = ImmutableMap.of( + /** Test of the main output */ + BPS_TEST_PIG_CLEANED, ITUtils.VERIFICATION_PERDICATE, + // Example of how to count products after doing basic pig data cleanup + BPS_TEST_PIG_COUNT_PRODUCTS, ITUtils.VERIFICATION_PERDICATE, + // Test the output that is to be used as an input for Mahout. 
+ BigPetStoreMahoutIT.INPUT_DIR_PATH, ITUtils.VERIFICATION_PERDICATE + ); + + @Test + public void testPetStoreCorePipeline() throws Exception { + runPig(BPS_TEST_GENERATED, BPS_TEST_PIG_CLEANED, PIG_SCRIPT); + for (Entry<Path, Predicate<String>> e : TESTS.entrySet()) { + ITUtils.assertOutput(e.getKey(), e.getValue()); + } + } + + private void runPig(Path input, Path output, File pigscript) + throws Exception { + new PigCSVCleaner(input, output, ExecType.LOCAL, pigscript); + } +} http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java new file mode 100644 index 0000000..fd53dc1 --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/src/integrationTest/java/org/apache/bigtop/bigpetstore/ITUtils.java @@ -0,0 +1,168 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.bigtop.bigpetstore; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.InetAddress; +import java.nio.charset.Charset; +import java.util.List; + +import org.apache.bigtop.bigpetstore.generator.BPSGenerator; +import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.Job; +import org.junit.Assert; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Predicate; +import com.google.common.io.Files; + +public class ITUtils { + public static final Path TEST_OUTPUT_DIR = new Path("bps_integration_"); + + public static Predicate<String> VERIFICATION_PERDICATE = new Predicate<String>() { + @Override + public boolean apply(String input) { + return true; + } + }; + + static final Logger log = LoggerFactory.getLogger(ITUtils.class); + + static FileSystem fs; + static { + try { + fs = FileSystem.getLocal(new Configuration()); + } catch (Throwable e) { + String cpath = (String) System.getProperties().get("java.class.path"); + String msg = ""; + for (String cp : cpath.split(":")) { + if (cp.contains("hadoop")) { + msg += cp.replaceAll("hadoop", "**HADOOP**") + "\n"; + } + } + throw new RuntimeException("Major error: Probably issue. " + + "Check hadoop version? " + e.getMessage() + + " .... check these classpath elements:" + msg); + } + } + + public static final Path BPS_TEST_GENERATED = + createTestOutputPath(BigPetStoreConstants.OUTPUTS.generated.name()); + public static final Path BPS_TEST_PIG_CLEANED = + createTestOutputPath (BigPetStoreConstants.OUTPUTS.cleaned.name()); + + public static Path createTestOutputPath(String... 
pathParts) { + Path path = TEST_OUTPUT_DIR; + for(String pathPart: pathParts) { + path = new Path(path, pathPart); + } + return path; + } + + /** + * Some simple checks to make sure that unit tests in local FS. these arent + * designed to be run against a distribtued system. + */ + public static void checkConf(Configuration conf) throws Exception { + if (conf.get("mapreduce.jobtracker.address") == null) { + log.warn("Missing mapreduce.jobtracker.address???????!!!! " + "This can be the case in hive tests which use special " + + "configurations, but we should fix it sometime."); + return; + } + if (!conf.get("mapreduce.jobtracker.address").equals("local")) { + throw new RuntimeException("ERROR: bad conf : " + "mapreduce.jobtracker.address"); + } + if (!conf.get("fs.AbstractFileSystem.file.impl").contains("Local")) { + throw new RuntimeException("ERROR: bad conf : " + "mapreduce.jobtracker.address"); + } + try { + InetAddress addr = java.net.InetAddress.getLocalHost(); + System.out.println("Localhost = hn=" + addr.getHostName() + " / ha=" + addr.getHostAddress()); + } catch (Throwable e) { + throw new RuntimeException(" ERROR : Hadoop wont work at all on this machine yet" + + "...I can't get / resolve localhost ! Check java version/ " + "/etc/hosts / DNS or other networking related issues on your box" + + e.getMessage()); + } + } + + /** + * Creates a generated input data set in + * + * test_data_directory/generated. i.e. 
+ * test_data_directory/generated/part-r-00000 + */ + public static void setup() throws Throwable { + Configuration conf = new Configuration(); + + // debugging for Jeff and others in local fs that won't build + checkConf(conf); + + conf.setInt(BPSGenerator.props.bigpetstore_records.name(), BPSGenerator.DEFAULT_NUM_RECORDS); + + if (FileSystem.getLocal(conf).exists(BPS_TEST_GENERATED)) { + return; + } + + Job createInput = BPSGenerator.getCreateTransactionRecordsJob(BPS_TEST_GENERATED, conf); + createInput.waitForCompletion(true); + + Path outputfile = new Path(BPS_TEST_GENERATED, "part-r-00000"); + List<String> lines = Files.readLines(FileSystem.getLocal(conf).pathToFile(outputfile), Charset.defaultCharset()); + log.info("output : " + FileSystem.getLocal(conf).pathToFile(outputfile)); + for (String l : lines) { + System.out.println(l); + } + } + + + // A functions that logs the output file as a verification test + public static void assertOutput(Path base, Predicate<String> validator) throws Exception { + FileSystem fs = FileSystem.getLocal(new Configuration()); + + FileStatus[] files = fs.listStatus(base); + // print out all the files. + for (FileStatus stat : files) { + System.out.println(stat.getPath() + " " + stat.getLen()); + } + + /** + * Support map OR reduce outputs + */ + Path partm = new Path(base, "part-m-00000"); + Path partr = new Path(base, "part-r-00000"); + Path p = fs.exists(partm) ? partm : partr; + + /** + * Now we read through the file and validate its contents. 
+ */ + BufferedReader r = new BufferedReader(new InputStreamReader(fs.open(p))); + + // line:{"product":"big chew toy","count":3} + while (r.ready()) { + String line = r.readLine(); + log.info("line:" + line); + // System.out.println("line:"+line); + Assert.assertTrue("validationg line : " + line, validator.apply(line)); + } + } + +} http://git-wip-us.apache.org/repos/asf/bigtop/blob/6ec6cebf/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java ---------------------------------------------------------------------- diff --git a/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java b/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java new file mode 100755 index 0000000..158f875 --- /dev/null +++ b/bigtop-bigpetstore/bigpetstore-mapreduce/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.bigtop.bigpetstore.contract; + +import java.util.Map; + +/** + * This is the contract for the web site. 
+ * This object is created by each ETL
+ * tool : Summary stats.
+ */
+public abstract class PetStoreStatistics {
+
+    /**
+     * Returns transaction counts as a map from a string key to a number.
+     * NOTE(review): the key is presumably the state name, going by the
+     * method name; confirm against the implementations.
+     */
+    public abstract Map<String, ? extends Number> numberOfTransactionsByState() throws Exception;
+
+    /**
+     * Returns product counts as a map from a string key to a number.
+     * NOTE(review): the key is presumably the product name, going by the
+     * method name; confirm against the implementations.
+     */
+    public abstract Map<String, ? extends Number> numberOfProductsByProduct() throws Exception;
+
+}
\ No newline at end of file
