Mforns has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/353287 )
Change subject: UDF to tag requests ...................................................................... UDF to tag requests. UDF that understands webrequest data and can classify requests into types like "portal", "wikidata" and others. It uses a set of classes to look at requests and tag them as belonging to a type; a request can belong to several types. To create a Tagger you must implement the Tagger interface and annotate your class with @Tag. Tested with guava 12.0. Bug: T164021 Change-Id: I725e130431f3a869864275aa41af479fef9f157c --- M refinery-camus/pom.xml M refinery-cassandra/pom.xml M refinery-core/pom.xml M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java R refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/WebrequestData.java A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/PortalTagger.java A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tag.java A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tagger.java A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerChain.java A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerComparator.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java A refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/FakePageviewTagger.java A refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TestTagger.java A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetWebrequestTagsUDF.java M refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java M refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsRedirectToPageviewUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetWebrequestRequestTagsUDF.java M 
refinery-tools/pom.xml 18 files changed, 581 insertions(+), 6 deletions(-) Approvals: Mforns: Verified; Looks good to me, approved diff --git a/refinery-camus/pom.xml b/refinery-camus/pom.xml index 88044a8..c0ee693 100644 --- a/refinery-camus/pom.xml +++ b/refinery-camus/pom.xml @@ -19,6 +19,16 @@ <groupId>com.linkedin.camus</groupId> <artifactId>camus-api</artifactId> <version>${camus.version}</version> + <exclusions> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </exclusion> + <exclusion> + <groupId>com.google.collections</groupId> + <artifactId>google-collections</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> @@ -40,6 +50,12 @@ </dependency> <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>12.0</version> + </dependency> + + <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> diff --git a/refinery-cassandra/pom.xml b/refinery-cassandra/pom.xml index 1996d2e..9038924 100644 --- a/refinery-cassandra/pom.xml +++ b/refinery-cassandra/pom.xml @@ -58,12 +58,32 @@ <groupId>com.datastax.cassandra</groupId> <artifactId>cassandra-driver-core</artifactId> <version>2.2.0-rc3</version> + <exclusions> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </exclusion> + <exclusion> + <groupId>com.google.collections</groupId> + <artifactId>google-collections</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> <groupId>org.apache.cassandra</groupId> <artifactId>cassandra-all</artifactId> <version>2.2.6</version> + <exclusions> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </exclusion> + <exclusion> + <groupId>com.google.collections</groupId> + <artifactId>google-collections</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> @@ -77,7 +97,11 @@ <version>1.10.19</version> <scope>test</scope> </dependency> - + <dependency> + 
<groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>12.0</version> + </dependency> </dependencies> <build> diff --git a/refinery-core/pom.xml b/refinery-core/pom.xml index 6721a44..271d84c 100644 --- a/refinery-core/pom.xml +++ b/refinery-core/pom.xml @@ -84,7 +84,17 @@ <groupId>com.github.nscala-time</groupId> <artifactId>nscala-time_2.10</artifactId> </dependency> - + <!-- https://mvnrepository.com/artifact/com.google.guava/guava --> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>12.0</version> + </dependency> + <dependency> + <groupId>org.reflections</groupId> + <artifactId>reflections</artifactId> + <version>0.9.7</version> + </dependency> </dependencies> <build> @@ -131,6 +141,10 @@ <maxmind.database.country>${project.build.testOutputDirectory}/GeoIP2-Country-Test.mmdb</maxmind.database.country> <maxmind.database.city>${project.build.testOutputDirectory}/GeoIP2-City-Test.mmdb</maxmind.database.city> </systemPropertyVariables> + <forkCount>1</forkCount> + <includes> + <include>**/Test*.java</include> + </includes> </configuration> </plugin> diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java index b577701..c6d7632 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java @@ -16,6 +16,8 @@ package org.wikimedia.analytics.refinery.core; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; + import java.util.Arrays; import java.util.HashSet; import java.util.Set; diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/WebrequestData.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/WebrequestData.java similarity index 64% rename from 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/WebrequestData.java rename to refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/WebrequestData.java index 84f4df1..4b5abb9 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/WebrequestData.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/WebrequestData.java @@ -1,4 +1,9 @@ -package org.wikimedia.analytics.refinery.core; +package org.wikimedia.analytics.refinery.core.webrequest; + +import org.json.simple.JSONObject; + +import java.util.HashMap; +import java.util.Map; /** * POJO That encapsulates data from webrequest @@ -16,17 +21,22 @@ public WebrequestData(String uriHost, String uriPath, String uriQuery, String httpStatus, String contentType, String userAgent, String rawXAnalyticsHeader){ - this.uriHost = uriHost.toLowerCase(); + this.uriHost = uriHost.toLowerCase().trim(); this.uriPath = uriPath; this.uriQuery = uriQuery; this.httpStatus = httpStatus; this.contentType = contentType; this.userAgent = userAgent; + if (rawXAnalyticsHeader == null){ rawXAnalyticsHeader = ""; } + this.rawXAnalyticsHeader = rawXAnalyticsHeader; + + } + public String getUriHost(){ return uriHost; @@ -55,4 +65,20 @@ public String getRawXAnalyticsHeader(){ return rawXAnalyticsHeader; } + + @Override + public String toString(){ + Map webrequestMap = new HashMap<String, String>(); + + webrequestMap.put("uriHost", this.uriHost); + webrequestMap.put("uriPath", this.uriPath); + webrequestMap.put("uriQuery", this.uriQuery); + webrequestMap.put("httpStatus", this.httpStatus); + webrequestMap.put("contentType", this.contentType); + webrequestMap.put("userAgent", this.userAgent); + webrequestMap.put("X-Analytics", this.rawXAnalyticsHeader); + + return JSONObject.toJSONString(webrequestMap); + } + } diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/PortalTagger.java 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/PortalTagger.java new file mode 100644 index 0000000..558294e --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/PortalTagger.java @@ -0,0 +1,47 @@ +package org.wikimedia.analytics.refinery.core.webrequest.tag; + +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * Tags requests made to various wikimedia project portals, + * such as wwww.wikipedia.org. + * [executionStage = 0] because this tagger does not depend on other tags + * found in tagAccumulator. + */ +@Tag(tag = "portal", executionStage = 0) +public class PortalTagger implements Tagger { + + Set<String> portalDomains = new HashSet<String>( + Arrays.asList("www.wikimedia.org", + "www.wikipedia.org", + "www.wiktionary.org", + "www.wikiquote.org", + "www.wikibooks.org", + "www.wikinews.org", + "www.wikiversity.org", + "www.wikivoyage.org")); + + public Set<String> getTags(WebrequestData webrequestData, Set<String> tagAccumulator){ + + Set<String> tags = new HashSet<>(); + + assert webrequestData.getUriPath()!=null: webrequestData; + + if (webrequestData.getUriPath().equals("/") + && webrequestData.getContentType().startsWith("text/html") ) { + + + if (portalDomains.contains(webrequestData.getUriHost())) { + tags.add("portal"); + } + + } + return tags; + + } + +} \ No newline at end of file diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tag.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tag.java new file mode 100644 index 0000000..3d7d111 --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tag.java @@ -0,0 +1,24 @@ +package org.wikimedia.analytics.refinery.core.webrequest.tag; + +/** + * Created by nuriaruiz on 5/10/17. 
+ * Annotation that identifies classes that belong to the TaggerChain + * Chain is build at runtime + */ + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface Tag { + // The tag string that the tagger will apply to webrequests. + String tag() default ""; + // If the Tagger does not depend on other tags, executionStage should be 0. + // If the Tagger depends on other tags, executionStage should be 1 plus the + // maximum executionStage of all its tag dependencies. + // This ensures that tag dependencies are executed in correct order. + int executionStage() default 0; +} diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tagger.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tagger.java new file mode 100644 index 0000000..bd6e69d --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/Tagger.java @@ -0,0 +1,26 @@ +package org.wikimedia.analytics.refinery.core.webrequest.tag; + + +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; + +import java.util.Set; + +/** + * + * Defines the method(s) that taggers need to implement + * + * Given a set of attributes from the request it returns a set of tags. + * Set will be empty if no tag is found, also set might just contain one element + * + * We also pass the tags thus far so no logic needs to be repeated, for example + * if you want to tag only requests that are pageviews you can look whether that + * is been established via a prior tag. If you do so, remember to specify the + * executionStage value in the @Tag annotation to guarantee execution order + * (see Tag.java for executionStage description). 
+ * + * @return String + */ +public interface Tagger { + + public Set<String> getTags(WebrequestData webrequestData, Set<String> tagAccumulator); +} diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerChain.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerChain.java new file mode 100644 index 0000000..ad87d97 --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerChain.java @@ -0,0 +1,76 @@ +package org.wikimedia.analytics.refinery.core.webrequest.tag; + +import org.reflections.Reflections; +import org.wikimedia.analytics.refinery.core.Webrequest; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; + +import java.util.*; +/** + * Base utility class to loop through a set of taggers + * and return an array of strings that correspond to the tags + * that are associated to a request. + * + * Tags could be "portal", "pageview", "preview" + */ + +public class TaggerChain { + + + protected List<Tagger> chain = new ArrayList<Tagger>(); + + /** + * Initializes the tag chain by + * Discovering the taggers at runtime + * + * @throws ClassNotFoundException + * @throws IllegalAccessException + * @throws InstantiationException + */ + public TaggerChain() throws ClassNotFoundException, IllegalAccessException, InstantiationException{ + // scan annotations at runtime and build chain + Reflections reflections = new Reflections("org.wikimedia.analytics.refinery.core.webrequest.tag"); + Set<Class<?>> taggerClasses = reflections.getTypesAnnotatedWith(org.wikimedia.analytics.refinery.core.webrequest.tag.Tag.class); + + for (Class taggerClass : taggerClasses) { + Class<?> clazz; + clazz = Class.forName(taggerClass.getName()); + chain.add((Tagger) clazz.newInstance()); + + Collections.sort(chain, new TaggerComparator()); + } + + } + /** + * Builds the list of tags asking the "taggers" + * + * @return Set<String> Set of tags 
or empty list + */ + public Set<String> getTags(WebrequestData webrequest){ + Set<String> tags = new HashSet<>(); + + /** + * Avoid null pointer exceptions in records like the following: + * "hostname":null,"sequence":null,"dt":null,"time_firstbyte":null,"ip":null, + * "cache_status":null,"http_status":"304", + * "response_size":null,"http_method":null,"uri_host":"maps.wikimedia.org", + * "uri_path":"/osm-intl/4/2/5.png","uri_query":"","content_type":"image/png", + * "referer":null,"x_forwarded_for":null, + * ... + * "namespace_id":null,"webrequest_source":"maps","year":2017,"month":5,"day":5,"hour":1} + */ + + assert webrequest.getUriHost()!=null: webrequest; + assert webrequest.getHttpStatus() != null: webrequest; + assert webrequest.getUserAgent() != null: webrequest; + + // only pass to taggers healthy requests + if(Webrequest.SUCCESS_HTTP_STATUSES.contains(webrequest.getHttpStatus()) ) { + for (Tagger t : this.chain) { + Set<String> newTags = t.getTags(webrequest, tags); + tags.addAll(newTags); + } + } + return tags; + } + +} diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerComparator.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerComparator.java new file mode 100644 index 0000000..fdf779d --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TaggerComparator.java @@ -0,0 +1,26 @@ +package org.wikimedia.analytics.refinery.core.webrequest.tag; + +/** + * Created by nuriaruiz on 6/30/17. + */ +import java.util.Comparator; + +/** + * Used to sort tags depending on executionStage + */ +public class TaggerComparator implements Comparator<Tagger> { + /** + * Returns a negative integer, zero, or a positive integer as + * the first argument is less than, equal to, or greater than the second. 
+ * @param t1 + * @param t2 + * @return + */ + @Override + public int compare(Tagger t1, Tagger t2){ + + return t1.getClass().getAnnotation(Tag.class).executionStage() - + t2.getClass().getAnnotation(Tag.class).executionStage(); + } +} + diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java index feeccf3..ef7869d 100644 --- a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java @@ -19,6 +19,7 @@ import junitparams.mappers.CsvWithHeaderMapper; import org.junit.Test; import org.junit.runner.RunWith; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; import static org.junit.Assert.assertEquals; diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/FakePageviewTagger.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/FakePageviewTagger.java new file mode 100644 index 0000000..ae24372 --- /dev/null +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/FakePageviewTagger.java @@ -0,0 +1,28 @@ +package org.wikimedia.analytics.refinery.core.webrequest.tag; + +import org.wikimedia.analytics.refinery.core.PageviewDefinition; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; + +import java.util.HashSet; +import java.util.Set; + +/** + * Created by nuriaruiz on 6/1/17. 
+ * + * Testing whether this tagger can "tag" pageviews + * No need for this code to run in prod quite yet + * Some "fake" execution stage order + */ +@Tag(tag = "pageview", executionStage = 2) +public class FakePageviewTagger implements Tagger { + @Override + public Set<String> getTags(WebrequestData webrequestData, Set<String> tagAccumulator){ + + Set <String> tags = new HashSet<>(); + + if (PageviewDefinition.getInstance().isPageview(webrequestData)){ + tags.add("pageview"); + } + return tags; + } +} diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TestTagger.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TestTagger.java new file mode 100644 index 0000000..711a403 --- /dev/null +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/webrequest/tag/TestTagger.java @@ -0,0 +1,138 @@ +package org.wikimedia.analytics.refinery.core.webrequest.tag; + +import junit.framework.TestCase; +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.wikimedia.analytics.refinery.core.PageviewDefinition; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; +import java.util.Set; + +/** + * Created by nuriaruiz on 5/9/17. 
+ * + */ +@RunWith(JUnitParamsRunner.class) +public class TestTagger extends TestCase { + + + /* + * Make sure chain was initialized + */ + @Test + public void testChainSize() throws Exception { + + TaggerChain taggerChain = new TaggerChain(); + // see fakePageviewTagger just for tests + assertTrue(taggerChain.chain.size() == 2); + assertTrue(taggerChain.chain.get(0) != null); + assertTrue(taggerChain.chain.get(1) != null); + } + + + @Test + public void testChainExecutionStage() throws Exception { + + TaggerChain taggerChain = new TaggerChain(); + // see fakePageviewTagger just for tests + Tagger t1 = taggerChain.chain.get(0); + Tagger t2 = taggerChain.chain.get(1); + + //according to our setup PortalTagger comes 1st, FakePageviewtagger comes after + + assertTrue("executionStage is considered when building chain", + t1.getClass().getAnnotation(Tag.class).executionStage() < + t2.getClass().getAnnotation(Tag.class).executionStage()); + + } + + + /** + * No tags returns empty set + * @throws Exception + */ + @Test + public void testNoTags() throws Exception{ + + TaggerChain taggerChain = new TaggerChain(); + WebrequestData data = new WebrequestData("en.wikipedia","/", "", "200", + "text/html", "fake user agent", ""); + + // not tags thus far + assertTrue("no tags returns empty set", taggerChain.getTags(data).isEmpty()); + + } + + + /** + * Test portal tag + * @throws Exception + */ + @Test + public void testPortalHappyCase() throws Exception{ + + TaggerChain taggerChain = new TaggerChain(); + WebrequestData data = new WebrequestData("www.wikipedia.org","/", "", "200", + "text/html", "fake user agent", ""); + + assertTrue(taggerChain.getTags(data).size() == 1); + assertTrue(taggerChain.getTags(data).contains("portal")); + + } + + @Test + @FileParameters( + value = "src/test/resources/pageview_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsTaggedPageview( + String test_description, + String project, + String dialect, + String page_title, + 
boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, + String uri_host, + String uri_path, + String uri_query, + String http_status, + String content_type, + String user_agent, + String x_analytics_header + ) throws Exception { + //uses pageview data to see if a possible pageview tag is behaving as it should + PageviewDefinition PageviewDefinitionInstance = PageviewDefinition.getInstance(); + + WebrequestData data = new WebrequestData(uri_host, + uri_path, + uri_query, + http_status, + content_type, + user_agent, + x_analytics_header) ; + + TaggerChain taggerChain = new TaggerChain(); + + Set<String> tags = taggerChain.getTags(data); + + // if this a pageview we should have at least 1 tag: 'pageview' + + + if (is_pageview) { + assertTrue(test_description, tags.contains("pageview")); + } else { + assertFalse(test_description, tags.contains("pageview")); + } + + } + + + + + + +} \ No newline at end of file diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetWebrequestTagsUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetWebrequestTagsUDF.java new file mode 100644 index 0000000..90e0e74 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetWebrequestTagsUDF.java @@ -0,0 +1,100 @@ +package org.wikimedia.analytics.refinery.hive; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; +import org.wikimedia.analytics.refinery.core.webrequest.tag.TaggerChain; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; + +@UDFType(deterministic = true) +@Description(name = "tag", value = "_FUNC_(UA) - " + + "Returns an array of tags for a given request") + +// for CR: how about one UDF extending another one? +public class GetWebrequestTagsUDF extends IsPageviewUDF{ + + private TaggerChain taggerChain; + + /** + * Executed once per job, checks arguments size. + * Initializes the chain of taggers that can return a possible tag for the request + * @param arguments + * @return + * @throws UDFArgumentException + */ + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException{ + + try { + this.taggerChain = new TaggerChain(); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } catch (InstantiationException e) { + e.printStackTrace(); + } + + super.initialize(arguments); + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector); + + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException{ + + Set<String> tags = new HashSet<>(); + + String uriHost = PrimitiveObjectInspectorUtils.getString( + arguments[0].get(), (PrimitiveObjectInspector) argumentsOI[0]); + + String uriPath = PrimitiveObjectInspectorUtils.getString( + arguments[1].get(), (PrimitiveObjectInspector) argumentsOI[1]); + + String uriQuery = PrimitiveObjectInspectorUtils.getString( + arguments[2].get(), (PrimitiveObjectInspector) argumentsOI[2]); + + String httpStatus = PrimitiveObjectInspectorUtils.getString( + arguments[3].get(), (PrimitiveObjectInspector) argumentsOI[3]); + + String contentType = 
PrimitiveObjectInspectorUtils.getString( + arguments[4].get(), (PrimitiveObjectInspector) argumentsOI[4]); + + String userAgent = PrimitiveObjectInspectorUtils.getString( + arguments[5].get(), (PrimitiveObjectInspector) argumentsOI[5]); + + String rawXAnalyticsHeader = ""; + + if (checkForXAnalytics) { + rawXAnalyticsHeader = PrimitiveObjectInspectorUtils.getString( + arguments[6].get(), (PrimitiveObjectInspector) argumentsOI[6]); + } + + + WebrequestData webrequest = new WebrequestData(uriHost, uriPath, uriQuery, + httpStatus, contentType, userAgent, rawXAnalyticsHeader); + + // converting set to a list + tags = taggerChain.getTags(webrequest); + + return new ArrayList<String>(tags); + + + } + + @Override + public String getDisplayString(String[] arguments){ + return "GetWebrequestTagsUDF(" + arguments.toString() + ")"; + } +} \ No newline at end of file diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java index 8d034d4..439fb9e 100644 --- a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java @@ -26,7 +26,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.wikimedia.analytics.refinery.core.PageviewDefinition; -import org.wikimedia.analytics.refinery.core.WebrequestData; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; /** diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsRedirectToPageviewUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsRedirectToPageviewUDF.java index 77fe8b6..dab4eb3 100644 --- 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsRedirectToPageviewUDF.java +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsRedirectToPageviewUDF.java @@ -22,7 +22,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.wikimedia.analytics.refinery.core.PageviewDefinition; -import org.wikimedia.analytics.refinery.core.WebrequestData; +import org.wikimedia.analytics.refinery.core.webrequest.WebrequestData; /** diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetWebrequestRequestTagsUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetWebrequestRequestTagsUDF.java new file mode 100644 index 0000000..d6f36b6 --- /dev/null +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetWebrequestRequestTagsUDF.java @@ -0,0 +1,17 @@ +package org.wikimedia.analytics.refinery.hive; + +import junit.framework.TestCase; +import org.junit.Test; + +/** + * Created by nuriaruiz on 5/11/17. 
+ */ +public class TestGetWebrequestRequestTagsUDF extends TestCase { + + @Test + public void testMain() throws Exception { + assert(true); + + } + +} \ No newline at end of file diff --git a/refinery-tools/pom.xml b/refinery-tools/pom.xml index b07c4ad..a635c75 100644 --- a/refinery-tools/pom.xml +++ b/refinery-tools/pom.xml @@ -16,6 +16,16 @@ <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> <scope>provided</scope> + <exclusions> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </exclusion> + <exclusion> + <groupId>com.google.collections</groupId> + <artifactId>google-collections</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> -- To view, visit https://gerrit.wikimedia.org/r/353287 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I725e130431f3a869864275aa41af479fef9f157c Gerrit-PatchSet: 21 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: Nuria <nu...@wikimedia.org> Gerrit-Reviewer: Bearloga <mpo...@wikimedia.org> Gerrit-Reviewer: Chelsyx <c...@wikimedia.org> Gerrit-Reviewer: Fdans <fd...@wikimedia.org> Gerrit-Reviewer: Joal <j...@wikimedia.org> Gerrit-Reviewer: Mforns <mfo...@wikimedia.org> Gerrit-Reviewer: Nuria <nu...@wikimedia.org> Gerrit-Reviewer: Ottomata <ao...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits