Author: gates Date: Mon Oct 13 09:20:49 2008 New Revision: 704151 URL: http://svn.apache.org/viewvc?rev=704151&view=rev Log: PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, determines the host.
Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java Modified: incubator/pig/trunk/CHANGES.txt Modified: incubator/pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=704151&r1=704150&r2=704151&view=diff ============================================================================== --- incubator/pig/trunk/CHANGES.txt (original) +++ incubator/pig/trunk/CHANGES.txt Mon Oct 13 09:20:49 2008 @@ -364,3 +364,6 @@ PIG-486: Added SearchEngineExtractor, a piggybank eval func that recognizes a set of the most common search engines in a URL and extracts the name of the search engine (spackest via gates). + + PIG-487: Added HostExtractor, a piggybank eval func that, given a URL, + determines the host (spackest via gates). Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java?rev=704151&view=auto ============================================================================== --- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/util/apachelogparser/HostExtractor.java Mon Oct 13 09:20:49 2008 @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +/* + * HostExtractor takes a url and returns the host. For example, + * + * http://sports.espn.go.com/mlb/recap?gameId=281009122 + * + * leads to + * + * sports.espn.go.com + * + * Pig latin usage looks like + * + * host = FOREACH row GENERATE + * org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor(referer); + */ + +package org.apache.pig.piggybank.evaluation.util.apachelogparser; + + +import java.net.URL; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Tuple; + +public class HostExtractor extends EvalFunc<DataAtom> { + @Override + public void exec(Tuple input, DataAtom output) { + String string = input.getAtomField(0).strval(); + + if (string == null) + return; + + String host = null; + try { + host = new URL(string).getHost().toLowerCase(); + } catch (Exception e) { + } + if (host != null) + output.setValue(host); + } +} Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java?rev=704151&view=auto ============================================================================== --- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java (added) +++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/util/apachelogparser/TestHostExtractor.java Mon Oct 13 09:20:49 2008 @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the + * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. + */ + +package org.apache.pig.piggybank.test.evaluation.util.apachelogparser; + +import java.util.ArrayList; +import java.util.HashMap; + +import junit.framework.TestCase; + +import org.apache.pig.data.DataAtom; +import org.apache.pig.data.Datum; +import org.apache.pig.data.Tuple; +import org.apache.pig.piggybank.evaluation.util.apachelogparser.HostExtractor; +import org.apache.pig.piggybank.evaluation.util.apachelogparser.SearchEngineExtractor; +import org.junit.Test; + +public class TestHostExtractor extends TestCase { + private static HashMap<String, String> tests = new HashMap<String, String>(); + static { + tests.put("http://sports.espn.go.com/mlb/recap?gameId=281009122", "sports.espn.go.com"); + tests.put("http://www.google.com/search?hl=en&safe=active&rls=GGLG,GGLG:2005-24,GGLG:en&q=purpose+of+life&btnG=Search", "www.google.com"); + tests.put("http://search.msn.com/results.aspx?q=a+simple+test&geovar=56&FORM=REDIR", "search.msn.com"); + tests.put("http://www.altavista.com/web/results?itag=ody&q=a+simple+test&kgs=1&kls=0", "www.altavista.com"); + tests.put("dud", null); + } + + @Test + public void testInstantiation() { + assertNotNull(new SearchEngineExtractor()); + } + + @Test + public void testTests() { + HostExtractor hostExtractor = new HostExtractor(); + int testCount = 0; + for (String key : tests.keySet()) { + String expected = tests.get(key); + + ArrayList<Datum> input = new ArrayList<Datum>(); + input.add(new DataAtom(key)); + + DataAtom output = new DataAtom(); + hostExtractor.exec(new Tuple(input), output); + if (expected == null) + assertEquals(0, output.toString().length()); + else + assertEquals(expected, output.toString()); + testCount++; + } + assertEquals(tests.size(), testCount); + } +}