QChris has submitted this change and it was merged. Change subject: Fix overriding MaxMind database location in Geocoding ......................................................................
Fix overriding MaxMind database location in Geocoding Change-Id: Id2efa0292ca06c44a77807db0f0c76245d4a9753 --- M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java M refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java M refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java 5 files changed, 112 insertions(+), 36 deletions(-) Approvals: QChris: Verified; Looks good to me, approved diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java index 9abf990..1bfb909 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java @@ -29,15 +29,20 @@ import com.maxmind.geoip2.model.CityResponse; import com.maxmind.geoip2.model.CountryResponse; import com.maxmind.geoip2.record.*; + import org.apache.log4j.Logger; /** * Contains functions to find geo information of an IP address using Maxmind's GeoIP2 + * + * TODO: Allow usage of this class without always instantiating both city and country databases. */ public class Geocode { + // Default paths to Maxmind databases + public static final String DEFAULT_DATABASE_COUNTRY_PATH = "/usr/share/GeoIP/GeoIP2-Country.mmdb"; + public static final String DEFAULT_DATABASE_CITY_PATH = "/usr/share/GeoIP/GeoIP2-City.mmdb"; - private DatabaseReader countryDatabaseReader; - private DatabaseReader cityDatabaseReader; + static final Logger LOG = Logger.getLogger(Geocode.class.getName()); //Constants to hold the keys to use in geo-coded data map private static final String CONTINENT = "continent"; @@ -53,21 +58,45 @@ private static final String UNKNOWN_COUNTRY_CODE = "--"; private static final String UNKNOWN_VALUE = "Unknown"; - static final Logger LOG = Logger.getLogger(Geocode.class.getName()); + private DatabaseReader countryDatabaseReader; + private DatabaseReader cityDatabaseReader; + + /** + * Constructs a Geocode object with the default Maxmind 2 database paths. + * You can override either of the default database paths by setting + * the 'maxmind.database.country' and/or 'maxmind.database.city' properties. + */ public Geocode() throws IOException { - // Default paths to Maxmind database - String defaultCountryDatabasePath = "/usr/share/GeoIP/GeoIP2-Country.mmdb"; - String defaultCityDatabasePath = "/usr/share/GeoIP/GeoIP2-City.mmdb"; + this(null, null); + } - String countryDatabasePath = System.getProperty("maxmind.database.country", defaultCountryDatabasePath); - String cityDatabasePath = System.getProperty("maxmind.database.city", defaultCityDatabasePath); + /** + * Constructs a Geocode object with the provided Maxmind 2 database paths. + * These are 'optional', in that you may set either one to null. If null, + * the system properties 'maxmind.database.country' and 'maxmind.database.city' + * will be checked for paths. If these are not set, then this will default to + * DEFAULT_DATABASE_PATH_COUNTRY and DEFAULT_DATABASE_PATH_CITY respectively. + * + * @param countryDatabasePath + * String path to Maxmind's country database + * @param cityDatabasePath + * String path to Maxmind's city database + */ + public Geocode(String countryDatabasePath, String cityDatabasePath) throws IOException { + // Override database paths with System properties, if they exist + if (countryDatabasePath == null) { + countryDatabasePath = System.getProperty("maxmind.database.country", DEFAULT_DATABASE_COUNTRY_PATH); + } + if (cityDatabasePath == null) { + cityDatabasePath = System.getProperty("maxmind.database.city", DEFAULT_DATABASE_CITY_PATH); + } - LOG.info("Country Database: " + countryDatabasePath); - LOG.info("City database: " + cityDatabasePath); + LOG.info("Geocode using Maxmind country database: " + countryDatabasePath); + LOG.info("Geocode using Maxmind city database: " + cityDatabasePath); countryDatabaseReader = new DatabaseReader.Builder(new File(countryDatabasePath)).build(); - cityDatabaseReader = new DatabaseReader.Builder(new File(cityDatabasePath)).build(); + cityDatabaseReader = new DatabaseReader.Builder(new File(cityDatabasePath)).build(); } /** diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java index be19c94..bf64281 100644 --- a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; @@ -32,6 +33,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; import org.apache.log4j.Logger; import org.wikimedia.analytics.refinery.core.Geocode; @@ -43,10 +45,12 @@ * ADD JAR /path/to/refinery-hive.jar; * CREATE TEMPORARY FUNCTION geocode_country as 'org.wikimedia.analytics.refinery.hive.GeocodedCountryUDF'; * SELECT geocode_country(ip) from webrequest where year = 2014 limit 10; + * * The above steps assume that the two required files - GeoIP2-Country.mmdb and GeoIP2-City.mmdb - are available * in their default path /usr/share/GeoIP. If not, then add the following steps: - * SET system:maxmind.database.country=/path/to/GeoIP2-Country.mmdb; - * SET system:maxmind.database.city=/path/to/GeoIP2-City.mmdb; + * + * SET maxmind.database.country=/path/to/GeoIP2-Country.mmdb; + * SET maxmind.database.city=/path/to/GeoIP2-City.mmdb; */ @UDFType(deterministic = true) @Description( @@ -87,21 +91,32 @@ //Cache the argument to be used in evaluate argumentOI = arg1; + return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + } + + @Override + public void configure(MapredContext context) { if (geocode == null) { try { - geocode = new Geocode(); + JobConf jobConf = context.getJobConf(); + geocode = new Geocode( + jobConf.getTrimmed("maxmind.database.country"), + jobConf.getTrimmed("maxmind.database.city") + ); } catch (IOException ex) { LOG.error(ex); throw new RuntimeException(ex); } } - return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + super.configure(context); } @SuppressWarnings("unchecked") @Override public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert geocode != null : "Evaluate called without initializing 'geocode'"; + result.clear(); if (arguments.length == 1 && argumentOI != null && arguments[0] != null) { diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java index 1b2fe31..1426eee 100644 --- a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java @@ -17,6 +17,7 @@ package org.wikimedia.analytics.refinery.hive; import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; @@ -28,6 +29,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.mapred.JobConf; + import org.apache.log4j.Logger; import org.wikimedia.analytics.refinery.core.Geocode; @@ -42,10 +45,12 @@ * ADD JAR /path/to/refinery-hive.jar; * CREATE TEMPORARY FUNCTION geocode_data as 'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF'; * SELECT geocode_data(ip)['country'], geocode_data(ip)['city'] from webrequest where year = 2014 limit 10; + * * The above steps assume that the two required files - GeoIP2-Country.mmdb and GeoIP2-City.mmdb - are available * in their default path /usr/share/GeoIP. If not, then add the following steps: - * SET system:maxmind.database.country=/path/to/GeoIP2-Country.mmdb; - * SET system:maxmind.database.city=/path/to/GeoIP2-City.mmdb; + * + * SET maxmind.database.country=/path/to/GeoIP2-Country.mmdb; + * SET maxmind.database.city=/path/to/GeoIP2-City.mmdb; */ @UDFType(deterministic = true) @Description(name = "geocoded_data", value = "_FUNC_(ip) - " @@ -98,20 +103,29 @@ argumentOI = arg1; + result = new HashMap<String, String>(); + + return ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + PrimitiveObjectInspectorFactory.javaStringObjectInspector); + } + + @Override + public void configure(MapredContext context) { if (geocode == null) { try { - geocode = new Geocode(); + JobConf jobConf = context.getJobConf(); + geocode = new Geocode( + jobConf.getTrimmed("maxmind.database.country"), + jobConf.getTrimmed("maxmind.database.city") + ); } catch (IOException ex) { LOG.error(ex); throw new RuntimeException(ex); } } - result = new HashMap<String, String>(); - - return ObjectInspectorFactory.getStandardMapObjectInspector( - PrimitiveObjectInspectorFactory.javaStringObjectInspector, - PrimitiveObjectInspectorFactory.javaStringObjectInspector); + super.configure(context); } /** @@ -134,6 +148,8 @@ @SuppressWarnings("unchecked") @Override public Object evaluate(DeferredObject[] arguments) throws HiveException { + assert geocode != null : "Evaluate called without initializing 'geocode'"; + result.clear(); if (arguments.length == 1 && argumentOI != null && arguments[0] != null) { diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java index bc0a9ea..55adfb6 100644 --- a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java @@ -1,4 +1,4 @@ -/** +/* * Copyright (C) 2014 Wikimedia Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,6 +15,7 @@ */ package org.wikimedia.analytics.refinery.hive; +import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; @@ -23,6 +24,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import java.io.IOException; + import org.junit.Test; import static org.junit.Assert.*; @@ -48,42 +52,47 @@ } @Test - public void testEvaluateWithValidIPv4() throws HiveException { + public void testEvaluateWithValidIPv4() throws HiveException, IOException { ObjectInspector value1 = PrimitiveObjectInspectorFactory.javaStringObjectInspector; ObjectInspector[] initArguments = new ObjectInspector[]{value1}; GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF(); geocodedCountryUDF.initialize(initArguments); + geocodedCountryUDF.configure(MapredContext.init(false, new JobConf())); //IPv4 addresses taken from Maxmind's test suite String ip = "81.2.69.160"; DeferredObject[] args = new DeferredObject[] { new DeferredJavaObject(ip) }; Text result = (Text)geocodedCountryUDF.evaluate(args); assertEquals("ISO country code check", "GB", result.toString()); + geocodedCountryUDF.close(); } @Test - public void testEvaluateWithValidIPv6() throws HiveException { + public void testEvaluateWithValidIPv6() throws HiveException, IOException { ObjectInspector value1 = PrimitiveObjectInspectorFactory.javaStringObjectInspector; ObjectInspector[] initArguments = new ObjectInspector[]{value1}; GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF(); geocodedCountryUDF.initialize(initArguments); + geocodedCountryUDF.configure(MapredContext.init(false, new JobConf())); //IPv6 representation of an IPv4 address taken from Maxmind's test suite String ip = "::ffff:81.2.69.160"; DeferredObject[] args = new DeferredObject[] { new DeferredJavaObject(ip) }; Text result = (Text)geocodedCountryUDF.evaluate(args); assertEquals("ISO country code check", "GB", result.toString()); + geocodedCountryUDF.close(); } @Test - public void testEvaluateWithInvalidIP() throws HiveException { + public void testEvaluateWithInvalidIP() throws HiveException, IOException { ObjectInspector value1 = PrimitiveObjectInspectorFactory.javaStringObjectInspector; ObjectInspector[] initArguments = new ObjectInspector[]{value1}; GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF(); geocodedCountryUDF.initialize(initArguments); + geocodedCountryUDF.configure(MapredContext.init(false, new JobConf())); //Invalid IPv4 addresses String ip = "-"; @@ -95,22 +104,22 @@ args = new DeferredObject[] { new DeferredJavaObject(ip) }; result = (Text)geocodedCountryUDF.evaluate(args); assertEquals("ISO country code check", "--", result.toString()); + geocodedCountryUDF.close(); } @Test - public void testIPWithoutIsoCode() throws HiveException { + public void testIPWithoutIsoCode() throws HiveException, IOException { ObjectInspector value1 = PrimitiveObjectInspectorFactory.javaStringObjectInspector; ObjectInspector[] initArguments = new ObjectInspector[]{value1}; GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF(); geocodedCountryUDF.initialize(initArguments); + geocodedCountryUDF.configure(MapredContext.init(false, new JobConf())); String ip = "2a02:d500::"; // IP for EU DeferredObject[] args = new DeferredObject[] { new DeferredJavaObject(ip) }; Text result = (Text)geocodedCountryUDF.evaluate(args); assertEquals("ISO country code check", "--", result.toString()); - - // Consistency trumps good style. Hence not closing the UDF, as other - // UDFs in this test case aren't closed either :-( + geocodedCountryUDF.close(); } } diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java index c5b8070..04b37cd 100644 --- a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java @@ -15,6 +15,7 @@ */ package org.wikimedia.analytics.refinery.hive; +import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; @@ -22,6 +23,9 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.mapred.JobConf; +import java.io.IOException; + import org.junit.Test; import static org.junit.Assert.*; @@ -47,7 +51,7 @@ } @Test - public void testEvaluateWithValidIPv4() throws HiveException { + public void testEvaluateWithValidIPv4() throws HiveException, IOException { //IPv4 addresses taken from Maxmind's test suite String ip = "81.2.69.160"; Map<String, String> result = evaluate (ip); @@ -64,7 +68,7 @@ } @Test - public void testEvaluateWithValidIPv6() throws HiveException { + public void testEvaluateWithValidIPv6() throws HiveException, IOException { //IPv6 representation of an IPv4 address taken from Maxmind's test suite String ip = "::ffff:81.2.69.160"; Map<String, String> result = evaluate (ip); @@ -81,7 +85,7 @@ } @Test - public void testEvaluateWithInvalidIP() throws HiveException { + public void testEvaluateWithInvalidIP() throws HiveException, IOException { //Invalid IP String ip = "-"; Map<String, String> result = evaluate(ip); @@ -110,14 +114,17 @@ assertEquals("Timezone check", "Unknown", result.get("timezone")); } - private Map<String, String> evaluate(String ip) throws HiveException { + private Map<String, String> evaluate(String ip) throws HiveException, IOException { ObjectInspector value1 = PrimitiveObjectInspectorFactory.javaStringObjectInspector; ObjectInspector[] initArguments = new ObjectInspector[]{value1}; GeocodedDataUDF geocodedDataUDF = new GeocodedDataUDF(); geocodedDataUDF.initialize(initArguments); + geocodedDataUDF.configure(MapredContext.init(false, new JobConf())); DeferredObject[] args = new DeferredObject[] { new DeferredJavaObject(ip) }; - return (Map<String, String>)geocodedDataUDF.evaluate(args); + Map<String, String> result = (Map<String, String>)geocodedDataUDF.evaluate(args); + geocodedDataUDF.close(); + return result; } } -- To view, visit https://gerrit.wikimedia.org/r/188012 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Id2efa0292ca06c44a77807db0f0c76245d4a9753 Gerrit-PatchSet: 7 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: QChris <christ...@quelltextlich.at> Gerrit-Reviewer: Ananthrk <anant...@ymxdata.com> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: QChris <christ...@quelltextlich.at> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits