QChris has submitted this change and it was merged.

Change subject: Fix overriding MaxMind database location in Geocoding
......................................................................


Fix overriding MaxMind database location in Geocoding

Change-Id: Id2efa0292ca06c44a77807db0f0c76245d4a9753
---
M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
M 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
M 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java
5 files changed, 112 insertions(+), 36 deletions(-)

Approvals:
  QChris: Verified; Looks good to me, approved



diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
index 9abf990..1bfb909 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
@@ -29,15 +29,20 @@
 import com.maxmind.geoip2.model.CityResponse;
 import com.maxmind.geoip2.model.CountryResponse;
 import com.maxmind.geoip2.record.*;
+
 import org.apache.log4j.Logger;
 
 /**
  * Contains functions to find geo information of an IP address using Maxmind's 
GeoIP2
+ *
+ * TODO: Allow usage of this class without always instantiating both city and 
country databases.
  */
 public class Geocode {
+    // Default paths to Maxmind databases
+    public static final String DEFAULT_DATABASE_COUNTRY_PATH  = 
"/usr/share/GeoIP/GeoIP2-Country.mmdb";
+    public static final String DEFAULT_DATABASE_CITY_PATH     = 
"/usr/share/GeoIP/GeoIP2-City.mmdb";
 
-    private DatabaseReader countryDatabaseReader;
-    private DatabaseReader cityDatabaseReader;
+    static final Logger LOG = Logger.getLogger(Geocode.class.getName());
 
     //Constants to hold the keys to use in geo-coded data map
     private static final String CONTINENT = "continent";
@@ -53,21 +58,45 @@
     private static final String UNKNOWN_COUNTRY_CODE = "--";
     private static final String UNKNOWN_VALUE = "Unknown";
 
-    static final Logger LOG = Logger.getLogger(Geocode.class.getName());
+    private DatabaseReader countryDatabaseReader;
+    private DatabaseReader cityDatabaseReader;
 
+
+    /**
+     * Constructs a Geocode object with the default Maxmind 2 database paths.
+     * You can override either of the default database paths by setting
+     * the 'maxmind.database.country' and/or 'maxmind.database.city' 
properties.
+     */
     public Geocode() throws IOException {
-        // Default paths to Maxmind database
-        String defaultCountryDatabasePath = 
"/usr/share/GeoIP/GeoIP2-Country.mmdb";
-        String defaultCityDatabasePath = "/usr/share/GeoIP/GeoIP2-City.mmdb";
+        this(null, null);
+    }
 
-        String countryDatabasePath = 
System.getProperty("maxmind.database.country", defaultCountryDatabasePath);
-        String cityDatabasePath = System.getProperty("maxmind.database.city", 
defaultCityDatabasePath);
+    /**
+     * Constructs a Geocode object with the provided Maxmind 2 database paths.
+     * These are 'optional', in that you may set either one to null.  If null,
+     * the system properties 'maxmind.database.country' and 
'maxmind.database.city'
+     * will be checked for paths.  If these are not set, then this will 
default to
+     * DEFAULT_DATABASE_PATH_COUNTRY and DEFAULT_DATABASE_PATH_CITY 
respectively.
+     *
+     * @param countryDatabasePath
+     *      String path to Maxmind's country database
+     * @param cityDatabasePath
+     *      String path to Maxmind's city database
+     */
+    public Geocode(String countryDatabasePath, String cityDatabasePath) throws 
IOException {
+        // Override database paths with System properties, if they exist
+        if (countryDatabasePath == null) {
+            countryDatabasePath = 
System.getProperty("maxmind.database.country", DEFAULT_DATABASE_COUNTRY_PATH);
+        }
+        if (cityDatabasePath == null) {
+            cityDatabasePath = System.getProperty("maxmind.database.city", 
DEFAULT_DATABASE_CITY_PATH);
+        }
 
-        LOG.info("Country Database: " + countryDatabasePath);
-        LOG.info("City database: " + cityDatabasePath);
+        LOG.info("Geocode using Maxmind country database: " + 
countryDatabasePath);
+        LOG.info("Geocode using Maxmind city database: "    + 
cityDatabasePath);
 
         countryDatabaseReader = new DatabaseReader.Builder(new 
File(countryDatabasePath)).build();
-        cityDatabaseReader = new DatabaseReader.Builder(new 
File(cityDatabasePath)).build();
+        cityDatabaseReader    = new DatabaseReader.Builder(new 
File(cityDatabasePath)).build();
     }
 
     /**
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
index be19c94..bf64281 100644
--- 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
@@ -20,6 +20,7 @@
 
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.MapredContext;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
@@ -32,6 +33,7 @@
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
 
 import org.apache.log4j.Logger;
 import org.wikimedia.analytics.refinery.core.Geocode;
@@ -43,10 +45,12 @@
  *   ADD JAR /path/to/refinery-hive.jar;
  *   CREATE TEMPORARY FUNCTION geocode_country as 
'org.wikimedia.analytics.refinery.hive.GeocodedCountryUDF';
  *   SELECT geocode_country(ip) from webrequest where year = 2014 limit 10;
+ *
  * The above steps assume that the two required files - GeoIP2-Country.mmdb 
and GeoIP2-City.mmdb - are available
  * in their default path /usr/share/GeoIP. If not, then add the following 
steps:
- *   SET system:maxmind.database.country=/path/to/GeoIP2-Country.mmdb;
- *   SET system:maxmind.database.city=/path/to/GeoIP2-City.mmdb;
+ *
+ *   SET maxmind.database.country=/path/to/GeoIP2-Country.mmdb;
+ *   SET maxmind.database.city=/path/to/GeoIP2-City.mmdb;
  */
 @UDFType(deterministic = true)
 @Description(
@@ -87,21 +91,32 @@
         //Cache the argument to be used in evaluate
         argumentOI = arg1;
 
+        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    }
+
+    @Override
+    public void configure(MapredContext context) {
         if (geocode == null) {
             try {
-                geocode = new Geocode();
+                JobConf jobConf = context.getJobConf();
+                geocode = new Geocode(
+                    jobConf.getTrimmed("maxmind.database.country"),
+                    jobConf.getTrimmed("maxmind.database.city")
+                );
             } catch (IOException ex) {
                 LOG.error(ex);
                 throw new RuntimeException(ex);
             }
         }
 
-        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+        super.configure(context);
     }
 
     @SuppressWarnings("unchecked")
     @Override
     public Object evaluate(DeferredObject[] arguments) throws HiveException {
+        assert geocode != null : "Evaluate called without initializing 
'geocode'";
+
         result.clear();
 
         if (arguments.length == 1 && argumentOI != null && arguments[0] != 
null) {
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java
index 1b2fe31..1426eee 100644
--- 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedDataUDF.java
@@ -17,6 +17,7 @@
 package org.wikimedia.analytics.refinery.hive;
 
 import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.MapredContext;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
@@ -28,6 +29,8 @@
 import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.apache.hadoop.mapred.JobConf;
+
 import org.apache.log4j.Logger;
 import org.wikimedia.analytics.refinery.core.Geocode;
 
@@ -42,10 +45,12 @@
  *   ADD JAR /path/to/refinery-hive.jar;
  *   CREATE TEMPORARY FUNCTION geocode_data as 
'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF';
  *   SELECT geocode_data(ip)['country'], geocode_data(ip)['city'] from 
webrequest where year = 2014 limit 10;
+ *
  * The above steps assume that the two required files - GeoIP2-Country.mmdb 
and GeoIP2-City.mmdb - are available
  * in their default path /usr/share/GeoIP. If not, then add the following 
steps:
- *   SET system:maxmind.database.country=/path/to/GeoIP2-Country.mmdb;
- *   SET system:maxmind.database.city=/path/to/GeoIP2-City.mmdb;
+ *
+ *   SET maxmind.database.country=/path/to/GeoIP2-Country.mmdb;
+ *   SET maxmind.database.city=/path/to/GeoIP2-City.mmdb;
  */
 @UDFType(deterministic = true)
 @Description(name = "geocoded_data", value = "_FUNC_(ip) - "
@@ -98,20 +103,29 @@
 
         argumentOI = arg1;
 
+        result = new HashMap<String, String>();
+
+        return ObjectInspectorFactory.getStandardMapObjectInspector(
+                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+    }
+
+    @Override
+    public void configure(MapredContext context) {
         if (geocode == null) {
             try {
-                geocode = new Geocode();
+                JobConf jobConf = context.getJobConf();
+                geocode = new Geocode(
+                    jobConf.getTrimmed("maxmind.database.country"),
+                    jobConf.getTrimmed("maxmind.database.city")
+                );
             } catch (IOException ex) {
                 LOG.error(ex);
                 throw new RuntimeException(ex);
             }
         }
 
-        result = new HashMap<String, String>();
-
-        return ObjectInspectorFactory.getStandardMapObjectInspector(
-                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
-                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+        super.configure(context);
     }
 
     /**
@@ -134,6 +148,8 @@
     @SuppressWarnings("unchecked")
     @Override
     public Object evaluate(DeferredObject[] arguments) throws HiveException {
+        assert geocode != null : "Evaluate called without initializing 
'geocode'";
+
         result.clear();
 
         if (arguments.length == 1 && argumentOI != null && arguments[0] != 
null) {
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java
index bc0a9ea..55adfb6 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedCountryUDF.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Copyright (C) 2014  Wikimedia Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,6 +15,7 @@
  */
 package org.wikimedia.analytics.refinery.hive;
 
+import org.apache.hadoop.hive.ql.exec.MapredContext;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
@@ -23,6 +24,9 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import java.io.IOException;
+
 import org.junit.Test;
 import static org.junit.Assert.*;
 
@@ -48,42 +52,47 @@
     }
 
     @Test
-    public void testEvaluateWithValidIPv4() throws HiveException {
+    public void testEvaluateWithValidIPv4() throws HiveException, IOException {
         ObjectInspector value1 = 
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
         ObjectInspector[] initArguments = new ObjectInspector[]{value1};
         GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF();
 
         geocodedCountryUDF.initialize(initArguments);
+        geocodedCountryUDF.configure(MapredContext.init(false, new JobConf()));
 
         //IPv4 addresses taken from Maxmind's test suite
         String ip = "81.2.69.160";
         DeferredObject[] args = new DeferredObject[] { new 
DeferredJavaObject(ip) };
         Text result = (Text)geocodedCountryUDF.evaluate(args);
         assertEquals("ISO country code check", "GB", result.toString());
+        geocodedCountryUDF.close();
     }
 
     @Test
-    public void testEvaluateWithValidIPv6() throws HiveException {
+    public void testEvaluateWithValidIPv6() throws HiveException, IOException {
         ObjectInspector value1 = 
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
         ObjectInspector[] initArguments = new ObjectInspector[]{value1};
         GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF();
 
         geocodedCountryUDF.initialize(initArguments);
+        geocodedCountryUDF.configure(MapredContext.init(false, new JobConf()));
 
         //IPv6 representation of an IPv4 address taken from Maxmind's test 
suite
         String ip = "::ffff:81.2.69.160";
         DeferredObject[] args = new DeferredObject[] { new 
DeferredJavaObject(ip) };
         Text result = (Text)geocodedCountryUDF.evaluate(args);
         assertEquals("ISO country code check", "GB", result.toString());
+        geocodedCountryUDF.close();
     }
 
     @Test
-    public void testEvaluateWithInvalidIP() throws HiveException {
+    public void testEvaluateWithInvalidIP() throws HiveException, IOException {
         ObjectInspector value1 = 
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
         ObjectInspector[] initArguments = new ObjectInspector[]{value1};
         GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF();
 
         geocodedCountryUDF.initialize(initArguments);
+        geocodedCountryUDF.configure(MapredContext.init(false, new JobConf()));
 
         //Invalid IPv4 addresses
         String ip = "-";
@@ -95,22 +104,22 @@
         args = new DeferredObject[] { new DeferredJavaObject(ip) };
         result = (Text)geocodedCountryUDF.evaluate(args);
         assertEquals("ISO country code check", "--", result.toString());
+        geocodedCountryUDF.close();
     }
 
     @Test
-    public void testIPWithoutIsoCode() throws HiveException {
+    public void testIPWithoutIsoCode() throws HiveException, IOException {
         ObjectInspector value1 = 
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
         ObjectInspector[] initArguments = new ObjectInspector[]{value1};
         GeocodedCountryUDF geocodedCountryUDF = new GeocodedCountryUDF();
 
         geocodedCountryUDF.initialize(initArguments);
+        geocodedCountryUDF.configure(MapredContext.init(false, new JobConf()));
 
         String ip = "2a02:d500::"; // IP for EU
         DeferredObject[] args = new DeferredObject[] { new 
DeferredJavaObject(ip) };
         Text result = (Text)geocodedCountryUDF.evaluate(args);
         assertEquals("ISO country code check", "--", result.toString());
-
-        // Consistency trumps good style. Hence not closing the UDF, as other
-        // UDFs in this test case aren't closed either :-(
+        geocodedCountryUDF.close();
     }
 }
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java
index c5b8070..04b37cd 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGeocodedDataUDF.java
@@ -15,6 +15,7 @@
  */
 package org.wikimedia.analytics.refinery.hive;
 
+import org.apache.hadoop.hive.ql.exec.MapredContext;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
@@ -22,6 +23,9 @@
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.mapred.JobConf;
+import java.io.IOException;
+
 import org.junit.Test;
 import static org.junit.Assert.*;
 
@@ -47,7 +51,7 @@
     }
 
     @Test
-    public void testEvaluateWithValidIPv4() throws HiveException {
+    public void testEvaluateWithValidIPv4() throws HiveException, IOException {
         //IPv4 addresses taken from Maxmind's test suite
         String ip = "81.2.69.160";
         Map<String, String> result = evaluate (ip);
@@ -64,7 +68,7 @@
     }
 
     @Test
-    public void testEvaluateWithValidIPv6() throws HiveException {
+    public void testEvaluateWithValidIPv6() throws HiveException, IOException {
         //IPv6 representation of an IPv4 address taken from Maxmind's test 
suite
         String ip = "::ffff:81.2.69.160";
         Map<String, String> result = evaluate (ip);
@@ -81,7 +85,7 @@
     }
 
     @Test
-    public void testEvaluateWithInvalidIP() throws HiveException {
+    public void testEvaluateWithInvalidIP() throws HiveException, IOException {
         //Invalid IP
         String ip = "-";
         Map<String, String> result = evaluate(ip);
@@ -110,14 +114,17 @@
         assertEquals("Timezone check", "Unknown", result.get("timezone"));
     }
 
-    private Map<String, String> evaluate(String ip) throws HiveException {
+    private Map<String, String> evaluate(String ip) throws HiveException, 
IOException {
         ObjectInspector value1 = 
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
         ObjectInspector[] initArguments = new ObjectInspector[]{value1};
         GeocodedDataUDF geocodedDataUDF = new GeocodedDataUDF();
 
         geocodedDataUDF.initialize(initArguments);
+        geocodedDataUDF.configure(MapredContext.init(false, new JobConf()));
 
         DeferredObject[] args = new DeferredObject[] { new 
DeferredJavaObject(ip) };
-        return (Map<String, String>)geocodedDataUDF.evaluate(args);
+        Map<String, String> result = (Map<String, 
String>)geocodedDataUDF.evaluate(args);
+        geocodedDataUDF.close();
+        return result;
     }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/188012
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id2efa0292ca06c44a77807db0f0c76245d4a9753
Gerrit-PatchSet: 7
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: Ananthrk <anant...@ymxdata.com>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: QChris <christ...@quelltextlich.at>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to