package foobit.nutch;

import com.maxmind.geoip.LookupService;
import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;
import org.apache.oro.text.regex.MalformedPatternException;

import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.logging.Logger;

/**
 * IPRegexURLFilter is a subclass of RegexURLFilter that can filter:
 * - by URL regex (delegated to the superclass)
 * - by country, using com.maxmind.geoip.LookupService
 *
 * Configuration properties:
 *
 * urlfilter.ipregex.debuglevel     = 0 off, 1 on
 * urlfilter.ipregex.countrymatch   = pattern: [+-]C1,C2,...
 *      where C1,C2,... = comma separated list of two-character country codes
 *      (case-insensitive; stored upper-cased)
 *
 * countrymatch examples:
 *    +CA,MX,CU         Accept only hosts in Canada, Mexico, and Cuba
 *    -US               Accept hosts in all countries except USA
 *
 * If countrymatch is unset, empty, or lists no countries, country filtering
 * is disabled and only the regex rules apply.
 *
 * Requires a copy of the MaxMind GeoIP Country Free database (maxmind.com);
 * file must exist in the CLASSPATH as "GeoIP.dat". When country filtering is
 * configured and the database cannot be loaded, construction fails with an
 * IOException.
 *
 * (11/2004 kangas)
 */
public class IPRegexURLFilter extends net.nutch.net.RegexURLFilter
{
  private static final Logger LOG =
            LogFormatter.getLogger("foobit.nutch.IPRegexURLFilter");

  /** Upper-cased two-letter country codes read from countrymatch. */
  private HashSet countryset = new HashSet();

  /** true for a '+' (inclusion) list, false for a '-' (exclusion) list. */
  private boolean countrysign;

  /** Values >= 1 enable per-URL logging of filter decisions. */
  private int debugLevel;

  /** Shared GeoIP database handle; stays null if the database was not loaded. */
  private static LookupService lookupService;

  // ------------------------

  public IPRegexURLFilter()
          throws IOException, MalformedPatternException
  {
    super();
    init();
  }

  public IPRegexURLFilter(String filename)
          throws IOException, MalformedPatternException
  {
    super(filename);
    init();
  }

  /**
   * Filter one URL by regex and GeoIP country.
   *
   * The URL is first passed through the superclass regex filter. If it
   * survives and a country list is configured, the URL's host is resolved
   * and its country code looked up:
   * - '+' list: the URL is accepted only if the country is in the list
   * - '-' list: the URL is accepted only if the country is NOT in the list
   *
   * @param url String
   * @return the URL as returned by the regex filter, or null if the URL was
   *         rejected, is malformed, or its host could not be resolved
   */
  public String filter(String url)
  {
    String regexUrl = super.filter(url);

    if (regexUrl == null) {
      return null;
    }
    if (countryset.isEmpty()) {
      if (debugLevel >= 1) { LOG.info("GEOIP_OFF "+ regexUrl); }
      return regexUrl;
    }

    try {
      String host = new URL(regexUrl).getHost();

      // FIXME: DNS lookups are inherently slow. Can we parallelize this?
      InetAddress addr = InetAddress.getByName(host);
      String country = lookupService.getCountry(addr).getCode();

      boolean listed = countryset.contains(country);

      // Accept when list membership agrees with the list's sign: listed in
      // a '+' list, or not listed in a '-' list. (Previously an unlisted
      // country was always accepted, which made '+' lists filter nothing.)
      boolean accepted = (listed == countrysign);

      if (debugLevel >= 1) {
        String logTag;
        if (!accepted) {
          logTag = "GEOIP_REJECT";
        }
        else {
          logTag = listed ? "GEOIP_MATCH" : "GEOIP_PASS";
        }
        LOG.info(logTag +" "+ host +" "+ addr.toString() + " "+ country);
      }
      return accepted ? regexUrl : null;

    }
    catch (MalformedURLException e) {
      if (debugLevel >= 1) {
        LOG.log(java.util.logging.Level.INFO, "GEOIP_BADURL "+ regexUrl, e);
      }
      return null;
    }
    catch (UnknownHostException e) {
      if (debugLevel >= 1) {
        LOG.log(java.util.logging.Level.INFO, "GEOIP_NOHOST "+ regexUrl, e);
      }
      return null;
    }
  }


  /**
   * Calls initLookupService(), then fetches two values from NutchConf:
   * - urlfilter.ipregex.debuglevel
   * - urlfilter.ipregex.countrymatch
   *
   * @throws IOException if countrymatch is malformed, or if it lists
   *         countries but the GeoIP database could not be loaded
   */
  private synchronized void init() throws IOException
  {
    final String confKeyDebug = "urlfilter.ipregex.debuglevel";
    final String confKeyCountryMatch = "urlfilter.ipregex.countrymatch";

    initLookupService();

    // get debugLevel
    debugLevel = NutchConf.getInt(confKeyDebug, 0);

    // get countrymatch; a missing or blank value leaves country filtering off
    String line = NutchConf.get(confKeyCountryMatch);
    if (line != null) {
      line = line.trim();
    }
    if (line != null && line.length() > 0) {
      char first = line.charAt(0);
      switch (first) {
      case '+' :
        countrysign=true;
        break;
      case '-' :
        countrysign=false;
        break;
      default :
        throw new IOException(confKeyCountryMatch +": Invalid first character: "+line);
      }

      StringTokenizer st = new StringTokenizer(line.substring(1), ",");
      while (st.hasMoreTokens()) {
        String c = st.nextToken().trim();
        if (c.length() != 2) {
          throw new IOException(confKeyCountryMatch +": Invalid country value: "+c);
        }
        // Normalize case so "ca" and "CA" are equivalent; assumes the GeoIP
        // lookup reports upper-case codes. A fixed locale avoids
        // locale-specific case mappings (e.g. Turkish dotted I).
        countryset.add(c.toUpperCase(java.util.Locale.ENGLISH));
      }
    }

    // Fail fast: without the database every filter() call would hit a
    // NullPointerException on lookupService.
    if (!countryset.isEmpty() && lookupService == null) {
      throw new IOException(confKeyCountryMatch
          +": country filtering is configured but the GeoIP database could not be loaded");
    }
  }

  /**
   * Singleton initializer for lookupService.
   * We require the file "GeoIP.dat" to exist in the classpath as a local
   * file. If it is missing or not a "file:" resource, a severe message is
   * logged and lookupService is left null.
   *
   * @throws IOException if LookupService fails to open the database file
   */
  private static synchronized void initLookupService() throws IOException
  {
    final String geoipResourceName = "/GeoIP.dat";

    if (lookupService != null) {
      return;
    }

    URL resUrl = IPRegexURLFilter.class.getResource(geoipResourceName);

    if (resUrl == null) {
      LOG.severe("Can't find resourceName: " + geoipResourceName);
      return;
    }

    String urlTxt = resUrl.toExternalForm();

    // LookupService wants a filesystem path: strip the leading "file:" scheme
    if (urlTxt.startsWith("file:")) {
      urlTxt = urlTxt.substring(5);
      lookupService = new LookupService(urlTxt, LookupService.GEOIP_MEMORY_CACHE);
    }
    else {
      LOG.severe("Non-local URL "+ urlTxt +" for resourceName "+ geoipResourceName);
    }
  }
}
