/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.net;

import java.net.URL;
import java.net.MalformedURLException;
// import java.net.URI;
// import java.net.URISyntaxException;

import java.util.Locale;
import java.util.logging.Logger;
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;

import net.nutch.util.LogFormatter;
import org.apache.oro.text.regex.*;

/**
 * Converts URLs to a normal form.
 *
 * <p>For <code>http</code> and <code>ftp</code> URLs the normalizer lowercases
 * the host, drops an explicitly given default port, supplies "/" for an empty
 * file, removes the fragment (ref) and collapses unnecessary "/xx/../" and
 * leading "/../" sequences in the path. URLs of other protocols are only
 * rebuilt when the protocol was not already written in the form reported by
 * {@link URL#getProtocol()}.
 *
 * <p>NOTE(review): a single {@link PatternMatcher} instance is reused by every
 * call to {@link #normalize(String)}; confirm the matcher's thread-safety
 * before sharing one normalizer between threads.
 */
public class BasicUrlNormalizer implements UrlNormalizer {
    public static final Logger LOG =
            LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer");

    private Perl5Compiler compiler = new Perl5Compiler();
    private PatternMatcher matcher = new Perl5Matcher();

    /** Rule replacing the first "/xx/../" in a path by "/". */
    Rule relativePathRule = null;

    /** Rule replacing one or more leading "/../" in a path by "/". */
    Rule leadingRelativePathRule = null;

    /**
     * Compiles the two path-rewriting rules.
     *
     * @throws IllegalStateException if one of the built-in patterns fails to
     *         compile; the normalizer would otherwise be left with
     *         <code>null</code> rules and fail later with a
     *         <code>NullPointerException</code>
     */
    public BasicUrlNormalizer() {
        try {
            // Finds spots like "/xx/../" in the path, which can be replaced
            // by "/". xx consists of characters other than "/" (slash) and
            // needs at least one character different from ".".
            relativePathRule = compileRule("(/[^/]*[^/.]{1}[^/]*/\\.\\./)", "/");

            // Finds one or more leading "/../" in the path, which can be
            // replaced by "/".
            leadingRelativePathRule = compileRule("^(/\\.\\./)+", "/");
        } catch (MalformedPatternException e) {
            // The patterns are constants, so this indicates a programming
            // error. Fail fast and keep the cause (initCause keeps this
            // compatible with pre-Java-5 class libraries).
            throw (IllegalStateException) new IllegalStateException(
                    "Cannot compile built-in URL normalization pattern: " + e.getMessage())
                    .initCause(e);
        }
    }

    /**
     * Returns the normal form of the given URL.
     *
     * <p>The empty string is returned unchanged. Any other input is trimmed
     * and parsed as a {@link URL}. If no normalization step changed anything,
     * the trimmed input is returned as is; otherwise the URL is rebuilt from
     * protocol, host, port and file only, which means a fragment is dropped.
     * NOTE(review): user-info, if present, is not carried over into a rebuilt
     * URL either.
     *
     * @param urlString the URL to normalize
     * @return the normalized URL string
     * @throws MalformedURLException if the trimmed string cannot be parsed as
     *         a URL, or the rebuilt URL is invalid
     */
    public String normalize(String urlString)
            throws MalformedURLException {
        if ("".equals(urlString)) {                   // permit empty
            return urlString;
        }

        urlString = urlString.trim();                 // remove extra spaces

        URL url = new URL(urlString);

        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        String file = url.getFile();                  // path plus optional "?query"

        // URL reports the protocol in lowercase; a mismatch means the input
        // spelled it differently.
        boolean changed = !urlString.startsWith(protocol);

        if ("http".equals(protocol) || "ftp".equals(protocol)) {

            if (host != null) {
                // Host names are ASCII case-insensitive. Lowercase them with a
                // fixed locale so that e.g. a Turkish default locale does not
                // turn "I" into a dotless "i".
                String newHost = host.toLowerCase(Locale.ENGLISH);
                if (!host.equals(newHost)) {
                    host = newHost;
                    changed = true;
                }
            }

            if (port == url.getDefaultPort()) {       // uses default port
                port = -1;                            // so don't specify it
                changed = true;
            }

            if (file == null || "".equals(file)) {    // add a slash
                file = "/";
                changed = true;
            }

            if (url.getRef() != null) {               // remove the ref
                changed = true;
            }

            // check for unnecessary use of "/../"
            String file2 = substituteUnnecessaryRelativePaths(file);

            if (!file.equals(file2)) {
                changed = true;
                file = file2;
            }

        }

        if (changed) {
            urlString = new URL(protocol, host, port, file).toString();
        }

        return urlString;
    }

    /**
     * Removes unnecessary "/xx/../" and leading "/../" sequences from the path
     * part of <code>file</code>. A query string (everything from the first
     * "?") is left untouched and re-attached, because dot-segments are only
     * meaningful inside the path.
     *
     * @param file the file part of a URL: path, optionally followed by
     *        "?query"
     * @return <code>file</code> with the path's relative segments collapsed
     */
    private String substituteUnnecessaryRelativePaths(String file) {
        // Split off the query so that the patterns can neither rewrite it nor
        // match across the "?" boundary.
        int queryStart = file.indexOf('?');
        String path = queryStart < 0 ? file : file.substring(0, queryStart);
        String query = queryStart < 0 ? "" : file.substring(queryStart);

        /*
         * All substitutions are done step by step, to ensure that nested
         * constellations are normalized, too.
         *
         * For example, "/aa/bb/../../cc/../foo.html" is normalized in the
         * following manner:
         *   "/aa/bb/../../cc/../foo.html"
         *   "/aa/../cc/../foo.html"
         *   "/cc/../foo.html"
         *   "/foo.html"
         *
         * The normalization also takes care of leading "/../", which is
         * replaced by "/", because this is rather a sign of bad webserver
         * configuration than of a wanted link. For example, urls like
         * "http://www.foo.com/../" should return a http 404 error instead of
         * redirecting to "http://www.foo.com".
         */
        String previous;
        do {
            previous = path;

            // substitute first occurrence of "/xx/../" by "/"
            path = Util.substitute(matcher, relativePathRule.pattern,
                    relativePathRule.substitution, path, 1);

            // remove leading "/../"
            path = Util.substitute(matcher, leadingRelativePathRule.pattern,
                    leadingRelativePathRule.substitution, path, 1);
        } while (!path.equals(previous));             // stop once a pass changes nothing

        return path + query;
    }

    /**
     * Builds a rule from a Perl5 regular expression and its replacement text.
     *
     * @param regex the Perl5 pattern to compile
     * @param replacement the text substituted for each match
     * @return the compiled rule
     * @throws MalformedPatternException if <code>regex</code> is not a valid
     *         pattern
     */
    private Rule compileRule(String regex, String replacement)
            throws MalformedPatternException {
        Rule rule = new Rule();
        rule.pattern = (Perl5Pattern) compiler.compile(regex);
        rule.substitution = new Perl5Substitution(replacement);
        return rule;
    }


    /**
     * Class which holds a compiled pattern and its corresponding substitution string.
     */
    private static class Rule {
        public Perl5Pattern pattern;
        public Perl5Substitution substitution;
    }

}

