'''
levenshtein.py

Copyright 2008 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

'''

import difflib
import pprint

from upper_bounds import UPPER_BOUNDS

def relative_distance_boolean(a_str, b_str, threshold=0.6):
    '''
    Indicates if the strings to compare are enough "similar". This (optimized)
    function is equivalent to the expression:
    relative_distance(x, y) >= threshold
    
    @param a_str: A string object
    @param b_str: A string object
    @param threshold: Float value indicating the expected "similarity". Must be 0 <= threshold <= 1.0
    @return: A boolean value
    '''

    #Next three lines makes sure we have something in a_str and b_str that is iterable
    #and something that is a number in threshold. They do the job to raise TypeErrors
    iter(a_str)
    iter(b_str)
    threshold = float(threshold)

    if threshold < 0 or threshold > 1.0:
        raise TypeError('Parameter threshold has to be: 0<=threshold<=1.0')

    if threshold == 0:
        return True
    elif threshold == 1.0:
        return a_str == b_str

    # First we need b_str to be the longer of both
    if len(b_str) < len(a_str):
        a_str, b_str = b_str, a_str

    alen = len(a_str)
    blen = len(b_str)

    if blen == 0 or alen == 0:
        return alen == blen

    ratio = float(blen) / alen

    last_ratio, last_bound = UPPER_BOUNDS[-1]

    if threshold < last_bound:
        # Bad, we can't optimize anything here
        return relative_distance(a_str, b_str) >= threshold
    else:
        if last_ratio < ratio:
            # Good, we know the upper bound
            return False
        else:
            # We have to step through
            for size_ratio, bound in UPPER_BOUNDS:
                if size_ratio > ratio:
                    # Bad: we have to do the relative_distance
                    return relative_distance(a_str, b_str) >= threshold
                elif bound < threshold:
                    # Good: We found an upper bound
                    return False


def relative_distance_ge(a_str, b_str, threshold=0.6):
    '''
    Indicates if the 'similarity' index between strings
    is *greater equal* than 'threshold'. See 'relative_distance_boolean'.
    '''
    return relative_distance_boolean(a_str, b_str, threshold)

def relative_distance_lt(a_str, b_str, threshold=0.6):
    '''
    Indicates if the 'similarity' index between strings
    is *less than* 'threshold'
    '''
    return not relative_distance_boolean(a_str, b_str, threshold)


def relative_distance(a_str, b_str):
    '''
    Measures the "similarity" of the strings. A return value value over 0.6
    means the strings are close matches.
    
    @param a_str: A string object
    @param b_str: A string object
    @return: A float with the distance
    '''
    return difflib.SequenceMatcher(None, a_str, b_str).quick_ratio()


def _generate_upper_bounds(left_max=40, right_max=30):
    '''
    This function can be used to produce new upper bounds,
    but shouldn't be used in productive code. Simply run this
    command once and then hardcode the list.
    '''

    UPPER_BOUNDS = set()
    UPPER_BOUNDS.add((1.0, 1.0))

    def addToBounds(a, b):
        size = float(len(b)) / len(a)
        upper_bound = relative_distance(a, b)
        UPPER_BOUNDS.add((size, upper_bound))

    for k in range(1, left_max):
        for i in range(1, right_max):
            if k == i == 1:
                continue
            atest = 'm' * k
            btest = 'm' * k + 'm' * (i - 1)
            addToBounds(atest, btest)

    # Remove duplicates
    UPPER_BOUNDS = list(UPPER_BOUNDS)

    # Sort
    UPPER_BOUNDS.sort(lambda x, y: cmp(x[0], y[0]))

    fp = file("upper_bounds.py", "w")
    fp.write("UPPER_BOUNDS = ")
    pprint.pprint(UPPER_BOUNDS, fp)
    fp.close()

