'''
clusterResponseBodies.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

'''

import core.controllers.outputManager as om
# options
from core.data.options.option import option
from core.data.options.optionList import optionList

from core.controllers.basePlugin.baseGrepPlugin import baseGrepPlugin
from core.controllers.w3afException import w3afException

import core.data.kb.knowledgeBase as kb
import core.data.kb.info as info

from core.controllers.misc.levenshtein import relative_distance

# Length threshold used by clusterResponseBodies._removeParameterFromString:
# only parameter names / values that are strictly longer than this many
# characters are considered for removal from a response body.
REMOVE_PARAMETER_FROM_BODY_MIN_STRING_LENGTH = 6


class clusterResponseBodies(baseGrepPlugin):
    '''
    Cluster all response bodies and show one of each cluster to the user.

    Every response handed to grep() is compared with the representative
    response of each existing group; it joins the first group it is similar
    to, or becomes the representative of a new group. end() reports the
    resulting groups.
      
    @author: floyd fuh ( floyd_fuh@yahoo.de )
    '''

    def __init__(self):
        baseGrepPlugin.__init__(self)

        # One entry per group: [request, response, [response ids]].
        # The request/response pair is the representative of the group (the
        # first response that was put into it).
        self._req_resp_ids = []
        # Notices that are printed once during the scan and repeated in the
        # final summary; empty while the corresponding limit was never hit.
        self._comment_no_group_members = ''
        self._comment_no_groups = ''

        #Options
        self._removeParameterFromBodies = True
        self._similarity = 0.85
        self._maxOutputedGroups = 50
        self._maxOutputedMembersPerGroup = 30

    def _checkSimilarity(self, original_str, new_str, origParams=None, newParams=None):
        '''
        Decide whether two response bodies belong to the same group.

        @parameter original_str: Body of the representative response of a group.
        @parameter new_str: Body of the response that is being classified.
        @parameter origParams: Parameters of the request that produced
        original_str; only used when removeParameterFromBodies is enabled.
        @parameter newParams: Parameters of the request that produced new_str;
        only used when removeParameterFromBodies is enabled.
        @return: True when relative_distance() of both bodies is at least the
        configured similarity, False otherwise.
        '''
        if self._removeParameterFromBodies:
            original_str = self._removeParameterFromString(original_str, origParams)
            new_str = self._removeParameterFromString(new_str, newParams)

        return relative_distance(original_str, new_str) >= self._similarity

    def _removeParameterFromString(self, string, params):
        '''
        Remove the parameter names and values of a request from a string.

        Only names / values that are longer than
        REMOVE_PARAMETER_FROM_BODY_MIN_STRING_LENGTH characters are removed.

        @parameter string: The string (response body) to clean.
        @parameter params: Mapping of parameter name to an iterable of values,
        or None / empty when there is nothing to remove.
        @return: The cleaned string; the input itself when params is empty.
        '''
        if not params:
            return string

        minLength = REMOVE_PARAMETER_FROM_BODY_MIN_STRING_LENGTH
        for paramName in params.keys():
            name = str(paramName)
            if len(name) > minLength:
                # str.replace() returns a new string, the result has to be kept
                string = string.replace(name, '')
            for value in params[paramName]:
                if len(value) > minLength:
                    string = string.replace(value, '')

        return string

    def grep(self, request, response):
        '''
        Plugin entry point, cluster the response.
        
        @parameter request: The HTTP request object.
        @parameter response: The HTTP response object
        @return: None
        '''
        #TODO: This method can be improved
        #It would be great to implement some kind of clustering algorithm
        #like k-means to find groups and not just this simple compare

        # The most recently created groups are checked first
        for groupRequest, groupResponse, groupIds in reversed(self._req_resp_ids):

            if not self._checkSimilarity(groupResponse.getBody(), response.getBody(),
                                         origParams=groupRequest.getDc(),
                                         newParams=request.getDc()):
                continue

            if len(groupIds) >= self._maxOutputedMembersPerGroup:
                #Don't save it, because we wont output it anyway
                if self._comment_no_group_members == '':
                    self._comment_no_group_members = 'Some groups were truncated. Increase maxOutputedMembersPerGroup if you want to avoid that.'
                    om.out.information('clusterResponseBodies: '+self._comment_no_group_members)
                    self._comment_no_group_members += '\n'
                return

            #Append to group
            groupIds.append(response.id)
            return

        # The response is not similar to any known group
        if len(self._req_resp_ids) >= self._maxOutputedGroups:
            #Don't save a new group, because we wont output it anyway
            if self._comment_no_groups == '':
                self._comment_no_groups = 'Warning: There were more groups than outputed. Decrease similarity measure (recommended) or increase maxOutputedGroups to avoid.'
                om.out.information('clusterResponseBodies: '+self._comment_no_groups)
                self._comment_no_groups += '\n'
            return

        #Add a new group
        self._req_resp_ids.append([request, response, [response.id, ]])

    def setOptions( self, optionsMap ):
        '''
        Set the plugin options.

        All values are validated before any of them is stored, so a rejected
        call leaves the current configuration untouched.

        @parameter optionsMap: Mapping of option name to option object; the
        value of each option is read with getValue().
        @return: None
        @raise w3afException: When similarity is not between 0 and 1, or when
        maxOutputedGroups / maxOutputedMembersPerGroup is not higher than 0.
        '''
        similarity = optionsMap['similarity'].getValue()
        removeParameters = optionsMap['removeParameterFromBodies'].getValue()
        maxGroups = optionsMap['maxOutputedGroups'].getValue()
        maxMembers = optionsMap['maxOutputedMembersPerGroup'].getValue()

        if not (similarity >= 0 and similarity <= 1):
            msg = 'Please choose similarity between 0 and 1'
            raise w3afException(msg)

        if not maxGroups > 0:
            msg = 'Please choose maxOutputedGroups higher than 0'
            raise w3afException(msg)

        if not maxMembers > 0:
            msg = 'Please choose maxOutputedMembersPerGroup higher than 0'
            raise w3afException(msg)

        self._similarity = similarity
        self._removeParameterFromBodies = removeParameters
        self._maxOutputedGroups = maxGroups
        self._maxOutputedMembersPerGroup = maxMembers

    def getOptions( self ):
        '''
        @return: A list of option objects for this plugin.
        '''
        ol = optionList()

        d1 = 'Two strings are similar when they match with this rate'
        h1 = 'Two strings are similar when they match with this rate'
        o1 = option('similarity', self._similarity, d1, 'float', help=h1)

        d2 = 'Remove all occurences of the parameter'
        h2 = 'Remove all occurences of the parameter name or the parameter value inside the response body before comparing'
        h2 += '. This will only be done for parameter names or parameter values which are longer than '\
            +str(REMOVE_PARAMETER_FROM_BODY_MIN_STRING_LENGTH)+' characters. '
        h2 += 'If the application often echos back parameters, use this option.'
        o2 = option('removeParameterFromBodies', self._removeParameterFromBodies, d2, 'boolean', help=h2 )

        d3 = 'The maximum of clusters that are shown'
        h3 = 'The maximum of clusters that are shown'
        o3 = option('maxOutputedGroups', self._maxOutputedGroups, d3, 'integer', help=h3)

        d4 = 'The maximum numer of members of each group that are shown'
        h4 = 'The maximum numer of members of each group that are shown'
        o4 = option('maxOutputedMembersPerGroup', self._maxOutputedMembersPerGroup, d4, 'integer', help=h4)

        for opt in (o1, o2, o3, o4):
            ol.add(opt)

        return ol

    def _formatIdRun(self, firstId, lastId):
        '''
        Format a run of consecutive response ids.

        @return: "N" for a single id, "N-M" for a sequence of ids.
        @raise w3afException: When lastId is smaller than firstId.
        '''
        if lastId == firstId:
            return str(firstId)
        if lastId > firstId:
            return str(firstId)+'-'+str(lastId)
        #Debug
        raise w3afException('clusteredResponseBodies plugin corrupted output, this should never happen')

    def _formatGroupMembers(self, memberIds):
        '''
        Format the response ids of one group, collapsing consecutive ids.

        For example the ids 1, 2, 3, 7 are formatted as "1-3 7". A run that
        was written out counts as one (single id) or two (sequence) outputed
        members; when that count reaches maxOutputedMembersPerGroup, '...' is
        appended to the result.

        @parameter memberIds: The response ids of the group, in the order in
        which they were added.
        @return: The formatted string, without a trailing newline.
        '''
        limit = self._maxOutputedMembersPerGroup
        runs = []
        outputedMembers = 0
        # -1 means that no id was seen yet
        firstId = lastId = -1

        for member in memberIds:
            if outputedMembers > limit:
                break
            if lastId == -1:
                #The first ever
                firstId = lastId = member
            elif member == lastId + 1:
                #A member of a sequence
                lastId = member
            else:
                #The previous run is finished, write it out...
                runs.append(self._formatIdRun(firstId, lastId))
                if lastId == firstId:
                    outputedMembers += 1
                else:
                    outputedMembers += 2
                #...the current one is the beginning of a sequence or a single member
                firstId = lastId = member

        #End it now, the last run is still pending
        runs.append(self._formatIdRun(firstId, lastId))

        formatted = ' '.join(runs)
        if outputedMembers >= limit:
            #There were more, but we didn't want to output them
            formatted += '...'
        return formatted

    def end(self):
        '''
        This method is called when the plugin wont be used anymore.

        When more than one group was found, the groups are sorted by their
        number of members (smallest first, in place), a summary info object is
        stored in the knowledge base and printed. Otherwise only a short
        message is printed.

        @return: None
        '''
        #Analyze outliers in all responses together
        groups = self._req_resp_ids
        if len(groups) <= 1:
            om.out.information('There was only one group of responses')
            return

        #Sort the groups by length of each group; a key function gives the
        #same (stable) order as a cmp function and also works where
        #list.sort() does not accept a cmp function anymore
        groups.sort(key=lambda group: len(group[2]))

        # Create the info
        i = info.info()
        i.setName('Summary')

        ids = []
        groupString = ''
        moreGroupsNoted = False
        for index, group in enumerate(groups):
            ids.append(group[1].id)
            if index < self._maxOutputedGroups:
                groupString += 'Response group '+str(index+1)+': '
                groupString += self._formatGroupMembers(group[2])
                groupString += '\n'
            elif not moreGroupsNoted:
                # Say it once, not once per group that is left out
                groupString += '\nThere are more bigger groups'
                moreGroupsNoted = True

        smallest = groups[0]
        biggest = groups[-1]
        desc = self._comment_no_group_members+self._comment_no_groups
        desc += 'After all, there were '+str(len(groups))+' group(s) of responses. '
        desc += 'The smallest group has '+str(len(smallest[2])) +' response(s) (the representative of this group '
        desc += 'is response no. '+str(smallest[1].id)+'), the biggest group '
        desc += 'has '+str(len(biggest[2]))+' members (representative is response no '+str(biggest[1].id)+').\n'
        desc += groupString

        i.setId(ids) #can be set to a list!
        i.setDesc(desc)
        if not self._comment_no_groups == '':
            i.addToHighlight(self._comment_no_groups)
        kb.kb.append( self, 'Clustering Test', i )
        om.out.information(str(i))

    def getPluginDeps( self ):
        '''
        @return: A list with the names of the plugins that should be runned before the
        current one.
        '''
        return []
    
    def getLongDesc( self ):
        '''
        @return: A DETAILED description of the plugin functions and features.
        '''
        return '''
        
        Clusters all response bodies and shows a summary at the end of the scan, which
        shows all different kind of response bodies there were.
        
        Adjust similarity if you get too few (eg. one group with all responses) 
        or too many groups (eg. one member in each group).
        
        This plugin takes the first response body and adds it to a list. 
        The second response body is compared to the first. If they are similar, 
        the second response is appended to the same list, otherwise a new list will be created.
        The third is first compared to the first response and appended to the same list if the response
        bodies are similar, otherwise compared to the second response and if both don't match, 
        a new list is created and so on...
        '''
