/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000-2003 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache", "Jakarta", "JAMES" and "Apache Software Foundation"
 *    must not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 * Portions of this software are based upon public domain software
 * originally written at the National Center for Supercomputing Applications,
 * University of Illinois, Urbana-Champaign.
 */

package org.apache.james.transport.matchers;

import net.ukrpost.mail.spam.BayesStats;
import net.ukrpost.mail.spam.BayesUtils;
import net.ukrpost.mail.spam.FilePersistantStats;
import net.ukrpost.mail.spam.Bayes;
import org.apache.commons.lang.time.StopWatch;
import org.apache.mailet.GenericMatcher;
import org.apache.mailet.Mail;

import javax.mail.MessagingException;
import javax.mail.internet.MimeMessage;
import java.io.File;
import java.util.Collection;

/** Bayesian spam matcher.
 *
 * @author Alexander Zhukov <zhukov@ukrpost.net>
 */
public class BayesianSpam extends GenericMatcher {

    int cutoff = 0;
    private BayesStats spamStats = null;

    public void init() {
        log("initializing bayesian spam matcher");
        //todo: currently only hashtable serialized to file is implemented
        String bayesianStatsStorageUrl = getCondition();
        final FilePersistantStats persistantStats = new FilePersistantStats();
        spamStats = new BayesStats();
        persistantStats.setPersistable(spamStats);
        try {
            persistantStats.setContext(new File(bayesianStatsStorageUrl));
            //todo: ok i know this is ugly but it doesnt matter for now i just want my stats be available for bayesian spam processor
            persistantStats.load();
            log("bayesian spam stats loaded");
        } catch (Exception ex) {
            log("ERORR: bayes stats not loaded", ex);
            spamStats = null;
        }
    }

    private final static float probabilityToTreatAsSpam = 0.75f;

    public Collection match(Mail mail) throws MessagingException {
        if (spamStats == null)
            return null;

        MimeMessage message = mail.getMessage();
        final String tokenizedMessage[] = tokenizeMessage(message);
        if (tokenizedMessage == null)
            return null;
        log("message token count: "+tokenizedMessage.length);
        //todo: maybe we dont have to calculate probability if not enough tokens?
        float spamProbability = Bayes.calculateProbability(tokenizedMessage, spamStats, 0.5f);
        log("spam probability: "+spamProbability);
        if ( spamProbability > probabilityToTreatAsSpam )
            return mail.getRecipients();

        return null;
    }

    private final String[] tokenizeMessage(MimeMessage mm) {
        final StopWatch tokenizerWatch = new StopWatch();
        tokenizerWatch.start();
        final String tokenizedMessage[] = BayesUtils.messageToStringArray(mm);
        tokenizerWatch.stop();
        if (tokenizedMessage == null) {
            log("message not tokenized. tokenizedMessage == null.");
            return null;
        } else {
            log("message tokenized in " + tokenizerWatch.getTime() + "msec");
        }
        return tokenizedMessage;
    }

}
