"""
/* Document.py
 *
 * Copyright (c) 2005 Mohanaraj Gopala
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/** A utility for making PyLucene Documents for HTML documents. */

 """
import os,sys,stat,re,HTMLStripper
import PyLucene
from PyLucene import Field

    
def uid(path):
    """Return a unique, sortable id string for the file at *path*.

    The id is the path with every os.sep replaced by a null character,
    followed by another null and the zero-padded modification timestamp.
    Null (\\u0000) is used both between directory components and between
    the path and the date so that lexicographic sorting of ids gives the
    same order as a walk of the file hierarchy.
    """
    separator = u'\u0000'
    flattenedPath = path.replace(os.sep, separator)
    return flattenedPath + separator + modifiedTimestampPadded(path)

def modifiedTimestampPadded(path):
    """Return the modification time of *path* as a fixed-width string.

    The value is the integer ST_MTIME entry of os.stat(path), rendered in
    decimal and left-padded with "0" so that all timestamps have the same
    length and therefore sort lexicographically in numeric order.

    Raises whatever os.stat raises (e.g. OSError) if *path* cannot be
    stat'ed.
    """
    # Target width: the number of decimal digits in 1000 years expressed
    # in milliseconds (31536000000000 -> 14 digits).  Plain ints promote
    # automatically, so no long-literal suffix is needed.
    width = len(str(1000 * 365 * 24 * 60 * 60 * 1000))

    # rjust pads on the left in one step and leaves strings that are
    # already wide enough untouched, exactly like the former manual loop.
    return str(os.stat(path)[stat.ST_MTIME]).rjust(width, "0")
    
def uid2url(uid):
    """Recover the path portion of an id string built from a path and date.

    Null characters are turned back into os.sep, then everything from the
    last separator onwards (the trailing date component) is dropped.
    """
    # Restore the platform separator in place of the null markers.
    restored = uid.replace(u'\u0000', os.sep)
    # The date is the final component; cut at the last separator.
    cutAt = restored.rfind(os.sep)
    return restored[:cutAt]

def Document(path):
    """Build the document for *path*, choosing the builder by extension.

    Paths ending in '.html' or '.htm' are handed to HTMLDocument; every
    other path is handed to TextDocument.
    """
    looksLikeHtml = path.endswith('.html') or path.endswith('.htm')
    if not looksLikeHtml:
        return TextDocument(path)
    return HTMLDocument(path)
        
def HTMLDocument(path):
    """Build a PyLucene Document for the HTML file at *path*.

    The document carries four fields: "url", "modified", "uid" and
    "contents".  The contents are the file's text decoded as iso-8859-1
    and passed through HTMLStripper.strip; if stripping raises, the
    undecorated decoded text is indexed instead.

    Returns the populated PyLucene.Document.  Errors from opening,
    reading or stat'ing the file propagate to the caller.
    """
    doc = PyLucene.Document()

    # Add the url as a field named "url".  Use an UnIndexed field, so
    # that the url is just stored with the document, but is not searchable.
    doc.add(Field.UnIndexed("url", path.replace(os.sep, '/')))

    # Add the last modified date of the file a field named "modified".  Use a
    # Keyword field, so that it's searchable, but so that no attempt is made
    # to tokenize the field into words.
    doc.add(Field.Keyword("modified", modifiedTimestampPadded(path)))

    # Add the uid as a field, so that index can be incrementally maintained.
    # This field is not stored with document, it is indexed, but it is not
    # tokenized prior to indexing.
    doc.add(Field("uid", uid(path), False, True, False))

    # Read and decode the file exactly once, and always release the file
    # handle, even if reading fails.
    htmlFile = open(path)
    try:
        rawText = unicode(htmlFile.read(), 'iso-8859-1')
    finally:
        htmlFile.close()

    # Add the tag-stripped contents as a Text field so it will get
    # tokenized and indexed.  Stripping is best-effort: if it fails for
    # any reason, deliberately fall back to indexing the raw text rather
    # than losing the document.
    try:
        contents = HTMLStripper.strip(rawText)
    except Exception:
        contents = rawText

    doc.add(Field.Text("contents", contents))

    # NOTE: "summary" (UnIndexed, stored for display with hits) and "title"
    # (Text, separately searchable) fields are not produced here; the
    # former commented-out code obtained them from an HTML parser object
    # (parser.getSummary() / parser.getTitle()) that this module does not
    # have.
    return doc

def TextDocument(path):
    """Build a PyLucene Document for the plain-text file at *path*.

    The document carries four fields: "url", "modified", "uid" and
    "contents", where the contents are the file's text decoded as
    iso-8859-1.

    Returns the populated PyLucene.Document.  Errors from opening,
    reading or stat'ing the file propagate to the caller.
    """
    doc = PyLucene.Document()

    # The path with os.sep normalised to '/', added as an UnIndexed field.
    doc.add(Field.UnIndexed("url", path.replace(os.sep, '/')))

    # Zero-padded modification time, added as a Keyword field.
    doc.add(Field.Keyword("modified", modifiedTimestampPadded(path)))

    # Path-plus-date id, constructed with flags (False, True, False).
    doc.add(Field("uid", uid(path), False, True, False))

    # Read the whole file and always release the file handle, even if
    # reading fails.
    textFile = open(path)
    try:
        contents = unicode(textFile.read(), 'iso-8859-1')
    finally:
        textFile.close()

    doc.add(Field.Text("contents", contents))
    return doc