"""
/* SearchFiles.py
 *
 * Copyright (c) 2005 Mohanaraj Gopala 
 * Based on work by Douglass R. Cutting from the Lucene project.
 * Also incorporates work from other PyLucene examples by OSAF
 * HTMLStrip.py is a slightly modified version of 
 * http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52281
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
"""
import sys,getopt,os,PyLucene,threading, time
from datetime import datetime
import Document
class Ticker(object):
    """Write a '.' to standard output at a fixed interval until stopped.

    Meant to be used as a thread target (``Thread(target=ticker.run)``) to
    show progress while a long operation runs in the calling thread.  The
    owner stops the loop by setting ``tick`` to False; ``run`` notices the
    change the next time it wakes up, i.e. after at most ``interval``
    seconds.
    """

    def __init__(self, interval=1.0):
        """Create a ticker that is ready to run.

        interval -- seconds to sleep between dots; defaults to 1.0.
        """
        # Loop flag polled by run(); set to False from outside to stop it.
        self.tick = True
        self.interval = interval

    def run(self):
        """Print dots until ``tick`` becomes False.  Returns None."""
        while self.tick:
            sys.stdout.write('.')
            # Flush so each dot shows up immediately even when stdout is
            # buffered.
            sys.stdout.flush()
            time.sleep(self.interval)


class IndexHTML :

    def __init__(self,root,index,create):
        if create==False:
            #delete stale docs
            self.deleting=True
            self.indexDocs(root, index, create)
        
        #print 'index:'+index
        #print 'create:'+create
        
        self.writer = PyLucene.IndexWriter(index, PyLucene.StandardAnalyzer(), create)
        self.writer.mergeFactor = 20
        self.writer.maxFieldLength = 1000000
        self.indexDocs(root, index, create)		  # add new docs
        
        ticker = Ticker()
        print 'optimizing index',
        threading.Thread(target=ticker.run).start()
        self.writer.optimize()
        self.writer.close()
        ticker.tick = False
        print 'done'


    def indexDocs(self,root,index,create):
        self.uidIter=None
        
        if create==False:
            # incrementally update
            self.reader = PyLucene.IndexReader.open(index)  # open existing index
            self.uidIter = self.reader.terms(PyLucene.Term("uid", "")) #  init uid iterator
            self.recursiveIndexDocs(root);

            if self.deleting :				  # delete rest of stale docs
                while self.uidIter.term() != None and  self.uidIter.term().field() == "uid" :
                    print "deleting " + Document.uid2url(self.uidIter.term().text())
                    self.reader.deleteDocument(self.uidIter.term())
                    self.uidIter.next();
                self.deleting = False;

            self.uidIter.close()				  # close uid iterator
            self.reader.close()				  # close existing index

        else: 					 # don't have exisiting index
            self.recursiveIndexDocs(root);

    def recursiveIndexDocs(self,root):
        for root, dirnames, filenames in os.walk(root):
            
            for filename in filenames:
                if filename[-3:] not in ('txt','htm','tml') :
                    continue
                try:
                    if self.uidIter!=None :
                        
                        uid = Document.uid(os.path.join(root,filename)) 	 #construct uid for doc
                        
                        while self.uidIter.term() != None and self.uidIter.term().field() == "uid" and self.uidIter.term().text() < uid :
                            if self.deleting:			  
                                #delete stale docs from index
                                print "deleting " + Document.uid2url(self.uidIter.term().text())
                                self.reader.deleteDocuments(self.uidIter.term())
                            self.uidIter.next()

                        if self.uidIter.term() != None  and self.uidIter.term().field() == "uid" and self.uidIter.term().text() == uid:
                            self.uidIter.next()			  #// keep matching docs

                        elif self.deleting==False:			  #// add new docs
                            doc = Document.Document(os.path.join(root,filename))
                            print "adding " + doc.get("url")
                            self.writer.addDocument(doc);
                      
                    else:
                        print os.path.join(root,filename)				  #// creating a new index
                        doc = Document.Document(os.path.join(root,filename));
                        print "adding " + doc.get("url");
                        self.writer.addDocument(doc);		  #// add docs unconditionally

                except Exception, e:
                    print "Failed in indexDocs:", e
                    sys.exit()

def usage():
    print "IndexHTML [--create] [--index <index>] <root_directory>"

def run(root,index,create):
    start = datetime.now()
    try:
        IndexHTML(root,index,create)
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e

if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()
    else:
        try:
            opts, args = getopt.getopt(sys.argv[1:], "ci:", ["create", "index="])
            create,root,index=False,False,False
            
            if len(args) <> 1:
                print "Wrong amount of arguments passed as <root_directory>"
                usage()
                sys.exit(2)
            elif not os.path.exists(args[0]):
                print "Path passed as <root_directory> is not valid"
                usage()
                sys.exit(2)
            else :
                root=args[0]

            for options, argument in opts:
                if options in ('-c','--create'):
                    create=True
                elif options in ('-i','--index'):
                    if not os.path.exists(argument):
                        os.mkdir(argument)
                    if not os.path.exists(argument):
                        print 'The path to the index file "'+argument+'" is not valid'
                        usage()
                        sys.exit(2)
                    else:
                        index=argument

            run(root,index,create)

        except getopt.GetoptError:
            # print help information and exit:
            usage()
            sys.exit(2)