Hello George,

Here is a quick hack (with a few TODOs).  I only tested it a bit, so
the actual delete calls are still commented out.  If this works for
you, and especially if you take care of TODOs, I may put this in the
Lucene Sandbox.

Otis
P.S.
Usage example showing how the tool found some unused segments (this was
caused by a bug in one of the earlier 1.4 versions of Lucene).

[EMAIL PROTECTED] java]$ java org.apache.lucene.index.SegmentPurger
/simpy/users/1/index
Candidate non-Lucene file found: _1b2.del
Candidate unused Lucene file found: _1b2.cfs
Candidate unused Lucene file found: _1bm.cfs
Candidate unused Lucene file found: _1c6.cfs
Candidate unused Lucene file found: _1cq.cfs
Candidate unused Lucene file found: _1da.cfs
Candidate unused Lucene file found: _1du.cfs
Candidate unused Lucene file found: _1ee.cfs
Candidate unused Lucene file found: _1ey.cfs
[EMAIL PROTECTED] java]$
[EMAIL PROTECTED] java]$ strings /simpy/users/1/index/segments
_3o0
[EMAIL PROTECTED] java]$ ls -al /simpy/users/1/index/
total 647
drwxrwsr-x    2 otis     simpy        1024 Dec  7 14:39 .
drwxrwsr-x    3 otis     simpy        1024 Sep 16 20:39 ..
-rw-rw-r--    1 otis     simpy      212815 Nov 17 18:36 _1b2.cfs
-rw-rw-r--    1 otis     simpy         104 Nov 17 18:40 _1b2.del
-rw-rw-r--    1 otis     simpy        3380 Nov 17 18:40 _1bm.cfs
-rw-rw-r--    1 otis     simpy        3533 Nov 17 18:40 _1c6.cfs
-rw-rw-r--    1 otis     simpy        4774 Nov 17 18:40 _1cq.cfs
-rw-rw-r--    1 otis     simpy        3389 Nov 17 18:40 _1da.cfs
-rw-rw-r--    1 otis     simpy        3809 Nov 17 18:40 _1du.cfs
-rw-rw-r--    1 otis     simpy        3423 Nov 17 18:40 _1ee.cfs
-rw-rw-r--    1 otis     simpy        4016 Nov 17 18:40 _1ey.cfs
-rw-rw-r--    1 otis     simpy      410299 Dec  7 14:39 _3o0.cfs
-rw-rw-r--    1 otis     simpy           4 Dec  7 14:39 deletable
-rw-rw-r--    1 otis     simpy          29 Dec  7 14:39 segments


--- [EMAIL PROTECTED] wrote:

> Hello all.
> 
>  
> 
> I recently ran into a problem where errors during indexing or
> optimization
> (perhaps related to running out of disk space) left me with a working
> index
> in a directory but with additional segment files (partial) that were
> unneeded.  The solution for finding the ~40 files to keep out of the
> ~900
> files in the directory amounted to dumping the segments file and
> noting that
> only 5 segments were in fact "live".  The index is a non-compound
> index
> using FSDirectory.
> 
>  
> 
> Is there (or would it be possible to add (and I'd be willing to
> submit code
> if it made sense to people)) some sort of interrogation on the index
> of what
> files belonged to it?  I looked first at FSDirectory itself thinking
> that
> its "list()" method should return the subset of index-related files
> but
> looking deeper it looks like Directory is at a lower level
> abstracting
> simple I/O and thus wouldn't "know".
> 
>  
> 
> So any thoughts?  Would it make sense to have a form of clean on
> IndexWriter()?  I hesitate since it seems there isn't a charter that
> only
> Lucene files could exist in the directory thus what is ideal for my
> application (since I know I won't mingle other files) might not be
> ideal for
> all.  Would it be fair to look for Lucene known extensions and file
> naming
> signatures to identify unused files that might be failed or dead
> segments?
> 
>  
> 
> Thanks,
> 
> -George
> 
> 
package org.apache.lucene.index;

import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import java.io.File;


/**
 * A tool that peeks into Lucene index directories and reports
 * unwanted files.  In its more radical mode, this tool looks for all
 * non-Lucene files in a directory.  The other option is to look for
 * unused Lucene segment files, should the index directory get
 * polluted.
 *
 * This is currently a dry run: candidates are only printed to
 * standard output, and the actual delete calls are still commented
 * out.
 *
 * TODO: this tool should really lock the directory for writing before
 * removing any Lucene segment files, otherwise this tool itself may
 * corrupt the index.
 *
 * @author Otis Gospodnetic
 * @version $Id$
 */
public class SegmentPurger
{
    // TODO: copied from SegmentMerger - should probably be made public
    // static final there, to make it reusable

    /** File extensions of old-style (multi-file) index files. */
    public static final String MULTIFILE_EXTENSIONS[] = new String[] {
        "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
    };
    /** File extensions of term vector files. */
    public static final String VECTOR_EXTENSIONS[] = new String[] {
        "tvx", "tvd", "tvf"
    };
    /** File extension of compound index files. */
    public static final String COMPOUNDFILE_EXTENSIONS[] = new String[] {
        "cfs"
    };
    /**
     * File extension of per-segment deleted-documents files.  Without
     * it a live segment's .del file is reported as a non-Lucene file.
     */
    public static final String DELETIONS_EXTENSIONS[] = new String[] {
        "del"
    };
    /** Names of the per-index files that do not belong to a single segment. */
    public static final String INDEX_FILES[] = new String[] {
        "segments", "deletable"
    };

    /** All fixed per-segment file extensions this tool recognizes. */
    public static final String[][] SEGMENT_EXTENSIONS = new String[][] {
        MULTIFILE_EXTENSIONS, COMPOUNDFILE_EXTENSIONS, VECTOR_EXTENSIONS,
        DELETIONS_EXTENSIONS
    };

    /** The file format version, a negative number. */
    /* Works since counter, the old 1st entry, is always >= 0 */
    public static final int FORMAT = -1;

    private final FSDirectory directory;


    /**
     * Creates a purger for the given index directory.
     *
     * @param directory the index directory to inspect
     */
    public SegmentPurger(FSDirectory directory)
    {
        this.directory = directory;
    }

    /**
     * Reports every file in the directory that is not recognized as a
     * Lucene file.  Nothing is deleted yet; see the commented-out
     * delete call.
     *
     * @throws IOException if the directory cannot be listed
     */
    public void purgeNonLuceneFiles() throws IOException
    {
        String indexDirFiles[] = listDirectory();

        for (int i = 0; i < indexDirFiles.length; i++)
        {
            if (!isLuceneDirectoryFile(indexDirFiles[i]))
            {
                System.out.println("Candidate non-Lucene file found: " + indexDirFiles[i]);
                // directory.deleteFile(indexDirFiles[i]);
            }
        }
    }

    /**
     * Reports every Lucene segment file whose base name is not one of
     * the segments listed in the "segments" file.  Nothing is deleted
     * yet; see the commented-out delete call.
     *
     * @throws IOException if the directory cannot be listed or the
     *         "segments" file cannot be read
     */
    public void purgeUnusedLuceneFiles() throws IOException
    {
        String indexDirFiles[] = listDirectory();

        // The "segments" file is read at most once per call, and only
        // when the first per-segment file is seen, instead of being
        // re-opened and re-parsed for every file in the directory.
        List liveSegments = null;

        for (int i = 0; i < indexDirFiles.length; i++)
        {
            String fileName = indexDirFiles[i];

            // non-Lucene files are not this method's business, and the
            // per-index files (segments, deletable) are always used
            if (!isLuceneDirectoryFile(fileName) || isIndexFile(fileName))
                continue;

            if (liveSegments == null)
                liveSegments = readLiveSegmentNames();

            if (!isUsed(fileName, liveSegments))
            {
                System.out.println("Candidate unused Lucene file found: " + fileName);
                // directory.deleteFile(fileName);
            }
        }
    }


    /**
     * Lists the directory, turning a null listing into an IOException
     * rather than a NullPointerException further down.
     */
    private String[] listDirectory() throws IOException
    {
        String[] files = directory.list();
        if (files == null)
            throw new IOException("Cannot list index directory: " + directory);
        return files;
    }

    /**
     * Reads the names (as Strings) of the segments listed in the
     * "segments" file.
     */
    private List readLiveSegmentNames() throws IOException
    {
        List names = new ArrayList();
        IndexInput input = directory.openInput("segments");
        try
        {
            int format = input.readInt();
            if (format < 0)     // file contains explicit format info
            {
                // check that it is a format we can understand
                if (format < FORMAT)
                    throw new IOException("Unknown format version: " + format);
                input.readLong();   // skip version
                input.readInt();    // skip counter
            }
            // otherwise the file is in the old format without explicit
            // format info, and the int just read was the counter

            for (int i = input.readInt(); i > 0; i--)
            {
                names.add(input.readString());  // segment name
                input.readInt();                // skip document count
            }

            // The old format may carry a version number after the
            // segment list; nothing here needs it, so it is not read.
        }
        finally
        {
            input.close();
        }

        return names;
    }

    /** Tells whether the name is one of the per-index files in INDEX_FILES. */
    private static boolean isIndexFile(String fileName)
    {
        for (int i = 0; i < INDEX_FILES.length; i++)
        {
            if (fileName.equals(INDEX_FILES[i]))
                return true;
        }
        return false;
    }

    /**
     * Tells whether the name has a norms-style extension: ".f" followed
     * by one or more ASCII digits (e.g. "_1b2.f0").
     */
    private static boolean isNormsFile(String fileName)
    {
        int dot = fileName.lastIndexOf('.');
        // need at least ".f" plus one digit after the dot
        if (dot < 0 || fileName.length() < dot + 3 || fileName.charAt(dot + 1) != 'f')
            return false;

        for (int i = dot + 2; i < fileName.length(); i++)
        {
            char c = fileName.charAt(i);
            if (c < '0' || c > '9')
                return false;
        }
        return true;
    }

    /**
     * Tells whether the name looks like a Lucene file: one of the known
     * per-segment extensions, a norms (.fN) file, or a per-index file.
     */
    private boolean isLuceneDirectoryFile(String fileName)
    {
        for (int i = 0; i < SEGMENT_EXTENSIONS.length; i++)
        {
            String[] extensions = SEGMENT_EXTENSIONS[i];
            for (int j = 0; j < extensions.length; j++)
            {
                if (fileName.endsWith("." + extensions[j]))
                    return true;
            }
        }

        return isNormsFile(fileName) || isIndexFile(fileName);
    }

    /**
     * Tells whether a Lucene file is still in use: either it is a
     * per-index file, or its base name (the part before the first dot)
     * is the name of a live segment.
     *
     * @param fileName     name of a file in the index directory
     * @param liveSegments segment names read from the "segments" file
     */
    private boolean isUsed(String fileName, List liveSegments)
    {
        if (isIndexFile(fileName))
            return true;

        int dot = fileName.indexOf('.');
        if (dot < 0)
            return false;   // no extension, so it cannot be a segment file

        return liveSegments.contains(fileName.substring(0, dot));
    }


    /**
     * Command-line entry point: runs both checks against the index
     * directory given as the first argument.
     */
    public static void main(String[] args) throws IOException
    {
        if (args.length < 1)
        {
            System.err.println("Usage: java " + SegmentPurger.class.getName() + " <index directory>");
            System.exit(1);
            return;
        }

        FSDirectory directory = FSDirectory.getDirectory(args[0], false);
        try
        {
            SegmentPurger sp = new SegmentPurger(directory);
            sp.purgeNonLuceneFiles();
            sp.purgeUnusedLuceneFiles();
        }
        finally
        {
            directory.close();
        }
    }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to