Hi Jay,
I think with the current version you could only prune segments.
We once wrote a class to prune the db.
Maybe you could use this and add a function to delete pages according to the urlfilter. I have attached our class.
Matthias
--
http://www.eventax.com - eventax GmbH
http://www.umkreisfinder.de - Die Suchmaschine für Lokales und Events


Jay Pound schrieb:

How do I write my queries file to prune my database down to only .com, .edu,
.org, .us, etc. -- that is, only US sites?
Thanks,
Jay Pound
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.tools;

import java.io.*;
import java.util.*;
import java.util.logging.*;

import org.apache.nutch.db.*;
import org.apache.nutch.fs.*;
import org.apache.nutch.util.*;

/******************************************
 * The PruneDB is for Nutch administrators
 * who need special access to the webdb.  It allows
 * deleting pages from the db which have
 * getNextFetchTime() == Long.MAX_VALUE.
 *
 * @author Jakob Heidebrecht
 ******************************************/

public class PruneDB {

    /** Number of pages read between two progress log messages. */
    private static final int PROGRESS_INTERVAL = 100000;

    IWebDBReader reader;
    WebDBWriter writer;
    public static final Logger LOG = 
            LogFormatter.getLogger("org.apache.nutch.tools.PruneDB");

    /**
     * Creates a new instance of PruneDB.
     *
     * @param reader source of the pages to inspect
     * @param writer receives the delete instructions for pruned pages
     */
    public PruneDB(IWebDBReader reader, WebDBWriter writer) {
        this.reader = reader;
        this.writer = writer;
    }

    /**
     * Deletes pages which have getNextFetchTime() == Long.MAX_VALUE.
     * Iterates over every page returned by the reader's pagesByMD5()
     * enumeration, asks the writer to delete each matching page by URL,
     * and logs a progress line every {@link #PROGRESS_INTERVAL} pages read
     * plus a summary at the end.
     *
     * @throws IOException if reading a page or issuing a delete fails
     */
    public void pruneDeletedPages() 
    throws IOException {

        long currentpage  = 0;
        long pagesdeleted = 0;
        Enumeration e = this.reader.pagesByMD5();

        while (e.hasMoreElements()) {
            currentpage++;
            Page page = (Page) e.nextElement();

            // delete pages from the db, if page is gone
            if (page.getNextFetchTime() == Long.MAX_VALUE) {
                this.writer.deletePage(page.getURL().toString());
                pagesdeleted++;
            }

            // periodic progress report so long runs show signs of life
            if (currentpage % PROGRESS_INTERVAL == 0) {
                LOG.info("Deleted " + pagesdeleted + " from " + currentpage + 
                        " pages read");
            }
        }
        LOG.info(pagesdeleted + " from " + currentpage + 
                " pages deleted alltogether, now sorting & merging");
    }

    /**
     * Command line entry point. Opens the webdb named on the command line,
     * runs every recognized command that follows it, and then closes the
     * writer, the reader and the file system, in that order.
     *
     * @param argv the command line arguments:
     *             (-local | -ndfs &lt;namenode:port&gt;) db [-prunedeletedpages]
     * @throws FileNotFoundException declared for the db-opening calls
     * @throws IOException if opening, pruning or closing the db fails
     */
    public static void main(String[] argv) 
    throws FileNotFoundException, IOException {
        if (argv.length < 2) {
            System.out.println("Usage: java org.apache.nutch.tools.PruneDB (-local | -ndfs <namenode:port>) db [-prunedeletedpages]");
            return;
        }

        int i = 0;
        // NOTE(review): the code relies on parseArgs leaving the db path at
        // argv[i] after handling the file system options -- confirm against
        // NutchFileSystem.parseArgs.
        NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
        try {
            File root = new File(argv[i++]);
            IWebDBReader reader = new WebDBReader(nfs, root);
            try {
                WebDBWriter writer = new WebDBWriter(nfs, root);
                try {
                    for (; i < argv.length; i++) {
                        if ("-prunedeletedpages".equals(argv[i])) {
                            PruneDB prunedb = new PruneDB(reader, writer);
                            prunedb.pruneDeletedPages();
                        }
                    }
                } finally {
                    writer.close();
                }
            } finally {
                // nested finally: the reader is released even when the
                // writer fails to open or to close
                reader.close();
            }
        } finally {
            // always release the file system, whatever happened above
            nfs.close();
        }
    }

}

Reply via email to