I am working from the crawler example code found here:
http://www.google.com/url?sa=t&ct=res&cd=1&url=http%3A%2F%2Fwww.zend.com%2Ftopics%2FImprove-your-PHP-Applications-Search-Capabilities-with-Lucene.pdf&ei=y2qgSIiJHpHovAWBn9z-BQ&usg=AFQjCNHBU8vre59KVlgn0fV2O3h6B6bKMw&sig2=QTi5S9NULQDB4t30admvFQ

For some reason the contents from the page aren't being indexed.  In my log
files everything stops at: "Before add document"

None of the log messages after are showing up.

Is there something that I am missing:

<code>

        public function crawlerAction() {
                
                $this->_helper->viewRenderer->setNoRender();
                
                $this->_logger->info("Crawler initialized");
                
    /**
     * Setup Zend_Http_Client
     */
    $client = new Zend_Http_Client();
    $client->setConfig(array('timeout' => 30));
    
    $indexpath = $this->_globalConfig->lucene->index;

                try {
                        
                        $this->_index = Zend_Search_Lucene::open($indexpath);
                        
                        $this->_logger->info("Opened existing index in 
{$indexpath}");

                } catch (Zend_Search_Lucene_Exception $e) {
                  try {
                        
                        $index = Zend_Search_Lucene::create($indexpath);
                        $this->_logger->info("Created new index in {$indexpath} 
");
                        
                        /**
                         * If both fail,give up and show errormessage
                         */
                  } catch (Zend_Search_Lucene_Exception $e) {
                        
                        $this->_logger->err("Failed opening or creating index in
{$indexpath}");
                        $this->_logger->err($e->getMessage());
                        
                        print "Unable to open or create 
index:{$e->getMessage()}";
                        exit(1);
                  }
                }
                
    /**
     * Setup the targets array
     */
    $targets = array($this->_globalConfig->lucene->url);
    
    $this->_logger->info("Target count: ". count($targets));
                
    /**
     * Start iterating
     */
    for ($i = 0; $i < count($targets); $i++) {
        
      /**
         * Fetch content with HTTPClient
         */
        $client->setUri($targets[$i]);
        
        $response = $client->request();
        
      if ($response->isSuccessful()) {
        
        $body = $response->getBody();
        
        $this->_logger->info("Fetched ".strlen($body)." bytes from
{$targets[$i]}");
        
        $body_checksum = md5($body);
        
        $this->_logger->info("Body checksum {$body_checksum}");
        
        $this->_logger->info("Index: ".Zend_Debug::dump($this->_index, "",
false));
        
        $hits = $this->_index->find('url:'.$targets[$i]);
        
        $this->_logger->info("Hits: ".count($hits));
        
        $matched = false;
        
        foreach ($hits as $hit){

                $this->_logger->info("Hit md5 {$hit->md5} : checksum
{$body_checksum}");

                if ($hit->md5 == $body_checksum) {
                        
                        if ($matched == true) {
                          
                                $index->delete($hit->id);
                          $matched = true;
                        }

                } else {

                        $this->_logger->info("{$targets[$i]} is out of date and 
needs
reindexing");
                        $index->delete($hit);
                }
                
                if ($matched){

                        $this->_logger->info($targets[$i]." is uptodate, 
skipping");
                  continue;
                }
        }
        
        /**
         * Create document
         */
        $doc = Zend_Search_Lucene_Document_Html::loadHTML($body);
        
        $this->_logger->info("Url {$targets[$i]}");
        
        $doc->addField(Zend_Search_Lucene_Field::UnIndexed('url',
$targets[$i]));
        $doc->addField(Zend_Search_Lucene_Field::UnIndexed('md5',
$body_checksum));
        
        $this->_logger->info("Before add document");
        
        /**
         * Index
         */
        $this->_index->addDocument($doc);
        
        $this->_logger->info("After added the doc");
        
        $this->_logger->info("Indexed {$targets[$i]}");
        
        /**
         * Fetch new links
         */
        $links = $doc->getLinks();
        
        $this->_logger->info("Get links".Zend_Debug::dump($links));
        
        foreach($links as $link){
                
                if (strpos($link,
$this->_globalConfig->lucene->index)&&(!in_array($link, $targets))) {
                 $targets[] = $link;
                }
        }

      } else {
        $this->_logger->warn("Requesting {$url} returned HTTP
{$response->getStatus()}");
      }
    }
    
    $this->_logger->info("Iterated over ".count($targets)."documents");
    $this->_logger->info("Optimizing index...");
    
    $index->optimize();
    
    $this->_logger->info("Done. Index now
contains".$index->numDocs()."documents");
    $this->_logger->info("Crawling completed");
    
    
        }
</code>
-- 
View this message in context: 
http://www.nabble.com/Lucene-Crawler-Not-Adding-to-Index-tp18929433p18929433.html
Sent from the Zend Framework mailing list archive at Nabble.com.

Reply via email to