I am working from the crawler example code found here: http://www.google.com/url?sa=t&ct=res&cd=1&url=http%3A%2F%2Fwww.zend.com%2Ftopics%2FImprove-your-PHP-Applications-Search-Capabilities-with-Lucene.pdf&ei=y2qgSIiJHpHovAWBn9z-BQ&usg=AFQjCNHBU8vre59KVlgn0fV2O3h6B6bKMw&sig2=QTi5S9NULQDB4t30admvFQ
For some reason the contents from the page aren't being indexed. In my log files everything stops at: "Before add document" None of the log messages after are showing up. Is there something that I am missing: <code> public function crawlerAction() { $this->_helper->viewRenderer->setNoRender(); $this->_logger->info("Crawler initialized"); /** * Setup Zend_Http_Client */ $client = new Zend_Http_Client(); $client->setConfig(array('timeout' => 30)); $indexpath = $this->_globalConfig->lucene->index; try { $this->_index = Zend_Search_Lucene::open($indexpath); $this->_logger->info("Opened existing index in {$indexpath}"); } catch (Zend_Search_Lucene_Exception $e) { try { $index = Zend_Search_Lucene::create($indexpath); $this->_logger->info("Created new index in {$indexpath} "); /** * If both fail,give up and show errormessage */ } catch (Zend_Search_Lucene_Exception $e) { $this->_logger->err("Failed opening or creating index in {$indexpath}"); $this->_logger->err($e->getMessage()); print "Unable to open or create index:{$e->getMessage()}"; exit(1); } } /** * Setup the targets array */ $targets = array($this->_globalConfig->lucene->url); $this->_logger->info("Target count: ". count($targets)); /** * Start iterating */ for ($i = 0; $i < count($targets); $i++) { /** * Fetch content with HTTPClient */ $client->setUri($targets[$i]); $response = $client->request(); if ($response->isSuccessful()) { $body = $response->getBody(); $this->_logger->info("Fetched ".strlen($body)." bytes from {$targets[$i]}"); $body_checksum = md5($body); $this->_logger->info("Body checksum {$body_checksum}"); $this->_logger->info("Index: ".Zend_Debug::dump($this->_index, "", false)); $hits = $this->_index->find('url:'.$targets[$i]); $this->_logger->info("Hits: ".count($hits)); $matched = false; foreach ($hits as $hit){ $this->_logger->info("Hit md5 {$hit->md5} : checksum {$body_checksum}"); if ($hit->md5 == $body_checksum) { if ($matched == true) { $index->delete($hit->id); $matched = true; } } else { $this->_logger->info("{$targets[$i]} is out of date and needs reindexing"); $index->delete($hit); } if ($matched){ $this->_logger->info($targets[$i]." is uptodate, skipping"); continue; } } /** * Create document */ $doc = Zend_Search_Lucene_Document_Html::loadHTML($body); $this->_logger->info("Url {$targets[$i]}"); $doc->addField(Zend_Search_Lucene_Field::UnIndexed('url', $targets[$i])); $doc->addField(Zend_Search_Lucene_Field::UnIndexed('md5', $body_checksum)); $this->_logger->info("Before add document"); /** * Index */ $this->_index->addDocument($doc); $this->_logger->info("After added the doc"); $this->_logger->info("Indexed {$targets[$i]}"); /** * Fetch new links */ $links = $doc->getLinks(); $this->_logger->info("Get links".Zend_Debug::dump($links)); foreach($links as $link){ if (strpos($link, $this->_globalConfig->lucene->index)&&(!in_array($link, $targets))) { $targets[] = $link; } } } else { $this->_logger->warn("Requesting {$url} returned HTTP {$response->getStatus()}"); } } $this->_logger->info("Iterated over ".count($targets)."documents"); $this->_logger->info("Optimizing index..."); $index->optimize(); $this->_logger->info("Done. Index now contains".$index->numDocs()."documents"); $this->_logger->info("Crawling completed"); } </code> -- View this message in context: http://www.nabble.com/Lucene-Crawler-Not-Adding-to-Index-tp18929433p18929433.html Sent from the Zend Framework mailing list archive at Nabble.com.