Hi all,
   I am new to Nutch. I am hitting a problem running a distributed crawl with nutch-1.0.
Below is my configuration.
master
ubuntu3

slaves
ubuntu6
ubuntu7
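
(This layout assumes password-less ssh from ubuntu3 to both slaves; a quick sanity check:)

ssh ubuntu6 hostname
ssh ubuntu7 hostname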

urllist.txt
http://www.163.com


crawl-urlfilter.txt
# accept hosts in MY.DOMAIN.NAME
+^http://([a-z0-9]*\.)*163.com/
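
(Side note: the dot before "com" in that pattern is unescaped, so it matches any character; the rule is broader than intended, though it still accepts www.163.com.)

To rule the filter out, the seed URL can be fed through the filter chain. A minimal sketch, assuming the stock URLFilterChecker class is present in this build; note it tests the default regex-urlfilter.txt, while the crawl command uses crawl-urlfilter.txt:

# '+' on the output line means the URL was accepted, '-' rejected
echo "http://www.163.com" | bin/nutch org.apache.nutch.net.URLFilterChecker -allCombined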

hadoop-env.sh
# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME.  All others are
# optional.  When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.  Required.
export JAVA_HOME=/home/hadoop/jdk6


export HADOOP_HOME=/home/hadoop/search

# The maximum amount of heap to use, in MB. Default is 1000.
# export HADOOP_HEAPSIZE=2000

# Extra Java runtime options.  Empty by default.
# export HADOOP_OPTS=-server

# Extra ssh options.  Default: '-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR'.
# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR"

# Where log files are stored.  $HADOOP_HOME/logs by default.
export HADOOP_LOG_DIR=${HADOOP_HOME}/logs

# File naming remote slave hosts.  $HADOOP_HOME/conf/slaves by default.
export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves

# host:path where hadoop code should be rsync'd from.  Unset by default.
# export HADOOP_MASTER=master:/home/$USER/src/hadoop

# The directory where pid files are stored. /tmp by default.
# export HADOOP_PID_DIR=/var/hadoop/pids

# A string representing this instance of hadoop. $USER by default.
# export HADOOP_IDENT_STRING=$USER


hadoop-site.xml

<configuration>
<property>
  <name>fs.default.name</name>
  <value>hdfs://ubuntu3:9000/</value>
  <description>
    The name of the default file system. Either the literal string 
    "local" or a host:port for NDFS.
  </description>
</property>
<property>
  <name>mapred.job.tracker</name>
  <value>hdfs://ubuntu3:9001/</value>
  <description>
    The host and port that the MapReduce job tracker runs at. If 
    "local", then jobs are run in-process as a single map and 
    reduce task.
  </description>
</property>
<property> 
  <name>mapred.map.tasks</name>
  <value>2</value>
  <description>
    define mapred.map tasks to be number of slave hosts
  </description> 
</property> 
<property> 
  <name>mapred.reduce.tasks</name>
  <value>2</value>
  <description>
    define mapred.reduce tasks to be number of slave hosts
  </description> 
</property> 
<property>
  <name>dfs.name.dir</name>
  <value>/home/hadoop/filesystem/name</value>
</property>
<property>
  <name>dfs.data.dir</name>
  <value>/home/hadoop/filesystem/data</value>
</property>
<property>
  <name>mapred.system.dir</name>
  <value>/home/hadoop/filesystem/mapreduce/system</value>
</property>
<property>
  <name>mapred.local.dir</name>
  <value>/home/hadoop/filesystem/mapreduce/local</value>
</property>
<property>
  <name>dfs.replication</name>
  <value>1</value>
</property>
</configuration>
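
With this layout, a quick sanity check that both datanodes registered and that the crawl actually wrote data into HDFS (stock Hadoop commands, run from $HADOOP_HOME):

# confirm ubuntu6 and ubuntu7 show up as live datanodes
bin/hadoop dfsadmin -report

# list what the crawl wrote
bin/hadoop dfs -ls crawled
bin/hadoop dfs -ls crawled/segments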

nutch-site.xml

<configuration>
<property>
  <name>http.robots.agents</name>
  <value>*</value>
</property>
<property>
  <name>http.agent.name</name>
  <value>mic</value>
</property>
<property>
  <name>http.agent.url</name>
  <value>www.baidu.com</value>
</property>
</configuration>
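
One thing worth checking when running on a cluster: the map/reduce tasks read the configuration bundled inside the nutch-1.0.job file, not conf/ on the master, so edits to nutch-site.xml and crawl-urlfilter.txt only take effect after the job file is rebuilt. A quick way to verify what is packaged (assuming a stock Nutch 1.0 source tree):

# inspect the config files inside the job file
unzip -l nutch-1.0.job | grep -E 'nutch-site.xml|crawl-urlfilter.txt'

# rebuild the job file after editing conf/
ant job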


had...@ubuntu3:~/search$ bin/nutch crawl urls -dir crawled -depth 5 -topN 100
crawl started in: crawled
rootUrlDir = urls
threads = 10
depth = 5
topN = 100
Injector: starting
Injector: crawlDb: crawled/crawldb
Injector: urlDir: urls
Injector: Converting injected urls to crawl db entries.
Injector: Merging injected urls into crawl db.
Injector: done
Generator: Selecting best-scoring urls due for fetch.
Generator: starting
Generator: segment: crawled/segments/20090403010943
Generator: filtering: true
Generator: topN: 100
Generator: Partitioning selected urls by host, for politeness.
Generator: done.
Fetcher: starting
Fetcher: segment: crawled/segments/20090403010943
Fetcher: done
CrawlDb update: starting
CrawlDb update: db: crawled/crawldb
CrawlDb update: segments: [crawled/segments/20090403010943]
CrawlDb update: additions allowed: true
CrawlDb update: URL normalizing: true
CrawlDb update: URL filtering: true
CrawlDb update: Merging segment data into db.
CrawlDb update: done
Generator: Selecting best-scoring urls due for fetch.
Generator: starting
Generator: segment: crawled/segments/20090403011147
Generator: filtering: true
Generator: topN: 100
Generator: Partitioning selected urls by host, for politeness.
Generator: done.
Fetcher: starting
Fetcher: segment: crawled/segments/20090403011147
Fetcher: done
CrawlDb update: starting
CrawlDb update: db: crawled/crawldb
CrawlDb update: segments: [crawled/segments/20090403011147]
CrawlDb update: additions allowed: true
CrawlDb update: URL normalizing: true
CrawlDb update: URL filtering: true
CrawlDb update: Merging segment data into db.
CrawlDb update: done
Generator: Selecting best-scoring urls due for fetch.
Generator: starting
Generator: segment: crawled/segments/20090403011354
Generator: filtering: true
Generator: topN: 100
Generator: Partitioning selected urls by host, for politeness.
Generator: done.
Fetcher: starting
Fetcher: segment: crawled/segments/20090403011354
Fetcher: done
CrawlDb update: starting
CrawlDb update: db: crawled/crawldb
CrawlDb update: segments: [crawled/segments/20090403011354]
CrawlDb update: additions allowed: true
CrawlDb update: URL normalizing: true
CrawlDb update: URL filtering: true
CrawlDb update: Merging segment data into db.
CrawlDb update: done
Generator: Selecting best-scoring urls due for fetch.
Generator: starting
Generator: segment: crawled/segments/20090403011601
Generator: filtering: true
Generator: topN: 100
Generator: Partitioning selected urls by host, for politeness.
Generator: done.
Fetcher: starting
Fetcher: segment: crawled/segments/20090403011601
Fetcher: done
CrawlDb update: starting
CrawlDb update: db: crawled/crawldb
CrawlDb update: segments: [crawled/segments/20090403011601]
CrawlDb update: additions allowed: true
CrawlDb update: URL normalizing: true
CrawlDb update: URL filtering: true
CrawlDb update: Merging segment data into db.
CrawlDb update: done
Generator: Selecting best-scoring urls due for fetch.
Generator: starting
Generator: segment: crawled/segments/20090403011810
Generator: filtering: true
Generator: topN: 100
Generator: Partitioning selected urls by host, for politeness.
Generator: done.
Fetcher: starting
Fetcher: segment: crawled/segments/20090403011810
Fetcher: done
CrawlDb update: starting
CrawlDb update: db: crawled/crawldb
CrawlDb update: segments: [crawled/segments/20090403011810]
CrawlDb update: additions allowed: true
CrawlDb update: URL normalizing: true
CrawlDb update: URL filtering: true
CrawlDb update: Merging segment data into db.
CrawlDb update: done
LinkDb: starting
LinkDb: linkdb: crawled/linkdb
LinkDb: URL normalize: true
LinkDb: URL filter: true
LinkDb: adding segment:
hdfs://ubuntu3:9000/user/hadoop/crawled/segments/20090403010943
LinkDb: adding segment:
hdfs://ubuntu3:9000/user/hadoop/crawled/segments/20090403011147
LinkDb: adding segment:
hdfs://ubuntu3:9000/user/hadoop/crawled/segments/20090403011354
LinkDb: adding segment:
hdfs://ubuntu3:9000/user/hadoop/crawled/segments/20090403011601
LinkDb: adding segment:
hdfs://ubuntu3:9000/user/hadoop/crawled/segments/20090403011810
LinkDb: done
Indexer: starting
Indexer: done
Dedup: starting
Dedup: adding indexes in: crawled/indexes
Dedup: done
merging indexes to: crawled/index
Adding hdfs://ubuntu3:9000/user/hadoop/crawled/indexes/part-00000
Adding hdfs://ubuntu3:9000/user/hadoop/crawled/indexes/part-00001
done merging
crawl finished: crawled
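
For reference, the result can be inspected with the stock read tools (paths as in the log above); a minimal sketch:

# how many URLs the crawldb holds, broken down by fetch status
bin/nutch readdb crawled/crawldb -stats

# dump the first segment to see what the fetcher actually wrote
bin/nutch readseg -dump crawled/segments/20090403010943 segdump
bin/hadoop dfs -cat segdump/dump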

Why does the crawl not fetch any URLs or data? The logs show no exceptions. Any help would be appreciated.
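
Since HADOOP_LOG_DIR is set to ${HADOOP_HOME}/logs above, the task logs on the master and both slaves can also be scanned for swallowed errors:

# run on ubuntu3, ubuntu6 and ubuntu7
grep -ri exception /home/hadoop/search/logs/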

