Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change 
notification.

The following page has been changed by mozdevil:
http://wiki.apache.org/nutch/Nutch0%2e9-Hadoop0%2e10-Tutorial

The comment on the change is:
Replaced export variables with absolute path names

------------------------------------------------------------------------------
  
  Nutch is written in Java, so the java compiler and runtime are needed as well 
as ant. Hadoop makes use of ssh clients and servers on all machines. Lucene 
needs a servlet container; I used tomcat5.
  
+ To be able to log in as root with su, execute the following command and enter 
the new password for root as prompted.
+ {{{
+ sudo passwd
+ }}}
+ Login as root
  {{{
  su
- #enable the universe and multiverse repositories.
+ }}}
+ 
+ Enable the universe and multiverse repositories by editing the apt 
sources.list file.
+ {{{
  vi /etc/apt/sources.list 
- #on all the machines
+ }}}
+ Or execute the following if you are in the Netherlands and are using Ubuntu 
6.06 Dapper.
+ {{{
+ echo "deb http://nl.archive.ubuntu.com/ubuntu/ dapper universe multiverse" >> 
/etc/apt/sources.list
+ echo "deb-src http://nl.archive.ubuntu.com/ubuntu/ dapper universe 
multiverse" >> /etc/apt/sources.list
+ }}}
+ 
+ Install the necessary packages for Nutch (java and ssh) on all machines
+ {{{
  apt-get install sun-java5-jre
  apt-get install ssh
  
  update-alternatives --config java
- #and select /usr/lib/jvm/java-1.5.0-sun/jre/bin/java
+ #select /usr/lib/jvm/java-1.5.0-sun/jre/bin/java
+ }}}
  
- #only for the search web application
+ And for the search web server 
+ {{{
  apt-get install apache2
  apt-get install sun-java5-jdk
  apt-get install tomcat5
+ }}}
  
+ Configure tomcat by editing /etc/default/tomcat5
+ {{{ 
  vi /etc/default/tomcat5
  #Add JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun/
+ }}}
+ Or execute the following
+ {{{
+ echo "JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun/" >> /etc/default/tomcat5
  }}}
  
  == Build nutch ==
@@ -34, +59 @@

  
  Unpack the tarball to nutch-nightly and build it with ant.
  {{{
- export NUTCH_BUILD_DIR=~/nutch-build
  tar -xvzf nutch-2007-02-06.tar.gz
  cd nutch-nightly
- mkdir ${NUTCH_BUILD_DIR}
- echo ${NUTCH_BUILD_DIR} >> build.properties
+ mkdir /nutch-build
+ echo "/nutch-build" >> build.properties
  ant package
  }}}
  
@@ -47, +71 @@

  Create the nutch user on each machine and create the necessary directories 
for nutch
  {{{
  ssh [EMAIL PROTECTED]
- export NUTCH_INSTALL_DIR=/nutch-0.9.0
- mkdir ${NUTCH_INSTALL_DIR}
- mkdir ${NUTCH_INSTALL_DIR}/search
- mkdir ${NUTCH_INSTALL_DIR}/filesystem
- mkdir ${NUTCH_INSTALL_DIR}/local
- mkdir ${NUTCH_INSTALL_DIR}/home
+ 
+ mkdir /nutch-0.9.0
+ mkdir /nutch-0.9.0/search
+ mkdir /nutch-0.9.0/filesystem
+ mkdir /nutch-0.9.0/local
+ mkdir /nutch-0.9.0/home
  
  groupadd users
- useradd -d ${NUTCH_INSTALL_DIR}/home -g users nutch
+ useradd -d /nutch-0.9.0/home -g users nutch
  passwd nutch
  
- chown -R nutch:users ${NUTCH_INSTALL_DIR}
+ chown -R nutch:users /nutch-0.9.0
  exit
  }}}
  
@@ -66, +90 @@

  Install nutch on the namenode (the master) and add the following variables to 
the hadoop-env.sh shell script.
  {{{
  ssh [EMAIL PROTECTED]
+ cp -Rv /nutch-build/* /nutch-0.9.0/search/
- export NUTCH_INSTALL_DIR=/nutch-0.9.0
- cp -Rv ${NUTCH_BUILD_DIR}/* ${NUTCH_INSTALL_DIR}/search/
- #chown -R nutch:users ${NUTCH_INSTALL_DIR}
  
- echo "export HADOOP_HOME="${NUTCH_INSTALL_DIR}"/search" >> 
${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
+ echo "export HADOOP_HOME=/nutch-0.9.0/search" >> 
/nutch-0.9.0/search/conf/hadoop-env.sh
- echo "export JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun" >> 
${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
+ echo "export JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun" >> 
/nutch-0.9.0/search/conf/hadoop-env.sh
- echo "export HADOOP_LOG_DIR=\${HADOOP_HOME}/logs" >> 
${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
- echo "export HADOOP_SLAVES=\${HADOOP_HOME}/conf/slaves" >> 
${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
+ echo "export HADOOP_LOG_DIR=/nutch-0.9.0/search/logs" >> 
/nutch-0.9.0/search/conf/hadoop-env.sh
+ echo "export HADOOP_SLAVES=/nutch-0.9.0/search/conf/slaves" >> 
/nutch-0.9.0/search/conf/hadoop-env.sh
  
  exit
  }}}
@@ -81, +103 @@

  === Configure SSH ===
  Create ssh keys so that the nutch user can login over ssh without being 
prompted for a password.
  {{{
- ssh [EMAIL PROTECTED]
- cd ${NUTCH_INSTALL_DIR}/home
+ ssh [EMAIL PROTECTED]
+ cd /nutch-0.9.0/home
+ ssh-keygen -t rsa
+ }}}
+ 
+ {{{
- ssh-keygen -t rsa (Use empty responses for each prompt)
+ #! Use empty responses for each prompt
-   Enter passphrase (empty for no passphrase): 
+ #  Enter passphrase (empty for no passphrase): 
-   Enter same passphrase again: 
+ #  Enter same passphrase again: 
-   Your identification has been saved in ${NUTCH_INSTALL_DIR}/home/.ssh/id_rsa.
+ #  Your identification has been saved in /nutch-0.9.0/home/.ssh/id_rsa.
-   Your public key has been saved in ${NUTCH_INSTALL_DIR}/home/.ssh/id_rsa.pub.
+ #  Your public key has been saved in /nutch-0.9.0/home/.ssh/id_rsa.pub.
-   The key fingerprint is:
+ #  The key fingerprint is:
-   a6:5c:c3:eb:18:94:0b:06:a1:a6:29:58:fa:80:0a:bc [EMAIL PROTECTED]
+ #  a6:5c:c3:eb:18:94:0b:06:a1:a6:29:58:fa:80:0a:bc [EMAIL PROTECTED]
  }}}
  
  Copy the key for this machine to the authorized_keys file that will be copied 
to the other machines (the slaves).
  {{{
- cd ${NUTCH_INSTALL_DIR}/home/.ssh
+ cd /nutch-0.9.0/home/.ssh
  cp id_rsa.pub authorized_keys
  }}}
  
@@ -176, +202 @@

  
  <property>
    <name>dfs.name.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/name</value>
+   <value>/nutch-0.9.0/filesystem/name</value>
  </property>
  
  <property>
    <name>dfs.data.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/data</value>
+   <value>/nutch-0.9.0/filesystem/data</value>
  </property>
  
  <property>
    <name>mapred.system.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/mapreduce/system</value>
+   <value>/nutch-0.9.0/filesystem/mapreduce/system</value>
  </property>
  
  <property>
    <name>mapred.local.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/mapreduce/local</value>
+   <value>/nutch-0.9.0/filesystem/mapreduce/local</value>
  </property>
  
  <property>
@@ -279, +305 @@

  === Distribute the code and the configuration ===
  Copy the code and the configuration to the slaves
  {{{
- scp -r ${NUTCH_INSTALL_DIR}/search/* [EMAIL 
PROTECTED]:${NUTCH_INSTALL_DIR}/search
+ scp -r /nutch-0.9.0/search/* [EMAIL PROTECTED]:/nutch-0.9.0/search
  }}}
  
  Copy the keys to the slave machines
  {{{
- scp ${NUTCH_INSTALL_DIR}/home/.ssh/authorized_keys [EMAIL 
PROTECTED]:${NUTCH_INSTALL_DIR}/home/.ssh/authorized_keys
+ scp /nutch-0.9.0/home/.ssh/authorized_keys [EMAIL 
PROTECTED]:/nutch-0.9.0/home/.ssh/authorized_keys
  }}}
  
  Check if sshd is ready on the machines
@@ -334, +360 @@

  Because the searching needs different settings for nutch than for crawling, 
the easiest thing to do is to make a separate folder for the nutch search part.
  {{{
  ssh [EMAIL PROTECTED]
+ mkdir /nutchsearch-0.9.0
+ chown nutch:users /nutchsearch-0.9.0
- export NUTCH_BUILD_DIR=~/nutch-build
- export SEARCH_INSTALL_DIR=/nutch-search-0.9.0
- mkdir ${SEARCH_INSTALL_DIR}
- chown nutch:users ${SEARCH_INSTALL_DIR}
  exit
  
  ssh [EMAIL PROTECTED]
+ cp -Rv /nutch-build /nutchsearch-0.9.0/search
+ mkdir /nutchsearch-0.9.0/local
- export SEARCH_INSTALL_DIR=/nutch-search-0.9.0
- cp -Rv ${NUTCH_BUILD_DIR}/search ${SEARCH_INSTALL_DIR}/search
- mkdir ${SEARCH_INSTALL_DIR}/local
  }}}
  
  === Configure ===
@@ -363, +386 @@

  
    <property>
      <name>searcher.dir</name>
-     <value>${SEARCH_INSTALL_DIR}/local/crawled</value>
+     <value>/nutchsearch-0.9.0/local/crawled</value>
    </property>
  
  </configuration>
@@ -384, +407 @@

  === Make a local index ===
  Copy the data from dfs to the local filesystem.
  {{{
- bin/hadoop dfs -copyToLocal crawled ${SEARCH_INSTALL_DIR}/local/
+ bin/hadoop dfs -copyToLocal crawled /nutchsearch-0.9.0/local/
  }}}
  
  Test if all is configured properly
@@ -397, +420 @@

  Copy the war file to the tomcat directory
  {{{
  rm -rf /usr/share/tomcat5/webapps/ROOT*
- cp ${SEARCH_INSTALL_DIR}/*.war /usr/share/tomcat5/webapps/ROOT.war
+ cp /nutchsearch-0.9.0/*.war /usr/share/tomcat5/webapps/ROOT.war
  }}}
  
  Copy the configuration to the tomcat directory
  {{{
- cp ${SEARCH_INSTALL_DIR}/search/conf/* 
/usr/share/tomcat5/webapps/ROOT/WEB-INF/classes/
+ cp /nutchsearch-0.9.0/search/conf/* 
/usr/share/tomcat5/webapps/ROOT/WEB-INF/classes/
  }}}
  
  Start tomcat 
@@ -417, +440 @@

  Prepare the other machines that are going to host a part of the index.
  {{{
  ssh [EMAIL PROTECTED]
+ mkdir /nutchsearch-0.9.0
+ mkdir /nutchsearch-0.9.0/search
+ chown -R nutch:users /nutchsearch-0.9.0
- export SEARCH_INSTALL_DIR=/nutchsearch-0.9.0
- mkdir ${SEARCH_INSTALL_DIR}
- mkdir ${SEARCH_INSTALL_DIR}/search
- chown -R nutch:users ${SEARCH_INSTALL_DIR}
  exit
  }}}
  
  Copy the search install directory to other machines.
  {{{
- scp -r ${SEARCH_INSTALL_DIR}/search [EMAIL 
PROTECTED]:${SEARCH_INSTALL_DIR}/search
+ scp -r /nutchsearch-0.9.0/search [EMAIL PROTECTED]:/nutchsearch-0.9.0/search
  }}}
  
  === Configure ===
@@ -454, +476 @@

  
    <property>
      <name>searcher.dir</name>
-     <value>${SEARCH_INSTALL_DIR}/search/conf/</value>
+     <value>/nutchsearch-0.9.0/search/conf/</value>
    </property>
  
  </configuration>
@@ -466, +488 @@

  Copy each part of the index to a different machine.
  {{{
  ???
- scp -R ${SEARCH_INSTALL_DIR}/local/partX/crawled [EMAIL 
PROTECTED]:${SEARCH_INSTALL_DIR}/local/
+ scp -r /nutchsearch-0.9.0/local/partX/crawled [EMAIL 
PROTECTED]:/nutchsearch-0.9.0/local/
  }}}
  
  === Start the services ===
  Start up the search services on all the machines that have a part of the index.
  {{{
- bin/nutch server 9999 ${SEARCH_INSTALL_DIR}/local/crawled
+ bin/nutch server 9999 /nutchsearch-0.9.0/local/crawled
  }}}
  
  Restart the master search node

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to