#!/bin/sh
#
# Testbed for proposed SegmentNormalizeTool:
# creates a small set of .html pages, then crawls them
#
# NOTE: Relies on protocol-file, which is disabled by default in nutch-cvs.
# You may have to add the following to your conf/nutch-site.xml:
#
#<property>
#  <name>file.content.ignored</name>
#  <value>false</value>
#</property>
#
#<property>
#  <name>plugin.includes</name>
#  <value>protocol-http|protocol-file|parse-(text|html)|index-basic|query-(basic|site|url)</value>
#</property>
#
# (1/2005 kangas)

# Root of the test tree: defaults to the current directory, or the
# single optional command-line argument.
ROOT=$(pwd)
if [ "$#" -eq 1 ]; then
    ROOT=$1
fi

# Everything this script generates lives under $TEST.
TEST=$ROOT/TEST
# Launcher command for nutch; override here if it is not on PATH.
NUTCH=nutch

# Create htdocs directory + content.
# Refuse to run if $TEST already exists, so an earlier run's results are
# never clobbered.  Use -e (exists), not -r: an existing but unreadable
# path must also stop us, and the error message claims "file exists".

echo "Using TEST dir: $TEST"
if [ -e "$TEST" ]; then
    echo "Error: file exists: $TEST" >&2
    exit 1
fi

echo "Writing to $TEST/htdocs"
mkdir -p "$TEST/htdocs" || exit 1
# Write the miniature site: index.html links to eggs1-4, each eggs page
# links back home (eggs2/eggs3 also link to eggs1), so the crawler has a
# small link graph to follow.  Redirection targets are quoted so a $TEST
# containing spaces does not word-split.
cat > "$TEST/htdocs/index.html" <<EOF
<html>
<head><title>do you foobit?</title></head>
<body>
<h1>do you foobit?</h1>
<a href="eggs1.html">a dish fit for the eggs1</a><p>
<a href="eggs2.html">this was the noblest eggs2 of them all</a><p>
<a href="eggs3.html">et tu, eggs3!</a><p>
<a href="eggs4.html">beware the ides of eggs4</a><p>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs1.html" <<EOF
<html>
<head><title>eggs1</title></head>
<body>
<a href="index.html">go home 1</a>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs2.html" <<EOF
<html>
<head><title>eggs2</title></head>
<body>
<a href="index.html">go home 2</a>
<a href="eggs1.html">go eggs1</a>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs3.html" <<EOF
<html>
<head><title>eggs3</title></head>
<body>
<a href="index.html">go home 3</a>
<a href="eggs1.html">go eggs1</a>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs4.html" <<EOF
<html>
<head><title>eggs4</title></head>
<body>
<a href="index.html">go home 4</a>
</body>
</html>
EOF

# Seed URL list for the injector.  The here-doc delimiter is deliberately
# unquoted so $TEST expands inside the file.
cat > "$TEST/urlfile" <<EOF
file:$TEST/htdocs/index.html
EOF

# Now crawl this content

# die MSG: report the failing step on stderr and abort with a non-zero
# status.  (A bare `exit` would return the status of the preceding echo,
# i.e. 0, making fatal errors look like success to callers.)
die () { echo "died on $1" >&2; exit 1; }

echo "Writing to $TEST/db $TEST/segments"
mkdir "$TEST/db" "$TEST/segments"
# Initialize a fresh WebDB, then inject the seed URL(s) from urlfile.
echo "--ADMIN-- $TEST/db -create"
$NUTCH admin -local "$TEST/db" -create || die "admin"
echo "--INJECT-- $TEST/db -urlfile $TEST/urlfile"
$NUTCH inject -local "$TEST/db" -urlfile "$TEST/urlfile" || die "inject"

# Verify that our root url wasn't filtered out: the injected page must
# appear as "Page 1" in the db dump.  grep -q gives us the result as an
# exit status, so no output capture is needed.
if ! $NUTCH readdb -local "$TEST/db" -dumppageurl | grep -q "^Page 1"
then
    echo >&2
    echo "ERROR: command failed: nutch inject $TEST/urlfile" >&2
    echo "(do you have a regex-urlfilter.txt that removes 'file:' urls?)" >&2
    exit 1
fi

# Main crawl loop: generate a fetchlist, fetch it, fold the results back
# into the WebDB, and re-analyze.  Three passes are enough to reach every
# page in this tiny site.
for i in 1 2 3; do
    echo "--------- segment $i ----------"
    echo "--GENERATE-- $TEST/db $TEST/segments"
    $NUTCH generate -local "$TEST/db" "$TEST/segments" || die "generate"
    # Most recently created segment; segment dir names start with a
    # timestamp (2...), so the last one sorted is the newest.
    s1=$(ls -d "$TEST"/segments/2* | tail -1)
    echo "--FETCH-- $s1"
    $NUTCH fetch -local -logLevel warning "$s1" || die "fetch $s1"

    # On the first pass only, verify that content was actually fetched
    # and parsed: the segment dump must contain an outlink to eggs1.html.
    # Short-circuiting on $i avoids running the dump on later passes
    # (the old `[ ... -a ... ]` form expanded the command substitution
    # unconditionally, and -a is deprecated/ambiguous in test).
    if [ "$i" -eq 1 ] && \
       ! $NUTCH segread -local -dump "$s1" | grep -q "outlink: toUrl: .*eggs1.html"
    then
        echo >&2
        echo "ERROR: command failed: nutch fetch $s1" >&2
        echo "(does your config include 'parse-file' in 'plugin.includes'?)" >&2
        echo "(try running with '-logLevel fine' for more details)" >&2
        # A bare `exit` here would return 0 and hide the failure.
        exit 1
    fi

    echo "--UPDATEDB-- $TEST/db $s1"
    $NUTCH updatedb -local "$TEST/db" "$s1" || die "updatedb $s1"
    echo "--ANALYZE $TEST/db 2"
    $NUTCH analyze -local "$TEST/db" 2 || die "analyze"
    #$NUTCH index $s1
done

echo
echo "======================================================="
echo "--crawl done--"
echo "======================================================="
echo

# View WebDB data: dump the link structure accumulated by the crawl.
echo "--readdb $TEST/db -dumplinks"
echo
$NUTCH readdb -local "$TEST/db" -dumplinks

echo
echo " ------------------------------------------------------"
echo

# View segment data, skipping raw content and parse output.  The echoed
# line now matches the command actually run (it previously omitted the
# -noparsedata/-noparsetext flags).
echo "--segread -dump -nocontent -noparsedata -noparsetext -dir $TEST/segments"
$NUTCH segread -dump -nocontent -noparsedata -noparsetext -dir "$TEST/segments"