#!/bin/sh
#
# Testbed for proposed SegmentNormalizeTool:
# creates a small set of .html pages, then crawls them
#
# NOTE: Relies on protocol-file, which is disabled by default in nutch-cvs.
# You may have to add the following to your conf/nutch-site.xml:
#
#<property>
#  <name>file.content.ignored</name>
#  <value>false</value>
#</property>
#
#<property>
#  <name>plugin.includes</name>
#  <value>protocol-http|protocol-file|parse-(text|html)|index-basic|query-(basic|site|url)</value>
#</property>
#
# (1/2005 kangas)

# Root of the test tree: defaults to the current directory, or the
# single optional command-line argument.
ROOT=$(pwd)
if [ "$#" -eq 1 ]; then
    ROOT=$1
fi

# Everything this script generates lives under $TEST.
TEST=$ROOT/TEST
# Launcher command for nutch; override here if it is not on PATH.
NUTCH=nutch

# Create htdocs directory + content.
# Refuse to run if $TEST already exists, so an earlier run's results are
# never clobbered.  Use -e (exists), not -r: an existing but unreadable
# path must also stop us, and the error message claims "file exists".

echo "Using TEST dir: $TEST"
if [ -e "$TEST" ]; then
    echo "Error: file exists: $TEST" >&2
    exit 1
fi

echo "Writing to $TEST/htdocs"
mkdir -p "$TEST/htdocs" || exit 1
# Write the miniature site: index.html links to eggs1-4, each eggs page
# links back home (eggs2/eggs3 also link to eggs1), so the crawler has a
# small link graph to follow.  Redirection targets are quoted so a $TEST
# containing spaces does not word-split.
cat > "$TEST/htdocs/index.html" <<EOF
<html>
<head><title>do you foobit?</title></head>
<body>
<h1>do you foobit?</h1>
<a href="eggs1.html">a dish fit for the eggs1</a><p>
<a href="eggs2.html">this was the noblest eggs2 of them all</a><p>
<a href="eggs3.html">et tu, eggs3!</a><p>
<a href="eggs4.html">beware the ides of eggs4</a><p>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs1.html" <<EOF
<html>
<head><title>eggs1</title></head>
<body>
<a href="index.html">go home 1</a>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs2.html" <<EOF
<html>
<head><title>eggs2</title></head>
<body>
<a href="index.html">go home 2</a>
<a href="eggs1.html">go eggs1</a>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs3.html" <<EOF
<html>
<head><title>eggs3</title></head>
<body>
<a href="index.html">go home 3</a>
<a href="eggs1.html">go eggs1</a>
</body>
</html>
EOF

cat > "$TEST/htdocs/eggs4.html" <<EOF
<html>
<head><title>eggs4</title></head>
<body>
<a href="index.html">go home 4</a>
</body>
</html>
EOF

# Seed URL list for the injector.  The here-doc delimiter is deliberately
# unquoted so $TEST expands inside the file.
cat > "$TEST/urlfile" <<EOF
file:$TEST/htdocs/index.html
EOF

# Now crawl this content

# die MSG: report the failing step on stderr and abort with a non-zero
# status.  (A bare `exit` would return the status of the preceding echo,
# i.e. 0, making fatal errors look like success to callers.)
die () { echo "died on $1" >&2; exit 1; }

echo "Writing to $TEST/db $TEST/segments"
mkdir "$TEST/db" "$TEST/segments"
# Initialize a fresh WebDB, then inject the seed URL(s) from urlfile.
echo "--ADMIN-- $TEST/db -create"
$NUTCH admin -local "$TEST/db" -create || die "admin"
echo "--INJECT-- $TEST/db -urlfile $TEST/urlfile"
$NUTCH inject -local "$TEST/db" -urlfile "$TEST/urlfile" || die "inject"

# Verify that our root url wasn't filtered out: the injected page must
# appear as "Page 1" in the db dump.  grep -q gives us the result as an
# exit status, so no output capture is needed.
if ! $NUTCH readdb -local "$TEST/db" -dumppageurl | grep -q "^Page 1"
then
    echo >&2
    echo "ERROR: command failed: nutch inject $TEST/urlfile" >&2
    echo "(do you have a regex-urlfilter.txt that removes 'file:' urls?)" >&2
    exit 1
fi

# Main crawl loop: generate a fetchlist, fetch it, fold the results back
# into the WebDB, and re-analyze.  Three passes are enough to reach every
# page in this tiny site.
for i in 1 2 3; do
    echo "--------- segment $i ----------"
    echo "--GENERATE-- $TEST/db $TEST/segments"
    $NUTCH generate -local "$TEST/db" "$TEST/segments" || die "generate"
    # Most recently created segment; segment dir names start with a
    # timestamp (2...), so the last one sorted is the newest.
    s1=$(ls -d "$TEST"/segments/2* | tail -1)
    echo "--FETCH-- $s1"
    $NUTCH fetch -local -logLevel warning "$s1" || die "fetch $s1"

    # On the first pass only, verify that content was actually fetched
    # and parsed: the segment dump must contain an outlink to eggs1.html.
    # Short-circuiting on $i avoids running the dump on later passes
    # (the old `[ ... -a ... ]` form expanded the command substitution
    # unconditionally, and -a is deprecated/ambiguous in test).
    if [ "$i" -eq 1 ] && \
       ! $NUTCH segread -local -dump "$s1" | grep -q "outlink: toUrl: .*eggs1.html"
    then
        echo >&2
        echo "ERROR: command failed: nutch fetch $s1" >&2
        echo "(does your config include 'parse-file' in 'plugin.includes'?)" >&2
        echo "(try running with '-logLevel fine' for more details)" >&2
        # A bare `exit` here would return 0 and hide the failure.
        exit 1
    fi

    echo "--UPDATEDB-- $TEST/db $s1"
    $NUTCH updatedb -local "$TEST/db" "$s1" || die "updatedb $s1"
    echo "--ANALYZE $TEST/db 2"
    $NUTCH analyze -local "$TEST/db" 2 || die "analyze"
    #$NUTCH index $s1
done

echo
echo "======================================================="
echo "--crawl done--"
echo "======================================================="
echo

# View WebDB data: dump the link structure accumulated by the crawl.
echo "--readdb $TEST/db -dumplinks"
echo
$NUTCH readdb -local "$TEST/db" -dumplinks

echo
echo " ------------------------------------------------------"
echo

# View segment data, skipping raw content and parse output.  The echoed
# line now matches the command actually run (it previously omitted the
# -noparsedata/-noparsetext flags).
echo "--segread -dump -nocontent -noparsedata -noparsetext -dir $TEST/segments"
$NUTCH segread -dump -nocontent -noparsedata -noparsetext -dir "$TEST/segments"