Dear Wiki user, You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The "Automating_Fetches_with_Python" page has been changed by newacct. http://wiki.apache.org/nutch/Automating_Fetches_with_Python?action=diff&rev1=5&rev2=6 -------------------------------------------------- import sys import getopt import re - import string import logging import logging.config import commands @@ -259, +258 @@ total_urls += 1 urllinecount.close() numsplits = total_urls / splitsize - padding = "0" * len(`numsplits`) + padding = "0" * len(repr(numsplits)) # create the url load folder - linenum = 0 filenum = 0 - strfilenum = `filenum` + strfilenum = repr(filenum) urloutdir = outdir + "/urls-" + padding[len(strfilenum):] + strfilenum os.mkdir(urloutdir) urlfile = urloutdir + "/urls" @@ -275, +273 @@ outhandle = open(urlfile, "w") # loop through the file - for line in inhandle: + for linenum, line in enumerate(inhandle): # if we have come to a split then close the current file, create a new # url folder and open a new url file - if linenum > 0 and (linenum % splitsize == 0): + if linenum > 0 and linenum % splitsize == 0: - filenum = filenum + 1 + filenum += 1 - strfilenum = `filenum` + strfilenum = repr(filenum) urloutdir = outdir + "/urls-" + padding[len(strfilenum):] + strfilenum os.mkdir(urloutdir) urlfile = urloutdir + "/urls" @@ -290, +288 @@ outhandle.close() outhandle = open(urlfile, "w") - # write the url to the file and increase the number of lines read + # write the url to the file outhandle.write(line) - linenum = linenum + 1 # close the input and output files inhandle.close() @@ -362, +359 @@ # fetch the current segment outar = result[1].splitlines() - output = outar[len(outar) - 1] + output = outar[-1] - tempseg = string.split(output)[0] + tempseg = output.split()[0] tempseglist.append(tempseg) fetch = self.nutchdir + "/bin/nutch fetch " + tempseg self.log.info("Starting fetch for: " + tempseg) @@ -392, +389 @@ # merge the crawldbs self.log.info("Merging master and temp crawldbs.") - crawlmerge = (self.nutchdir + "/bin/nutch mergedb mergetemp/crawldb " + + crawlmerge = self.nutchdir + "/bin/nutch mergedb mergetemp/crawldb " + \ - mastercrawldbdir + " " + string.join(tempdblist, " ")) + mastercrawldbdir + " " + " ".join(tempdblist) self.log.info("Running: " + crawlmerge) result = commands.getstatusoutput(crawlmerge) self.checkStatus(result, "Error occurred while running command" + crawlmerge) @@ -404, +401 @@ result = commands.getstatusoutput(getsegment) self.checkStatus(result, "Error occurred while running command" + getsegment) outar = result[1].splitlines() - output = outar[len(outar) - 1] + output = outar[-1] - masterseg = string.split(output)[0] + masterseg = output.split()[0] - mergesegs = (self.nutchdir + "/bin/nutch mergesegs mergetemp/segments " + + mergesegs = self.nutchdir + "/bin/nutch mergesegs mergetemp/segments " + \ - masterseg + " " + string.join(tempseglist, " ")) + masterseg + " " + " ".join(tempseglist) self.log.info("Running: " + mergesegs) result = commands.getstatusoutput(mergesegs) self.checkStatus(result, "Error occurred while running command" + mergesegs) @@ -464, +461 @@ usage.append(" [-b | --backupdir] The master backup directory, [crawl-backup].\n") usage.append(" [-s | --splitsize] The number of urls per load [500000].\n") usage.append(" [-f | --fetchmerge] The number of fetches to run before merging [1].\n") - message = string.join(usage) + message = " ".join(usage) print message """