Author: lewismc Date: Thu Jan 29 05:38:59 2015 New Revision: 1655526 URL: http://svn.apache.org/r1655526 Log: NUTCH-865 Format source code in unique style
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java 
nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java nutch/trunk/src/java/org/apache/nutch/indexer/CleaningJob.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexWriter.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexWriters.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java nutch/trunk/src/java/org/apache/nutch/indexer/NutchIndexAction.java nutch/trunk/src/java/org/apache/nutch/metadata/CreativeCommons.java nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java nutch/trunk/src/java/org/apache/nutch/net/package-info.java nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java 
nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java nutch/trunk/src/java/org/apache/nutch/parse/Parse.java nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java nutch/trunk/src/java/org/apache/nutch/parse/Parser.java nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java nutch/trunk/src/java/org/apache/nutch/parse/package-info.java nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java nutch/trunk/src/java/org/apache/nutch/plugin/ExtensionPoint.java nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java nutch/trunk/src/java/org/apache/nutch/plugin/Plugin.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java 
nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginRuntimeException.java nutch/trunk/src/java/org/apache/nutch/protocol/Content.java nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolNotFound.java nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java nutch/trunk/src/java/org/apache/nutch/protocol/package-info.java nutch/trunk/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/package-info.java 
nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilters.java nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java nutch/trunk/src/java/org/apache/nutch/segment/package-info.java nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java nutch/trunk/src/java/org/apache/nutch/tools/package-info.java nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java 
nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java nutch/trunk/src/java/org/apache/nutch/util/package-info.java nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java 
nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java 
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java 
nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java 
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java 
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java 
nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java nutch/trunk/src/test/org/apache/nutch/crawl/TestSignatureFactory.java nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java nutch/trunk/src/test/org/apache/nutch/parse/TestParseText.java nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java nutch/trunk/src/test/org/apache/nutch/plugin/HelloWorldExtension.java nutch/trunk/src/test/org/apache/nutch/plugin/ITestExtension.java nutch/trunk/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java nutch/trunk/src/test/org/apache/nutch/protocol/TestProtocolFactory.java 
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/DelayHandler.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java nutch/trunk/src/test/org/apache/nutch/tools/proxy/package-info.java nutch/trunk/src/test/org/apache/nutch/util/TestGZIPUtils.java nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java nutch/trunk/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java nutch/trunk/src/test/org/apache/nutch/util/TestStringUtil.java nutch/trunk/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jan 29 05:38:59 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-865 Format source code in unique style (lewismc) + * NUTCH-1893 Parse-tika failes to parse feed files (Mengying Wang via snagel) * NUTCH-1920 Upgrade Nutch to use Java 1.7 (lewismc) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Thu Jan 29 05:38:59 2015 @@ -30,38 +30,43 @@ import org.apache.nutch.crawl.CrawlDatum * * @author Andrzej Bialecki */ -public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule { - private static final Logger LOG = LoggerFactory.getLogger(AbstractFetchSchedule.class); - +public abstract class AbstractFetchSchedule extends Configured implements + FetchSchedule { + private static final Logger LOG = LoggerFactory + .getLogger(AbstractFetchSchedule.class); + protected int defaultInterval; protected int maxInterval; - + public AbstractFetchSchedule() { super(null); } - + public AbstractFetchSchedule(Configuration conf) { super(conf); } - + public void setConf(Configuration conf) { super.setConf(conf); - if (conf == null) return; + if (conf == null) + return; defaultInterval = conf.getInt("db.fetch.interval.default", 0); - maxInterval = conf.getInt("db.fetch.interval.max", 0 ); + maxInterval = conf.getInt("db.fetch.interval.max", 0); LOG.info("defaultInterval=" + defaultInterval); LOG.info("maxInterval=" + maxInterval); } - + /** - * Initialize fetch schedule related data. Implementations should at least - * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default - * implementation sets the <code>fetchTime</code> to now, using the - * default <code>fetchInterval</code>. - * - * @param url URL of the page. - * - * @param datum datum instance to be initialized (modified in place). + * Initialize fetch schedule related data. Implementations should at least set + * the <code>fetchTime</code> and <code>fetchInterval</code>. 
The default + * implementation sets the <code>fetchTime</code> to now, using the default + * <code>fetchInterval</code>. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance to be initialized (modified in place). */ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) { datum.setFetchTime(System.currentTimeMillis()); @@ -69,101 +74,111 @@ public abstract class AbstractFetchSched datum.setRetriesSinceFetch(0); return datum; } - + /** * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a - * successfully fetched page. NOTE: this implementation resets the - * retry counter - extending classes should call super.setFetchSchedule() to + * successfully fetched page. NOTE: this implementation resets the retry + * counter - extending classes should call super.setFetchSchedule() to * preserve this behavior. */ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, - long prevFetchTime, long prevModifiedTime, - long fetchTime, long modifiedTime, int state) { + long prevFetchTime, long prevModifiedTime, long fetchTime, + long modifiedTime, int state) { datum.setRetriesSinceFetch(0); return datum; } - + /** - * This method specifies how to schedule refetching of pages - * marked as GONE. Default implementation increases fetchInterval by 50% - * but the value may never exceed <code>maxInterval</code>. - * - * @param url URL of the page. - * - * @param datum datum instance to be adjusted. - * + * This method specifies how to schedule refetching of pages marked as GONE. + * Default implementation increases fetchInterval by 50% but the value may + * never exceed <code>maxInterval</code>. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance to be adjusted. + * * @return adjusted page information, including all original information. 
- * NOTE: this may be a different instance than @see CrawlDatum, but - * implementations should make sure that it contains at least all - * information from @see CrawlDatum. + * NOTE: this may be a different instance than @see CrawlDatum, but + * implementations should make sure that it contains at least all + * information from @see CrawlDatum. */ public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, - long prevFetchTime, long prevModifiedTime, long fetchTime) { + long prevFetchTime, long prevModifiedTime, long fetchTime) { // no page is truly GONE ... just increase the interval by 50% // and try much later. if ((datum.getFetchInterval() * 1.5f) < maxInterval) datum.setFetchInterval(datum.getFetchInterval() * 1.5f); else datum.setFetchInterval(maxInterval * 0.9f); - datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000); + datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000); return datum; } - + /** - * This method adjusts the fetch schedule if fetching needs to be - * re-tried due to transient errors. The default implementation - * sets the next fetch time 1 day in the future and increases - * the retry counter. - * - * @param url URL of the page. - * - * @param datum page information. - * - * @param prevFetchTime previous fetch time. - * - * @param prevModifiedTime previous modified time. - * - * @param fetchTime current fetch time. - * + * This method adjusts the fetch schedule if fetching needs to be re-tried due + * to transient errors. The default implementation sets the next fetch time 1 + * day in the future and increases the retry counter. + * + * @param url + * URL of the page. + * + * @param datum + * page information. + * + * @param prevFetchTime + * previous fetch time. + * + * @param prevModifiedTime + * previous modified time. + * + * @param fetchTime + * current fetch time. + * * @return adjusted page information, including all original information. 
- * NOTE: this may be a different instance than @see CrawlDatum, but - * implementations should make sure that it contains at least all - * information from @see CrawlDatum. + * NOTE: this may be a different instance than @see CrawlDatum, but + * implementations should make sure that it contains at least all + * information from @see CrawlDatum. */ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, - long prevFetchTime, long prevModifiedTime, long fetchTime) { - datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY*1000); + long prevFetchTime, long prevModifiedTime, long fetchTime) { + datum.setFetchTime(fetchTime + (long) SECONDS_PER_DAY * 1000); datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1); return datum; } - + /** * This method return the last fetch time of the CrawlDatum + * * @return the date as a long. */ public long calculateLastFetchTime(CrawlDatum datum) { - return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000; + return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000; } /** - * This method provides information whether the page is suitable for - * selection in the current fetchlist. NOTE: a true return value does not - * guarantee that the page will be fetched, it just allows it to be - * included in the further selection process based on scores. The default - * implementation checks <code>fetchTime</code>, if it is higher than the - * <code>curTime</code> it returns false, and true otherwise. It will also - * check that fetchTime is not too remote (more than <code>maxInterval</code>, - * in which case it lowers the interval and returns true. - * - * @param url URL of the page. - * - * @param datum datum instance. - * - * @param curTime reference time (usually set to the time when the - * fetchlist generation process was started). - * + * This method provides information whether the page is suitable for selection + * in the current fetchlist. 
NOTE: a true return value does not guarantee that + * the page will be fetched, it just allows it to be included in the further + * selection process based on scores. The default implementation checks + * <code>fetchTime</code>, if it is higher than the <code>curTime</code> it + * returns false, and true otherwise. It will also check that fetchTime is not + * too remote (more than <code>maxInterval</code>, in which case it lowers the + * interval and returns true. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance. + * + * @param curTime + * reference time (usually set to the time when the fetchlist + * generation process was started). + * * @return true, if the page should be considered for inclusion in the current - * fetchlist, otherwise false. + * fetchlist, otherwise false. */ public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) { // pages are never truly GONE - we have to check them from time to time. @@ -176,24 +191,27 @@ public abstract class AbstractFetchSched datum.setFetchTime(curTime); } if (datum.getFetchTime() > curTime) { - return false; // not time yet + return false; // not time yet } return true; } - + /** * This method resets fetchTime, fetchInterval, modifiedTime, * retriesSinceFetch and page signature, so that it forces refetching. - * - * @param url URL of the page. - * - * @param datum datum instance. - * - * @param asap if true, force refetch as soon as possible - this sets - * the fetchTime to now. If false, force refetch whenever the next fetch - * time is set. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance. + * + * @param asap + * if true, force refetch as soon as possible - this sets the + * fetchTime to now. If false, force refetch whenever the next fetch + * time is set. 
*/ - public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) { + public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) { // reduce fetchInterval so that it fits within the max value if (datum.getFetchInterval() > maxInterval) datum.setFetchInterval(maxInterval * 0.9f); @@ -201,7 +219,8 @@ public abstract class AbstractFetchSched datum.setRetriesSinceFetch(0); datum.setSignature(null); datum.setModifiedTime(0L); - if (asap) datum.setFetchTime(System.currentTimeMillis()); + if (asap) + datum.setFetchTime(System.currentTimeMillis()); return datum; } Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Thu Jan 29 05:38:59 2015 @@ -37,11 +37,12 @@ import org.slf4j.LoggerFactory; * If SYNC_DELTA property is true, then: * <ul> * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li> - * <li>try to synchronize with the time of change, by shifting the next fetchTime - * by a fraction of the difference between the last modification time and the last - * fetch time. I.e. the next fetch time will be set to + * <li>try to synchronize with the time of change, by shifting the next + * fetchTime by a fraction of the difference between the last modification time + * and the last fetch time. I.e. 
the next fetch time will be set to * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li> - * <li>if the adjusted fetch interval is bigger than the delta, then <code>fetchInterval = delta</code>.</li> + * <li>if the adjusted fetch interval is bigger than the delta, then + * <code>fetchInterval = delta</code>.</li> * </ul> * </li> * <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL @@ -49,17 +50,21 @@ import org.slf4j.LoggerFactory; * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL * (default is 365 days).</li> * </ul> - * <p>NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize the algorithm, - * so that the fetch interval either increases or decreases infinitely, with little - * relevance to the page changes. Please use {@link #main(String[])} method to - * test the values before applying them in a production system.</p> + * <p> + * NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize + * the algorithm, so that the fetch interval either increases or decreases + * infinitely, with little relevance to the page changes. Please use + * {@link #main(String[])} method to test the values before applying them in a + * production system. 
+ * </p> * * @author Andrzej Bialecki */ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { // Loggg - public static final Logger LOG = LoggerFactory.getLogger(AbstractFetchSchedule.class); + public static final Logger LOG = LoggerFactory + .getLogger(AbstractFetchSchedule.class); protected float INC_RATE; @@ -68,26 +73,29 @@ public class AdaptiveFetchSchedule exten private int MAX_INTERVAL; private int MIN_INTERVAL; - + private boolean SYNC_DELTA; private double SYNC_DELTA_RATE; - + public void setConf(Configuration conf) { super.setConf(conf); - if (conf == null) return; + if (conf == null) + return; INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f); MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60); - MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", SECONDS_PER_DAY * 365 ); // 1 year + MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", + SECONDS_PER_DAY * 365); // 1 year SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true); - SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); + SYNC_DELTA_RATE = conf.getFloat( + "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); } @Override public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, - long prevFetchTime, long prevModifiedTime, - long fetchTime, long modifiedTime, int state) { + long prevFetchTime, long prevModifiedTime, long fetchTime, + long modifiedTime, int state) { super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, fetchTime, modifiedTime, state); @@ -99,24 +107,27 @@ public class AdaptiveFetchSchedule exten if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) { // Is fetch interval preset in CrawlDatum MD? 
Then use preset interval - FloatWritable customIntervalWritable= (FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY)); + FloatWritable customIntervalWritable = (FloatWritable) (datum + .getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY)); interval = customIntervalWritable.get(); } else { - if (modifiedTime <= 0) modifiedTime = fetchTime; + if (modifiedTime <= 0) + modifiedTime = fetchTime; switch (state) { - case FetchSchedule.STATUS_MODIFIED: - interval *= (1.0f - DEC_RATE); - break; - case FetchSchedule.STATUS_NOTMODIFIED: - interval *= (1.0f + INC_RATE); - break; - case FetchSchedule.STATUS_UNKNOWN: - break; + case FetchSchedule.STATUS_MODIFIED: + interval *= (1.0f - DEC_RATE); + break; + case FetchSchedule.STATUS_NOTMODIFIED: + interval *= (1.0f + INC_RATE); + break; + case FetchSchedule.STATUS_UNKNOWN: + break; } if (SYNC_DELTA) { // try to synchronize with the time of change long delta = (fetchTime - modifiedTime) / 1000L; - if (delta > interval) interval = delta; + if (delta > interval) + interval = delta; refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000); } if (interval < MIN_INTERVAL) { @@ -154,30 +165,39 @@ public class AdaptiveFetchSchedule exten // let's move the timeline a couple of deltas for (int i = 0; i < 10000; i++) { if (lastModified + update < curTime) { - //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime); + // System.out.println("i=" + i + ", lastModified=" + lastModified + + // ", update=" + update + ", curTime=" + curTime); changed = true; changeCnt++; lastModified = curTime; } - LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " - + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss); + LOG.info(i + ". 
" + changed + "\twill fetch at " + + (p.getFetchTime() / delta) + "\tinterval " + + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed " + + miss); if (p.getFetchTime() <= curTime) { fetchCnt++; - fs.setFetchSchedule(new Text("http://www.example.com"), p, - p.getFetchTime(), p.getModifiedTime(), curTime, lastModified, - changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED); - LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " - + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days"); - if (!changed) miss++; - if (miss > maxMiss) maxMiss = miss; + fs.setFetchSchedule(new Text("http://www.example.com"), p, p + .getFetchTime(), p.getModifiedTime(), curTime, lastModified, + changed ? FetchSchedule.STATUS_MODIFIED + : FetchSchedule.STATUS_NOTMODIFIED); + LOG.info("\tfetched & adjusted: " + "\twill fetch at " + + (p.getFetchTime() / delta) + "\tinterval " + + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"); + if (!changed) + miss++; + if (miss > maxMiss) + maxMiss = miss; changed = false; totalMiss += miss; miss = 0; } - if (changed) miss++; + if (changed) + miss++; curTime += delta; } LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss); - LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times."); + LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + + " times."); } } Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Thu Jan 29 05:38:59 2015 @@ -41,52 +41,51 @@ public class CrawlDatum implements Writa private static final byte OLD_STATUS_FETCH_SUCCESS = 5; 
private static final byte OLD_STATUS_FETCH_RETRY = 6; private static final byte OLD_STATUS_FETCH_GONE = 7; - + private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>(); - + /** Page was not fetched yet. */ - public static final byte STATUS_DB_UNFETCHED = 0x01; + public static final byte STATUS_DB_UNFETCHED = 0x01; /** Page was successfully fetched. */ - public static final byte STATUS_DB_FETCHED = 0x02; + public static final byte STATUS_DB_FETCHED = 0x02; /** Page no longer exists. */ - public static final byte STATUS_DB_GONE = 0x03; + public static final byte STATUS_DB_GONE = 0x03; /** Page temporarily redirects to other page. */ - public static final byte STATUS_DB_REDIR_TEMP = 0x04; + public static final byte STATUS_DB_REDIR_TEMP = 0x04; /** Page permanently redirects to other page. */ - public static final byte STATUS_DB_REDIR_PERM = 0x05; + public static final byte STATUS_DB_REDIR_PERM = 0x05; /** Page was successfully fetched and found not modified. */ - public static final byte STATUS_DB_NOTMODIFIED = 0x06; - public static final byte STATUS_DB_DUPLICATE = 0x07; - + public static final byte STATUS_DB_NOTMODIFIED = 0x06; + public static final byte STATUS_DB_DUPLICATE = 0x07; + /** Maximum value of DB-related status. */ - public static final byte STATUS_DB_MAX = 0x1f; - + public static final byte STATUS_DB_MAX = 0x1f; + /** Fetching was successful. */ - public static final byte STATUS_FETCH_SUCCESS = 0x21; + public static final byte STATUS_FETCH_SUCCESS = 0x21; /** Fetching unsuccessful, needs to be retried (transient errors). */ - public static final byte STATUS_FETCH_RETRY = 0x22; + public static final byte STATUS_FETCH_RETRY = 0x22; /** Fetching temporarily redirected to other page. */ - public static final byte STATUS_FETCH_REDIR_TEMP = 0x23; + public static final byte STATUS_FETCH_REDIR_TEMP = 0x23; /** Fetching permanently redirected to other page. 
*/ - public static final byte STATUS_FETCH_REDIR_PERM = 0x24; + public static final byte STATUS_FETCH_REDIR_PERM = 0x24; /** Fetching unsuccessful - page is gone. */ - public static final byte STATUS_FETCH_GONE = 0x25; + public static final byte STATUS_FETCH_GONE = 0x25; /** Fetching successful - page is not modified. */ public static final byte STATUS_FETCH_NOTMODIFIED = 0x26; - + /** Maximum value of fetch-related status. */ - public static final byte STATUS_FETCH_MAX = 0x3f; - + public static final byte STATUS_FETCH_MAX = 0x3f; + /** Page signature. */ - public static final byte STATUS_SIGNATURE = 0x41; + public static final byte STATUS_SIGNATURE = 0x41; /** Page was newly injected. */ - public static final byte STATUS_INJECTED = 0x42; + public static final byte STATUS_INJECTED = 0x42; /** Page discovered through a link. */ - public static final byte STATUS_LINKED = 0x43; + public static final byte STATUS_LINKED = 0x43; /** Page got metadata from a parser */ - public static final byte STATUS_PARSE_META = 0x44; - - + public static final byte STATUS_PARSE_META = 0x44; + public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>(); static { statNames.put(STATUS_DB_UNFETCHED, "db_unfetched"); @@ -106,7 +105,7 @@ public class CrawlDatum implements Writa statNames.put(STATUS_FETCH_GONE, "fetch_gone"); statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified"); statNames.put(STATUS_PARSE_META, "parse_metadata"); - + oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED); oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED); oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE); @@ -125,22 +124,25 @@ public class CrawlDatum implements Writa private byte[] signature = null; private long modifiedTime; private org.apache.hadoop.io.MapWritable metaData; - + public static boolean hasDbStatus(CrawlDatum datum) { - if (datum.status <= STATUS_DB_MAX) return true; + if (datum.status <= STATUS_DB_MAX) + return true; return false; } public static boolean 
hasFetchStatus(CrawlDatum datum) { - if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX) return true; + if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX) + return true; return false; } - public CrawlDatum() { } + public CrawlDatum() { + } public CrawlDatum(int status, int fetchInterval) { this(); - this.status = (byte)status; + this.status = (byte) status; this.fetchInterval = fetchInterval; } @@ -153,26 +155,36 @@ public class CrawlDatum implements Writa // accessor methods // - public byte getStatus() { return status; } - + public byte getStatus() { + return status; + } + public static String getStatusName(byte value) { String res = statNames.get(value); - if (res == null) res = "unknown"; + if (res == null) + res = "unknown"; return res; } - - public void setStatus(int status) { this.status = (byte)status; } + + public void setStatus(int status) { + this.status = (byte) status; + } /** * Returns either the time of the last fetch, or the next fetch time, * depending on whether Fetcher or CrawlDbReducer set the time. */ - public long getFetchTime() { return fetchTime; } + public long getFetchTime() { + return fetchTime; + } + /** - * Sets either the time of the last fetch or the next fetch time, - * depending on whether Fetcher or CrawlDbReducer set the time. + * Sets either the time of the last fetch or the next fetch time, depending on + * whether Fetcher or CrawlDbReducer set the time. 
*/ - public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; } + public void setFetchTime(long fetchTime) { + this.fetchTime = fetchTime; + } public long getModifiedTime() { return modifiedTime; @@ -181,20 +193,34 @@ public class CrawlDatum implements Writa public void setModifiedTime(long modifiedTime) { this.modifiedTime = modifiedTime; } - - public byte getRetriesSinceFetch() { return retries; } - public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;} - public int getFetchInterval() { return fetchInterval; } + public byte getRetriesSinceFetch() { + return retries; + } + + public void setRetriesSinceFetch(int retries) { + this.retries = (byte) retries; + } + + public int getFetchInterval() { + return fetchInterval; + } + public void setFetchInterval(int fetchInterval) { this.fetchInterval = fetchInterval; } + public void setFetchInterval(float fetchInterval) { this.fetchInterval = Math.round(fetchInterval); } - public float getScore() { return score; } - public void setScore(float score) { this.score = score; } + public float getScore() { + return score; + } + + public void setScore(float score) { + this.score = score; + } public byte[] getSignature() { return signature; @@ -202,33 +228,37 @@ public class CrawlDatum implements Writa public void setSignature(byte[] signature) { if (signature != null && signature.length > 256) - throw new RuntimeException("Max signature length (256) exceeded: " + signature.length); + throw new RuntimeException("Max signature length (256) exceeded: " + + signature.length); this.signature = signature; } - - public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) { - this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable); - } - - /** Add all metadata from other CrawlDatum to this CrawlDatum. 
- * - * @param other CrawlDatum - */ - public void putAllMetaData(CrawlDatum other) { - for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) { - getMetaData().put(e.getKey(), e.getValue()); - } - } + + public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) { + this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable); + } /** - * returns a MapWritable if it was set or read in @see readFields(DataInput), - * returns empty map in case CrawlDatum was freshly created (lazily instantiated). + * Add all metadata from other CrawlDatum to this CrawlDatum. + * + * @param other + * CrawlDatum + */ + public void putAllMetaData(CrawlDatum other) { + for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) { + getMetaData().put(e.getKey(), e.getValue()); + } + } + + /** + * returns a MapWritable if it was set or read in @see readFields(DataInput), + * returns empty map in case CrawlDatum was freshly created (lazily + * instantiated). */ public org.apache.hadoop.io.MapWritable getMetaData() { - if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable(); + if (this.metaData == null) + this.metaData = new org.apache.hadoop.io.MapWritable(); return this.metaData; } - // // writable methods @@ -241,8 +271,8 @@ public class CrawlDatum implements Writa } public void readFields(DataInput in) throws IOException { - byte version = in.readByte(); // read version - if (version > CUR_VERSION) // check version + byte version = in.readByte(); // read version + if (version > CUR_VERSION) // check version throw new VersionMismatchException(CUR_VERSION, version); status = in.readByte(); @@ -250,7 +280,8 @@ public class CrawlDatum implements Writa retries = in.readByte(); if (version > 5) { fetchInterval = in.readInt(); - } else fetchInterval = Math.round(in.readFloat()); + } else + fetchInterval = Math.round(in.readFloat()); score = in.readFloat(); if (version > 2) { modifiedTime = in.readLong(); @@ -258,9 +289,10 @@ public 
class CrawlDatum implements Writa if (cnt > 0) { signature = new byte[cnt]; in.readFully(signature); - } else signature = null; + } else + signature = null; } - + if (version > 3) { boolean hasMetadata = false; if (version < 7) { @@ -280,7 +312,8 @@ public class CrawlDatum implements Writa metaData.readFields(in); } } - if (hasMetadata==false) metaData = null; + if (hasMetadata == false) + metaData = null; } // translate status codes if (version < 5) { @@ -288,7 +321,7 @@ public class CrawlDatum implements Writa status = oldToNew.get(status); else status = STATUS_DB_UNFETCHED; - + } } @@ -297,7 +330,7 @@ public class CrawlDatum implements Writa private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8; public void write(DataOutput out) throws IOException { - out.writeByte(CUR_VERSION); // store current version + out.writeByte(CUR_VERSION); // store current version out.writeByte(status); out.writeLong(fetchTime); out.writeByte(retries); @@ -328,17 +361,19 @@ public class CrawlDatum implements Writa this.modifiedTime = that.modifiedTime; this.signature = that.signature; if (that.metaData != null) { - this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy + this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make + // a + // deep + // copy } else { this.metaData = null; } } - // // compare methods // - + /** Sort by decreasing score. */ public int compareTo(CrawlDatum that) { if (that.score != this.score) @@ -356,47 +391,49 @@ public class CrawlDatum implements Writa return SignatureComparator._compare(this, that); } - /** A Comparator optimized for CrawlDatum. */ + /** A Comparator optimized for CrawlDatum. 
*/ public static class Comparator extends WritableComparator { - public Comparator() { super(CrawlDatum.class); } + public Comparator() { + super(CrawlDatum.class); + } public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { - float score1 = readFloat(b1,s1+SCORE_OFFSET); - float score2 = readFloat(b2,s2+SCORE_OFFSET); + float score1 = readFloat(b1, s1 + SCORE_OFFSET); + float score2 = readFloat(b2, s2 + SCORE_OFFSET); if (score2 != score1) { return (score2 - score1) > 0 ? 1 : -1; } - int status1 = b1[s1+1]; - int status2 = b2[s2+1]; + int status1 = b1[s1 + 1]; + int status2 = b2[s2 + 1]; if (status2 != status1) return status1 - status2; - long fetchTime1 = readLong(b1, s1+1+1); - long fetchTime2 = readLong(b2, s2+1+1); + long fetchTime1 = readLong(b1, s1 + 1 + 1); + long fetchTime2 = readLong(b2, s2 + 1 + 1); if (fetchTime2 != fetchTime1) return (fetchTime2 - fetchTime1) > 0 ? 1 : -1; - int retries1 = b1[s1+1+1+8]; - int retries2 = b2[s2+1+1+8]; + int retries1 = b1[s1 + 1 + 1 + 8]; + int retries2 = b2[s2 + 1 + 1 + 8]; if (retries2 != retries1) return retries2 - retries1; - int fetchInterval1 = readInt(b1, s1+1+1+8+1); - int fetchInterval2 = readInt(b2, s2+1+1+8+1); + int fetchInterval1 = readInt(b1, s1 + 1 + 1 + 8 + 1); + int fetchInterval2 = readInt(b2, s2 + 1 + 1 + 8 + 1); if (fetchInterval2 != fetchInterval1) return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1; long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4); long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4); if (modifiedTime2 != modifiedTime1) return (modifiedTime2 - modifiedTime1) > 0 ? 
1 : -1; - int sigl1 = b1[s1+SIG_OFFSET]; - int sigl2 = b2[s2+SIG_OFFSET]; - return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, SIG_OFFSET, sigl2); + int sigl1 = b1[s1 + SIG_OFFSET]; + int sigl2 = b2[s2 + SIG_OFFSET]; + return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, + SIG_OFFSET, sigl2); } } - static { // register this comparator + static { // register this comparator WritableComparator.define(CrawlDatum.class, new Comparator()); } - // // basic methods // @@ -404,12 +441,13 @@ public class CrawlDatum implements Writa public String toString() { StringBuilder buf = new StringBuilder(); buf.append("Version: " + CUR_VERSION + "\n"); - buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n"); + buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + + ")\n"); buf.append("Fetch time: " + new Date(getFetchTime()) + "\n"); buf.append("Modified time: " + new Date(getModifiedTime()) + "\n"); buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n"); - buf.append("Retry interval: " + getFetchInterval() + " seconds (" + - (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n"); + buf.append("Retry interval: " + getFetchInterval() + " seconds (" + + (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n"); buf.append("Score: " + getScore() + "\n"); buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n"); buf.append("Metadata: \n "); @@ -424,35 +462,35 @@ public class CrawlDatum implements Writa } return buf.toString(); } - + private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) { - if (metaData==null || metaData.size() ==0) { + if (metaData == null || metaData.size() == 0) { return otherMetaData == null || otherMetaData.size() == 0; } if (otherMetaData == null) { // we already know that the current object is not null or empty return false; } - HashSet<Entry<Writable, Writable>> set1 = - new 
HashSet<Entry<Writable,Writable>>(metaData.entrySet()); - HashSet<Entry<Writable, Writable>> set2 = - new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet()); + HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable, Writable>>( + metaData.entrySet()); + HashSet<Entry<Writable, Writable>> set2 = new HashSet<Entry<Writable, Writable>>( + otherMetaData.entrySet()); return set1.equals(set2); } public boolean equals(Object o) { if (!(o instanceof CrawlDatum)) return false; - CrawlDatum other = (CrawlDatum)o; - boolean res = - (this.status == other.status) && - (this.fetchTime == other.fetchTime) && - (this.modifiedTime == other.modifiedTime) && - (this.retries == other.retries) && - (this.fetchInterval == other.fetchInterval) && - (SignatureComparator._compare(this.signature, other.signature) == 0) && - (this.score == other.score); - if (!res) return res; + CrawlDatum other = (CrawlDatum) o; + boolean res = (this.status == other.status) + && (this.fetchTime == other.fetchTime) + && (this.modifiedTime == other.modifiedTime) + && (this.retries == other.retries) + && (this.fetchInterval == other.fetchInterval) + && (SignatureComparator._compare(this.signature, other.signature) == 0) + && (this.score == other.score); + if (!res) + return res; return metadataEquals(other.metaData); } @@ -460,20 +498,14 @@ public class CrawlDatum implements Writa int res = 0; if (signature != null) { for (int i = 0; i < signature.length / 4; i += 4) { - res ^= (signature[i] << 24 + signature[i+1] << 16 + - signature[i+2] << 8 + signature[i+3]); + res ^= (signature[i] << 24 + signature[i + 1] << 16 + signature[i + 2] << 8 + signature[i + 3]); } } if (metaData != null) { res ^= metaData.entrySet().hashCode(); } - return - res ^ status ^ - ((int)fetchTime) ^ - ((int)modifiedTime) ^ - retries ^ - fetchInterval ^ - Float.floatToIntBits(score); + return res ^ status ^ ((int) fetchTime) ^ ((int) modifiedTime) ^ retries + ^ fetchInterval ^ Float.floatToIntBits(score); } public 
Object clone() { Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Jan 29 05:38:59 2015 @@ -38,8 +38,8 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.TimingUtil; /** - * This class takes the output of the fetcher and updates the - * crawldb accordingly. + * This class takes the output of the fetcher and updates the crawldb + * accordingly. */ public class CrawlDb extends Configured implements Tool { public static final Logger LOG = LoggerFactory.getLogger(CrawlDb.class); @@ -49,21 +49,26 @@ public class CrawlDb extends Configured public static final String CRAWLDB_PURGE_404 = "db.update.purge.404"; public static final String CURRENT_NAME = "current"; - + public static final String LOCK_NAME = ".locked"; - - public CrawlDb() {} - + + public CrawlDb() { + } + public CrawlDb(Configuration conf) { setConf(conf); } - public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException { - boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); + public void update(Path crawlDb, Path[] segments, boolean normalize, + boolean filter) throws IOException { + boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, + true); update(crawlDb, segments, normalize, filter, additionsAllowed, false); } - - public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException { + + public void update(Path crawlDb, Path[] segments, boolean normalize, + boolean filter, boolean additionsAllowed, boolean force) + throws IOException { FileSystem fs = 
FileSystem.get(getConf()); Path lock = new Path(crawlDb, LOCK_NAME); LockUtil.createLockFile(fs, lock, force); @@ -106,22 +111,24 @@ public class CrawlDb extends Configured } catch (IOException e) { LockUtil.removeLockFile(fs, lock); Path outPath = FileOutputFormat.getOutputPath(job); - if (fs.exists(outPath) ) fs.delete(outPath, true); + if (fs.exists(outPath)) + fs.delete(outPath, true); throw e; } CrawlDb.install(job, crawlDb); long end = System.currentTimeMillis(); - LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } -/* - * Configure a new CrawlDb in a temp folder at crawlDb/<rand> - */ + + /* + * Configure a new CrawlDb in a temp folder at crawlDb/<rand> + */ public static JobConf createJob(Configuration config, Path crawlDb) - throws IOException { - Path newCrawlDb = - new Path(crawlDb, - Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + throws IOException { + Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random() + .nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(config); job.setJobName("crawldb " + crawlDb); @@ -154,12 +161,14 @@ public class CrawlDb extends Configured Path old = new Path(crawlDb, "old"); Path current = new Path(crawlDb, CURRENT_NAME); if (fs.exists(current)) { - if (fs.exists(old)) fs.delete(old, true); + if (fs.exists(old)) + fs.delete(old, true); fs.rename(current, old); } fs.mkdirs(crawlDb); fs.rename(newCrawlDb, current); - if (!preserveBackup && fs.exists(old)) fs.delete(old, true); + if (!preserveBackup && fs.exists(old)) + fs.delete(old, true); Path lock = new Path(crawlDb, LOCK_NAME); LockUtil.removeLockFile(fs, lock); } @@ -171,20 +180,29 @@ public class CrawlDb extends Configured public int run(String[] args) throws Exception { if (args.length < 1) { - System.err.println("Usage: CrawlDb <crawldb> (-dir <segments> | 
<seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]"); + System.err + .println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]"); System.err.println("\tcrawldb\tCrawlDb to update"); - System.err.println("\t-dir segments\tparent directory containing all segments to update from"); - System.err.println("\tseg1 seg2 ...\tlist of segment names to update from"); - System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)"); - System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)"); - System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment"); - System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs"); + System.err + .println("\t-dir segments\tparent directory containing all segments to update from"); + System.err + .println("\tseg1 seg2 ...\tlist of segment names to update from"); + System.err + .println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)"); + System.err + .println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)"); + System.err + .println("\t-filter\tuse URLFilters on urls in CrawlDb and segment"); + System.err + .println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs"); return -1; } - boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false); + boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, + false); boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false); - boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); + boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, + true); boolean force = false; final FileSystem fs = FileSystem.get(getConf()); HashSet<Path> dirs = new HashSet<Path>(); @@ -198,14 +216,16 
@@ public class CrawlDb extends Configured } else if (args[i].equals("-noAdditions")) { additionsAllowed = false; } else if (args[i].equals("-dir")) { - FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); + FileStatus[] paths = fs.listStatus(new Path(args[++i]), + HadoopFSUtil.getPassDirectoriesFilter(fs)); dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths))); } else { dirs.add(new Path(args[i])); } } try { - update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force); + update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, + filter, additionsAllowed, force); return 0; } catch (Exception e) { LOG.error("CrawlDb update: " + StringUtils.stringifyException(e)); Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Thu Jan 29 05:38:59 2015 @@ -30,12 +30,13 @@ import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; /** - * This class provides a way to separate the URL normalization - * and filtering steps from the rest of CrawlDb manipulation code. + * This class provides a way to separate the URL normalization and filtering + * steps from the rest of CrawlDb manipulation code. 
* * @author Andrzej Bialecki */ -public class CrawlDbFilter implements Mapper<Text, CrawlDatum, Text, CrawlDatum> { +public class CrawlDbFilter implements + Mapper<Text, CrawlDatum, Text, CrawlDatum> { public static final String URL_FILTERING = "crawldb.url.filters"; public static final String URL_NORMALIZING = "crawldb.url.normalizers"; @@ -51,7 +52,7 @@ public class CrawlDbFilter implements Ma private URLFilters filters; private URLNormalizers normalizers; - + private String scope; public static final Logger LOG = LoggerFactory.getLogger(CrawlDbFilter.class); @@ -70,17 +71,19 @@ public class CrawlDbFilter implements Ma } } - public void close() {} - + public void close() { + } + private Text newKey = new Text(); public void map(Text key, CrawlDatum value, - OutputCollector<Text, CrawlDatum> output, - Reporter reporter) throws IOException { + OutputCollector<Text, CrawlDatum> output, Reporter reporter) + throws IOException { String url = key.toString(); - // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, cheaper than normalizing or filtering + // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, + // cheaper than normalizing or filtering if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) { url = null; } Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Thu Jan 29 05:38:59 2015 @@ -39,36 +39,42 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.TimingUtil; /** - * This tool merges several CrawlDb-s into one, optionally filtering - * URLs through the current URLFilters, to skip prohibited - * pages. 
+ * This tool merges several CrawlDb-s into one, optionally filtering URLs + * through the current URLFilters, to skip prohibited pages. * - * <p>It's possible to use this tool just for filtering - in that case - * only one CrawlDb should be specified in arguments.</p> - * <p>If more than one CrawlDb contains information about the same URL, - * only the most recent version is retained, as determined by the - * value of {@link org.apache.nutch.crawl.CrawlDatum#getFetchTime()}. - * However, all metadata information from all versions is accumulated, - * with newer values taking precedence over older values. + * <p> + * It's possible to use this tool just for filtering - in that case only one + * CrawlDb should be specified in arguments. + * </p> + * <p> + * If more than one CrawlDb contains information about the same URL, only the + * most recent version is retained, as determined by the value of + * {@link org.apache.nutch.crawl.CrawlDatum#getFetchTime()}. However, all + * metadata information from all versions is accumulated, with newer values + * taking precedence over older values. 
* * @author Andrzej Bialecki */ public class CrawlDbMerger extends Configured implements Tool { - private static final Logger LOG = LoggerFactory.getLogger(CrawlDbMerger.class); + private static final Logger LOG = LoggerFactory + .getLogger(CrawlDbMerger.class); - public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> { + public static class Merger extends MapReduceBase implements + Reducer<Text, CrawlDatum, Text, CrawlDatum> { private org.apache.hadoop.io.MapWritable meta; private CrawlDatum res = new CrawlDatum(); private FetchSchedule schedule; - public void close() throws IOException {} + public void close() throws IOException { + } public void configure(JobConf conf) { schedule = FetchScheduleFactory.getFetchSchedule(conf); } - public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) - throws IOException { + public void reduce(Text key, Iterator<CrawlDatum> values, + OutputCollector<Text, CrawlDatum> output, Reporter reporter) + throws IOException { long resTime = 0L; boolean resSet = false; meta = new org.apache.hadoop.io.MapWritable(); @@ -91,7 +97,7 @@ public class CrawlDbMerger extends Confi meta.put(e.getKey(), e.getValue()); } res.set(val); - resTime = valTime ; + resTime = valTime; } else { // insert older metadata before newer for (Entry<Writable, Writable> e : meta.entrySet()) { @@ -104,37 +110,44 @@ public class CrawlDbMerger extends Confi output.collect(key, res); } } - + public CrawlDbMerger() { - + } - + public CrawlDbMerger(Configuration conf) { setConf(conf); } - public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { + public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) + throws Exception { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("CrawlDb merge: starting at " + sdf.format(start)); JobConf job 
= createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { - if (LOG.isInfoEnabled()) { LOG.info("Adding " + dbs[i]); } + if (LOG.isInfoEnabled()) { + LOG.info("Adding " + dbs[i]); + } FileInputFormat.addInputPath(job, new Path(dbs[i], CrawlDb.CURRENT_NAME)); } JobClient.runJob(job); FileSystem fs = FileSystem.get(getConf()); - if(fs.exists(output)) - fs.delete(output,true); + if (fs.exists(output)) + fs.delete(output, true); fs.mkdirs(output); - fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, CrawlDb.CURRENT_NAME)); + fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, + CrawlDb.CURRENT_NAME)); long end = System.currentTimeMillis(); - LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) { - Path newCrawlDb = new Path("crawldb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + public static JobConf createMergeJob(Configuration conf, Path output, + boolean normalize, boolean filter) { + Path newCrawlDb = new Path("crawldb-merge-" + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(conf); job.setJobName("crawldb merge " + output); @@ -158,16 +171,20 @@ public class CrawlDbMerger extends Confi * @param args */ public static void main(String[] args) throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(), args); + int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(), + args); System.exit(res); } - + public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: CrawlDbMerger <output_crawldb> <crawldb1> [<crawldb2> <crawldb3> ...] 
[-normalize] [-filter]"); + System.err + .println("Usage: CrawlDbMerger <output_crawldb> <crawldb1> [<crawldb2> <crawldb3> ...] [-normalize] [-filter]"); System.err.println("\toutput_crawldb\toutput CrawlDb"); - System.err.println("\tcrawldb1 ...\tinput CrawlDb-s (single input CrawlDb is ok)"); - System.err.println("\t-normalize\tuse URLNormalizer on urls in the crawldb(s) (usually not needed)"); + System.err + .println("\tcrawldb1 ...\tinput CrawlDb-s (single input CrawlDb is ok)"); + System.err + .println("\t-normalize\tuse URLNormalizer on urls in the crawldb(s) (usually not needed)"); System.err.println("\t-filter\tuse URLFilters on urls in the crawldb(s)"); return -1; } @@ -185,8 +202,8 @@ public class CrawlDbMerger extends Confi continue; } final Path dbPath = new Path(args[i]); - if(fs.exists(dbPath)) - dbs.add(dbPath); + if (fs.exists(dbPath)) + dbs.add(dbPath); } try { merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);