Remove Nutch references from gora-mongodb.
Project: http://git-wip-us.apache.org/repos/asf/gora/repo Commit: http://git-wip-us.apache.org/repos/asf/gora/commit/77ccfec6 Tree: http://git-wip-us.apache.org/repos/asf/gora/tree/77ccfec6 Diff: http://git-wip-us.apache.org/repos/asf/gora/diff/77ccfec6 Branch: refs/heads/master Commit: 77ccfec66f5cc9249898fe4f2d9cf0e37f398098 Parents: a7c5f77 Author: Damien Raude-Morvan <dam...@dictanova.com> Authored: Sun May 18 00:02:46 2014 +0200 Committer: Damien Raude-Morvan <dam...@dictanova.com> Committed: Sun May 18 00:02:46 2014 +0200 ---------------------------------------------------------------------- .../conf/nutch/gora-mongodb-mapping.xml | 51 ----- .../src/examples/conf/nutch/gora.properties | 90 --------- .../src/examples/conf/nutch/log4j.properties | 96 ---------- .../src/examples/conf/nutch/nutch-site.xml | 189 ------------------- .../gora/mongodb/store/TestMongoStoreNutch.java | 77 -------- 5 files changed, 503 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml ---------------------------------------------------------------------- diff --git a/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml b/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml deleted file mode 100644 index 48d496c..0000000 --- a/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml +++ /dev/null @@ -1,51 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<gora-orm> - - <class document="frontier" keyClass="java.lang.String" name="org.apache.nutch.storage.WebPage"> - <field name="baseUrl" docfield="baseUrl" type="string"/> - <field name="status" docfield="status" type="int32"/> - <field name="fetchTime" docfield="fetchTime" type="int64"/> - <field name="prevFetchTime" docfield="prevFetchTime" type="int64"/> - <field name="fetchInterval" docfield="fetchInterval" type="int32"/> - <field name="retriesSinceFetch" docfield="retriesSinceFetch" type="int32"/> - <field name="modifiedTime" docfield="modifiedTime" type="int64"/> - <field name="protocolStatus" docfield="protocolStatus" type="document"/> - <field name="content" docfield="content" type="binary"/> - <field name="contentType" docfield="contentType" type="string"/> - <field name="signature" docfield="signature" type="binary"/> - <field name="prevSignature" docfield="prevSignature" type="binary"/> - <field name="title" docfield="title" type="string"/> - <field name="text" docfield="text" type="string"/> - <field name="parseStatus" docfield="parseStatus" type="document"/> - <field name="score" docfield="score" type="double"/> - <field name="reprUrl" docfield="reprUrl" type="string"/> - <field name="headers" docfield="headers" type="document"/> - <field name="outlinks" docfield="outlinks" type="document"/> - <field name="inlinks" docfield="inlinks" type="document"/> - <field name="markers" docfield="markers" type="document"/> - <field name="metadata" docfield="metadata" type="document"/> - </class> - - <class document="hosts" keyClass="java.lang.String" 
name="org.apache.nutch.storage.Host"> - <field name="metadata" docfield="metadata" type="document"/> - <field name="outlinks" docfield="links.out" type="document"/> - <field name="inlinks" docfield="links.in" type="document"/> - </class> - -</gora-orm> http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/gora.properties ---------------------------------------------------------------------- diff --git a/gora-mongodb/src/examples/conf/nutch/gora.properties b/gora-mongodb/src/examples/conf/nutch/gora.properties deleted file mode 100644 index 1d15229..0000000 --- a/gora-mongodb/src/examples/conf/nutch/gora.properties +++ /dev/null @@ -1,90 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#gora.datastore.default=org.apache.gora.mock.store.MockDataStore -gora.datastore.autocreateschema=true - -############################### -# Default SqlStore properties # -############################### - -gora.sqlstore.jdbc.driver=org.hsqldb.jdbc.JDBCDriver -gora.sqlstore.jdbc.url=jdbc:hsqldb:hsql://localhost/nutchtest -gora.sqlstore.jdbc.user=sa -gora.sqlstore.jdbc.password= - -################################ -# Default AvroStore properties # -################################ - -# gora.avrostore.codec.type=BINARY||JSON -# gora.avrostore.output.path=file:///tmp/gora.avrostore.test.output - -################################ -# DatafileAvroStore properties # -################################ -# DataFileAvroStore is file based store which uses Avro's -# DataFile{Writer,Reader}'s as a backend. This datastore supports -# mapreduce. - -# gora.datafileavrostore.###= - -######################### -# HBaseStore properties # -######################### -# HBase requires that the Configuration has a valid "hbase.zookeeper.quorum" -# property. It should be included within hbase-site.xml on the classpath. When -# this property is omitted, it expects Zookeeper to run on localhost:2181. - -# To greatly improve scan performance, increase the hbase-site Configuration -# property "hbase.client.scanner.caching". This sets the number of rows to grab -# per request. - -# HBase autoflushing. Enabling autoflush decreases write performance. -# Available since Gora 0.2. Defaults to disabled. -# hbase.client.autoflush.default=false - -############################# -# CassandraStore properties # -############################# - -# gora.cassandrastore.servers=localhost:9160 - -####################### -# MemStore properties # -####################### -# This is a memory based {@link DataStore} implementation for tests. 
- -# gora.memstore.###= - -############################ -# AccumuloStore properties # -############################ -#gora.datastore.default=org.apache.gora.accumulo.store.AccumuloStore -#gora.datastore.accumulo.mock=true -#gora.datastore.accumulo.instance=a14 -#gora.datastore.accumulo.zookeepers=localhost -#gora.datastore.accumulo.user=root -#gora.datastore.accumulo.password=secret - -############################ -# MongoDBStore properties # -############################ -gora.datastore.default=org.apache.gora.mongodb.store.MongoStore -gora.mongodb.override_hadoop_configuration=true -gora.mongodb.mapping.file=/gora-mongodb-mapping.xml -gora.mongodb.servers=localhost -gora.mongodb.db=nutchtest - http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/log4j.properties ---------------------------------------------------------------------- diff --git a/gora-mongodb/src/examples/conf/nutch/log4j.properties b/gora-mongodb/src/examples/conf/nutch/log4j.properties deleted file mode 100644 index e1e839e..0000000 --- a/gora-mongodb/src/examples/conf/nutch/log4j.properties +++ /dev/null @@ -1,96 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Define some default values that can be overridden by system properties -hadoop.log.dir=. -hadoop.log.file=hadoop.log - -# RootLogger - DailyRollingFileAppender -log4j.rootLogger=INFO,DRFA - -# Logging Threshold -log4j.threshhold=ALL - -#special logging requirements for some commandline tools -log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.InjectorJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.host.HostInjectorJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.GeneratorJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.DbUpdaterJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.host.HostDbUpdateJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.fetcher.FetcherJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout -log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout - -log4j.logger.org.apache.nutch=WARN -log4j.logger.org.apache.hadoop=WARN -log4j.logger.org.apache.zookeeper=WARN - -# -# Daily Rolling File Appender -# - -log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender -log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} - -# Rollver at midnight -log4j.appender.DRFA.DatePattern=.yyyy-MM-dd - -# 30-day backup -#log4j.appender.DRFA.MaxBackupIndex=30 -log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout - -# Pattern format: Date LogLevel LoggerName LogMessage -log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n -# Debugging Pattern format: Date LogLevel LoggerName (FileName:MethodName:LineNo) LogMessage -#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} 
(%F:%M(%L)) - %m%n - - -# -# stdout -# Add *stdout* to rootlogger above if you want to use this -# - -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n - -# -# plain layout used for commandline tools to output to console -# -log4j.appender.cmdstdout=org.apache.log4j.ConsoleAppender -log4j.appender.cmdstdout.layout=org.apache.log4j.PatternLayout -log4j.appender.cmdstdout.layout.ConversionPattern=%m%n - -# -# Rolling File Appender -# - -#log4j.appender.RFA=org.apache.log4j.RollingFileAppender -#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file} - -# Logfile size and and 30-day backups -#log4j.appender.RFA.MaxFileSize=1MB -#log4j.appender.RFA.MaxBackupIndex=30 - -#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout -#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n -#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n - http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/nutch-site.xml ---------------------------------------------------------------------- diff --git a/gora-mongodb/src/examples/conf/nutch/nutch-site.xml b/gora-mongodb/src/examples/conf/nutch/nutch-site.xml deleted file mode 100644 index 49083d9..0000000 --- a/gora-mongodb/src/examples/conf/nutch/nutch-site.xml +++ /dev/null @@ -1,189 +0,0 @@ -<?xml version="1.0"?> -<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> - -<!-- Put site-specific property overrides in this file. 
--> - -<configuration> - -<property> - <name>http.agent.name</name> - <value>nutch-crawler</value> -</property> - -<property> - <name>http.robots.agents</name> - <value>nutch-crawler,*</value> -</property> - -<property> - <name>http.accept.language</name> - <value>fr-fr,fr,en;q=0.7,*;q=0.3</value> -</property> - - -<property> - <name>db.fetch.schedule.class</name> - <value>org.apache.nutch.crawl.AdaptiveFetchSchedule</value> -</property> - -<property> - <name>fetcher.throughput.threshold.pages</name> - <value>0.8</value> - <description>The threshold of minimum pages per second. If the fetcher downloads less - pages per second than the configured threshold, the fetcher stops, preventing slow queue's - from stalling the throughput. This threshold must be an integer. This can be useful when - fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check. - </description> -</property> - -<property> - <name>generate.update.crawldb</name> - <value>true</value> - <description>For highly-concurrent environments, where several - generate/fetch/update cycles may overlap, setting this to true ensures - that generate will create different fetchlists even without intervening - updatedb-s, at the cost of running an additional job to update CrawlDB. - If false, running generate twice without intervening - updatedb will generate identical fetchlists.</description> -</property> - -<property> - <name>crawl.gen.delay</name> - <value>86400000</value> <!-- Make it one day --> - <description> - This value, expressed in days, defines how long we should keep the lock on records - in CrawlDb that were just selected for fetching. If these records are not updated - in the meantime, the lock is canceled, i.e. the become eligible for selecting. - Default value of this is 7 days. - </description> -</property> - -<property> - <name>fetcher.parse</name> - <value>true</value> - <description>If true, fetcher will parse content. 
NOTE: previous releases would - default to true. Since 2.0 this is set to false as a safer default.</description> -</property> - -<property> - <name>parser.html.outlinks.ignore_tags</name> - <value>img,script,link</value> - <description>Comma separated list of HTML tags, from which outlinks - shouldn't be extracted. Nutch takes links from: a, area, form, frame, - iframe, script, link, img. If you add any of those tags here, it - won't be taken. Default is empty list. Probably reasonable value - for most people would be "img,script,link".</description> -</property> - -<property> - <name>plugin.includes</name> - <value>protocol-http|urlfilter-regex|parse-(html|tika)|urlnormalizer-(pass|regex|basic)|scoring-opic</value> -</property> - -<property> - <name>mapred.task.timeout</name> - <value>600000</value> - <!-- Max 10 minutes idle --> -</property> - -<property> - <name>parser.html.impl</name> - <value>tagsoup</value> - <description>HTML Parser implementation. Currently the following keywords - are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup. - </description> -</property> - -<property> - <name>db.update.additions.allowed</name> - <value>true</value> - <description>If true, updatedb will add newly discovered URLs, if false - only already existing URLs in the CrawlDb will be updated and no new - URLs will be added. - </description> -</property> - -<property> - <name>db.ignore.internal.links</name> - <value>false</value> - <description>If true, when adding new links to a page, links from - the same host are ignored. This is an effective way to limit the - size of the link database, keeping only the highest quality - links. - </description> -</property> - -<property> - <name>db.score.link.external</name> - <value>2.0</value> - <description>The score factor for new pages added due to a link from - another host relative to the referencing page's score. Scoring plugins - may use this value to affect initial scores of external links. 
- </description> -</property> - -<property> - <name>db.score.link.internal</name> - <value>0.5</value> - <description>The score factor for pages added due to a link from the - same host, relative to the referencing page's score. Scoring plugins - may use this value to affect initial scores of internal links. - </description> -</property> - -<property> - <name>db.parsemeta.to.crawldb</name> - <value>lang</value> - <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). - Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' - will copy both the key 'lang' and its value to the corresponding entry in the crawldb. - </description> -</property> - -<property> - <name>generate.max.count</name> - <value>10000</value> - <description>The maximum number of urls in a single - fetchlist. -1 if unlimited. The urls are counted according - to the value of the parameter generator.count.mode. - </description> -</property> - -<!-- storage properties --> - -<property> - <name>storage.data.store.class</name> - <value>org.apache.gora.mongodb.store.MongoStore</value> -</property> - -<property> - <name>storage.schema.webpage</name> - <value>frontier</value> - <description>This value holds the schema name used for Nutch web db. - Note that Nutch ignores the value in the gora mapping files, and uses - this as the webpage schema name. - </description> -</property> - -<property> - <name>storage.schema.host</name> - <value>host</value> - <description>This value holds the schema name used for Nutch host db. - Note that Nutch ignores the value in the gora mapping files, and uses - this as the host schema name. - </description> -</property> - -<property> - <name>storage.crawl.id</name> - <value></value> - <description>This value helps differentiate between the datasets that - the jobs in the crawl cycle generate and operate on. 
The value will - be input to all the jobs which then will use it as a prefix when - accessing to the schemas. The default configuration uses no id to prefix - the schemas. The value could also be given as a command line argument - to each job. - </description> -</property> - -</configuration> http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java ---------------------------------------------------------------------- diff --git a/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java b/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java deleted file mode 100644 index 8487b45..0000000 --- a/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.gora.mongodb.store; - -import org.apache.gora.mapreduce.GoraOutputFormat; -//import org.apache.gora.mongodb.beans.tests.WebPage; -import org.apache.gora.store.DataStore; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -//import org.apache.nutch.crawl.InjectorJob.UrlMapper; -//import org.apache.nutch.storage.StorageUtils; -//import org.apache.nutch.util.NutchJob; -//import org.apache.nutch.util.ToolUtil; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.IOException; -import java.util.HashMap; - -@Ignore("Needs Nutch configuration") -public class TestMongoStoreNutch { - - /** - * For this test to work, it is necessary to provide: - * - a plugin directory with at least one urlnormalizer - * - include the jar of the used plugins in the classpath - * - * @throws IOException - * @throws ClassNotFoundException - * @throws InterruptedException - */ - /*@Test - public void testNutchInjection() throws IOException, ClassNotFoundException, InterruptedException { - Path input = new Path("src/it/resources/test-nutch-inject.csv"); - Configuration conf = new Configuration(); - conf.set("storage.data.store.class", "org.apache.gora.mongodb.store.MongoStore"); - conf.set("plugin.folders", "/home/grdscarabe/PROJECTS/DictaLab/workspace-nutch/nutch-2.1/runtime/local/plugins"); // FIXME - conf.set("plugin.auto-activation", "true"); - conf.set("plugin.includes", "urlnormalizer-basic"); - conf.set("plugin.excludes", ""); - - HashMap<String, Object> results = new HashMap<String, Object>(); - Job currentJob = new NutchJob(conf, "inject " + input); - FileInputFormat.addInputPath(currentJob, input); - currentJob.setMapperClass(UrlMapper.class); - currentJob.setMapOutputKeyClass(String.class); - currentJob.setMapOutputValueClass(WebPage.class); - 
currentJob.setOutputFormatClass(GoraOutputFormat.class); - DataStore<String, org.apache.nutch.storage.WebPage> store = - StorageUtils.createWebStore(currentJob.getConfiguration(), - String.class, org.apache.nutch.storage.WebPage.class); - GoraOutputFormat.setOutput(currentJob, store, true); - currentJob.setReducerClass(Reducer.class); - currentJob.setNumReduceTasks(0); - currentJob.waitForCompletion(true); - ToolUtil.recordJobStatus(null, currentJob, results); - } - */ -}