Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java (original) +++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java Fri Jan 9 06:34:33 2015 @@ -40,191 +40,192 @@ import static org.junit.Assert.assertEqu */ public class TestProtocolHttpClient { - private Server server; - private Configuration conf; - private static final String RES_DIR = System.getProperty("test.data", "."); - private int port; - private Http http = new Http(); + private Server server; + private Configuration conf; + private static final String RES_DIR = System.getProperty("test.data", "."); + private int port; + private Http http = new Http(); @Before - public void setUp() throws Exception { + public void setUp() throws Exception { - server = new Server(); - -// Context scontext = new Context(); -// scontext.setContextPath("/"); -// scontext.setResourceBase(RES_DIR); -// // servlet handler? -// scontext.addServlet("JSP", "*.jsp", -// "org.apache.jasper.servlet.JspServlet"); -// scontext.addHandler(new ResourceHandler()); - - Context root = new Context(server,"/",Context.SESSIONS); - root.setContextPath("/"); - root.setResourceBase(RES_DIR); - ServletHolder sh = new ServletHolder(org.apache.jasper.servlet.JspServlet.class); - root.addServlet(sh, "*.jsp"); - - conf = new Configuration(); - conf.addResource("nutch-default.xml"); - conf.addResource("nutch-site-test.xml"); - - http = new Http(); - http.setConf(conf); - } + server = new Server(); + + // Context scontext = new Context(); + // scontext.setContextPath("/"); + // scontext.setResourceBase(RES_DIR); + // // servlet handler? + // scontext.addServlet("JSP", "*.jsp", + // "org.apache.jasper.servlet.JspServlet"); + // scontext.addHandler(new ResourceHandler()); + + Context root = new Context(server, "/", Context.SESSIONS); + root.setContextPath("/"); + root.setResourceBase(RES_DIR); + ServletHolder sh = new ServletHolder( + org.apache.jasper.servlet.JspServlet.class); + root.addServlet(sh, "*.jsp"); + + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + conf.addResource("nutch-site-test.xml"); + + http = new Http(); + http.setConf(conf); + } @After - public void tearDown() throws Exception { - server.stop(); - } - - /** - * Tests whether the client can remember cookies. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testCookies() throws Exception { - startServer(47500); - fetchPage("/cookies.jsp", 200); - fetchPage("/cookies.jsp?cookie=yes", 200); - tearDown(); - } - - /** - * Tests that no pre-emptive authorization headers are sent by the client. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testNoPreemptiveAuth() throws Exception { - startServer(47500); - fetchPage("/noauth.jsp", 200); - tearDown(); - } - - /** - * Tests default credentials. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testDefaultCredentials() throws Exception { - startServer(47502); - fetchPage("/basic.jsp", 200); - tearDown(); - } - - /** - * Tests basic authentication scheme for various realms. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testBasicAuth() throws Exception { - startServer(47500); - fetchPage("/basic.jsp", 200); - fetchPage("/basic.jsp?case=1", 200); - fetchPage("/basic.jsp?case=2", 200); - tearDown(); - } - - /** - * Tests that authentication happens for a defined realm and not for other - * realms for a host:port when an extra <code>authscope</code> tag is not - * defined to match all other realms. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testOtherRealmsNoAuth() throws Exception { - startServer(47501); - fetchPage("/basic.jsp", 200); - fetchPage("/basic.jsp?case=1", 401); - fetchPage("/basic.jsp?case=2", 401); - tearDown(); - } - - /** - * Tests Digest authentication scheme. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testDigestAuth() throws Exception { - startServer(47500); - fetchPage("/digest.jsp", 200); - tearDown(); - } - - /** - * Tests NTLM authentication scheme. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testNtlmAuth() throws Exception { - startServer(47501); - fetchPage("/ntlm.jsp", 200); - tearDown(); - } - - /** - * Starts the Jetty server at a specified port. - * - * @param portno - * Port number. - * @throws Exception - * When an error occurs. - */ - private void startServer(int portno) throws Exception { - port = portno; - - SelectChannelConnector connector1 = new SelectChannelConnector(); - connector1.setHost("127.0.0.1"); - connector1.setPort(port); - - server.addConnector(connector1); - server.start(); - } - - /** - * Fetches the specified <code>page</code> from the local Jetty server and - * checks whether the HTTP response status code matches with the expected - * code. - * - * @param page - * Page to be fetched. - * @param expectedCode - * HTTP response status code expected while fetching the page. - * @throws Exception - * When an error occurs or test case fails. - */ - private void fetchPage(String page, int expectedCode) throws Exception { - URL url = new URL("http", "127.0.0.1", port, page); - Response response = null; - response = http.getResponse(url, WebPage.newBuilder().build(), true); - - int code = response.getCode(); - assertEquals("HTTP Status Code for " + url, expectedCode, code); - } - - /** - * Returns an URL to the specified page. - * - * @param page - * Page available in the local Jetty server. - * @throws MalformedURLException - * If an URL can not be formed. - */ - private URL getURL(String page) throws MalformedURLException { - return new URL("http", "127.0.0.1", port, page); - } + public void tearDown() throws Exception { + server.stop(); + } + + /** + * Tests whether the client can remember cookies. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testCookies() throws Exception { + startServer(47500); + fetchPage("/cookies.jsp", 200); + fetchPage("/cookies.jsp?cookie=yes", 200); + tearDown(); + } + + /** + * Tests that no pre-emptive authorization headers are sent by the client. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testNoPreemptiveAuth() throws Exception { + startServer(47500); + fetchPage("/noauth.jsp", 200); + tearDown(); + } + + /** + * Tests default credentials. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testDefaultCredentials() throws Exception { + startServer(47502); + fetchPage("/basic.jsp", 200); + tearDown(); + } + + /** + * Tests basic authentication scheme for various realms. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testBasicAuth() throws Exception { + startServer(47500); + fetchPage("/basic.jsp", 200); + fetchPage("/basic.jsp?case=1", 200); + fetchPage("/basic.jsp?case=2", 200); + tearDown(); + } + + /** + * Tests that authentication happens for a defined realm and not for other + * realms for a host:port when an extra <code>authscope</code> tag is not + * defined to match all other realms. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testOtherRealmsNoAuth() throws Exception { + startServer(47501); + fetchPage("/basic.jsp", 200); + fetchPage("/basic.jsp?case=1", 401); + fetchPage("/basic.jsp?case=2", 401); + tearDown(); + } + + /** + * Tests Digest authentication scheme. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testDigestAuth() throws Exception { + startServer(47500); + fetchPage("/digest.jsp", 200); + tearDown(); + } + + /** + * Tests NTLM authentication scheme. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testNtlmAuth() throws Exception { + startServer(47501); + fetchPage("/ntlm.jsp", 200); + tearDown(); + } + + /** + * Starts the Jetty server at a specified port. + * + * @param portno + * Port number. + * @throws Exception + * When an error occurs. + */ + private void startServer(int portno) throws Exception { + port = portno; + + SelectChannelConnector connector1 = new SelectChannelConnector(); + connector1.setHost("127.0.0.1"); + connector1.setPort(port); + + server.addConnector(connector1); + server.start(); + } + + /** + * Fetches the specified <code>page</code> from the local Jetty server and + * checks whether the HTTP response status code matches with the expected + * code. + * + * @param page + * Page to be fetched. + * @param expectedCode + * HTTP response status code expected while fetching the page. + * @throws Exception + * When an error occurs or test case fails. + */ + private void fetchPage(String page, int expectedCode) throws Exception { + URL url = new URL("http", "127.0.0.1", port, page); + Response response = null; + response = http.getResponse(url, WebPage.newBuilder().build(), true); + + int code = response.getCode(); + assertEquals("HTTP Status Code for " + url, expectedCode, code); + } + + /** + * Returns an URL to the specified page. + * + * @param page + * Page available in the local Jetty server. + * @throws MalformedURLException + * If an URL can not be formed. + */ + private URL getURL(String page) throws MalformedURLException { + return new URL("http", "127.0.0.1", port, page); + } }
Modified: nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java (original) +++ nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java Fri Jan 9 06:34:33 2015 @@ -163,12 +163,14 @@ public class Sftp implements Protocol { bytes = new byte[size]; iStream.read(bytes); } catch (SftpException e) { - logger.error("SftpException in getFileProtocolOutput(), file: " - + url.getFile(), e); + logger + .error( + "SftpException in getFileProtocolOutput(), file: " + + url.getFile(), e); throw e; } catch (IOException e) { - logger.error("IOException in getFileProtocolOutput(), file: " - + url.getFile(), e); + logger.error( + "IOException in getFileProtocolOutput(), file: " + url.getFile(), e); throw e; } finally { if (iStream != null) { @@ -213,8 +215,8 @@ public class Sftp implements Protocol { Metadata metadata = new Metadata(); metadata.set(Response.CONTENT_TYPE, "text/html"); - metadata.set(Response.CONTENT_LENGTH, String.valueOf(directoryList - .length())); + metadata.set(Response.CONTENT_LENGTH, + String.valueOf(directoryList.length())); metadata.set(Response.LAST_MODIFIED, channelSftp.lstat(url.getFile()) .getMtimeString()); metadata.set(Response.LOCATION, url.toExternalForm()); @@ -250,7 +252,7 @@ public class Sftp implements Protocol { if (server == null) { return; } - + if (channelSftpByHostMap.containsKey(server)) { return; } Modified: nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java (original) +++ nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java Fri Jan 9 06:34:33 2015 @@ -30,65 +30,65 @@ import org.apache.nutch.storage.WebPage; public class LinkAnalysisScoringFilter implements ScoringFilter { - private Configuration conf; - private float normalizedScore = 1.00f; + private Configuration conf; + private float normalizedScore = 1.00f; - private final static Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); + private final static Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); - static { - FIELDS.add(WebPage.Field.METADATA); - FIELDS.add(WebPage.Field.SCORE); - } - - public LinkAnalysisScoringFilter() { - } - - public Configuration getConf() { - return conf; - } - - public void setConf(Configuration conf) { - this.conf = conf; - normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f); - } - - @Override - public Collection<WebPage.Field> getFields() { - return FIELDS; - } - - @Override - public void injectedScore(String url, WebPage page) - throws ScoringFilterException { - } - - @Override - public void initialScore(String url, WebPage page) - throws ScoringFilterException { - page.setScore(0.0f); - } - - @Override - public float generatorSortValue(String url, WebPage page, float initSort) - throws ScoringFilterException { - return page.getScore() * initSort; - } - - @Override - public void distributeScoreToOutlinks(String fromUrl, WebPage page, - Collection<ScoreDatum> scoreData, int allCount) - throws ScoringFilterException { - } - - @Override - public void updateScore(String url, WebPage page, - List<ScoreDatum> inlinkedScoreData) throws ScoringFilterException { - } - - @Override - public float indexerScore(String url, NutchDocument doc, WebPage page, - float initScore) throws ScoringFilterException { - return (normalizedScore * page.getScore()); - } + static { + FIELDS.add(WebPage.Field.METADATA); + FIELDS.add(WebPage.Field.SCORE); + } + + public LinkAnalysisScoringFilter() { + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f); + } + + @Override + public Collection<WebPage.Field> getFields() { + return FIELDS; + } + + @Override + public void injectedScore(String url, WebPage page) + throws ScoringFilterException { + } + + @Override + public void initialScore(String url, WebPage page) + throws ScoringFilterException { + page.setScore(0.0f); + } + + @Override + public float generatorSortValue(String url, WebPage page, float initSort) + throws ScoringFilterException { + return page.getScore() * initSort; + } + + @Override + public void distributeScoreToOutlinks(String fromUrl, WebPage page, + Collection<ScoreDatum> scoreData, int allCount) + throws ScoringFilterException { + } + + @Override + public void updateScore(String url, WebPage page, + List<ScoreDatum> inlinkedScoreData) throws ScoringFilterException { + } + + @Override + public float indexerScore(String url, NutchDocument doc, WebPage page, + float initScore) throws ScoringFilterException { + return (normalizedScore * page.getScore()); + } } Modified: nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java (original) +++ nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * {@link org.apache.nutch.scoring.webgraph.WebGraph}. */ package org.apache.nutch.scoring.link; + Modified: nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original) +++ nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Fri Jan 9 06:34:33 2015 @@ -38,17 +38,17 @@ import java.util.Set; /** * This plugin implements a variant of an Online Page Importance Computation - * (OPIC) score, described in this paper: - * <a href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/> - * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), - * Adaptive On-Line Page Importance Computation - * </a>. - * + * (OPIC) score, described in this paper: <a + * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/> + * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive + * On-Line Page Importance Computation </a>. + * * @author Andrzej Bialecki */ public class OPICScoringFilter implements ScoringFilter { - private final static Logger LOG = LoggerFactory.getLogger(OPICScoringFilter.class); + private final static Logger LOG = LoggerFactory + .getLogger(OPICScoringFilter.class); private final static Utf8 CASH_KEY = new Utf8("_csh_"); @@ -80,28 +80,33 @@ public class OPICScoringFilter implement @Override public void injectedScore(String url, WebPage row) - throws ScoringFilterException { + throws ScoringFilterException { float score = row.getScore(); row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(score))); } - /** Set to 0.0f (unknown value) - inlink contributions will bring it to - * a correct level. Newly discovered pages have at least one inlink. */ + /** + * Set to 0.0f (unknown value) - inlink contributions will bring it to a + * correct level. Newly discovered pages have at least one inlink. + */ @Override - public void initialScore(String url, WebPage row) throws ScoringFilterException { + public void initialScore(String url, WebPage row) + throws ScoringFilterException { row.setScore(0.0f); row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f))); } /** Use {@link WebPage#getScore()}. */ @Override - public float generatorSortValue(String url, WebPage row, float initSort) throws ScoringFilterException { + public float generatorSortValue(String url, WebPage row, float initSort) + throws ScoringFilterException { return row.getScore() * initSort; } /** Increase the score by a sum of inlinked scores. */ @Override - public void updateScore(String url, WebPage row, List<ScoreDatum> inlinkedScoreData) { + public void updateScore(String url, WebPage row, + List<ScoreDatum> inlinkedScoreData) { float adjust = 0.0f; for (ScoreDatum scoreDatum : inlinkedScoreData) { adjust += scoreDatum.getScore(); @@ -111,21 +116,23 @@ public class OPICScoringFilter implement ByteBuffer cashRaw = row.getMetadata().get(CASH_KEY); float cash = 0.0f; if (cashRaw != null) { - cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position()); + cash = Bytes.toFloat(cashRaw.array(), + cashRaw.arrayOffset() + cashRaw.position()); } - row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(cash + adjust))); + row.getMetadata().put(CASH_KEY, + ByteBuffer.wrap(Bytes.toBytes(cash + adjust))); } /** Get cash on hand, divide it by the number of outlinks and apply. */ @Override - public void distributeScoreToOutlinks(String fromUrl, - WebPage row, Collection<ScoreDatum> scoreData, - int allCount) { + public void distributeScoreToOutlinks(String fromUrl, WebPage row, + Collection<ScoreDatum> scoreData, int allCount) { ByteBuffer cashRaw = row.getMetadata().get(CASH_KEY); if (cashRaw == null) { return; } - float cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position()); + float cash = Bytes.toFloat(cashRaw.array(), + cashRaw.arrayOffset() + cashRaw.position()); if (cash == 0) { return; } @@ -138,7 +145,7 @@ public class OPICScoringFilter implement try { String toHost = new URL(scoreDatum.getUrl()).getHost(); String fromHost = new URL(fromUrl.toString()).getHost(); - if(toHost.equalsIgnoreCase(fromHost)){ + if (toHost.equalsIgnoreCase(fromHost)) { scoreDatum.setScore(internalScore); } else { scoreDatum.setScore(externalScore); @@ -152,9 +159,10 @@ public class OPICScoringFilter implement row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f))); } - /** Dampen the boost value by scorePower.*/ - public float indexerScore(String url, NutchDocument doc, WebPage row, float initScore) { - return (float)Math.pow(row.getScore(), scorePower) * initScore; + /** Dampen the boost value by scorePower. */ + public float indexerScore(String url, NutchDocument doc, WebPage row, + float initScore) { + return (float) Math.pow(row.getScore(), scorePower) * initScore; } @Override Modified: nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java (original) +++ nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * (OPIC) algorithm. */ package org.apache.nutch.scoring.opic; + Modified: nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original) +++ nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Fri Jan 9 06:34:33 2015 @@ -45,198 +45,197 @@ import org.w3c.dom.NodeList; public class CollectionManager extends Configured { - public static final String DEFAULT_FILE_NAME = "subcollections.xml"; + public static final String DEFAULT_FILE_NAME = "subcollections.xml"; - static final Logger LOG = LoggerFactory.getLogger(CollectionManager.class); + static final Logger LOG = LoggerFactory.getLogger(CollectionManager.class); - transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>(); + transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>(); - transient URL configfile; + transient URL configfile; - public CollectionManager(Configuration conf) { - super(conf); - init(); - } - - /** - * Used for testing - */ - protected CollectionManager() { - super(NutchConfiguration.create()); - } - - protected void init() { - try { - if (LOG.isInfoEnabled()) { - LOG.info("initializing CollectionManager"); - } - // initialize known subcollections - configfile = getConf().getResource( - getConf().get("subcollections.config", DEFAULT_FILE_NAME)); - - InputStream input = getConf().getConfResourceAsInputStream( - getConf().get("subcollections.config", DEFAULT_FILE_NAME)); - parse(input); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - LOG.warn("Error occured: " + e); - } - } - } - - protected void parse(InputStream input) { - Element collections = DomUtil.getDom(input); - - if (collections != null) { - NodeList nodeList = collections - .getElementsByTagName(Subcollection.TAG_COLLECTION); - - if (LOG.isInfoEnabled()) { - LOG.info("file has" + nodeList.getLength() + " elements"); - } - - for (int i = 0; i < nodeList.getLength(); i++) { - Element scElem = (Element) nodeList.item(i); - Subcollection subCol = new Subcollection(getConf()); - subCol.initialize(scElem); - collectionMap.put(subCol.name, subCol); - } - } else if (LOG.isInfoEnabled()) { - LOG.info("Cannot find collections"); - } - } - - public static CollectionManager getCollectionManager(Configuration conf) { - String key = "collectionmanager"; - ObjectCache objectCache = ObjectCache.get(conf); - CollectionManager impl = (CollectionManager) objectCache.getObject(key); - if (impl == null) { - try { - if (LOG.isInfoEnabled()) { - LOG.info("Instantiating CollectionManager"); - } - impl = new CollectionManager(conf); - objectCache.setObject(key, impl); - } catch (Exception e) { - throw new RuntimeException("Couldn't create CollectionManager", - e); - } - } - return impl; - } - - /** - * Returns named subcollection - * - * @param id - * @return Named SubCollection (or null if not existing) - */ - public Subcollection getSubColection(final String id) { - return (Subcollection) collectionMap.get(id); - } - - /** - * Delete named subcollection - * - * @param id - * Id of SubCollection to delete - */ - public void deleteSubCollection(final String id) throws IOException { - final Subcollection subCol = getSubColection(id); - if (subCol != null) { - collectionMap.remove(id); - } - } - - /** - * Create a new subcollection. - * - * @param name - * Name of SubCollection to create - * @return Created SubCollection or null if allready existed - */ - public Subcollection createSubCollection(final String id, final String name) { - Subcollection subCol = null; - - if (!collectionMap.containsKey(id)) { - subCol = new Subcollection(id, name, getConf()); - collectionMap.put(id, subCol); - } - - return subCol; - } - - /** - * Return names of collections url is part of - * - * @param url - * The url to test against Collections - * @return Space delimited string of collection names url is part of - */ - public List<String> getSubCollections(final String url) { - List<String> collections = new ArrayList<String>(); - final Iterator<Subcollection> iterator = collectionMap.values().iterator(); - - while (iterator.hasNext()) { - final Subcollection subCol = iterator.next(); - if (subCol.filter(url) != null) { - collections.add(subCol.name); - } - } - if (LOG.isTraceEnabled()) { - LOG.trace("subcollections:" - + Arrays.toString(collections.toArray())); - } - - return collections; - } - - /** - * Returns all collections - * - * @return All collections CollectionManager knows about - */ - public Collection<Subcollection> getAll() { - return collectionMap.values(); - } - - /** - * Save collections into file - * - * @throws Exception - */ - public void save() throws IOException { - try { - final FileOutputStream fos = new FileOutputStream(new File( - configfile.getFile())); - final Document doc = new DocumentImpl(); - final Element collections = doc - .createElement(Subcollection.TAG_COLLECTIONS); - final Iterator<Subcollection> iterator = collectionMap.values().iterator(); - - while (iterator.hasNext()) { - final Subcollection subCol = iterator.next(); - final Element collection = doc - .createElement(Subcollection.TAG_COLLECTION); - collections.appendChild(collection); - final Element name = doc.createElement(Subcollection.TAG_NAME); - name.setNodeValue(subCol.getName()); - collection.appendChild(name); - final Element whiteList = doc - .createElement(Subcollection.TAG_WHITELIST); - whiteList.setNodeValue(subCol.getWhiteListString()); - collection.appendChild(whiteList); - final Element blackList = doc - .createElement(Subcollection.TAG_BLACKLIST); - blackList.setNodeValue(subCol.getBlackListString()); - collection.appendChild(blackList); - } - - DomUtil.saveDom(fos, collections); - fos.flush(); - fos.close(); - } catch (FileNotFoundException e) { - throw new IOException(e.toString()); - } - } + public CollectionManager(Configuration conf) { + super(conf); + init(); + } + + /** + * Used for testing + */ + protected CollectionManager() { + super(NutchConfiguration.create()); + } + + protected void init() { + try { + if (LOG.isInfoEnabled()) { + LOG.info("initializing CollectionManager"); + } + // initialize known subcollections + configfile = getConf().getResource( + getConf().get("subcollections.config", DEFAULT_FILE_NAME)); + + InputStream input = getConf().getConfResourceAsInputStream( + getConf().get("subcollections.config", DEFAULT_FILE_NAME)); + parse(input); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error occured: " + e); + } + } + } + + protected void parse(InputStream input) { + Element collections = DomUtil.getDom(input); + + if (collections != null) { + NodeList nodeList = collections + .getElementsByTagName(Subcollection.TAG_COLLECTION); + + if (LOG.isInfoEnabled()) { + LOG.info("file has" + nodeList.getLength() + " elements"); + } + + for (int i = 0; i < nodeList.getLength(); i++) { + Element scElem = (Element) nodeList.item(i); + Subcollection subCol = new Subcollection(getConf()); + subCol.initialize(scElem); + collectionMap.put(subCol.name, subCol); + } + } else if (LOG.isInfoEnabled()) { + LOG.info("Cannot find collections"); + } + } + + public static CollectionManager getCollectionManager(Configuration conf) { + String key = "collectionmanager"; + ObjectCache objectCache = ObjectCache.get(conf); + CollectionManager impl = (CollectionManager) objectCache.getObject(key); + if (impl == null) { + try { + if (LOG.isInfoEnabled()) { + LOG.info("Instantiating CollectionManager"); + } + impl = new CollectionManager(conf); + objectCache.setObject(key, impl); + } catch (Exception e) { + throw new RuntimeException("Couldn't create CollectionManager", e); + } + } + return impl; + } + + /** + * Returns named subcollection + * + * @param id + * @return Named SubCollection (or null if not existing) + */ + public Subcollection getSubColection(final String id) { + return (Subcollection) collectionMap.get(id); + } + + /** + * Delete named subcollection + * + * @param id + * Id of SubCollection to delete + */ + public void deleteSubCollection(final String id) throws IOException { + final Subcollection subCol = getSubColection(id); + if (subCol != null) { + collectionMap.remove(id); + } + } + + /** + * Create a new subcollection. + * + * @param name + * Name of SubCollection to create + * @return Created SubCollection or null if allready existed + */ + public Subcollection createSubCollection(final String id, final String name) { + Subcollection subCol = null; + + if (!collectionMap.containsKey(id)) { + subCol = new Subcollection(id, name, getConf()); + collectionMap.put(id, subCol); + } + + return subCol; + } + + /** + * Return names of collections url is part of + * + * @param url + * The url to test against Collections + * @return Space delimited string of collection names url is part of + */ + public List<String> getSubCollections(final String url) { + List<String> collections = new ArrayList<String>(); + final Iterator<Subcollection> iterator = collectionMap.values().iterator(); + + while (iterator.hasNext()) { + final Subcollection subCol = iterator.next(); + if (subCol.filter(url) != null) { + collections.add(subCol.name); + } + } + if (LOG.isTraceEnabled()) { + LOG.trace("subcollections:" + Arrays.toString(collections.toArray())); + } + + return collections; + } + + /** + * Returns all collections + * + * @return All collections CollectionManager knows about + */ + public Collection<Subcollection> getAll() { + return collectionMap.values(); + } + + /** + * Save collections into file + * + * @throws Exception + */ + public void save() throws IOException { + try { + final FileOutputStream fos = new FileOutputStream(new File( + configfile.getFile())); + final Document doc = new DocumentImpl(); + final Element collections = doc + .createElement(Subcollection.TAG_COLLECTIONS); + final Iterator<Subcollection> iterator = collectionMap.values() + .iterator(); + + while (iterator.hasNext()) { + final Subcollection subCol = iterator.next(); + final Element collection = doc + .createElement(Subcollection.TAG_COLLECTION); + collections.appendChild(collection); + final Element name = doc.createElement(Subcollection.TAG_NAME); + name.setNodeValue(subCol.getName()); + collection.appendChild(name); + final Element whiteList = doc + .createElement(Subcollection.TAG_WHITELIST); + whiteList.setNodeValue(subCol.getWhiteListString()); + collection.appendChild(whiteList); + final Element blackList = doc + .createElement(Subcollection.TAG_BLACKLIST); + blackList.setNodeValue(subCol.getBlackListString()); + collection.appendChild(blackList); + } + + DomUtil.saveDom(fos, collections); + fos.flush(); + fos.close(); + } catch (FileNotFoundException e) { + throw new IOException(e.toString()); + } + } } Modified: nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original) +++ nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Fri Jan 9 06:34:33 2015 @@ -31,30 +31,30 @@ import org.w3c.dom.NodeList; * SubCollection represents a subset of index, you can define url patterns that * will indicate that particular page (url) is part of SubCollection. */ -public class Subcollection extends Configured implements URLFilter{ - - public static final String TAG_COLLECTIONS="subcollections"; - public static final String TAG_COLLECTION="subcollection"; - public static final String TAG_WHITELIST="whitelist"; - public static final String TAG_BLACKLIST="blacklist"; - public static final String TAG_NAME="name"; - public static final String TAG_ID="id"; +public class Subcollection extends Configured implements URLFilter { + + public static final String TAG_COLLECTIONS = "subcollections"; + public static final String TAG_COLLECTION = "subcollection"; + public static final String TAG_WHITELIST = "whitelist"; + public static final String TAG_BLACKLIST = "blacklist"; + public static final String TAG_NAME = "name"; + public static final String TAG_ID = "id"; ArrayList<String> blackList = new ArrayList<String>(); ArrayList<String> whiteList = new ArrayList<String>(); - /** + /** * SubCollection identifier */ String id; - /** + /** * SubCollection name */ String name; - /** + /** * SubCollection whitelist as String */ String wlString; @@ -64,21 +64,24 @@ public class Subcollection extends Confi */ String blString; - /** public Constructor + /** + * public Constructor * - * @param id id of SubCollection - * @param name name of SubCollection + * @param id + * id of SubCollection + * @param name + * name of SubCollection */ public Subcollection(String id, String name, Configuration conf) { this(conf); - this.id=id; + this.id = id; this.name = name; } - public Subcollection(Configuration conf){ + public Subcollection(Configuration conf) { super(conf); } - + /** * @return Returns the name */ @@ -203,7 +206,8 @@ public class Subcollection extends Confi /** * Set contents of blacklist from String * - * @param list the blacklist contents + * @param list + * the blacklist contents */ public void setBlackList(String list) { this.blString = list; @@ -213,7 +217,8 @@ public class Subcollection extends Confi /** * Set contents of whitelist from String * - * @param list the whitelist contents + * @param list + * the whitelist contents */ public void setWhiteList(String list) { this.wlString = list; Modified: nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -32,48 +32,49 @@ import org.apache.nutch.storage.WebPage. import org.apache.nutch.util.NutchConfiguration; public class SubcollectionIndexingFilter extends Configured implements - IndexingFilter { + IndexingFilter { - public SubcollectionIndexingFilter() { - super(NutchConfiguration.create()); - } - - public SubcollectionIndexingFilter(Configuration conf) { - super(conf); - } - - /** - * Doc field name - */ - public static final String FIELD_NAME = "subcollection"; - - /** - * Logger - */ - public static final Logger LOG = LoggerFactory - .getLogger(SubcollectionIndexingFilter.class); - - /** - * "Mark" document to be a part of subcollection - * - * @param doc - * @param url - */ - private void addSubCollectionField(NutchDocument doc, String url) { - for (String collname: CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) { - doc.add(FIELD_NAME, collname); - } - } - - @Override - public Collection<Field> getFields() { - return new ArrayList<Field>(); - } - - @Override - public NutchDocument filter(NutchDocument doc, String url, WebPage page) - throws IndexingException { - addSubCollectionField(doc, url); - return doc; - } + public SubcollectionIndexingFilter() { + super(NutchConfiguration.create()); + } + + public SubcollectionIndexingFilter(Configuration conf) { + super(conf); + } + + /** + * Doc field name + */ + public static final String FIELD_NAME = "subcollection"; + + /** + * Logger + */ + public static final Logger LOG = LoggerFactory + .getLogger(SubcollectionIndexingFilter.class); + + /** + * "Mark" document to be a part of subcollection + * + * @param doc + * @param url + */ + private void addSubCollectionField(NutchDocument doc, String url) { + for (String collname : CollectionManager.getCollectionManager(getConf()) + .getSubCollections(url)) { + doc.add(FIELD_NAME, collname); + } + } + + @Override + public Collection<Field> getFields() { + return new ArrayList<Field>(); + } + + @Override + public NutchDocument filter(NutchDocument doc, String url, WebPage page) + throws IndexingException { + addSubCollectionField(doc, url); + return doc; + } } Modified: nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java (original) +++ nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java Fri Jan 9 06:34:33 2015 @@ -22,3 +22,4 @@ * {@link org.apache.nutch.collection}. */ package org.apache.nutch.indexer.subcollection; + Modified: nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (original) +++ nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Fri Jan 9 06:34:33 2015 @@ -26,31 +26,33 @@ import org.junit.Test; import static org.junit.Assert.*; public class TestSubcollection { - - /**Test filtering logic + + /** + * Test filtering logic * * @throws Exception */ @Test public void testFilter() throws Exception { - Subcollection sc=new Subcollection(NutchConfiguration.create()); + Subcollection sc = new Subcollection(NutchConfiguration.create()); sc.setWhiteList("www.nutch.org\nwww.apache.org"); sc.setBlackList("jpg\nwww.apache.org/zecret/"); - - //matches whitelist - assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html")); - - //matches blacklist + + // matches whitelist + assertEquals("http://www.apache.org/index.html", + sc.filter("http://www.apache.org/index.html")); + + // matches blacklist assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html")); assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); - - //no match + + // no match assertEquals(null, sc.filter("http://www.google.com/")); } - + @Test - public void testInput(){ - StringBuffer xml=new StringBuffer(); + public void testInput() { + StringBuffer xml = new StringBuffer(); xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); xml.append("<!-- just a comment -->"); xml.append("<subcollections>"); @@ -66,44 +68,45 @@ public class TestSubcollection { xml.append("</blacklist>"); xml.append("</subcollection>"); xml.append("</subcollections>"); - - InputStream is=new ByteArrayInputStream(xml.toString().getBytes()); - - CollectionManager cm=new CollectionManager(); + + InputStream is = new ByteArrayInputStream(xml.toString().getBytes()); + + CollectionManager cm = new CollectionManager(); cm.parse(is); - - Collection c=cm.getAll(); - + + Collection c = cm.getAll(); + // test that size matches - assertEquals(1,c.size()); - - Subcollection collection=(Subcollection)c.toArray()[0]; - - //test collection id + assertEquals(1, c.size()); + + Subcollection collection = (Subcollection) c.toArray()[0]; + + // test collection id assertEquals("nutch", collection.getId()); - - //test collection name + + // test collection name assertEquals("nutch collection", collection.getName()); - //test whitelist - assertEquals(2,collection.whiteList.size()); - - String wlUrl=(String)collection.whiteList.get(0); + // test whitelist + assertEquals(2, collection.whiteList.size()); + + String wlUrl = (String) collection.whiteList.get(0); assertEquals("http://lucene.apache.org/nutch/", wlUrl); - wlUrl=(String)collection.whiteList.get(1); + wlUrl = (String) collection.whiteList.get(1); assertEquals("http://wiki.apache.org/nutch/", wlUrl); - - //matches whitelist - assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/")); - //test blacklist - assertEquals(1,collection.blackList.size()); + // matches whitelist + assertEquals("http://lucene.apache.org/nutch/", + collection.filter("http://lucene.apache.org/nutch/")); + + // test blacklist + assertEquals(1, collection.blackList.size()); - String blUrl=(String)collection.blackList.get(0); + String blUrl = (String) collection.blackList.get(0); assertEquals("http://www.xxx.yyy", blUrl); - //no match + // no match assertEquals(null, collection.filter("http://www.google.com/")); } } Modified: nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -38,12 +38,13 @@ import org.apache.nutch.util.domain.Doma * @author Enis Soztutar <enis.soz.nu...@gmail.com> */ public class TLDIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(TLDIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(TLDIndexingFilter.class); private Configuration conf; private static final Collection<Field> fields = new ArrayList<Field>(); - + @Override public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { @@ -52,7 +53,7 @@ public class TLDIndexingFilter implement DomainSuffix d = URLUtil.getDomainSuffix(_url); doc.add("tld", d.getDomain()); } catch (Exception ex) { - LOG.warn("Exception in TLDIndexingFilter",ex); + LOG.warn("Exception in TLDIndexingFilter", ex); } return doc; Modified: nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java (original) +++ nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java Fri Jan 9 06:34:33 2015 @@ -38,70 +38,70 @@ import org.apache.nutch.util.domain.Doma */ public class TLDScoringFilter implements ScoringFilter { - private Configuration conf; - private DomainSuffixes tldEntries; + private Configuration conf; + private DomainSuffixes tldEntries; - private final static Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); + private final static Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); - public TLDScoringFilter() { - tldEntries = DomainSuffixes.getInstance(); - } - - public Configuration getConf() { - return conf; - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public Collection<WebPage.Field> getFields() { - return FIELDS; - } - - @Override - public void injectedScore(String url, WebPage page) - throws ScoringFilterException { - } - - @Override - public void initialScore(String url, WebPage page) - throws ScoringFilterException { - - } - - @Override - public float generatorSortValue(String url, WebPage page, float initSort) - throws ScoringFilterException { - return initSort; - } - - @Override - public void distributeScoreToOutlinks(String fromUrl, WebPage page, - Collection<ScoreDatum> scoreData, int allCount) - throws ScoringFilterException { - } - - @Override - public void updateScore(String url, WebPage page, - List<ScoreDatum> inlinkedScoreData) throws ScoringFilterException { - } - - @Override - public float indexerScore(String url, NutchDocument doc, WebPage page, - float initScore) throws ScoringFilterException { - List<String> tlds = doc.getFieldValues("tld"); - float boost = 1.0f; - - if (tlds != null) { - for (String tld : tlds) { - DomainSuffix entry = tldEntries.get(tld); - if (entry != null) - boost *= entry.getBoost(); - } - } - return initScore * boost; - } + public TLDScoringFilter() { + tldEntries = DomainSuffixes.getInstance(); + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public Collection<WebPage.Field> getFields() { + return FIELDS; + } + + @Override + public void injectedScore(String url, WebPage page) + throws ScoringFilterException { + } + + @Override + public void initialScore(String url, WebPage page) + throws ScoringFilterException { + + } + + @Override + public float generatorSortValue(String url, WebPage page, float initSort) + throws ScoringFilterException { + return initSort; + } + + @Override + public void distributeScoreToOutlinks(String fromUrl, WebPage page, + Collection<ScoreDatum> scoreData, int allCount) + throws ScoringFilterException { + } + + @Override + public void updateScore(String url, WebPage page, + List<ScoreDatum> inlinkedScoreData) throws ScoringFilterException { + } + + @Override + public float indexerScore(String url, NutchDocument doc, WebPage page, + float initScore) throws ScoringFilterException { + List<String> tlds = doc.getFieldValues("tld"); + float boost = 1.0f; + + if (tlds != null) { + for (String tld : tlds) { + DomainSuffix entry = tldEntries.get(tld); + if (entry != null) + boost *= entry.getBoost(); + } + } + return initScore * boost; + } } Modified: nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -28,11 +28,10 @@ import org.apache.nutch.storage.WebPage; import org.junit.Test; /** - * JUnit test case which populates a HashMap - * with URL's and top level domain qualifiers - * as key's and value's respectively. - * We assert that each value entry in the HashMap equals - * the expect field value for the document after being filtered. + * JUnit test case which populates a HashMap with URL's and top level domain + * qualifiers as key's and value's respectively. We assert that each value entry + * in the HashMap equals the expect field value for the document after being + * filtered. * */ Modified: nutch/branches/2.x/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Fri Jan 9 06:34:33 2015 @@ -32,12 +32,11 @@ import org.apache.nutch.net.*; import org.apache.nutch.urlfilter.api.RegexRule; import org.apache.nutch.urlfilter.api.RegexURLFilterBase; - /** - * RegexURLFilterBase implementation based on the - * <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> - * Finite-State Automata for Java<sup>TM</sup>. - * + * RegexURLFilterBase implementation based on the <a + * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State + * Automata for Java<sup>TM</sup>. + * * @author Jérôme Charron * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> */ @@ -49,24 +48,24 @@ public class AutomatonURLFilter extends super(); } - public AutomatonURLFilter(String filename) - throws IOException, PatternSyntaxException { + public AutomatonURLFilter(String filename) throws IOException, + PatternSyntaxException { super(filename); } - AutomatonURLFilter(Reader reader) - throws IOException, IllegalArgumentException { + AutomatonURLFilter(Reader reader) throws IOException, + IllegalArgumentException { super(reader); } - - /* ----------------------------------- * - * <implementation:RegexURLFilterBase> * - * ----------------------------------- */ - + /* + * ----------------------------------- * <implementation:RegexURLFilterBase> * + * ----------------------------------- + */ + /** - * Rules specified as a config property will override rules specified - * as a config file. + * Rules specified as a config property will override rules specified as a + * config file. */ protected Reader getRulesReader(Configuration conf) throws IOException { String stringRules = conf.get(URLFILTER_AUTOMATON_RULES); @@ -81,21 +80,20 @@ public class AutomatonURLFilter extends protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } - - /* ------------------------------------ * - * </implementation:RegexURLFilterBase> * - * ------------------------------------ */ - + /* + * ------------------------------------ * </implementation:RegexURLFilterBase> + * * ------------------------------------ + */ + public static void main(String args[]) throws IOException { main(new AutomatonURLFilter(), args); } - private class Rule extends RegexRule { - + private RunAutomaton automaton; - + Rule(boolean sign, String regex) { super(sign, regex); automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); @@ -105,5 +103,5 @@ public class AutomatonURLFilter extends return automaton.run(url); } } - + } Modified: nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java Fri Jan 9 06:34:33 2015 @@ -27,14 +27,13 @@ import org.apache.nutch.urlfilter.api.Re import org.junit.Test; import static org.junit.Assert.*; - /** * JUnit based test of class <code>AutomatonURLFilter</code>. - * + * * @author Jérôme Charron */ public class TestAutomatonURLFilter extends RegexURLFilterBaseTest { - + protected URLFilter getURLFilter(Reader rules) { try { return new AutomatonURLFilter(rules); @@ -43,7 +42,7 @@ public class TestAutomatonURLFilter exte return null; } } - + @Test public void test() { test("WholeWebCrawling"); Modified: nutch/branches/2.x/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Fri Jan 9 06:34:33 2015 @@ -35,35 +35,48 @@ import org.apache.nutch.util.URLUtil; import org.apache.nutch.util.domain.DomainSuffix; /** - * <p>Filters URLs based on a file containing domain suffixes, domain names, and + * <p> + * Filters URLs based on a file containing domain suffixes, domain names, and * hostnames. Only a url that matches one of the suffixes, domains, or hosts - * present in the file is allowed.</p> + * present in the file is allowed. + * </p> * - * <p>Urls are checked in order of domain suffix, domain name, and hostname - * against entries in the domain file. The domain file would be setup as follows - * with one entry per line: - * - * <pre> com apache.org www.apache.org </pre> - * - * <p>The first line is an example of a filter that would allow all .com - * domains. The second line allows all urls from apache.org and all of its - * subdomains such as lucene.apache.org and hadoop.apache.org. The third line - * would allow only urls from www.apache.org. There is no specific ordering to - * entries. The entries are from more general to more specific with the more - * general overridding the more specific.</p> + * <p> + * Urls are checked in order of domain suffix, domain name, and hostname against + * entries in the domain file. The domain file would be setup as follows with + * one entry per line: + * + * <pre> + * com apache.org www.apache.org + * </pre> + * + * <p> + * The first line is an example of a filter that would allow all .com domains. + * The second line allows all urls from apache.org and all of its subdomains + * such as lucene.apache.org and hadoop.apache.org. The third line would allow + * only urls from www.apache.org. There is no specific ordering to entries. The + * entries are from more general to more specific with the more general + * overridding the more specific. + * </p> * * The domain file defaults to domain-urlfilter.txt in the classpath but can be * overridden using the: * - * <ul> <ol>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</ol> - * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul> + * <ul> + * <ol> + * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and + * </ol> + * <ol> + * attribute "file" in plugin.xml of this plugin + * </ol> + * </ul> * * the attribute "file" has higher precedence if defined. */ -public class DomainURLFilter - implements URLFilter { +public class DomainURLFilter implements URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(DomainURLFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(DomainURLFilter.class); // read in attribute "file" of this plugin. private static String attributeFile = null; @@ -71,8 +84,7 @@ public class DomainURLFilter private String domainFile = null; private Set<String> domainSet = new LinkedHashSet<String>(); - private void readConfiguration(Reader configReader) - throws IOException { + private void readConfiguration(Reader configReader) throws IOException { // read the configuration file, line by line BufferedReader reader = new BufferedReader(configReader); @@ -95,7 +107,8 @@ public class DomainURLFilter /** * Constructor that specifies the domain file to use. * - * @param domainFile The domain file, overrides domain-urlfilter.text default. + * @param domainFile + * The domain file, overrides domain-urlfilter.text default. * * @throws IOException */ @@ -111,8 +124,8 @@ public class DomainURLFilter // get the extensions for domain urlfilter String pluginName = "urlfilter-domain"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - URLFilter.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -120,32 +133,30 @@ public class DomainURLFilter break; } } - + // handle blank non empty input if (attributeFile != null && attributeFile.trim().equals("")) { attributeFile = null; } - + if (attributeFile != null) { if (LOG.isInfoEnabled()) { LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); + + " as " + attributeFile); } - } - else { + } else { if (LOG.isWarnEnabled()) { LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " - + pluginName); + + pluginName); } } // domain file and attribute "file" take precedence if defined - String file = conf.get("urlfilter.domain.file"); + String file = conf.get("urlfilter.domain.file"); String stringRules = conf.get("urlfilter.domain.rules"); if (domainFile != null) { file = domainFile; - } - else if (attributeFile != null) { + } else if (attributeFile != null) { file = attributeFile; } Reader reader = null; @@ -159,8 +170,7 @@ public class DomainURLFilter reader = new FileReader(file); } readConfiguration(reader); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); } } @@ -173,7 +183,7 @@ public class DomainURLFilter try { - // match for suffix, domain, and host in that order. more general will + // match for suffix, domain, and host in that order. more general will // override more specific String domain = URLUtil.getDomainName(url).toLowerCase().trim(); String host = URLUtil.getHost(url); @@ -182,20 +192,19 @@ public class DomainURLFilter if (domainSuffix != null) { suffix = domainSuffix.getDomain(); } - + if (domainSet.contains(suffix) || domainSet.contains(domain) - || domainSet.contains(host)) { + || domainSet.contains(host)) { return url; } // doesn't match, don't allow return null; - } - catch (Exception e) { - + } catch (Exception e) { + // if an error happens, allow the url to pass LOG.error("Could not apply filter on url: " + url + "\n" - + org.apache.hadoop.util.StringUtils.stringifyException(e)); + + org.apache.hadoop.util.StringUtils.stringifyException(e)); return null; } } Modified: nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Fri Jan 9 06:34:33 2015 @@ -26,14 +26,14 @@ import org.apache.nutch.util.NutchConfig public class TestDomainURLFilter { - protected static final Logger LOG = LoggerFactory.getLogger(TestDomainURLFilter.class); + protected static final Logger LOG = LoggerFactory + .getLogger(TestDomainURLFilter.class); private final static String SEPARATOR = System.getProperty("file.separator"); private final static String SAMPLES = System.getProperty("test.data", "."); @Test - public void testFilter() - throws Exception { + public void testFilter() throws Exception { String domainFile = SAMPLES + SEPARATOR + "hosts.txt"; Configuration conf = NutchConfiguration.create(); Modified: nutch/branches/2.x/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java Fri Jan 9 06:34:33 2015 @@ -41,16 +41,19 @@ import java.util.List; import java.util.ArrayList; /** - * Filters URLs based on a file of URL prefixes. The file is named by - * (1) property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and - * (2) attribute "file" in plugin.xml of this plugin - * Attribute "file" has higher precedence if defined. - * - * <p>The format of this file is one URL prefix per line.</p> + * Filters URLs based on a file of URL prefixes. The file is named by (1) + * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2) + * attribute "file" in plugin.xml of this plugin Attribute "file" has higher + * precedence if defined. + * + * <p> + * The format of this file is one URL prefix per line. + * </p> */ public class PrefixURLFilter implements URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(PrefixURLFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(PrefixURLFilter.class); // read in attribute "file" of this plugin. private static String attributeFile = null; @@ -60,7 +63,7 @@ public class PrefixURLFilter implements private Configuration conf; public PrefixURLFilter() throws IOException { - + } public PrefixURLFilter(String stringRules) throws IOException { @@ -74,43 +77,43 @@ public class PrefixURLFilter implements return url; } - private TrieStringMatcher readConfiguration(Reader reader) - throws IOException { - - BufferedReader in=new BufferedReader(reader); + private TrieStringMatcher readConfiguration(Reader reader) throws IOException { + + BufferedReader in = new BufferedReader(reader); List<String> urlprefixes = new ArrayList<String>(); String line; - while((line=in.readLine())!=null) { + while ((line = in.readLine()) != null) { if (line.length() == 0) continue; - char first=line.charAt(0); + char first = line.charAt(0); switch (first) { - case ' ' : case '\n' : case '#' : // skip blank & comment lines + case ' ': + case '\n': + case '#': // skip blank & comment lines continue; - default : - urlprefixes.add(line); + default: + urlprefixes.add(line); } } return new PrefixStringMatcher(urlprefixes); } - public static void main(String args[]) - throws IOException { - + public static void main(String args[]) throws IOException { + PrefixURLFilter filter; if (args.length >= 1) filter = new PrefixURLFilter(args[0]); else filter = new PrefixURLFilter(); - - BufferedReader in=new BufferedReader(new InputStreamReader(System.in)); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; - while((line=in.readLine())!=null) { - String out=filter.filter(line); - if(out!=null) { + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { System.out.println(out); } } @@ -120,8 +123,8 @@ public class PrefixURLFilter implements this.conf = conf; String pluginName = "urlfilter-prefix"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - URLFilter.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -138,8 +141,8 @@ public class PrefixURLFilter implements } } else { // if (LOG.isWarnEnabled()) { - // LOG.warn("Attribute \"file\" is not defined in plugin.xml for - // plugin "+pluginName); + // LOG.warn("Attribute \"file\" is not defined in plugin.xml for + // plugin "+pluginName); // } } @@ -161,7 +164,9 @@ public class PrefixURLFilter implements try { trie = readConfiguration(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } // TODO m...@media-style.com: throw Exception? Because broken api. throw new RuntimeException(e.getMessage(), e); } @@ -171,5 +176,5 @@ public class PrefixURLFilter implements public Configuration getConf() { return this.conf; } - + } Modified: nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java Fri Jan 9 06:34:33 2015 @@ -23,39 +23,23 @@ import junit.textui.TestRunner; import java.io.IOException; - /** * JUnit test for <code>PrefixURLFilter</code>. - * + * * @author Talat Uyarer * @author Cihad Guzel */ public class TestPrefixURLFilter extends TestCase { - private static final String prefixes = - "# this is a comment\n" + - "\n" + - "http://\n" + - "https://\n" + - "file://\n" + - "ftp://\n"; + private static final String prefixes = "# this is a comment\n" + "\n" + + "http://\n" + "https://\n" + "file://\n" + "ftp://\n"; private static final String[] urls = new String[] { - "http://www.example.com/", - "https://www.example.com/", - "ftp://www.example.com/", - "file://www.example.com/", - "abcd://www.example.com/", - "www.example.com/", - }; - - private static String[] urlsModeAccept = new String[] { - urls[0], - urls[1], - urls[2], - urls[3], - null, - null - }; + "http://www.example.com/", "https://www.example.com/", + "ftp://www.example.com/", "file://www.example.com/", + "abcd://www.example.com/", "www.example.com/", }; + + private static String[] urlsModeAccept = new String[] { urls[0], urls[1], + urls[2], urls[3], null, null }; private PrefixURLFilter filter = null; Modified: nutch/branches/2.x/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Fri Jan 9 06:34:33 2015 @@ -28,13 +28,12 @@ import org.apache.nutch.urlfilter.api.Re import org.apache.nutch.urlfilter.api.RegexURLFilterBase; import org.apache.nutch.util.NutchConfiguration; - /** * Filters URLs based on a file of regular expressions using the * {@link java.util.regex Java Regex implementation}. */ public class RegexURLFilter extends RegexURLFilterBase { - + public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file"; public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules"; @@ -42,24 +41,23 @@ public class RegexURLFilter extends Rege super(); } - public RegexURLFilter(String filename) - throws IOException, PatternSyntaxException { + public RegexURLFilter(String filename) throws IOException, + PatternSyntaxException { super(filename); } - RegexURLFilter(Reader reader) - throws IOException, IllegalArgumentException { + RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException { super(reader); } - - /* ----------------------------------- * - * <implementation:RegexURLFilterBase> * - * ----------------------------------- */ - + /* + * ----------------------------------- * <implementation:RegexURLFilterBase> * + * ----------------------------------- + */ + /** - * Rules specified as a config property will override rules specified - * as a config file. + * Rules specified as a config property will override rules specified as a + * config file. */ protected Reader getRulesReader(Configuration conf) throws IOException { String stringRules = conf.get(URLFILTER_REGEX_RULES); @@ -74,23 +72,22 @@ public class RegexURLFilter extends Rege protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } - - /* ------------------------------------ * - * </implementation:RegexURLFilterBase> * - * ------------------------------------ */ - + /* + * ------------------------------------ * </implementation:RegexURLFilterBase> + * * ------------------------------------ + */ + public static void main(String args[]) throws IOException { RegexURLFilter filter = new RegexURLFilter(); filter.setConf(NutchConfiguration.create()); main(filter, args); } - private class Rule extends RegexRule { - + private Pattern pattern; - + Rule(boolean sign, String regex) { super(sign, regex); pattern = Pattern.compile(regex); @@ -100,5 +97,5 @@ public class RegexURLFilter extends Rege return pattern.matcher(url).find(); } } - + } Modified: nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Fri Jan 9 06:34:33 2015 @@ -28,11 +28,11 @@ import static org.junit.Assert.*; /** * JUnit based test of class <code>RegexURLFilter</code>. - * + * * @author Jérôme Charron */ public class TestRegexURLFilter extends RegexURLFilterBaseTest { - + protected URLFilter getURLFilter(Reader rules) { try { return new RegexURLFilter(rules); @@ -41,7 +41,7 @@ public class TestRegexURLFilter extends return null; } } - + @Test public void test() { test("WholeWebCrawling"); Modified: nutch/branches/2.x/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Fri Jan 9 06:34:33 2015 @@ -51,14 +51,15 @@ import java.net.MalformedURLException; * Attribute "file" has higher precedence if defined. If the config file is * missing, all URLs will be rejected. * - * <p>This filter can be configured to work in one of two modes: + * <p> + * This filter can be configured to work in one of two modes: * <ul> - * <li><b>default to reject</b> ('-'): in this mode, only URLs that match suffixes - * specified in the config file will be accepted, all other URLs will be - * rejected.</li> - * <li><b>default to accept</b> ('+'): in this mode, only URLs that match suffixes - * specified in the config file will be rejected, all other URLs will be - * accepted.</li> + * <li><b>default to reject</b> ('-'): in this mode, only URLs that match + * suffixes specified in the config file will be accepted, all other URLs will + * be rejected.</li> + * <li><b>default to accept</b> ('+'): in this mode, only URLs that match + * suffixes specified in the config file will be rejected, all other URLs will + * be accepted.</li> * </ul> * <p> * The format of this config file is one URL suffix per line, with no preceding @@ -67,10 +68,10 @@ import java.net.MalformedURLException; * </p> * <p> * A single '+' or '-' sign not followed by any suffix must be used once, to - * signify the mode this plugin operates in. An optional single 'I' can be appended, - * to signify that suffix matches should be case-insensitive. The default, if - * not specified, is to use case-sensitive matches, i.e. suffix '.JPG' - * does not match '.jpg'. + * signify the mode this plugin operates in. An optional single 'I' can be + * appended, to signify that suffix matches should be case-insensitive. The + * default, if not specified, is to use case-sensitive matches, i.e. suffix + * '.JPG' does not match '.jpg'. * </p> * <p> * NOTE: the format of this file is different from urlfilter-prefix, because @@ -82,8 +83,8 @@ import java.net.MalformedURLException; * <h4>Example 1</h4> * <p> * The configuration shown below will accept all URLs with '.html' or '.htm' - * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), - * and prohibit all other suffixes. + * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit + * all other suffixes. * <p> * * <pre> @@ -91,7 +92,7 @@ import java.net.MalformedURLException; * * # prohibit all unknown, case-sensitive matching * - - * + * * # collect only HTML files. * .html * .htm @@ -119,11 +120,13 @@ import java.net.MalformedURLException; * </pre> * * </p> + * * @author Andrzej Bialecki */ public class SuffixURLFilter implements URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(SuffixURLFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(SuffixURLFilter.class); // read in attribute "file" of this plugin. private String attributeFile = null; @@ -144,11 +147,13 @@ public class SuffixURLFilter implements } public String filter(String url) { - if (url == null) return null; + if (url == null) + return null; String _url; if (ignoreCase) _url = url.toLowerCase(); - else _url = url; + else + _url = url; if (filterFromPath) { try { URL pUrl = new URL(_url); @@ -160,11 +165,15 @@ public class SuffixURLFilter implements String a = suffixes.shortestMatch(_url); if (a == null) { - if (modeAccept) return url; - else return null; + if (modeAccept) + return url; + else + return null; } else { - if (modeAccept) return null; - else return url; + if (modeAccept) + return null; + else + return url; } } @@ -187,30 +196,31 @@ public class SuffixURLFilter implements String line; while ((line = in.readLine()) != null) { - if (line.length() == 0) continue; + if (line.length() == 0) + continue; char first = line.charAt(0); switch (first) { - case ' ': - case '\n': - case '#': // skip blank & comment lines - break; - case '-': - allow = false; - if(line.contains("P")) - filterFromPath = true; - if(line.contains("I")) - ignore = true; - break; - case '+': - allow = true; - if(line.contains("P")) - filterFromPath = true; - if(line.contains("I")) - ignore = true; - break; - default: - aSuffixes.add(line); + case ' ': + case '\n': + case '#': // skip blank & comment lines + break; + case '-': + allow = false; + if (line.contains("P")) + filterFromPath = true; + if (line.contains("I")) + ignore = true; + break; + case '+': + allow = true; + if (line.contains("P")) + filterFromPath = true; + if (line.contains("I")) + ignore = true; + break; + default: + aSuffixes.add(line); } } if (ignore) { @@ -249,7 +259,8 @@ public class SuffixURLFilter implements this.conf = conf; String pluginName = "urlfilter-suffix"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(URLFilter.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -257,22 +268,25 @@ public class SuffixURLFilter implements break; } } - if (attributeFile != null && attributeFile.trim().equals("")) attributeFile = null; + if (attributeFile != null && attributeFile.trim().equals("")) + attributeFile = null; if (attributeFile != null) { if (LOG.isInfoEnabled()) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName + " as " + attributeFile); + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); } } else { // if (LOG.isWarnEnabled()) { - // LOG.warn("Attribute \"file\" is not defined in plugin.xml for - // plugin "+pluginName); + // LOG.warn("Attribute \"file\" is not defined in plugin.xml for + // plugin "+pluginName); // } } String file = conf.get("urlfilter.suffix.file"); String stringRules = conf.get("urlfilter.suffix.rules"); // attribute "file" takes precedence if defined - if (attributeFile != null) file = attributeFile; + if (attributeFile != null) + file = attributeFile; Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); @@ -283,7 +297,9 @@ public class SuffixURLFilter implements try { readConfiguration(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } throw new RuntimeException(e.getMessage(), e); } }