Author: kwright
Date: Tue May 26 10:41:38 2015
New Revision: 1681736
URL: http://svn.apache.org/r1681736
Log:
Pull up fix for CONNECTORS-1193 from trunk.
Added:
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
- copied unchanged from r1681735,
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
Modified:
manifoldcf/branches/dev_1x/ (props changed)
manifoldcf/branches/dev_1x/CHANGES.txt
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
manifoldcf/branches/dev_1x/framework/ (props changed)
manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 26 10:41:38 2015
@@ -123,4 +123,4 @@
/manifoldcf/branches/CONNECTORS-981:1605049-1605773
/manifoldcf/branches/CONNECTORS-989:1611600-1612101
/manifoldcf/branches/CONNECTORS-990:1610284-1610707
-/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094,1676882,1676910,1678300,1678329,1678471,1678551,1679730,1679826,1681390
+/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094,1676882,1676910,1678300,1678329,1678471,1678551,1679730,1679826,1681390,1681735
Modified: manifoldcf/branches/dev_1x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/CHANGES.txt?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/CHANGES.txt (original)
+++ manifoldcf/branches/dev_1x/CHANGES.txt Tue May 26 10:41:38 2015
@@ -3,6 +3,10 @@ $Id$
======================= 1.10-dev =====================
+CONNECTORS-1193: Add ability to discard web content based on a
+set of regular expressions.
+(Arcadius Ahouansou)
+
CONNECTORS-1199: SearchBlox connector formatting non-standard.
(Karl Wright)
Modified:
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
(original)
+++
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
Tue May 26 10:41:38 2015
@@ -18,15 +18,17 @@
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;
-import org.apache.manifoldcf.core.interfaces.*;
-import org.apache.manifoldcf.crawler.system.Logging;
-import java.util.regex.*;
-import java.util.*;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import static java.util.Arrays.asList;
+import static org.apache.manifoldcf.crawler.system.Logging.connectors;
/** This class is the handler for HTML content grepping during state
transitions */
public class FindContentHandler extends FindHandler implements IHTMLHandler
{
- protected final Pattern contentPattern;
+ protected final List<Pattern> contentPatterns;
protected final StringBuilder contentBuffer = new StringBuilder();
protected final static int MAX_LENGTH = 65536;
@@ -35,7 +37,13 @@ public class FindContentHandler extends
public FindContentHandler(String parentURI, Pattern contentPattern)
{
super(parentURI);
- this.contentPattern = contentPattern;
+ this.contentPatterns = asList(contentPattern);
+ }
+
+ public FindContentHandler(String parentURI, List<Pattern> contentPatterns)
+ {
+ super(parentURI);
+ this.contentPatterns = contentPatterns;
}
/** Apply overrides */
@@ -69,10 +77,14 @@ public class FindContentHandler extends
// continuity
String bufferContents = contentBuffer.toString();
contentBuffer.setLength(0);
- if (contentPattern.matcher(bufferContents).find())
- targetURI = "";
- else
- {
+ for (Pattern contentPattern : contentPatterns) {
+ if (contentPattern.matcher(bufferContents).find()) {
+ targetURI = "";
+ break;
+ }
+ }
+
+ if(targetURI == null) {
contentBuffer.append(bufferContents.substring(bufferContents.length() -
OVERLAP_AMOUNT));
}
}
@@ -153,8 +165,12 @@ public class FindContentHandler extends
{
String bufferContents = contentBuffer.toString();
contentBuffer.setLength(0);
- if (contentPattern.matcher(bufferContents).find())
- targetURI = "";
+ for(Pattern contentPattern: contentPatterns) {
+ if (contentPattern.matcher(bufferContents).find()) {
+ targetURI = "";
+ return;
+ }
+ }
}
}
Modified:
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
(original)
+++
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
Tue May 26 10:41:38 2015
@@ -137,6 +137,12 @@ public class WebcrawlerConfig
* if any one matches, causes the URL to be excluded from indexing. These
* regexps are newline separated, and # starts a comment. */
public static final String NODE_EXCLUDESINDEX = "excludesindex";
+
+ /**
+ * Exclude any page containing specified regex in their body from index
+ */
+ public static final String NODE_EXCLUDESCONTENTINDEX =
"excludescontentindex";
+
/** Limit to seeds. When value attribute is true, only seed domains will be
permitted. */
public static final String NODE_LIMITTOSEEDS = "limittoseeds";
/** Canonicalization rule. Attributes are regexp, description, reorder,
Modified:
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
(original)
+++
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Tue May 26 10:41:38 2015
@@ -28,10 +28,6 @@ import org.apache.manifoldcf.ui.util.Enc
import org.apache.manifoldcf.core.fuzzyml.*;
import org.apache.http.conn.ConnectTimeoutException;
-import org.apache.http.client.RedirectException;
-import org.apache.http.client.CircularRedirectException;
-import org.apache.http.NoHttpResponseException;
-import org.apache.http.HttpException;
import java.io.*;
import java.nio.charset.StandardCharsets;
@@ -507,7 +503,7 @@ public class WebcrawlerConnector extends
*@param activities is the interface this method should use to perform
whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
*@param seedTime is the end of the time range of documents to consider,
exclusive.
- *@param lastSeedVersionString is the last seeding version string for this
job, or null if the job has no previous seeding version string.
+ *@param lastSeedVersion is the last seeding version string for this job, or
null if the job has no previous seeding version string.
*@param jobMode is an integer describing how the job is being run, whether
continuous or once-only.
*@return an updated seeding version string, to be stored with the job.
*/
@@ -1335,7 +1331,7 @@ public class WebcrawlerConnector extends
activities.noDocument(documentIdentifier,versionString);
return;
}
-
+
if (activities.checkURLIndexable(documentIdentifier) == false)
{
if (Logging.connectors.isDebugEnabled())
@@ -1387,7 +1383,15 @@ public class WebcrawlerConnector extends
activities.noDocument(documentIdentifier,versionString);
return;
}
-
+
+ if(!filter.isDocumentContentIndexable(documentIdentifier)){
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document
'"+documentIdentifier+"', not indexing because document content matched
document content exclusion rule");
+ errorCode = activities.EXCLUDED_CONTENT;
+ errorDesc = "Rejected due to content exclusion rule";
+ activities.noDocument(documentIdentifier,versionString);
+ return;
+ }
// Ingest the document
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Decided to ingest
'"+documentIdentifier+"'");
@@ -3895,6 +3899,8 @@ public class WebcrawlerConnector extends
String exclusions = "";
String inclusionsIndex = ".*\n";
String exclusionsIndex = "";
+ String exclusionsContentIndex = "";
+
boolean includeMatching = true;
Set<String> excludedHeaders = new HashSet<String>();
@@ -3945,6 +3951,12 @@ public class WebcrawlerConnector extends
if (exclusionsIndex == null)
exclusionsIndex = "";
}
+ else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ {
+ exclusionsContentIndex = sn.getValue();
+ if (exclusionsContentIndex == null)
+ exclusionsContentIndex = "";
+ }
else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
{
String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -4302,14 +4314,21 @@ public class WebcrawlerConnector extends
" <textarea rows=\"10\" cols=\"60\"
name=\""+seqPrefix+"exclusionsindex\">"+Encoder.bodyEscape(exclusionsIndex)+"</textarea>\n"+
" </td>\n"+
" </tr>\n"+
+" <tr>\n"+
+" <td class=\"description\" colspan=\"1\"><nobr>" +
Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") +
"</nobr></td>\n"+
+" <td class=\"value\" colspan=\"1\">\n"+
+" <textarea rows=\"10\" cols=\"60\"
name=\""+seqPrefix+"exclusionscontentindex\">"+Encoder.bodyEscape(exclusionsContentIndex)+"</textarea>\n"+
+" </td>\n"+
+" </tr>\n"+
"</table>\n"
);
}
else
{
out.print(
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusions\"
value=\""+Encoder.attributeEscape(exclusions)+"\"/>\n"+
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusionsindex\"
value=\""+Encoder.attributeEscape(exclusionsIndex)+"\"/>\n"
+ "<input type=\"hidden\" name=\"" + seqPrefix + "exclusions\"
value=\"" + Encoder.attributeEscape(exclusions) + "\"/>\n" +
+ "<input type=\"hidden\" name=\"" + seqPrefix +
"exclusionsindex\" value=\"" + Encoder.attributeEscape(exclusionsIndex) +
"\"/>\n" +
+ "<input type=\"hidden\" name=\"" + seqPrefix +
"exclusionscontentindex\" value=\"" +
Encoder.attributeEscape(exclusionsContentIndex) + "\"/>\n"
);
}
@@ -4756,6 +4775,26 @@ public class WebcrawlerConnector extends
ds.addChild(ds.getChildCount(),cn);
}
+ // Get the content index exclusions
+ String exclusionsContentIndex =
variableContext.getParameter(seqPrefix+"exclusionscontentindex");
+ if (exclusionsContentIndex != null)
+ {
+ // Delete existing content exclusions record first
+ int i = 0;
+ while (i < ds.getChildCount())
+ {
+ SpecificationNode sn = ds.getChild(i);
+ if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ ds.removeChild(i);
+ else
+ i++;
+ }
+
+ SpecificationNode cn = new
SpecificationNode(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX);
+ cn.setValue(exclusionsContentIndex);
+ ds.addChild(ds.getChildCount(),cn);
+ }
+
// Read the url specs
String urlRegexpCount =
variableContext.getParameter(seqPrefix+"urlregexpcount");
if (urlRegexpCount != null && urlRegexpCount.length() > 0)
@@ -4962,6 +5001,8 @@ public class WebcrawlerConnector extends
String exclusions = "";
String inclusionsIndex = ".*\n";
String exclusionsIndex = "";
+ String exclusionsContentIndex = "";
+
boolean includeMatching = false;
Set<String> excludedHeaders = new HashSet<String>();
@@ -4999,6 +5040,12 @@ public class WebcrawlerConnector extends
if (exclusionsIndex == null)
exclusionsIndex = "";
}
+ else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ {
+ exclusionsContentIndex = sn.getValue();
+ if (exclusionsContentIndex == null)
+ exclusionsContentIndex = "";
+ }
else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
{
String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -5377,6 +5424,48 @@ public class WebcrawlerConnector extends
out.print(
" </td>\n"+
" </tr>\n"+
+" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
+" <tr>\n"+
+" <td class=\"description\"><nobr>" +
Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") +
"</nobr></td>\n"+
+" <td class=\"value\">\n"
+ );
+ try
+ {
+ java.io.Reader str = new java.io.StringReader(exclusionsContentIndex);
+ try
+ {
+ java.io.BufferedReader is = new java.io.BufferedReader(str);
+ try
+ {
+ while (true)
+ {
+ String nextString = is.readLine();
+ if (nextString == null)
+ break;
+ if (nextString.length() == 0)
+ continue;
+ out.print(
+" <nobr>"+Encoder.bodyEscape(nextString)+"</nobr><br/>\n"
+ );
+ }
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+ finally
+ {
+ str.close();
+ }
+ }
+ catch (java.io.IOException e)
+ {
+ throw new ManifoldCFException("IO error: "+e.getMessage(),e);
+ }
+ out.print(
+" </td>\n"+
+" </tr>\n"+
" \n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
);
@@ -6197,7 +6286,7 @@ public class WebcrawlerConnector extends
handler.applyOverrides(lp);
return handler.getTargetURI();
}
-
+
/** Find HTML link URI, if present, making sure specified preference is
matched. */
protected String findHTMLLinkURI(String currentURI, LoginParameters lp)
throws ManifoldCFException
@@ -8013,7 +8102,7 @@ public class WebcrawlerConnector extends
/** This class describes the url filtering information (for crawling and
indexing) obtained from a digested DocumentSpecification.
*/
- protected static class DocumentURLFilter
+ protected class DocumentURLFilter
{
/** The version string */
protected String versionString;
@@ -8029,7 +8118,10 @@ public class WebcrawlerConnector extends
protected final List<Pattern> excludeIndexPatterns = new
ArrayList<Pattern>();
/** The hash map of seed hosts, to limit urls by, if non-null */
protected Set<String> seedHosts = null;
-
+
+ /**List of content exclusion pattern*/
+ protected final List<Pattern> excludeContentIndexPatterns = new
ArrayList<Pattern>();
+
/** Canonicalization policies */
protected final CanonicalizationPolicies canonicalizationPolicies = new
CanonicalizationPolicies();
@@ -8045,6 +8137,7 @@ public class WebcrawlerConnector extends
String excludes = "";
String includesIndex = ".*";
String excludesIndex = "";
+ String excludesContentIndex = "";
String seeds = "";
List<String> packList = new ArrayList<String>();
String[] packStuff = new String[2];
@@ -8176,12 +8269,19 @@ public class WebcrawlerConnector extends
throw new ManifoldCFException("Canonicalization regular expression
'"+urlRegexp+"' is illegal: "+e.getMessage(),e);
}
}
+ else if
(sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ {
+ excludesContentIndex = sn.getValue();
+ if (excludesContentIndex == null)
+ excludesContentIndex = "";
+ }
}
// Note: format change since MCF 1.7 release
StringBuilder versionBuffer = new StringBuilder();
pack(versionBuffer,includesIndex,'+');
pack(versionBuffer,excludesIndex,'+');
+ pack(versionBuffer,excludesContentIndex,'+');
packList(versionBuffer,packList,'+');
versionString = versionBuffer.toString();
@@ -8194,7 +8294,9 @@ public class WebcrawlerConnector extends
compileList(includeIndexPatterns,list);
list = stringToArray(excludesIndex);
compileList(excludeIndexPatterns,list);
-
+ list = stringToArray(excludesContentIndex);
+ compileList(excludeContentIndexPatterns,list);
+
if (limitToSeeds)
{
seedHosts = new HashSet<String>();
@@ -8365,6 +8467,30 @@ public class WebcrawlerConnector extends
return canonicalizationPolicies;
}
+ public boolean isDocumentContentIndexable(String documentIdentifier)
throws ManifoldCFException {
+ String content = findSpecifiedContent(documentIdentifier,
excludeContentIndexPatterns);
+ if (content != null) {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Url '" + documentIdentifier + "' is
not indexable because content exclusion pattern was matched");
+
+ return false;
+ }
+ return true;
+ }
+
+ protected String findSpecifiedContent(String currentURI, List<Pattern>
patterns) throws ManifoldCFException
+ {
+ if (excludeContentIndexPatterns.isEmpty()) {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: no content exclusion rule supplied...
returning");
+ return null;
+ }
+
+ FindContentHandler handler = new FindContentHandler(currentURI,
patterns);
+ handleHTML(currentURI, handler);
+ return handler.getTargetURI();
+ }
+
}
protected static class FetchStatus
Modified:
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
(original)
+++
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
Tue May 26 10:41:38 2015
@@ -76,6 +76,7 @@ WebcrawlerConnector.IncludeInIndex=Inclu
WebcrawlerConnector.IncludeOnlyHostsMatchingSeeds=Include only hosts matching
seeds?
WebcrawlerConnector.ExcludeFromCrawl=Exclude from crawl:
WebcrawlerConnector.ExcludeFromIndex=Exclude from index:
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
WebcrawlerConnector.DeleteToken=Delete token #
WebcrawlerConnector.NoAccessTokensPresent=No access tokens present
WebcrawlerConnector.AddAccessToken=Add access token
Modified:
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
(original)
+++
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
Tue May 26 10:41:38 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
WebcrawlerConnector.ExcludeFromCrawl=クロールから除外：
WebcrawlerConnector.ExcludeFromIndex=索引を除外：
WebcrawlerConnector.DeleteToken=トークンを削除 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
WebcrawlerConnector.NoAccessTokensPresent=アクセストークンがありません
WebcrawlerConnector.AddAccessToken=アクセストークンを追加
WebcrawlerConnector.DeleteMetadata=メタデータを削除 #
Modified:
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
(original)
+++
manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
Tue May 26 10:41:38 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
WebcrawlerConnector.ExcludeFromCrawl=排除于爬虫外:
WebcrawlerConnector.ExcludeFromIndex=排除于索引外:
WebcrawlerConnector.DeleteToken=删除令牌 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
WebcrawlerConnector.NoAccessTokensPresent=访问令牌不存在
WebcrawlerConnector.AddAccessToken=添加访问令牌
WebcrawlerConnector.DeleteMetadata=删除元数据 #
Propchange: manifoldcf/branches/dev_1x/framework/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 26 10:41:38 2015
@@ -113,4 +113,4 @@
/manifoldcf/branches/CONNECTORS-989/framework:1611600-1612101
/manifoldcf/branches/CONNECTORS-990/framework:1610284-1610707
/manifoldcf/trunk:1629122
-/manifoldcf/trunk/framework:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1632013,1632289,1632844,1633108,1633193,1633202,1633348,1633364,1634145,1634148,1634155,1634264,1634373,1634530,1635438,1635809,1636146,1636180,1636207,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1639593,1639600,1640018,1640101,1640199,1640314,1640319,1640749,1640772,1640925,1640941,1641222,1641557,1641559,1
641724,1641911,1642163,1642255,1642318,1644197,1644399,1646317,1646397,1646403,1646640,1647574,1647585,1647608,1649605,1650351,1650911,1651332,1651539,1651921,1655377,1655411,1657346,1658004,1658036,1660258,1660276,1669487,1670614,1672616,1672737,1676094,1681390
+/manifoldcf/trunk/framework:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1632013,1632289,1632844,1633108,1633193,1633202,1633348,1633364,1634145,1634148,1634155,1634264,1634373,1634530,1635438,1635809,1636146,1636180,1636207,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1639593,1639600,1640018,1640101,1640199,1640314,1640319,1640749,1640772,1640925,1640941,1641222,1641557,1641559,1
641724,1641911,1642163,1642255,1642318,1644197,1644399,1646317,1646397,1646403,1646640,1647574,1647585,1647608,1649605,1650351,1650911,1651332,1651539,1651921,1655377,1655411,1657346,1658004,1658036,1660258,1660276,1669487,1670614,1672616,1672737,1676094,1681390,1681735
Modified:
manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
(original)
+++
manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
Tue May 26 10:41:38 2015
@@ -35,6 +35,8 @@ public interface IOutputHistoryActivity
public static final String EXCLUDED_LENGTH = "EXCLUDEDLENGTH";
public static final String EXCLUDED_MIMETYPE = "EXCLUDEDMIMETYPE";
public static final String EXCLUDED_DATE = "EXCLUDEDDATE";
+ public static final String EXCLUDED_CONTENT = "EXCLUDEDCONTENT";
+
/**
* Use this result code when security info is not recognized.
*/
Modified:
manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
URL:
http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java?rev=1681736&r1=1681735&r2=1681736&view=diff
==============================================================================
---
manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
(original)
+++
manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
Tue May 26 10:41:38 2015
@@ -32,6 +32,8 @@ public interface IHistoryActivity
public static final String EXCLUDED_LENGTH =
IOutputHistoryActivity.EXCLUDED_LENGTH;
public static final String EXCLUDED_MIMETYPE =
IOutputHistoryActivity.EXCLUDED_MIMETYPE;
public static final String EXCLUDED_DATE =
IOutputHistoryActivity.EXCLUDED_DATE;
+ public static final String EXCLUDED_CONTENT =
IOutputHistoryActivity.EXCLUDED_CONTENT;
+
/**
* Use this result code when you get URL value from repository and it is not
valid.
*/