ekoje ekoje napisał(a): > Hello, I tried to modify Nutch in order to pass through a web proxy as > advice below but it still doesn'tr work. > > I've got the following error: > > 2007-02-15 17:04:58,285 INFO fetcher.Fetcher - fetching > http://lucene.apache.org/nutch/ > 2007-02-15 17:04:58,300 INFO http.Http - http.proxy.host = ncproxy1 > 2007-02-15 17:04:58,300 INFO http.Http - http.proxy.port = 8080 > 2007-02-15 17:04:58,300 INFO http.Http - http.timeout = 10000 > 2007-02-15 17:04:58,300 INFO http.Http - http.content.limit = 65536 > 2007-02-15 17:04:58,300 INFO http.Http - http.agent = > NutchCVS/Nutch-0.9-dev > (C:\pbapps\nutch-nightly\conf\nutch-default.xml) > 2007-02-15 17:04:58,300 INFO http.Http - protocol.plugin.check.blocking > = true > 2007-02-15 17:04:58,300 INFO http.Http - protocol.plugin.check.robots = > true > 2007-02-15 17:04:58,300 INFO http.Http - fetcher.server.delay = 1000 > 2007-02-15 17:04:58,300 INFO http.Http - http.max.delays = 1000 > 2007-02-15 17:04:58,316 ERROR http.Http - > org.apache.nutch.protocol.http.api.HttpException: > java.net.UnknownHostException: > lucene.apache.org: lucene.apache.org > 2007-02-15 17:04:58,316 ERROR http.Http - at > org.apache.nutch.protocol.http.api.HttpBase.blockAddr(HttpBase.java:340) > 2007-02-15 17:04:58,316 ERROR http.Http - at > org.apache.nutch.protocol.http.api.HttpBase.getProtocolOutput(HttpBase.java:212) > > > 2007-02-15 17:04:58,316 ERROR http.Http - at > org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:145) > 2007-02-15 17:04:58,316 ERROR http.Http - Caused by: > java.net.UnknownHostException: > lucene.apache.org: lucene.apache.org > 2007-02-15 17:04:58,316 ERROR http.Http - at > java.net.InetAddress.getAllByName0(InetAddress.java:1128) > 2007-02-15 17:04:58,316 ERROR http.Http - at > java.net.InetAddress.getAllByName0(InetAddress.java:1098) > 2007-02-15 17:04:58,316 ERROR http.Http - at > java.net.InetAddress.getAllByName(InetAddress.java:1061) > 2007-02-15 17:04:58,316 ERROR 
http.Http - at > java.net.InetAddress.getByName(InetAddress.java:958) > 2007-02-15 17:04:58,316 ERROR http.Http - at > org.apache.nutch.protocol.http.api.HttpBase.blockAddr(HttpBase.java:336) > 2007-02-15 17:04:58,316 ERROR http.Http - ... 2 more > 2007-02-15 17:04:58,316 INFO fetcher.Fetcher - fetch of > http://lucene.apache.org/nutch/ failed with: > org.apache.nutch.protocol.http.api.HttpException: > java.net.UnknownHostException: > lucene.apache.org > : lucene.apache.org > 2007-02-15 17:04:59,597 INFO plugin.PluginRepository - Plugins: looking > in: > C:\pbapps\nutch-nightly\plugins > 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Plugin > Auto-activation mode: > [true] > 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Registered Plugins: > 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - > CyberNeko HTML > Parser (lib-nekohtml) > 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Site > Query Filter > > > Could you please help me to go through this proxy with authentication ? > > Thanks, > > > [-] Hi > [-] > [-] I was having the same problem running nutch behind a web proxy. > [-] But with little changes in the plugin protocol-httpclient this works > for > [-] me. > [-] > [-] See source below for my changes. > [-] > [-] > [-] public class Http extends HttpBase { > [-] > [-] public static final Log LOG = LogFactory.getLog(Http.class); > [-] > [-] private static MultiThreadedHttpConnectionManager connectionManager = > [-] new MultiThreadedHttpConnectionManager(); > [-] > [-] // Since the Configuration has not yet been setted, > [-] // then an unconfigured client is returned. 
> [-] private static HttpClient client = new HttpClient(connectionManager); > [-] > [-] static synchronized HttpClient getClient() { > [-] return client; > [-] } > [-] > [-] boolean verbose = false; > [-] int maxThreadsTotal = 10; > [-] String ntlmUsername = ""; > [-] String ntlmPassword = ""; > [-] String ntlmDomain = ""; > [-] String ntlmHost = ""; > [-] > [-] String proxyuser = ""; > [-] String proxypass = ""; > [-] > [-] public Http() { > [-] super(LOG); > [-] } > [-] > [-] public void setConf(Configuration conf) { > [-] super.setConf(conf); > [-] this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10); > [-] this.ntlmUsername = conf.get("http.auth.ntlm.username", ""); > [-] this.ntlmPassword = conf.get("http.auth.ntlm.password", ""); > [-] this.ntlmDomain = conf.get("http.auth.ntlm.domain", ""); > [-] this.ntlmHost = conf.get("http.auth.ntlm.host", ""); > [-] > [-] > [-] // add config for auth proxy > [-] this.proxyuser = conf.get("http.auth.proxy.username", ""); > [-] this.proxypass = conf.get("http.auth.proxy.password", ""); > [-] > [-] > [-] //Level logLevel = Level.WARNING; > [-] //if (conf.getBoolean("http.verbose", false)) { > [-] // logLevel = Level.FINE; > [-] //} > [-] //LOG.setLevel(logLevel); > [-] > //Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector > ") > [-] // .setLevel(logLevel); > [-] configureClient(); > [-] } > [-] > [-] public static void main(String[] args) throws Exception { > [-] Http http = new Http(); > [-] http.setConf(NutchConfiguration.create()); > [-] main(http, args); > [-] } > [-] > [-] protected Response getResponse(URL url, CrawlDatum datum, boolean > [-] redirect) > [-] throws ProtocolException, IOException { > [-] return new HttpResponse(this, url, datum, redirect); > [-] } > [-] > [-] private void configureClient() { > [-] > [-] // Set up an HTTPS socket factory that accepts self-signed certs. 
> [-] //Protocol dummyhttps = new Protocol("https", new > [-] DummySSLProtocolSocketFactory(), 443); > [-] //Protocol.registerProtocol("https", dummyhttps); > [-] > [-] HttpConnectionManagerParams params = connectionManager.getParams(); > [-] params.setConnectionTimeout(timeout); > [-] params.setSoTimeout(timeout); > [-] params.setSendBufferSize(BUFFER_SIZE); > [-] params.setReceiveBufferSize(BUFFER_SIZE); > [-] params.setMaxTotalConnections(maxThreadsTotal); > [-] if (maxThreadsTotal > maxThreadsPerHost) { > [-] params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost); > [-] } else { > [-] params.setDefaultMaxConnectionsPerHost(maxThreadsTotal); > [-] } > [-] > [-] HostConfiguration hostConf = client.getHostConfiguration(); > [-] ArrayList headers = new ArrayList(); > [-] // prefer English > [-] headers.add(new Header("Accept-Language", > [-] "en-us,en-gb,en;q=0.7,*;q=0.3")); > [-] // prefer UTF-8 > [-] headers.add(new Header("Accept-Charset", > [-] "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); > [-] // prefer understandable formats > [-] headers.add(new Header("Accept", > [-] > [-] "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9 > ,text/p > [-] lain;q=0.8,image/png,*/*;q=0.5")); > [-] // accept gzipped content > [-] headers.add(new Header("Accept-Encoding", "x-gzip, gzip")); > [-] hostConf.getParams().setParameter("http.default-headers", headers); > [-] if (useProxy) { > [-] hostConf.setProxy(proxyHost, proxyPort); > [-] // add support for proxy authentication > [-] if (proxyuser.length() > 0 ) { > [-] Credentials proxyCreds = new > [-] UsernamePasswordCredentials(proxyuser,proxypass); > [-] client.getState().setProxyCredentials(new > [-] AuthScope(proxyHost,AuthScope.ANY_PORT), proxyCreds); > [-] } > [-] } > [-] if (ntlmUsername.length() > 0) { > [-] Credentials ntCreds = new NTCredentials(ntlmUsername, > ntlmPassword, > [-] ntlmHost, ntlmDomain); > [-] client.getState().setCredentials(new AuthScope(ntlmHost, > [-] AuthScope.ANY_PORT), ntCreds); > 
[-] > [-] if (LOG.isInfoEnabled()) { > [-] LOG.info("Added NTLM credentials for " + ntlmUsername); > [-] } > [-] } > [-] if (LOG.isInfoEnabled()) { LOG.info("Configured Client"); } > [-] } > [-] } > [-] > [-] > [-] -----Ursprüngliche Nachricht----- > [-] Von: ekoje ekoje [mailto:[EMAIL PROTECTED] > [-] Gesendet: Donnerstag, 8. Februar 2007 15:36 > [-] An: [email protected] > [-] Betreff: Web Proxy > [-] > [-] Hi Guys, > [-] > [-] I would like to run nutch but I'm behind a web proxy with > authentication. > [-] > [-] I use nutch-0.8.1 under windows XP. Ive configured nutch-site.xml to > [-] specify > [-] my proxy host and port but how do i specify the username and password ? > [-] > [-] Could you please help me ? > [-] > [-] Thanks > [-] > This exception tells you that Nutch can't resolve the hostname. Try to pass the IP address rather than the hostname of your proxy server (if that's possible), or make sure that Nutch can resolve the hostname.
-- Damian Florczyk aka thunder Gentoo Developer, Gentoo/NetBSD Development Lead ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-general mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/nutch-general
