Author: lewismc Date: Fri Feb 13 22:20:15 2015 New Revision: 1659701 URL: http://svn.apache.org/r1659701 Log: NUTCH-827 HTTP POST Authentication
Added: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java Added: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java?rev=1659701&view=auto ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java (added) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java Fri Feb 13 22:20:15 2015 @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * (Closes the Apache License 2.0 header that opens in the text just above.)
 *
 * Declared in the original file as: package org.apache.nutch.protocol.httpclient
 */

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Mutable holder for the settings of a form-based (HTTP POST) login.
 *
 * <p>
 * Every setter returns {@code this} so calls can be chained. The collection
 * getters never return {@code null}: they return the live collection held by
 * this object, so entries added through a getter are retained.
 * </p>
 */
public class HttpFormAuthConfigurer {
  /** URL of the login page / login form target. */
  private String loginUrl;
  /** Identifier of the login form inside the login page. */
  private String loginFormId;
  /**
   * The data posted to login form, such as username(or email), password
   */
  private Map<String, String> loginPostData = new HashMap<String, String>();
  /**
   * In case we need add additional headers.
   */
  private Map<String, String> additionalPostHeaders = new HashMap<String, String>();
  /**
   * If http post login returns redirect code: 301 or 302, Http Client will
   * automatically follow the redirect.
   */
  private boolean loginRedirect;
  /**
   * Used when we need remove some form fields.
   */
  private Set<String> removedFormFields = new HashSet<String>();

  /** Creates a configurer with no URL, no form id and empty collections. */
  public HttpFormAuthConfigurer() {
  }

  /**
   * @return the configured login URL, or {@code null} if none was set
   */
  public String getLoginUrl() {
    return loginUrl;
  }

  /**
   * @param loginUrl
   *          the login URL to store
   * @return this configurer, for chaining
   */
  public HttpFormAuthConfigurer setLoginUrl(String loginUrl) {
    this.loginUrl = loginUrl;
    return this;
  }

  /**
   * @return the configured login form identifier, or {@code null} if none was
   *         set
   */
  public String getLoginFormId() {
    return loginFormId;
  }

  /**
   * @param loginForm
   *          the login form identifier to store
   * @return this configurer, for chaining
   */
  public HttpFormAuthConfigurer setLoginFormId(String loginForm) {
    this.loginFormId = loginForm;
    return this;
  }

  /**
   * @return the live map of fields to post with the login form; never
   *         {@code null}
   */
  public Map<String, String> getLoginPostData() {
    return loginPostData;
  }

  /**
   * @param loginPostData
   *          the map to use (stored by reference); {@code null} resets to a
   *          new empty map
   * @return this configurer, for chaining
   */
  public HttpFormAuthConfigurer setLoginPostData(
      Map<String, String> loginPostData) {
    // Normalise null here so the getter can hand out the live collection.
    this.loginPostData = loginPostData == null ? new HashMap<String, String>()
        : loginPostData;
    return this;
  }

  /**
   * @return the live map of additional request headers; never {@code null}
   */
  public Map<String, String> getAdditionalPostHeaders() {
    return additionalPostHeaders;
  }

  /**
   * @param additionalPostHeaders
   *          the map to use (stored by reference); {@code null} resets to a
   *          new empty map
   * @return this configurer, for chaining
   */
  public HttpFormAuthConfigurer setAdditionalPostHeaders(
      Map<String, String> additionalPostHeaders) {
    this.additionalPostHeaders = additionalPostHeaders == null
        ? new HashMap<String, String>()
        : additionalPostHeaders;
    return this;
  }

  /**
   * @return {@code true} if the login POST is configured to follow redirects
   */
  public boolean isLoginRedirect() {
    return loginRedirect;
  }

  /**
   * @param redirect
   *          whether the login POST should follow redirects
   * @return this configurer, for chaining
   */
  public HttpFormAuthConfigurer setLoginRedirect(boolean redirect) {
    this.loginRedirect = redirect;
    return this;
  }

  /**
   * @return the live set of form field names to drop from the login form;
   *         never {@code null}
   */
  public Set<String> getRemovedFormFields() {
    return removedFormFields;
  }

  /**
   * @param removedFormFields
   *          the set to use (stored by reference); {@code null} resets to a
   *          new empty set
   * @return this configurer, for chaining
   */
  public HttpFormAuthConfigurer setRemovedFormFields(
      Set<String> removedFormFields) {
    this.removedFormFields = removedFormFields == null ? new HashSet<String>()
        : removedFormFields;
    return this;
  }
}

/*
 * Added: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
 * URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java?rev=1659701&view=auto
 * ==============================================================================
 * --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java (added)
 * +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java Fri Feb 13 22:20:15 2015
 * @@ -0,0 +1,223 @@
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
 * (The license header of HttpFormAuthentication.java continues in the text below.)
 */
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.httpclient; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.CookieHandler; +import java.net.CookieManager; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.methods.PostMethod; +import org.apache.commons.io.IOUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class HttpFormAuthentication { + private static final Logger LOGGER = LoggerFactory + .getLogger(HttpFormAuthentication.class); + private static Map<String, String> defaultLoginHeaders = new HashMap<String, String>(); + + static { + defaultLoginHeaders.put("User-Agent", "Mozilla/5.0"); + defaultLoginHeaders + .put("Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); + defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5"); + defaultLoginHeaders.put("Connection", "keep-alive"); + defaultLoginHeaders.put("Content-Type", + "application/x-www-form-urlencoded"); + } + + private HttpClient client; + private HttpFormAuthConfigurer authConfigurer = new 
HttpFormAuthConfigurer(); + private String cookies; + + public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer, + HttpClient client, Http http) { + this.authConfigurer = authConfigurer; + this.client = client; + defaultLoginHeaders.put("Accept", http.getAccept()); + defaultLoginHeaders.put("Accept-Language", http.getAcceptLanguage()); + defaultLoginHeaders.put("User-Agent", http.getUserAgent()); + } + + public HttpFormAuthentication(String loginUrl, String loginForm, + Map<String, String> loginPostData, + Map<String, String> additionalPostHeaders, + Set<String> removedFormFields) { + this.authConfigurer.setLoginUrl(loginUrl); + this.authConfigurer.setLoginFormId(loginForm); + this.authConfigurer + .setLoginPostData(loginPostData == null ? new HashMap<String, String>() + : loginPostData); + this.authConfigurer + .setAdditionalPostHeaders(additionalPostHeaders == null ? new HashMap<String, String>() + : additionalPostHeaders); + this.authConfigurer + .setRemovedFormFields(removedFormFields == null ? 
new HashSet<String>() + : removedFormFields); + this.client = new HttpClient(); + } + + public void login() throws Exception { + // make sure cookies are turned on + CookieHandler.setDefault(new CookieManager()); + String pageContent = httpGetPageContent(authConfigurer.getLoginUrl()); + List<NameValuePair> params = getLoginFormParams(pageContent); + sendPost(authConfigurer.getLoginUrl(), params); + } + + private void sendPost(String url, List<NameValuePair> params) + throws Exception { + PostMethod post = null; + try { + if (authConfigurer.isLoginRedirect()) { + post = new PostMethod(url) { + @Override + public boolean getFollowRedirects() { + return true; + } + }; + } else { + post = new PostMethod(url); + } + // we can't use post.setFollowRedirects(true) as it will throw + // IllegalArgumentException: + // Entity enclosing requests cannot be redirected without user + // intervention + setLoginHeader(post); + post.addParameters(params.toArray(new NameValuePair[0])); + int rspCode = client.executeMethod(post); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("rspCode: " + rspCode); + LOGGER.debug("\nSending 'POST' request to URL : " + url); + + LOGGER.debug("Post parameters : " + params); + LOGGER.debug("Response Code : " + rspCode); + for (Header header : post.getRequestHeaders()) { + LOGGER.debug("Response headers : " + header); + } + } + String rst = IOUtils.toString(post.getResponseBodyAsStream()); + LOGGER.debug("login post result: " + rst); + } finally { + if (post != null) { + post.releaseConnection(); + } + } + } + + private void setLoginHeader(PostMethod post) { + Map<String, String> headers = new HashMap<String, String>(); + headers.putAll(defaultLoginHeaders); + // additionalPostHeaders can overwrite value in defaultLoginHeaders + headers.putAll(authConfigurer.getAdditionalPostHeaders()); + for (Entry<String, String> entry : headers.entrySet()) { + post.addRequestHeader(entry.getKey(), entry.getValue()); + } + post.addRequestHeader("Cookie", 
getCookies()); + } + + private String httpGetPageContent(String url) throws IOException { + + GetMethod get = new GetMethod(url); + try { + for (Entry<String, String> entry : authConfigurer + .getAdditionalPostHeaders().entrySet()) { + get.addRequestHeader(entry.getKey(), entry.getValue()); + } + client.executeMethod(get); + Header cookieHeader = get.getResponseHeader("Set-Cookie"); + if (cookieHeader != null) { + setCookies(cookieHeader.getValue()); + } + String rst = IOUtils.toString(get.getResponseBodyAsStream()); + return rst; + } finally { + get.releaseConnection(); + } + + } + + private List<NameValuePair> getLoginFormParams(String pageContent) + throws UnsupportedEncodingException { + List<NameValuePair> params = new ArrayList<NameValuePair>(); + Document doc = Jsoup.parse(pageContent); + Element loginform = doc.getElementById(authConfigurer.getLoginFormId()); + if (loginform == null) { + LOGGER.debug("No form element found with 'id' = {}, trying 'name'.", + authConfigurer.getLoginFormId()); + loginform = doc.select("form[name="+ authConfigurer.getLoginFormId() + "]").first(); + if (loginform == null) { + LOGGER.debug("No form element found with 'name' = {}", + authConfigurer.getLoginFormId()); + throw new IllegalArgumentException("No form exists: " + + authConfigurer.getLoginFormId()); + } + } + Elements inputElements = loginform.getElementsByTag("input"); + // skip fields in removedFormFields or loginPostData + for (Element inputElement : inputElements) { + String key = inputElement.attr("name"); + String value = inputElement.attr("value"); + if (authConfigurer.getLoginPostData().containsKey(key) + || authConfigurer.getRemovedFormFields().contains(key)) { + // value = loginPostData.get(key); + continue; + } + params.add(new NameValuePair(key, value)); + } + // add key and value in loginPostData + for (Entry<String, String> entry : authConfigurer.getLoginPostData() + .entrySet()) { + params.add(new NameValuePair(entry.getKey(), entry.getValue())); + } + 
return params; + } + + public String getCookies() { + return cookies; + } + + public void setCookies(String cookies) { + this.cookies = cookies; + } + + public boolean isRedirect() { + return authConfigurer.isLoginRedirect(); + } + + public void setRedirect(boolean redirect) { + this.authConfigurer.setLoginRedirect(redirect); + } + +}