Author: kwright
Date: Thu Feb 14 13:03:24 2013
New Revision: 1446153
URL: http://svn.apache.org/r1446153
Log:
Revisit the fix for CONNECTORS-623. Override the HttpSolrServer class instead,
and force multipart post, in order to guarantee delivery of content type, name,
and length to SolrCell.
Added:
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
(with props)
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1446153&r1=1446152&r2=1446153&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Feb 14 13:03:24 2013
@@ -3,6 +3,11 @@ $Id$
======================= 1.2-dev =====================
+CONNECTORS-623: A better fix, overriding the posting method for
+SolrJ and forcing multipart post. Although still a hack, this should cause
+the behavior to work like MCF 1.0.1 now.
+(Karl Wright)
+
CONNECTORS-645: Handle null data properly in worker thread.
(Maciej Liżewski, Karl Wright)
Modified:
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1446153&r1=1446152&r2=1446153&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
(original)
+++
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
Thu Feb 14 13:03:24 2013
@@ -243,9 +243,7 @@ public class HttpPoster
}
String httpSolrServerUrl = protocol + "://" + server + ":" + port +
location;
- HttpSolrServer httpSolrServer = new HttpSolrServer(httpSolrServerUrl,
localClient);
- // For portability with older versions of Solr
- httpSolrServer.setParser(new XMLResponseParser());
+ HttpSolrServer httpSolrServer = new
ModifiedHttpSolrServer(httpSolrServerUrl, localClient, new XMLResponseParser());
// Set the solrj instance we want to use
solrServer = httpSolrServer;
}
@@ -812,9 +810,10 @@ public class HttpPoster
writeField(out,LITERAL+newFieldName,values);
}
}
-
- writeField(out,LITERAL+"stream_size",String.valueOf(length));
- writeField(out,LITERAL+"stream_name",document.getFileName());
+
+ // These are unnecessary now in the case of non-solrcloud setups,
because we overrode the SolrJ posting method to use multipart.
+ //writeField(out,LITERAL+"stream_size",String.valueOf(length));
+ //writeField(out,LITERAL+"stream_name",document.getFileName());
// Write the commitWithin parameter
if (commitWithin != null)
Added:
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java?rev=1446153&view=auto
==============================================================================
---
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
(added)
+++
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
Thu Feb 14 13:03:24 2013
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.manifoldcf.agents.output.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.ConnectException;
+import java.net.SocketTimeoutException;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.http.Header;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.NameValuePair;
+import org.apache.http.NoHttpResponseException;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.entity.UrlEncodedFormEntity;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.client.methods.HttpRequestBase;
+import org.apache.http.client.params.ClientPNames;
+import org.apache.http.conn.ClientConnectionManager;
+import org.apache.http.entity.InputStreamEntity;
+import org.apache.http.entity.mime.FormBodyPart;
+import org.apache.http.entity.mime.HttpMultipartMode;
+import org.apache.http.entity.mime.MultipartEntity;
+import org.apache.http.entity.mime.content.InputStreamBody;
+import org.apache.http.entity.mime.content.StringBody;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.message.BasicHeader;
+import org.apache.http.message.BasicNameValuePair;
+import org.apache.http.util.EntityUtils;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.client.solrj.ResponseParser;
+import org.apache.solr.client.solrj.SolrRequest;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.request.RequestWriter;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.client.solrj.response.UpdateResponse;
+import org.apache.solr.client.solrj.util.ClientUtils;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.NamedList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/** This class overrides and somewhat changes the behavior of the
+* SolrJ HttpSolrServer class. The point of all this is simply to get
+* the right information to Tika. When SolrJ uses GET or POST but not
+* multipart-post, it does not include multipart headers that Tika uses -
+* specifically, the name of the document and the length of the document.
+* Patches have been submitted to the SOLR ticket queue to address this
+* problem in a method-insensitive way, but so far there has been no sign that
+* the Solr team is interested in committing them.
+*
+* The request() override below works from this class's own private copies of
+* a few settings. Of these, only followRedirects can be changed (through
+* setFollowRedirects); maxRetries and useMultiPartPost keep their initial
+* values for the life of the instance.
+*/
+public class ModifiedHttpSolrServer extends HttpSolrServer
+{
+ // Here we duplicate all the private fields we need
+
+ // Charset name used when decoding the fallback error message in request().
+ private static final String UTF_8 = "UTF-8";
+ // Path used when the request writer reports no path, or one not starting with "/".
+ private static final String DEFAULT_PATH = "/select";
+
+ // The client handed to the constructor; request() executes the HTTP call on it.
+ private final HttpClient httpClient;
+ // Local copy of the redirect setting; kept in step by setFollowRedirects().
+ private boolean followRedirects = false;
+ // Never modified in this class, so request() makes exactly one pass through its loop.
+ private int maxRetries = 0;
+ // Never modified in this class, so request parameters are always sent as multipart parts
+ // when they travel in a POST body.
+ private boolean useMultiPartPost = true;
+
+ /** Construct the server wrapper.
+ *@param baseURL is passed through to the HttpSolrServer constructor.
+ *@param client is passed through to the HttpSolrServer constructor, and is also
+ * retained locally so that request() can execute requests on it.
+ *@param parser is passed through to the HttpSolrServer constructor.
+ */
+ public ModifiedHttpSolrServer(String baseURL, HttpClient client,
ResponseParser parser) {
+ super(baseURL, client, parser);
+ httpClient = client;
+ }
+
+ /** Build an HTTP GET or POST from the given SolrJ request, execute it on the
+ * retained HttpClient, and return the response.
+ *@param request supplies the HTTP method, the parameters, and (through the
+ * request writer) the path and content streams.
+ *@param processor parses the response body. When null, the raw response stream is
+ * returned under the key "stream" and is left open for the caller.
+ *@return the parsed response, or a list holding only the raw response stream.
+ *@throws SolrServerException for an unsupported request method, for a redirect
+ * response when followRedirects is false, and as a wrapper around any IOException
+ * raised while building or executing the request.
+ *@throws IOException is declared by the signature; IOExceptions raised inside this
+ * body are caught and rethrown as SolrServerException.
+ */
+ @Override
+ public NamedList<Object> request(final SolrRequest request,
+ final ResponseParser processor) throws SolrServerException, IOException {
+ HttpRequestBase method = null;
+ // Note: 'is' is never assigned anywhere in this method, so the close() call in the
+ // NoHttpResponseException handler below never actually runs.
+ InputStream is = null;
+ SolrParams params = request.getParams();
+ Collection<ContentStream> streams =
requestWriter.getContentStreams(request);
+ String path = requestWriter.getPath(request);
+ if (path == null || !path.startsWith("/")) {
+ path = DEFAULT_PATH;
+ }
+
+ // This parser is used only to fill in the wt/version request parameters below;
+ // the response itself is parsed with the 'processor' argument.
+ ResponseParser parser = request.getResponseParser();
+ if (parser == null) {
+ parser = this.parser;
+ }
+
+ // The parser 'wt=' and 'version=' params are used instead of the original
+ // params
+ ModifiableSolrParams wparams = new ModifiableSolrParams(params);
+ if (parser != null) {
+ wparams.set(CommonParams.WT, parser.getWriterType());
+ wparams.set(CommonParams.VERSION, parser.getVersion());
+ }
+ if (invariantParams != null) {
+ wparams.add(invariantParams);
+ }
+ params = wparams;
+
+ // This loop only constructs the request object; the request is executed after the
+ // loop has finished.
+ int tries = maxRetries + 1;
+ try {
+ while( tries-- > 0 ) {
+ // Note: since we aren't doing intermittent time keeping
+ // ourselves, the potential non-timeout latency could be as
+ // much as tries-times (plus scheduling effects) the given
+ // timeAllowed.
+ try {
+ if( SolrRequest.METHOD.GET == request.getMethod() ) {
+ if( streams != null ) {
+ throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"GET can't send streams!" );
+ }
+ method = new HttpGet( baseUrl + path + ClientUtils.toQueryString(
params, false ) );
+ }
+ else if( SolrRequest.METHOD.POST == request.getMethod() ) {
+
+ String url = baseUrl + path;
+ // Only two or more streams cause the streams themselves to be sent as multipart parts.
+ boolean isMultipart = ( streams != null && streams.size() > 1 );
+
+ LinkedList<NameValuePair> postParams = new
LinkedList<NameValuePair>();
+ // No streams at all, or several of them: the parameters travel in the POST body.
+ // NOTE(review): a non-null but empty stream collection does not match this test and
+ // falls into the single-stream branch further down, where contentStream[0] stays null.
+ if (streams == null || isMultipart) {
+ HttpPost post = new HttpPost(url);
+ post.setHeader("Content-Charset", "UTF-8");
+ // useMultiPartPost is always true in this class, so this header is never added.
+ if (!this.useMultiPartPost && !isMultipart) {
+ post.addHeader("Content-Type",
+ "application/x-www-form-urlencoded; charset=UTF-8");
+ }
+
+ // Turn every parameter value into either a multipart string part or a form pair.
+ List<FormBodyPart> parts = new LinkedList<FormBodyPart>();
+ Iterator<String> iter = params.getParameterNamesIterator();
+ while (iter.hasNext()) {
+ String p = iter.next();
+ String[] vals = params.getParams(p);
+ if (vals != null) {
+ for (String v : vals) {
+ if (this.useMultiPartPost || isMultipart) {
+ parts.add(new FormBodyPart(p, new StringBody(v,
Charset.forName("UTF-8"))));
+ } else {
+ postParams.add(new BasicNameValuePair(p, v));
+ }
+ }
+ }
+ }
+
+ // Each content stream becomes its own part, named after the stream and carrying
+ // the stream's content type (or a generic default when it has none).
+ if (isMultipart) {
+ for (ContentStream content : streams) {
+ String contentType = content.getContentType();
+ if(contentType==null) {
+ contentType = "application/octet-stream"; // default
+ }
+ parts.add(new FormBodyPart(content.getName(),
+ new InputStreamBody(
+ content.getStream(),
+ contentType,
+ content.getName())));
+ }
+ }
+
+ if (parts.size() > 0) {
+ MultipartEntity entity = new
MultipartEntity(HttpMultipartMode.STRICT);
+ for(FormBodyPart p: parts) {
+ entity.addPart(p);
+ }
+ post.setEntity(entity);
+ } else {
+ //not using multipart
+ post.setEntity(new UrlEncodedFormEntity(postParams, "UTF-8"));
+ }
+
+ method = post;
+ }
+ // If it has one stream, it is the post body, put the params in
the URL
+ else {
+ String pstr = ClientUtils.toQueryString(params, false);
+ HttpPost post = new HttpPost(url + pstr);
+
+ // Single stream as body
+ // Using a loop just to get the first one
+ final ContentStream[] contentStream = new ContentStream[1];
+ for (ContentStream content : streams) {
+ contentStream[0] = content;
+ break;
+ }
+ // Both branches of this test build the same anonymous entity: unknown length (-1),
+ // content type taken from the stream, and not repeatable.
+ if (contentStream[0] instanceof RequestWriter.LazyContentStream)
{
+ post.setEntity(new
InputStreamEntity(contentStream[0].getStream(), -1) {
+ @Override
+ public Header getContentType() {
+ return new BasicHeader("Content-Type",
contentStream[0].getContentType());
+ }
+
+ @Override
+ public boolean isRepeatable() {
+ return false;
+ }
+
+ });
+ } else {
+ post.setEntity(new
InputStreamEntity(contentStream[0].getStream(), -1) {
+ @Override
+ public Header getContentType() {
+ return new BasicHeader("Content-Type",
contentStream[0].getContentType());
+ }
+
+ @Override
+ public boolean isRepeatable() {
+ return false;
+ }
+ });
+ }
+ method = post;
+ }
+ }
+ else {
+ throw new SolrServerException("Unsupported method:
"+request.getMethod() );
+ }
+ }
+ catch( NoHttpResponseException r ) {
+ method = null;
+ if(is != null) {
+ is.close();
+ }
+ // If out of tries then just rethrow (as normal error).
+ if (tries < 1) {
+ throw r;
+ }
+ }
+ }
+ } catch (IOException ex) {
+ throw new SolrServerException("error reading streams", ex);
+ }
+
+ // XXX client already has this set, is this needed?
+ method.getParams().setParameter(ClientPNames.HANDLE_REDIRECTS,
+ followRedirects);
+ method.addHeader("User-Agent", AGENT);
+
+ InputStream respBody = null;
+ // Cleared only when the raw stream is handed back to the caller (processor == null).
+ boolean shouldClose = true;
+
+ try {
+ // Execute the method.
+ final HttpResponse response = httpClient.execute(method);
+ int httpStatus = response.getStatusLine().getStatusCode();
+
+ // Read the contents
+ respBody = response.getEntity().getContent();
+
+ // handle some http level checks before trying to parse the response
+ // 200, 400 and 409 go on to response parsing; 301/302 go on only when followRedirects
+ // is set; every other status is thrown as a SolrException straight away.
+ switch (httpStatus) {
+ case HttpStatus.SC_OK:
+ case HttpStatus.SC_BAD_REQUEST:
+ case HttpStatus.SC_CONFLICT: // 409
+ break;
+ case HttpStatus.SC_MOVED_PERMANENTLY:
+ case HttpStatus.SC_MOVED_TEMPORARILY:
+ if (!followRedirects) {
+ throw new SolrServerException("Server at " + getBaseURL()
+ + " sent back a redirect (" + httpStatus + ").");
+ }
+ break;
+ default:
+ throw new
SolrException(SolrException.ErrorCode.getErrorCode(httpStatus), "Server at " +
getBaseURL()
+ + " returned non ok status:" + httpStatus + ", message:"
+ + response.getStatusLine().getReasonPhrase());
+
+ }
+ if (processor == null) {
+ // no processor specified, return raw stream
+ NamedList<Object> rsp = new NamedList<Object>();
+ rsp.add("stream", respBody);
+ // Only case where stream should not be closed
+ shouldClose = false;
+ return rsp;
+ }
+ String charset = EntityUtils.getContentCharSet(response.getEntity());
+ NamedList<Object> rsp = processor.processResponse(respBody, charset);
+ // A parsed response with a non-200 status is still reported as an error. The message
+ // comes from the response's "error"/"msg" entry when present, otherwise from the
+ // HTTP reason phrase plus the request URI.
+ if (httpStatus != HttpStatus.SC_OK) {
+ String reason = null;
+ // Any failure while extracting the message is deliberately ignored; the fallback
+ // reason built below is used instead.
+ try {
+ NamedList err = (NamedList) rsp.get("error");
+ if (err != null) {
+ reason = (String) err.get("msg");
+ // TODO? get the trace?
+ }
+ } catch (Exception ex) {}
+ if (reason == null) {
+ StringBuilder msg = new StringBuilder();
+ msg.append(response.getStatusLine().getReasonPhrase());
+ msg.append("\n\n");
+ msg.append("request: " + method.getURI());
+ reason = java.net.URLDecoder.decode(msg.toString(), UTF_8);
+ }
+ throw new SolrException(
+ SolrException.ErrorCode.getErrorCode(httpStatus), reason);
+ }
+ return rsp;
+ } catch (ConnectException e) {
+ throw new SolrServerException("Server refused connection at: "
+ + getBaseURL(), e);
+ } catch (SocketTimeoutException e) {
+ throw new SolrServerException(
+ "Timeout occured while waiting response from server at: "
+ + getBaseURL(), e);
+ } catch (IOException e) {
+ throw new SolrServerException(
+ "IOException occured when talking to server at: " + getBaseURL(), e);
+ } finally {
+ // Close the response body unless it was returned to the caller as a raw stream.
+ if (respBody != null && shouldClose) {
+ try {
+ respBody.close();
+ } catch (Throwable t) {} // ignore
+ }
+ }
+ }
+
+ /** Set whether redirects are followed.
+ * The value is passed to the superclass and also stored in this class's own
+ * field, which is the copy that request() reads.
+ *@param followRedirects is the new setting.
+ */
+ @Override
+ public void setFollowRedirects(boolean followRedirects) {
+ super.setFollowRedirects(followRedirects);
+ this.followRedirects = followRedirects;
+ }
+
+}
Propchange:
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ModifiedHttpSolrServer.java
------------------------------------------------------------------------------
svn:keywords = Id