Greetings!
I am trying to fetch the complete page source, like the one a web
browser shows. However, the size of the file produced by HttpClient
changes from run to run, and it is missing quite a lot of lines compared
to the browser version. I am using an Amazon URL as an example since its
page is always super-sized.
I am really puzzled by this. What could be the cause: the user agent,
the buffer size, or something else? Thanks.
Here is the source code:
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;
import org.apache.http.impl.client.DefaultHttpClient;
/**
 * Downloads the body of a web page with Apache HttpClient and stores it,
 * collapsed onto a single line, in a local {@code .html} file.
 */
public class RawHttpWebPageFetcher {

    /** Charset used both to decode the response body and to encode the output file. */
    private static final String CHARSET = "UTF-8";

    /**
     * Fetches {@code url} with an HTTP GET and returns the response body with
     * every line-feed and carriage-return character removed.
     *
     * @param url the address to request
     * @return the body decoded as UTF-8 without {@code '\n'} / {@code '\r'};
     *         an empty string when the response carries no entity
     * @throws ClientProtocolException if HttpClient reports an HTTP protocol error
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String getRaw(String url) throws ClientProtocolException, IOException {
        HttpClient httpclient = new DefaultHttpClient();
        try {
            HttpGet httpget = new HttpGet(url);
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            StringBuilder sb = new StringBuilder();
            if (entity != null) {
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(entity.getContent(), CHARSET));
                try {
                    int next;
                    while ((next = reader.read()) != -1) {
                        char ch = (char) next;
                        if (ch != '\n' && ch != '\r') {
                            sb.append(ch);
                        }
                    }
                } finally {
                    // Close even when read() fails so the content stream is not leaked.
                    reader.close();
                }
            }
            return sb.toString();
        } finally {
            // The client is created per call, so release its connections here.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Fetches a fixed Amazon product page and writes it to a file whose name
     * is derived from the URL, with {@code .html} appended.
     *
     * @param args ignored
     * @throws ClientProtocolException if HttpClient reports an HTTP protocol error
     * @throws IOException if the download fails or the file cannot be written
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        String url =
                "http://www.amazon.com/Nikon-D3100-Digital-18-55mm-3-5-5-6/dp/B003ZYF3LO/ref=zg_bs_281052_3";
        String oneLiner = RawHttpWebPageFetcher.getRaw(url);
        String outputFileName = toFileNameStem(url);
        String outputPath = outputFileName.concat(".html");
        // Write with the same charset the page was decoded with; FileWriter
        // would silently use the platform default instead.
        PrintWriter out = new PrintWriter(outputPath, CHARSET);
        try {
            System.out.println(outputFileName);
            out.print(oneLiner);
            // PrintWriter never throws on write failure; it only sets an error flag.
            if (out.checkError()) {
                throw new IOException("Failed to write " + outputPath);
            }
        } finally {
            out.close();
        }
    }

    /**
     * Turns a URL into a file-name stem: drops everything up to and including
     * the first {@code "://"} (if present) and replaces each {@code '/'} and
     * {@code '.'} with {@code '-'}.
     */
    private static String toFileNameStem(String url) {
        int schemeEnd = url.indexOf("://");
        String remainder = schemeEnd >= 0 ? url.substring(schemeEnd + 3) : url;
        return remainder.replace('/', '-').replace('.', '-');
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]