Java 获取网页源码(自动识别编码和网址跳转)

/**
 * Connection timeout, loaded from the "timeOut" config entry.
 * NOTE(review): presumably milliseconds, since the value is handed to HttpConnectionParams - confirm.
 */
private static final int timeOut = Config.getInt("timeOut");

/** Number of retries after a failed fetch, loaded from the "retryTimes" config entry. */
private static final int retryCount = Config.getInt("retryTimes");

/** Whether proxy IPs are used, loaded from the "useproxIp" config entry. */
private static final boolean useProIp = Config.getBoolean("useproxIp");

final static DefaultHttpClient client = new DefaultHttpClient();
private static final Pattern p_charset = https://www.360docs.net/doc/7c16441159.html,pile("charset\\s?=\\s?([a-zA-Z0-9\\-]+)");
private static final Pattern p_encoding = https://www.360docs.net/doc/7c16441159.html,pile("encoding=\"([a-zA-Z0-9\\-]+)\"");
private static final Pattern P_PINCODE = https://www.360docs.net/doc/7c16441159.html,pile("pincode");

public static synchronized String getContent(String url) throws Exception {

int trycount = 0;
URL target = new URL(url);

HttpGet httpGet = null;
String content = "";
while ((content.length() == 0) && (trycount try {
httpGet = new HttpGet(url);
httpGet.addHeader("Accept-Language", "zh-cn,zh;q=0.5");
httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
httpGet.addHeader("User-Agent", UA);
httpGet.addHeader("Host", target.getHost());
httpGet.addHeader("Referer", "http://" + target.getHost() + "/");

/**设置连接超时时间*/
HttpConnectionParams.setConnectionTimeout(httpGet.getParams(), timeOut);
HttpConnectionParams.setSoTimeout(httpGet.getParams(), timeOut);


HttpResponse httpResponse = client.execute(httpGet);

int statusCode = httpResponse.getStatusLine().getStatusCode(); //查看响应状态

if (statusCode== 200) {
HttpEntity entity = httpResponse.getEntity();
content = entityToString(entity);

/**url跳转*/
}else if ((statusCode == HttpStatus.SC_MOVED_TEMPORARILY) ||(statusCode == HttpStatus.SC_MOVED_PERMANENTLY) ||(statusCode == HttpStatus.SC_SEE_OTHER) ||(statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {
Header header = httpResponse.getLastHeader("location");
if(header!=null){
httpGet.abort();
return getContent(header.getValue());
}
}
}
catch (Exception e)
{
MyPrint.printError(url,e);
} finally {
if ((content.length() == 0)&&useProIp) {
MyIpPool.getSingle().setNextIp();
}
httpGet.abort();
trycount++;
}
}
return content;
}


private static String entityToString(HttpEntity entity){

String charset = EntityUtils.getContentCharSet(entity);
try
{
byte[] bytes = EntityUtils.toByteArray(entity);

if (charset == null){

Matcher m = p_charset.matcher(new String(bytes));
if (m.find()) {
charset = m.group(1).trim();
if ("GB2312".equalsIgnoreCase(charset)){
charset = "GBK";
}
}else {
m = p_encoding.matcher(new String(bytes));
if (m.find()) {
charset = m.group(1).trim();
if ("GB2312".equalsIgnoreCase(charset)) {
charset = "GBK";
}
}
}
}

charset = charset =

= null ? "GBK" : charset;

String content = new String(bytes, charset);

Matcher matcher = P_PINCODE.matcher(content);
if (matcher.find()) {
return "";
}
return content;
}
catch (Exception e){
e.printStackTrace();
} finally {
try {
entity.getContent().close();
} catch (IOException e) {
e.printStackTrace();
}
}
return "";
}

相关文档
最新文档