Java 第三方库 HttpClient 可以通过 http、https、socks5 代理爬取数据,代理既可以是无密码代理,也可以是需要用户名密码授权的代理。

若想获得免费代理IP,或更多详细的代理使用示例,请见米扑代理示例代码:https://proxy.mimvp.com/demo2.php

 

HttpClient 下载

HttpClient 是 Apache 的第三方库,需要先下载并引入 jar 包才能使用。

HttpClient 源码和jar包下载:http://hc.apache.org/downloads.cgi

 

 

HttpClient 爬取网页

/**
 * Fetches a proxy list from the Mimvp proxy API and prints it, one entry per line.
 *
 * <p>On an HTTP 200 response the body is split on {@code "\n"} and every piece is
 * written to standard output; other status codes print nothing. Any exception is
 * caught and its {@code toString()} is printed to standard output.
 */
public static void spider_proxy() {
	String proxy_url = "https://proxyapi.mimvp.com/api/fetchsecret.php?orderid=868435221231212345&http_type=3";

	// try-with-resources closes the client even when execute() or the body read throws.
	// The deprecated DefaultHttpClient instance that used to be created here was never
	// used or closed, so it has been removed.
	try (CloseableHttpClient client = HttpClients.createDefault()) {
		HttpGet request = new HttpGet(proxy_url);
		HttpResponse response = client.execute(request);

		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
			String result = EntityUtils.toString(response.getEntity());
			for (String proxy : result.split("\n")) {
				System.out.println(proxy);
			}
		}
	} catch (Exception e) {
		System.out.println(e.toString());
	}
}

 

 

HttpClient 使用无密码代理爬取网页

/**
 * Method 1: fetches a page through an unauthenticated HTTP/HTTPS proxy by passing the
 * proxy host directly to {@code client.execute(proxy, request)}.
 *
 * <p>Prints the three arguments first, then the response body when the status is 200.
 * Exceptions raised while talking to the network are caught and printed to standard
 * output; a malformed {@code proxyIpPort} still fails before the request is attempted.
 *
 * @param proxyType   proxy scheme label; only echoed to standard output here
 * @param proxyIpPort proxy address in {@code "host:port"} form
 * @param webUrl      URL to request
 */
public static void proxy_no_auth(String proxyType, String proxyIpPort, String webUrl) {
	System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl);
	String[] hostAndPort = proxyIpPort.split(":");
	String proxy_ip = hostAndPort[0];
	int proxy_port = Integer.parseInt(hostAndPort[1]);

	// try-with-resources guarantees the client is closed even if execute() throws.
	try (CloseableHttpClient client = HttpClients.createDefault()) {
		HttpHost proxy = new HttpHost(proxy_ip, proxy_port);

		HttpGet request = new HttpGet(webUrl);
		request.addHeader("Host","proxy.mimvp.com");
		request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");

		HttpResponse response = client.execute(proxy, request);
		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
			String result = EntityUtils.toString(response.getEntity());
			System.out.println(result);
		}
	} catch (Exception e) {
		System.out.println(e.toString());
	}
}


/**
 * Method 2: fetches a page through an unauthenticated HTTP proxy configured via
 * {@link RequestConfig#custom()}{@code .setProxy(...)}.
 *
 * <p>The request is sent to a target host derived from {@code webUrl}; when
 * {@code proxyType} is {@code "https"} the target is forced to port 443 over https.
 * Prints the three arguments first, then the response body when the status is 200.
 * Exceptions raised inside the request are caught and printed to standard output.
 *
 * @param proxyType   proxy scheme label; {@code "https"} switches the target to https:443
 * @param proxyIpPort proxy address in {@code "host:port"} form
 * @param webUrl      absolute URL to request
 */
public static void proxy_no_auth2(String proxyType, String proxyIpPort, String webUrl) {
	System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl);
	String[] hostAndPort = proxyIpPort.split(":");
	String proxy_ip = hostAndPort[0];
	int proxy_port = Integer.parseInt(hostAndPort[1]);

	// try-with-resources guarantees the client is closed even if execute() throws.
	try (CloseableHttpClient client = HttpClientBuilder.create().build()) {
		URL url = new URL(webUrl);

		// Honour an explicit port in the URL; fall back to the scheme default otherwise.
		int targetPort = (url.getPort() != -1) ? url.getPort() : url.getDefaultPort();
		HttpHost target = new HttpHost(url.getHost(), targetPort, url.getProtocol());
		if ("https".equals(proxyType)) {
			target = new HttpHost(url.getHost(), 443, "https");
		}
		HttpHost proxy = new HttpHost(proxy_ip, proxy_port);

		// getFile() keeps the query string that getPath() would drop; an empty path means "/".
		String requestPath = url.getFile();
		if (requestPath.isEmpty()) {
			requestPath = "/";
		}

		RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
		HttpGet request = new HttpGet(requestPath);
		request.setConfig(config);
		request.addHeader("Host","proxy.mimvp.com");
		request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");

		HttpResponse response = client.execute(target, request);
		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
			String result = EntityUtils.toString(response.getEntity());
			System.out.println(result);
		}
	} catch (Exception e) {
		System.out.println(e.toString());
	}
}


/**
 * Method 3: fetches a page through an unauthenticated HTTP proxy installed on the
 * client via a {@link DefaultProxyRoutePlanner}.
 *
 * <p>Prints the three arguments first, then the response body when the status is 200.
 * Exceptions raised inside the request are caught and printed to standard output.
 *
 * @param proxyType   proxy scheme label; only echoed to standard output here
 * @param proxyIpPort proxy address in {@code "host:port"} form
 * @param webUrl      URL to request
 */
public static void proxy_no_auth3(String proxyType, String proxyIpPort, String webUrl) {
	System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl);
	String[] hostAndPort = proxyIpPort.split(":");
	String proxy_ip = hostAndPort[0];
	int proxy_port = Integer.parseInt(hostAndPort[1]);

	HttpHost proxy = new HttpHost(proxy_ip, proxy_port);
	DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);

	// try-with-resources guarantees the client is closed even if execute() throws.
	try (CloseableHttpClient client = HttpClients.custom().setRoutePlanner(routePlanner).build()) {
		HttpGet request = new HttpGet(webUrl);
		request.addHeader("Host","proxy.mimvp.com");
		request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");

		HttpResponse response = client.execute(request);
		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
			String result = EntityUtils.toString(response.getEntity());
			System.out.println(result);
		}
	} catch (Exception e) {
		System.out.println(e.toString());
	}
}

 

 

HttpClient 使用 socks5 代理和密码授权代理爬取网页

// Plain-HTTP socket factory that lets HttpClient connect through a SOCKS5 proxy.
static class MyConnectionSocketFactory extends PlainConnectionSocketFactory {
    /**
     * Creates an unconnected socket that goes through the SOCKS proxy whose address is
     * stored in the context under the {@code "socks.address"} attribute.
     */
    @Override
    public Socket createSocket(final HttpContext context) throws IOException {
        final Object socksAttribute = context.getAttribute("socks.address");
        final Proxy socksProxy = new Proxy(Proxy.Type.SOCKS, (InetSocketAddress) socksAttribute);
        return new Socket(socksProxy);
    }

    /**
     * Connects using an unresolved address built from the target host name and the
     * port of {@code remoteAddress}, instead of the resolved {@code remoteAddress}.
     */
    @Override
    public Socket connectSocket(int connectTimeout, Socket socket, HttpHost host, InetSocketAddress remoteAddress,
            InetSocketAddress localAddress, HttpContext context) throws IOException {
        // Presumably this leaves name resolution to the SOCKS proxy; verify against the proxy setup.
        final String targetName = host.getHostName();
        final int targetPort = remoteAddress.getPort();
        return super.connectSocket(connectTimeout, socket, host,
                InetSocketAddress.createUnresolved(targetName, targetPort), localAddress, context);
    }
}

// HTTPS socket factory that lets HttpClient connect through a SOCKS5 proxy.
static class MySSLConnectionSocketFactory extends SSLConnectionSocketFactory {
    /**
     * Builds the factory from the given SSL context using the single-argument super
     * constructor. A variant passing ALLOW_ALL_HOSTNAME_VERIFIER was left disabled in
     * the original code and is not used.
     */
    public MySSLConnectionSocketFactory(final SSLContext sslContext) {
        super(sslContext);
    }

    /**
     * Creates an unconnected socket that goes through the SOCKS proxy whose address is
     * stored in the context under the {@code "socks.address"} attribute.
     */
    @Override
    public Socket createSocket(final HttpContext context) throws IOException {
        final Object socksAttribute = context.getAttribute("socks.address");
        final Proxy socksProxy = new Proxy(Proxy.Type.SOCKS, (InetSocketAddress) socksAttribute);
        return new Socket(socksProxy);
    }

    /**
     * Connects using an unresolved address built from the target host name and the
     * port of {@code remoteAddress}, instead of the resolved {@code remoteAddress}.
     */
    @Override
    public Socket connectSocket(int connectTimeout, Socket socket, HttpHost host, InetSocketAddress remoteAddress,
            InetSocketAddress localAddress, HttpContext context) throws IOException {
        // Presumably this leaves name resolution to the SOCKS proxy; verify against the proxy setup.
        final String targetName = host.getHostName();
        final int targetPort = remoteAddress.getPort();
        return super.connectSocket(connectTimeout, socket, host,
                InetSocketAddress.createUnresolved(targetName, targetPort), localAddress, context);
    }
}

// DNS resolver stub that answers every lookup with the same fixed address.
static class FakeDnsResolver implements DnsResolver {
    /**
     * Ignores {@code host} and returns a one-element array holding the address 1.1.1.1.
     * The returned record is a placeholder that is not meant to be used.
     */
    @Override
    public InetAddress[] resolve(String host) throws UnknownHostException {
        final byte[] placeholder = { 1, 1, 1, 1 };
        final InetAddress[] records = new InetAddress[1];
        records[0] = InetAddress.getByAddress(placeholder);
        return records;
    }
}

/**
 * Fetches a page through an unauthenticated SOCKS5 proxy.
 *
 * <p>The client is built on a connection manager that uses the SOCKS-aware socket
 * factories and {@code FakeDnsResolver}; the proxy address is handed to those
 * factories through the {@code "socks.address"} context attribute. Prints the three
 * arguments first, then the response body when the status is 200. Exceptions raised
 * inside the request are caught and printed to standard output.
 *
 * @param proxyType   proxy scheme label; only echoed to standard output here
 * @param proxyIpPort proxy address in {@code "host:port"} form
 * @param webUrl      URL to request
 */
public static void proxy_no_auth_socks(String proxyType, String proxyIpPort, String webUrl) {
	System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl);
	String[] hostAndPort = proxyIpPort.split(":");
	String proxy_ip = hostAndPort[0];
	int proxy_port = Integer.parseInt(hostAndPort[1]);

	try {
		Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory> create()
				.register("http", new MyConnectionSocketFactory())
				.register("https", new MySSLConnectionSocketFactory(SSLContexts.createSystemDefault()))
				.build();
		PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(reg, new FakeDnsResolver());

		// The client was previously never closed; closing it also shuts down the pooling manager.
		try (CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build()) {
			InetSocketAddress addr = new InetSocketAddress(proxy_ip, proxy_port);
			HttpClientContext context = HttpClientContext.create();
			context.setAttribute("socks.address", addr);

			HttpGet request = new HttpGet(webUrl);
			request.addHeader("Host","proxy.mimvp.com");
			request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");

			HttpResponse response = client.execute(request, context);
			if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
				String result = EntityUtils.toString(response.getEntity());
				System.out.println(result);
			}
		}
	} catch (Exception e) {
		System.out.println(e.toString());
	}
}



/**
 * Fetches a page through an HTTP proxy that requires username/password authorization.
 *
 * <p>Credentials come from {@code MimvpProxyJava2.PROXY_USERNAME} and
 * {@code MimvpProxyJava2.PROXY_PASSWORD} and are scoped to the proxy host and port.
 * Prints the three arguments first, then the response body when the status is 200.
 * Exceptions raised inside the request are caught and their stack trace is printed.
 *
 * @param proxyType   proxy scheme label; only echoed to standard output here
 * @param proxyIpPort proxy address in {@code "host:port"} form
 * @param webUrl      absolute URL to request
 */
public static void proxy_auth(String proxyType, String proxyIpPort, String webUrl) {
	System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl);
	String[] hostAndPort = proxyIpPort.split(":");
	String proxy_ip = hostAndPort[0];
	int proxy_port = Integer.parseInt(hostAndPort[1]);

	CredentialsProvider provider = new BasicCredentialsProvider();
	provider.setCredentials(new AuthScope(proxy_ip, proxy_port),
			new UsernamePasswordCredentials(MimvpProxyJava2.PROXY_USERNAME, MimvpProxyJava2.PROXY_PASSWORD));

	// The client was previously never closed; try-with-resources releases it on every path.
	try (CloseableHttpClient client = HttpClients.custom().setDefaultCredentialsProvider(provider).build()) {
		URL url = new URL(webUrl);

		// Honour an explicit port in the URL; fall back to the scheme default otherwise.
		int targetPort = (url.getPort() != -1) ? url.getPort() : url.getDefaultPort();
		HttpHost target = new HttpHost(url.getHost(), targetPort, url.getProtocol());
		HttpHost proxy = new HttpHost(proxy_ip, proxy_port);

		// getFile() keeps the query string that getPath() would drop; an empty path means "/".
		String requestPath = url.getFile();
		if (requestPath.isEmpty()) {
			requestPath = "/";
		}

		RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
		HttpGet request = new HttpGet(requestPath);
		request.setConfig(config);
		request.addHeader("Host","proxy.mimvp.com");
		request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");

		HttpResponse response = client.execute(target, request);
		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
			String result = EntityUtils.toString(response.getEntity());
			System.out.println(result);
		}
	} catch (Exception e) {
		e.printStackTrace();
	}
}


/**
 * Authenticator that always answers with one fixed username/password pair; used for
 * SOCKS5 proxy authorization.
 */
static class MyAuthenticator extends Authenticator {
    private final String username;
    private final String password;

    /**
     * @param username user name to present; {@code null} is treated as the empty string
     * @param password password to present; {@code null} is treated as the empty string
     */
    public MyAuthenticator(String username, String password) {
        // Normalise nulls up front so getPasswordAuthentication() cannot fail on toCharArray().
        this.username = (username == null) ? "" : username;
        this.password = (password == null) ? "" : password;
    }

    /** Returns the stored credentials as a new {@link PasswordAuthentication}. */
    @Override
    protected PasswordAuthentication getPasswordAuthentication() {
        return new PasswordAuthentication(this.username, this.password.toCharArray());
    }
}

/**
 * Fetches a page through a SOCKS5 proxy that requires username/password authorization.
 *
 * <p>Credentials come from {@code MimvpProxyJava2.PROXY_USERNAME} and
 * {@code MimvpProxyJava2.PROXY_PASSWORD} and are installed with
 * {@link Authenticator#setDefault}. The proxy address is handed to the SOCKS-aware
 * socket factories through the {@code "socks.address"} context attribute. Prints the
 * three arguments first, then the response body when the status is 200. Exceptions
 * raised inside the request are caught and printed to standard output.
 *
 * @param proxyType   proxy scheme label; only echoed to standard output here
 * @param proxyIpPort proxy address in {@code "host:port"} form
 * @param webUrl      URL to request
 */
public static void proxy_auth_socks(String proxyType, String proxyIpPort, String webUrl) {
	System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl);
	String[] hostAndPort = proxyIpPort.split(":");
	String proxy_ip = hostAndPort[0];
	int proxy_port = Integer.parseInt(hostAndPort[1]);

	try {
		Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory> create()
				.register("http", new MyConnectionSocketFactory())
				.register("https", new MySSLConnectionSocketFactory(SSLContexts.createSystemDefault()))
				.build();
		PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(reg, new FakeDnsResolver());

		// The client was previously never closed; closing it also shuts down the pooling manager.
		try (CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build()) {
			// Supplies the proxy username/password. This replaces the process-wide
			// default Authenticator, so it affects other networking code as well.
			Authenticator.setDefault(new MyAuthenticator(MimvpProxyJava2.PROXY_USERNAME, MimvpProxyJava2.PROXY_PASSWORD));

			InetSocketAddress addr = new InetSocketAddress(proxy_ip, proxy_port);
			HttpClientContext context = HttpClientContext.create();
			context.setAttribute("socks.address", addr);

			HttpGet request = new HttpGet(webUrl);
			request.addHeader("Host","proxy.mimvp.com");
			request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");

			HttpResponse response = client.execute(request, context);
			if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
				String result = EntityUtils.toString(response.getEntity());
				System.out.println(result);
			}
		}
	} catch (Exception e) {
		System.out.println(e.toString());
	}
}

 

完整代码示例,请见米扑代理示例

https://proxy.mimvp.com/demo2.php

 

 

参考推荐

Java http 和 httpclient 使用代理采集数据 (推荐