Java HttpClient 使用 http、https、socks5 代理爬取数据
1,682 views
0
Java 第三方库 HttpClient 可以通过 http、https、socks5 代理爬取数据,代理分为无密码代理和需要用户名密码授权的代理两种。
若想获得免费代理IP,或更多详细的代理使用示例,请见米扑代理示例代码:https://proxy.mimvp.com/demo.php
HttpClient 下载
HttpClient 是Apache 的第三方库,需要下载引入jar包,才可使用。
HttpClient 源码和jar包下载:http://hc.apache.org/downloads.cgi
HttpClient 爬取网页
// 通过API实时获取米扑代理 public static void spider_proxy() { String proxy_url = "https://proxyapi.mimvp.com/api/fetchsecret.php?orderid=868435221231212345&http_type=3"; try { @SuppressWarnings({ "resource", "deprecation" }) HttpClient client = new DefaultHttpClient(); // 舍弃的用法 CloseableHttpClient client2 = HttpClients.createDefault(); // 推荐的用法 HttpGet request = new HttpGet(proxy_url); HttpResponse response = client2.execute(request); if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { String result = EntityUtils.toString(response.getEntity()); String[] proxy_list; proxy_list = result.split("\n"); for (String proxy : proxy_list) { System.out.println(proxy); } } client2.close(); } catch (Exception e) { System.out.println(e.toString()); } }
HttpClient 使用无密码代理爬取网页
// 方法1:无密代理, 支持 http、https(proxy) public static void proxy_no_auth(String proxyType, String proxyIpPort, String webUrl) { System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl); String proxy_ip = proxyIpPort.split(":")[0]; int proxy_port = Integer.parseInt(proxyIpPort.split(":")[1]); try { CloseableHttpClient client = HttpClients.createDefault(); HttpHost proxy = new HttpHost(proxy_ip, proxy_port); HttpGet request = new HttpGet(webUrl); request.addHeader("Host","proxy.mimvp.com"); request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); HttpResponse response = client.execute(proxy, request); if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { String result = EntityUtils.toString(response.getEntity()); System.out.println(result); } client.close(); } catch (Exception e) { System.out.println(e.toString()); } } // 方法2:无密代理, 支持 http(config) public static void proxy_no_auth2(String proxyType, String proxyIpPort, String webUrl) { System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl); String proxy_ip = proxyIpPort.split(":")[0]; int proxy_port = Integer.parseInt(proxyIpPort.split(":")[1]); try { HttpClientBuilder builder = HttpClientBuilder.create(); CloseableHttpClient client = builder.build(); URL url = new URL(webUrl); HttpHost target = new HttpHost(url.getHost(), url.getDefaultPort(), url.getProtocol()); if(proxyType.equals("https")) { target = new HttpHost(url.getHost(), 443, "https"); } HttpHost proxy = new HttpHost(proxy_ip, proxy_port); RequestConfig config = RequestConfig.custom().setProxy(proxy).build(); HttpGet request = new HttpGet(url.getPath()); request.setConfig(config); request.addHeader("Host","proxy.mimvp.com"); request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); HttpResponse response = client.execute(target, 
request); if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { String result = EntityUtils.toString(response.getEntity()); System.out.println(result); } client.close(); } catch (Exception e) { System.out.println(e.toString()); } } // 方法3:无密代理, 支持 http(routePlanner) public static void proxy_no_auth3(String proxyType, String proxyIpPort, String webUrl) { System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl); String proxy_ip = proxyIpPort.split(":")[0]; int proxy_port = Integer.parseInt(proxyIpPort.split(":")[1]); try { HttpHost proxy = new HttpHost(proxy_ip, proxy_port); DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy); CloseableHttpClient client = HttpClients.custom().setRoutePlanner(routePlanner).build(); HttpGet request = new HttpGet(webUrl); request.addHeader("Host","proxy.mimvp.com"); request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); HttpResponse response = client.execute(request); if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { String result = EntityUtils.toString(response.getEntity()); System.out.println(result); } client.close(); } catch (Exception e) { System.out.println(e.toString()); } }
HttpClient 使用 socks5 代理及密码授权代理爬取网页
// HttpClient 支持socks5代理的自定义类 static class MyConnectionSocketFactory extends PlainConnectionSocketFactory { @Override public Socket createSocket(final HttpContext context) throws IOException { InetSocketAddress socksaddr = (InetSocketAddress) context.getAttribute("socks.address"); Proxy proxy = new Proxy(Proxy.Type.SOCKS, socksaddr); return new Socket(proxy); } @Override public Socket connectSocket(int connectTimeout, Socket socket, HttpHost host, InetSocketAddress remoteAddress, InetSocketAddress localAddress, HttpContext context) throws IOException { InetSocketAddress unresolvedRemote = InetSocketAddress.createUnresolved(host.getHostName(), remoteAddress.getPort()); return super.connectSocket(connectTimeout, socket, host, unresolvedRemote, localAddress, context); } } static class MySSLConnectionSocketFactory extends SSLConnectionSocketFactory { public MySSLConnectionSocketFactory(final SSLContext sslContext) { // super(sslContext, ALLOW_ALL_HOSTNAME_VERIFIER); super(sslContext); } @Override public Socket createSocket(final HttpContext context) throws IOException { InetSocketAddress socksaddr = (InetSocketAddress) context.getAttribute("socks.address"); Proxy proxy = new Proxy(Proxy.Type.SOCKS, socksaddr); return new Socket(proxy); } @Override public Socket connectSocket(int connectTimeout, Socket socket, HttpHost host, InetSocketAddress remoteAddress, InetSocketAddress localAddress, HttpContext context) throws IOException { InetSocketAddress unresolvedRemote = InetSocketAddress.createUnresolved(host.getHostName(), remoteAddress.getPort()); return super.connectSocket(connectTimeout, socket, host, unresolvedRemote, localAddress, context); } } static class FakeDnsResolver implements DnsResolver { @Override public InetAddress[] resolve(String host) throws UnknownHostException { // Return some fake DNS record for every request, we won't be using it return new InetAddress[] { InetAddress.getByAddress(new byte[] { 1, 1, 1, 1 }) }; } } // 无密代理, 支持 socks5 public static 
void proxy_no_auth_socks(String proxyType, String proxyIpPort, String webUrl) { System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl); String proxy_ip = proxyIpPort.split(":")[0]; int proxy_port = Integer.parseInt(proxyIpPort.split(":")[1]); try { Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory> create() .register("http", new MyConnectionSocketFactory()) .register("https", new MySSLConnectionSocketFactory(SSLContexts.createSystemDefault())) .build(); PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(reg, new FakeDnsResolver()); CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build(); InetSocketAddress addr = new InetSocketAddress(proxy_ip, proxy_port); HttpClientContext context = HttpClientContext.create(); context.setAttribute("socks.address", addr); HttpGet request = new HttpGet(webUrl); request.addHeader("Host","proxy.mimvp.com"); request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); HttpResponse response = client.execute(request, context); if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { String result = EntityUtils.toString(response.getEntity()); System.out.println(result); } } catch (Exception e) { System.out.println(e.toString()); } } // 有密代理,需要用户名密码授权,请先取消授权的注释(代码里有注释说明) public static void proxy_auth(String proxyType, String proxyIpPort, String webUrl) { System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl); String proxy_ip = proxyIpPort.split(":")[0]; int proxy_port = Integer.parseInt(proxyIpPort.split(":")[1]); CredentialsProvider provider = new BasicCredentialsProvider(); provider.setCredentials( new AuthScope(proxy_ip, proxy_port), new UsernamePasswordCredentials(MimvpProxyJava2.PROXY_USERNAME, MimvpProxyJava2.PROXY_PASSWORD)); CloseableHttpClient client = 
HttpClients.custom().setDefaultCredentialsProvider(provider).build(); try { URL url = new URL(webUrl); HttpHost target = new HttpHost(url.getHost(),url.getDefaultPort(),url.getProtocol()); HttpHost proxy = new HttpHost(proxy_ip, proxy_port); RequestConfig config = RequestConfig.custom().setProxy(proxy).build(); HttpGet request = new HttpGet(url.getPath()); request.setConfig(config); request.addHeader("Host","proxy.mimvp.com"); request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); HttpResponse response = client.execute(target, request); if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { String result = EntityUtils.toString(response.getEntity()); System.out.println(result); } } catch (Exception e) { e.printStackTrace(); } } // socks5代理的用户名密码授权 static class MyAuthenticator extends Authenticator { private String username = ""; private String password = ""; public MyAuthenticator(String username, String password) { this.username = username; this.password = password; } protected PasswordAuthentication getPasswordAuthentication() { return new PasswordAuthentication(this.username, this.password.toCharArray()); } } // 有密代理, 支持 socks5 public static void proxy_auth_socks(String proxyType, String proxyIpPort, String webUrl) { System.out.println(proxyType + " , " + proxyIpPort + " , " + webUrl); String proxy_ip = proxyIpPort.split(":")[0]; int proxy_port = Integer.parseInt(proxyIpPort.split(":")[1]); try { Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory> create() .register("http", new MyConnectionSocketFactory()) .register("https", new MySSLConnectionSocketFactory(SSLContexts.createSystemDefault())) .build(); PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(reg, new FakeDnsResolver()); CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build(); // auth 
代理需要用户名密码授权时开启,取消此注释,米扑代理验证通过 Authenticator.setDefault(new MyAuthenticator(MimvpProxyJava2.PROXY_USERNAME, MimvpProxyJava2.PROXY_PASSWORD)); InetSocketAddress addr = new InetSocketAddress(proxy_ip, proxy_port); HttpClientContext context = HttpClientContext.create(); context.setAttribute("socks.address", addr); HttpGet request = new HttpGet(webUrl); request.addHeader("Host","proxy.mimvp.com"); request.addHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); HttpResponse response = client.execute(request, context); if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { String result = EntityUtils.toString(response.getEntity()); System.out.println(result); } } catch (Exception e) { System.out.println(e.toString()); } }
完整代码示例,请见米扑代理示例:
https://proxy.mimvp.com/demo.php
参考推荐:
Java http 和 httpclient 使用代理采集数据 (推荐)
版权所有: 本文系米扑博客原创、转载、摘录,或修订后发表,最后更新于 2018-12-13 00:21:59
侵权处理: 本个人博客,不盈利,若侵犯了您的作品权,请联系博主删除,莫恶意,索钱财,感谢!