Writing a Crawler in Java to Scrape 桃花族

Use HttpClient and regular expressions to crawl images from 桃花族 and save them to local disk.

package com.sunnie.taohuazu;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class Crawler {
    private final static Logger logger = LogManager.getLogger(Crawler.class);

    private static List<String> userAgents;

    static {
        // Lists.newArrayList() simply returns a new ArrayList; the static factory reads nicely
        userAgents = Lists.newArrayList();
        // Five User-Agent strings to rotate through
        userAgents.add("spider");
        userAgents.add("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36");
        userAgents.add("Mozilla/5.0 (Windows NT 6.1; rv:63.0) Gecko/20100101 Firefox/63.0");
        userAgents.add("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
        userAgents.add("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0");
    }

    public static Map<String, String> headers() {
        // Maps.newHashMap() is a static factory that just returns a new HashMap
        Map<String, String> headers = Maps.newHashMap();
        // Pick a random entry from userAgents for the <"User-Agent", "xxx"> header
        headers.put("User-Agent", userAgents.get(new Random().nextInt(userAgents.size())));
        return headers;
    }

    public static String httpGet(String url) {
        // A CloseableHttpClient built with default settings; it implements both the HttpClient and Closeable interfaces
        org.apache.http.client.HttpClient httpClient = HttpClients.createDefault();
        try {
            // HttpGet retrieves, as an entity, whatever the Request-URI identifies
            HttpGet httpGet = new HttpGet(url);
            // Equivalent to calling httpGet.setHeader(k, v) for every entry
            headers().forEach(httpGet::setHeader);
            // Execute the HTTP request
            HttpResponse httpResponse = httpClient.execute(httpGet);
            // Convert the response entity into a UTF-8 string
            return EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
        } catch (Exception e) {
            logger.error(e);
        }
        return null;
    }

    public static InputStream getPicture(String url) {
        org.apache.http.client.HttpClient httpClient = HttpClients.createDefault();
        try {
            HttpGet httpGet = new HttpGet(url);
            headers().forEach(httpGet::setHeader);
            HttpResponse httpResponse = httpClient.execute(httpGet);
            return httpResponse.getEntity().getContent();
        } catch (Exception e) {
            logger.error(e);
        }
        return null;
    }

    public static void savePic(String url) {
        System.out.println("picture url " + url);
        String dir = "/Users/sunnie/sunniedoc/taohuazu/";
        String filename = String.valueOf(System.currentTimeMillis());
        InputStream inputStream = getPicture(url);
        if (inputStream == null) { // download failed, nothing to save
            return;
        }
        File file = new File(dir);
        if (!file.exists()) {
            if (!file.mkdirs()) { // creating the directory failed
                System.out.println("Unable to create external cache directory");
            }
        }

        File targetFile = new File(dir + filename);
        OutputStream os = null;
        try {
            os = new FileOutputStream(targetFile);
            int ch;
            while ((ch = inputStream.read()) != -1) {
                os.write(ch);
            }
            os.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (os != null) {
                    os.close();
                }
                inputStream.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static List<String> getRegExContent(String content, String regex) {
        List<String> list = new ArrayList<>();
        // Compile the pattern and collect capture group 1 of every match
        Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        Matcher ma = pa.matcher(content);
        while (ma.find()) {
            list.add(ma.group(1));
        }
        return list;
    }

    public static void singlePageUrlList(String url) {
        System.out.println("website url " + url);
        String content = httpGet(url);
        if (content == null) { // request failed, skip this thread
            return;
        }
        List<String> urlList = getRegExContent(content, "class=\"zoom\" src=\"(.*?)\"");
        if (urlList.isEmpty()) {
            urlList = getRegExContent(content, "class=\"zoom\" file=\"(.*?)\"");
        }
        // Save every matched URL as an image
        urlList.forEach(Crawler::savePic);
    }

    public static void main(String[] args) {
        String urlPrefix = "http://thzbt.co/";
        String urlPagePrefix = "http://thzbt.co/forum-181-";
        IntStream.range(1, 10).forEach(i -> {
            String url = urlPagePrefix + i + ".html";
            String content = httpGet(url);
            if (content == null) { // request failed, skip this listing page
                return;
            }
            // Collect the thread links on the listing page and de-duplicate them
            List<String> webUrlList = getRegExContent(content, "<a href=\"(thread-.*?.html)\"");
            webUrlList = webUrlList.stream().distinct().collect(Collectors.toList());
            webUrlList.forEach(webSite -> singlePageUrlList(urlPrefix + webSite));
        });
    }
}
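As a quick sanity check of the extraction step used in singlePageUrlList, the minimal sketch below runs the same class="zoom" src="(.*?)" pattern against a hand-written HTML fragment. The fragment and the example.com image URL are made up purely for illustration and are not taken from the real site:

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexCheck {
    public static void main(String[] args) {
        // Hypothetical snippet resembling a forum post with an embedded image
        String html = "<img class=\"zoom\" src=\"http://example.com/pic/001.jpg\" alt=\"\">";
        Pattern pattern = Pattern.compile("class=\"zoom\" src=\"(.*?)\"");
        Matcher matcher = pattern.matcher(html);
        List<String> urls = new ArrayList<>();
        while (matcher.find()) {
            urls.add(matcher.group(1)); // capture group 1 holds the image URL
        }
        System.out.println(urls); // prints [http://example.com/pic/001.jpg]
    }
}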

Add the following dependencies to pom.xml:

<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>r05</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.6</version>
</dependency>
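
Note that Crawler also imports org.apache.logging.log4j.LogManager, which neither dependency above provides. Assuming Log4j 2 is the intended logging backend (the version below is an assumption, use whatever your project already pins), something like this would also be needed:

<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-core -->
<dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-api</artifactId>
    <version>2.11.1</version>
</dependency>
<dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-core</artifactId>
    <version>2.11.1</version>
</dependency>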

Now go take a look at the naughty things showing up in your local folder, you know what I mean~
