注意:如果不清楚如何伪造请求头,可以先阅读小编的第一篇教程《Java Jsoup爬虫入门(1)》。
pom.xml依赖
<dependencies>
<!-- Apache HttpComponents (client, core, multipart). Not imported by the Demo class in this article; presumably kept for other examples: verify before removing. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.0.1</version>
</dependency>
<!-- Apache Commons helpers (codec, logging, io). Not imported by the Demo class in this article; NOTE(review): likely supporting libraries for HttpComponents, confirm. -->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.3</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>1.4</version>
</dependency>
<!-- jsoup: the HTML fetching/parsing library the Demo class imports (org.jsoup.*). -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- Apache Commons Lang 3. Not imported by the Demo class in this article. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<!-- JUnit 4: provides the @Test annotation used by Demo; test scope only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
Demo
注意:如果不清楚如何伪造请求头,可以先阅读小编的第一篇教程《Java Jsoup爬虫入门(1)》。
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
/**
 * Demo: fetches a page with jsoup and prints the title and category of every
 * article entry found on it.
 */
public class Demo {

    /**
     * Requests the target site with a set of (placeholder) request headers,
     * parses the response into a {@link Document}, and prints the title and
     * category text of each {@code .entry-wrapper} element.
     *
     * @throws Exception if executing the request or parsing the response fails
     */
    @Test
    public void text03() throws Exception {
        // Site to crawl.
        String targetURL = "http://dt2008.cn";

        // Build the request. The header values are blank placeholders here;
        // fill in the values copied from a real browser request to imitate it.
        Connection connection = Jsoup.connect(targetURL)
                .header("Accept", "")
                .header("Accept-Encoding", "")
                .header("Accept-Language", "")
                .header("Cache-Control", "")
                .header("Connection", "")
                .header("Cookie", "")
                .header("Host", "")
                .header("User-Agent", "")
                // Do not throw on HTTP error statuses; the response body is parsed regardless.
                .ignoreHttpErrors(true)
                .method(Connection.Method.GET);

        // Send the request.
        Connection.Response response = connection.execute();

        // The Document represents the whole downloaded page.
        Document page = response.parse();

        // body() returns the <body> element of the parsed HTML. Element and
        // Elements offer jQuery-like methods such as CSS-selector lookups.
        Element bodyElement = page.body();

        // One ".entry-wrapper" element per article entry (CSS selector).
        Elements entries = bodyElement.select(".entry-wrapper");
        for (int i = 0; i < entries.size(); i++) {
            Element entry = entries.get(i);

            // text() returns the combined text content of the matched elements (no markup).
            String articleTitle = entry.select(".entry-title a").text();
            System.out.println("文章名称:" + articleTitle);

            String articleCategory = entry.select(".meta-category a").text();
            System.out.println("文章分类:" + articleCategory);
        }
    }
}
说明:bodyElement.select(".entry-wrapper") 中的 .entry-wrapper 是每篇文章外层(父级)div 的 class。
运行结果
文章评论