[Java][华为云Java编程创造营][学习笔记][第三阶段][04_Java爬虫实战][二]

网友投稿 572 2022-05-30

1.4,综合实战1:Jsoup爬取博客

综合实战1:Jsoup爬取博客(1)

以CSDN认证的企业博客,华为开发者论坛为例,需要实现的需求:

获取博主的博客文章数量

获取博主的博客文章列表的页数

获取博主的每篇博客文章的信息(网址,创建时间,标题)

获取每篇博客文章中的图片链接

将所有获取的数据存储到数据库中

综合实战1:Jsoup爬取博客(2)

步骤:

1,博主博客网页分析

2,项目搭建及环境准备(JDK1.8,IntelliJ IDEA,MySQL5.7.32)

3,编写数据获取,数据处理,数据存储工具类

4,主函数完成业务逻辑

综合实战1:Jsoup爬取博客(3)

1,新建maven工程,新建包com.huawei,新建数据库配置文件db.properties

2,在pom.xml导入Jsoup和MySQL

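<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>tempjava4</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.3</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.25</version>
        </dependency>
    </dependencies>
</project>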

[Java][华为云Java编程创造营][学习笔记][第三阶段][04_Java爬虫实战][二]

3,在db.properties配置数据库信息

#mysql
className=com.mysql.jdbc.Driver
url=jdbc:mysql://localhost:3306/db_blog?characterEncoding=utf8
username=填写数据库登录名
password=填写数据库密码

4,用面向对象的角度来分析需求得出实体类有文章,博主,图片

博主对文章,一对多;文章对图片,一对多

5,数据库建表

如果建立数据库时没有选择UTF8格式,那么存储中文的时候就会报错

-- One row per crawled blog article. b_id holds the blogger id the article belongs to
-- (no foreign-key constraint is declared); create_time is stored as text.
create table t_article
(
    id          int auto_increment primary key,
    b_id        varchar(50)  null,
    url         varchar(300) null,
    create_time varchar(50)  null,
    title       varchar(300) null
);

-- One row per blogger, keyed by a string id, with the article count and the
-- number of article-list pages.
create table t_blogger
(
    id            varchar(50) not null primary key,
    article_count int         null,
    page_count    int         null
);

-- One row per picture link found in an article. a_id holds the id of the owning
-- t_article row (no foreign-key constraint is declared).
create table t_picture
(
    id   int auto_increment primary key,
    a_id int          null,
    url  varchar(300) null
);

6,实体类Article

package com.huawei.entity; /* * 文章实体类 * */ public class Article { //文章id private Integer id; //博主id private String bId; //文章的url private String url; //文章的创建时间 private String createTime; //文章的标题 private String title; public Article() { } @Override public String toString() { return "Article{" + "id=" + id + ", bId='" + bId + '\'' + ", url='" + url + '\'' + ", createTime='" + createTime + '\'' + ", title='" + title + '\'' + '}'; } public Integer getId() { return id; } public void setId(Integer id) { this.id = id; } public String getbId() { return bId; } public void setbId(String bId) { this.bId = bId; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getCreateTime() { return createTime; } public void setCreateTime(String createTime) { this.createTime = createTime; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public Article(Integer id, String bId, String url, String createTime, String title) { this.id = id; this.bId = bId; this.url = url; this.createTime = createTime; this.title = title; } }

7,实体类Blogger

package com.huawei.entity; /* * 博主实体类 * */ public class Blogger { //博主id private String id; //博主博客文章的数量 private Integer articleCount; //博客文章的总页面数 private Integer pageCount; public Blogger() { } public Blogger(String id, Integer articleCount, Integer pageCount) { this.id = id; this.articleCount = articleCount; this.pageCount = pageCount; } @Override public String toString() { return "Blogger{" + "id='" + id + '\'' + ", articleCount=" + articleCount + ", pageCount=" + pageCount + '}'; } public String getId() { return id; } public void setId(String id) { this.id = id; } public Integer getArticleCount() { return articleCount; } public void setArticleCount(Integer articleCount) { this.articleCount = articleCount; } public Integer getPageCount() { return pageCount; } public void setPageCount(Integer pageCount) { this.pageCount = pageCount; } }

8,实体类Picture

package com.huawei.entity; /* * 图片实体类 * */ public class Picture { //图片id private Integer id; //文章id private Integer aId; //图片的url private String url; public Picture() { } @Override public String toString() { return "Picture{" + "id=" + id + ", aId=" + aId + ", url='" + url + '\'' + '}'; } public Integer getId() { return id; } public void setId(Integer id) { this.id = id; } public Integer getaId() { return aId; } public void setaId(Integer aId) { this.aId = aId; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public Picture(Integer id, Integer aId, String url) { this.id = id; this.aId = aId; this.url = url; } }

9,工具类BlogUtil

package com.huawei.util; import com.huawei.entity.Article; import com.huawei.entity.Picture; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; /* * 博客数据爬取工具类 * */ public class BlogUtil { /* * 获取博客文章数量 * */ public static int getArticleCount(String blogHome) { //1,获取文档对象 Document doc = null; try { doc = Jsoup.connect(blogHome).get(); } catch (IOException e) { e.printStackTrace(); } //2,查找包含博客数量的元素 Element countElement = doc.select("span.count").first(); //3,取出元素包含的文本,这里为博客数量 String articleCount = countElement.text(); return Integer.parseInt(articleCount); } /* * 获得博客文章页数 * */ public static int getArticlePageCount(int articleCount) { //向上取整,获得页面数量 int pageCount = (int) Math.ceil(articleCount / Constants.PAGE_SIZE); return pageCount; } /* * 获取博客列表 * */ public static List

getArticleList(int pageBegin, int pageEnd, String blogHome, String bId) { //1,创建博客列表 ArrayList
articles = new ArrayList<>((pageEnd - pageBegin + 1) * ((int) Constants.PAGE_SIZE)); //2,定义变量:博客列表的网址 String articleListUrl = null; for (int i = pageBegin; i <= pageEnd; i++) { //拼接URL articleListUrl = blogHome + Constants.ARTICLE_LIST_URI + i; //1,获取document对象 Document doc = null; try { doc = Jsoup.connect(articleListUrl).get(); } catch (IOException e) { e.printStackTrace(); } //2,查找包含博客列表的元素 Element articleList = doc.select("div.article-list").first(); //3,查找每篇博客的元素 Elements articleElements = articleList.select("div.article-item-box.csdn-tracking-statistics"); for (Element element : articleElements) { //获取文章的URL String url = element.select("h4>a").first().attr("href"); //获取文章的标题 String title = element.select("h4>a").first().text().substring(3);//获取标题后去除前面的"原创 ",共3个字符 //获取文章的创建时间 String createTime = element.select("span.date").first().text(); Article article = new Article(); article.setUrl(url); article.setTitle(title); article.setCreateTime(createTime); article.setbId(bId); articles.add(article); } } return articles; } /* * 获取博客图片 * */ public static List getArticlePictures(String articleUrl) { ArrayList pictures = new ArrayList<>(); //1,获取文档对象 Document doc = null; try { doc = Jsoup.connect(articleUrl).get(); } catch (IOException e) { e.printStackTrace(); } //2,获取文章元素下面的所有图片元素,并进行遍历 Element article = doc.select("article.baidu_pl").first(); Elements images = article.select("img"); for (Element image : images) { String picUrl = image.attr("src"); Picture picture = new Picture(); picture.setUrl(picUrl); pictures.add(picture); } return pictures; } }

10,工具类Constants

package com.huawei.util;

/**
 * Constants shared by the crawler classes.
 */
public class Constants {

    /**
     * Number of blog articles per list page. Declared as a {@code double}; the page-count
     * calculation divides an {@code int} article count by it before rounding up.
     */
    public static final double PAGE_SIZE = 40.0;

    /**
     * Home page of the crawled blog (Huawei developer forum), written without a trailing
     * slash.
     */
    public static final String BLOG_HOME = "https://hwdevelopers.blog.csdn.net";

    /**
     * Path appended to {@link #BLOG_HOME} to build an article-list URL; the page number is
     * appended after it. Note: if {@code BLOG_HOME} ends with {@code /}, this value must
     * not start with {@code /}.
     */
    public static final String ARTICLE_LIST_URI = "/article/list/";
}

11,工具类JDBCUtils

package com.huawei.util;

import java.io.IOException;
import java.io.InputStream;
import java.sql.*;
import java.util.Properties;
import javax.sql.rowset.CachedRowSet;
import javax.sql.rowset.RowSetProvider;

/**
 * JDBC helper: reads connection settings from {@code db.properties} on the classpath,
 * loads the driver class, and offers small query/update helpers.
 */
public class JDBCUtils {

    /** Name of the classpath resource holding the connection settings. */
    private static final String CONFIG_FILE = "db.properties";

    // Connection settings, read once when the class is initialised.
    private static String className;
    private static String url;
    private static String username;
    private static String password;

    static {
        Properties properties = loadProperties();
        className = properties.getProperty("className");
        url = properties.getProperty("url");
        username = properties.getProperty("username");
        password = properties.getProperty("password");

        if (className == null) {
            throw new IllegalStateException("Missing 'className' entry in " + CONFIG_FILE);
        }
        // Load the driver class. A missing driver is only reported here, as before;
        // getConn() fails later if no suitable driver is available.
        try {
            Class.forName(className);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** Reads {@code db.properties} from the classpath, closing the stream afterwards. */
    private static Properties loadProperties() {
        Properties properties = new Properties();
        try (InputStream in = JDBCUtils.class.getClassLoader().getResourceAsStream(CONFIG_FILE)) {
            if (in == null) {
                throw new IllegalStateException(CONFIG_FILE + " not found on the classpath");
            }
            properties.load(in);
        } catch (IOException e) {
            throw new IllegalStateException("Cannot read " + CONFIG_FILE, e);
        }
        return properties;
    }

    /**
     * Opens a new database connection using the configured URL and credentials.
     *
     * @return an open connection; the caller is responsible for closing it
     * @throws RuntimeException if the connection cannot be established (the
     *                          {@link SQLException} is attached as the cause)
     */
    public static Connection getConn() {
        try {
            return DriverManager.getConnection(url, username, password);
        } catch (SQLException e) {
            e.printStackTrace();
            throw new RuntimeException("数据库连接错误!", e);
        }
    }

    /**
     * Closes the result set, statement and connection, in that order. {@code null}
     * arguments are skipped and close failures are printed, not thrown.
     */
    public static void closeAll(Connection conn, Statement stat, ResultSet rs) {
        closeQuietly(rs);
        closeQuietly(stat);
        closeQuietly(conn);
    }

    /**
     * Overload for updates: closes the statement and then the connection. {@code null}
     * arguments are skipped and close failures are printed, not thrown.
     */
    public static void closeAll(Connection conn, PreparedStatement pstmt) {
        closeQuietly(pstmt);
        closeQuietly(conn);
    }

    /** Closes one JDBC resource if present, reporting but not propagating failures. */
    private static void closeQuietly(AutoCloseable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a parameterised query and returns its rows.
     *
     * <p>The connection and statement are closed before this method returns, so the rows
     * are first copied into a disconnected {@link CachedRowSet} (which is a
     * {@link ResultSet}). The whole result is therefore held in memory.
     *
     * @param preparedSql SQL text with {@code ?} placeholders
     * @param param       values bound to the placeholders in order; may be {@code null}
     * @return a readable, disconnected result set positioned before the first row, or
     *         {@code null} if a {@link SQLException} occurred (it is printed)
     */
    public static ResultSet executeQuery(String preparedSql, Object... param) {
        try (Connection conn = getConn();
             PreparedStatement pstmt = conn.prepareStatement(preparedSql)) {
            bindParameters(pstmt, param);
            try (ResultSet live = pstmt.executeQuery()) {
                CachedRowSet cached = RowSetProvider.newFactory().createCachedRowSet();
                cached.populate(live);
                return cached;
            }
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Runs a parameterised insert/update/delete and returns the generated key, if any.
     *
     * @param preparedSql SQL text with {@code ?} placeholders
     * @param param       values bound to the placeholders in order; may be {@code null}
     * @return the first generated key as an {@code int}, or {@code 0} when the statement
     *         generated no key or a {@link SQLException} occurred (it is printed)
     */
    public static int executeUpdate(String preparedSql, Object... param) {
        // Original author's note: MySQL 5.1.7 needs Statement.RETURN_GENERATED_KEYS passed explicitly.
        try (Connection conn = getConn();
             PreparedStatement pstmt = conn.prepareStatement(preparedSql, Statement.RETURN_GENERATED_KEYS)) {
            bindParameters(pstmt, param);
            pstmt.executeUpdate();
            try (ResultSet generatedKeys = pstmt.getGeneratedKeys()) {
                if (generatedKeys.next()) {
                    return generatedKeys.getInt(1);
                }
            }
            return 0;
        } catch (SQLException e) {
            e.printStackTrace();
            return 0;
        }
    }

    /** Binds {@code param[i]} to placeholder {@code i + 1}; a {@code null} array binds nothing. */
    private static void bindParameters(PreparedStatement pstmt, Object[] param) throws SQLException {
        if (param == null) {
            return;
        }
        for (int i = 0; i < param.length; i++) {
            pstmt.setObject(i + 1, param[i]);
        }
    }
}

12,工具类StringUtil

package com.huawei.util; /* * 字符串工具类 * */ public class StringUtil { /* * 博主id处理 * */ public static String subId(String url)//url为博客网址 { if (url != null && url.trim().length() != 0)//非空判断 { /*int index = url.lastIndexOf("/");//切割博主id String id = url.substring(index + 1); return id;*/ int index1=url.indexOf("https://"); int index2 = url.indexOf("."); String id = url.substring(index1, index2); id = id.replace("https://", ""); return id; } else { return ""; } } }

13,启动类Main

package com; import com.huawei.entity.Article; import com.huawei.entity.Blogger; import com.huawei.entity.Picture; import com.huawei.util.BlogUtil; import com.huawei.util.Constants; import com.huawei.util.JDBCUtils; import com.huawei.util.StringUtil; import java.util.List; public class Main { public static String insertBlogger = "insert into t_blogger(id,article_count,page_count) value(?,?,?)"; public static String insertArticle = "insert into t_article(b_id,url,create_time,title) value(?,?,?,?)"; public static String insertPicture = "insert into t_picture(a_id,url) value(?,?)"; public static void main(String[] args) { int articleCount = BlogUtil.getArticleCount(Constants.BLOG_HOME); int pageCount = BlogUtil.getArticlePageCount(articleCount); //博主对象 Blogger blogger = new Blogger(StringUtil.subId(Constants.BLOG_HOME), articleCount, pageCount); System.out.println(blogger); JDBCUtils.executeUpdate(insertBlogger, blogger.getId(), blogger.getArticleCount(), blogger.getPageCount()); //文章列表 List

articleList = BlogUtil.getArticleList( 1, 2, Constants.BLOG_HOME, StringUtil.subId(Constants.BLOG_HOME)); for (Article article : articleList) { System.out.println(article); final int aid = JDBCUtils.executeUpdate(insertArticle, article.getbId(), article.getUrl(), article.getCreateTime(), article.getTitle()); //图片列表 List pictures = BlogUtil.getArticlePictures(article.getUrl()); for (Picture picture : pictures) { System.out.println(picture); JDBCUtils.executeUpdate(insertPicture, aid, picture.getUrl()); } } } }

Java

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:【愚公系列】2022年01月 Django商城项目13-登录界面-QQ登录功能实现
下一篇:DDD领域驱动设计实战(三)-深入理解实体
相关文章