[Java][华为云Java编程创造营][学习笔记][第三阶段][04_Java爬虫实战][二]
1.4,综合实战1:Jsoup爬取博客
综合实战1:Jsoup爬取博客(1)
以CSDN认证的企业博客,华为开发者论坛为例,需要实现的需求:
获取博主的博客文章数量
获取博主的博客文章列表的页数
获取博主的每篇博客文章的信息(网址,创建时间,标题)
获取每篇博客文章中的图片链接
将所有获取的数据存储到数据库中
综合实战1:Jsoup爬取博客(2)
步骤:
1,博主博客网页分析
2,项目搭建及环境准备(JDK1.8,IntelliJ IDEA,MySQL5.7.32)
3,编写数据获取,数据处理,数据存储工具类
4,主函数完成业务逻辑
综合实战1:Jsoup爬取博客(3)
1,新建maven工程,新建包com.huawei,新建数据库配置文件db.properties
2,在pom.xml中导入Jsoup和MySQL驱动(JDBC)的依赖
3,在db.properties配置数据库信息
#mysql
className=com.mysql.jdbc.Driver
url=jdbc:mysql://localhost:3306/db_blog?characterEncoding=utf8
username=填写数据库登录名
password=填写数据库密码
4,用面向对象的角度来分析需求得出实体类有文章,博主,图片
博主对文章,一对多;文章对图片,一对多
5,数据库建表
注意:如果建立数据库时没有选择UTF-8字符集,那么存储中文的时候就会报错
-- Articles: one row per crawled blog post.
-- No foreign-key constraint is declared; b_id is only a plain column.
CREATE TABLE t_article
(
    id          INT AUTO_INCREMENT PRIMARY KEY,
    b_id        VARCHAR(50)  NULL, -- id of the blogger who owns the article
    url         VARCHAR(300) NULL, -- article URL
    create_time VARCHAR(50)  NULL, -- creation time, stored as text
    title       VARCHAR(300) NULL  -- article title
);

-- Bloggers: one row per blogger.
CREATE TABLE t_blogger
(
    id            VARCHAR(50) NOT NULL PRIMARY KEY,
    article_count INT         NULL, -- number of articles of the blogger
    page_count    INT         NULL  -- number of article-list pages
);

-- Pictures: one row per image link found in an article.
-- No foreign-key constraint is declared; a_id is only a plain column.
CREATE TABLE t_picture
(
    id   INT AUTO_INCREMENT PRIMARY KEY,
    a_id INT          NULL, -- id of the article containing the picture
    url  VARCHAR(300) NULL  -- picture URL
);
6,实体类Article
package com.huawei.entity; /* * 文章实体类 * */ public class Article { //文章id private Integer id; //博主id private String bId; //文章的url private String url; //文章的创建时间 private String createTime; //文章的标题 private String title; public Article() { } @Override public String toString() { return "Article{" + "id=" + id + ", bId='" + bId + '\'' + ", url='" + url + '\'' + ", createTime='" + createTime + '\'' + ", title='" + title + '\'' + '}'; } public Integer getId() { return id; } public void setId(Integer id) { this.id = id; } public String getbId() { return bId; } public void setbId(String bId) { this.bId = bId; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getCreateTime() { return createTime; } public void setCreateTime(String createTime) { this.createTime = createTime; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public Article(Integer id, String bId, String url, String createTime, String title) { this.id = id; this.bId = bId; this.url = url; this.createTime = createTime; this.title = title; } }
7,实体类Blogger
package com.huawei.entity; /* * 博主实体类 * */ public class Blogger { //博主id private String id; //博主博客文章的数量 private Integer articleCount; //博客文章的总页面数 private Integer pageCount; public Blogger() { } public Blogger(String id, Integer articleCount, Integer pageCount) { this.id = id; this.articleCount = articleCount; this.pageCount = pageCount; } @Override public String toString() { return "Blogger{" + "id='" + id + '\'' + ", articleCount=" + articleCount + ", pageCount=" + pageCount + '}'; } public String getId() { return id; } public void setId(String id) { this.id = id; } public Integer getArticleCount() { return articleCount; } public void setArticleCount(Integer articleCount) { this.articleCount = articleCount; } public Integer getPageCount() { return pageCount; } public void setPageCount(Integer pageCount) { this.pageCount = pageCount; } }
8,实体类Picture
package com.huawei.entity; /* * 图片实体类 * */ public class Picture { //图片id private Integer id; //文章id private Integer aId; //图片的url private String url; public Picture() { } @Override public String toString() { return "Picture{" + "id=" + id + ", aId=" + aId + ", url='" + url + '\'' + '}'; } public Integer getId() { return id; } public void setId(Integer id) { this.id = id; } public Integer getaId() { return aId; } public void setaId(Integer aId) { this.aId = aId; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public Picture(Integer id, Integer aId, String url) { this.id = id; this.aId = aId; this.url = url; } }
9,工具类BlogUtil
package com.huawei.util; import com.huawei.entity.Article; import com.huawei.entity.Picture; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; /* * 博客数据爬取工具类 * */ public class BlogUtil { /* * 获取博客文章数量 * */ public static int getArticleCount(String blogHome) { //1,获取文档对象 Document doc = null; try { doc = Jsoup.connect(blogHome).get(); } catch (IOException e) { e.printStackTrace(); } //2,查找包含博客数量的元素 Element countElement = doc.select("span.count").first(); //3,取出元素包含的文本,这里为博客数量 String articleCount = countElement.text(); return Integer.parseInt(articleCount); } /* * 获得博客文章页数 * */ public static int getArticlePageCount(int articleCount) { //向上取整,获得页面数量 int pageCount = (int) Math.ceil(articleCount / Constants.PAGE_SIZE); return pageCount; } /* * 获取博客列表 * */ public static List
10,工具类Constants
package com.huawei.util; /* * 常量类 * */ public class Constants { //定义常量字符串 /* * BLOG_HOME:华为开发者论坛 https://hwdevelopers.blog.csdn.net/ * */ public static final String BLOG_HOME="https://hwdevelopers.blog.csdn.net"; /* * 博客文章列表拼接url,注意:如果BLOG_HOME的尾部加上了 / ,那么下方的头部就不要加上 / * */ public static final String ARTICLE_LIST_URI="/article/list/"; /* * 页面博客数量 * */ public static final double PAGE_SIZE=40.0; }
11,工具类JDBCUtils
package com.huawei.util;

import java.io.IOException;
import java.io.InputStream;
import java.sql.*;
import java.util.Properties;
import javax.sql.rowset.CachedRowSet;
import javax.sql.rowset.RowSetProvider;

/**
 * JDBC helper: reads the connection settings from {@code db.properties} on the classpath,
 * hands out connections and runs parameterized statements.
 */
public class JDBCUtils {

    // Connection settings read once from db.properties.
    private static String className;
    private static String url;
    private static String username;
    private static String password;

    static {
        // Load the configuration file; the stream is closed by try-with-resources.
        Properties properties = new Properties();
        try (InputStream in = JDBCUtils.class.getClassLoader().getResourceAsStream("db.properties")) {
            if (in == null) {
                // Fail with a clear message instead of an NPE inside Properties.load.
                throw new IllegalStateException("db.properties not found on the classpath");
            }
            properties.load(in);
            className = properties.getProperty("className");
            url = properties.getProperty("url");
            username = properties.getProperty("username");
            password = properties.getProperty("password");
        } catch (IOException e) {
            throw new IllegalStateException("Failed to read db.properties", e);
        }
    }

    static {
        // Load the driver class when one is configured. A missing class is only reported,
        // as before; DriverManager may still locate a driver by itself.
        if (className != null && !className.trim().isEmpty()) {
            try {
                Class.forName(className.trim());
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Opens a new database connection using the configured url, username and password.
     *
     * @return a new connection; the caller is responsible for closing it
     * @throws RuntimeException if the connection cannot be opened (the SQLException is the cause)
     */
    public static Connection getConn() {
        try {
            return DriverManager.getConnection(url, username, password);
        } catch (SQLException e) {
            // Keep the original message but preserve the cause for diagnosis.
            throw new RuntimeException("数据库连接错误!", e);
        }
    }

    /**
     * Closes the result set, the statement and the connection, in that order.
     * {@code null} arguments are skipped; close failures are printed and do not stop
     * the remaining resources from being closed.
     *
     * @param conn connection to close, may be {@code null}
     * @param stat statement to close, may be {@code null}
     * @param rs   result set to close, may be {@code null}
     */
    public static void closeAll(Connection conn, Statement stat, ResultSet rs) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (stat != null) {
            try {
                stat.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Overload for statements without a result set: closes the statement, then the connection.
     *
     * @param conn  connection to close, may be {@code null}
     * @param pstmt statement to close, may be {@code null}
     */
    public static void closeAll(Connection conn, PreparedStatement pstmt) {
        closeAll(conn, pstmt, null);
    }

    /**
     * Runs a parameterized query and returns its rows.
     * <p>
     * The rows are copied into a disconnected {@link CachedRowSet} before the connection is
     * closed, so the returned result stays readable. (Previously the live result set was closed
     * in {@code finally} and then returned, which made it unusable.) All rows are held in memory.
     *
     * @param preparedSql SQL text with {@code ?} placeholders
     * @param param       values bound to the placeholders in order; may be {@code null} or empty
     * @return the query result, or {@code null} if an {@link SQLException} occurred
     */
    public static ResultSet executeQuery(String preparedSql, Object... param) {
        Connection conn = null;
        PreparedStatement pstmt = null;
        ResultSet res = null;
        try {
            conn = getConn();
            pstmt = conn.prepareStatement(preparedSql);
            bindParams(pstmt, param);
            res = pstmt.executeQuery();
            CachedRowSet cached = RowSetProvider.newFactory().createCachedRowSet();
            cached.populate(res);
            return cached;
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeAll(conn, pstmt, res);
        }
    }

    /**
     * Runs a parameterized insert/update/delete statement.
     *
     * @param preparedSql SQL text with {@code ?} placeholders
     * @param param       values bound to the placeholders in order; may be {@code null} or empty
     * @return the first generated key read as an int, or {@code 0} when no key was generated
     *         or an {@link SQLException} occurred
     */
    public static int executeUpdate(String preparedSql, Object... param) {
        Connection conn = null;
        PreparedStatement pstmt = null;
        ResultSet keys = null;
        int key = 0;
        try {
            conn = getConn();
            // Per the original author's note, MySQL 5.1.7 requires
            // Statement.RETURN_GENERATED_KEYS to be passed explicitly.
            pstmt = conn.prepareStatement(preparedSql, Statement.RETURN_GENERATED_KEYS);
            bindParams(pstmt, param);
            pstmt.executeUpdate();
            keys = pstmt.getGeneratedKeys();
            if (keys.next()) {
                key = keys.getInt(1);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Also closes the generated-keys result set, which was previously leaked.
            closeAll(conn, pstmt, keys);
        }
        return key;
    }

    /** Binds the given values to the statement's placeholders, starting at index 1. */
    private static void bindParams(PreparedStatement pstmt, Object[] param) throws SQLException {
        if (param == null) {
            return;
        }
        for (int i = 0; i < param.length; i++) {
            pstmt.setObject(i + 1, param[i]);
        }
    }
}
12,工具类StringUtil
package com.huawei.util; /* * 字符串工具类 * */ public class StringUtil { /* * 博主id处理 * */ public static String subId(String url)//url为博客网址 { if (url != null && url.trim().length() != 0)//非空判断 { /*int index = url.lastIndexOf("/");//切割博主id String id = url.substring(index + 1); return id;*/ int index1=url.indexOf("https://"); int index2 = url.indexOf("."); String id = url.substring(index1, index2); id = id.replace("https://", ""); return id; } else { return ""; } } }
13,启动类Main
package com; import com.huawei.entity.Article; import com.huawei.entity.Blogger; import com.huawei.entity.Picture; import com.huawei.util.BlogUtil; import com.huawei.util.Constants; import com.huawei.util.JDBCUtils; import com.huawei.util.StringUtil; import java.util.List; public class Main { public static String insertBlogger = "insert into t_blogger(id,article_count,page_count) value(?,?,?)"; public static String insertArticle = "insert into t_article(b_id,url,create_time,title) value(?,?,?,?)"; public static String insertPicture = "insert into t_picture(a_id,url) value(?,?)"; public static void main(String[] args) { int articleCount = BlogUtil.getArticleCount(Constants.BLOG_HOME); int pageCount = BlogUtil.getArticlePageCount(articleCount); //博主对象 Blogger blogger = new Blogger(StringUtil.subId(Constants.BLOG_HOME), articleCount, pageCount); System.out.println(blogger); JDBCUtils.executeUpdate(insertBlogger, blogger.getId(), blogger.getArticleCount(), blogger.getPageCount()); //文章列表 List
Java
版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。