Java实现爬虫遍历访问个人博客

wylc123 1年前 ⋅ 2331 阅读

1.获取页面流

package com.cnki.base.utils;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Minimal HTTP helper built on {@link HttpURLConnection}.
 *
 * @author SongBin on 2018/11/1.
 */
public class HttpUtil {
    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";

    /** Maximum time to wait for the TCP connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;

    /** Maximum time to wait for data on a single read, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30_000;

    /**
     * Issues a GET request to the given URL and returns the response body stream.
     *
     * <p>The caller owns the returned stream and must close it.
     *
     * @param urlstr absolute URL to request; the connection is cast to
     *               {@link HttpURLConnection}, so it must be an http/https URL
     * @return the response body as an open {@link InputStream}
     * @throws IOException if the URL is malformed, the connection fails or times out,
     *                     or the server's response cannot be read
     */
    public static InputStream doGet(String urlstr) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Without explicit timeouts the defaults are "wait forever", which lets a single
        // unresponsive server stall the whole crawl.
        conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        conn.setReadTimeout(READ_TIMEOUT_MS);
        conn.setRequestProperty("User-Agent", USER_AGENT);
        return conn.getInputStream();
    }
}

2.将页面流转为字符串

package com.cnki.base.utils;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Converts a response {@link InputStream} into a {@link String}.
 *
 * @author SongBin on 2018/11/1.
 */
public class StreamUtil {
    /**
     * Reads the stream to its end and decodes all bytes using the given charset.
     *
     * <p>The bytes are collected first and decoded exactly once. Decoding each
     * 1024-byte chunk separately would corrupt any multi-byte character that
     * happens to straddle a chunk boundary.
     *
     * <p>The stream is not closed by this method; that remains the caller's job.
     *
     * @param is      stream to drain
     * @param charset name of the charset used to decode the bytes, e.g. {@code "UTF-8"}
     * @return the decoded content; empty if the stream yields no bytes
     * @throws IOException if reading fails or the charset name is not supported
     */
    public static String inputStreamToString(InputStream is, String charset) throws IOException {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int read;
        while ((read = is.read(chunk)) != -1) {
            collected.write(chunk, 0, read);
        }
        return collected.toString(charset);
    }
}

3.刷流量方法

package com.cnki.base.utils;

import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls the blog rooted at {@link #BLOG_ROOT_URL}: walks the paginated article
 * index via its "next page" link, collects every article path found on those
 * pages, and requests each article once.
 *
 * <p>The original author describes this as a tool for bumping blog page views
 * (the class name mentions CSDN, but the configured root URL is a personal blog).
 *
 * @author SongBin on 2018/11/1.
 */
public class AddCsdnBlogPV {
    /** Root of the blog; ends with a slash. Also serves as the first index page. */
    private static final String BLOG_ROOT_URL = "http://www.songbin.top/";

    /** Charset used to decode fetched pages. */
    private static final String PAGE_CHARSET = "UTF-8";

    /**
     * Matches the whole "next page" anchor, e.g.
     * {@code <a href="index?ord=newest&pn=2" pageNo="2">...</a>}.
     */
    private static final Pattern NEXT_PAGE_LINK = Pattern.compile(
            "<a href=\"index\\?ord=newest&pn=[0-9]{1,10}\" pageNo=\"[0-9]{1,10}\">&nbsp;<i class=\"fa fa-angle-right\"></i>&nbsp;</a>");

    /** Extracts the relative href out of a matched "next page" anchor. */
    private static final Pattern NEXT_PAGE_HREF = Pattern.compile("index\\?ord=newest&pn=[0-9]{1,10}");

    /** Matches the site-relative path of an article. */
    private static final Pattern ARTICLE_PATH = Pattern.compile("/view/[0-9]{1,10}");

    private final Set<String> blogListPageUrls = new TreeSet<>();
    private final Set<String> blogUrls = new TreeSet<>();

    /**
     * Collects all article paths and then issues one GET request per article.
     *
     * @throws IOException if any page or article request fails
     */
    public void visitBlog() throws IOException {
        addBlogUrl();
        for (String blogPath : blogUrls) {
            String articleUrl = toAbsoluteUrl(blogPath);
            // try-with-resources closes the stream even if printing throws and
            // tolerates a null resource, unlike an unconditional close().
            try (InputStream is = HttpUtil.doGet(articleUrl)) {
                if (is != null) {
                    System.out.println(articleUrl + "访问成功");
                }
            }
        }
    }

    /**
     * Loads every index page URL and then every article path found on those pages
     * into this instance's internal sets.
     *
     * @throws IOException if any index page cannot be fetched
     */
    public void addBlogUrl() throws IOException {
        blogListPageUrls.add(BLOG_ROOT_URL);
        addBlogListPageUrl(BLOG_ROOT_URL, blogListPageUrls);
        for (String blogListUrl : blogListPageUrls) {
            addBlogUrl(blogListUrl, blogUrls);
        }
    }

    /**
     * Follows the "next page" link starting at {@code pageUrl} and records the URL
     * of every index page reached.
     *
     * <p>Implemented as a loop with a visited set so that a very long index does
     * not grow the call stack and a pagination cycle cannot loop forever.
     *
     * @param pageUrl      index page to start from (not added to the set by this method)
     * @param pagelistUrls receives the absolute URL of each following index page
     * @throws IOException if an index page cannot be fetched
     */
    public void addBlogListPageUrl(String pageUrl, Set<String> pagelistUrls) throws IOException {
        Set<String> visited = new TreeSet<>();
        String current = pageUrl;
        while (current != null && visited.add(current)) {
            String next = findNextPageUrl(fetchPage(current));
            if (next != null) {
                pagelistUrls.add(next);
                System.out.println("成功添加博客列表页面地址:" + next);
            }
            current = next;
        }
    }

    /**
     * Scans one index page for article links and records their paths.
     *
     * @param blogListURL absolute URL of the index page to scan
     * @param artlUrls    receives each site-relative article path (e.g. {@code /view/12})
     * @throws IOException if the index page cannot be fetched
     */
    public void addBlogUrl(String blogListURL, Set<String> artlUrls) throws IOException {
        Matcher matcher = ARTICLE_PATH.matcher(fetchPage(blogListURL));
        while (matcher.find()) {
            String path = matcher.group(0);
            System.out.println("成功添加博客地址:" + path);
            artlUrls.add(path);
        }
    }

    /** Downloads a page and decodes it, always closing the underlying stream. */
    private static String fetchPage(String url) throws IOException {
        try (InputStream is = HttpUtil.doGet(url)) {
            return StreamUtil.inputStreamToString(is, PAGE_CHARSET);
        }
    }

    /** Returns the absolute URL of the "next page" link in the page, or null if there is none. */
    private static String findNextPageUrl(String pageStr) {
        Matcher link = NEXT_PAGE_LINK.matcher(pageStr);
        if (!link.find()) {
            return null;
        }
        Matcher href = NEXT_PAGE_HREF.matcher(link.group(0));
        return href.find() ? toAbsoluteUrl(href.group(0)) : null;
    }

    /** Joins a site-relative path onto the blog root without producing a double slash. */
    private static String toAbsoluteUrl(String path) {
        // BLOG_ROOT_URL already ends with '/', so drop a leading '/' from the path.
        return path.startsWith("/") ? BLOG_ROOT_URL + path.substring(1) : BLOG_ROOT_URL + path;
    }
}

4.测试类

package com.cnki.utilstest;
import com.cnki.base.utils.AddCsdnBlogPV;
import java.io.IOException;

/**
 * @author SongBin on 2018/11/1.
 */
public class TestUtils {

    public static void main(String[] args){
        AddCsdnBlogPV addBlogPv = new AddCsdnBlogPV();
        try {
            addBlogPv.visitBlog();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
更多内容请访问:IT源点

相关文章推荐

全部评论: 0

    我有话说: