您当前的位置: 首页 >  梁云亮

【精品】爬取 国家统计局 2020年 省市县乡村 数据

梁云亮 发布时间:2021-05-02 03:55:09 ,浏览量:3

(对应本博客还有一个简化版的,请参看:省市区级联SQL文件)

说明

费了好大的劲,把数据从官网上爬下来并导入到了MySQL中。(在这里插入图片描述)国家统计局官网地址:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/54/5402.html

爬虫代码
package com.hc;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.hc.domain.*;
import com.hc.mapper.*;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;

import javax.annotation.Resource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * 全国省市县镇村数据爬取
 *
 * @author 梁云亮
 */
@Slf4j
@SpringBootTest
public class InitAdd5Tables {

    /**
     * Downloads and parses the HTML page at the given URL with Jsoup.
     *
     * <p>The request uses a 100-second timeout. A failed download is reported on standard
     * output together with its cause, and signalled to the caller with {@code null}.
     *
     * @param url address of the page to fetch; must be neither null nor empty
     * @return the parsed document, or {@code null} when the page could not be fetched
     * @throws IllegalArgumentException if {@code url} is null or empty
     */
    private Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("无效的url");
        }
        try {
            return Jsoup.connect(url).timeout(100 * 1000).get();
        } catch (IOException e) {
            // An IOException is not necessarily a missing page (it may be a timeout or an
            // HTTP error), so print the cause instead of silently discarding it.
            System.out.println(url + "地址不存在: " + e);
            return null;
        }
    }

    /**
     * Scrapes the index page and returns every province listed on it.
     *
     * <p>Each entry has the form {@code code*name}: {@code code} is the link target up to (not
     * including) its last dot, and {@code name} is the link text.
     *
     * @return one {@code code*name} string per province link, in page order; empty when the
     *         index page could not be fetched
     */
    public List<String> getProvinces() {
        List<String> res = new ArrayList<>();
        Document page = connect("http://localhost:8080/2020/default.htm");
        if (page == null) {
            // connect() has already reported the failure; return nothing instead of hitting an NPE.
            return res;
        }
        for (Element row : page.select("tr.provincetr")) { // each table row holds several provinces
            for (Element link : row.select("a")) { // one link per province
                String href = link.attr("href");
                int dot = href.lastIndexOf('.');
                if (dot <= 0) {
                    // No "<code>.<ext>" shape to cut a code from (missing href or no dot).
                    continue;
                }
                res.add(href.substring(0, dot) + "*" + link.text());
            }
        }
        return res;
    }

    /**
     * Prints every scraped province entry on its own line.
     */
    @Test
    public void testGetProvince() {
        for (Object entry : getProvinces()) {
            System.out.println(entry);
        }
    }

    /**
     * Province mapper, injected via {@code @Resource}. In this class it batch-inserts the
     * scraped provinces and looks a province up by its code when cities are inserted.
     */
    @Resource
    private ProvinceMapper provinceMapper;

    /**
     * Scrapes all provinces and stores them with a single batch insert, then prints the value
     * returned by the mapper.
     */
    @Test
    void insertProvinces() {
        // Typed view of the scraped "code*name" entries; the suppression only matters while
        // getProvinces() is declared with a raw List.
        @SuppressWarnings("unchecked")
        List<String> scraped = getProvinces();
        List<Province> list = new ArrayList<>(scraped.size());
        for (String p : scraped) {
            // Limit 2 keeps a trailing empty name, so split[1] always exists for "code*name".
            String[] split = p.split("\\*", 2);
            list.add(Province.builder().code(split[0]).name(split[1]).build());
        }
        if (list.isEmpty()) {
            // Nothing was scraped: skip the mapper call rather than send an empty batch.
            System.out.println(0);
            return;
        }
        int res = provinceMapper.batchInsert(list);
        System.out.println(res);
    }

    /**
     * Scrapes the page of one province and returns the cities listed on it.
     *
     * <p>Each entry has the form {@code code*name}: {@code code} is the first four characters
     * of the first table cell and {@code name} is the text of the second cell.
     *
     * @param provinceCode province code used as the page name, e.g. {@code "41"}
     * @return one {@code code*name} string per city row, in page order; empty when the page
     *         could not be fetched
     */
    public List<String> getCitiesByProvince(String provinceCode) {
        List<String> res = new ArrayList<>();
        Document page = connect("http://localhost:8080/2020/" + provinceCode + ".html");
        if (page == null) {
            // connect() has already reported the failure; return nothing instead of hitting an NPE.
            return res;
        }
        for (Element row : page.select("tr.citytr")) { // one row per city
            Elements cells = row.select("td");
            if (cells.size() < 2) {
                continue; // malformed row: code or name cell is missing
            }
            // Read the cells separately instead of splitting their combined text on a space,
            // so a name that itself contains a space is not truncated.
            String code = cells.get(0).text();
            String name = cells.get(1).text();
            if (code.length() < 4) {
                continue; // too short to carry a 4-character city code
            }
            res.add(code.substring(0, 4) + "*" + name);
        }
        return res;
    }

    /**
     * Prints every city entry scraped for province code "41", one per line.
     */
    @Test
    public void testGetCitiesByProvince() {
        for (Object entry : getCitiesByProvince("41")) {
            System.out.println(entry);
        }
    }

    /**
     * City mapper, injected via {@code @Resource}. In this class it batch-inserts the cities
     * scraped for each province.
     */
    @Resource
    private CityMapper cityMapper;

    /**
     * Scrapes the cities of every province and stores them, one batch insert per province.
     *
     * <p>The province row is looked up by its code so each city can reference the province id.
     * Provinces that are missing from the table, or that yield no cities, are skipped.
     */
    @Test
    void insertCities() {
        // Typed views of the scraped "code*name" entries; the suppressions only matter while
        // the getters are declared with a raw List.
        @SuppressWarnings("unchecked")
        List<String> pList = getProvinces();
        for (String p : pList) {
            String provinceCode = p.split("\\*", 2)[0];
            Province pp = provinceMapper.selectOne(new QueryWrapper<Province>().eq("code", provinceCode));
            if (pp == null) {
                // The province has not been inserted yet; without its id the cities cannot be linked.
                System.out.println("未找到编号为" + provinceCode + "的省份,跳过");
                continue;
            }
            @SuppressWarnings("unchecked")
            List<String> cList = getCitiesByProvince(provinceCode);
            List<City> list = new ArrayList<>(cList.size());
            for (String c : cList) {
                // Limit 2 keeps a trailing empty name, so tmp[1] always exists for "code*name".
                String[] tmp = c.split("\\*", 2);
                list.add(City.builder().name(tmp[1]).code(tmp[0]).provinceId(pp.getId()).build());
            }
            if (list.isEmpty()) {
                continue; // nothing scraped for this province: do not send an empty batch
            }
            // Insert province by province.
            int res = cityMapper.batchInsert(list);
            System.out.println(res);
        }
    }

    /**
     * Scrapes the page of one city and returns the counties listed on it.
     *
     * <p>Each entry has the form {@code code*name}. When the page has county rows, the code is
     * the first six characters of the first cell. When it has none, the town rows are used
     * instead with a nine-character code (e.g. Danzhou in Hainan has only four levels and no
     * county level).
     *
     * <p>The method name keeps its original spelling ("Countries") so existing callers still
     * compile; it returns counties.
     *
     * @param cityCode page path of the city below the 2020 directory, without extension,
     *                 e.g. {@code "46/4604"}
     * @return one {@code code*name} string per row, in page order; empty when the page could
     *         not be fetched
     */
    public List<String> getCountriesByCity(String cityCode) {
        Document page = connect("http://localhost:8080/2020/" + cityCode + ".html");
        if (page == null) {
            // connect() has already reported the failure; return nothing instead of hitting an NPE.
            return new ArrayList<>();
        }
        Elements countyRows = page.select("tr.countytr");
        if (!countyRows.isEmpty()) {
            return codeNamePairs(countyRows, 6);
        }
        // No county level on this page: fall back to the town rows.
        return codeNamePairs(page.select("tr.towntr"), 9);
    }

    /**
     * Turns table rows into {@code code*name} strings, cutting the code to {@code codeLength}
     * characters; rows without two cells or with a shorter code are skipped.
     */
    private static List<String> codeNamePairs(Elements rows, int codeLength) {
        List<String> res = new ArrayList<>(rows.size());
        for (Element row : rows) {
            Elements cells = row.select("td");
            if (cells.size() < 2) {
                continue;
            }
            // Read the cells separately instead of splitting their combined text on a space,
            // so a name that itself contains a space is not truncated.
            String code = cells.get(0).text();
            String name = cells.get(1).text();
            if (code.length() < codeLength) {
                continue;
            }
            res.add(code.substring(0, codeLength) + "*" + name);
        }
        return res;
    }

    /**
     * Prints every entry scraped for the city page "46/4604", one per line.
     */
    @Test
    void testGetCountiesByProvince() {
        for (Object entry : getCountriesByCity("46/4604")) {
            System.out.println(entry);
        }
    }

    /**
     * County mapper, injected via {@code @Resource}.
     * NOTE(review): presumably used by insertCountry to store the scraped counties; confirm
     * against that method's full body.
     */
    @Resource
    private CountryMapper countryMapper;

    @Test
    void insertCountry() {
        List pList = getProvinces();
        for (int i = 0; i             
关注
打赏
1688896170
查看更多评论
0.1236s