# 2021年中国省份、城市及街道数据抓取（未包括我国台湾省、香港特别行政区和澳门特别行政区）
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
@ Author :Lan
@ Blog :www.lanol.cn
@ Date : 2022/1/12
@ Description:I'm in charge of my Code
-------------------------------------------------
"""
import time
import requests
import parsel
base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'

# Seconds to wait for one HTTP response before the attempt counts as failed.
REQUEST_TIMEOUT = 5
# How many times a page is requested before the error is allowed to propagate.
MAX_ATTEMPTS = 3


def province_directory(root_url, province_href):
    """Return the directory URL that belongs to a province index link.

    The province link on the index page looks like ``11.html``; the pages
    below that province live under ``11/``.  County hrefs found on a city
    page are appended to this directory URL to form town-page URLs.

    Args:
        root_url: Base URL of the yearly listing, ending with ``/``.
        province_href: The ``href`` of the province link, e.g. ``11.html``.

    Returns:
        ``root_url`` joined with the href where ``.html`` is replaced by ``/``.
    """
    return root_url + province_href.replace('.html', '/')


def fetch_selector(url, timeout=REQUEST_TIMEOUT, attempts=MAX_ATTEMPTS):
    """Download *url* and return the body parsed as a ``parsel.Selector``.

    Every request carries a timeout so a stalled connection cannot hang the
    crawl, and network errors are retried a bounded number of times.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.
        attempts: Maximum number of tries; values below 1 are treated as 1.

    Returns:
        A ``parsel.Selector`` built from the response text decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: If the last attempt also fails.
    """
    attempts = max(1, attempts)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            if attempt == attempts:
                raise
            # Short, growing pause before the next try.
            time.sleep(attempt)
            continue
        # The script always decodes these pages as UTF-8.
        response.encoding = 'utf-8'
        return parsel.Selector(response.text)


def scrape_towns(town_url):
    """Return the list of town names found on the county page at *town_url*."""
    town_page = fetch_selector(town_url)
    return town_page.xpath("//tr[@class='towntr']/td[2]//text()").extract()


def scrape_counties(city_page, province_url, province_name, city_name):
    """Collect the counties of one city together with their towns.

    Args:
        city_page: ``parsel.Selector`` of the city page.
        province_url: Directory URL of the province (see
            ``province_directory``); county hrefs are appended to it.
        province_name: Province name, used only for progress output.
        city_name: City name, used only for progress output.

    Returns:
        A dict mapping county name to the list of its town names.  A county
        entry without a link keeps an empty dict as its value, exactly as the
        original script stored it.
    """
    counties = {}
    # Iterate over the child elements of the name cell of each county row.
    for county in city_page.xpath("//tr[@class='countytr']/td[2]//*"):
        county_name = county.xpath('text()').extract_first()
        print(province_name, city_name, county_name)
        counties[county_name] = {}
        county_href = county.xpath('@href').extract_first()
        if not county_href:
            # No link on this entry, so there is no town page to fetch.
            continue
        towns = scrape_towns(province_url + county_href)
        print(province_name, city_name, county_name, towns)
        counties[county_name] = towns
    return counties


def scrape_china(root_url=base_url):
    """Crawl provinces, cities, counties and towns starting at *root_url*.

    Args:
        root_url: Base URL of the listing; ``index.html`` is fetched below it.

    Returns:
        A nested dict ``{province: {city: {county: [town, ...]}}}``.

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded
            after all retry attempts.
    """
    result = {}
    index_page = fetch_selector(root_url + 'index.html')
    # Fetch every province link and walk through them.
    for province in index_page.xpath('//td/a'):
        province_name = province.xpath('text()').extract_first()
        province_href = province.xpath('@href').extract_first()
        if not province_href:
            # An anchor without a target cannot be followed.
            continue
        print(province_name)
        cities = result[province_name] = {}
        province_url = province_directory(root_url, province_href)
        province_page = fetch_selector(root_url + province_href)
        # Cities below the province.
        for city in province_page.xpath("//tr[@class='citytr']/td[2]"):
            city_name = city.xpath('a/text()').extract_first()
            city_href = city.xpath('a/@href').extract_first()
            print(province_name, city_name)
            cities[city_name] = {}
            if not city_href:
                continue
            city_page = fetch_selector(root_url + city_href)
            # Counties (and their towns) below the city.
            cities[city_name] = scrape_counties(
                city_page, province_url, province_name, city_name
            )
    return result


if __name__ == '__main__':
    china = scrape_china()
    print(china)
# 评论 (0)