# Build the first-level URL from the city name
# Fetch the first-level URL
# Parse the city's bus categories, pick the category type we want, and build the second-level URLs
# Fetch each second-level URL
# Parse every bus line under that category and build the third-level URLs
# Fetch each third-level URL
# Parse the line's data for both directions of the route
# Repeat the second-level -> third-level steps until all data has been crawled
import csv

import requests
from bs4 import BeautifulSoup

# One shared header dict instead of repeating the User-Agent on every request
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 '
                  'SLBrowser/8.0.1.4031 SLBChan/30'
}

url = 'https://nanchang.8684.cn/'
response = requests.get(url=url, headers=HEADERS, timeout=10)
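# Added safeguard (not in the original script): fail fast on an HTTP error
# status instead of trying to parse an error page.
response.raise_for_status()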
"""获取数据并解析"""
soup = BeautifulSoup(response.text, 'lxml')
soup_buslayer = soup.find('div', class_='bus-layer depth w120')
# Parse the category data
dic_result = {}
soup_buslist = soup_buslayer.find_all('div', class_='pl10')
for soup_bus in soup_buslist:
    name = soup_bus.find('span', class_='kt').get_text()
    if '以数字开头' in name:  # the "lines starting with a digit" category
        soup_a_list = soup_bus.find('div', class_='list')
        for soup_a in soup_a_list.find_all('a'):
            text = soup_a.get_text()
            href = soup_a.get('href')
            dic_result[text] = "https://nanchang.8684.cn" + href
print(dic_result)
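# dic_result now maps each digit prefix to its category-listing URL,
# roughly {'1': 'https://nanchang.8684.cn/<category-path>', ...}
# (illustrative shape; the real paths come from the hrefs scraped above).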
bus_arr = []
for key, value in dic_result.items():
    print('key: ', key, 'value: ', value)
    response = requests.get(url=value, headers=HEADERS, timeout=10)
    # Fetch the data and parse it
    soup = BeautifulSoup(response.content, 'lxml')
    # Detailed line list for this category
    soup_buslist = soup.find('div', class_='list clearfix')
    for soup_a in soup_buslist.find_all('a'):
        text = soup_a.get_text()
        href = soup_a.get('href')
        title = soup_a.get('title')
        bus_arr.append([title, text, "https://nanchang.8684.cn" + href])
print(bus_arr)
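# bus_arr holds one [title, line name, detail URL] triple per route.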
bus_des = []
for value in bus_arr:
    bus_name = value[0]
    # print('line name: ', value[0], 'url: ', value[2])
    response = requests.get(url=value[2], headers=HEADERS, timeout=10)
    # Fetch the data and parse it
    soup = BeautifulSoup(response.text, 'lxml')
    # Line details: operating hours, fare, operating company
    soup_buslist = soup.find('div', class_='info')
    for soup_ul in soup_buslist.find_all('ul'):
        date = soup_ul.select('li')[0].get_text()
        money = soup_ul.select('li')[1].get_text()
        company = soup_ul.select('li')[2].select('a')[0].get_text()
        bus_des.append({"线路名称": bus_name, "运行时间": date, "票价": money, "公司": company})
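# Note: the positional li-indexes above assume the info block lists operating
# hours, fare, and company in that order; if 8684.cn changes its layout,
# these indexes need updating.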
print(bus_des)
# Create the CSV file
fo = open("news.csv", "w", newline='', encoding='utf-8')
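# Note (my addition): Excel on Windows may render the Chinese headers garbled
# with plain utf-8; encoding='utf-8-sig' writes a BOM that Excel recognizes.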
# Header row; the keys must match the dicts built in bus_des
# (线路名称 = line name, 运行时间 = operating hours, 票价 = fare, 公司 = operator)
header = ["线路名称", "运行时间", "票价", "公司"]
writer = csv.DictWriter(fo, header)
# Write the header row
writer.writeheader()
# Write the collected dicts to the CSV file
writer.writerows(bus_des)
fo.close()
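# Optional hardening, shown as a sketch rather than a change to the script
# above: throttle and retry the requests so the crawl is gentler on 8684.cn.
# The function name, retry count, and delay are my assumptions, not part of
# the original code.
import time

def polite_get(target_url, retries=3, delay=1.0):
    """GET with a short pause before each attempt; returns the Response or raises."""
    for attempt in range(retries):
        time.sleep(delay)  # crude rate limit between requests
        try:
            resp = requests.get(url=target_url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise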