# Repository: csdn-resource-library (branch: master)
# File: 大学地域排名.py

import requests
from lxml import etree
import xlwt


# Crawler function
def spider(url, datas, timeout=10):
    """Scrape one ranking page and append its table rows to ``datas``.

    The page at ``url`` is downloaded, decoded as UTF-8 and parsed with lxml.
    Every ``<tr>`` matched by the fixed XPath below contributes one record
    built from the text of cell 1 (ranking), the link text inside cell 2
    (university name), cell 3 (score) and cell 4 (province or municipality).

    Args:
        url: Address of the page to download.
        datas: List extended in place; each appended element is a list
            ``[ranking, name, score, area]`` of ``str`` values.
        timeout: Seconds ``requests`` waits for the server before raising.
            Without a timeout a stalled connection would block forever.

    Returns:
        None. Results are delivered through ``datas``.
    """
    response = requests.get(url, timeout=timeout)
    response.encoding = 'utf-8'
    selector = etree.HTML(response.text)
    if selector is None:
        # Nothing parseable came back, so there are no rows to collect.
        return

    rows = selector.xpath("/html/body/main/div/div[2]/div/table/tbody/tr")

    # Relative XPaths in output order: ranking, name, score, area.
    cell_paths = (
        "./td[1]/text()",
        "./td[2]/a/text()",
        "./td[3]/text()",
        "./td[4]/text()",
    )
    for row in rows:
        cells = [row.xpath(path) for path in cell_paths]
        # A row lacking any of the four cells (e.g. a header row) would make
        # the ``[0]`` lookup raise IndexError; skip it instead of aborting
        # the whole crawl.
        if not all(cells):
            continue
        # str() turns lxml's string results into plain ``str`` objects.
        datas.append([str(cell[0]) for cell in cells])


# Store the raw scraped data in an xls file
def save_excel(datas):
    """Write the scraped records to the workbook '大学地域排名.xls'.

    Row 0 of the sheet "mysheet" holds the column titles; each record in
    ``datas`` is written to the following rows, one value per column. Every
    record and every column index is echoed to stdout while writing.

    Args:
        datas: Sequence of records, each an indexable sequence of cell
            values in the order ranking, name, score, area.
    """
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet("mysheet", cell_overwrite_ok=True)

    # Header row.
    for column, title in enumerate(("ranking", "name", "score", "area")):
        worksheet.write(0, column, title)

    # Data starts on the second row (index 1).
    for row_number, record in enumerate(datas, start=1):
        print(record)
        for column, value in enumerate(record):
            print(column)
            worksheet.write(row_number, column, value)

    print("大学地域排名写入完成\n")
    workbook.save('大学地域排名.xls')


# Save the merged data locally
def save_txt(datas):
    """Join scraped rankings with coordinates and export them as txt and xls.

    Step 1 reads '研究院校经纬度.txt' line by line. Each line is split on
    commas (per the original author's note it lists every institution's name
    and its longitude/latitude). For the first record in ``datas`` whose
    university name equals one of the line's fields, the record's ranking,
    score and area are appended to the line, and the joined line is printed
    and written to 'finally.txt'. Lines with no matching record are dropped.

    Step 2 reads 'finally.txt' back and copies it into 'finally.xls'
    (sheet "mysheet") under the headers name, jingdu, weidu, ranking,
    score, area.

    Args:
        datas: Sequence of records ``[ranking, name, score, area]``.
    """
    # ``with`` guarantees both files are closed (and the output flushed)
    # even if reading, matching or writing raises part-way through.
    with open('研究院校经纬度.txt', 'r', encoding='utf-8') as source, \
            open('finally.txt', 'w', encoding='utf-8') as target:
        for raw_line in source:
            # Drop the trailing newline, then split into comma-separated fields.
            fields = raw_line.strip('\n').split(',')
            for data in datas:
                if data[1] in fields:  # data[1] is the university name
                    fields.append(data[0])  # ranking
                    fields.append(data[2])  # score
                    fields.append(data[3])  # area
                    print(fields)
                    target.write(','.join(fields) + "\n")
                    # One match per line is enough; move on to the next line.
                    break

    # Copy finally.txt into an Excel sheet so it is easier to work with.
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("mysheet", cell_overwrite_ok=True)
    headers = ("name", "jingdu", "weidu", "ranking", "score", "area")
    for column, title in enumerate(headers):
        sheet.write(0, column, title)

    with open('finally.txt', 'r', encoding='utf-8') as merged:
        # Row 0 holds the headers, so data rows start at index 1.
        for row_number, info in enumerate(merged, start=1):
            for column, value in enumerate(info.strip('\n').split(',')):
                sheet.write(row_number, column, value)

    print("finally.xls入完成\n")
    book.save('finally.xls')


def main():
    """Crawl ranking pages 1 through 31 and export the merged result.

    Each page URL is the base address followed by the page index (the value
    of the ``DDLThird`` query parameter; presumably one index per region —
    verify against the site). All rows are accumulated in a single list that
    ``spider`` extends in place, then handed to ``save_txt``.
    """
    base_url = "http://www.nseac.com/eva/GEEA.php?DDLyear=2020&DDLThird="
    page_urls = [base_url + str(index) for index in range(1, 32)]

    datas = []  # shared accumulator filled by spider()
    for finished, page_url in enumerate(page_urls, start=1):
        spider(page_url, datas)
        print("当前进度为{}/31\n".format(finished))
    print("完成所有爬取任务\n")

    # save_excel(datas) is available for dumping the raw scrape; it is
    # intentionally not called here.
    save_txt(datas)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()