1 Star 2 Fork 1

Sakura1609/b站视频爬虫

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
bilibili.py 3.55 KB
一键复制 编辑 原始数据 按行查看 历史
"""
爬取bilibili
"""
import pickle
import time
import random
import requests
import re
import json
# Headers sent with every request. The desktop-browser User-Agent reduces
# the chance of being served a bot/anti-crawler page; 'Connection: close'
# avoids holding sockets open across the many sequential requests.
custom_headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'close'
}
# Crawl state shared by main()/width(); restored from pickle checkpoints
# at startup and dumped again in the __main__ block on exit.
queue = set()     # frontier: BV ids waiting to be crawled
passList = set()  # BV ids already visited (or enqueued as the seed)
count = 0         # number of records written so far
def get_dict(bvid: str) -> dict:
    """
    Fetch one video page and return its embedded state dict.

    Raises ValueError (rather than returning None) when the HTTP request
    fails or the expected JavaScript variable cannot be found.

    :param bvid: the video's BV id
    :return: the parsed ``window.__INITIAL_STATE__`` dict
    """
    url = f"https://www.bilibili.com/video/{bvid}"
    response = requests.get(url=url, headers=custom_headers)
    # BUG FIX: the original built these ValueError objects without `raise`,
    # so failures fell through and crashed later in re.search/json.loads
    # with a misleading error instead of the intended one.
    if response.status_code != 200:
        raise ValueError(f"get html failed, status_code: {response.status_code}")
    # Match the JS dict literal assigned to window.__INITIAL_STATE__;
    # DOTALL lets `.` span newlines inside the inlined JSON.
    match = re.search(r"<script>window.__INITIAL_STATE__=(?P<dict>{.*?});",
                      response.text, re.MULTILINE | re.DOTALL)
    if match is None:
        raise ValueError("dict not found")
    # The matched substring is valid JSON; parse it into a Python dict.
    dic = json.loads(match.group('dict'))
    # Random 0-0.5s sleep to be polite to the server between requests.
    time.sleep(random.random() / 2)
    return dic
def produce_json(dic: dict) -> dict:
    """
    Extract the fields we persist (BV id, title, stat counters, tag names)
    from a raw __INITIAL_STATE__ dict.

    On any missing key the whole record is replaced by an empty placeholder
    so one malformed page does not abort the crawl.

    :param dic: dict returned by get_dict()
    :return: {'bvid': str, 'title': str, 'state': dict | str, 'tags': list[str]}
    """
    try:
        data = {
            'bvid': dic['bvid'],
            'title': dic['videoData']['title'],
            'state': dic['videoData']['stat'],
            # comprehension replaces the original append loop
            'tags': [tag['tag_name'] for tag in dic['tags']],
        }
    except KeyError as e:
        # Best-effort degradation, but say WHICH key was missing so bad
        # pages can be diagnosed (the original printed no detail).
        print(f"produce_json failed: missing key {e}")
        data = {
            'bvid': '',
            'title': '',
            'state': '',
            'tags': [],
        }
    # (dead commented-out 'related' extraction removed; see get_related())
    return data
def get_related(dic: dict) -> list:
    """
    Collect the BV ids listed in the page's "related videos" panel.

    BUG FIX: the original annotation said ``-> set`` but a list was always
    returned; callers wrap the result in set() themselves, so the honest
    return type is list. A partially-built list is still returned if a
    KeyError occurs midway (original behavior preserved).

    :param dic: dict returned by get_dict()
    :return: list of BV id strings (possibly empty)
    """
    bv_list = []
    try:
        for item in dic['related']:
            bv_list.append(item['bvid'])
    except KeyError:
        # Missing 'related' (or a malformed entry): keep what we have.
        print("get_related failed")
    return bv_list
# Breadth-first traversal of the "related videos" graph.
def width(bvid: str, passList: set, queue: set, count: int):
    """
    Crawl videos breadth-first starting from *bvid*, appending one JSON
    record per video to result_2.txt, until the frontier empties or
    *count* reaches 300000.

    NOTE(review): *count* is an int parameter, so the increments below are
    local to this call — the module-level counter checkpointed by __main__
    never advances. Verify whether the persisted count should track progress.

    :param bvid: seed BV id for a fresh crawl
    :param passList: visited set, mutated in place
    :param queue: frontier set, mutated in place
    :param count: records written so far (resume offset)
    """
    # Seed the frontier only on a fresh crawl (a restored queue is non-empty).
    if len(queue) == 0:
        queue.add(bvid)
        passList.add(bvid)
    # Append mode so records accumulate across restarts; each record is
    # UTF-8 JSON followed by ",\n".
    with open('result_2.txt', 'ab') as f:
        while queue and count < 300000:
            bvid = queue.pop()
            passList.add(bvid)
            dic = get_dict(bvid)
            f.write(json.dumps(produce_json(dic), ensure_ascii=False).encode('utf-8'))
            f.write(',\n'.encode('utf-8'))
            count = count + 1
            # Enqueue only related videos not yet visited.
            child = set(get_related(dic)).difference(passList)
            if len(child) != 0:
                queue.update(child)
def main():
    """
    Restore crawl state (visited set, frontier, counter) from pickle
    checkpoints, then resume the breadth-first crawl from a seed BV id.
    """
    # BUG FIX: without `global`, `count = pickle.load(...)` bound a local,
    # so the module-level counter that __main__ checkpoints on exit stayed
    # at 0 forever and the saved count.dat regressed on every run.
    global count
    # NOTE(review): pickle.load is only safe here because these checkpoint
    # files are produced by this script itself — never load untrusted pickles.
    with open('passList.dat', 'rb') as f1:
        passList.update(pickle.load(f1))
    with open('queue.dat', 'rb') as f2:
        queue.update(pickle.load(f2))
    with open('count.dat', 'rb') as f3:
        count = pickle.load(f3)
    source = "BV1JB4y1s7Dk"
    width(source, passList, queue, count)
if __name__ == '__main__':
    try:
        main()
    # BUG FIX: the original bare `except:` swallowed every error with only
    # a typo'd message, hiding real bugs. BaseException keeps the original
    # behavior of also catching Ctrl+C (KeyboardInterrupt) while reporting
    # what actually stopped the crawl.
    except BaseException as exc:
        print(f"interrupted: {exc!r}")
    finally:
        # Checkpoint crawl state so the next run resumes where this stopped
        # (runs on success, error, and Ctrl+C alike — same as the original,
        # where the dumps followed the try/except unconditionally).
        with open('passList.dat', 'wb') as f1:
            pickle.dump(passList, f1)
        with open('queue.dat', 'wb') as f2:
            pickle.dump(queue, f2)
        with open('count.dat', 'wb') as f3:
            pickle.dump(count, f3)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/xianyun1609/b-station-video-crawler.git
[email protected]:xianyun1609/b-station-video-crawler.git
xianyun1609
b-station-video-crawler
b站视频爬虫
master

搜索帮助