mr_nobody/py_crawler
crawler2_urllib_coroutine.py 3.95 KB
mr_nobody committed on 2017-04-16 21:27 . coroutine crawler2
#!/usr/bin/python3
# coding: utf-8
"""
__title__ = "my crawler"
__author__ = "Hans"
__mtime__ = "2017/3/30 0030"
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
            ┏┓      ┏┓
          ┏┛┻━━━━━━┛┻┓
          ┃    ☃     ┃
          ┃  ┳┛  ┗┳  ┃
          ┃    ┻     ┃
          ┗━┓      ┏━┛
            ┃      ┗━━━┓
            ┃  divine beast, bless:  ┣┓
            ┃  no bugs, ever!        ┏┛
            ┗┓┓┏━━━┳┓┏┛
             ┃┫┫   ┃┫┫
             ┗┻┛   ┗┻┛
"""
from bs4 import BeautifulSoup
import socket
import urllib.parse
import time
from sqlalchemy import Column, String, create_engine, Integer
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from urllib import request
import asyncio
# Base class for all ORM model objects
Base = declarative_base()
# Pic model: one crawled image URL per row
class Pic(Base):
    __tablename__ = 'pic'
    id = Column(Integer, primary_key=True)
    site = Column(String(100))
    pic = Column(String(1000))
# Initialise the database connection (MySQL via pymysql);
# the 'crawler' database and 'pic' table must already exist
# (see the setup sketch after the script)
engine = create_engine('mysql+pymysql://root:root@localhost:3306/crawler')
# Create the session factory and a session bound to this engine
DBsession = sessionmaker(bind=engine)
session = DBsession()
url_seen = set(['/'])  # pages already crawled
url_todo = set(['/'])  # pages still to crawl
img_set = set([])      # image URLs collected so far
site = 'www.yangqq.com'
async def get_response(query):
    # NOTE: urllib's urlopen is a blocking call; wrapping it in a coroutine
    # does not make it non-blocking, it only lets it be awaited.
    return request.urlopen(query)
async def fetch(url, flag, site=site):
    print(url, flag)
    query = ('http', site, url, '', '', '')
    query = urllib.parse.urlunparse(query)  # assemble the parts into a full URL
    try:
        # Unlike the raw-socket version, the response object here lets the
        # body be read separately with response.read()
        response = await get_response(query)
    except Exception:
        url_seen.add(url)
        url_todo.remove(url)
        return
    body = response.read()
    # Build a soup object from the response body
    soup = BeautifulSoup(body, 'html.parser')
    # Collect every link on the current page that points back to this site
    links = soup.find_all('a')
    for link in links:
        f_url = link.get('href')
        if not f_url:
            # skip <a> tags without an href attribute
            continue
        f_url = urllib.parse.urljoin(url, f_url)
        # Parse the URL into its components
        p_url = urllib.parse.urlparse(f_url)
        # Skip anything that is not an http(s) request
        if p_url.scheme not in ('', 'http', 'https'):
            continue
        # Skip links that leave this site
        if p_url.netloc not in ('', site):
            continue
        # Keep only the path part of the link
        c_url = p_url.path
        # Queue the path if it has not been crawled yet
        if c_url not in url_seen:
            url_todo.add(c_url)
    # Collect every image link on this page
    images = soup.find_all('img')
    for img in images:
        img_url = img.get('src')
        if not img_url:
            continue
        # Remember the image URL locally
        img_set.add(img_url)
        # Store the image URL in the database if it is not there yet
        try:
            result = session.query(Pic).filter(Pic.pic == img_url).first()
            if not result:
                session.add(Pic(site=site, pic=img_url))
                session.commit()
        except Exception:
            session.rollback()
    # Finally mark the current url as seen and drop it from the todo set
    url_seen.add(url)
    url_todo.remove(url)
if __name__ == '__main__':
    async def main(flag):
        while url_todo:
            await fetch(list(url_todo)[0], flag)

    start = time.time()
    loop = asyncio.get_event_loop()  # get the event loop
    # Crawl the start page first so url_todo gets populated
    loop.run_until_complete(fetch('/', 'init'))
    tasks = [main('one'), main('two'), main('three'), main('four')]
    loop.run_until_complete(asyncio.gather(*tasks))  # run the coroutines
    loop.close()
    session.close()
    print('Crawled {} pages in {:.1f} s'.format(len(url_seen), time.time() - start))
    print('-' * 30)
    print('Collected {} images in total'.format(len(img_set)))
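
The script assumes that a MySQL database named crawler already exists and that it contains a pic table matching the Pic model; nothing in the crawler creates either one. The sketch below is one way to create the table before the first run (the module name and credentials are taken from the script above and are assumptions, adjust them to your setup):

# setup_db.py -- minimal one-off setup sketch, assuming the 'crawler' database already exists
from sqlalchemy import create_engine
from crawler2_urllib_coroutine import Base  # module name assumed to match the file above

engine = create_engine('mysql+pymysql://root:root@localhost:3306/crawler')
Base.metadata.create_all(engine)  # creates the 'pic' table declared by the Pic model

Note also that request.urlopen blocks the event loop, so the four main() coroutines end up crawling one page at a time; delegating the call to a thread pool, e.g. via loop.run_in_executor, would be one way to get real concurrency without restructuring the rest of the script.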