#!/usr/bin/python3
# coding: utf-8
"""
__title__ = "my crawler"
__author__ = "Hans"
__mtime__ = "2017/3/30 0030"
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━┓
┃ May the beast ┣┓
┃ ward off bugs! ┏┛
┗┓┓┏━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
from bs4 import BeautifulSoup
import socket
import urllib.parse
import time
from sqlalchemy import Column, String, create_engine, Integer
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
import threading
# Create the declarative base class for ORM models
Base = declarative_base()
# Define the Pic model: one row per image URL discovered
class Pic(Base):
    __tablename__ = 'pic'
    id = Column(Integer, primary_key=True)
    site = Column(String(100))   # hostname the image was found on
    pic = Column(String(1000))   # image URL as it appears on the page
# Initialize the database connection (assumes a local MySQL server with a
# `crawler` database, as in the URL below)
engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/crawler')
# Create the session factory; scoped_session gives each worker thread its
# own session, since a plain Session must not be shared across threads
DBsession = sessionmaker(bind=engine)
session = scoped_session(DBsession)
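# If the `pic` table has not been created yet, create_all() will do it here;
# it is a no-op when the table already exists. This assumes the `crawler`
# database itself has been created beforehand.
Base.metadata.create_all(engine)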
url_seen = set(['/'])  # pages already crawled
url_todo = set(['/'])  # pages queued for crawling
img_set = set()        # image URLs collected so far
site = 'www.yangqq.com'
def fetch(url, site=site):
    """Fetch one path from the target site over a raw socket and parse it."""
    print(url)
    sock = socket.socket()
    sock.connect((site, 80))
    # HTTP/1.0 without keep-alive: the server closes the connection once the
    # body is done, so recv() below eventually returns b''.
    get = ('GET {} HTTP/1.0\r\nHost: {}\r\n'
           'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
           'AppleWebKit/537.36 (KHTML, like Gecko) '
           'Chrome/57.0.2987.110 Safari/537.36\r\n\r\n').format(url, site)
    sock.sendall(get.encode('utf-8'))
    response = b''
    while True:
        chunk = sock.recv(4096)
        if not chunk:  # connection closed by the server: response complete
            break
        response += chunk
    sock.close()
    # response now holds the full server reply; split the headers from the body
    try:
        header, body = response.split(b'\r\n\r\n', 1)
    except ValueError:  # malformed reply without a header/body separator
        url_seen.add(url)
        url_todo.discard(url)
        return
    # Build a soup object from the body and collect every link on the page
    soup = BeautifulSoup(body, 'lxml')
    links = soup.find_all('a')
    for link in links:
        f_url = link.get('href')
        if f_url is None:  # an <a> tag without href: skip the link, not the page
            continue
        f_url = urllib.parse.urljoin(url, f_url)
        # Parse the resolved URL
        p_url = urllib.parse.urlparse(f_url)
        # Skip non-HTTP(S) schemes such as mailto: or javascript:
        if p_url.scheme not in ('', 'http', 'https'):
            continue
        # Skip links that point to other sites
        if p_url.netloc not in ('', site):
            continue
        # Keep only the path component
        c_url = p_url.path
        # Queue the path unless it has already been crawled
        if c_url not in url_seen:
            url_todo.add(c_url)
    # Collect every image URL on the page
    images = soup.find_all('img')
    for img in images:
        img_url = img.get('src')
        if img_url is None:  # an <img> tag without src: skip it
            continue
        # Add the URL straight to img_set
        img_set.add(img_url)
        # Record the image in the database unless it is already there
        try:
            result = session.query(Pic).filter(Pic.pic == img_url).first()
            if not result:
                session.add(Pic(site=site, pic=img_url))
                session.commit()
        except Exception:
            session.rollback()  # keep the session usable after a failed insert
    # Finally mark this URL as seen and drop it from the todo set. discard()
    # (rather than remove()) avoids a KeyError when two threads happen to
    # fetch the same URL concurrently.
    url_seen.add(url)
    url_todo.discard(url)
if __name__ == '__main__':
    start = time.time()
    # Crawl the front page first so url_todo is populated before the worker
    # threads start
    fetch('/')

    def run_fetch(url_todo=url_todo):
        # Drain the todo set. The shared sets are only loosely synchronized,
        # so two threads may occasionally fetch the same page; discard() in
        # fetch() keeps that from raising.
        while url_todo:
            try:
                url = next(iter(url_todo))
            except (StopIteration, RuntimeError):
                continue  # set emptied or resized by another thread; re-check
            fetch(url)

    threads = [threading.Thread(target=run_fetch) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    session.close()
    print('Crawled {} pages in {:.1f} s'.format(len(url_seen), time.time() - start))
    print('-' * 30)
    print('Collected {} images in total'.format(len(img_set)))
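# Note: this script assumes a reachable local MySQL server. One way to create
# the database it expects, matching the connection string above (the table
# itself is created by Base.metadata.create_all):
#
#   CREATE DATABASE crawler CHARACTER SET utf8mb4;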