# (page artifact from the code-hosting site, not part of the script: "code pull complete, the page will refresh automatically")
#!/usr/bin/python3
# coding: utf-8
"""
__title__ = "my crawler"
__author__ = "Hans"
__mtime__ = "2017/3/30 0030"
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━┓
┃ 神兽保佑 ┣┓
┃ 永无BUG! ┏┛
┗┓┓┏━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
from bs4 import BeautifulSoup
import socket
import urllib.parse
site = 'www.quanjing.com'

# Fetch the site's front page over a raw HTTP/1.0 socket.  HTTP/1.0
# makes the server close the connection when the response is complete,
# so recv() returning b'' reliably marks end-of-stream.
get = 'GET / HTTP/1.0\r\nHost: {}\r\n\r\n'.format(site)
response = b''
sock = socket.socket()
try:
    sock.connect((site, 80))
    # sendall() loops until every byte is written; plain send() may
    # transmit only part of the request and silently drop the rest.
    sock.sendall(get.encode('utf-8'))
    while True:
        chunk = sock.recv(4096)
        if not chunk:
            break
        response += chunk
finally:
    # Always release the socket, even if connect()/recv() raises.
    sock.close()
# Split the raw response into status-line+headers and the HTML body.
header, body = response.split(b'\r\n\r\n', 1)
# Parse the HTML body and print every image URL found in it.
soup = BeautifulSoup(body, 'lxml')
imgs = soup.find_all('img')
print(len(imgs))
for img in imgs:
    # .get() tolerates <img> tags that lack a src attribute instead of
    # raising KeyError; unquote() decodes %xx escapes for readability.
    src = img.get('src')
    if src:
        print(urllib.parse.unquote(src))
# Print every hyperlink target, percent-decoded.
links = soup.find_all('a')
print('-' * 100)
for link in links:
    # Some <a> tags (named anchors) carry no href; skip those explicitly
    # rather than swallowing every possible error with a bare except,
    # which would also hide genuine bugs.
    href = link.get('href')
    if href is not None:
        print(urllib.parse.unquote(href))
# (page artifact from the code-hosting site, not part of the script: a
# content-moderation notice about potentially inappropriate content and
# the appeal process; left commented out so the file remains valid Python)