代码拉取完成,页面将自动刷新
__author__ = 'Wayne'
import urllib.request
import os
import re
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    One or more leading slashes (for example a protocol-relative
    ``//host/path`` address) are replaced with ``http://`` before the
    request is built. A browser-like ``User-Agent`` header is attached to
    the request.

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body.

    Any exception raised by ``urllib.request.urlopen`` propagates to the
    caller; nothing is caught here.
    """
    url = re.sub(r'^//*', "http://", url)
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page(url):
    """Return the current comment-page number shown on the page at *url*.

    The page is fetched with ``url_open``, decoded as UTF-8 and searched for
    the ``<span class="current-comment-page">[N]</span>`` marker; ``N`` is
    returned as an integer.

    Args:
        url: Address of the page to inspect.

    Returns:
        The page number found inside the marker.

    Raises:
        IndexError: If the marker is not present in the page.
    """
    html = url_open(url).decode('utf-8')
    # Regular expression that locates the page number. Any number of digits
    # is accepted so the match does not break when the counter is not
    # exactly four digits long.
    pattern = r'<span class="current-comment-page">\[(\d+)\]</span>'
    match = re.search(pattern, html)
    if match is None:
        # IndexError is what indexing an empty findall() result raised
        # before; the type is kept and only the message is made useful.
        raise IndexError(
            "current-comment-page marker not found in page: %s" % url)
    return int(match.group(1))
def find_imgs(page_url):
    """Return the ``.jpg`` image addresses referenced by the page at *page_url*.

    Args:
        page_url: Page address, with or without a URL scheme. When no scheme
            is present, ``http://`` is prepended.

    Returns:
        A list with the ``src`` value of every ``<img src="...jpg"`` tag
        found in the page, in document order.
    """
    # Only add a scheme when one is missing; prepending unconditionally
    # turned an already complete "http://host/..." into "http://http://host/...".
    if not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', page_url):
        page_url = "http://" + page_url.lstrip('/')
    pattern = r'<img src="(.*?\.jpg)"'
    html = url_open(page_url).decode('utf-8')
    return re.findall(pattern, html)
def save_imgs(img_addrs, page_num, folder):
    """Download every address in *img_addrs* into a directory named after *page_num*.

    The directory ``str(page_num)`` is created under the current working
    directory (an existing directory is reused) and becomes the new working
    directory. Each image is fetched with ``url_open`` and written under the
    last path component of its address.

    Args:
        img_addrs: Iterable of image addresses.
        page_num: Page number; its string form is the directory name.
        folder: Unused; kept so existing callers keep working.

    Note:
        The working directory is left inside the page directory on return;
        callers are responsible for changing back.
    """
    page_dir = str(page_num)
    # exist_ok lets a rerun continue instead of failing with FileExistsError.
    os.makedirs(page_dir, exist_ok=True)
    os.chdir(page_dir)
    for addr in img_addrs:
        filename = addr.split('/')[-1]
        image = url_open(addr)
        # The with-statement closes the file; no explicit close() is needed.
        with open(filename, 'wb') as f:
            f.write(image)
def download_mm(folder='ooxx', pages=10):
    """Download the images of the newest *pages* comment pages into *folder*.

    The newest page number is read with ``get_page``; then, counting down
    one page at a time, the image addresses of each page are collected with
    ``find_imgs`` and stored by ``save_imgs`` in a per-page sub-directory.

    Args:
        folder: Directory that receives the per-page sub-directories. It is
            created if needed and becomes the working directory.
        pages: Number of consecutive pages to download, newest first.
    """
    # Create the target folder (reusing an existing one) and move into it.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    folder_top = os.getcwd()  # remembered so each page starts from here
    site = 'jandan.net/ooxx/'
    url = 'http://' + site
    newest_page = get_page(url)  # newest page number on the site
    for offset in range(pages):
        # Subtract the offset from the fixed newest page. Decrementing a
        # running value by the loop index skipped pages (N, N-1, N-3, N-6, ...).
        page_num = newest_page - offset
        if page_num < 1:
            # Page numbers below 1 are assumed not to exist; stop early.
            break
        # find_imgs adds the "http://" scheme itself, so the address is
        # passed without one to avoid a doubled scheme.
        page_url = site + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)  # image addresses on this page
        save_imgs(img_addrs, page_num, folder)  # store the images
        # save_imgs leaves the working directory inside the page directory;
        # return to the top folder before handling the next page.
        os.chdir(folder_top)
if __name__ == '__main__':
    # Interactive entry point: ask for the target folder and the page count,
    # then run the download.
    folder = input("Please enter a folder(default is 'ooxx'): ").strip()
    pages = input("How many pages do you wan to download(default is 10): ").strip()
    # Honour the defaults advertised in the prompts: an empty answer used to
    # reach os.mkdir('') and int(''), both of which raise.
    download_mm(folder or 'ooxx', int(pages) if pages else 10)
    # "pause" is a Windows shell command that keeps the console window open;
    # on other systems it only produces a "command not found" message.
    if os.name == 'nt':
        os.system("pause")
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。