1 Star 0 Fork 0

liarchoc/learngit

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
getImgs.py 1.93 KB
一键复制 编辑 原始数据 按行查看 历史
liarchoc 提交于 2017-08-11 21:55 . 8-11
__author__ = 'Wayne'
import urllib.request
import os
import re
def url_open(url):
    """Fetch *url* and return the raw response body.

    A leading run of slashes (for example a protocol-relative
    ``//host/path`` address) is replaced with ``http://`` before the
    request is made, and the request is sent with a Firefox
    ``User-Agent`` header.

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urllib`` when the request
            cannot be completed.
    """
    url = re.sub(r'^//*', "http://", url)
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    # Close the response deterministically instead of leaking the
    # connection until the garbage collector gets to it.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page(url):
    """Return the current comment-page number shown on the page at *url*.

    The page is downloaded with ``url_open``, decoded as UTF-8, and searched
    for the ``<span class="current-comment-page">[N]</span>`` marker.

    Args:
        url: Address of the listing page.

    Returns:
        The page number ``N`` from the first marker, as an ``int``.

    Raises:
        IndexError: If the page contains no current-comment-page marker.
    """
    html = url_open(url).decode('utf-8')
    # Regular expression that locates the page number. Any number of digits
    # is accepted so the lookup keeps working when the page count is not
    # exactly four digits long.
    pattern = r'<span class="current-comment-page">\[(\d+)\]</span>'
    match = re.search(pattern, html)
    if match is None:
        # Same exception type an empty findall()[0] would raise, but with
        # a message that says what went wrong.
        raise IndexError(
            'no current-comment-page marker found at ' + str(url))
    return int(match.group(1))
def find_imgs(page_url):
    """Return the ``.jpg`` addresses found in ``<img src="...">`` tags.

    Args:
        page_url: Address of the page to scan. It may be given with or
            without a leading ``http://`` / ``https://`` scheme.

    Returns:
        A list with the ``src`` value of every ``<img src="...jpg"`` match,
        in the order the matches occur in the page.
    """
    # Only add the scheme when it is missing; prepending it unconditionally
    # turns an already complete address into "http://http://...".
    if not page_url.startswith(('http://', 'https://')):
        page_url = "http://" + page_url.lstrip('/')
    pattern = r'<img src="(.*?\.jpg)"'
    html = url_open(page_url).decode('utf-8')
    return re.findall(pattern, html)
def save_imgs(img_addrs, page_num, folder):
    """Download every image in *img_addrs* into a directory named *page_num*.

    The directory is created relative to the current working directory (and
    reused if it already exists). Each image is stored under the last
    ``/``-separated component of its address.

    Args:
        img_addrs: Iterable of image addresses to download.
        page_num: Page number; ``str(page_num)`` is the directory name.
        folder: Unused; kept so existing callers keep working.
    """
    page_dir = str(page_num)
    # exist_ok lets a re-run continue instead of failing with FileExistsError.
    os.makedirs(page_dir, exist_ok=True)
    for addr in img_addrs:
        filename = addr.split('/')[-1]
        image = url_open(addr)
        # Write through an explicit path rather than chdir-ing into the
        # directory, so the process working directory is left untouched
        # even if a download fails half-way.
        with open(os.path.join(page_dir, filename), 'wb') as f:
            f.write(image)
def download_mm(folder='ooxx', pages=10):
    """Download the images of the newest *pages* listing pages into *folder*.

    The folder is created if needed and becomes the working directory; each
    page's images are then saved by ``save_imgs`` under a sub-directory named
    after the page number.

    Args:
        folder: Directory that receives the downloads.
        pages: How many pages to fetch, counting down from the newest one.
    """
    os.makedirs(folder, exist_ok=True)  # create the folder (reuse if present)
    os.chdir(folder)  # switch into the folder
    folder_top = os.getcwd()  # remember the download root
    site = 'jandan.net/ooxx/'
    latest_page = get_page('http://' + site)  # newest page number on the site
    for offset in range(pages):
        # Subtract the offset from the fixed newest page. Decrementing a
        # running counter by the loop index would skip pages
        # (N, N-1, N-3, N-6, ...).
        page_num = latest_page - offset
        if page_num < 1:
            break  # ran out of pages
        # find_imgs adds the "http://" scheme to addresses that lack one,
        # so the page address is passed without it.
        page_url = site + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)  # collect the image addresses
        try:
            save_imgs(img_addrs, page_num, folder)  # store the images
        finally:
            # Always return to the download root, even if saving failed.
            os.chdir(folder_top)
if __name__ == '__main__':
    # Script entry point: ask for the target folder and page count, then
    # start the download.
    folder = input("Please enter a folder(default is 'ooxx'): ").strip()
    pages = input("How many pages do you wan to download(default is 10): ").strip()
    # Pass only the values the user actually typed, so an empty answer falls
    # back to download_mm's own defaults instead of crashing on int('') or
    # on creating a directory with an empty name.
    options = {}
    if folder:
        options['folder'] = folder
    if pages:
        options['pages'] = int(pages)
    download_mm(**options)
    # "pause" is a Windows shell built-in; skip it on other platforms.
    if os.name == 'nt':
        os.system("pause")
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/casterfir/learngit.git
git@gitee.com:casterfir/learngit.git
casterfir
learngit
learngit
master

搜索帮助