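# NOTE: the next three comment lines are banner text copied from the Gitee web page; they are not part of the program.
# "Code pull complete; the page will refresh automatically."
# "This sync will force-synchronise from 邱海/Python爬虫小说下载阅读; it overwrites every change made since the repository was forked and cannot be undone!"
# "Once confirmed, the sync runs in the background and the page refreshes when it finishes; please wait."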
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests
import sys
import re
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5 import QtGui, QtCore, QtWidgets
import threading
import time
import math
import pandas as pd
import numpy as np
from lxml import etree
import urllib
from fake_useragent import UserAgent
import webbrowser
import copy
import os
import shutil
import mainUI as mu
import readUI as ru
import settingsUI
class myToolButton(QPushButton):
    """Push button that re-emits its click as a signal carrying a URL string.

    Attributes:
        buttonClicked: signal(str) emitted with ``txtUrl`` on every click.
        txtUrl: the URL string handed to the constructor.
    """

    buttonClicked = pyqtSignal(str)

    def __init__(self, buttonText, txtUrl):
        super().__init__()
        self.txtUrl = txtUrl
        self.setText(buttonText)
        # The URL is read at click time, so the value emitted is whatever
        # self.txtUrl holds when the button is pressed.
        self.clicked.connect(lambda: self.buttonClicked.emit(self.txtUrl))
class webBrowserButton(QPushButton):
    """Push button labelled with a URL; clicking passes it to webbrowser.open."""

    def __init__(self, urlText):
        super().__init__()
        self.setText(urlText)
        self.urlText = urlText
        self.clicked.connect(lambda: webbrowser.open(self.urlText))
class downloadProgressBar(QProgressBar):
    """Progress bar with a fixed custom style sheet."""

    def __init__(self):
        super().__init__()
        # Rules for the bar itself and for the filled chunk; joined below
        # into the single style-sheet string Qt expects.
        barRules = (
            "border: 2px solid grey;",
            "border-radius: 5px;",
            "text-align: right;",
            "border-radius: 5px;",
            "background-color: #FFFFFF;",
        )
        chunkRules = (
            "border-radius: 5px;",
            "background-color: #007FFF;",
            "width: 15px;",
            "margin: 0.5px",
        )
        barSheet = "QProgressBar{" + "".join(barRules) + "}"
        chunkSheet = "QProgressBar::chunk{" + "".join(chunkRules) + "}"
        self.setStyleSheet(barSheet + chunkSheet)
class returnThread(threading.Thread):
    """Thread that keeps the return value of the function it runs.

    Args:
        func: callable executed in the new thread.
        args: positional arguments passed to ``func``.
    """

    def __init__(self, func, args=()):
        super().__init__()
        self.func = func
        self.args = args
        # Defined up front so getResult() is well-defined before run() has
        # finished (or when func raised), without a blanket except clause.
        self.result = None

    def run(self):
        """Call ``func(*args)`` and store what it returns."""
        self.result = self.func(*self.args)

    def getResult(self):
        """Return the stored result, or None if none has been produced yet."""
        return self.result
class downloader(QObject):
"""
Class description: novel downloader.

The signals declared below are connected to same-named slots of this
class in __init__.
"""
flushAllActiveUrlSig = pyqtSignal(str) # signal: refresh the page link list (carries a str)
flushDownloadSig = pyqtSignal() # signal: refresh the download list
showSearchingGIFSig = pyqtSignal() # signal: show the loading gif
hideSearchingGIFSig = pyqtSignal() # signal: hide the loading gif
txtDownloadExitSig = pyqtSignal() # signal: a novel download has finished
def __init__(self, mainUI, setUI, readUI):
    """Set default state, wire the signals to their slots and load settings.

    Args:
        mainUI: UI controller of the main window.
        setUI: UI controller of the settings window.
        readUI: UI controller of the reading window.
    """
    super(downloader, self).__init__()
    self.mainThreadID = threading.currentThread().ident
    self.isRunning = False  # running state; readSettings() turns it on

    # UI controllers
    self.mainUI = mainUI  # main window
    self.readUI = readUI  # reading window
    self.setUI = setUI  # settings window

    # Defaults; values from the configuration file take precedence
    self.curSavePath = os.getcwd()  # current download save directory
    self.maxDownloadNum = 5  # max number of novels downloading at once
    # download threads opened per novel; 0 or 1 means no sub-threads
    self.threadNum = 1

    # Status codes stored in thread2Status
    self.waitForDownload = 0  # waiting to download
    self.willDownload = 1  # keep running
    self.stopDownload = 3  # stop downloading
    self.willStop = 4  # download finished
    self.changeDownloadSpeed = 5  # download speed (thread count) changed
    self.url2ThreadID = {}  # novel url -> id of the thread downloading it
    self.thread2Status = {}  # thread id -> one of the status codes above

    # Request pacing
    self.waitBatch = 10  # interval between page requests of one novel (s)
    self.retryBatch = 30  # interval between retries after a failure (s)
    self.reGetTimes = 999  # number of retries after a failure
    self.reqTimeout = 20  # request timeout (s)

    # Browsing state
    self.mainUrl = ''  # home-page address of the site
    self.curUrl = ''  # address of the page currently shown
    self.url2Name = {}  # url -> display name of the link
    # url -> main-site index; seeded with the indexes set in mainUI.mainBox
    self.url2MainUrlIndex = {
        'http://wap.zzs5.com/': 0,
        'https://www.bqktxt.com/': 1,
        'https://www.biqukan.com/': 1,
    }
    self.allActiveTxtUrl = {}  # page url -> list of usable novel links
    self.allActiveMenuUrl = {}  # page url -> list of usable menu links
    self.pageMenuUrl = {}  # page url -> in-page (paging etc.) menu links
    self.chapterUrl = {}  # novel url -> list of its chapter links

    # Download bookkeeping
    self.downloadUrlList = []  # every novel link that needs downloading
    # novels being downloaded; size is bounded by maxDownloadNum
    self.downloadingUrlList = []
    self.waitForDownloadUrlList = []  # novels waiting for a download slot
    self.downloadedUrlList = []  # novels whose download has completed
    self.downloadedChapterUrl = {}  # novel url -> chapter links already saved
    self.url2downloadProcess = {}  # novel url -> progress widget in use
    self.downloadListNum = 0  # number of novels in the download list
    self.savePathChanging = False  # True while the save directory is moved

    self.ua = UserAgent()  # random user-agent source

    # Links whose name contains any of these entries are ignored
    self.ignoreNameList = [
        '更多', '电脑版', '注册', '登录', '书架', '繁体',
        '资讯', '联系我们', '加入我们', '帮助', '举报',
        '建议', '漏洞', '沪', '文学', 'txt', 'TXT', '人物',
        '最新', '电子书',
    ]

    for signal, slot in (
        (self.flushAllActiveUrlSig, self.showAllActiveUrl),
        (self.flushDownloadSig, self.showDownloadTxt),
        (self.showSearchingGIFSig, self.showSearchingGIF),
        (self.hideSearchingGIFSig, self.hideSearchingGIF),
        (self.txtDownloadExitSig, self.wakeOneWaitedTxt),
    ):
        signal.connect(slot)

    # Load the existing download configuration (must run last)
    self.readSettings()
def readSettings(self):
    """Load persisted settings and download records from ``<cwd>/settings``.

    Reads, when present:
      * ``settings.csv``   -> curSavePath, maxDownloadNum, threadNum
      * ``downloaded.csv`` -> downloadedUrlList and the novels' names
      * ``download.csv``   -> downloadUrlList, downloadListNum, names and
        main-site indexes, plus each novel's ``<name>.csv`` chapter record

    Empty CSV files are ignored. A chapter record whose txt file is gone, or
    which is itself empty, is deleted. ``isRunning`` is set to True at the
    end regardless of what was found.
    """
    settingsDir = os.path.join(os.getcwd(), 'settings')
    if os.path.exists(settingsDir):
        # Configuration file
        settingsCsv = os.path.join(settingsDir, 'settings.csv')
        if os.path.exists(settingsCsv):
            try:
                settingsData = pd.read_csv(settingsCsv)
                self.curSavePath = settingsData['saveTxtPath'][0]
                self.maxDownloadNum = settingsData['maxDownloadNum'][0]
                self.threadNum = settingsData['downloadSpeed'][0]
            except pd.errors.EmptyDataError:
                pass  # empty file: keep the defaults

        # List of completed downloads
        downloadedCsv = os.path.join(settingsDir, 'downloaded.csv')
        if os.path.exists(downloadedCsv):
            try:
                downloadedData = pd.read_csv(downloadedCsv)
                downloadedUrlList = np.array(downloadedData['txtUrl']).tolist()
                self.downloadedUrlList = copy.copy(downloadedUrlList)
                for i, txtUrl in enumerate(downloadedUrlList):
                    self.url2Name[txtUrl] = downloadedData['txtName'][i]
            except pd.errors.EmptyDataError:
                pass

        # List of pending downloads
        downloadCsv = os.path.join(settingsDir, 'download.csv')
        if os.path.exists(downloadCsv):
            try:
                pdDownloadData = pd.read_csv(downloadCsv)
                downloadUrlList = np.array(pdDownloadData['txtUrl']).tolist()
                self.downloadUrlList = copy.copy(downloadUrlList)
                # Count AFTER the list is assigned; counting first always
                # yielded the length of the previous (empty) list.
                self.downloadListNum = len(self.downloadUrlList)
                for i, txtUrl in enumerate(downloadUrlList):
                    txtName = pdDownloadData['txtName'][i]
                    self.url2Name[txtUrl] = txtName
                    self.url2MainUrlIndex[txtUrl] = pdDownloadData['mainUrlIndex'][i]
                    recordCsv = os.path.join(settingsDir, txtName + '.csv')
                    if not os.path.exists(recordCsv):
                        continue
                    txtPath = os.path.join(self.curSavePath, 'DownloadData', txtName + '.txt')
                    if not os.path.exists(txtPath):
                        # The txt file was removed: drop its chapter record
                        os.remove(recordCsv)
                        continue
                    try:
                        downloadedChapterUrlData = pd.read_csv(recordCsv)
                        self.downloadedChapterUrl[txtUrl] = copy.copy(
                            np.array(downloadedChapterUrlData[txtUrl]).tolist())
                        print('读取到下载记录', txtName, txtUrl)
                    except pd.errors.EmptyDataError:
                        # No record could be read: clean up the broken csv
                        os.remove(recordCsv)
            except pd.errors.EmptyDataError:
                pass
    # Whatever was found, loading is over and the downloader starts running
    self.isRunning = True
def recoverDownloadStatus(self):
'''
Resume downloading every novel from the download records.

For each url in a copy of downloadUrlList the main-site index is derived
from the host name found in the url (0 for wap.zzs5.com, 1 for
www.biqukan.com / www.bqktxt.com) and stored in url2MainUrlIndex;
downloadAllTxt() is then called.
'''
# NOTE(review): indentation was lost in this copy of the file. It is unclear
# whether downloadAllTxt() runs once after the loop or once per recognised
# url (the `else: continue` only has an effect in the latter case) - confirm
# against the upstream repository before restructuring.
downloadUrlList = copy.copy(self.downloadUrlList)
for i in range(len(downloadUrlList)):
txtUrl = downloadUrlList[i]
if 'wap.zzs5.com' in txtUrl:
self.url2MainUrlIndex[txtUrl] = 0
elif 'www.biqukan.com' in txtUrl or 'www.bqktxt.com' in txtUrl:
self.url2MainUrlIndex[txtUrl] = 1
else:
continue
self.downloadAllTxt()
def getOneUserAgent(self):
    """Build request headers with a randomly chosen User-Agent.

    Returns:
        dict with a 'User-Agent' taken from ``self.ua.random`` and
        'Connection' set to 'close'.
    """
    return {'User-Agent': str(self.ua.random), 'Connection': 'close'}
def getAllActiveUrl(self, mainUrl):
"""
Collect every usable link of the page at mainUrl and have it displayed.

When mainUrl has no entry in allActiveMenuUrl the page is requested, its
<a> tags are parsed and each link is filed under pageMenuUrl (paging
links), allActiveMenuUrl (menus) or allActiveTxtUrl (novels), with its
name recorded in url2Name. flushAllActiveUrlSig is emitted with mainUrl
so the page is redrawn.

NOTE(review): indentation was lost in this copy of the file, so the exact
nesting below (which `if` the `else:` before the last "Menu" comment
belongs to, and whether the final emit sits inside the first `if`) cannot
be confirmed from here - check the upstream repository before refactoring.
"""
if mainUrl not in self.allActiveMenuUrl:
self.showSearchingGIFSig.emit()
self.allActiveTxtUrl[mainUrl] = []
self.allActiveMenuUrl[mainUrl] = []
self.pageMenuUrl[mainUrl] = []
if mainUrl != self.curUrl:
# Check whether this page is still the one whose response is awaited
self.allActiveTxtUrl.pop(mainUrl)
self.allActiveMenuUrl.pop(mainUrl)
self.pageMenuUrl.pop(mainUrl)
return
try:
req = requests.get(url=mainUrl, headers=self.getOneUserAgent())
except requests.exceptions.ConnectionError as rec:
# Connection was closed: trigger the page-refresh slot (which shows a return-home button) and exit
self.flushAllActiveUrlSig.emit(mainUrl)
return
req.encoding = req.apparent_encoding
a = BeautifulSoup(req.text, features="lxml").find_all('a')
if not a:
# Retry on failure
for i in range(self.reGetTimes):
if not a:
time.sleep(self.retryBatch)
if mainUrl != self.curUrl:
# Check whether this page is still the one whose response is awaited
self.allActiveTxtUrl.pop(mainUrl)
self.allActiveMenuUrl.pop(mainUrl)
self.pageMenuUrl.pop(mainUrl)
return
# NOTE(review): url2Name[mainUrl] raises KeyError if no name was ever recorded for this page - verify callers
print('请求解析页面', self.url2MainUrlIndex[mainUrl], mainUrl, self.url2Name[mainUrl], '失败,正在重试')
try:
req = requests.get(url=mainUrl, headers=self.getOneUserAgent())
except requests.exceptions.ConnectionError as rec:
# Connection was closed: trigger the page-refresh slot (which shows a return-home button) and exit
self.flushAllActiveUrlSig.emit(mainUrl)
return
req.encoding = req.apparent_encoding
a = BeautifulSoup(req.text, 'lxml').find_all('a')
else:
print('重新请求页面', self.url2Name[mainUrl], '成功')
break
for each in a:
if None != each.string and None != each.get('href'):
eachUrl = each.get('href')
eachName = each.string
# NOTE(review): eachUrl[0] raises IndexError for an empty href ("") - confirm whether that can occur
if '/' == eachUrl[0]:
# Site-internal links need the main-site url prepended to eachUrl
usefulUrl = self.mainUrl + eachUrl
else:
usefulUrl = eachUrl
# Decide whether this link should be ignored
willContinue = False
if '.php' in eachUrl or 'javascript:' in eachUrl:
willContinue = True
elif '第' == eachName[0]:
for i in range(len(eachName)):
if '章' == eachName[i] or ' ' == eachName[i]:
willContinue = True
break
else:
for i in range(len(self.ignoreNameList)):
if self.ignoreNameList[i] in eachName:
willContinue = True
break
if willContinue:
continue
# Add to the index dictionary
self.url2MainUrlIndex[usefulUrl] = self.url2MainUrlIndex[mainUrl]
# Decide whether the link is a menu or a novel
if eachName.isdigit() or '>' == eachName or '<' == eachName or '上一页' == eachName or '下一页' == eachName:
# Paging menu
self.pageMenuUrl[mainUrl].append(usefulUrl)
self.url2Name[usefulUrl] = eachName
continue
urlParse = urllib.parse.urlparse(usefulUrl)
if urlParse.path and '/' != urlParse.path and '//' != urlParse.path:
urlSplit = eachUrl.split('/')
if urlSplit[-1]:
if not urlSplit[-1].split('.')[-1]:
for i in range(len(urlSplit[-1])):
if not urlSplit[-1][i].isdigit() and '_' != urlSplit[-1][i]:
# Menu
self.allActiveMenuUrl[mainUrl].append(usefulUrl)
self.url2Name[usefulUrl] = eachName
break
elif urlSplit[-2]:
for i in range(len(urlSplit[-2])):
if not urlSplit[-2][i].isdigit() and '_' != urlSplit[-2][i]:
# Menu
self.allActiveMenuUrl[mainUrl].append(usefulUrl)
self.url2Name[usefulUrl] = eachName
break
else:
# Menu
self.allActiveMenuUrl[mainUrl].append(usefulUrl)
self.url2Name[usefulUrl] = eachName
continue
if usefulUrl not in self.allActiveMenuUrl[mainUrl]:
# Novel
self.allActiveTxtUrl[mainUrl].append(usefulUrl)
self.url2Name[usefulUrl] = eachName
# Trigger the page-refresh slot
self.flushAllActiveUrlSig.emit(mainUrl)
def addShowMenu(self, mainUrl):
    """Add the menu buttons of ``mainUrl`` and the novel-table header row.

    Menu buttons are laid out five per row starting at column 1. When the
    page has no menu link at all, a single '首页' (home) button pointing at
    the site selected in mainBox is shown instead. The header labels go on
    the row right after the last full menu row.
    """
    grid = self.mainUI.showScrollLayout
    menuUrls = self.allActiveMenuUrl[mainUrl]
    for position, menuUrl in enumerate(menuUrls):
        row, column = divmod(position, 5)
        menuBtn = myToolButton(self.url2Name[menuUrl], menuUrl)
        menuBtn.setIcon(QIcon('images/webBrowser.png'))
        menuBtn.buttonClicked.connect(self.enterMenu)
        grid.addWidget(menuBtn, row, column + 1)

    # The combo-box text carries the site url between parentheses
    curIndexMainUrl = re.findall(re.compile(r'[(](.*?)[)]'), self.mainUI.mainBox.currentText())[0]
    if not menuUrls:
        # Always offer at least a return-home button
        homeBtn = myToolButton('首页', curIndexMainUrl)
        homeBtn.setIcon(QIcon('images/webBrowser.png'))
        homeBtn.buttonClicked.connect(self.enterMenu)
        grid.addWidget(homeBtn, 0, 1)

    headerRow = len(menuUrls) // 5 + 1
    for column, caption in enumerate(('序号', '小说名', '下载'), start=1):
        grid.addWidget(QLabel(caption), headerRow, column)
    # The last header spans two columns
    grid.addWidget(QLabel('转到浏览器查看详细信息'), headerRow, 4, 1, 2)
def addShowTxt(self, mainUrl):
    """Add one table row per novel link found on ``mainUrl``.

    Each row holds the serial number, the novel name, a download button and
    a button opening the novel's url in the browser. Rows start two rows
    below the last full menu row.
    """
    grid = self.mainUI.showScrollLayout
    for serial, txtUrl in enumerate(self.allActiveTxtUrl[mainUrl]):
        numberLabel = QLabel(str(serial + 1))
        numberLabel.setMaximumWidth(100)
        numberLabel.setWordWrap(True)

        nameLabel = QLabel(self.url2Name[txtUrl])
        nameLabel.setMinimumWidth(200)
        nameLabel.setMaximumWidth(400)
        nameLabel.setWordWrap(True)

        downloadBtn = myToolButton('下载', txtUrl)
        downloadBtn.setIcon(QIcon('images/download.png'))
        downloadBtn.buttonClicked.connect(self.addDownloadTxtThread)

        browserBtn = webBrowserButton(txtUrl)
        browserBtn.setIcon(QIcon('images/webBrowser.png'))

        row = int(len(self.allActiveMenuUrl[mainUrl]) / 5) + 2 + serial
        grid.addWidget(numberLabel, row, 1)
        grid.addWidget(nameLabel, row, 2)
        grid.addWidget(downloadBtn, row, 3)
        grid.addWidget(browserBtn, row, 4, 1, 2)
def addShowPageMenu(self, mainUrl):
    """Add the in-page (paging) menu buttons of ``mainUrl`` below the novels.

    Buttons are laid out five per row, starting at column 1, on the rows
    following the menu rows and the novel rows (plus a fixed offset of 5).

    Fixes over the previous version:
      * buttons on the last, partially filled row were labelled with the
        name of the page itself (``url2Name[mainUrl]``) instead of the name
        of the link they open (``url2Name[menuUrl]``);
      * that last row is now placed directly after the full rows instead of
        at an offset multiplied by five.
    """
    grid = self.mainUI.showScrollLayout
    firstRow = len(self.allActiveMenuUrl[mainUrl]) // 5 + len(self.allActiveTxtUrl[mainUrl]) + 5
    for position, menuUrl in enumerate(self.pageMenuUrl[mainUrl]):
        rowOffset, column = divmod(position, 5)
        itemBtn = myToolButton(self.url2Name[menuUrl], menuUrl)
        itemBtn.setIcon(QIcon('images/webBrowser.png'))
        itemBtn.buttonClicked.connect(self.enterMenu)
        grid.addWidget(itemBtn, firstRow + rowOffset, column + 1)
def showSearchingGIF(self):
    """Show the loading gif; does nothing unless called on the main thread."""
    if threading.currentThread().ident != self.mainThreadID:
        return
    searchingGIF = self.mainUI.searchingGIF
    if searchingGIF.isHidden():
        searchingGIF.show()
def hideSearchingGIF(self):
    """Hide the loading gif; does nothing unless called on the main thread."""
    if threading.currentThread().ident != self.mainThreadID:
        return
    searchingGIF = self.mainUI.searchingGIF
    if not searchingGIF.isHidden():
        searchingGIF.hide()
def showAllActiveUrl(self, mainUrl):
    """Rebuild the scroll area so it shows every link collected for mainUrl.

    Only acts on the main thread; on any other thread it just prints a
    diagnostic. In both cases hideSearchingGIFSig is emitted afterwards.
    """
    onMainThread = threading.currentThread().ident == self.mainThreadID
    if not onMainThread:
        print('小说列表刷新信号接收线程不是主线程,当前线程id', threading.currentThread().ident, ',主线程id', self.mainThreadID)
    else:
        # Start over with a fresh grid layout
        self.clearLayout(self.mainUI.showScrollLayout)
        freshLayout = QGridLayout()
        self.mainUI.showScrollLayout = freshLayout
        freshLayout.setHorizontalSpacing(30)
        freshLayout.setVerticalSpacing(20)
        # Menu entries, then the novel links, then the in-page (paging) menu
        self.addShowMenu(mainUrl)
        self.addShowTxt(mainUrl)
        self.addShowPageMenu(mainUrl)
        # Swap in a new container widget carrying the new layout
        self.mainUI.showScrollWidget.destroy()
        self.mainUI.showScrollWidget = QWidget()
        self.mainUI.showScrollWidget.setLayout(self.mainUI.showScrollLayout)
        self.mainUI.showScroll.setWidget(self.mainUI.showScrollWidget)
    self.hideSearchingGIFSig.emit()
def addShowTask(self, index, txtUrl):
    """Add one row describing the download task ``txtUrl`` to the list.

    Args:
        index: zero-based position; shown as ``index + 1`` and placed on
            grid row ``index + 1``.
        txtUrl: url of the novel the row stands for.
    """
    # Pick the status picture and the start/pause icon from the task state
    if txtUrl in self.downloadingUrlList:
        statusImage, toggleIcon = 'images/downloading.png', 'images/stop.png'
    elif txtUrl in self.waitForDownloadUrlList:
        statusImage, toggleIcon = 'images/waiting.png', 'images/start.png'
    else:
        statusImage, toggleIcon = 'images/stoped.png', 'images/start.png'

    numberLabel = QLabel(str(index + 1))

    statusLabel = QLabel('下载状态')
    statusLabel.setPixmap(QPixmap(statusImage))
    statusLabel.setMaximumSize(30, 30)
    statusLabel.setMinimumSize(30, 30)
    statusLabel.setScaledContents(True)

    nameLabel = QLabel(self.url2Name[txtUrl])

    progressBar = downloadProgressBar()
    progressBar.setMinimumWidth(300)
    percent = 0
    if txtUrl in self.downloadedChapterUrl and txtUrl in self.chapterUrl:
        percent = int(len(self.downloadedChapterUrl[txtUrl]) / len(self.chapterUrl[txtUrl]) * 100)
    progressBar.setValue(percent)

    toggleBtn = myToolButton('开始/暂停', txtUrl)
    toggleBtn.setIcon(QIcon(toggleIcon))
    toggleBtn.buttonClicked.connect(self.startOrStopTxt)

    cancelBtn = myToolButton('取消下载', txtUrl)
    cancelBtn.setIcon(QIcon('images/cancel.png'))
    cancelBtn.buttonClicked.connect(self.cancelDownloadTxt)

    # Remember the bar so download code can update this task's progress
    self.url2downloadProcess[txtUrl] = progressBar

    rowWidgets = (numberLabel, statusLabel, nameLabel, progressBar, toggleBtn, cancelBtn)
    for column, widget in enumerate(rowWidgets):
        self.mainUI.downloadScrollLayout.addWidget(widget, index + 1, column)
def showDownloadTxt(self):
    """Rebuild the download list widget.

    Only acts on the main thread; on any other thread it prints a
    diagnostic and returns. Tasks are listed in three groups: novels being
    downloaded, novels waiting for a slot, then every other novel of
    downloadUrlList.

    Fix over the previous version: tasks of the third group were numbered
    with their position inside downloadUrlList added to the size of the
    first two groups, which skipped serial numbers (and grid rows) whenever
    active tasks were also part of downloadUrlList. Rows are now numbered
    consecutively.
    """
    if threading.currentThread().ident != self.mainThreadID:
        print('下载列表刷新信号接收线程不是主线程,当前线程id', threading.currentThread().ident, ',主线程id', self.mainThreadID)
        return

    # Work on snapshots: download threads may change the lists meanwhile
    downloadUrlList = copy.copy(self.downloadUrlList)
    downloadingUrlList = copy.copy(self.downloadingUrlList)
    waitForDownloadUrlList = copy.copy(self.waitForDownloadUrlList)

    self.clearLayout(self.mainUI.downloadScrollLayout)
    self.mainUI.downloadScrollLayout = QGridLayout()
    self.mainUI.downloadScrollLayout.setHorizontalSpacing(30)
    self.mainUI.downloadScrollLayout.setVerticalSpacing(20)
    for column, caption in enumerate(('序号', '小说名', '下载进度'), start=1):
        self.mainUI.downloadScrollLayout.addWidget(QLabel(caption), 0, column)

    # Downloading first, then waiting, then the rest (sorted appearance)
    idleUrlList = [
        txtUrl for txtUrl in downloadUrlList
        if txtUrl not in downloadingUrlList and txtUrl not in waitForDownloadUrlList
    ]
    orderedUrlList = downloadingUrlList + waitForDownloadUrlList + idleUrlList
    for index, txtUrl in enumerate(orderedUrlList):
        self.addShowTask(index, txtUrl)

    # The old container cannot change size: replace it with a new widget
    # so the scroll area adapts to the new height and width
    self.mainUI.downloadScrollWidget.destroy()
    self.mainUI.downloadScrollWidget = QWidget()
    self.mainUI.downloadScrollWidget.setLayout(self.mainUI.downloadScrollLayout)
    self.mainUI.downloadScroll.setWidget(self.mainUI.downloadScrollWidget)
def writer(self, chapterName, fileName, chapterText):
    """Append one chapter to ``<curSavePath>/DownloadData/<fileName>``.

    Args:
        chapterName: title written on its own line before the text.
        fileName: name of the target file inside the DownloadData folder.
        chapterText: chapter body; nothing is written for it when empty
            or None.

    The target folder is created with ``os.makedirs(..., exist_ok=True)``,
    so two threads creating it at the same time, or a missing parent
    directory, no longer raise.
    """
    saveDir = os.path.join(self.curSavePath, 'DownloadData')
    os.makedirs(saveDir, exist_ok=True)
    with open(os.path.join(saveDir, fileName), 'a', encoding='utf-8') as f:
        f.write('\n' + chapterName + '\n')
        if chapterText:
            f.writelines(chapterText)
        f.write('\n\n\n')
def getAllChapterUrl(self, txtUrl):
    """Collect the chapter links of a novel, then start downloading it.

    Called when a novel is added to the download list. The chapter list is
    scraped with site-specific rules chosen by the novel's main-site index
    (1: biqukan, 0: zzs5). On success chapterUrl, url2Name and
    url2MainUrlIndex are filled and downloadTxtControl(txtUrl) is called.
    On any failure the novel is removed from the download list.

    Args:
        txtUrl: url of the novel.
    """
    # Each site has its own page format and needs its own handling
    mainUrlIndex = self.url2MainUrlIndex[txtUrl]
    mainUrl = re.findall(re.compile(r'[(](.*?)[)]'), self.mainUI.mainBox.itemText(mainUrlIndex))[0]
    txtName = self.url2Name[txtUrl]
    if 1 == mainUrlIndex:
        chapters = self._listBiqukanChapters(txtUrl, txtName, mainUrlIndex)
    elif 0 == mainUrlIndex:
        chapters = self._listZzs5Chapters(txtUrl, txtName, mainUrlIndex, mainUrl)
    else:
        # Sites with an unsupported format cannot be handled
        self._dropDownloadTask(txtUrl, '网站', txtUrl, '格式不支持,请求失败,删除任务', txtName)
        return
    if chapters is None:
        # The connection was closed: drop the task without a message
        self._dropDownloadTask(txtUrl)
        return
    if not chapters:
        self._dropDownloadTask(txtUrl, '请求失败,删除任务', txtName)
        return
    print('小说', txtName, '解析所有章节成功')
    self.chapterUrl[txtUrl] = []
    for href, chapterName in chapters:
        oneChapterUrl = mainUrl + href
        self.chapterUrl[txtUrl].append(oneChapterUrl)
        self.url2Name[oneChapterUrl] = chapterName
        self.url2MainUrlIndex[oneChapterUrl] = mainUrlIndex
    # Hand over to the download control function
    self.downloadTxtControl(txtUrl)

def _dropDownloadTask(self, txtUrl, *message):
    """Remove txtUrl from the download list (if present) and refresh the list."""
    if txtUrl not in self.downloadUrlList:
        return
    if message:
        print(*message)
    self.downloadUrlList.remove(txtUrl)
    self.downloadListNum -= 1
    self.flushDownloadSig.emit()

def _fetchUntilFound(self, url, extract, retryMessage):
    """Request url until extract(page text) is truthy or retries run out.

    Returns (connected, found): connected is False when a ConnectionError
    occurred; found is the last value returned by extract.
    """
    found = None
    for attempt in range(self.reGetTimes + 1):
        if attempt:
            # Retry on failure, after a pause
            time.sleep(self.retryBatch)
            print(*retryMessage)
        try:
            req = requests.get(url=url, headers=self.getOneUserAgent())
        except requests.exceptions.ConnectionError:
            return False, None
        req.encoding = req.apparent_encoding
        found = extract(req.text)
        if found:
            break
    return True, found

def _listBiqukanChapters(self, txtUrl, txtName, mainUrlIndex):
    """Return [(href, name), ...] from a biqukan novel page; [] if parsing fails, None if the connection failed."""
    def findListing(text):
        return BeautifulSoup(text, features="lxml").find('div', class_='listmain')

    retryMessage = ('请求解析小说', mainUrlIndex, txtUrl, txtName, '所有章节失败,正在重试')
    connected, div = self._fetchUntilFound(txtUrl, findListing, retryMessage)
    if not connected:
        return None
    if not div:
        return []
    links = BeautifulSoup(str(div), features="lxml").find_all('a')
    # The first 13 links of the listing are skipped, as before
    # (presumably a "latest chapters" section - TODO confirm)
    return [(each.get('href'), each.string) for each in links[13:]]

def _listZzs5Chapters(self, txtUrl, txtName, mainUrlIndex, mainUrl):
    """Return [(href, name), ...] for a zzs5 novel; [] if parsing fails, None if the connection failed.

    On this site the novel link opens a detail page first; the chapter
    catalogue is reached through a link found on that page.
    """
    def findCatalogueLinks(text):
        return etree.HTML(text).xpath('/html/body/div[3]/div[7]/span/li/a/@href')

    def findChapters(text):
        tree = etree.HTML(text)
        hrefs = tree.xpath('/html/body/div[2]/div[6]/p/a/@href')
        names = tree.xpath('/html/body/div[2]/div[6]/p/a/text()')
        # Only the presence of links decides whether a retry is needed
        return (hrefs, names) if hrefs else None

    retryMessage = ('请求解析获取', mainUrlIndex, txtUrl, txtName, '详情页面失败,正在重试')
    connected, txtInfoUrls = self._fetchUntilFound(txtUrl, findCatalogueLinks, retryMessage)
    if not connected:
        return None
    if not txtInfoUrls:
        return []
    # The intermediate page is dealt with; now scrape the chapter links
    print('小说', txtName, '进入详细信息页面')
    txtInfoUrl = mainUrl + txtInfoUrls[0]
    retryMessage = ('请求解析小说', mainUrlIndex, txtUrl, txtName, '所有章节失败,正在重试')
    connected, found = self._fetchUntilFound(txtInfoUrl, findChapters, retryMessage)
    if not connected:
        return None
    if not found:
        return []
    hrefs, names = found
    # zip() stops at the shorter list, so a missing name cannot raise
    return list(zip(hrefs, names))
def getOneContent(self, contentUrl, mainUrlIndex):
    """Download the text of one chapter.

    Args:
        contentUrl: url of the chapter (its first page on zzs5).
        mainUrlIndex: main-site index selecting the scraping rules
            (1: biqukan, 0: zzs5, where a chapter spans three pages).

    Returns:
        The chapter text. '' when the first request hits a ConnectionError,
        when the zzs5 first page has no content, or for an unknown site
        index; '\\n' when a biqukan page has no content after all retries.
        On zzs5, a failure on page two or three returns what was collected
        so far.
    """
    chapterName = self.url2Name.get(contentUrl, contentUrl)
    # Each site has its own page format and needs its own handling
    if 1 == mainUrlIndex:
        nodes = self._fetchContentNodes(
            contentUrl,
            lambda soup: soup.find_all('div', class_='showtxt'),
            ('请求获取章节', mainUrlIndex, contentUrl, chapterName, '内容失败,正在重试'),
            ('重新获取章节', chapterName, '内容成功'))
        if nodes is None:
            return ''
        if nodes:
            # Runs of eight '\xa0' mark paragraph breaks: turn each into two newlines
            return nodes[0].text.replace('\xa0'*8, '\n\n') + '\n'
        return '\n'
    elif 0 == mainUrlIndex:
        pages = (
            (contentUrl, '第一页'),
            (self._chapterPageUrl(contentUrl, 2), '第二页'),
            (self._chapterPageUrl(contentUrl, 3), '第三页'),
        )
        contentStr = ''
        for pageNo, (pageUrl, pageLabel) in enumerate(pages):
            if pageNo:
                time.sleep(self.waitBatch + 1)
            nodes = self._fetchContentNodes(
                pageUrl,
                lambda soup: soup.find_all(id='nr'),
                ('请求获取章节', mainUrlIndex, pageUrl, chapterName, pageLabel + '内容失败,正在重试'),
                ('重新获取章节', mainUrlIndex, pageUrl, chapterName, pageLabel + '内容成功'))
            if nodes is None:
                # Connection closed: return what has been collected so far
                return contentStr
            if nodes:
                contentStr += nodes[0].text.replace('\xa0'*8, '\n\n') + '\n'
            elif not pageNo:
                # Without a first page there is no chapter
                return ''
        return contentStr
    else:
        return ''

def _chapterPageUrl(self, contentUrl, pageNo):
    """Return contentUrl with '_<pageNo>' appended to the file stem of its path.

    Only the last path segment is changed; the earlier str.replace() also
    rewrote any other occurrence of the stem inside the url.
    """
    parts = urllib.parse.urlparse(contentUrl)
    folder, slash, fileName = parts.path.rpartition('/')
    stem, dot, extension = fileName.rpartition('.')
    if not dot:
        # No extension: the whole segment is the stem
        stem, extension = fileName, ''
    pagedPath = folder + slash + stem + '_' + str(pageNo) + dot + extension
    return urllib.parse.urlunparse(parts._replace(path=pagedPath))

def _fetchContentNodes(self, url, select, retryMessage, successMessage):
    """Request url until select(soup) finds nodes or retries run out.

    Returns the (possibly empty) node list, or None on a ConnectionError.
    """
    nodes = []
    for attempt in range(self.reGetTimes + 1):
        if attempt:
            # Retry on failure, after a pause
            time.sleep(self.retryBatch)
            print(*retryMessage)
        try:
            req = requests.get(url=url, headers=self.getOneUserAgent())
        except requests.exceptions.ConnectionError:
            return None
        req.encoding = req.apparent_encoding
        nodes = select(BeautifulSoup(req.text, 'lxml'))
        if nodes:
            if attempt:
                print(*successMessage)
            break
    return nodes
def downloadTxtControl(self, txtUrl):
    """Drive the download of one novel; normally runs in a worker thread.

    Registers the current thread as the downloader of ``txtUrl`` and loops
    while the downloader is running: downloadTxt() is called while the
    thread's status asks for downloading, and any other status releases
    the slot, updates the bookkeeping lists, emits the refresh/exit
    signals and ends the thread.
    """
    curThreadID = threading.currentThread().ident
    txtName = self.url2Name[txtUrl]
    mainUrlIndex = self.url2MainUrlIndex[txtUrl]
    if txtUrl in self.url2ThreadID:
        print('出错了,正在下载中小说', txtName, txtUrl, '旧线程尚未退出,新线程已退出')
        return
    self.url2ThreadID[txtUrl] = curThreadID
    self.thread2Status[curThreadID] = self.willDownload

    while self.isRunning:
        curStatus = self.thread2Status[curThreadID]
        if curStatus == self.willDownload:
            # Download state: start downloading
            print('子线程启动下载小说函数', txtName, txtUrl)
            self.downloadTxt(txtUrl)
            continue
        if curStatus == self.changeDownloadSpeed:
            # Speed changed: go back to the download state and restart
            print('子线程重新启动下载小说函数', txtName, txtUrl)
            self.thread2Status[curThreadID] = self.willDownload
            self.downloadTxt(txtUrl)
            continue

        # Every other status ends the thread: release the slot first
        self.url2ThreadID.pop(txtUrl, None)
        self.thread2Status.pop(curThreadID, None)
        if txtUrl in self.downloadingUrlList:
            self.downloadingUrlList.remove(txtUrl)

        if curStatus == self.waitForDownload:
            # Slot taken over by another novel: queue this one for later
            if txtUrl not in self.waitForDownloadUrlList:
                self.waitForDownloadUrlList.append(txtUrl)
            farewell = '暂停下载小说'
        elif curStatus == self.stopDownload:
            # Download cancelled
            farewell = '取消下载小说'
        elif curStatus == self.willStop:
            # Download complete: move the novel to the finished list
            if txtUrl not in self.downloadedUrlList:
                self.downloadedUrlList.append(txtUrl)
            if txtUrl in self.downloadUrlList:
                self.downloadUrlList.remove(txtUrl)
                self.downloadListNum -= 1
            farewell = '完成下载小说'
        else:
            # Unknown status code
            farewell = '状态码错误,退出下载小说'

        self.flushDownloadSig.emit()
        self.txtDownloadExitSig.emit()
        print(farewell, txtName, txtUrl)
        return
def downloadTxt(self, txtUrl):
'''
下载小说函数,一般在子线程内调用
'''
lastStatus = self.thread2Status[threading.currentThread().ident]
txtName = self.url2Name[txtUrl]
mainUrlIndex = self.url2MainUrlIndex[txtUrl]
if txtUrl not in self.downloadedChapterUrl:
self.downloadedChapterUrl[txtUrl] = []
if os.path.exists(os.path.join(self.curSavePath, 'DownloadData', txtName + '.txt')) and os.path.exists(os.path.join(os.getcwd(), 'settings', txtName + '.csv')):
# 继续下载以前下载过的小说,可能是已下载完成、曾取消下载(包括闪退丢失下载信息)
# 或是小说在连载中,下载最新章节
try:
pdDownloadedChapterData = pd.read_csv(os.path.join(os.getcwd(), 'settings', txtName + '.csv'))
self.downloadedChapterUrl[txtUrl] = copy.copy(np.array(pdDownloadedChapterData[txtUrl]).tolist())
except pd.errors.EmptyDataError as pde:
# 未读取到下载记录,清理txt文件
if os.path.exists(os.path.join(self.curSavePath, 'DownloadData', txtName + '.txt')):
os.remove(os.path.join(self.curSavePath, 'DownloadData', txtName + '.txt'))
pass
if len(self.downloadedChapterUrl[txtUrl]):
# 恢复下载
startIndex = len(self.downloadedChapterUrl[txtUrl])
if len(self.chapterUrl[txtUrl]) <= startIndex + 1:
# 已下载完所有章节
self.url2downloadProcess[txtUrl].setValue(100)
print('小说', txtName, txtUrl, '所有章节已下载完成')
self.thread2Status[threading.currentThread().ident] = self.willStop
return
if txtUrl in self.downloadedChapterUrl and txtUrl in self.chapterUrl:
print('子线程开始恢复下载小说', txtName, ',已下载章节数量:', len(self.downloadedChapterUrl[txtUrl]) , ',总章节数量:', len(self.chapterUrl[txtUrl]))
self.url2downloadProcess[txtUrl].setValue(int(len(self.downloadedChapterUrl[txtUrl]) / len(self.chapterUrl[txtUrl]) * 100))
else:
print('开始开始下载小说', txtName, txtUrl)
else:
# 新增下载
startIndex = 0
print('子线程开始下载小说', txtName, txtUrl)
if os.path.exists(os.path.join(self.curSavePath, 'DownloadData', txtName + '.txt')):
os.remove(os.path.join(self.curSavePath, 'DownloadData', txtName + '.txt'))
if 1 < self.threadNum:
batchNum = math.floor((len(self.chapterUrl[txtUrl]) - startIndex) / self.threadNum)
lastBatchNum = (len(self.chapterUrl[txtUrl]) - startIndex) % self.threadNum
else:
batchNum = 0
lastBatchNum = len(self.chapterUrl[txtUrl]) - startIndex
for i in range(batchNum):
if txtUrl not in self.downloadingUrlList:
# 已不占有下载槽位,该线程应当即刻退出
self.thread2Status[threading.currentThread().ident] = self.stopDownload
return
elif lastStatus != self.thread2Status[threading.currentThread().ident]:
#下载状态改变,即刻返回处理
return
# 分批次执行下载线程
url2Thread = dict()
for j in range(self.threadNum):
# 一批次线程间隔一定时间执行
curIndex = startIndex + i*self.threadNum + j
oneChapterUrl = self.chapterUrl[txtUrl][curIndex]
oneThread = returnThread(self.getOneContent, args=(oneChapterUrl, mainUrlIndex))
url2Thread[oneChapterUrl] = oneThread
oneThread.start()
time.sleep(self.waitBatch)
for key,value in url2Thread.items():
# 等待该批次线程结束
#TODO 不用join,而采用列表记录状态
value.join()
for j in range(self.threadNum):
# 取出该批次爬取的章节内容
curIndex = startIndex + i*self.threadNum + j
oneChapterUrl = self.chapterUrl[txtUrl][curIndex]
oneChapterName = self.url2Name[oneChapterUrl]
if oneChapterUrl not in self.downloadedChapterUrl[txtUrl]:
oneChapterContent = url2Thread[oneChapterUrl].getResult()
while(self.savePathChanging):
# 等待下载目录转移完毕
time.sleep(self.waitBatch)
# 先写入txt文件再添加记录,在恢复下载时宁可重复写入章节也不丢失章节
self.writer(oneChapterName, f'{txtName}.txt', oneChapterContent)
self.downloadedChapterUrl[txtUrl].append(oneChapterUrl)
self.url2downloadProcess[txtUrl].setValue(int(len(self.downloadedChapterUrl[txtUrl]) / len(self.chapterUrl[txtUrl]) * 100))
print(txtName + "下载进度:%.3f%%" % float(((i + 1)*self.threadNum - 1) / len(self.chapterUrl[txtUrl])))
for i in range(lastBatchNum):
if txtUrl not in self.downloadingUrlList:
# 已不占有下载槽位,该线程应当即刻退出
self.thread2Status[threading.currentThread().ident] = self.stopDownload
return
elif lastStatus != self.thread2Status[threading.currentThread().ident]:
#下载状态改变,即刻返回处理
return
# 不分批次或最后不足一批次章节间隔一定时间爬取
time.sleep(self.waitBatch)
curIndex = startIndex + batchNum*self.threadNum + i
oneChapterUrl = self.chapterUrl[txtUrl][curIndex]
oneChapterName = self.url2Name[oneChapterUrl]
if oneChapterName not in self.downloadedChapterUrl[txtUrl]:
oneChapterContent = self.getOneContent(self.chapterUrl[txtUrl][curIndex], mainUrlIndex)
while(self.savePathChanging):
# 等待下载目录转移完毕
time.sleep(self.waitBatch)
# 先写入txt文件再添加记录,在恢复下载时宁可重复写入章节也不丢失章节
self.writer(oneChapterName, f'{txtName}.txt', oneChapterContent)
self.downloadedChapterUrl[txtUrl].append(oneChapterUrl)
self.url2downloadProcess[txtUrl].setValue(int(len(self.downloadedChapterUrl[txtUrl]) / len(self.chapterUrl[txtUrl]) * 100))
print(txtName + "下载进度:%.3f%%" % float(curIndex / len(self.chapterUrl[txtUrl])))
self.url2downloadProcess[txtUrl].setValue(100)
print('小说', txtName, txtUrl, '所有章节下载完成')
self.thread2Status[threading.currentThread().ident] = self.willStop
def clearLayout(self, layout):
    '''
    Remove every item from ``layout`` and schedule the widgets they hold
    for deletion.

    Items are removed from the last index down to the first so that
    removing one item never shifts the index of an item still to be
    visited.
    '''
    for index in reversed(range(layout.count())):
        entry = layout.itemAt(index)
        layout.removeItem(entry)
        heldWidget = entry.widget()
        if heldWidget:
            # The item wraps a widget: let Qt delete it later.
            heldWidget.deleteLater()
    # NOTE(review): this assigns an attribute named ``activate`` on the
    # layout object rather than calling ``layout.activate()``; kept as in
    # the original - confirm the intent.
    layout.activate = False
def saveDownloadStatus(self):
    '''
    Periodically persist the downloader state as CSV files under
    ``<current working directory>/settings``.

    The method first sleeps 3 seconds, then polls ``self.isRunning`` once
    per second (at most 20 times) waiting for the downloader to start.
    While ``self.isRunning`` stays true it repeats, every
    ``self.waitBatch`` seconds:

    * ``settings.csv``   - save path, max parallel downloads, thread count
    * ``download.csv``   - name / url / site index of every listed novel
    * ``<name>.csv``     - downloaded chapter urls of each novel that is
                           currently downloading
    * ``downloaded.csv`` - name / url of every finished novel

    Empty data sets are never written, so an existing record is not
    overwritten by an empty one.
    '''
    time.sleep(3)
    for _ in range(20):
        # Give the downloader time to finish loading its records.
        if self.isRunning:
            break
        print('下载器未完成记录读取,不保存记录')
        time.sleep(1)
    while self.isRunning:
        time.sleep(self.waitBatch)
        settingsDir = os.path.join(os.getcwd(), 'settings')
        # Every file below is written into <cwd>/settings, so that is the
        # directory that must exist. (The previous code created
        # <curSavePath>/settings instead, which broke the writes and raised
        # FileExistsError on the next pass when the two paths differed.)
        os.makedirs(settingsDir, exist_ok=True)

        # --- download configuration (always exactly one row) ---
        settingsDataDict = {
            'saveTxtPath': [self.curSavePath],
            'maxDownloadNum': [self.maxDownloadNum],
            'downloadSpeed': [self.threadNum],
        }
        pd.DataFrame(settingsDataDict).to_csv(
            os.path.join(settingsDir, 'settings.csv'), encoding='utf-8', index=False)

        # --- novels in the download list ---
        downloadUrlList = copy.copy(self.downloadUrlList)
        if downloadUrlList:
            downloadDataDict = {
                'txtName': [self.url2Name[txtUrl] for txtUrl in downloadUrlList],
                'txtUrl': downloadUrlList,
                'mainUrlIndex': [self.url2MainUrlIndex[txtUrl] for txtUrl in downloadUrlList],
            }
            pdDownloadData = pd.DataFrame(downloadDataDict)
            # Re-check the live list: skip the write if it was emptied
            # while the snapshot was being processed.
            if len(self.downloadUrlList):
                pdDownloadData.to_csv(
                    os.path.join(settingsDir, 'download.csv'), encoding='utf-8', index=False)

        # --- downloaded chapter urls, one file per downloading novel ---
        downloadedChapterUrlDict = copy.copy(self.downloadedChapterUrl)
        downloadingUrlList = copy.copy(self.downloadingUrlList)
        for txtUrl, chapterUrls in downloadedChapterUrlDict.items():
            if txtUrl not in downloadingUrlList:
                # Only novels that are currently downloading need updating.
                continue
            # Snapshot the list: the dict copy above is shallow, so the
            # list object itself is still shared with self.downloadedChapterUrl.
            chapterSnapshot = list(chapterUrls)
            pdDownloadedChapterData = pd.DataFrame({txtUrl: chapterSnapshot})
            if (len(self.downloadingUrlList)
                    and len(self.downloadUrlList) == self.downloadListNum
                    and txtUrl in self.downloadingUrlList
                    and len(chapterSnapshot)):
                pdDownloadedChapterData.to_csv(
                    os.path.join(settingsDir, self.url2Name[txtUrl] + '.csv'),
                    encoding='utf-8', index=False)
            elif len(self.downloadUrlList) != self.downloadListNum:
                # The cached counter drifted from the real list length:
                # report it and resynchronise.
                print('缓存下载进度失败,downloadListNum值记录错误,应为', self.downloadListNum, '实际为', len(self.downloadUrlList),',已纠正')
                self.downloadListNum = len(self.downloadUrlList)

        # --- finished novels ---
        downloadedUrlList = copy.copy(self.downloadedUrlList)
        if downloadedUrlList:
            downloadedDataDict = {
                'txtName': [self.url2Name[txtUrl] for txtUrl in downloadedUrlList],
                'txtUrl': downloadedUrlList,
            }
            pd.DataFrame(downloadedDataDict).to_csv(
                os.path.join(settingsDir, 'downloaded.csv'), encoding='utf-8', index=False)
def mainUrlChanged(self):
    '''
    Switch to the novel site currently selected in the main combo box.

    The site url is the first parenthesised part of the combo box text.
    The text is recorded as the display name of that url if none is
    recorded yet, then the site's menu page is opened.
    '''
    boxText = self.mainUI.mainBox.currentText()
    urlPattern = re.compile(r'[(](.*?)[)]')
    self.mainUrl = urlPattern.findall(boxText)[0]
    if self.mainUrl not in self.url2Name:
        self.url2Name[self.mainUrl] = boxText
    self.enterMenu(self.mainUrl)
def enterMenu(self, menuUrl):
    '''
    Open ``menuUrl`` as the current page.

    Records ``menuUrl`` in ``self.curUrl`` and runs
    ``self.getAllActiveUrl(menuUrl)`` on a background daemon thread so
    the caller is not blocked while the page is fetched.
    '''
    self.curUrl = menuUrl
    # daemon=True replaces the deprecated Thread.setDaemon() call.
    fetchThread = threading.Thread(
        target=self.getAllActiveUrl, args=(menuUrl, ), daemon=True)
    fetchThread.start()
def addDownloadTxtThread(self, txtUrl, isUrgency=False):
'''
Register the novel at txtUrl as a download task, then either start
downloading it on a new daemon thread (target: self.getAllChapterUrl)
or put it on the waiting list.

txtUrl: url of the novel.
isUrgency: when True the download should start immediately; if every
download slot is taken, the oldest running download is told to give
up its slot and is moved to the waiting list.

NOTE(review): indentation is missing in this copy of the file, so the
exact nesting of the branches below cannot be confirmed from here.
'''
# Compares the calling thread's ident with self.mainThreadID.
if threading.currentThread().ident == self.mainThreadID:
if txtUrl not in self.downloadUrlList:
# First time this url is seen: add it to the download list and bump the counter.
self.downloadUrlList.append(txtUrl)
self.downloadListNum += 1
if txtUrl not in self.downloadingUrlList:
if len(self.downloadingUrlList) < self.maxDownloadNum:
# A slot is free: start a download thread that fetches the chapter links and downloads them.
self.downloadingUrlList.append(txtUrl)
if txtUrl in self.waitForDownloadUrlList:
self.waitForDownloadUrlList.remove(txtUrl)
self.showDownloadTxt()
downloadTxtThread = threading.Thread(target=self.getAllChapterUrl, args=(txtUrl, ))
downloadTxtThread.setDaemon(True)
downloadTxtThread.start()
elif isUrgency:
# Urgent: signal the oldest download thread to stop by itself and free its slot.
willStopedTxtUrl = self.downloadingUrlList.pop(0)
if willStopedTxtUrl in self.url2ThreadID:
self.thread2Status[self.url2ThreadID[willStopedTxtUrl]] = self.waitForDownload
if willStopedTxtUrl not in self.waitForDownloadUrlList:
self.waitForDownloadUrlList.append(willStopedTxtUrl)
# Start the download thread right away for the urgent novel.
self.downloadingUrlList.append(txtUrl)
if txtUrl in self.waitForDownloadUrlList:
self.waitForDownloadUrlList.remove(txtUrl)
self.showDownloadTxt()
downloadTxtThread = threading.Thread(target=self.getAllChapterUrl, args=(txtUrl, ))
downloadTxtThread.setDaemon(True)
downloadTxtThread.start()
elif txtUrl not in self.waitForDownloadUrlList:
# No free slot and not urgent: join the waiting queue.
self.waitForDownloadUrlList.append(txtUrl)
self.showDownloadTxt()
def searchTxt(self):
    '''
    Handle a click on the search button.

    Builds the search url of the site selected in the main combo box
    (index 0 or 1), records which site the url belongs to in
    ``self.url2MainUrlIndex`` and opens the result page. Any other combo
    box index is ignored.

    The query text is percent-encoded (UTF-8) before it is appended, so
    characters such as spaces, ``&`` or ``#`` cannot corrupt the query
    string.
    '''
    from urllib.parse import quote

    searchPrefixes = {
        1: 'https://so.biqusoso.com/s.php?ie=utf-8&siteid=qu-la.com&q=',
        0: 'http://wap.zzs5.com/index.php?m=search&c=index&a=init&typeid=2&siteid=1&q=',
    }
    siteIndex = self.mainUI.mainBox.currentIndex()
    if siteIndex not in searchPrefixes:
        return
    searchUrl = searchPrefixes[siteIndex] + quote(self.mainUI.searchText.text(), safe='')
    self.url2MainUrlIndex[searchUrl] = siteIndex
    self.enterMenu(searchUrl)
def chooseSavePath(self):
    '''
    Handle a click on the "choose directory" button.

    Opens a directory picker starting at the current working directory
    and, when a directory different from the current save path is
    picked, shows it in the settings path field.
    '''
    pickedDir = QtWidgets.QFileDialog.getExistingDirectory(None, '选择保存目录', os.getcwd())
    if not pickedDir or pickedDir == self.curSavePath:
        # Dialog cancelled, or the same directory chosen again.
        return
    self.setUI.pathText.setText(pickedDir)
def changeSetting(self):
    '''
    Apply the values entered in the settings window: save directory,
    download speed (thread count) and maximum number of parallel
    downloads.
    '''
    ui = self.setUI

    # Save directory: migrate when a new non-empty path was entered,
    # otherwise reset the field to the current path.
    requestedPath = ui.pathText.text()
    if not requestedPath or requestedPath == self.curSavePath:
        ui.pathText.setText(self.curSavePath)
    else:
        self.copyDownloadPath(requestedPath)

    # Download speed: store the new thread count and flag every running
    # download thread that has a known thread id.
    requestedSpeed = ui.speedSpinBox.value()
    if requestedSpeed != self.threadNum:
        self.threadNum = requestedSpeed
        for txtUrl in self.downloadingUrlList:
            if txtUrl in self.url2ThreadID:
                self.thread2Status[self.url2ThreadID[txtUrl]] = self.changeDownloadSpeed

    # Maximum parallel downloads: store it and rebalance the slots.
    requestedMax = ui.numSpinBox.value()
    if requestedMax != self.maxDownloadNum:
        self.maxDownloadNum = requestedMax
        self.wakeOneWaitedTxt()
def copyDownloadPath(self, newSavePath):
    '''
    Switch the save directory to ``newSavePath``.

    Every ``.txt`` file in ``<old path>/DownloadData`` is copied to
    ``<newSavePath>/DownloadData`` (created together with any missing
    parent directories), then ``self.curSavePath`` and the settings path
    field are updated.

    ``self.savePathChanging`` is True for the duration of the copy and is
    always reset afterwards, even when the copy fails; in that case the
    exception propagates and ``self.curSavePath`` keeps its old value.
    '''
    self.savePathChanging = True
    try:
        oldDataDir = os.path.join(self.curSavePath, 'DownloadData')
        newDataDir = os.path.join(newSavePath, 'DownloadData')
        # makedirs also creates a missing newSavePath itself.
        os.makedirs(newDataDir, exist_ok=True)
        if os.path.isdir(oldDataDir):
            for dirFile in os.listdir(oldDataDir):
                sourceFile = os.path.join(oldDataDir, dirFile)
                # Match the extension, not a substring such as "x.txt.bak".
                if not dirFile.endswith('.txt') or not os.path.isfile(sourceFile):
                    continue
                try:
                    shutil.copyfile(sourceFile, os.path.join(newDataDir, dirFile))
                except shutil.SameFileError:
                    # Old and new directory are the same place: nothing to copy.
                    continue
        self.curSavePath = newSavePath
        self.setUI.pathText.setText(self.curSavePath)
    finally:
        # Never leave waiting code blocked on the flag after a failure.
        self.savePathChanging = False
def recoverSetting(self):
    '''
    Discard unsaved edits in the settings window by resetting its three
    fields to the values currently in effect.
    '''
    ui = self.setUI
    ui.pathText.setText(self.curSavePath)
    ui.numSpinBox.setValue(self.maxDownloadNum)
    ui.speedSpinBox.setValue(self.threadNum)
def quitRunning(self):
    '''
    Leave the running state by clearing ``self.isRunning``; loops that
    poll this flag stop on their next check.
    '''
    self.isRunning = False
def refreshActiveUrl(self):
    '''
    Reload the current page.

    Drops the cached entry of ``self.curUrl`` from
    ``self.allActiveMenuUrl`` (if there is one) and opens the page again.
    '''
    # A default avoids KeyError when the page was never cached.
    self.allActiveMenuUrl.pop(self.curUrl, None)
    self.enterMenu(self.curUrl)
def downloadAllTxt(self):
    '''
    Hand every novel in the download list to ``addDownloadTxtThread``,
    which either starts it or queues it.

    Does nothing unless the calling thread's ident equals
    ``self.mainThreadID``.
    '''
    # current_thread() replaces the deprecated currentThread() alias.
    if threading.current_thread().ident != self.mainThreadID:
        return
    # Iterate over a snapshot so the loop does not depend on the list
    # keeping its length while tasks are being added.
    for txtUrl in list(self.downloadUrlList):
        self.addDownloadTxtThread(txtUrl)
def stopAllTxt(self):
    '''
    Stop every running download.

    Each listed novel that is currently downloading is removed from the
    downloading list and, when its thread id is known, that thread's
    status is set to ``self.stopDownload``. The waiting list is emptied
    and a download-list refresh is signalled.
    '''
    for txtUrl in self.downloadUrlList:
        if txtUrl not in self.downloadingUrlList:
            continue
        if txtUrl in self.url2ThreadID:
            self.thread2Status[self.url2ThreadID[txtUrl]] = self.stopDownload
        self.downloadingUrlList.remove(txtUrl)
    self.waitForDownloadUrlList.clear()
    self.flushDownloadSig.emit()
def cancelDownloadTxt(self, txtUrl):
    '''
    Cancel the download of ``txtUrl`` and remove it from the download
    list; urls that are not in the list are ignored.
    '''
    if txtUrl not in self.downloadUrlList:
        return
    if txtUrl in self.downloadingUrlList:
        # Running: release the slot and tell its thread (if known) to stop.
        self.downloadingUrlList.remove(txtUrl)
        if txtUrl in self.url2ThreadID:
            self.thread2Status[self.url2ThreadID[txtUrl]] = self.stopDownload
    if txtUrl in self.waitForDownloadUrlList:
        self.waitForDownloadUrlList.remove(txtUrl)
    self.downloadUrlList.remove(txtUrl)
    self.downloadListNum -= 1
    self.flushDownloadSig.emit()
def stopOneDownloadingTxt(self, txtUrl):
    '''
    Pause the running download of ``txtUrl``.

    When ``txtUrl`` is in the downloading list it is removed from it,
    the status of its download thread (if the thread id is known) is set
    to ``self.stopDownload`` and a download-list refresh is signalled.
    Urls that are not downloading are ignored.
    '''
    if txtUrl not in self.downloadingUrlList:
        return
    # Look the url up in url2ThreadID: thread2Status is keyed by thread
    # id, so testing the url against it could never match.
    if txtUrl in self.url2ThreadID:
        self.thread2Status[self.url2ThreadID[txtUrl]] = self.stopDownload
    self.downloadingUrlList.remove(txtUrl)
    self.flushDownloadSig.emit()
def startOrStopTxt(self, txtUrl):
    '''
    Toggle a novel in the download list between downloading and paused.

    * downloading -> pause it, free its slot and wake a waiting task
    * waiting     -> start it immediately (urgent)
    * otherwise   -> add it normally; it may end up waiting
    '''
    if txtUrl not in self.downloadingUrlList:
        if txtUrl in self.waitForDownloadUrlList:
            # Give it priority so it starts right away.
            self.addDownloadTxtThread(txtUrl, True)
        else:
            self.addDownloadTxtThread(txtUrl)
        return
    # Pause the running download.
    if txtUrl in self.url2ThreadID:
        self.thread2Status[self.url2ThreadID[txtUrl]] = self.stopDownload
    self.downloadingUrlList.remove(txtUrl)
    self.wakeOneWaitedTxt()
    self.flushDownloadSig.emit()
def wakeOneWaitedTxt(self):
    '''
    Rebalance running downloads against ``self.maxDownloadNum``.

    * Free slots and waiting tasks: take tasks from the front of the
      waiting list, one per free slot, and hand them to
      ``addDownloadTxtThread``.
    * More running downloads than slots: pause the oldest surplus
      downloads through ``stopOneDownloadingTxt``.
    '''
    runningNum = len(self.downloadingUrlList)
    if runningNum < self.maxDownloadNum and len(self.waitForDownloadUrlList):
        for _ in range(self.maxDownloadNum - runningNum):
            if not self.waitForDownloadUrlList:
                break
            self.addDownloadTxtThread(self.waitForDownloadUrlList.pop(0))
    elif runningNum > self.maxDownloadNum:
        # Pass the urls while they are still in the downloading list:
        # stopOneDownloadingTxt ignores urls that are not downloading, so
        # popping them first turned the call into a no-op. It removes the
        # url from the list itself.
        surplusUrls = self.downloadingUrlList[:runningNum - self.maxDownloadNum]
        for txtUrl in surplusUrls:
            self.stopOneDownloadingTxt(txtUrl)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。