代码拉取完成,页面将自动刷新
同步操作将从 郭鼎/北网-2分院人工智能-1804C-资管 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
from spider import Spider
# Disabled usage example, kept as an inert string literal (never executed):
# it shows Spider(url).get_info(...) being called directly with one regex per
# keyword argument, which is what the classes below wrap.
'''
url = 'https://www.x23us.com/html/66/66656/'
book_author_regex = '<meta name="og:novel:author" content="(.*?)"/> '
book_name_regex = '<meta name="og:novel:book_name" content="(.*?)"/>'
book_chapter_regex = '<td class="L"><a href="(.*?).html">(.*?)</a></td>'
x = Spider(url).get_info(book_author = book_author_regex,
book_name = book_name_regex,
book_chapter=book_chapter_regex,
)
'''
class BookInfoApi(Spider):
    """Scrape a novel's index page for its author, title and chapter list.

    Instances are built from the index-page URL (see ``save``). The three
    patterns below are passed to ``Spider.get_info`` as keyword arguments.
    """

    # Patterns live on the class so they stay readable as
    # ``instance.book_author_regex`` etc. and can be overridden per subclass.
    # NOTE(review): the trailing space after "/>" in the author pattern is
    # kept from the original -- presumably it mirrors the page markup; verify
    # before removing it.
    book_author_regex = '<meta name="og:novel:author" content="(.*?)"/> '
    book_name_regex = '<meta name="og:novel:book_name" content="(.*?)"/>'
    # Two groups per chapter row: the link target without ".html", then the
    # link text.
    book_chapter_regex = '<td class="L"><a href="(.*?).html">(.*?)</a></td>'

    def book_info(self):
        """Run the three patterns through ``get_info`` and return its result.

        Returns:
            Whatever ``Spider.get_info`` returns. Judging by how ``save``
            indexes it, this is a dict with the keys ``book_author``,
            ``book_name`` and ``book_chapter``, each holding a list of
            matches (``book_chapter`` entries being 2-tuples).
        """
        # The result is stored under a private name. The original assigned it
        # to ``self.book_info``, which replaced this method on the instance
        # and made a second call fail with "'dict' object is not callable".
        self._book_info = self.get_info(
            book_author=self.book_author_regex,
            book_name=self.book_name_regex,
            book_chapter=self.book_chapter_regex,
        )
        return self._book_info
class ChapterInfo(Spider):
    """Scrape a single chapter page for its title, body and next-page link.

    Instances are built from the chapter URL (see ``save``). The three
    patterns below are passed to ``Spider.get_info`` as keyword arguments.
    """

    chapter_name_regex = '<h1>(.*?)</h1>'
    chapter_content_regex = '<dd id="contents">(.*?)</dd>'
    # Captures the href of the anchor whose text is "下一页" (next page).
    # The original value was a verbatim copy of one chapter's navigation bar
    # (book 66656, chapter 27429412) with no capture group, so it could not
    # match any other page. In that markup the href is root-relative, e.g.
    # "/html/66/66656/27429412.html".
    # NOTE(review): assumes get_info applies the pattern with findall-like
    # semantics, as the other patterns' results suggest -- confirm.
    chapter_next_regex = '<a href="([^"]*)">下一页</a>'

    def content_info(self):
        """Run the three patterns through ``get_info`` and return its result.

        Returns:
            Whatever ``Spider.get_info`` returns. Judging by how ``save``
            indexes it, this is a dict with the keys ``chapter_name``,
            ``chapter_content`` and ``chapter_next``, each holding a list of
            matches. The result is also kept on ``self.chapter_info``.
        """
        self.chapter_info = self.get_info(
            chapter_name=self.chapter_name_regex,
            chapter_content=self.chapter_content_regex,
            chapter_next=self.chapter_next_regex,
        )
        return self.chapter_info
def save(book_url='https://www.x23us.com/html/4/4779/', max_chapters=10):
    """Download the first chapters of a novel into a single text file.

    The output file is created in the current working directory and named
    after the scraped book title and author.

    Args:
        book_url: URL of the book's index page. Chapter URLs are built by
            plain concatenation, so it must end with "/".
        max_chapters: How many chapters to fetch, counted from the start of
            the chapter list. ``None`` fetches every chapter. Defaults to 10,
            which was the hard-coded limit before this parameter existed.

    Raises:
        IndexError: If the index page yields no book name or author.
    """
    book_info = BookInfoApi(book_url).book_info()
    file_name = '书名:{}-作者:{}.txt'.format(
        book_info['book_name'][0],
        book_info['book_author'][0],
    )
    separator = '*' * 20

    # Explicit UTF-8: the platform default encoding (e.g. GBK/cp1252 on
    # Windows) may be unable to encode the scraped text.
    with open(file_name, 'w', encoding='utf-8') as f:
        for chapter_id, _chapter_title in book_info['book_chapter'][:max_chapters]:
            chapter_url = book_url + chapter_id + '.html'
            chapter_info = ChapterInfo(chapter_url).content_info()

            names = chapter_info['chapter_name']
            contents = chapter_info['chapter_content']
            if not names or not contents:
                # A page that did not match would otherwise abort the whole
                # download with IndexError; report it and carry on.
                print('skipped (no title/content matched): {}'.format(chapter_url))
                continue

            print(names[0])

            # The first replace() in the original was a space-for-space no-op
            # as extracted; the sample output recorded in this file shows
            # content starting with "&n...", so the intended target was
            # presumably the "&nbsp;" entity. A literal U+00A0 is normalised
            # as well.
            text = (
                contents[0]
                .replace('&nbsp;', ' ')
                .replace('\xa0', ' ')
                .replace('<br />', '\n')
            )

            f.write(names[0])
            f.write('\n\n')
            f.write(text)
            f.write('\n\n')
            f.write(separator)
            f.write('\n\n')
if __name__ == '__main__':
    # The string below is an inert literal used as a note; it is never read.
    # It appears to record sample return values of BookInfoApi.book_info()
    # and ChapterInfo.content_info() (the keys match the keyword arguments
    # those methods pass to get_info).
    '''
book_info:
{'book_author': ['塞林格'],
'book_name': ['麦田里的守望者'],
'book_chapter': [('1524372', '·内容提要·'), ('1524375', '·作品赏析·'), ('1524378', '第01节'), ('1524381', '第02节'), ('1524384', '第03节'), ('1524387', '第04节'), ('1524390', '第05节'), ('1524394', '第06节'), ('1524397', '第07节'), ('1524400', '第08节'), ('1524403', '第09节'), ('1524406', '第10节'), ('1524410', '第11节'), ('1524413', '第12节'), ('1524416', '第13节'), ('1524419', '第14节'), ('1524422', '第15节'), ('1524425', '第16节'), ('1524428', '第17节'), ('1524431', '第18节'), ('1524434', '第19节'), ('1524437', '第20节'), ('1524441', '第21节'), ('1524444', '第22节'), ('1524447', '第23节'), ('1524450', '第24节'), ('1524453', '第25节'), ('1524456', '第26节')]}
chapter_info:
{'chapter_name': ['正文 ·内容提要·'],
'chapter_content': ['&n...],
'chapter_next': []
}
'''
    # Script entry point: download with the default book URL.
    save()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。