# (Web-page residue, not part of the script: "Code fetch complete; the page will refresh automatically.")
import configparser
import os,sys
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils import project
from ting89.ting89_spider import Ting89Spider
from Dmzj.dmzj_spider import DmzjSpider
from ting89search.ting89s_spider import Ting89sSpider
import re
import time
def GetSpiderList(cfgname):
    """Return the option names found in the ``[settings]`` section of a config file.

    ``cfgname`` is joined onto the directory that contains this script, so a
    relative name is resolved next to the script (an absolute path is used
    as-is, which is how ``os.path.join`` behaves).

    Args:
        cfgname: Name of the INI-style config file, e.g. ``'scrapy.cfg'``.

    Returns:
        A list of the option names of the ``[settings]`` section, in file
        order. ``configparser`` lower-cases option names by default.

    Raises:
        configparser.NoSectionError: If the file could not be read or has no
            ``[settings]`` section (``ConfigParser.read`` silently skips
            missing files, so the error surfaces on the section lookup).
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    conf = configparser.ConfigParser()
    conf.read(os.path.join(script_dir, cfgname))
    # NOTE: the original author wanted to return each spider class's ``name``
    # by walking the listed modules, but that traversal failed; the option
    # names of the section are returned instead.
    return [option for option, _value in conf.items('settings')]
def GetSpiderSettings(spider_name, conf):
    """Load the Scrapy project settings and overlay the ``[settings]`` section of ``conf``.

    Args:
        spider_name: Value written to the ``SCRAPY_PROJECT`` environment
            variable before the project settings are loaded.
        conf: A ``configparser.ConfigParser`` that has a ``[settings]``
            section. Every option in it is copied into the Scrapy settings
            with its name upper-cased.

    Returns:
        The settings object returned by ``project.get_project_settings()``
        with the overrides applied.

    Raises:
        configparser.NoSectionError: If ``conf`` has no ``[settings]`` section.
    """
    # Set before loading: Scrapy's get_project_settings() consults
    # SCRAPY_PROJECT to choose which project's settings to load.
    os.environ['SCRAPY_PROJECT'] = spider_name
    spider_setting = project.get_project_settings()

    for option, value in conf.items('settings'):
        key = option.upper()

        # INI values are always strings; turn the literal words true/false
        # (any case) into real booleans.
        lowered = value.lower()
        if lowered == 'false':
            value = False
        elif lowered == 'true':
            value = True
        spider_setting.set(key, value)

        if key == 'LOG_PATH':
            # Create the log directory. makedirs also creates missing parent
            # directories and does not fail if the directory already exists.
            os.makedirs(value, exist_ok=True)
            # Name the log file after the current local date.
            log_name = 'scrapy_{}.log'.format(time.strftime("%Y-%m-%d", time.localtime()))
            spider_setting.set('LOG_FILE', os.path.join(value, log_name))

    return spider_setting
def _spider_name_from_url(url):
    """Return the second-to-last dot-separated label of the URL's host, or None.

    The host is the text between ``//`` and the next ``/`` (or the end of the
    string), e.g. ``http://www.ting89.com/books/1.html`` -> ``'ting89'``.
    ``None`` is returned when no host is found or it has fewer than two labels.
    """
    match = re.search(r'//([^/]+)', url)
    if match is None:
        return None
    labels = match.group(1).split('.')
    if len(labels) < 2:
        return None
    return labels[-2]


def main():
    """Pick the spider that matches a URL and run it in a ``CrawlerProcess``.

    The URL is taken from the first command-line argument or, when none is
    given, from option ``url`` of section ``[Network]`` in ``config.ini``
    (located next to this script). The spider name is derived from the URL's
    host and must be one of the names returned by
    ``GetSpiderList('scrapy.cfg')``; otherwise a message is printed and the
    process exits.

    Raises:
        SystemExit: When the URL does not map to a supported spider.
        configparser.Error: When no URL is given on the command line and
            ``config.ini`` lacks the ``[Network]`` section or ``url`` option.
    """
    spider_factory = GetSpiderList('scrapy.cfg')

    # Load the run-time configuration that sits next to this script.
    conf = configparser.ConfigParser()
    conf.read(os.path.join(os.path.dirname(os.path.realpath(__file__)), './config.ini'), encoding='utf-8')

    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = conf.get('Network', 'url')

    # Derive the spider name from the URL: take the host between '//' and the
    # next '/', split it on '.', and use the second-to-last label.
    spider_name = _spider_name_from_url(url)
    if spider_name is None or spider_name not in spider_factory:
        print("url not support!!!", url)
        # sys.exit instead of the site-provided exit(); status 0 is kept so
        # existing callers see the same exit code.
        sys.exit(0)

    setting = GetSpiderSettings(spider_name, conf)
    process = CrawlerProcess(setting)
    process.crawl(spider_name, url)
    process.start()
# Script entry point: start the crawler only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
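# (Web-page residue, not part of the script: "This section may contain content unsuitable for display, so the page does not show it. You can review and modify it yourself using the editing features.")
# (Web-page residue, continued: "If you confirm the content involves no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or anything violating applicable laws and regulations, you may click submit to appeal and we will handle it as soon as possible.")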