LiangQ/scrapy_demo

Note: this repository declares no open-source license file (LICENSE); check the project description and its upstream code dependencies before use.
start.py 2.46 KB
import configparser
import os
import re
import sys
import time

from scrapy.crawler import CrawlerProcess
from scrapy.utils import project

# The spider classes are imported for their side effects only; the crawl
# below is started by spider name, not by class.
from ting89.ting89_spider import Ting89Spider
from Dmzj.dmzj_spider import DmzjSpider
from ting89search.ting89s_spider import Ting89sSpider
def GetSpiderList(cfgname):
    # The keys of the [settings] section in scrapy.cfg double as the set of
    # supported spider names.
    conf = configparser.ConfigParser()
    conf.read(os.path.join(os.path.dirname(os.path.realpath(__file__)), cfgname))
    spider_path = conf.items('settings')
    spider_module = [x[0] for x in spider_path]
    return spider_module
    # The block below was meant to collect the names from the spider classes
    # themselves, but the traversal failed; it is unreachable after the
    # return above.
    from scrapy.utils.misc import walk_modules
    from scrapy.utils.spider import iter_spider_classes
    spider_name = []
    for module in spider_module:
        for module1 in walk_modules(module):
            for spcls in iter_spider_classes(module1):
                spider_name.append(spcls.name)
    return spider_name
def GetSpiderSettings(spider_name, conf):
    # Select the per-project settings module via SCRAPY_PROJECT, then overlay
    # the values from config.ini on top of them.
    os.environ['SCRAPY_PROJECT'] = spider_name
    spider_setting = project.get_project_settings()
    ini_setting = conf.items('settings')
    ini_setting = {k.upper(): v for k, v in ini_setting}
    for k, v in ini_setting.items():
        if v.lower() == 'false':
            v = False
        elif v.lower() == 'true':
            v = True
        spider_setting.set(k, v)
        if k == 'LOG_PATH':
            # Create the log directory
            if not os.path.exists(v):
                os.mkdir(v)
            # Name the log file after the current date
            LOG_FILE = v + '/scrapy_{}.log'.format(time.strftime("%Y-%m-%d", time.localtime()))
            spider_setting.set('LOG_FILE', LOG_FILE)
    return spider_setting
def main():
    spider_factory = GetSpiderList('scrapy.cfg')
    ## init
    conf = configparser.ConfigParser()
    conf.read(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config.ini'), encoding='utf-8')
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = conf.get('Network', 'url')
    # Derive the spider name from the url: take the host between '//' and the
    # next '/', split it on '.', and use the second-to-last segment.
    spider_name = re.search(r'//.*?/', url).group().split('.')[-2]
    if spider_name not in spider_factory:
        print("url not supported!", url)
        exit(0)
    setting = GetSpiderSettings(spider_name, conf)
    process = CrawlerProcess(setting)
    # The url is forwarded to the spider's constructor.
    process.crawl(spider_name, url)
    process.start()

if __name__ == "__main__":
    main()
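
Run it as `python start.py <url>`, or with no argument to fall back to the url in config.ini. Below is a minimal sketch of the two configuration files the script reads; the section names come from the code above, but the concrete keys, module paths, and values are illustrative assumptions:

# scrapy.cfg -- the keys of [settings] are the supported spider names, and
# each value names that project's settings module (assumed layout).
[settings]
ting89 = ting89.settings
dmzj = Dmzj.settings
ting89search = ting89search.settings

# config.ini -- every key in [settings] is uppercased and overlaid onto the
# Scrapy settings ('true'/'false' become booleans, LOG_PATH also derives
# LOG_FILE); [Network] url is the default crawl target. Values are examples.
[settings]
log_path = ./logs
robotstxt_obey = false

[Network]
url = https://www.ting89.com/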
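
The unreachable block in GetSpiderList() tried to enumerate names by walking the spider modules by hand. A minimal sketch of that enumeration using Scrapy's own SpiderLoader instead, assuming SPIDER_MODULES is set in the active project's settings:

from scrapy.spiderloader import SpiderLoader
from scrapy.utils import project

def list_spider_names():
    # Honours SCRAPY_PROJECT, then lets SpiderLoader walk SPIDER_MODULES
    # and collect the name attribute of every spider class it finds.
    settings = project.get_project_settings()
    return SpiderLoader.from_settings(settings).list()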
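
The regular expression in main() keeps the second-to-last dot-separated piece of the host. The same lookup written with urllib.parse, as a sketch rather than the author's code:

from urllib.parse import urlsplit

def spider_name_from_url(url):
    # 'https://www.ting89.com/book/1' -> 'www.ting89.com' -> 'ting89'
    host = urlsplit(url).hostname
    parts = host.split('.')
    return parts[-2] if len(parts) >= 2 else host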