master

分支 (1)

管理

管理

master

spider
/
ddc-test.py

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from retrying import retry
import requests
from configparser import ConfigParser
import time
import random

# def _result(result):
#     return result is None

class downloader(object):
    """Fetch listing pages from www.qqddc.com and persist the extracted rows.

    ``doit`` downloads and parses one listing page; ``writer`` appends the
    extracted text to an output file and records the current position in
    ``ddc-config.conf`` (section ``curr``).

    NOTE: an earlier draft retried the request on non-200 responses via the
    ``retrying`` decorator; that behaviour is currently disabled, so ``doit``
    parses whatever body the server returns.
    """

    @staticmethod
    def _load_cookies(path):
        """Parse a ``name=value; name=value`` cookie file into a dict.

        Empty segments (for example after a trailing ``;``) are skipped.

        Raises:
            OSError: if *path* cannot be opened.
            ValueError: if a non-empty segment contains no ``=``.
        """
        # Context manager guarantees the handle is closed even on error.
        with open(path, 'r') as cookie_file:
            raw = cookie_file.read()
        cookies = {}
        for segment in raw.split(';'):
            segment = segment.strip()
            if not segment:
                continue
            name, value = segment.split('=', 1)
            cookies[name] = value
        return cookies

    def doit(self, city, pagenum):
        """Download one listing page and return its entries as text.

        Args:
            city: value sent as the ``pp`` query parameter.
                NOTE(review): a comment in the original suggests ``pp`` is a
                province id and that ``ct=<city id>`` can be used instead
                without it -- confirm against the site.
            pagenum: value sent as the ``pn`` query parameter.

        Returns:
            One line per ``div.item-txt`` element, each terminated by a
            newline, with tab-separated fields: title, brand, the text of the
            second ``<span>``, and the address. An empty string when the page
            has no such elements.

        Raises:
            OSError / ValueError: from reading ``ddc-cookie.txt``.
            AttributeError / IndexError: if an entry lacks the expected
                ``h1 > a`` / ``span`` structure or the address span has no
                full-width colon.
        """
        target = ('http://www.qqddc.com/jxs.do?method=list&pn=' + str(pagenum)
                  + '&pp=' + str(city))
        cookies = downloader._load_cookies('ddc-cookie.txt')
        # NOTE(review): the header value itself starts with "User-Agent:",
        # which looks like a copy-paste slip; left unchanged so the request
        # sent to the server stays exactly as before.
        headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        print('请求数据:', target)
        res = requests.get(url=target, cookies=cookies, headers=headers)
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        bf = BeautifulSoup(res.text, 'html.parser')
        print('解析成功:', target)

        rows = []
        for item in bf.find_all('div', class_='item-txt'):
            spans = item.find_all('span')
            rows.append(
                item.h1.a.text + '\t'
                + '品牌：' + spans[0].a.text + '\t'
                + spans[1].text + '\t'
                + '地址：' + spans[2].text.split("：")[1]
            )
        return ''.join(row + '\n' for row in rows)

    def writer(self, path, text, city, pagenum):
        """Append *text* to *path* and record the position in the config file.

        Args:
            path: output file, opened in append mode as UTF-8.
            text: a string or an iterable of strings; written after a leading
                newline.
            city: stored (as a string) under ``[curr] city``.
            pagenum: stored (as a string) under ``[curr] pagenum``.

        The config file ``ddc-config.conf`` is read from and rewritten in the
        current working directory. If it is missing or lacks a ``curr``
        section, the section is created rather than raising after the data
        has already been written.
        """
        # Write the content.
        with open(path, 'a', encoding='utf-8') as out:
            out.write('\n')
            # writelines accepts both a single string and a list of strings.
            out.writelines(text)

        # Update the config with the current position.
        cfg = ConfigParser()
        cfg.read('ddc-config.conf')
        if not cfg.has_section('curr'):
            cfg.add_section('curr')
        cfg.set('curr', 'city', str(city))
        cfg.set('curr', 'pagenum', str(pagenum))
        with open('ddc-config.conf', 'w') as configfile:
            cfg.write(configfile)

if __name__ == "__main__":
    # Ad-hoc probe: request one listing URL and report whether it is empty.
    #
    # Notes carried over from the original comments:
    # - a page number past the end is answered with a 302 to an error page;
    # - a city id past the valid range returns empty content;
    # - the intended crawl loops starting from ct=1, pn=1.
    # target = 'http://www.qqddc.com/jxs.do?method=list&pn='+str(pagenum)+'&pp='+str(city)
    target = 'http://www.qqddc.com/jxs.do?method=list&ct=455'  # query by city id directly; no province id needed

    # Read the cookie file with a context manager so the handle is closed.
    with open('ddc-cookie.txt', 'r') as cookie_file:
        raw_cookies = cookie_file.read()
    cookies = {}
    for segment in raw_cookies.split(';'):
        segment = segment.strip()
        if not segment:
            # Skip empty pieces, e.g. after a trailing ';'.
            continue
        name, value = segment.split('=', 1)
        cookies[name] = value

    # NOTE(review): the header value itself starts with "User-Agent:", which
    # looks like a copy-paste slip; left unchanged to keep the request as-is.
    headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
    print('请求数据:', target)
    res = requests.get(url=target, cookies=cookies, headers=headers)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries are installed.
    bf = BeautifulSoup(res.text, 'html.parser')
    divs = bf.find_all('div', class_='item-txt')
    # Prints 'true' when the page has no listing entries, 'false' otherwise.
    print('false' if divs else 'true')