代码拉取完成,页面将自动刷新
# encoding=utf8
import sys
import csv
import time
import urllib
import random
import BeautifulSoup
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter startup, so the module is reloaded to get it back; the default
# encoding for implicit str/unicode conversions is then switched to UTF-8.
# NOTE(review): presumably needed so the scraped non-ASCII text can be passed
# to the csv writer without UnicodeEncodeError -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf8')
class WhHouseMarket:
    """Crawl the paginated deal listing under ``dataUrl`` and dump it to CSV.

    ``run`` walks the listing page by page, ``parseData`` extracts one dict
    per ``<li>`` entry into ``self.retData``, and ``dumpRet`` writes the
    collected records, sorted by deal date, to ``outPutFile``.
    """

    dataUrl = "https://wh.lianjia.com/chengjiao"
    outPutFile = "/Users/didi/work/tmp/whHouse1.csv"

    # Crawl limits used by run().
    maxPages = 20000      # highest page number that will be requested
    maxRetries = 3        # extra attempts for a page that comes back short
    minPageItems = 10     # fewer items than this marks a page as "short"

    # Fixed CSV column order, so every row is aligned even when a record
    # lacks the optional 'expectPrice' / 'period' keys.
    csvColumns = ('title', 'dealDate', 'unitPrice', 'totalPrice',
                  'expectPrice', 'period', 'houseExtInfo')

    def __init__(self):
        """Create a scraper with its own, initially empty, result list."""
        print("init WhHouseMarket")
        # Per-instance list: a class-level list would be shared (and keep
        # growing) across every WhHouseMarket object.
        self.retData = []

    def run(self):
        """Crawl pages 1..maxPages, then write everything to ``outPutFile``.

        A page with fewer than ``minPageItems`` entries is fetched again up
        to ``maxRetries`` times; if it is still short, it is treated as the
        last page (its entries are kept) and the crawl stops.
        """
        print("run WhHouseMarket")
        page = 1
        retry = 0
        while page <= self.maxPages:
            if page == 1:
                pageUrl = self.dataUrl
            else:
                pageUrl = "%s/pg%d" % (self.dataUrl, page)

            # Remember where this page's records start so a retried page
            # does not leave duplicates behind.
            mark = len(self.retData)
            datalen = self.parseData(self.getHtml(pageUrl))
            print("page %d crewl done with len of %d!" % (page, datalen))

            if datalen < self.minPageItems:
                if retry >= self.maxRetries:
                    break
                # The counter must advance here, otherwise a persistently
                # short page would be requested forever.
                retry += 1
                del self.retData[mark:]
            else:
                retry = 0
                page += 1

            # Pause between requests (including retries) to go easy on the
            # server.
            time.sleep(random.randint(5, 10))
        self.dumpRet(self.outPutFile)

    def getHtml(self, urlStr):
        """Download ``urlStr`` and return it parsed as a BeautifulSoup tree."""
        response = urllib.urlopen(urlStr)
        try:
            markup = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
        return BeautifulSoup.BeautifulSoup(markup)

    def parseData(self, dataObj):
        """Append one record per listing entry in ``dataObj`` to ``retData``.

        Args:
            dataObj: parsed page as returned by ``getHtml``.

        Returns:
            The number of ``<li>`` entries found in the first
            ``<ul class="listContent">``; 0 when the page has no such list.
        """
        containers = dataObj.findAll("ul", {"class": "listContent"})
        if not containers:
            # No listing on the page: report an empty page so run() can
            # retry or stop instead of crashing with IndexError.
            return 0

        itemList = containers[0].findAll("li")
        for item in itemList:
            box = item.div
            newItem = {
                'title': box.findAll("div", {"class": "title"})[0].a.string,
                'dealDate': box.findAll("div", {"class": "dealDate"})[0].string,
                'unitPrice': box.findAll("div", {"class": "unitPrice"})[0].span.string,
                'totalPrice': box.findAll("div", {"class": "totalPrice"})[0].span.string,
                'houseExtInfo': box.findAll("div", {"class": "houseInfo"})[0].contents[1],
            }
            # The deal-cycle block carries two optional spans; they are
            # stored as expectPrice and period, in that order.
            spans = box.findAll("span", {"class": "dealCycleTxt"})[0].findAll('span')
            if len(spans) >= 2:
                newItem["expectPrice"] = spans[0].string
                newItem["period"] = spans[1].string
            self.retData.append(newItem)
        return len(itemList)

    def dumpRet(self, filename):
        """Sort ``retData`` by deal date and write it to ``filename`` as CSV.

        Rows follow ``csvColumns``; keys missing from a record are written
        as empty fields.
        """
        # Sort before opening so a failing sort does not truncate an
        # existing output file.
        self.retData.sort(key=self.takeDealDate)
        if sys.version_info[0] < 3:
            # The Python 2 csv module requires a binary-mode file.
            fp = open(filename, 'wb')
        else:
            fp = open(filename, 'w', newline='', encoding='utf-8')
        with fp:
            writer = csv.writer(fp)
            for item in self.retData:
                writer.writerow([item.get(col, '') for col in self.csvColumns])

    def takeDealDate(self, data):
        """Sort key: return the 'dealDate' value of one record."""
        return data['dealDate']
if __name__ == '__main__':
    # Script entry point: build the scraper and run the full crawl.
    WhHouseMarket().run()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。