
BugIt / Second-hand housing transaction data crawler (二手房交易数据爬虫)

wh_house.py 3.03 KB
# encoding=utf8
# Python 2 script: relies on print statements, urllib.urlopen and the
# old BeautifulSoup 3 package (import BeautifulSoup).
import sys
import csv
import time
import urllib
import random
import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf8')  # allow implicit UTF-8 str/unicode conversion
class WhHouseMarket:
    # Lianjia (Wuhan) closed-deal listing URL and the output CSV path.
    dataUrl = "https://wh.lianjia.com/chengjiao"
    outPutFile = "/Users/didi/work/tmp/whHouse1.csv"
    retData = []

    def __init__(self):
        print "init WhHouseMarket"
    def run(self):
        print "run WhHouseMarket"
        dataRealUrl = self.dataUrl  # first request hits the base listing page (page 1)
        count = 1
        retry = 0
        while True:
            count += 1
            dataObj = self.getHtml(dataRealUrl)
            datalen = self.parseData(dataObj)
            print "page %d crawl done with len of %d!" % (count - 1, datalen)
            if datalen < 10:
                # Fewer than 10 items usually means the last page or a bad
                # response; retry the same page up to 3 times, then stop.
                if retry < 3:
                    retry += 1
                    count -= 1
                    continue
                else:
                    break
            if count > 20000:
                break
            retry = 0
            dataRealUrl = "%s/pg%d" % (self.dataUrl, count)  # next page: /pg2, /pg3, ...
            time.sleep(random.randint(5, 10))  # throttle requests
        self.dumpRet(self.outPutFile)
    def getHtml(self, urlStr):
        data = urllib.urlopen(urlStr)
        # data = open('/Users/didi/work/tmp/index.html')
        return BeautifulSoup.BeautifulSoup(data)
    def parseData(self, dataObj):
        itemList = dataObj.findAll("ul", {"class": "listContent"})
        itemList = itemList[0].findAll("li")
        for item in itemList:
            title = item.div.findAll("div", {"class": "title"})[0].a.string
            dealDate = item.div.findAll("div", {"class": "dealDate"})[0].string
            dealCycleTxt = item.div.findAll("span", {"class": "dealCycleTxt"})[0]
            unitPrice = item.div.findAll("div", {"class": "unitPrice"})[0].span.string
            totalPrice = item.div.findAll("div", {"class": "totalPrice"})[0].span.string
            houseExtInfo = item.div.findAll("div", {"class": "houseInfo"})[0].contents[1]
            newItem = {}
            newItem["title"] = title
            newItem["dealDate"] = dealDate
            newItem["unitPrice"] = unitPrice
            newItem["totalPrice"] = totalPrice
            newItem["houseExtInfo"] = houseExtInfo
            spans = dealCycleTxt.findAll('span')
            if len(spans) >= 2:
                newItem["expectPrice"] = spans[0].string
                newItem["period"] = spans[1].string
            self.retData.append(newItem)
        return len(itemList)
    def dumpRet(self, filename):
        # Write all records to CSV in a fixed column order, sorted by deal date.
        # item.values() order is not guaranteed and some records lack
        # expectPrice/period, so use .get() with an explicit field list.
        fields = ['title', 'dealDate', 'unitPrice', 'totalPrice',
                  'expectPrice', 'period', 'houseExtInfo']
        with open(filename, 'wb') as fp:
            myWriter = csv.writer(fp)
            self.retData.sort(key=self.takeDealDate)
            for item in self.retData:
                myWriter.writerow([item.get(f, '') for f in fields])

    def takeDealDate(self, data):
        return data['dealDate']
if __name__ == '__main__':
    newAna = WhHouseMarket()
    newAna.run()
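
The script above is Python 2 only and depends on the retired BeautifulSoup 3 package. Below is a minimal, hedged sketch of the same fetch-and-parse step on Python 3; it assumes the third-party requests and bs4 (BeautifulSoup 4) packages are installed, and that the page still uses the listContent/title/dealDate/unitPrice/totalPrice class names the original parser relies on. The file name whHouse1.csv and the helper names are illustrative, not part of the repository.

# Hypothetical Python 3 sketch (not part of this repository);
# requires: pip install requests beautifulsoup4
import csv
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://wh.lianjia.com/chengjiao"

def fetch_page(page):
    # Page 1 is the bare listing URL; later pages use the /pgN suffix,
    # mirroring the run() loop above.
    url = BASE_URL if page == 1 else "%s/pg%d" % (BASE_URL, page)
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

def parse_page(soup):
    records = []
    for li in soup.select("ul.listContent li"):
        def text(selector):
            node = li.select_one(selector)
            return node.get_text(strip=True) if node else ""
        records.append({
            "title": text("div.title a"),
            "dealDate": text("div.dealDate"),
            "unitPrice": text("div.unitPrice span"),
            "totalPrice": text("div.totalPrice span"),
        })
    return records

if __name__ == "__main__":
    rows = parse_page(fetch_page(1))
    with open("whHouse1.csv", "w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=["title", "dealDate", "unitPrice", "totalPrice"])
        writer.writeheader()
        writer.writerows(rows)

Pagination with retries, request throttling, and the extra expectPrice/period/houseExtInfo fields would follow the same pattern as run() and parseData() in the original script.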
Repository: https://gitee.com/hengbo12345/house_deal_crawler.git (hengbo12345/house_deal_crawler, branch master)
