1 Star 3 Fork 2

Jackin/Python3_t.qq.com_client

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
channelWeiboGather.py 3.70 KB
一键复制 编辑 原始数据 按行查看 历史
Jackin CNLove 提交于 2013-11-09 16:15 . Init
# -*- coding: UTF-8 -*-
import client.tWeibo
import urllib.parse
import datetime
import sqlite3
import random
import time
import math
import json
import sys
import re
# Collect Weibo post content from the "square" (channel) pages
# and save it to SQLite
def stripLink(html):
    """Remove anchor tags from *html*, keeping only what they wrap.

    Args:
        html: HTML fragment (str) that may contain ``<a ...>...</a>`` elements.

    Returns:
        The fragment with every ``<a ...>...</a>`` element replaced by its
        inner content. Matching is case-insensitive and the inner content
        may span several lines.
    """
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub() is ``count``. Passing ``re.I | re.S`` positionally capped the
    # number of replacements at 18 and applied no flags at all.
    return re.sub(
        r"<a [^>]+>(?P<content>.+?)</a>",
        r"\g<content>",
        html,
        flags=re.I | re.S,
    )
def replaceQFace(html):
    """Replace ``<img>`` tags that carry a ``title`` attribute with text markers.

    Judging by the function name these are presumably QQ emoticon ("face")
    images -- verify against the real feed markup.

    Two shapes are handled, in this order:

    * ``<img ... title='T'>``        becomes ``/T `` (trailing space included)
    * ``<img ... title='T' k='K'>``  becomes ``/KT ``

    Args:
        html: HTML fragment (str) using single-quoted attributes.

    Returns:
        The fragment with every matching image tag replaced by its marker.
    """
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub() is ``count``. Passing ``re.I`` positionally limited each call
    # to 2 replacements and left the match case-sensitive.
    html = re.sub(
        r"<img .+?title='(?P<title>[^']+)'>",
        r'/\g<title> ',
        html,
        flags=re.I,
    )
    return re.sub(
        r"<img .+?title='(?P<title>[^']+)' k='(?P<k>[^']+)'>",
        r'/\g<k>\g<title> ',
        html,
        flags=re.I,
    )
# Open (or create) the local SQLite database that receives the collected posts.
connect = sqlite3.connect('weibo.db')
cursor = connect.cursor()
# Make sure the target table exists. The statement is idempotent
# (IF NOT EXISTS), so it is harmless when the table is already there, and it
# prevents "no such table" errors from the INSERTs on a fresh database file.
# weibo_id is UNIQUE, which is what lets "INSERT OR IGNORE" skip duplicates.
cursor.execute("CREATE TABLE IF NOT EXISTS weibo_content (id INTEGER PRIMARY KEY, channel TEXT, weibo_id TEXT UNIQUE, content TEXT, name TEXT, nick TEXT, timestamp INTEGER, pic TEXT, gathertime TEXT DEFAULT CURRENT_TIMESTAMP )")
connect.commit()
# Account credentials handed to the client. The literals below are
# placeholders ("QQ number" / "password") and must be replaced before running.
uin = 'QQ号'
passwd = '密码'
# Build the project's tWeibo client and log in; `wb` performs the HTTP
# requests issued further down (wb.get).
wb = client.tWeibo.tWeibo(uin, passwd)
wb.login()
# Channels to collect, as (channel id, channel name) pairs. The id is sent as
# the `cid` request parameter and the name is stored in the `channel` column.
# Channel that is NOT collected: ('1834', '《大家》'),
channelIds = [
('1254', '热门'),
('1248', '热门'),
('40', '搞笑'),
('838', '动漫'),
('19', '星座'),
('48', '兴趣'),
('34', '读书'),
('883', '科技'),
('1', '时事'),
('47', '资讯'),
('1331', '蔡奇'),
('1688', '路边社'),
('882', '杂谈'),
('1732', '媒体'),
('889', '人生语录'),
('1646', '情感'),
('1719', '语录'),
('1829', '全家福'),
('38', '旅行'),
('31', '摄影'),
('1718', '美图'),
('925', '红酒'),
('1141', '美食家'),
('35', '美食'),
('37', '美容'),
('36', '服饰搭配'),
('49', '时尚'),
('844', '贴图'),
('843', '真人秀'),
('1645', '女神'),
('43', '美女'),
]
# Walk every configured channel, page through its feed, dump each raw response
# to a per-channel file and store the individual posts in SQLite.

# Loop-invariant values, hoisted out of the per-channel loop.
sqlInsert = "INSERT OR IGNORE INTO weibo_content(channel, weibo_id, content, name, nick, timestamp, pic) VALUES(?, ?, ?, ?, ?, ?, ?)"
apiURL = 'http://c.t.qq.com/asyn/selectedAutoUpdate.php?'

for (channelId, channelName) in channelIds:
    header = {
        'Referer':'http://c.t.qq.com/i/' + channelId,
        'User-Agent' : 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17',
    }
    page = 1
    # First request of a channel: no pagination cursor yet.
    furl = apiURL + urllib.parse.urlencode({
        'cid' : channelId,
        'n' : '30',
        'pgv_ref' : 'web.c.page.nav.tree.level1',
        'turn' : '1',
        'version' : '4',
        'personalOrder' : '0',
        'apiType' : '7',
        'apiHost' : 'http://api.t.qq.com',
        # Millisecond timestamp; presumably a cache-buster -- TODO confirm.
        '_r' : math.floor(datetime.datetime.now().timestamp() * 1000),
    })
    rs = wb.get(furl, header).data.decode('utf-8')
    # NOTE(review): the path uses Windows separators and the directory must
    # already exist; kept byte-identical to preserve current behaviour.
    # `with` guarantees the dump file is flushed and closed even when
    # sys.exit() or an unexpected exception leaves the loop early.
    with open('data\\channel\\%s.json' % channelId, 'w', encoding='utf-8') as fp:
        while True:
            # Keep the raw response, one per line, before trying to parse it.
            fp.write(rs.strip() + '\n')
            print("广场 > %s -- %d" % (channelName, page))
            try:
                result = json.loads(rs)
            except ValueError as e:
                # Unparseable response: show it and abort the whole run.
                print(rs)
                print(e)
                sys.exit()
            talks = result['info']['talk']
            if not talks:
                # An empty page marks the end of this channel.
                break
            for talk in talks:
                try:
                    # Remember id/timestamp of the last post seen (whatever its
                    # type); they are sent as the cursor of the next request.
                    # NOTE(review): if every post of a page lacks these keys the
                    # values from an earlier page/channel are reused -- confirm
                    # whether that can happen in practice.
                    tid = talk['id']
                    timestamp = talk['timestamp']
                    # Only posts of type 1 that actually have content are stored.
                    if talk['type'] != 1 or talk['content'] is None:
                        continue
                    cursor.execute(sqlInsert, (
                        channelName,
                        talk['id'],
                        stripLink(replaceQFace(talk['content'])),
                        talk['name'],
                        talk['nick'],
                        talk['timestamp'],
                        ','.join(talk['image']),
                    ))
                except KeyError as e:
                    # Best effort: report the malformed post and carry on.
                    print(talk)
                    print(e)
            # Persist the page before requesting the next one.
            connect.commit()
            page += 1
            furl = apiURL + urllib.parse.urlencode({
                'cid' : channelId,
                'n' : '30',
                'pgv_ref' : 'web.c.page.nav.tree.level1',
                'turn' : '1',
                'version' : '2',
                'personalOrder' : '0',
                'r' : math.floor(datetime.datetime.now().timestamp() * 1000),
                'p' : page,
                'id' : tid,
                'time' : timestamp,
                'apiType' : '7',
                # Fixed typo: was 'http//api.t.qq.com' (missing colon), which
                # disagreed with the value sent on the first request.
                'apiHost' : 'http://api.t.qq.com',
                '_r' : math.floor(datetime.datetime.now().timestamp() * 1000),
            })
            rs = wb.get(furl, header).data.decode('utf-8')
# Final commit (each page is already committed inside the loop), then release
# the cursor and the database connection.
connect.commit()
cursor.close()
connect.close()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/Jackin/Python3_t.qq.com_client.git
[email protected]:Jackin/Python3_t.qq.com_client.git
Jackin
Python3_t.qq.com_client
Python3_t.qq.com_client
master

搜索帮助