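# NOTE: hosting-page residue captured with the file, not part of the program: "Code pull complete; the page will refresh automatically."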
# -*- coding: utf-8 -*-
"""
# @Author : FengLiang
# @Time : 2020/4/11 19:50
# @File : ItemCF.py
"""
from operator import itemgetter
import numpy as np
import pandas as pd
# Alternative input file, kept for reference.
# data_file = 'ratings.csv'
# Path of the CSV file that ItemCF.__init__ loads; only its first three
# columns are read, and they are renamed to user, item and rating.
data_file = 'text3.csv'
# Disabled experiment: load the frame at module level and keep only a subset
# of users and items.
# df = pd.read_csv(data_file, usecols=range(3))
# df.columns = ['user', 'item', 'rating']
# df = df[(df.user <= 300) & (df.item <= 4000)]
class ItemCF:
    """Item-based collaborative filtering over (user, item, rating) triples.

    Typical use: ``load_data`` -> ``items_similarity`` -> ``predict`` /
    ``validate`` / ``evaluate``.
    """

    def __init__(self, data_path=None):
        """Read the ratings CSV and initialise empty model state.

        :param data_path: CSV path to read; when omitted, the module-level
            ``data_file`` is used. Only the first three columns are read and
            they are renamed to ``user``, ``item`` and ``rating``.
        """
        if data_path is None:
            data_path = data_file
        self.frame = pd.read_csv(data_path, usecols=range(3))
        self.frame.columns = ['user', 'item', 'rating']
        self.data = {}            # all ratings: {user: {item: rating}}
        self.train = {}           # training split, same layout
        self.test = {}            # test split, same layout
        self.similarity = {}      # {item_i: {item_j: similarity}}
        self.similarity_max = {}  # per-column maxima, filled when normal=True

    @staticmethod
    def _process_data(input_data):
        """Convert a ratings DataFrame into a nested dictionary.

        :param input_data: DataFrame with ``user``, ``item`` and ``rating``
            columns.
        :return: ``{user_id: {item_id: rating}}`` with int ids and float
            ratings; a later duplicate (user, item) row overwrites an earlier
            one.
        """
        output_data = {}
        # Iterating the three columns directly avoids building one Series per
        # row, which is what DataFrame.iterrows does.
        columns = zip(input_data['user'], input_data['item'],
                      input_data['rating'])
        for user, item, rating in columns:
            output_data.setdefault(int(user), {})[int(item)] = float(rating)
        return output_data

    def load_data(self, train_size, normalize=True):
        """Optionally normalise ratings, then split into train and test sets.

        :param train_size: fraction of rows sampled (fixed seed) for training;
            the remaining rows form the test set.
        :param normalize: when true, min-max scale the rating column of
            ``self.frame`` in place to the range [0, 1].
        :return: None. Fills ``self.data``, ``self.train`` and ``self.test``.
        """
        print('loading data...')
        if normalize:
            rating = self.frame['rating']
            lowest = rating.min()
            span = rating.max() - lowest
            if span > 0:
                self.frame['rating'] = (rating - lowest) / span
            else:
                # All ratings equal (or no rows): 0/0 would turn every rating
                # into NaN, so map the constant column to 0.0 instead.
                self.frame['rating'] = 0.0
        train_data = self.frame.sample(frac=train_size, random_state=10,
                                       axis=0)
        test_data = self.frame[~self.frame.index.isin(train_data.index)]
        self.data = self._process_data(self.frame)
        self.train = self._process_data(train_data)
        self.test = self._process_data(test_data)
        print('loaded data finish...')

    def items_similarity(self, normal=False):
        """Build the item-item similarity matrix from the training set.

        Each user who rated both items adds ``1 / log(1 + n)`` to the pair's
        co-occurrence weight, where ``n`` is the number of items that user
        rated. The weight is then divided by ``sqrt(count_i * count_j)``,
        where ``count_x`` is the number of users who rated item ``x``.

        :param normal: when true, additionally divide every entry by the
            largest value in its column ``j`` and store those maxima in
            ``self.similarity_max``.
        :return: None. Replaces ``self.similarity``.
        """
        print('computing all deviations...')
        items_matrix = {}      # {i: {j: weighted co-occurrence}}
        user_items_count = {}  # {item: number of users who rated it}
        for items in self.train.values():
            if not items:
                continue  # log(1 + 0) == 0 would make the weight infinite
            # The weight depends only on the user, so compute it once here.
            weight = 1 / np.log(1 + len(items))
            for i in items:
                user_items_count[i] = user_items_count.get(i, 0) + 1
                row = items_matrix.setdefault(i, {})
                for j in items:
                    if i != j:
                        row[j] = row.get(j, 0) + weight

        # Build into a fresh dict so pairs from an earlier call (for example
        # after re-splitting the data) cannot leak into the new matrix.
        similarity = {}
        for i, related_items in items_matrix.items():
            similarity[i] = {
                j: weight / np.sqrt(user_items_count[i] * user_items_count[j])
                for j, weight in related_items.items()
            }

        if normal:
            column_max = {}
            for row in similarity.values():
                for j, value in row.items():
                    column_max[j] = max(column_max.get(j, 0), value)
            for row in similarity.values():
                for j in row:
                    row[j] = row[j] / column_max[j]
            self.similarity_max = column_max

        self.similarity = similarity
        print('computed all deviations finish...')

    def predict(self, user, K, N=None):
        """Recommend items for one user.

        For every item the user rated in the training set, take its ``K``
        most similar items, skip those the user already rated, and add
        ``rating * similarity`` to the candidate's score.

        :param user: user id.
        :param K: number of nearest items considered per rated item.
        :param N: when truthy, return only the ``N`` best candidates;
            otherwise return all of them.
        :return: list of ``(item, score)`` sorted by descending score; empty
            when the user is not in the training set.
        """
        rated = self.train.get(user)
        if rated is None:
            return []
        recommendations = {}
        for sim_item, rating in rated.items():
            # An item missing from the matrix has no neighbours; treat it as
            # an empty row instead of raising KeyError.
            neighbours = sorted(self.similarity.get(sim_item, {}).items(),
                                key=itemgetter(1), reverse=True)[:K]
            for related_item, factor in neighbours:
                if related_item in rated:
                    continue
                recommendations[related_item] = (
                    recommendations.get(related_item, 0) + rating * factor)
        result_list = sorted(recommendations.items(), key=itemgetter(1),
                             reverse=True)
        return result_list[:N] if N else result_list

    def validate(self, K=20):
        """Compute MAE and RMSE on the test set.

        Only (user, item) pairs that appear both in the test set and in the
        user's recommendations are compared; the predicted value is the score
        returned by ``predict``.

        :param K: neighbourhood size forwarded to ``predict``.
        :return: ``(mae, rmse)``; ``(nan, nan)`` when no pair is comparable.
        """
        print('calculating MAE and RMSE...')
        error_sum = 0.0
        sq_error_sum = 0.0
        compared = 0
        for user, user_ratings in self.test.items():
            for item, rating in self.predict(user, K):
                if item in user_ratings:
                    diff = user_ratings[item] - rating
                    error_sum += abs(diff)
                    sq_error_sum += diff ** 2
                    compared += 1
        if compared == 0:
            # Nothing to measure; NaN avoids both a ZeroDivisionError and a
            # misleading "perfect" 0.0.
            return float('nan'), float('nan')
        mae = error_sum / compared
        rmse = np.sqrt(sq_error_sum / compared)
        return mae, rmse

    def evaluate(self, K, N=10):
        """Compute precision and recall of the top-N recommendations.

        :param K: neighbourhood size forwarded to ``predict``.
        :param N: recommendation list length. Every test user contributes
            ``N`` to the precision denominator even if fewer items were
            recommended; when ``N`` is falsy the actual list length is used.
        :return: ``(precision, recall)``; a ratio whose denominator is zero
            is reported as 0.0.
        """
        print('calculating precision and recall...')
        hit = 0
        recall_sum = 0
        precision_sum = 0
        for user, real_items in self.test.items():
            pred_items = [item for item, _ in self.predict(user, K, N)]
            hit += sum(1 for item in pred_items if item in real_items)
            recall_sum += len(real_items)
            precision_sum += N if N else len(pred_items)
        precision = hit / float(precision_sum) if precision_sum else 0.0
        recall = hit / float(recall_sum) if recall_sum else 0.0
        return precision, recall
if __name__ == '__main__':
    # Single end-to-end run: load and normalise the ratings, keep 80% of the
    # rows for training, build the similarity matrix, then report MAE/RMSE.
    itemcf = ItemCF()
    itemcf.load_data(train_size=0.8, normalize=True)
    itemcf.items_similarity(normal=False)
    mae, rmse = itemcf.validate()
    print(mae, rmse)

    # Disabled experiment: sweep over several training fractions. The calls
    # are written against the current signatures (evaluate and predict both
    # require K; the list-length keyword is N).
    # train_range = [0.9, 0.8, 0.7, 0.6, 0.5]
    # for i in train_range:
    #     print('数据集:', i)
    #     itemcf.load_data(train_size=i, normalize=True)
    #     itemcf.items_similarity(normal=False)
    #
    #     mae, rmse = itemcf.validate()
    #     print(mae, rmse)
    #
    #     pre, rec = itemcf.evaluate(K=20, N=10)
    #     print(pre, rec)
    # res = itemcf.predict(1, K=20, N=10)
    # print(res)
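# NOTE: hosting-page moderation notice captured with the file, not part of the program: "This section may contain content unsuitable for display, so the page does not show it. You can review and modify it using the relevant editing functions."
# "If you confirm the content involves no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or content that violates national laws and regulations, you may click submit to appeal and we will handle it as soon as possible."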