from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# Load the movielens-100k dataset (downloaded on first use).
data = Dataset.load_builtin('ml-100k')

# Define a cross-validation iterator.
kf = KFold(n_splits=3)

algo = SVD()

# Fix: the loop body must be indented; the original paste lost the
# indentation, which makes this a SyntaxError.
for trainset, testset in kf.split(data):
    # Train and test the algorithm on this fold.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error for the fold.
    accuracy.rmse(predictions, verbose=True)
import pandas as pd
from surprise import Dataset
from surprise import Reader

# Creation of the dataframe. Column names are irrelevant; only the
# column order passed to load_from_df matters.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
import os

from surprise import Dataset
from surprise import Reader

# Fix: `os` was used below without being imported (NameError).
# Path to the dataset file.
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Fix: `os` was used below without being imported (NameError).
# Path to the dataset folder.
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

# Fix: the loop body must be indented; the original paste lost the
# indentation, which makes this a SyntaxError.
for trainset, testset in pkf.split(data):
    # Train and test the algorithm on this predefined fold.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error for the fold.
    accuracy.rmse(predictions, verbose=True)
随机预测:random_pred.NormalPredictor 假设训练集评分服从正态分布,根据训练集数据进行最大似然估计得到评分的均值 $\hat\mu$ 和标准差 $\hat\sigma$,构建正态分布 $N(\hat\mu,\hat\sigma)$,由正态分布产生预测结果。

$$\hat{\mu}=\frac{\sum_{r_{ui}\in R_{train}}r_{ui}}{|R_{train}|}$$

$$\hat{\sigma}=\sqrt{\frac{\sum_{r_{ui}\in R_{train}}(r_{ui}-\hat{\mu})^2}{|R_{train}|}}$$
baseline:baseline_only.BaselineOnly(bsl_options={}, verbose=True) 假设函数为 $r_{ui}=b_{ui}=\mu+b_u+b_i$,则需要最小化

$$\sum_{r_{ui}\in R_{train}}(r_{ui}-(\mu+b_u+b_i))^2+\lambda\left(\sum_u b_u^2+\sum_i b_i^2\right)$$

可使用Stochastic Gradient Descent (SGD)或Alternating Least Squares (ALS)进行求解。 使用SGD时,损失函数为
$$\sum_{r_{ui}\in R_{train}}(r_{ui}-(\mu+b_u+b_i))^2+\lambda\left(\sum_u b_u^2+\sum_i b_i^2\right)$$

参数有:
使用ALS时,损失函数为

$$\sum_{r_{ui}\in R_{train}}(r_{ui}-(\mu+b_u+b_i))^2+\lambda_1\sum_u b_u^2+\lambda_2\sum_i b_i^2$$

参数有:
Cosine similarity

$$cosine\_sim(u,v)=\frac{\sum_{i\in I_{uv}}r_{ui}\cdot r_{vi}}{\sqrt{\sum_{i\in I_{uv}}r_{ui}^2}\cdot\sqrt{\sum_{i\in I_{uv}}r_{vi}^2}}$$
Mean Squared Difference similarity

$$msd(u,v)=\frac{1}{|I_{uv}|}\sum_{i\in I_{uv}}(r_{ui}-r_{vi})^2$$

$$msd\_sim(u,v)=\frac{1}{1+msd(u,v)}$$
Pearson similarity

$$pearson\_sim(u,v)=\frac{\sum_{i\in I_{uv}}(r_{ui}-\mu_u)(r_{vi}-\mu_v)}{\sqrt{\sum_{i\in I_{uv}}(r_{ui}-\mu_u)^2}\cdot\sqrt{\sum_{i\in I_{uv}}(r_{vi}-\mu_v)^2}}$$
Pearson baseline similarity:使用baseline $b_{ui}$ 取代均值(mean)

$$pearson\_baseline\_sim(u,v)=\frac{\sum_{i\in I_{uv}}(r_{ui}-b_{ui})(r_{vi}-b_{vi})}{\sqrt{\sum_{i\in I_{uv}}(r_{ui}-b_{ui})^2}\cdot\sqrt{\sum_{i\in I_{uv}}(r_{vi}-b_{vi})^2}}$$

当评分矩阵十分稀疏时,使用下式计算以减少过拟合
$$pearson\_baseline\_shrink\_sim(u,v)=\frac{|I_{uv}|-1}{|I_{uv}|-1+shrinkage}\cdot pearson\_baseline\_sim(u,v)$$
knns.KNNBasic(k=40, min_k=1, sim_options={}, verbose=True):基本的基于物品or用户的协同过滤

$$\hat r_{ui}=\frac{\sum_{v\in N^k_i(u)}sim(u,v)\times r_{vi}}{\sum_{v\in N^k_i(u)}sim(u,v)}$$

knns.KNNWithMeans(k=40, min_k=1, sim_options={}, verbose=True):考虑用户/物品偏好的协同过滤
$$\hat r_{ui}=\mu_u+\frac{\sum_{v\in N^k_i(u)}sim(u,v)\times (r_{vi}-\mu_v)}{\sum_{v\in N^k_i(u)}sim(u,v)}$$

knns.KNNWithZScore(k=40, min_k=1, sim_options={}, verbose=True):考虑用户/物品偏好,并对用户/物品评分进行z-score归一化
$$\hat r_{ui}=\mu_u+\sigma_u\frac{\sum_{v\in N^k_i(u)}sim(u,v)\times (r_{vi}-\mu_v)\div\sigma_v}{\sum_{v\in N^k_i(u)}sim(u,v)}$$

knns.KNNBaseline(k=40, min_k=1, sim_options={}, bsl_options={}, verbose=True):基于baseline的协同过滤
$$\hat r_{ui}=b_{ui}+\frac{\sum_{v\in N^k_i(u)}sim(u,v)\times (r_{vi}-b_{vi})}{\sum_{v\in N^k_i(u)}sim(u,v)}$$
matrix_factorization.SVD:考虑偏置项的LFM,也称为bias-SVD。假设函数为

$$\hat r_{ui}=\mu+b_u+b_i+q_i^Tp_u$$

构造损失函数为

$$\sum_{r_{ui}\in R_{train}}(r_{ui}-\hat r_{ui})^2+\lambda(b_u^2+b_i^2+||q_i||^2+||p_u||^2)$$

代入假设函数得

$$\sum_{r_{ui}\in R_{train}}(r_{ui}-(\mu+b_u+b_i+q_i^Tp_u))^2+\lambda(b_u^2+b_i^2+||q_i||^2+||p_u||^2)$$

使用SGD的迭代公式如下:
$$b_u \leftarrow b_u + \gamma(r_{ui}-\hat r_{ui}-\lambda b_u)$$

$$b_i \leftarrow b_i + \gamma(r_{ui}-\hat r_{ui}-\lambda b_i)$$

$$q_i \leftarrow q_i + \gamma((r_{ui}-\hat r_{ui})p_u-\lambda q_i)$$

$$p_u \leftarrow p_u + \gamma((r_{ui}-\hat r_{ui})q_i-\lambda p_u)$$

其中 $\gamma$ 是学习速率,$\lambda$ 是正则化系数。 算法参数:
| 参数名 | 说明 | 默认值 |
| --- | --- | --- |
| n_factors | 设置的factor数目 | 100 |
| n_epochs | SGD算法的迭代次数 | 20 |
| biased | 是否使用bias-SVD,True使用bias-SVD,False使用SVD | True |
| lr_all | 所有参数的学习速率,也可以针对不同的参数设置不同的学习速率 | 0.005 |
| reg_all | 所有参数的正则化系数,也可以针对不同的参数设置不同的正则化系数 | 0.02 |
返回 $p_u$、$q_i$、$b_u$、$b_i$。
matrix_factorization.SVDpp 前面的LFM模型并没有显式地考虑用户的历史行为对用户评分预测的影响。Koren在Netflix Prize比赛中提出了一个模型,将用户历史评分的物品加入到了LFM模型中,结合了基于item的邻域方法(item-CF)和LFM,称为SVD++。(论文:Factor in the Neighbors: Scalable and Accurate Collaborative Filtering) 假设函数为:
$$\hat r_{ui}=\mu+b_u+b_i+q_i^Tp_u+\frac{1}{|I_u|}\sum_{j\in I_u}w_{ij}$$

将物品相似度矩阵 $w_{ij}$ 同样进行分解,得:
$$\hat r_{ui}=\mu+b_u+b_i+q_i^Tp_u+\frac{1}{|I_u|}x_i^T\sum_{j\in I_u}y_j$$

为了缓解过拟合,令 $x=q$,得
$$\hat r_{ui}=\mu+b_u+b_i+q_i^T\left(p_u+\frac{1}{|I_u|}\sum_{j\in I_u}y_j\right)$$

后面损失函数的构造方式,以及算法的超参数,与bias-SVD相似。
matrix_factorization.NMF Non-negative Matrix Factorization (NMF),一种基于非负矩阵分解的协同过滤算法。假设函数为 $\hat r_{ui}=\mu+b_u+b_i+q_i^Tp_u$,与SVD的区别在于 $q_i$ 和 $p_u$ 的值在任何时候都是非负的。相关论文有:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Grab the built-in movielens-100k data (fetched automatically when absent).
data = Dataset.load_builtin('ml-100k')

# Evaluate a plain SVD model.
algo = SVD()

# Five-fold cross-validation; report RMSE and MAE per fold plus summary stats.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Work on the built-in movielens-100K data.
data = Dataset.load_builtin('ml-100k')

# Candidate hyperparameter values to sweep over.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# Exhaustive search with 3-fold cross-validation, scored on RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE score found.
print(gs.best_score['rmse'])

# Parameter combination that achieved it.
print(gs.best_params['rmse'])