花了点时间,重写了基于内存、用矩阵分解拟合原始评分矩阵的推荐算法;下一篇将会给出 BiasSVD 算法。整个算法的难点在于梯度的计算以及参数的更新。对于批量梯度下降算法,可以参照相应的数学公式来实现。
# NOTE: the star import is kept (and kept first) because other parts of the
# file may rely on names it exposes; everything below uses the explicit
# ``np`` alias, which the original code referenced without importing.
from numpy import *
import numpy as np
import ray
import socket
import pandas as pd
import os
from sklearn.utils import shuffle
from sklearn import preprocessing
from collections import Counter
import time
import progressbar
import matplotlib.pyplot as plt

# Silence NumPy's divide-by-zero / invalid-value floating point warnings
# for the whole process.
np.seterr(divide='ignore', invalid='ignore')

# Matrix-factorisation recommender (the original header calls it "svd++";
# the code below fits two factor matrices P and Q to the rating matrix with
# gradient descent) intended to be used together with the ray framework.
#
# name:  kenny adelaide
# email: kenny13141314@163.com
# time:  2021/11/17

# NOTE(review): the commented-out call below embeds a cluster address and a
# redis password; credentials should come from configuration, not source.
# ray.init(address='192.168.0.219:6379', _redis_password="5241590000000000")

# ============================ common function area ============================


def onloaddata(path=None):
    """Load the rating table and derive the rating-matrix dimensions.

    Args:
        path: Optional CSV location. When omitted, ``data/00000005.csv``
            relative to this file's directory is read.

    Returns:
        ``[o_data, userno, videono]`` where ``o_data`` is the DataFrame read
        from the CSV and ``userno`` / ``videono`` are the largest ``userid``
        / ``videoid`` value plus one.

    Raises:
        ValueError: If the CSV contains no rows, so no dimensions can be
            derived.
    """
    if path is None:
        base_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(base_dir, 'data/00000005.csv')
    o_data = pd.read_csv(path)
    if o_data.empty:
        raise ValueError('rating table is empty: %s' % path)
    userno = int(o_data['userid'].max()) + 1
    videono = int(o_data['videoid'].max()) + 1
    return [o_data, userno, videono]


def build_score_matrix_R(data, userno, videono):
    """Build the dense score matrix from the rating table.

    Args:
        data: DataFrame with ``userid``, ``videoid`` and ``score`` columns.
        userno: Number of rows of the matrix (largest user code + 1).
        videono: Number of columns of the matrix (largest item code + 1).

    Returns:
        An object-dtype array of shape ``(userno, videono)``. Cells without a
        rating hold ``None``; rated cells hold the score as a float. When the
        same (user, video) pair occurs more than once, the last row wins.
    """
    matrix = np.full((userno, videono), None, dtype=object)
    # Column-wise iteration avoids the per-row Series that iterrows() builds
    # while keeping the original assignment order.
    for user, video, score in zip(data['userid'], data['videoid'], data['score']):
        matrix[int(user), int(video)] = float(score)
    return matrix


def L2Norm(a, vector):
    """Return ``a`` times the squared Euclidean norm of ``vector``.

    Args:
        a: Scalar weight (regularisation coefficient).
        vector: Array-like of numbers; it is flattened before the norm is
            taken, so 1-D vectors and row/column vectors are all accepted.

    Returns:
        ``a * sum(vector ** 2)`` as a float.
    """
    flat = np.asarray(vector, dtype=float).ravel()
    return float(a * np.dot(flat, flat))


# ================================= funck svd =================================


def init_P_Q_matrix(user_disms=(3, 3), item_disms=(3, 3), init_method='quadrature'):
    """Create the two factor matrices used for training.

    Args:
        user_disms: ``(rows, cols)`` of the user matrix P.
        item_disms: ``(rows, cols)`` of the item matrix; Q is created
            transposed, i.e. with shape ``(item_disms[1], item_disms[0])``.
        init_method: Initialisation scheme. Only ``'quadrature'`` is
            implemented; it fills both matrices with ``np.random.randn``
            samples.

    Returns:
        ``[P, Q]`` for ``'quadrature'``; ``None`` for any other
        ``init_method`` (callers must check before unpacking).
    """
    if str(init_method) != 'quadrature':
        return None
    # P is drawn before Q so results stay reproducible under a fixed seed.
    P = np.random.randn(user_disms[0], user_disms[1])
    Q = np.random.randn(item_disms[1], item_disms[0])
    return [P, Q]


def _observed_entries(y_matrix):
    """Return ``(rows, cols, ratings)`` of the cells of ``y_matrix`` that are not None."""
    y_matrix = np.asarray(y_matrix)
    # Elementwise comparison is intended here, so ``is not None`` cannot be used.
    rows, cols = np.nonzero(y_matrix != None)  # noqa: E711
    ratings = y_matrix[rows, cols].astype(float)
    return rows, cols, ratings


def _prediction_errors(P_matrix, Q_matrix, rows, cols, ratings):
    """Return ``ratings`` minus the row-of-P · column-of-Q product for each (row, col)."""
    return ratings - np.sum(P_matrix[rows] * Q_matrix.T[cols], axis=1)


def calculate_error(P_matrix, Q_matrix, y_matrix):
    """Compute the residual of every observed rating.

    Args:
        P_matrix: User factor matrix, indexed by row.
        Q_matrix: Item factor matrix, indexed by column.
        y_matrix: Score matrix in which unrated cells are ``None``.

    Returns:
        1-D float array with ``y[r, c] - P[r, :] · Q[:, c]`` for each rated
        cell, ordered as returned by ``np.nonzero``.
    """
    rows, cols, ratings = _observed_entries(y_matrix)
    return _prediction_errors(P_matrix, Q_matrix, rows, cols, ratings)


def gradient(P_matrix, Q_matrix, rows, cols, a, index, error):
    """Gradient of the regularised squared error for one observed rating.

    Args:
        P_matrix: User factor matrix.
        Q_matrix: Item factor matrix.
        rows: Row indices of the observed ratings.
        cols: Column indices of the observed ratings.
        a: Regularisation coefficient.
        index: Position of the sample inside ``rows`` / ``cols``.
        error: Residual of that sample.

    Returns:
        ``[Q_gradient, P_gradient]`` -- note that the item gradient comes
        first.
    """
    or_row, or_col = rows[index], cols[index]
    user_vec = P_matrix[or_row, :]
    item_vec = Q_matrix[:, or_col]
    P_gradient = -2 * error * item_vec + 2 * a * user_vec
    Q_gradient = -2 * error * user_vec + 2 * a * item_vec
    return [Q_gradient, P_gradient]


def updateParameters(Q_gradient, P_gradient, P, Q, learning_rate, index, rows, cols):
    """Apply one gradient step to the factors of a single observed rating.

    ``P`` and ``Q`` are modified in place; the same arrays are returned.

    Args:
        Q_gradient: Gradient for column ``cols[index]`` of ``Q``.
        P_gradient: Gradient for row ``rows[index]`` of ``P``.
        P: User factor matrix.
        Q: Item factor matrix.
        learning_rate: Step size.
        index: Position of the sample inside ``rows`` / ``cols``.
        rows: Row indices of the observed ratings.
        cols: Column indices of the observed ratings.

    Returns:
        ``[P, Q]``.
    """
    or_row, or_col = rows[index], cols[index]
    P[or_row, :] -= learning_rate * P_gradient
    Q[:, or_col] -= learning_rate * Q_gradient
    return [P, Q]


def _run_sgd(P, Q, y_matrix, learning_rate, iters, a, tol):
    """Run up to ``iters`` passes over the observed ratings, updating P and Q in place."""
    # The set of observed cells never changes, so locate it once.
    rows, cols, ratings = _observed_entries(y_matrix)
    cost_arr = []
    count = 0
    for _ in progressbar.progressbar(range(iters)):
        # Cost of the pass is measured before its updates are applied.
        cost = np.sum(np.square(_prediction_errors(P, Q, rows, cols, ratings)))
        if cost <= tol:
            break
        for index, (row, col, rating) in enumerate(zip(rows, cols, ratings)):
            # Recompute the residual from the current factors: earlier
            # updates in this pass have already changed P and Q, so a
            # residual taken at the start of the pass would be stale.
            error = rating - np.dot(P[row, :], Q[:, col])
            [Q_gradient, P_gradient] = gradient(P, Q, rows, cols, a, index, error)
            [P, Q] = updateParameters(Q_gradient, P_gradient, P, Q,
                                      learning_rate, index, rows, cols)
        cost_arr.append(cost)
        count += 1
    return cost_arr, count


def funck_svd(learning_rate=0.001, iters=50000, a=0.005, n_factors=2, tol=0.00001):
    """Fit two factor matrices to the rating matrix loaded by ``onloaddata``.

    Args:
        learning_rate: Gradient step size.
        iters: Maximum number of passes over the observed ratings.
        a: Regularisation coefficient.
        n_factors: Number of latent factors per user / item.
        tol: Training stops early once the summed squared error of a pass
            is less than or equal to this value.

    Returns:
        ``(cost_arr, count)``: the summed squared error recorded at the
        start of every completed pass, and the number of completed passes.
        The trained factor matrices themselves are not returned.
    """
    [data, userno, videono] = onloaddata()
    [P, Q] = init_P_Q_matrix(user_disms=[userno, n_factors],
                             item_disms=[videono, n_factors],
                             init_method='quadrature')
    y_matrix = build_score_matrix_R(data, userno, videono)
    return _run_sgd(P, Q, y_matrix, learning_rate, iters, a, tol)