'''
Matrix-factorisation recommender utilities (Funk SVD; intended as a basis
for an SVD++ model running on the Ray framework).

name: kenny adelaide
email: kenny13141314@163.com
time: 2021/11/17
'''
# NOTE: the star import is kept first, in its original position, so that the
# explicit imports below win over any name numpy happens to export.
from numpy import *
import numpy as np
import ray
import socket
import pandas as pd
import os
from sklearn.utils import shuffle
from sklearn import preprocessing
from collections import Counter
import time
import progressbar
import matplotlib.pyplot as plt

# Silence divide-by-zero / invalid-value floating point warnings module-wide.
np.seterr(divide='ignore', invalid='ignore')

# ray.init(address='', _redis_password="5241590000000000")

# ============================ common function area ===========================


def onloaddata():
    '''
    Load the rating data set that ships next to this module.

    Reads ``data/00000005.csv`` relative to the directory of this file.

    Returns:
        list: ``[o_data, userno, videono]`` where ``o_data`` is the loaded
        DataFrame, ``userno`` is the largest ``userid`` plus one and
        ``videono`` is the largest ``videoid`` plus one.
    '''
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, 'data/00000005.csv')
    o_data = pd.read_csv(path)
    # +1 so the largest id is still a valid zero-based index.
    userno = o_data['userid'].max() + 1
    videono = o_data['videoid'].max() + 1
    return [o_data, userno, videono]


def build_score_matrix_R(data, userno, videono):
    '''
    Build the observed score matrix from the raw rating records.

    Args:
        data: DataFrame with ``userid``, ``videoid`` and ``score`` columns.
        userno: number of rows (largest user code plus one).
        videono: number of columns (largest item code plus one).

    Returns:
        numpy.ndarray: object array of shape ``(userno, videono)`` holding a
        float score for every rated (user, video) pair and ``None`` for
        every pair without a rating.
    '''
    # Object dtype so that "no rating" (None) stays distinguishable from 0.0.
    matrix = np.full((userno, videono), None, dtype=object)
    # Iterating the three columns directly avoids the per-row Series that
    # DataFrame.iterrows() builds; a later duplicate still overwrites an
    # earlier one, exactly as before.
    for user, video, score in zip(data['userid'], data['videoid'], data['score']):
        matrix[int(user), int(video)] = float(score)
    return matrix


def L2Norm(a, vector):
    '''
    Return element ``[0][0]`` of ``np.dot(vector, vector) * a``.

    Args:
        a: scalar weight applied to the product.
        vector: array-like passed twice to ``np.dot``.

    Returns:
        The ``[0][0]`` entry of the scaled product.

    NOTE(review): this only works when ``np.dot(vector, vector)`` is at least
    2-D (i.e. ``vector`` is a square 2-D array); a 1-D vector yields a scalar
    and the indexing fails. Presumably meant as an L2 regularisation term --
    confirm against the callers before changing it.
    '''
    result = list(np.dot(vector, vector) * a)[0][0]
    return result


# ================================= Funk SVD ==================================


def init_P_Q_matrix(user_disms=(3, 3), item_disms=(3, 3), init_method='quadrature'):
    '''
    Create the two randomly initialised factor matrices used for training.

    Args:
        user_disms: ``(rows, cols)`` shape of the user matrix P.
        item_disms: ``(rows, cols)`` of the item side; Q is created
            transposed, i.e. with shape ``(item_disms[1], item_disms[0])``.
        init_method: initialisation scheme. Only ``'quadrature'`` is
            implemented; it draws every entry with ``numpy.random.randn``.

    Returns:
        list | None: ``[P, Q]`` for ``'quadrature'``; ``None`` for any other
        ``init_method`` (callers that unpack the result must handle this).
    '''
    if str(init_method) != 'quadrature':
        return None
    # Explicit np.random instead of relying on `random` leaking in through
    # `from numpy import *`.
    P = np.random.randn(user_disms[0], user_disms[1])
    Q = np.random.randn(item_disms[1], item_disms[0])
    return [P, Q]


def calculate_error(P_matrix, Q_matrix, y_matrix):
    '''
    Compute the prediction error for every observed rating.

    Args:
        P_matrix: user factor matrix, one row per user.
        Q_matrix: item factor matrix, one column per item.
        y_matrix: score matrix in which missing ratings are ``None``.

    Returns:
        numpy.ndarray: 1-D float array ``y - P[row] . Q[:, col]`` with one
        entry per non-``None`` cell of ``y_matrix``, in ``np.nonzero`` order.
    '''
    # Element-wise comparison against None is intentional here; `is not None`
    # would test the array object itself.
    rows, cols = np.nonzero(y_matrix != None)  # noqa: E711
    # Cast the observed scores to float so the arithmetic below runs on a
    # numeric array rather than element-by-element on Python objects.
    observed = y_matrix[rows, cols].astype(float)
    predicted = np.sum(P_matrix[rows] * Q_matrix.T[cols], axis=1)
    errors = observed - predicted
    return errors


def gradient(P_matrix, Q_matrix, rows, cols, a, index, error):
    '''
    Compute the regularised gradients for a single observed rating.

    Args:
        P_matrix: user factor matrix.
        Q_matrix: item factor matrix.
        rows: row (user) indices of the observed ratings.
        cols: column (item) indices of the observed ratings.
        a: regularisation coefficient.
        index: position of the rating inside ``rows`` / ``cols``.
        error: prediction error of that rating.

    Returns:
        list: ``[Q_gradient, P_gradient]`` -- note the order, Q first.
    '''
    or_row, or_col = rows[index], cols[index]
    user_vec = P_matrix[or_row, :]
    item_vec = Q_matrix[:, or_col]
    P_gradient = -2 * error * item_vec + 2 * a * user_vec
    Q_gradient = -2 * error * user_vec + 2 * a * item_vec
    return [Q_gradient, P_gradient]


def updateParameters(Q_gradient, P_gradient, P, Q, learning_rate, index, rows, cols):
    '''
    Apply one gradient-descent step to the factors of a single rating.

    ``P`` and ``Q`` are modified in place and also returned.

    Args:
        Q_gradient: gradient for the item column.
        P_gradient: gradient for the user row.
        P: user factor matrix.
        Q: item factor matrix.
        learning_rate: step size.
        index: position of the rating inside ``rows`` / ``cols``.
        rows: row (user) indices of the observed ratings.
        cols: column (item) indices of the observed ratings.

    Returns:
        list: ``[P, Q]`` after the update.
    '''
    or_row, or_col = rows[index], cols[index]
    P[or_row, :] -= learning_rate * P_gradient
    Q[:, or_col] -= learning_rate * Q_gradient
    return [P, Q]


def funck_svd(learning_rate=0.001, iters=50000, a=0.005):
    '''
    Train a Funk-SVD model: fit two factor matrices to the rating matrix.

    Args:
        learning_rate: gradient-descent step size.
        iters: maximum number of passes over the observed ratings.
        a: regularisation coefficient.

    Returns:
        tuple: ``(cost_arr, count)`` -- the squared-error cost recorded at
        the start of every completed pass and the number of completed passes.
    '''
    data, userno, videono = onloaddata()
    P, Q = init_P_Q_matrix(user_disms=[userno, 2],
                           item_disms=[videono, 2],
                           init_method='quadrature')
    y_matirx = build_score_matrix_R(data, userno, videono)

    # Defensive normalisation in case the helpers ever return plain
    # sequences. ndarray has no `.around` method, so use np.around.
    if not isinstance(P, np.ndarray):
        P = np.around(np.array(P), decimals=4)
    if not isinstance(Q, np.ndarray):
        Q = np.around(np.array(Q), decimals=4)
    if not isinstance(y_matirx, np.ndarray):
        # Not rounded: the matrix may hold None placeholders.
        y_matirx = np.array(y_matirx, dtype=object)

    # Coordinates of the observed ratings, in the same order that
    # calculate_error() reports their errors.
    rows, cols = np.nonzero(y_matirx != None)  # noqa: E711

    cost_arr = []
    count = 0
    for _ in progressbar.progressbar(range(iters)):
        errors_matrix = calculate_error(P, Q, y_matirx)
        cost = np.sum(np.square(errors_matrix))
        if cost <= 0.00001:
            break
        # NOTE(review): the errors are computed once per pass and not
        # refreshed while P and Q change inside the pass -- confirm this is
        # the intended update scheme.
        for index, error in enumerate(errors_matrix):
            Q_gradient, P_gradient = gradient(P, Q, rows, cols, a, index, error)
            P, Q = updateParameters(Q_gradient, P_gradient, P, Q,
                                    learning_rate, index, rows, cols)
        cost_arr.append(cost)
        count += 1
    return cost_arr, count