基于 MF（矩阵分解）的推荐系统：FunkSVD 算法及其实现

博客小编 (112) 2024-06-03 14:01:01

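花了点时间重写了基于内存的矩阵分解推荐算法：用两个低秩矩阵 P、Q 的乘积去拟合原始评分矩阵。下一篇将会给出 BiasSVD 算法。整个算法的难点在于梯度的计算以及参数的更新；对于批量梯度下降算法，可以参照相应的数学公式实现：对每个已观测的评分 $r_{ui}$，记误差 $e_{ui} = r_{ui} - p_u^{\top} q_i$，损失函数为 $L = \sum_{(u,i)} \left[ e_{ui}^2 + \lambda \left( \lVert p_u \rVert^2 + \lVert q_i \rVert^2 \right) \right]$，对应的梯度为 $\partial L / \partial p_u = -2 e_{ui} q_i + 2 \lambda p_u$ 和 $\partial L / \partial q_i = -2 e_{ui} p_u + 2 \lambda q_i$（代码中的参数 `a` 即 $\lambda$）。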

基于MF 的推荐系统 funckSVD算法以及实现 (https://mushiming.com/) 第1张

基于MF 的推荐系统 funckSVD算法以及实现 (https://mushiming.com/) 第2张

# The star import stays first so that the explicit imports below take
# precedence over any star-imported name, exactly as before.
from numpy import *

import os
import socket
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar
import ray
from sklearn import preprocessing
from sklearn.utils import shuffle

# Tell NumPy to ignore divide-by-zero and invalid-operation floating-point
# errors instead of warning about them.
np.seterr(divide='ignore', invalid='ignore')
'''
This module implements an SVD-style matrix-factorization recommender (FunkSVD); the Ray framework is imported for optional distributed execution.
name: kenny adelaide
email: kenny13141314@163.com
time: 2021/11/17
'''

# ray.init(address='192.168.0.219:6379', _redis_password="5241590000000000")


'''
=================================================common function area==============================================================
'''


def onloaddata(path=None):
    """Load the rating table from a CSV file into memory.

    Args:
        path: Optional path (or file-like object accepted by
            ``pandas.read_csv``) of the CSV to read. When omitted, the file
            ``data/00000005.csv`` located next to this module is used.
            The CSV must provide ``userid`` and ``videoid`` columns.

    Returns:
        list: ``[o_data, userno, videono]`` where ``o_data`` is the loaded
        ``DataFrame`` and ``userno`` / ``videono`` are the largest user id /
        video id plus one, as plain ``int`` values.
    """
    if path is None:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(module_dir, 'data/00000005.csv')
    o_data = pd.read_csv(path)
    # Size is "max id + 1" so that an id can be used directly as a
    # zero-based index; cast to int so the sizes are ordinary Python ints.
    userno = int(o_data['userid'].max()) + 1
    videono = int(o_data['videoid'].max()) + 1
    return [o_data, userno, videono]


def build_score_matrix_R(data, userno, videono):
    """Build the dense user-by-item score matrix from the rating records.

    Args:
        data: ``DataFrame`` with ``userid``, ``videoid`` and ``score`` columns.
        userno: Number of rows of the matrix (largest user id + 1).
        videono: Number of columns of the matrix (largest item id + 1).

    Returns:
        numpy.ndarray: Object array of shape ``(userno, videono)``. Cells
        that have a record hold the score as ``float``; all other cells
        hold ``None``.
    """
    matrix = np.full((userno, videono), None, dtype=object)

    # Walk the three columns in lockstep instead of using ``iterrows``,
    # which builds a Series object for every row. Records are still applied
    # in file order, so a repeated (user, item) pair keeps its last score.
    for user, item, score in zip(data['userid'], data['videoid'], data['score']):
        matrix[int(user), int(item)] = float(score)
    return matrix


def L2Norm(a, vector):
    """Return the L2 regularisation penalty ``a * ||vector||^2``.

    The input is flattened first, so a 1-D vector, a row vector and a
    column vector all give the same result. (Indexing ``[0][0]`` into the
    dot product, as done previously, raised ``TypeError`` for 1-D input.)

    Args:
        a: Regularisation coefficient.
        vector: Array-like of numbers.

    Returns:
        float: ``a`` times the sum of the squared entries of ``vector``;
        ``0.0`` for empty input.
    """
    flat = np.asarray(vector, dtype=float).ravel()
    return float(a * np.dot(flat, flat))


'''
=================================================funck svd==============================================================
'''


def init_P_Q_matrix(user_disms=(3, 3), item_disms=(3, 3), init_method='quadrature'):
    """Create the two randomly initialised factor matrices.

    Args:
        user_disms: ``(n_users, n_factors)``; shape of the user matrix ``P``.
        item_disms: ``(n_items, n_factors)``; the item matrix ``Q`` is
            created transposed, i.e. with shape ``(n_factors, n_items)``.
        init_method: Name of the initialisation scheme. Only
            ``'quadrature'`` is supported; it draws every entry from the
            standard normal distribution.

    Returns:
        list: ``[P, Q]`` as ``numpy.ndarray`` objects.

    Raises:
        ValueError: If ``init_method`` is not a supported scheme. (Returning
            ``None`` here would only fail later, when the caller unpacks
            the result.)
    """
    if str(init_method) != 'quadrature':
        raise ValueError('unsupported init_method: {!r}'.format(init_method))

    P = np.random.randn(user_disms[0], user_disms[1])
    Q = np.random.randn(item_disms[1], item_disms[0])
    return [P, Q]


def calculate_error(P_matrix, Q_matrix, y_matrix):
    """Compute the residuals on the filled cells of the score matrix.

    Args:
        P_matrix: User factor matrix, one row per user.
        Q_matrix: Item factor matrix, one column per item.
        y_matrix: Score matrix in which empty cells hold ``None``.

    Returns:
        1-D array with ``score - prediction`` for every cell of
        ``y_matrix`` that is not ``None``, in row-major order.
    """
    # Element-wise comparison against None is intended here: it yields a
    # boolean mask of the filled cells.
    observed = np.nonzero(y_matrix != None)
    user_idx, item_idx = observed
    user_factors = P_matrix[user_idx]
    item_factors = Q_matrix.T[item_idx]
    predictions = np.sum(np.multiply(user_factors, item_factors), axis=1)
    return y_matrix[observed] - predictions


def gradient(P_matrix, Q_matrix, rows, cols, a, index, error):
    """Compute the regularised gradients for a single observed rating.

    Args:
        P_matrix: User factor matrix, one row per user.
        Q_matrix: Item factor matrix, one column per item.
        rows: Row (user) indices of the observed ratings.
        cols: Column (item) indices of the observed ratings.
        a: Regularisation coefficient.
        index: Position of the rating inside ``rows`` / ``cols``.
        error: Residual of that rating.

    Returns:
        list: ``[Q_gradient, P_gradient]`` -- note the order, the item
        gradient comes first.
    """
    user = rows[index]
    item = cols[index]
    user_factors = P_matrix[user, :]
    item_factors = Q_matrix[:, item]

    error_scale = -2 * error
    penalty_scale = 2 * a

    grad_p = error_scale * item_factors + penalty_scale * user_factors
    grad_q = error_scale * user_factors + penalty_scale * item_factors
    return [grad_q, grad_p]


def updateParameters(Q_gradient, P_gradient, P, Q, learning_rate, index, rows, cols):
    """Apply one gradient-descent step for a single observed rating.

    Both factor matrices are modified in place: the row of ``P`` and the
    column of ``Q`` that belong to the rating at position ``index`` are
    moved against their gradient, scaled by ``learning_rate``.

    Returns:
        list: ``[P, Q]`` -- the same (mutated) array objects that were
        passed in.
    """
    user = rows[index]
    item = cols[index]

    user_step = learning_rate * P_gradient
    item_step = learning_rate * Q_gradient

    P[user, :] -= user_step
    Q[:, item] -= item_step
    return [P, Q]


def funck_svd(learning_rate=0.001, iters=50000, a=0.005, factors=2, tolerance=0.00001):
    """Train a FunkSVD model: fit two factor matrices to the score matrix.

    The rating data is loaded with ``onloaddata``, the factor matrices
    ``P`` and ``Q`` are created with ``init_P_Q_matrix`` and then refined
    by repeated passes over all observed ratings.

    Args:
        learning_rate: Step size of each parameter update.
        iters: Maximum number of passes over the observed ratings.
        a: Regularisation coefficient passed to ``gradient``.
        factors: Number of latent factors per user / item.
        tolerance: Training stops early once the summed squared error of a
            pass is less than or equal to this value.

    Returns:
        tuple: ``(cost_arr, count)`` -- the summed squared error recorded
        for each completed pass and the number of completed passes. The
        cost that triggers the early stop is not recorded.
    """
    [data, userno, videono] = onloaddata()

    [P, Q] = init_P_Q_matrix(user_disms=[userno, factors], item_disms=[videono, factors],
                             init_method='quadrature')
    y_matirx = build_score_matrix_R(data, userno, videono)

    # Defensive conversions for non-ndarray inputs. ``ndarray`` has no
    # ``around`` method, so the module-level ``np.around`` is used.
    if not isinstance(P, np.ndarray):
        P = np.around(np.array(P), decimals=4)
    if not isinstance(Q, np.ndarray):
        Q = np.around(np.array(Q), decimals=4)
    if not isinstance(y_matirx, np.ndarray):
        # Empty cells are ``None`` and cannot be rounded, so only convert.
        y_matirx = np.array(y_matirx, dtype=object)

    # Element-wise comparison against None: coordinates of the filled cells.
    rows, cols = np.nonzero(y_matirx != None)
    cost_arr = []
    count = 0
    for _ in progressbar.progressbar(range(iters)):
        errors_matrix = calculate_error(P, Q, y_matirx)
        cost = np.sum(np.square(errors_matrix))
        if cost <= tolerance:
            break

        # NOTE: the residuals are computed once per pass and reused for
        # every per-rating update inside that pass.
        for index, error in enumerate(errors_matrix):
            [Q_gradient, P_gradient] = gradient(P, Q, rows, cols, a, index, error)
            [P, Q] = updateParameters(Q_gradient, P_gradient, P, Q, learning_rate, index, rows, cols)

        cost_arr.append(cost)
        count += 1

    return cost_arr, count

THE END