推荐系统-FM模型实现

Posted by mudux on February 7, 2020

 之前博客讲解了FM模型理论知识,这里用tensorflow实现FM模型。
tffm 库(见文末参考 3)实现了任意阶的FM模型,可作为参考。

注:这里只有二阶交叉项。

1. 数据集

 来自MovieLens数据集,为用户对电影的评分数据。

2. tensorflow实现

util.py文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from itertools import count
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy.sparse import csr


def sigmoid(input):
    """Apply the logistic function ``1 / (1 + exp(-x))``.

    :param input: a number or NumPy array; arrays are processed element-wise
    :return: the logistic function evaluated at ``input``
    """
    exp_of_negated = np.exp(-input)
    denominator = 1 + exp_of_negated
    return 1 / denominator

def vectorize_dic(dic, ix=None, p=None):
    """
    Build a one-hot encoded scipy CSR matrix from columns of categorical values.

    Every distinct ``str(value) + str(feature_name)`` key is assigned one
    column; each sample (row) gets a 1 in the column of each of its values.

    :param dic: dictionary mapping a feature name to the sequence of that
        feature's values, one entry per sample (all sequences must have the
        same length)
    :param ix: mapping from key to column index that hands out the next free
        index for an unseen key (default None: a fresh mapping is created).
        Pass the mapping returned for the training data to encode test data
        with the same columns. The mapping is updated in place.
    :param p: dimension of feature space (number of columns in the sparse
        matrix). Default None: the number of keys in ``ix`` after encoding.
        Entries whose column index is >= p are dropped from the matrix.
    :return: tuple ``(X, ix)`` with the ``(n_samples, p)`` CSR matrix and the
        (possibly extended) key-to-column mapping
    """
    # Use the public scipy.sparse namespace; the ``scipy.sparse.csr``
    # submodule is a deprecated private module in recent SciPy releases.
    from scipy.sparse import csr_matrix

    if ix is None:
        counter = count(0)
        ix = defaultdict(lambda: next(counter))

    n = len(list(dic.values())[0])  # number of samples
    g = len(dic)  # number of features
    nz = n * g  # number of non-zeros (one per sample per feature)

    # Column indices are interleaved so that the g entries of one sample are
    # contiguous: positions i, i + g, i + 2g, ... belong to the i-th feature.
    col_ix = np.empty(nz, dtype=int)
    for i, (k, lis) in enumerate(dic.items()):
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    # Keys first seen in this call may have received indices >= p (e.g. test
    # values absent from training); those entries are left out.
    keep = col_ix < p
    return csr_matrix((data[keep], (row_ix[keep], col_ix[keep])), shape=(n, p)), ix


def loadDataSet(trainPath, testPath, mode):
    """
    Load tab-separated rating files and encode them for the FM model.

    Both files are read without a header row as the columns
    ``user, item, rating, timestamp``. Users and items are one-hot encoded
    with ``vectorize_dic``; the test set reuses the training column mapping
    and is restricted to the training set's number of columns.

    :param trainPath: train file path
    :param testPath:  test file path
    :param mode: ``'regression'`` to use the raw ratings as targets (1-D
        arrays), or ``'classification'`` to produce ``(n, 1)`` float arrays
        holding 1 where the rating equals 5 and -1 where it is below 5
    :return: tuple ``(X_train, y_train, X_test, y_test)``
    :raises ValueError: if ``mode`` is not one of the two supported values
    """
    # Fail before any file I/O: an unrecognised mode used to fall through
    # silently and return all-zero targets.
    if mode not in ('regression', 'classification'):
        raise ValueError(
            "mode must be 'regression' or 'classification', got %r" % (mode,))

    cols = ['user', 'item', 'rating', 'timestamp']
    train = pd.read_csv(trainPath, delimiter='\t', names=cols)
    test = pd.read_csv(testPath, delimiter='\t', names=cols)

    X_train, ix = vectorize_dic({'users': train.user.values, 'items': train.item.values})
    X_test, _ = vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])

    y1 = train.rating.values
    y2 = test.rating.values
    if mode == 'regression':
        y_train = y1.copy()
        y_test = y2.copy()
    else:
        # Ratings above 5 (if any) keep the initial value 0.
        y_train = np.zeros((len(y1), 1))
        y_train[y1 == 5] = 1
        y_train[y1 < 5] = -1
        y_test = np.zeros((len(y2), 1))
        y_test[y2 == 5] = 1
        y_test[y2 < 5] = -1
    return X_train, y_train, X_test, y_test



def batcher(X_, y_=None, batch_size=None):
    """Yield consecutive mini-batches of ``X_`` and, optionally, ``y_``.

    :param X_: samples; must expose ``shape[0]`` and support row slicing
    :param y_: targets aligned with ``X_`` row by row, or None when there are
        no targets
    :param batch_size: number of rows per batch (default None: a single batch
        holding every row). The last batch may be smaller.
    :return: generator of ``(x_batch, y_batch)`` tuples; ``y_batch`` is None
        when ``y_`` is None
    """
    n_samples = X_.shape[0]

    # Nothing to yield; also avoids range() with a zero step when batch_size
    # defaults to n_samples == 0.
    if n_samples == 0:
        return

    if batch_size is None:
        batch_size = n_samples

    for start in range(0, n_samples, batch_size):
        stop = min(start + batch_size, n_samples)
        # The yield is unconditional: it used to sit inside the
        # ``y_ is not None`` branch, so calls without targets produced
        # no batches at all.
        ret_y = None if y_ is None else y_[start:stop]
        yield (X_[start:stop], ret_y)

FM.py文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from util import *
import tensorflow as tf

class FM():
    """Second-order factorization machine built on TensorFlow graph-mode APIs
    (``tf.placeholder`` / ``tf.Session``).

    Typical use: ``load_data()``, ``build_model()``, ``train()``, then
    ``evaluate()``. ``evaluate()`` closes the session opened by ``train()``.
    """

    def __init__(self, trainPath, testPath, k, mode='regression'):
        """
        :param trainPath: path of the training ratings file
        :param testPath: path of the test ratings file
        :param k: number of latent factors per feature
        :param mode: ``'regression'`` or ``'classification'``
        """
        self.trainPath = trainPath
        self.testPath = testPath
        self.k = k
        self.mode = mode

    def load_data(self):
        """Load both data sets via ``loadDataSet`` and store them densely on
        the instance as ``X_train``, ``y_train``, ``X_test``, ``y_test``."""
        X_train, y_train, X_test, y_test = loadDataSet(self.trainPath, self.testPath, self.mode)
        X_train = X_train.todense()
        X_test = X_test.todense()
        print("Train data shape: ", X_train.shape)
        print("Test data shape: ", X_test.shape)
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def build_model(self):
        """Create the placeholders, model output, loss and optimizer op.

        Requires ``load_data()`` to have been called first.

        :raises ValueError: if ``self.mode`` is not a supported mode
        """
        # Reject an unsupported mode before building anything; previously it
        # surfaced later as a missing ``self.loss`` attribute.
        if self.mode not in ('regression', 'classification'):
            raise ValueError(
                "mode must be 'regression' or 'classification', got %r" % (self.mode,))

        self.n, self.p = self.X_train.shape
        # design matrix
        self.X = tf.placeholder('float', shape=[None, self.p])
        # target vector
        self.y = tf.placeholder('float', shape=[None, 1])

        # bias and weights
        w0 = tf.Variable(tf.zeros([1]))
        W = tf.Variable(tf.zeros([self.p]))

        # interaction factors, randomly initialized
        V = tf.Variable(tf.random.normal([self.k, self.p], stddev=0.01))

        # w0 + sum_i W_i * x_i
        linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, self.X), 1, keepdims=True))
        # Pairwise term computed as
        # 0.5 * sum_f [ (sum_i V_fi x_i)^2 - sum_i V_fi^2 x_i^2 ]
        pair_interaction = (tf.multiply(0.5,
                                         tf.reduce_sum(
                                             tf.subtract(
                                                 tf.pow(tf.matmul(self.X, tf.transpose(V)), 2),
                                                 tf.matmul(tf.pow(self.X, 2), tf.transpose(tf.pow(V, 2)))),
                                             1, keepdims=True)))
        self.y_hat = tf.add(linear_terms, pair_interaction)

        # L2 regularisation strengths; both are 0, i.e. regularisation is off.
        lambda_w = tf.constant(0.00, name='lambda_w')
        lambda_v = tf.constant(0.00, name='lambda_v')
        l2_norm = tf.add(
            tf.reduce_sum(tf.multiply(lambda_w, tf.pow(W, 2))),
            tf.reduce_sum(tf.multiply(lambda_v, tf.pow(V, 2)))
        )

        if self.mode == 'regression':
            self.error = tf.reduce_mean(tf.square(tf.subtract(self.y, self.y_hat)))
            self.loss = tf.add(self.error, l2_norm)
        else:
            # Shape diagnostics kept from the original implementation.
            print(self.y.get_shape().as_list())
            print(self.y_hat.get_shape().as_list())
            # Targets are fed as -1/+1, but sigmoid cross-entropy expects
            # labels in [0, 1]; map them to 0/1 inside the graph.
            labels = tf.cast(tf.greater(self.y, 0), tf.float32)
            self.error = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=self.y_hat))
            self.loss = tf.add(self.error, l2_norm)
            print(self.loss.get_shape().as_list())
            print(l2_norm.get_shape().as_list())

        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(self.loss)

    def train(self):
        """Run 30 epochs of mini-batch gradient descent (batch size 256) in a
        new session stored as ``self.sess``, reshuffling the data each epoch."""
        epochs = 30
        batch_size = 256
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)
        for epoch in range(epochs):
            perm = np.random.permutation(self.X_train.shape[0])
            batches = batcher(self.X_train[perm], self.y_train[perm], batch_size)
            for step, (batchX, batchY) in enumerate(batches):
                feed = {self.X: batchX.reshape(-1, self.p), self.y: batchY.reshape(-1, 1)}
                _, loss = self.sess.run((self.optimizer, self.loss), feed_dict=feed)
                # Report once per epoch, using the loss of the second batch.
                if step == 1:
                    print("Epoch: %d, Loss: %.3f" % (epoch + 1, loss))

    def evaluate(self):
        """Print RMSE (regression) or accuracy (classification) on the test
        set, then close the session even if evaluation fails."""
        try:
            if self.mode == 'regression':
                errors = []
                for batchX, batchY in batcher(self.X_test, self.y_test):
                    feed = {self.X: batchX.reshape(-1, self.p), self.y: batchY.reshape(-1, 1)}
                    errors.append(self.sess.run(self.error, feed_dict=feed))
                RMSE = np.sqrt(np.array(errors).mean())
                print("RMSE: ", RMSE)
            elif self.mode == 'classification':
                batch_logits = []
                for batchX, batchY in batcher(self.X_test, self.y_test):
                    feed = {self.X: batchX.reshape(-1, self.p), self.y: batchY.reshape(-1, 1)}
                    batch_logits.append(self.sess.run(self.y_hat, feed_dict=feed))
                # Stack per-batch outputs so predictions line up with the
                # rows of y_test regardless of how many batches were used.
                prob = sigmoid(np.vstack(batch_logits))
                pred = np.zeros((len(self.X_test), 1))
                pred[prob > 0.5] = 1
                pred[prob < 0.5] = -1
                print("Accuracy: ", np.mean(self.y_test == pred))
        finally:
            self.sess.close()



if __name__ == '__main__':
    # End-to-end run on the MovieLens 100k "ua" split: 20 latent factors,
    # classification targets.
    model = FM('./ml-100k/ua.base', './ml-100k/ua.test', k=20, mode='classification')
    model.load_data()
    model.build_model()
    model.train()
    model.evaluate()

 这里用来做分类,把评分为5的作为1类,小于5的为-1类,输出为:
FM-test
 可以看出准确率为77.1%,表现还过得去。

3. Reference

1. babakx/FM-github
2. MovieLens数据集
3. tffm-package
4. Factorization Machines with libFM
5. Factorization Machines with tensorflow