我在基于内容的推荐系统预测不是 TDIDF方面遇到问题

如何解决我在基于内容的推荐系统预测不是 TDIDF方面遇到问题

我不断收到以下错误 --> 异常：昏暗。不匹配：测试数据包含 3 个项目，而内容包含 1526 个项目。请确保测试和内容的列匹配。

有人可以帮我吗？我已经研究这段代码好几天了。我的整个代码体如下。

import numpy as np # linear algebra
import pandas as pd # data processing,CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset,Reader

user_artists = pd.read_table("user_artists.dat")


user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1),'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8),'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6),'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4),'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2),'ratings'] = 1

data = user_artists[['userID','artistID','ratings']]

data.head()
data.shape

# train-test split
train,test = train_test_split(data,test_size=0.3,random_state=42)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(data.shape)
print(train.shape)
print(test.shape)

tags = pd.read_table("tags.dat",encoding = 'unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")

user_tag_merged = pd.merge(user_taggedartists,tags,on="tagID",how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged,data,on=(["userID","artistID"]),how="inner")

movie=user_tag_merged_updated
movie

data2 = data[['userID','ratings']]

# train-test split
train,test2 = train_test_split(data2,random_state=42)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(data2.shape)
print(train.shape)
print(test.shape)

data_pivot2 = data2.pivot_table(index='artistID',values='ratings',columns='userID').fillna(0)
data_pivot2.head()

movie2 = [['tagID','year']]

movie2 = user_tag_merged_updated.pivot_table(index='tagID',values='year',columns='userID').fillna(0)

movie2.head()

# Content based as a function

from numpy.linalg import norm

def simil_cosine(a,b):
    return np.dot(a,b)/(norm(a)*norm(b))


def ContentBased(content_data,test_data,NN):

    cdata = content_data.reset_index(drop=True).copy()

    # store user and item dimensions
    dim = cdata.shape[0]
    nr_user = cdata.shape[0]

    if test_data.shape[1] != dim:
        raise Exception('Dim. mismatch: Test data contains {} items,while Content contains {} items. Please make sure the columns of test and content match.'\
            .format(test_data.shape[1],dim))

    # similarity matrices
    matrix = np.zeros(shape=(dim,dim),dtype=np.float)
    matrixNN = np.zeros(shape=(dim,dtype=np.float)

    # compute similarity
    for i,row in cdata.iterrows():
        for j,col in cdata.iterrows():
            if i <= j: continue
            else: matrix[i][j] = simil_cosine(np.array(row),np.array(col))

    # copy values to other diagonal
    matrix = matrix + matrix.T - np.diag(np.diag(matrix))

    print('Similarity calculation done...')

    # mask all values that are not nearest neighbors
    cutoff = lambda x,cv: x if x >= cv else 0.0
    v_cutoff = np.vectorize(cutoff)

    for i in range(dim):
        crit_val = -np.sort(-matrix[i])[NN-1]
        matrixNN[i] = v_cutoff(matrix[i],crit_val)

    print('Nearest neighbor selection done...')

    # predict user-item ratings in test_data
    prediction = np.zeros(shape=(nr_user,dtype=np.float)

    for i in range(nr_user):
        num = np.matmul(np.array(test_data.iloc[i,:]),matrixNN)
        denom = matrixNN.sum(axis=0) # column sums
        prediction[i] = num/denom

    print('Prediction done...')

    # return DataFrame
    return pd.DataFrame(prediction,index=test_data.index,columns=test_data.columns)

cb_pred = ContentBased(movie2,data_pivot2,10)

# Content Based as a Class

from numpy.linalg import norm

class ContentBased:

    def simil_cosine(self,a,b):
        return np.dot(a,b)/(norm(a)*norm(b))

    def __init__(self,NN):
        self.NN = NN
        
    
    def fit(self,content_data):
        cdata = content_data.reset_index(drop=True).copy()
        self.item_dim = cdata.shape[0]
        self.matrix   = np.zeros(shape=(self.item_dim,self.item_dim),dtype=np.float)
        self.matrixNN = np.zeros(shape=(self.item_dim,dtype=np.float)

        # compute similarity
        for i,row in cdata.iterrows():
            for j,col in cdata.iterrows():
                if i <= j: continue
                else: self.matrix[i][j] = self.simil_cosine(np.array(row),np.array(col))

        # copy values to other diagonal
        self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))

        cutoff = lambda x,cv: x if x >= cv else 0.0
        v_cutoff = np.vectorize(cutoff)

        for i in range(self.item_dim):
            crit_val = -np.sort(-self.matrix[i])[self.NN-1]
            self.matrixNN[i] = v_cutoff(self.matrix[i],crit_val)

    
    def predict(self,test_data):

        if test_data.shape[1] != self.item_dim:
            raise Exception('Dim. mismatch: Test data contains {} items,while Content contains {} items. Please make sure the columns of test and content match.'\
                .format(test_data.shape[1],self.item_dim))

我不断收到以下错误 --> 异常：昏暗。不匹配：测试数据包含 3 个项目，而内容包含 1526 个项目。请确保测试和内容的列匹配。

我在基于内容的推荐系统预测不是 TDIDF方面遇到问题

如何解决我在基于内容的推荐系统预测不是 TDIDF方面遇到问题

相关推荐