我在基于内容的推荐系统预测不是 TDIDF方面遇到问题

如何解决我在基于内容的推荐系统预测不是 TDIDF方面遇到问题

我不断收到以下错误 --> 异常:昏暗。不匹配:测试数据包含 3 个项目,而内容包含 1526 个项目。请确保测试和内容的列匹配。

有人可以帮我吗?我已经研究这段代码好几天了。我的整个代码体如下。

import numpy as np # linear algebra
import pandas as pd # data processing,CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset,Reader

user_artists = pd.read_table("user_artists.dat")


user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1),'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8),'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6),'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4),'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2),'ratings'] = 1

data = user_artists[['userID','artistID','ratings']]

data.head()
data.shape

# train-test split
train,test = train_test_split(data,test_size=0.3,random_state=42)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(data.shape)
print(train.shape)
print(test.shape)

tags = pd.read_table("tags.dat",encoding = 'unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")

user_tag_merged = pd.merge(user_taggedartists,tags,on="tagID",how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged,data,on=(["userID","artistID"]),how="inner")

movie=user_tag_merged_updated
movie

data2 = data[['userID','ratings']]

# train-test split
train,test2 = train_test_split(data2,random_state=42)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(data2.shape)
print(train.shape)
print(test.shape)

data_pivot2 = data2.pivot_table(index='artistID',values='ratings',columns='userID').fillna(0)
data_pivot2.head()

movie2 = [['tagID','year']]

movie2 = user_tag_merged_updated.pivot_table(index='tagID',values='year',columns='userID').fillna(0)

movie2.head()

# Content based as a function

from numpy.linalg import norm

def simil_cosine(a,b):
    return np.dot(a,b)/(norm(a)*norm(b))


def ContentBased(content_data,test_data,NN):

    cdata = content_data.reset_index(drop=True).copy()

    # store user and item dimensions
    dim = cdata.shape[0]
    nr_user = cdata.shape[0]

    if test_data.shape[1] != dim:
        raise Exception('Dim. mismatch: Test data contains {} items,while Content contains {} items. Please make sure the columns of test and content match.'\
            .format(test_data.shape[1],dim))

    # similarity matrices
    matrix = np.zeros(shape=(dim,dim),dtype=np.float)
    matrixNN = np.zeros(shape=(dim,dtype=np.float)

    # compute similarity
    for i,row in cdata.iterrows():
        for j,col in cdata.iterrows():
            if i <= j: continue
            else: matrix[i][j] = simil_cosine(np.array(row),np.array(col))

    # copy values to other diagonal
    matrix = matrix + matrix.T - np.diag(np.diag(matrix))

    print('Similarity calculation done...')

    # mask all values that are not nearest neighbors
    cutoff = lambda x,cv: x if x >= cv else 0.0
    v_cutoff = np.vectorize(cutoff)

    for i in range(dim):
        crit_val = -np.sort(-matrix[i])[NN-1]
        matrixNN[i] = v_cutoff(matrix[i],crit_val)

    print('Nearest neighbor selection done...')

    # predict user-item ratings in test_data
    prediction = np.zeros(shape=(nr_user,dtype=np.float)

    for i in range(nr_user):
        num = np.matmul(np.array(test_data.iloc[i,:]),matrixNN)
        denom = matrixNN.sum(axis=0) # column sums
        prediction[i] = num/denom

    print('Prediction done...')

    # return DataFrame
    return pd.DataFrame(prediction,index=test_data.index,columns=test_data.columns)

cb_pred = ContentBased(movie2,data_pivot2,10)

# Content Based as a Class

from numpy.linalg import norm

class ContentBased:

    def simil_cosine(self,a,b):
        return np.dot(a,b)/(norm(a)*norm(b))

    def __init__(self,NN):
        self.NN = NN
        
    
    def fit(self,content_data):
        cdata = content_data.reset_index(drop=True).copy()
        self.item_dim = cdata.shape[0]
        self.matrix   = np.zeros(shape=(self.item_dim,self.item_dim),dtype=np.float)
        self.matrixNN = np.zeros(shape=(self.item_dim,dtype=np.float)

        # compute similarity
        for i,row in cdata.iterrows():
            for j,col in cdata.iterrows():
                if i <= j: continue
                else: self.matrix[i][j] = self.simil_cosine(np.array(row),np.array(col))

        # copy values to other diagonal
        self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))

        cutoff = lambda x,cv: x if x >= cv else 0.0
        v_cutoff = np.vectorize(cutoff)

        for i in range(self.item_dim):
            crit_val = -np.sort(-self.matrix[i])[self.NN-1]
            self.matrixNN[i] = v_cutoff(self.matrix[i],crit_val)

    
    def predict(self,test_data):

        if test_data.shape[1] != self.item_dim:
            raise Exception('Dim. mismatch: Test data contains {} items,while Content contains {} items. Please make sure the columns of test and content match.'\
                .format(test_data.shape[1],self.item_dim))

我不断收到以下错误 --> 异常:昏暗。不匹配:测试数据包含 3 个项目,而内容包含 1526 个项目。请确保测试和内容的列匹配。

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐


Selenium Web驱动程序和Java。元素在(x,y)点处不可单击。其他元素将获得点击?
Python-如何使用点“。” 访问字典成员?
Java 字符串是不可变的。到底是什么意思?
Java中的“ final”关键字如何工作?(我仍然可以修改对象。)
“loop:”在Java代码中。这是什么,为什么要编译?
java.lang.ClassNotFoundException:sun.jdbc.odbc.JdbcOdbcDriver发生异常。为什么?
这是用Java进行XML解析的最佳库。
Java的PriorityQueue的内置迭代器不会以任何特定顺序遍历数据结构。为什么?
如何在Java中聆听按键时移动图像。
Java“Program to an interface”。这是什么意思?
Java在半透明框架/面板/组件上重新绘画。
Java“ Class.forName()”和“ Class.forName()。newInstance()”之间有什么区别?
在此环境中不提供编译器。也许是在JRE而不是JDK上运行?
Java用相同的方法在一个类中实现两个接口。哪种接口方法被覆盖?
Java 什么是Runtime.getRuntime()。totalMemory()和freeMemory()?
java.library.path中的java.lang.UnsatisfiedLinkError否*****。dll
JavaFX“位置是必需的。” 即使在同一包装中
Java 导入两个具有相同名称的类。怎么处理?
Java 是否应该在HttpServletResponse.getOutputStream()/。getWriter()上调用.close()?
Java RegEx元字符(。)和普通点?