有没有办法提高模型性能?
# Import the required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the training data and show its dimensions.
df1 = pd.read_csv('/content/drive/MyDrive/Regression/train.csv')
df1.shape
# Names of the object-dtype ('O') columns -- treated as categorical.
a = [name for name, kind in df1.dtypes.items() if kind == 'O']
len(a)
# Names of every other column -- treated as numerical.
b = [name for name, kind in df1.dtypes.items() if kind != 'O']
len(b)
# Preview each group of columns.
df1[a]
df1[b]
# Filling the Categorical columns
def fill_in(dataset):
    """Fill missing values in the categorical (object-dtype) columns.

    Every object-dtype column that contains at least one missing value has
    those values replaced with the string 'missing'. The dataframe is
    modified in place.

    Args:
        dataset: pandas DataFrame to clean.

    Returns:
        The same ``dataset`` object, after filling.
    """
    for i in dataset.columns:
        # ``isna`` has to be called: the bare bound method is always truthy,
        # so it never actually tested for missing values.
        if dataset[i].dtype == 'O' and dataset[i].isna().any():
            # Assign the result back instead of ``fillna(..., inplace=True)``
            # on the selected column, which can operate on a temporary copy
            # and leave the dataframe unchanged.
            dataset[i] = dataset[i].fillna('missing')
    return dataset
# Apply the categorical fill to the training dataframe.
fill_in(df1)
# Filling the Numerical columns
def filling_integer(dataset):
    """Fill missing values in the non-object (numerical) columns.

    Every column whose dtype is not object and that contains at least one
    missing value has those values replaced with that column's median. The
    dataframe is modified in place.

    Args:
        dataset: pandas DataFrame to clean.

    Returns:
        The same ``dataset`` object, after filling.
    """
    for i in dataset.columns:
        # ``isna`` has to be called: the bare bound method is always truthy,
        # so it never actually tested for missing values.
        if dataset[i].dtype != 'O' and dataset[i].isna().any():
            # Assign the result back instead of ``fillna(..., inplace=True)``
            # on the selected column, which can operate on a temporary copy
            # and leave the dataframe unchanged.
            dataset[i] = dataset[i].fillna(dataset[i].median())
    return dataset
# Apply the numerical fill to the training dataframe.
filling_integer(df1)
# Visual check for any missing values still left in df1.
sns.heatmap(df1.isna())
"""Check for Outliers"""
# One boxplot per numerical column listed in b.
for i in b:
    plt.title(i)
    # seaborn's plotting function is lowercase ``boxplot``; ``sns.Boxplot``
    # does not exist and raises AttributeError.
    sns.boxplot(x=df1[i])
    plt.show()
"""Handling the outliers"""
!pip install feature-engine
from feature_engine.outliers import Winsorizer
# for Q-Q plots
import scipy.stats as stats
# Build the capper for every numerical column listed in b.
windsoriser = Winsorizer(
    capping_method='quantiles',  # alternatives: iqr, gaussian
    tail='both',  # cap the left tail, the right tail, or both
    fold=0.05,
    variables=list(b),
)
# Learn the capping limits from df1, then apply them to get the capped frame.
df1_t = windsoriser.fit(df1).transform(df1)
# function to create Boxplot.
def diagnostic_plots(df, variable):
    """Draw and show a boxplot of one column of a dataframe.

    Args:
        df: dataframe holding the data.
        variable: name of the column in ``df`` to plot.
    """
    # define figure size
    plt.figure(figsize=(16, 4))
    # The boxplot goes in the third cell of a 1x3 subplot grid.
    plt.subplot(1, 3, 3)
    # seaborn's plotting function is lowercase ``boxplot``; ``sns.Boxplot``
    # does not exist and raises AttributeError.
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')
    plt.show()
# Compare SalePrice in the original frame (df1) and the capped frame (df1_t).
diagnostic_plots(df1, 'SalePrice'), diagnostic_plots(df1_t, 'SalePrice')
# Same comparison for WoodDeckSF. The original line had an unbalanced
# parenthesis; the second call is restored by analogy with the line above.
diagnostic_plots(df1, 'WoodDeckSF'), diagnostic_plots(df1_t, 'WoodDeckSF')
df1.shape, df1_t.shape
df1_t.head().T
"""Converting Categorical into Numerical"""
for feature in a:
    # Mean SalePrice per category of this column, sorted ascending.
    mean_price = df1_t.groupby([feature])['SalePrice'].mean()
    ranked = mean_price.sort_values().index
    # Map each category to its rank, starting at 0 for the lowest mean.
    labels_ordered = dict(zip(ranked, range(len(ranked))))
    df1_t[feature] = df1_t[feature].map(labels_ordered)
df1_t
"""Scale the Features"""
from sklearn.preprocessing import MinMaxScaler

# Every column except the identifier and the target gets scaled.
scale = [col for col in df1_t.columns if col not in ('Id', 'SalePrice')]
scaler = MinMaxScaler()
scaler.fit(df1_t[scale])
scaler.transform(df1_t[scale])
# Reassemble: untouched Id/SalePrice columns next to the scaled features.
id_and_target = df1_t[['Id', 'SalePrice']].reset_index(drop=True)
scaled_features = pd.DataFrame(scaler.transform(df1_t[scale]), columns=scale)
data = pd.concat([id_and_target, scaled_features], axis=1)
data
# Feature matrix and single-column target frame.
X = data.drop(columns=['Id', 'SalePrice'])
y = data[['SalePrice']]
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
import tensorflow as tf
# Sequential network: two 128-unit ReLU hidden layers followed by a
# one-unit output layer with no activation.
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=128, activation='relu'))
ann.add(tf.keras.layers.Dense(units=128, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1))
ann.compile(optimizer='adam', loss='mean_squared_error')
# The targets must be passed to fit(); the original call supplied only
# X_train, so the network had no SalePrice values to learn from.
ann.fit(X_train, y_train, batch_size=32, epochs=100)
我正在使用 ANN 来解决房价回归问题,但该模型的表现太糟糕了。即使我训练了 100 个 epoch,并使用了 2 个隐藏层(每层 128 个节点),损失函数的值仍然相当高。最终得到的损失如下:
Epoch 100/100
35/35 [==============================] - 0s 2ms/step - loss: 633115520.0000
我哪里做错了?有人可以帮我理解吗?提前致谢 :)
解决方法
我认为您应该在此数据集的特定Kaggle Discussion Forum 或一般discussion forum 中发布此问题。无论如何,还有很大的改进空间。
分类列
例如,以 LotConfig 列为例,您已将所有类别标记为一些数字,于是模型会认为 Inside 和 Corner 之间存在大小(优劣)顺序,因为在数据集中,Inside 被赋予 0,Corner 被赋予 0.25。模型会更偏向于 Corner,因为它的数值更高,但实际上不应该如此。
数值列
对于数字列中的每个缺失值,您都用 median(中位数)填充它,这是错误的,这会扭曲该列的性质。假设某列有 60% 的缺失值,用中位数填充所有这些缺失值之后,该列的分布将与您原先观察到的完全不同。必须以不同方式评估和填充每一列的缺失值。
注意:由于它有很多分类列,而每列中的类并不多,因此基于树的算法可能会更好
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。