如何对 R 数据帧进行采样,使其在多个变量上都具有代表性?
更新 6/16:
添加了调用验证函数的 train-test split 函数。添加了一个参数,以便为您的目标变量单独设置 alpha,以防您需要更高的标准。添加了库调用。
这是一个关于它的 Github:https://github.com/KalebCoberly/train_test_split_R
结束更新
我想做一些类似于 this post about Pandas dataframes 的事情,但在 R 中,理想情况下不考虑数据类型(例如,具有因子和数字列的数据框)。
我想获得一个 R 数据框的随机样本,其中每个变量都相对代表总体。
我已经看到了基于单个变量创建分层样本的方法。但是,我想确保在多个列上的表示,而不仅仅是因子。
我编写了一个简单的算法来处理数值变量,对每个变量使用 Wilcoxon 检验。因此,如果样本(测试集)中的所有数字列似乎与剩余集(训练集)中的数字列来自同一集,那么您就有了一个相当有代表性的样本。您随机抽取一个样本并使用以下函数对其进行验证,然后重新采样和验证,直到您得到一个样本,该样本满足所有变量的最低代表性(以 alpha 衡量)。
在这种情况下,alpha 代表错误拒绝原假设的风险(H0 = 样本并非来自显著不同的总体,即它们代表同一总体)。由于我们希望的是无法拒绝原假设,所以我们要求 p 值大于 alpha 而不是小于 alpha,并且希望 alpha 尽可能高。
library(tidyverse)
train_test_split <- function(df, y_cols, id_cols, feats_lst, test_size = .3,
                             alpha = .5, target_alpha = .9, validate = TRUE) {
  # Splits df into train/test sets and input/target (X/y) sets, optionally
  # resampling until validate_split() accepts the split.
  # (Must have id_cols, but they can be "dummy" columns; they are copied into
  # the y frames so X and y rows can be matched up.)
  #
  # Parameters:
  #   df: (data.frame) Full data set, including target variable(s).
  #   y_cols: (c(character)) Target column(s).
  #   id_cols: (c(character)) Id column(s); kept in X and copied into y.
  #   feats_lst: (c(character)) Features to test when validating. Only
  #     evaluated when validate is TRUE.
  #   test_size: (numeric) Proportion of rows to use for the test set. The row
  #     count is truncated with as.integer(); the value itself is not checked.
  #   alpha: (numeric) Probability of incorrectly rejecting the null
  #     hypothesis.
  #       H0 = feature n of train and of test do not represent different sets
  #         (i.e. representative split).
  #       H1 = feature n of train and of test represent different supersets.
  #   target_alpha: (numeric) Alpha to use if the feature is a target feature
  #     (i.e. if the feature is in y_cols).
  #   validate: (bool) Should the split be validated? If TRUE, the function
  #     keeps resampling until a split passes, which may take a long time (or
  #     never finish) for strict alphas.
  #
  # Return:
  #   split_lst: (list(data.frame)) (train_X, train_y, test_X, test_y)
  #     train_X (data.frame) Input features in training subset.
  #     train_y (data.frame) Target variable(s) + id_cols in training subset.
  #     test_X (data.frame) Input features in testing subset.
  #     test_y (data.frame) Target variable(s) + id_cols in testing subset.
  #
  # Side effect: when validate is TRUE, prints the average p-value observed
  # for each feature across all attempted splits.

  # Fail early with a clear message instead of a subsetting error mid-loop.
  missing_cols <- setdiff(c(y_cols, id_cols), colnames(df))
  if (length(missing_cols) > 0) {
    stop(
      "Columns not found in df: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  split_lst <- list(
    "train_X" = data.frame(),
    "train_y" = data.frame(),
    "test_X" = data.frame(),
    "test_y" = data.frame()
  )

  full_set_len <- nrow(df)
  test_set_len <- as.integer(test_size * full_set_len)
  all_idx <- seq_len(full_set_len)
  is_target <- colnames(df) %in% y_cols

  ###
  ### TO DO: Add a parameter and logic to choose whether to track this. ###
  ###
  # To track the p-values of features across attempts (averaged at the end).
  feats_p_av_lst <- list()
  if (validate) {
    feats_p_av_lst <- vector(mode = "list", length = length(feats_lst))
    names(feats_p_av_lst) <- feats_lst
  }

  # Split and validate until valid.
  valid_split <- FALSE
  while (!valid_split) {
    # Split randomly.
    test_idx <- sample(x = full_set_len, size = test_set_len)
    # setdiff() rather than df[-test_idx, ]: a negative *empty* index selects
    # zero rows, which would silently empty the training set whenever the
    # test set has no rows.
    train_idx <- setdiff(all_idx, test_idx)

    split_lst$train_X <- df[train_idx, !is_target, drop = FALSE]
    split_lst$train_y <- df[train_idx, is_target, drop = FALSE]
    split_lst$train_y[id_cols] <- split_lst$train_X[id_cols]

    split_lst$test_X <- df[test_idx, !is_target, drop = FALSE]
    split_lst$test_y <- df[test_idx, is_target, drop = FALSE]
    split_lst$test_y[id_cols] <- split_lst$test_X[id_cols]

    if (!validate) {
      valid_split <- TRUE
      next
    }

    # Randomize test order to "cost-average" compute.
    feats_lst <- sample(feats_lst)

    # Test X and y separately to avoid the join compute and data copies.
    X_validation_results <- validate_split(
      train = split_lst$train_X,
      test = split_lst$test_X,
      feats_lst = feats_lst,
      y_cols = y_cols,
      feats_p_val_lst = feats_p_av_lst,
      alpha = alpha,
      target_alpha = target_alpha
    )
    feats_p_av_lst <- X_validation_results$p_vals

    if (X_validation_results$valid) {
      # Pass the running tracker through so the X p-values are kept and the
      # target features are tested against target_alpha.
      y_validation_results <- validate_split(
        train = split_lst$train_y,
        test = split_lst$test_y,
        feats_lst = feats_lst,
        y_cols = y_cols,
        feats_p_val_lst = feats_p_av_lst,
        alpha = alpha,
        target_alpha = target_alpha
      )
      feats_p_av_lst <- y_validation_results$p_vals
      valid_split <- y_validation_results$valid
    }
  }

  if (validate) {
    # Features that were never tested get NA rather than a mean(NULL) warning.
    feats_p_av_lst <- lapply(feats_p_av_lst, function(p_vals) {
      if (length(p_vals) == 0) NA_real_ else mean(p_vals)
    })
    print('Average p-values:')
    print(feats_p_av_lst)
  }

  return(split_lst)
}
validate_split <- function(train, test, feats_p_val_lst = NULL,
                           target_alpha = .9, feats_lst = NULL,
                           y_cols = character(0), alpha = .5) {
  # Conducts a Wilcoxon rank sum test column by column to test if train and
  # test represent a similar superset (i.e., is the split stratified on every
  # tested feature?). Both train and test should have the same features.
  # Columns are coerced with as.double() before testing, so the check is only
  # meaningful for numeric (ideally continuous) features.
  #
  # Testing stops at the first feature that fails, so later features in
  # feats_lst get no p-value recorded for that call.
  #
  # Parameters:
  #   train: (data.frame) A subset of original set to compare to the other
  #     subset, test.
  #   test: (data.frame) A subset of original set to compare to the other
  #     subset, train.
  #   feats_p_val_lst: (list(character:c(double))) Dictionary of p-values
  #     seen so far, used to track which features are hardest to stratify.
  #     NULL starts an empty tracker.
  #   target_alpha: (numeric) Alpha to use if the feature is a target feature
  #     (i.e. if the feature is in y_cols).
  #   feats_lst: (c(character)) Features to test. Features absent from either
  #     frame are skipped. NULL tests every column that is double in both.
  #   y_cols: (c(character)) Vector of target features.
  #   alpha: (numeric) Probability of incorrectly rejecting the null
  #     hypothesis.
  #       H0 = feature n of train and test does not represent different sets
  #         (i.e. representative split).
  #       H1 = feature n of train and test represents a different superset.
  #
  # Return:
  #   list(valid: (bool), p_vals: (list(character:c(double))))
  #     valid: (bool) Did every tested feature have p-value > its alpha?
  #     p_vals: feats_p_val_lst with this call's p-values appended.
  if (is.null(feats_p_val_lst)) {
    feats_p_val_lst <- list()
  }

  shared_cols <- intersect(colnames(train), colnames(test))
  if (is.null(feats_lst)) {
    is_dbl <- vapply(
      shared_cols,
      function(col) is.double(train[[col]]) && is.double(test[[col]]),
      logical(1)
    )
    feats_lst <- shared_cols[is_dbl]
  }

  valid <- TRUE
  for (feat in feats_lst) {
    if (!(feat %in% shared_cols)) {
      next
    }

    this_alpha <- if (feat %in% y_cols) target_alpha else alpha

    p_value <- wilcox.test(
      x = as.double(train[[feat]]),
      y = as.double(test[[feat]])
    )$p.value

    # wilcox.test() yields NaN when the statistic has zero variance (e.g. the
    # column is constant in both sets). Such a feature cannot be tested, so it
    # is skipped instead of crashing the comparison below.
    if (is.na(p_value)) {
      warning(
        "Could not compute a p-value for feature '", feat, "'; skipping it.",
        call. = FALSE
      )
      next
    }

    feats_p_val_lst[[feat]] <- c(feats_p_val_lst[[feat]], p_value)

    if (p_value <= this_alpha) {
      # Reject the null hypothesis that the split is not unrepresentative.
      valid <- FALSE
      break
    }
  }

  return(list("valid" = valid, "p_vals" = feats_p_val_lst))
}
在虚拟数据上测试:
# Dummy data: an integer Id column plus three double columns, each a random
# draw without replacement from a different range.
sample_df <- data.frame(
  list(
    'Id' = c(1:1000),
    'y' = as.double(sample(1:1000, size = 1000)),
    'a' = as.double(sample(1:2000, size = 1000)),
    'b' = as.double(sample(1:3000, size = 1000))
  )
)

y_cols <- c('y')
id_cols <- c('Id')
# Only double columns are tested; the integer Id column is excluded.
feats_lst <- colnames(select(sample_df, where(is.double)))

split_lst <- train_test_split(
  df = sample_df,
  y_cols = y_cols,
  id_cols = id_cols,
  feats_lst = feats_lst
)

# > names(split_lst)
# [1] "train_X" "train_y" "test_X" "test_y"

# You can call validate_split again on your found split to
# get your final p-values for each feature.
feats_p_val_lst <- vector(mode = 'list', length = length(feats_lst))
names(feats_p_val_lst) <- feats_lst

# First the input features ...
validate_split_lst <- validate_split(
  train = split_lst$train_X,
  test = split_lst$test_X,
  feats_lst = feats_lst,
  y_cols = y_cols,
  feats_p_val_lst = feats_p_val_lst,
  alpha = .5,
  target_alpha = .9
)

# ... then the target(s), appending to the p-values collected above.
validate_split_lst <- validate_split(
  train = split_lst$train_y,
  test = split_lst$test_y,
  feats_lst = feats_lst,
  y_cols = y_cols,
  feats_p_val_lst = validate_split_lst$p_vals,
  alpha = .5,
  target_alpha = .9
)

validate_split_lst$p_vals
# A list of all your feature names with their p-values.

validate_split_lst$valid
# TRUE
同样,这完全忽略了因子和整数列,除非您将它们转换为双精度数,但这会违反 Wilcoxon 检验关于数据连续的假设。
鉴于我当前的数据集包含大约 80 个变量,其中几乎一半是双精度变量,这或许就足够了:如果所有双精度变量都具有代表性,其余变量很可能也具有代表性。
但是,要得到 p > .5 的拆分(即无法拒绝“这两个数据集并非来自不同总体”这一原假设,也就是拆分并非没有代表性),它需要运行非常久。而且,如果数据集的所有或大部分变量都是因子或整数,那又该怎么办?
从数学/统计角度和/或 R/编程角度来看,有没有更好的方法?另外,这对机器学习有什么问题吗?我想认为它会提高训练/调整模型的泛化能力,减少过拟合的机会,但它是否会以某种方式造成泄漏或其他问题?
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。