如何对 R 数据帧进行采样以使其在多个变量中具有代表性？

问题：如何对 R 数据框进行采样，使其在多个变量上都具有代表性？

更新 6/16：

添加了调用验证函数的 train-test split 函数。添加了一个参数，以便为您的目标变量单独设置 alpha，以防您需要更高的标准。添加了库调用。

这是一个关于它的 Github：https://github.com/KalebCoberly/train_test_split_R

结束更新

我想做一些类似于这篇关于 Pandas 数据框的帖子（this post about Pandas dataframes）中所描述的事情，但要在 R 中实现，并且最好不受数据类型的限制（例如，同时包含因子列和数值列的数据框）。

我想获得一个 R 数据框的随机样本，其中每个变量都相对代表总体。

我已经看到了基于单个变量创建分层样本的方法。但是，我想确保在多个列上的表示，而不仅仅是因子。

我编写了一个简单的算法来处理数值变量，对每个变量使用 Wilcoxon 检验。因此，如果样本（测试集）中的所有数值列看起来都与剩余集（训练集）中对应的数值列来自同一总体，那么您就得到了一个相当有代表性的样本。您随机抽取一个样本并使用以下函数对其进行验证，然后重复采样和验证，直到得到一个在所有变量上都满足最低代表性要求（以 alpha 衡量）的样本。

在这种情况下，alpha 表示错误地拒绝原假设的风险（H0 = 样本并非来自显著不同的总体，即它们代表同一总体）。因为我们希望“无法拒绝”原假设，所以我们希望 p 值大于 alpha 而不是小于 alpha，并且希望 alpha 尽可能高。

library(tidyverse)

train_test_split = function(df, y_cols, id_cols, feats_lst, test_size = .3,
                            alpha = .5, target_alpha = .9, validate = TRUE) {
  # Splits df into train/test sets and input/target (X/y) sets. When
  # validate is TRUE, the random split is redrawn until validate_split()
  # accepts both the X sets and the y sets.
  # Parameters:
  #   df: (data.frame) Full data set, including target variable(s).
  #   y_cols: (c(character)) Target column(s).
  #   id_cols: (c(character)) Id column(s). They stay in the X sets and are
  #     copied onto the y sets so X and y rows can be matched up.
  #   feats_lst: (c(character)) Features handed to validate_split().
  #   test_size: (numeric) Proportion of rows to use for the test set; the
  #     row count is truncated toward zero.
  #   alpha: (numeric) Probability of incorrectly rejecting the null
  #     hypothesis.
  #       H0 = feature n of train and of test do not represent different
  #         sets (i.e. representative split).
  #       H1 = feature n of train and of test represent different supersets.
  #   target_alpha: (numeric) Alpha to use if feature is a target feature
  #     (i.e. if feature is in y_cols).
  #   validate: (bool) Should the split be validated?
  # Return:
  #   split_lst: (list(data.frame)) (train_X, train_y, test_X, test_y)
  #     train_X (data.frame) Input features in training subset.
  #     train_y (data.frame) Target variable(s) (+ ids) in training subset.
  #     test_X (data.frame) Input features in testing subset.
  #     test_y (data.frame) Target variable(s) (+ ids) in testing subset.
  # Side effect: when validate is TRUE, prints the average p-value recorded
  #   for each feature over all attempted splits.

  stopifnot(
    is.data.frame(df),
    is.numeric(test_size), length(test_size) == 1,
    test_size >= 0, test_size <= 1
  )
  missing_cols = setdiff(c(y_cols, id_cols), colnames(df))
  if (length(missing_cols) > 0) {
    stop(
      "Columns not found in df: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  full_set_len = nrow(df)
  test_set_len = as.integer(test_size * full_set_len)
  x_cols = setdiff(colnames(df), y_cols)

  ###
  ### TO DO: Add a parameter and logic to choose whether to track this. ###
  ###
  # To track the p-values of features across attempts:
  feats_p_av_lst = vector(mode = 'list', length = length(feats_lst))
  names(feats_p_av_lst) = feats_lst

  # Split and validate until valid.
  split_lst = NULL
  repeat {
    # Split randomly. The train rows are selected positively: negative
    # indexing with an empty test_idx would select zero rows, not all rows.
    test_idx = sample.int(full_set_len, size = test_set_len)
    train_idx = setdiff(seq_len(full_set_len), test_idx)

    split_lst = list(
      'train_X' = df[train_idx, x_cols, drop = FALSE],
      'train_y' = df[train_idx, y_cols, drop = FALSE],
      'test_X' = df[test_idx, x_cols, drop = FALSE],
      'test_y' = df[test_idx, y_cols, drop = FALSE]
    )
    split_lst$train_y[id_cols] = split_lst$train_X[id_cols]
    split_lst$test_y[id_cols] = split_lst$test_X[id_cols]

    if (!validate) {
      break
    }

    # Randomize test order to "cost-average" compute.
    feats_lst = sample(feats_lst)

    # Test X and y separately to avoid the join compute and data copies.
    X_validation_results = validate_split(
      train = split_lst$train_X,
      test = split_lst$test_X,
      feats_lst = feats_lst,
      y_cols = y_cols,
      feats_p_val_lst = feats_p_av_lst,
      alpha = alpha,
      target_alpha = target_alpha
    )
    feats_p_av_lst = X_validation_results$p_vals
    if (!X_validation_results$valid) {
      next # Invalid X split: resample.
    }

    y_validation_results = validate_split(
      train = split_lst$train_y,
      test = split_lst$test_y,
      feats_lst = feats_lst,
      y_cols = y_cols,
      feats_p_val_lst = feats_p_av_lst,
      alpha = alpha,
      target_alpha = target_alpha
    )
    feats_p_av_lst = y_validation_results$p_vals
    if (y_validation_results$valid) {
      break
    } # else: invalid y split, resample.
  }

  if (validate) {
    # Features that were never tested have no p-values; report NA for them.
    feats_p_av_lst = lapply(
      feats_p_av_lst,
      function(p_vals) if (is.null(p_vals)) NA_real_ else mean(p_vals)
    )
    print('Average p-values:')
    print(feats_p_av_lst)
  }

  split_lst
}


validate_split = function(train, test, feats_p_val_lst, target_alpha = .9,
                          feats_lst = names(feats_p_val_lst),
                          y_cols = character(0), alpha = .5) {
  # Conducts a Wilcoxon rank sum test column by column to test if train and
  # test represent a similar superset (i.e., is the split stratified on
  # every feature?). Only features present in both train and test are
  # tested; each tested column is coerced with as.double(). Testing stops at
  # the first feature that fails, so later features get no p-value recorded
  # for that call.
  # Parameters:
  #   train: (data.frame) A subset of original set to compare to the other
  #     subset, test.
  #   test: (data.frame) A subset of original set to compare to the other
  #     subset, train.
  #   feats_p_val_lst: (list(character:c(double))) Named list of p-values
  #     recorded so far, to track which features are hardest to stratify.
  #   target_alpha: (numeric) Alpha to use if feature is a target feature
  #     (i.e. if feature is in y_cols).
  #   feats_lst: (c(character)) Features to test. Defaults to the names of
  #     feats_p_val_lst.
  #   y_cols: (c(character)) Vector of target features. Defaults to none.
  #   alpha: (numeric) Probability of incorrectly rejecting the null
  #     hypothesis.
  #       H0 = feature n of train and test does not represent different
  #         sets (i.e. representative split).
  #       H1 = feature n of train and test represents a different superset.
  # Return:
  #   list(valid: (bool), p_vals: (list(character:c(double))))
  #     valid: (bool) Did every tested feature have p-value > its alpha?
  #     p_vals: feats_p_val_lst with the new p-values appended.

  valid = TRUE

  for (feat in feats_lst) {
    if (!valid) {
      break # One failed feature already invalidates the split.
    }
    if (!(feat %in% colnames(train) && feat %in% colnames(test))) {
      next # Feature lives in the other (X or y) pair of sets.
    }

    this_alpha = if (feat %in% y_cols) target_alpha else alpha

    results = wilcox.test(
      x = as.double(train[[feat]]),
      y = as.double(test[[feat]])
    )

    # We want to FAIL to reject H0, so the p-value must exceed alpha.
    if (!(results$p.value > this_alpha)) {
      valid = FALSE
    }
    feats_p_val_lst[[feat]] = c(feats_p_val_lst[[feat]], results$p.value)
  }

  list('valid' = valid, 'p_vals' = feats_p_val_lst)
}

在虚拟数据上测试：

# Dummy data: an integer id plus three double columns (target y, inputs a, b).
sample_df = data.frame(
  list(
    'Id' = c(1:1000),
    'y' = as.double(sample(1:1000, size = 1000)),
    'a' = as.double(sample(1:2000, size = 1000)),
    'b' = as.double(sample(1:3000, size = 1000))
  )
)

y_cols = c('y')
id_cols = c('Id')
# Only the double columns are tested ('Id' is integer, so it is excluded).
feats_lst = colnames(select(sample_df, where(is.double)))

split_lst = train_test_split(
  df = sample_df,
  y_cols = y_cols,
  id_cols = id_cols,
  feats_lst = feats_lst
)

# > names(split_lst)
# [1] "train_X" "train_y" "test_X"  "test_y"

# You can call validate_split again on your found split to
# get your final p-values for each feature.
feats_p_val_lst = vector(mode = 'list', length = length(feats_lst))
names(feats_p_val_lst) = feats_lst

# Input features first ...
validate_split_lst = validate_split(
  train = split_lst$train_X,
  test = split_lst$test_X,
  feats_lst = feats_lst,
  y_cols = y_cols,
  feats_p_val_lst = feats_p_val_lst,
  alpha = .5,
  target_alpha = .9
)
# ... then the target(s), appending to the p-values collected above.
validate_split_lst = validate_split(
  train = split_lst$train_y,
  test = split_lst$test_y,
  feats_lst = feats_lst,
  y_cols = y_cols,
  feats_p_val_lst = validate_split_lst$p_vals,
  alpha = .5,
  target_alpha = .9
)

# > validate_split_lst$p_vals
# A list of all your feature names with their p-values.
# > validate_split_lst$valid
# [1] TRUE

同样，这完全忽略了因子（factor）和整数变量，除非您将它们转换为双精度数；但这样做会违反 Wilcoxon 检验关于数据是连续的这一假设。

鉴于我当前的数据集包含大约 80 个变量，其中几乎一半是双精度变量，这种方法已经足够了：因为如果所有双精度变量都具有代表性，那么其余变量很可能也具有代表性。

但是，要得到 p > .5 的划分（即无法拒绝“这些数据集并非来自不同总体”的零假设，也就是说划分并非没有代表性），运行时间长得像是永远跑不完。而且，如果数据集的所有或大部分变量都是因子或整数，那又该怎么办？

从数学/统计角度和/或 R/编程角度来看，有没有更好的方法？另外，这对机器学习有什么问题吗？我想认为它会提高训练/调整模型的泛化能力，减少过拟合的机会，但它是否会以某种方式造成泄漏或其他问题？