微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

RPART模型在拟合模型时忽略变量

如何解决RPART模型在拟合模型时忽略变量

当我尝试使用 Survived ~ Sex + Pclass 拟合分类树模型时,无论如何指定控制参数,模型都不考虑 Pclass,而只使用 Sex 进行分裂(Survived、Sex 和 Pclass 均已如下面代码所示转换为因子)。

代码

library(titanic)
library(rpart)
library(rpart.plot)

train = titanic_train
titanic_train$Survived = factor(titanic_train$Survived)
titanic_train$Sex = factor(titanic_train$Sex)
titanic_train$Pclass = factor(titanic_train$Pclass)
ctrl=rpart.control(minsplit = 6,cp=0.001)
fit = rpart(Survived ~  Pclass + Sex,data = titanic_train,control=ctrl)
rpart.plot(fit)

https://i.stack.imgur.com/V50YE.png

解决方法

rpart 确实不愿意再继续分裂了。即使设置 cp = 0(并使用 minsplit = 1)也无济于事。但设置 cp = -1 则可以,此时树会一直分支,直到每个类别都有对应的叶子节点。(这样做是否可取则是另一回事……)

enter image description here

,

这确实是一个有趣的观察,因为

  • 我们知道 Pclass 是一个信息量很大的变量,
  • 大多数其他分类树软件都会在 Pclass 上进一步拆分(例如 tree::tree、partykit::ctree、sklearn.tree.DecisionTreeClassifier、...),
  • 完全相同代码的回归树版本(即不把 Survived 转换为因子,而是保留为数值型)会得到 4 个叶子,尽管对于 0/1 数据,Gini 不纯度与方差损失函数是等价的。

同样难以解释的是:为什么在 cp = 0 时,生成的树不是最深的。

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐


Selenium Web驱动程序和Java。元素在(x,y)点处不可单击。其他元素将获得点击?
Python-如何使用点“。” 访问字典成员?
Java 字符串是不可变的。到底是什么意思?
Java中的“ final”关键字如何工作?(我仍然可以修改对象。)
“loop:”在Java代码中。这是什么,为什么要编译?
java.lang.ClassNotFoundException:sun.jdbc.odbc.JdbcOdbcDriver发生异常。为什么?
这是用Java进行XML解析的最佳库。
Java的PriorityQueue的内置迭代器不会以任何特定顺序遍历数据结构。为什么?
如何在Java中聆听按键时移动图像。
Java“Program to an interface”。这是什么意思?