Python sklearn.tree 模块,DecisionTreeRegressor() 实例源码
我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.tree.DecisionTreeRegressor()。
def cross_validate_best_kNown():
    """Cross-validate the three tuned regressors on the cleaned tractor data.

    Reads ``data/train.csv``, runs it through ``cln.clean_all``, splits off
    the ``SalePrice`` column as the target and passes a random forest, a
    gradient-boosting model and an AdaBoost model backed by a decision tree
    to ``validate.cross_v_scores`` (which is expected to report the scores).

    The hyper-parameters are the "best" values found so far by grid search.
    """
    frame = cln.clean_all(pd.read_csv('data/train.csv'))
    # pop() removes the target column in place, so the remaining frame is
    # the feature matrix.
    y = frame.pop('SalePrice')
    X = frame

    random_forest = RandomForestRegressor(
        max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
    gradient_boost = GradientBoostingRegressor(
        loss='quantile', learning_rate=0.0001, max_features='log2',
        min_samples_split=2, max_depth=1)
    tree_backing = DecisionTreeRegressor(
        max_features='sqrt', splitter='random', max_depth=3)
    ada_boost = AdaBoostRegressor(
        tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)

    validate.cross_v_scores([random_forest, gradient_boost, ada_boost], X, y)
    # Scores recorded from an earlier run:
    # RandomForestRegressor -- rmlSE: -0.596797712098,R2: 0.0272065373946
    # GradientBoostingRegressor -- rmlSE: -0.996134592541,R2: -2.37202164829
    # AdaBoostRegressor -- rmlSE: -0.706385708459,R2: -0.103966980393
def test_regression():
    """Smoke-test BaggingRegressor over a grid of parameter settings.

    Every combination in the grid is fitted and used for prediction with
    each base estimator; the test passes when nothing raises.
    """
    # Check regression for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    # The "max_features" list was truncated in the source; [0.5, 1.0]
    # mirrors the "max_samples" entry.
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [0.5, 1.0],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyRegressor(),
                           DecisionTreeRegressor(),
                           KNeighborsRegressor(),
                           SVR()]:
        for params in grid:
            BaggingRegressor(base_estimator=base_estimator,
                             random_state=rng,
                             **params).fit(X_train, y_train).predict(X_test)
def load(file_path):
    """Rebuild a RankGBM model from the files stored next to ``file_path``.

    ``file_path + '.params'`` is read as JSON and supplies the constructor
    arguments; one weak learner per boosting round is loaded from
    ``file_path + '.wl<i>'`` with joblib and attached to the model.

    :param file_path: common path prefix of the saved model files
    :return: the reconstructed ``RankGBM`` instance
    """
    with open(file_path + '.params', 'r') as params_file:
        params = json.load(params_file)

    # NOTE(review): joblib.load unpickles its input; only use on trusted files.
    learners = [joblib.load(file_path + '.wl%d' % round_id)
                for round_id in range(params['n_round'])]

    model = RankGBM(params['Vote_k'],
                    n_round=params['n_round'],
                    max_depth=params['max_depth'],
                    max_features=params['max_features'],
                    min_samples_leaf=params['min_samples_leaf'],
                    learn_rate=params['learn_rate'])
    model.weak_learners = learners
    return model
def model_cross_valid(X, Y):
    """Print the mean 10-fold cross-validated score of each candidate model.

    :param X: feature matrix passed to ``cross_val_score``
    :param Y: target values passed to ``cross_val_score``
    :return: None; one line per model class is printed
    """
    # random_state is only meaningful together with shuffle=True (and recent
    # scikit-learn rejects it otherwise), so the unshuffled split is built
    # without it.
    kfold = model_selection.KFold(n_splits=10)
    scoring = 'neg_mean_squared_error'

    # + random fest boost lstm gbdt
    # Other candidates: Ridge, Lasso, KNeighborsRegressor,
    # DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor,
    # GradientBoostingRegressor.
    for model_name in [LinearRegression, ElasticNet]:
        model = model_name()
        # The feature matrix X was missing from this call.
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def parameterChoosing(self):
    """Grid-search ``max_features`` and ``max_depth`` for a decision tree.

    Fits the search on ``self.X_train`` / ``self.y_train``, prints the best
    parameters and the per-candidate scores, then prints the mean squared
    error of the refitted search object on ``self.X_test`` / ``self.y_test``.
    """
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_features': ['sqrt', 'log2', None],
                         'max_depth': range(2, 1000),
                         }
                        ]

    # NOTE(review): 'mean_squared_error' scoring and grid_scores_ belong to
    # the legacy scikit-learn grid-search API; confirm the installed version.
    reg = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Best parameters set found on development set:\n")
    print(reg.best_params_)

    print("Grid scores on development set:\n")
    for params, mean_score, scores in reg.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("MSE for test data set:\n")
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print(mean_squared_error(y_true, y_pred))
def convert(model, feature_names, target):
    """Convert a decision tree model to protobuf format.

    Parameters
    ----------
    model : DecisionTreeRegressor
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model

    Raises
    ------
    RuntimeError
        If scikit-learn is not available.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _tree.DecisionTreeRegressor)
    # The predicate must inspect its own argument, not the enclosing `model`.
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'tree_') and m.tree_ is not None)
    # NOTE(review): feature_names was accepted but never forwarded; it is
    # passed through here on the assumption that the helper takes
    # (model, feature_names, target) -- confirm against _convert_tree_ensemble.
    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
def test_tree_regressor(self):
    """Compare scikit-learn and converted-model predictions per dtype.

    For every dtype key in ``self.number_data_type`` the data is cast, a
    tree is prepared through ``self._sklearn_setup``, and the first sample's
    prediction is compared (type and value) between both models. A
    ``RuntimeError`` during comparison is reported as an unsupported dtype.
    """
    for dtype in self.number_data_type.keys():
        data = self.scikit_data['data'].astype(dtype)
        target = self.scikit_data['target'].astype(dtype)
        scikit_model, spec = self._sklearn_setup(
            DecisionTreeRegressor(random_state=1), dtype, data, target)
        test_data = data[0].reshape(1, -1)
        self._check_tree_model(spec, 'multiArrayType', 'doubleType', 1)
        coreml_model = create_model(spec)

        def sklearn_value():
            return scikit_model.predict(test_data)[0]

        def coreml_value():
            return coreml_model.predict({'data': test_data})['target']

        try:
            self.assertEqual(sklearn_value().dtype, type(coreml_value()))
            self.assertEqual(
                sklearn_value(),
                coreml_value(),
                msg="{} != {} for Dtype: {}".format(
                    sklearn_value(), coreml_value(), dtype),
            )
        except RuntimeError:
            print("{} not supported. ".format(dtype))
def decision_tree(X, y, regression, max_depth=3):
    """Fit a shallow decision tree and return it rendered as a PNG image.

    :param X: feature table; ``X.columns`` supplies the feature names
    :param y: target values
    :param regression: truthy for a DecisionTreeRegressor, otherwise a
        DecisionTreeClassifier is used
    :param max_depth: maximum depth of the fitted tree
    :return: ``IPython.display.Image`` containing the rendered tree
    """
    from sklearn.tree import export_graphviz
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        # sklearn.externals.six is gone from recent scikit-learn releases.
        from io import StringIO
    from IPython.core.pylabtools import figsize
    from IPython.display import Image
    import pydot

    figsize(12.5, 6)

    tree_cls = DecisionTreeRegressor if regression else DecisionTreeClassifier
    clf = tree_cls(max_depth=max_depth)
    clf.fit(X, y)

    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data, feature_names=list(X.columns),
                    filled=True, rounded=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # Newer pydot returns a list of graphs, older versions a single graph.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]
    return Image(graph.create_png())
def test_DecisionTreeRegressor(*data):
    '''
    Fit a default DecisionTreeRegressor, print its train/test scores and
    plot the samples together with the predicted curve on [0, 5).

    :param data: train_data,test_data,train_value,test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    regr = DecisionTreeRegressor()
    regr.fit(X_train, y_train)
    print("Training score:{0}".format(regr.score(X_train, y_train)))
    print("Testing score:{0}".format(regr.score(X_test, y_test)))
    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    X = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
    Y = regr.predict(X)
    # scatter/plot need both coordinates; the y arguments were missing.
    ax.scatter(X_train, y_train, label="train sample", c='g')
    ax.scatter(X_test, y_test, label="test sample", c='r')
    ax.plot(X, Y, label="predict_value", linewidth=2, alpha=0.5)
    ax.set_xlabel("data")
    ax.set_ylabel("target")
    ax.set_title("Decision Tree Regression")
    ax.legend(framealpha=0.5)
    plt.show()
def bench_scikit_tree_regressor(X, Y):
    """Benchmark with scikit-learn decision tree regressor

    Times one fit-then-predict cycle on ``(X, Y)`` and appends the elapsed
    time (seconds plus microseconds divided by ``mu_second``) to
    ``scikit_regressor_results``.
    """
    from sklearn.tree import DecisionTreeRegressor

    gc.collect()

    # start time
    tstart = datetime.now()  # the method is lowercase `now`
    clf = DecisionTreeRegressor()
    clf.fit(X, Y).predict(X)
    delta = (datetime.now() - tstart)
    # stop time

    scikit_regressor_results.append(
        delta.seconds + delta.microseconds / mu_second)
def test_importances_gini_equal_mse():
    """Check that gini is equivalent to mse for binary output variable."""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict this maximal depth -- for BOTH trees,
    # otherwise their structures cannot be compared node by node.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
def test_friedman_mse_in_graphviz():
    """Check that exported node labels mention the friedman_mse criterion.

    Exports a single regressor tree and the trees of a two-stage gradient
    boosting classifier into one buffer, then inspects every bracketed
    label that contains "samples".
    """
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    # The source had `y` as a positional argument after a keyword (a syntax
    # error); the classifier has to be fitted before estimators_ exists.
    # NOTE(review): random_state=0 mirrors the regressor above -- confirm.
    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    # Raw string: same pattern, without invalid-escape warnings.
    for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())
def test_bootstrap_features():
    """Test that bootstrapping features may generate duplicate features."""
    rng = check_random_state(0)
    # train_test_split on two arrays returns four pieces.
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    # Without feature bootstrapping every estimator sees each feature once.
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=False,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_equal(boston.data.shape[1], np.unique(features).shape[0])

    # With feature bootstrapping some features are drawn more than once, so
    # fewer unique ones remain. The constructor call was truncated in the
    # source; it is rebuilt to mirror the one above.
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=True,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_greater(boston.data.shape[1], np.unique(features).shape[0])
def test_parallel_regression():
    """Check that predictions do not depend on the number of jobs."""
    # Check parallel regression.
    rng = check_random_state(0)

    # The split call was truncated in the source.
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    # Same seed as above so a single-job fit is comparable with y1.
    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=1,
                                random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
def test_gridsearch():
    """Check that base trees can be grid-searched."""
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    # The max_depth tuple was truncated in the source; (1, 2) mirrors the
    # n_estimators entry.
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
def _get_shape_for_attribute(attribute_data, labels, class_weights, feature_name, criterion, splitter,
                             max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
                             max_features, random_state, max_leaf_nodes, presort):
    """Fit a one-feature regression tree and summarise it for ``feature_name``.

    ``attribute_data`` is reshaped into a single column and fitted against
    ``labels`` with a DecisionTreeRegressor built from the given tree
    options. ``class_weights`` is accepted but not used here.

    :return: tuple ``(feature_name, gamma_sum)`` where ``gamma_sum`` is the
        result of ``_get_sum_of_gamma_correction`` on the fitted tree
    """
    tree_options = dict(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        presort=presort,
    )
    regressor = DecisionTreeRegressor(**tree_options)
    single_column = attribute_data.reshape(-1, 1)
    regressor.fit(single_column, labels)
    gamma_sum = _get_sum_of_gamma_correction(regressor.tree_, attribute_data, feature_name)
    return feature_name, gamma_sum
def create_model(list_of_features):
    """Train a bagged decision-tree ensemble and write ``submission.csv``.

    The model is fitted on the ``list_of_features`` columns of the
    module-level ``data_frame`` against its last column, then used to
    predict for ``data_frame_test``. One ``Id,SalePrice`` row per test
    record is written to ``submission.csv``.

    :param list_of_features: column names used as model inputs
    :return: None
    """
    n_estimators = 10000
    n_jobs = 4
    random_state = 0

    x_train = data_frame[list_of_features]
    y_train = data_frame.iloc[:, -1]
    x_test = data_frame_test[list_of_features]

    forest = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                              n_estimators=n_estimators,
                              random_state=random_state,
                              n_jobs=n_jobs)
    forest.fit(x_train, y_train)
    # DataFrame.as_matrix() was removed from pandas; .values is equivalent.
    predictions = forest.predict(x_test.values)

    # The context manager guarantees the file is flushed and closed; zip
    # pairs each id with its prediction without a manual counter.
    with open('submission.csv', 'w') as submission:
        submission.write("Id,SalePrice\n")
        for row_id, price in zip(data_frame_test['Id'], predictions):
            submission.write("{},{}\n".format(row_id, price))
def RunTestor():
    """Train and score decision-tree models for volume and travel time.

    For each task the base vectors are generated, enriched with weather
    data, post-processed, split 90/10, fitted with a default
    DecisionTreeRegressor and scored with the task's MAPE helper. Both
    MAPE values are printed.

    :return: True
    """
    # Create Volume test
    volume_x, volume_y = generateBaseVector(volume_filename, "volume")
    volume_x = generateProcessedVolumeVector(
        createVolumeVector(volume_x, weather_filename))
    vol_train_x, vol_test_x, vol_train_y, vol_test_y = train_test_split(
        volume_x, volume_y, test_size=0.1)
    volume_model = DecisionTreeRegressor()
    volume_model.fit(vol_train_x, vol_train_y)
    volume_pred = volume_model.predict(vol_test_x)
    volume_mape = VolumeMAPE(vol_test_x, volume_pred, vol_test_y)

    # Create Travel Time test
    travel_x, travel_y = generateBaseVector(travel_filename, "travel_time")
    travel_x = generateProcessedTravelTimeVector(
        createTravelTimeVector(travel_x, weather_filename))
    trv_train_x, trv_test_x, trv_train_y, trv_test_y = train_test_split(
        travel_x, travel_y, test_size=0.1)
    travel_model = DecisionTreeRegressor()
    travel_model.fit(trv_train_x, trv_train_y)
    travel_pred = travel_model.predict(trv_test_x)
    travel_mape = TravelTimeMAPE(trv_test_x, travel_pred, trv_test_y)

    print("MAPE of Volume Prediction: " + str(volume_mape) + "\n")
    print("MAPE of Travel Prediction: " + str(travel_mape) + "\n")
    return True
def setClf(self):
    """Store a seeded DecisionTreeRegressor (min_samples_split=10) in ``self.clf``."""
    self.clf = DecisionTreeRegressor(random_state=0, min_samples_split=10)
def test_logitboost_musk_fitting():
    """LogitBoost with depth-1 trees should fit the MUSK1 data.

    Asserts every recorded estimator error is below 0.6 and the training
    zero-one loss is below 0.05.
    """
    stump = DecisionTreeRegressor(max_depth=1)
    booster = LogitBoostClassifier(base_estimator=stump,
                                   n_estimators=30,
                                   learning_rate=1.0)
    musk = MUSK1()
    booster.fit(musk.data, np.sign(musk.labels))
    assert_array_less(booster.estimator_errors_, 0.6)
    training_loss = zero_one_loss(np.sign(musk.labels), booster.predict(musk.data))
    assert training_loss < 0.05
def test_logitboost_hastie_fitting():
    """LogitBoost with depth-1 trees should fit the Hastie 10.2 data.

    Asserts every recorded estimator error is below 0.5 and the training
    zero-one loss is below 0.2.
    """
    c = LogitBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        learning_rate=1.0
    )
    data = Hastie_10_2()
    # The source read `c.fit(data.data, 0.5)`: the label argument and the
    # error assertion were fused. They are rebuilt to match the sibling
    # MUSK test. NOTE(review): that sibling also sets n_estimators=30;
    # confirm whether it belongs here too.
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.5)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
def test_gentleboost_musk_fitting():
    """GentleBoost with depth-1 trees should fit the MUSK1 data.

    Asserts the training zero-one loss is below 0.1.
    """
    c = GentleBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        n_estimators=30,
        learning_rate=1.0
    )
    data = MUSK1()
    # The source predicted with the unfitted model inside fit() and threw
    # the comparison away. Fit on the signed labels, then assert the loss,
    # as the LogitBoost tests in this file do.
    # NOTE(review): any estimator_errors_ check the original had is lost.
    c.fit(data.data, np.sign(data.labels))
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.1
def test_gentleboost_hastie_fitting():
    """GentleBoost with depth-1 trees should fit the Hastie 10.2 data.

    Asserts the training zero-one loss is below 0.2.
    """
    c = GentleBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        learning_rate=1.0
    )
    data = Hastie_10_2()
    # The source predicted with the unfitted model inside fit() and threw
    # the comparison away. Fit on the signed labels, then assert the loss,
    # as the LogitBoost tests in this file do.
    # NOTE(review): any estimator_errors_ check the original had is lost.
    c.fit(data.data, np.sign(data.labels))
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
def fit(self, X, y, sample_weight=None):
    """Normalise the stored hyper-parameters and fit a DecisionTreeRegressor.

    String/float hyper-parameters on ``self`` are cast to the types the
    estimator needs; ``"None"`` strings become ``None``. A numeric
    ``self.max_depth`` is treated as a multiple of the number of feature
    columns, with a floor of 1.

    :param X: training features; ``X.shape[1]`` is the number of features
    :param y: training targets
    :param sample_weight: optional per-sample weights forwarded to ``fit``
    :return: self
    """
    from sklearn.tree import DecisionTreeRegressor

    # X and y are read by the body but were missing from the signature and
    # from the estimator's fit call.
    self.max_features = float(self.max_features)
    if self.max_depth == "None":
        # Bind the local as well; it was left undefined on this branch.
        max_depth = self.max_depth = None
    else:
        num_features = X.shape[1]
        max_depth = max(1, int(np.round(self.max_depth * num_features, 0)))
    self.min_samples_split = int(self.min_samples_split)
    self.min_samples_leaf = int(self.min_samples_leaf)
    if self.max_leaf_nodes == "None":
        self.max_leaf_nodes = None
    else:
        self.max_leaf_nodes = int(self.max_leaf_nodes)
    self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

    # NOTE(review): max_features and min_weight_fraction_leaf are cast above
    # but not passed to the estimator -- confirm whether that is intended.
    self.estimator = DecisionTreeRegressor(
        criterion=self.criterion,
        max_depth=max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        max_leaf_nodes=self.max_leaf_nodes,
        random_state=self.random_state)
    self.estimator.fit(X, y, sample_weight=sample_weight)
    return self
def test_cart_d1_agrees_with_scikit():
    """Depth-1 GaussCART and scikit-learn trees should give the same error.

    Both models predict on the module-level ``X``; their ``sose`` values
    against ``y`` are compared after rounding to 6 decimals.
    """
    own_predictions = GaussCART(X, 1).predict(X)

    reference = tree.DecisionTreeRegressor(max_depth=1)
    reference = reference.fit(X, y)
    reference_predictions = reference.predict(X)

    own_error = np.round(sose(y, own_predictions), 6)
    reference_error = np.round(sose(y, reference_predictions), 6)
    assert own_error == reference_error
def test_cart_d3_agrees_with_scikit():
    """Depth-3 GaussCART and scikit-learn trees should give the same error.

    Both models predict on the module-level ``X``; their ``sose`` values
    against ``y`` are compared after rounding to 6 decimals.
    """
    d_cart = GaussCART(X, 3)
    d_pred = d_cart.predict(X)

    sk_cart = tree.DecisionTreeRegressor(max_depth=3)
    # The source read `fit(X, 6)` and then compared undefined names. The
    # missing steps are restored from the depth-1 test in this file.
    sk_cart = sk_cart.fit(X, y)
    sk_pred = sk_cart.predict(X)

    d_error = np.round(sose(y, d_pred), 6)
    sk_error = np.round(sose(y, sk_pred), 6)
    assert d_error == sk_error
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    """Fit each candidate model, plot its residuals and run diagnostics.

    For every model class: fit on the training data, print the mean squared
    residual and the model's score on the test data, show a
    residual-vs-prediction scatter plot, then print p-values for the
    Jarque-Bera, Breusch-Pagan and Ljung-Box tests on the residuals.

    :param TrainX: training features
    :param TrainY: training targets
    :param TestX: test features
    :param TestY: test targets
    :return: None
    """
    from statsmodels.stats.stattools import jarque_bera
    import statsmodels.api as sms
    import statsmodels.stats.diagnostic as smd

    # for model_name in [LinearRegression, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = model_name()
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        print("Residual sum of squares: %f" % np.mean(resid ** 2))

        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))

        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals normal", pvalue)

        xs_with_constant = sms.add_constant(TestX)
        # het_breuschpagan returns (lm, lm_pvalue, fvalue, f_pvalue); the
        # source misspelled the name and unpacked only three values.
        _, pvalue1, _, _ = smd.het_breuschpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)

        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        # Older statsmodels returns a (statistic, pvalue) tuple, newer
        # versions a DataFrame with an "lb_pvalue" column.
        if hasattr(ljung_box, "columns"):
            lb_pvalues = np.asarray(ljung_box["lb_pvalue"])
        else:
            lb_pvalues = np.asarray(ljung_box[1])
        print("Test Autocorrelation P-values: %s" % (lb_pvalues,))
        if (lb_pvalues < 0.05).any():
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def __init__(self, isTrain):
    """Initialise the base class and build the AdaBoost regressor.

    The regressor boosts depth-10 decision trees for 400 rounds, seeded
    with ``RandomState(1)``, and is stored in ``self.adaReg``.

    :param isTrain: forwarded unchanged to the parent constructor
    """
    super(RegressionAdaBoost, self).__init__(isTrain)
    # data preprocessing is left to the caller (self.dataPreprocessing()).
    self.adaReg = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=10),
        n_estimators=400,
        random_state=np.random.RandomState(1),
    )
def drawValidationCurve(self):
    """
    Draw the validation curve of a decision tree over ``max_depth`` 2..59,
    computed on ``self.X_train`` / ``self.y_train``.

    :return:NA
    """
    X, y = self.X_train, self.y_train.ravel()
    indices = np.arange(y.shape[0])
    # np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    train_sizes = range(2, 60)
    # X and y were prepared above but missing from the call. The parameter
    # name and range are passed by keyword so the call does not depend on
    # positional order.
    # NOTE(review): the -1.0/5 scaling below suggests cv=5 may also have
    # been passed originally -- confirm.
    train_scores, valid_scores = validation_curve(
        DecisionTreeRegressor(max_features=None), X, y,
        param_name="max_depth", param_range=train_sizes,
        scoring='mean_squared_error')
    train_scores = -1.0/5 * train_scores
    valid_scores = -1.0/5 * valid_scores

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training MSE")
    plt.plot(train_sizes, valid_scores_mean, '*-', color="g",
             label="cross-validation MSE")

    plt.legend(loc="best")
    plt.xlabel('Max Depth')
    plt.ylabel('MSE')
    plt.title('Validation Curve with Decision \nTree Regression on the parameter of Max Depth')
    plt.grid(True)
    plt.show()
def test_search_cv_results_none_param():
    """Check that a ``None`` parameter value is kept in ``cv_results_``."""
    # X and y need the same number of samples; y was truncated to three.
    X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1]
    estimators = (DecisionTreeRegressor(), DecisionTreeClassifier())
    est_parameters = {"random_state": [0, None]}
    # random_state has no effect without shuffle=True and is rejected by
    # recent scikit-learn, so the default splitter is used.
    cv = KFold()

    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv).fit(X, y)
        assert_array_equal(grid_search.cv_results_['param_random_state'],
                           [0, None])
def regress(y, x, test_x=None):
    """Fit a decision tree on ``(x, y)`` and scatter-plot ``y`` against predictions.

    :param y: target values
    :param x: training features
    :param test_x: features to predict on; when omitted or empty the
        training features are used
    :return: None
    """
    # None replaces the mutable `[]` default; empty inputs still fall back
    # to the training features as before.
    if test_x is None or len(test_x) == 0:
        test_x = x
    clf = DecisionTreeRegressor()
    clf.fit(x, y)
    y_p = clf.predict(test_x)
    plt.scatter(y, y_p)
def ada_boost_tree_grid_search():
    """Return the AdaBoost-over-decision-tree parameter grid and estimator.

    :return: tuple ``(grid, estimator)`` where ``grid`` maps parameter
        names to candidate values and ``estimator`` is an AdaBoostRegressor
        wrapping a default DecisionTreeRegressor
    """
    grid = dict(
        base_estimator__max_features=['sqrt'],
        base_estimator__splitter=['best', 'random'],
        base_estimator__min_samples_split=[2, 4],
        base_estimator__max_depth=[1, 3],
        n_estimators=[50, 100, 1000],
        learning_rate=[.001, .01, .1],
        loss=['linear', 'square', 'exponential'],
    )
    return grid, AdaBoostRegressor(DecisionTreeRegressor())
def setUpClass(self):
    """
    Load the Boston dataset, train a seeded decision tree on it and keep
    both on ``self`` for the tests.
    """
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeRegressor

    dataset = load_boston()
    regressor = DecisionTreeRegressor(random_state=1)
    regressor.fit(dataset['data'], dataset['target'])

    # Save the data and the model
    self.scikit_data = dataset
    self.scikit_model = regressor
def test_conversion_bad_inputs(self):
    """The converter must reject untrained models and wrong model classes."""
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = DecisionTreeRegressor()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        # Use the same argument list as above. With an argument missing,
        # the test would pass on a TypeError from the call itself instead
        # of on the converter's type check.
        spec = skl_converter.convert(model, 'data', 'out')
def setUpClass(self):
    """
    Load the Boston dataset and expose its arrays, feature names and the
    output column name on ``self`` for the tests.
    """
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeRegressor

    # Load data; no model is trained in this block.
    dataset = load_boston()

    self.scikit_data = dataset
    self.X = dataset['data']
    self.target = dataset['target']
    self.feature_names = dataset.feature_names
    self.output_name = 'target'
def spot_check(X, y, type='regression'):
    """Score a set of baseline regressors and print one line per model.

    :param X: feature matrix
    :param y: target values
    :param type: problem type; only ``'regression'`` is supported. The
        source compared the builtin ``type`` with a string, which is never
        true. The name is kept for keyword callers.
    :raises ValueError: for any other problem type
    :return: None
    """
    if type != 'regression':
        raise ValueError("unsupported problem type: %r" % (type,))

    models = [
        (LinearRegression(), 'Ordinary Least Squares'),
        (Ridge(alpha=0.1), 'Ridge (alpha 0.1)'),
        (Ridge(), 'Ridge (alpha 1.0)'),
        (Lasso(alpha=0.1), 'Lasso (alpha 0.1)'),
        (Lasso(), 'Lasso (alpha 1.0)'),
        (ElasticNet(alpha=0.1), 'ElasticNet (alpha 0.1)'),
        (ElasticNet(), 'ElasticNet (alpha 1.0)'),
        (DecisionTreeRegressor(), 'Decision Tree'),
        (KNeighborsRegressor(), 'K-Nearest Neighbors'),
        # (RandomForestRegressor(),'Random Forest Regressor'),
        # (BaggingRegressor(),'Bagging Regressor'),
        # (GradientBoostingRegressor(),'Gradient Bosted Regression'),
        # (SVR(),'Support Vector Regression')
    ]

    splits = 5
    # NOTE(review): X was never used in the source; it is passed here on the
    # assumption that check_model takes (model, splits, X, y) -- confirm.
    scores = [check_model(model, splits, X, y) for model, _ in models]

    for (_, name), score in zip(models, scores):
        print('%s: %f' % (name, score))
def test_boston(self):
    """Our tree's mean MSE on Boston must be within 20% of scikit-learn's.

    Both models are fitted on five random 67/33 splits and their test MSEs
    averaged before comparison.
    """
    from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn

    model = DecisionTreeRegressor(max_n_splits=3)
    model_sklearn = DecisionTreeRegressorSklearn()

    dataset = load_boston()
    mse = []
    mse_sklearn = []

    for fold in range(5):
        # train_test_split on two arrays returns four pieces.
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.33)

        model.fit(X_train, y_train)
        y = model.predict(X_test)
        mse.append(mean_squared_error(y, y_test))

        model_sklearn.fit(X_train, y_train)
        y = model_sklearn.predict(X_test)
        mse_sklearn.append(mean_squared_error(y, y_test))

    mean_mse = np.mean(mse)
    mean_mse_sklearn = np.mean(mse_sklearn)
    print(mean_mse, mean_mse_sklearn)
    # Check that our model differs in MSE no worse than 20%
    self.assertTrue(np.abs(mean_mse - mean_mse_sklearn) / mean_mse_sklearn < 0.2)
def test_boston(self):
    """The oblivious tree's mean MSE must be within 50% of scikit-learn's.

    Both models are fitted on five random 67/33 splits and their test MSEs
    averaged before comparison.
    """
    from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn

    # 'oblivious' was case-mangled in the source.
    model = DecisionTreeRegressor(tree_type='oblivious', max_n_splits=3)
    model_sklearn = DecisionTreeRegressorSklearn()

    dataset = load_boston()
    mse = []
    mse_sklearn = []

    # The loop body was truncated in the source. It is rebuilt from the
    # non-oblivious test_boston in this file -- NOTE(review): confirm.
    for fold in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.33)

        model.fit(X_train, y_train)
        y = model.predict(X_test)
        mse.append(mean_squared_error(y, y_test))

        model_sklearn.fit(X_train, y_train)
        y = model_sklearn.predict(X_test)
        mse_sklearn.append(mean_squared_error(y, y_test))

    mean_mse = np.mean(mse)
    mean_mse_sklearn = np.mean(mse_sklearn)
    print(mean_mse, mean_mse_sklearn)
    # Check that our model differs in MSE no worse than 50%
    self.assertTrue(np.abs(mean_mse - mean_mse_sklearn) / mean_mse_sklearn < 0.5)
# def test_check_estimators(self):
# """
# Tests that models adhere to scikit-learn Estimator interface.
# """
# check_estimator(DecisionTreeClassifier)
def __init__(self, problem_type):
    """Pick a seeded decision tree matching ``problem_type``.

    :param problem_type: stored on the instance and consulted through
        ``_is_classification`` / ``_is_regression``
    :raises NotImplementedError: when neither predicate holds
    """
    self.problem_type = problem_type

    if self._is_classification():
        self.model = DecisionTreeClassifier(random_state=RANDOM_STATE+1)
        return
    if self._is_regression():
        self.model = DecisionTreeRegressor(random_state=RANDOM_STATE+2)
        return
    raise NotImplementedError
def __init__(self, base_estimator=None, max_features=1.0,
             max_depth=6, learning_rate=1.0, loss='linear', random_state=None,
             n_estimators=50):
    """Wrap scikit-learn's AdaBoostRegressor around a tree base estimator.

    :param base_estimator: ``'etr'`` selects an ExtraTreeRegressor; any
        other value selects a DecisionTreeRegressor
    :param max_features: forwarded to the base tree
    :param max_depth: forwarded to the base tree
    :param learning_rate: forwarded to AdaBoostRegressor
    :param loss: forwarded to AdaBoostRegressor
    :param random_state: forwarded to AdaBoostRegressor
    :param n_estimators: number of boosting rounds. The body used this name
        without defining it; it is appended as a keyword with a default of
        50 so existing positional calls keep working.
    """
    # `x and x == 'etr'` reduces to a plain equality test.
    if base_estimator == 'etr':
        base_estimator = ExtraTreeRegressor(max_depth=max_depth,
                                            max_features=max_features)
    else:
        base_estimator = DecisionTreeRegressor(max_depth=max_depth,
                                               max_features=max_features)

    self.model = sklearn.ensemble.AdaBoostRegressor(
        base_estimator=base_estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=random_state,
        loss=loss)
def test_DecisionTreeRegressor_splitter(*data):
    '''
    Print train/test scores of a DecisionTreeRegressor for each splitter.

    :param data: train_data,test_data,train_value,test_value
    :return: None
    '''
    # All four pieces are used below; the unpacking was truncated to two.
    X_train, X_test, y_train, y_test = data
    splitters = ['best', 'random']
    for splitter in splitters:
        regr = DecisionTreeRegressor(splitter=splitter)
        regr.fit(X_train, y_train)
        print("Splitter {0}".format(splitter))
        print("Training score:{0}".format(regr.score(X_train, y_train)))
        print("Testing score:{0}".format(regr.score(X_test, y_test)))
def test_DecisionTreeRegressor_depth(*data, maxdepth):
    '''
    Plot train/test scores of a DecisionTreeRegressor for max_depth values
    1 .. maxdepth-1.

    :param data: train_data,test_data,train_value,test_value
    :param maxdepth: an integer (exclusive upper bound of the depths tried)
    :return: None
    '''
    # All four pieces are used below; the unpacking was truncated to two.
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, maxdepth)
    training_scores = []
    testing_scores = []
    for depth in depths:
        regr = DecisionTreeRegressor(max_depth=depth)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))

    ## graph
    fig = plt.figure()
    # add_subplot takes (rows, cols, index).
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(depths, training_scores, label="traing score")
    ax.plot(depths, testing_scores, label="testing score")
    ax.set_xlabel("maxdepth")
    ax.set_ylabel("score")
    ax.set_title("Decision Tree Regression")
    ax.legend(framealpha=0.5)
    plt.show()
def test_presort_sparse():
    """Yield one ``check_presort_sparse`` case per estimator and sparse format.

    Estimators are created with ``presort=True``. The data comes from
    ``make_multilabel_classification`` and only the first column of ``y``
    is kept.
    """
    presorted_estimators = (DecisionTreeClassifier(presort=True),
                            DecisionTreeRegressor(presort=True))
    sparse_formats = (csr_matrix, csc_matrix, coo_matrix)

    # The unpacking order (y first, then X) is kept exactly as in the source.
    y, X = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50,
                                                   n_features=1,
                                                   n_classes=20)
    y = y[:, 0]

    for estimator in presorted_estimators:
        for to_sparse in sparse_formats:
            yield check_presort_sparse, estimator, to_sparse(X), y
def test_oob_score_regression():
    """Check that oob prediction is a good estimation of the generalization error."""
    rng = check_random_state(0)
    # The split call was truncated in the source.
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators: fitting is expected to emit a UserWarning.
    assert_warns(UserWarning,
                 BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  n_estimators=1,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=rng).fit,
                 X_train,
                 y_train)
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。