如何解决使用sklearn进行三次样条回归?
我想非常准确地回归一个与单个变量 x 非线性相关的目标。在我的 sklearn 管道中,我使用:
pipe = Pipeline([('poly',polynomialFeatures(3,include_bias=False)),\
('regr',ElasticNet(random_state=0))])
这似乎在准确性方面给出了与 np.polyfit(x,y,3)
相似的结果。但是,我基本上可以通过使用三次样条来达到机器精度。见下图,我展示了数据和各种拟合,以及残差。 [注意:下面的例子有 50 个样本。我实际有 2000 个样本]
我有两个问题:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
%matplotlib inline
data = pd.read_csv('example.txt') # added to this post below
p = np.polyfit(data['x'],data['y'],3)
data['polyfit'] = np.poly1d(p)(data['x'])
f = interp1d(data['x'],kind='cubic')
data['spline'] = f(data['x'])
fig,axes = plt.subplots(nrows=3,sharex=True)
axes[0].plot(data['x'],data['polyfit'],'.',label='polyfit')
axes[0].plot(data['x'],data['spline'],label='spline')
axes[0].plot(data['x'],label='true')
axes[0].legend()
axes[1].plot(data['x'],data['polyfit']-data['y'],label='error polyfit')
axes[1].legend()
axes[2].plot(data['x'],data['spline']-data['y'],label='error spline')
axes[2].legend()
plt.show()
数据如下:
example.txt:
,x,y
257,6.26028462060192,-1233.748349982897
944,4.557099191827032,928.1430280794456
1560,6.765081341690966,-1807.9090703632864
504,4.0015671921214775,1683.311523022658
1499,3.0496689401255783,3055.291788377236
1247,5.608726443314061,-441.9226126757499
1856,4.6124942196224845,845.129184983355
1495,1.273838638033053,5479.078773760113
1052,5.353775782315625,-115.14032709875217
247,2.6495185259267076,3656.7467318648232
1841,9.73337795053495,-4884.806993807511
1574,1.1772247845133335,5544.080005636716
1116,5.698561786140379,-555.3435567718
1489,4.184371293153768,1427.6922357286753
603,1.568868565047676,5179.156099377357
358,4.534081088923849,960.3983442022857
774,9.304809492028289,-4468.215701489676
1525,9.17423541311121,-4340.565494266174
1159,6.705834877066449,-1750.189447626367
1959,3.0431599461645207,3065.358649171256
1086,1.3861557136230234,5378.274828554064
81,4.728366950632029,682.7245723055514
1791,6.954198834068505,-2027.0414501796324
234,2.8672306789699844,3330.7282514295102
1850,2.0086469278742363,4603.0931759401155
1531,9.843164998128215,-4973.735518791005
903,1.534448692052103,5220.331847067942
1258,7.243723209152924,-2354.629822080041
645,2.3302780902754514,4128.077572586273
1425,3.295574067849755,2694.766296765896
311,2.3225198086033756,4152.206609684557
219,8.479436097125713,-3665.2515034579396
1917,7.1524135031820135,-2253.3455629418195
1412,6.79800860136838,-1861.3756670478142
705,1.9001265482939966,4756.283634364785
663,3.441268690856777,2489.7632239249424
1871,6.473544271091015,-1480.6593600880415
1897,8.217615163361007,-3386.5427698021977
558,6.609652057181176,-1634.1672307700298
553,5.679571371137544,-524.352981663938
1847,6.487178186324092,-1500.1891501936236
752,9.377368455681758,-4548.188126821915
1469,8.586759667609758,-3771.691600599668
1794,6.649801445466815,-1674.4870918398076
968,1.6226439291315056,5117.8804886837
108,3.0077346937655647,3118.0786841570025
96,6.278616413290749,-1245.4758811316083
994,7.631678455127069,-2767.3224262153176
871,2.6696610777085863,3630.02481913033
1405,9.209358577104299,-4368.622350004463
解决方法
将单个三次方程用于数据的前两种方法,但是(顾名思义)interp1d
用三次样条插值数据:即存在三次曲线对于每个连续的点对,保证完美匹配(达到计算精度)。
我想对我之前的问题进行跟进。可以使用 scikit-learn 拟合基于 B-spline 的模型,该模型具有有限的复杂性(预定义的样条数 - 不像 interp1d 那样随着点数增长)。以下代码取自 robust_splines_sklearn.py。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import splrep,splev
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.metrics import mean_squared_error
def get_bspline_basis(knots,degree=3,periodic=False):
"""Get spline coefficients for each basis spline."""
nknots = len(knots)
y_dummy = np.zeros(nknots)
knots,coeffs,degree = splrep(knots,y_dummy,k=degree,per=periodic)
ncoeffs = len(coeffs)
bsplines = []
for ispline in range(nknots):
coeffs = [1.0 if ispl == ispline else 0.0 for ispl in range(ncoeffs)]
bsplines.append((knots,degree))
return bsplines
class BSplineFeatures(TransformerMixin):
def __init__(self,knots,periodic=False):
self.bsplines = get_bspline_basis(knots,degree,periodic=periodic)
self.nsplines = len(self.bsplines)
def fit(self,X,y=None):
return self
def transform(self,X):
nsamples,nfeatures = X.shape
features = np.zeros((nsamples,nfeatures * self.nsplines))
for ispline,spline in enumerate(self.bsplines):
istart = ispline * nfeatures
iend = (ispline + 1) * nfeatures
features[:,istart:iend] = splev(X,spline)
return features
data = pd.read_csv('example.txt')
knots = np.array([1,1.5,2,3,4,5,7.5,10])
bspline_features = BSplineFeatures(knots,periodic=False)
base_regressor = LinearRegression()
pipe1 = Pipeline([('poly',PolynomialFeatures(3,include_bias=False)),\
('regr',ElasticNet(random_state=0))])
pipe2 = Pipeline([('feature',bspline_features),\
('regression',base_regressor)])
models = {"polyfit": pipe1,"spline": pipe2}
X = data['x'].to_numpy().reshape((data.shape[0],1))
y = data['y']
for label,model in models.items():
model.fit(X,y)
data[label] = model.predict(X)
fig,axes = plt.subplots(nrows=3,sharex=True)
axes[0].plot(data['x'],data['polyfit'],'.',label='polyfit')
axes[0].plot(data['x'],data['spline'],label='spline')
axes[0].plot(data['x'],data['y'],label='true')
axes[0].legend()
axes[1].plot(data['x'],data['polyfit']-data['y'],label='error polyfit')
axes[1].legend()
axes[2].plot(data['x'],data['spline']-data['y'],label='error spline')
axes[2].legend()
plt.show()
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。