如何根据给定的直方图计算PDF
我有一个严重偏斜的直方图,想计算生命周期价值(Lifetime Value,LTV)落在某个取值区间内的概率(即PDF曲线下对应的面积)。例如,LTV处于(0, 0.01)区间内的概率。
由LTV组成的数据帧,该数据是通过累计收入/累计安装次数来计算的:
df['LTV']
是
(0,0.208125,0.0558879,0.608348,0.212553,0.0865896,0.728542,0.609512,0.0801339,0.140657,0.0194118,0.0634682,0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0.342055,0.42781,0.192115,1.6473,0.232329,2.21329,0.748,0.0424286,0.455439,0.210282,5.56453,0.427959,0.352059,0.567059,0.384462,1.29476,0.0103125,0.0126923,1.03356,0.289785,0)
我曾尝试使用scikit-learn的KernelDensity,但是,在将其拟合到数据之后,它并没有捕获到占比过高的0值。
import gc
from sklearn.neighbors import KernelDensity
def plot_prob_density(df_lunch,field,x_start,x_end):
    """Plot a normalised histogram of the sample with a Gaussian KDE curve on top.

    Args:
        df_lunch: Sample values. They are handed unchanged to ``plt.hist`` and
            to ``KernelDensity.fit``; the caller in this file passes an
            ``(n, 1)`` array.
        field: Text used as the x-axis label.
        x_start: x position of the red dashed vertical marker.
        x_end: x position of the second dashed vertical marker.

    Returns:
        The fitted ``KernelDensity`` estimator.
    """
    plt.figure(figsize=(10, 7))

    # Evaluation grid covering the observed range; the padding is zero for now.
    padding = 0
    lower = df_lunch.min() - padding
    upper = df_lunch.max() + padding
    grid = np.linspace(lower, upper, 1000)[:, np.newaxis]

    # Normalised histogram of the raw data.
    plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)

    # Kernel density estimate; fit() returns the estimator itself.
    estimator = KernelDensity(kernel='gaussian', bandwidth=0.00187)
    estimator.fit(df_lunch)

    # score_samples() is exponentiated to obtain the estimated density curve.
    density = np.exp(estimator.score_samples(grid))
    plt.plot(grid, density, color='orange')

    # Mark the interval of interest.
    plt.axvline(x=x_start, color='red', linestyle='dashed')
    plt.axvline(x=x_end, linestyle='dashed')

    # Axis labels, legend, and display.
    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()

    gc.collect()
    return estimator
# Fit and plot the KDE for the LTV column, passed as a single-feature column
# array, with markers at 0 and 0.01.
kd_lunch = plot_prob_density(
    final_df['LTV'].values.reshape(-1, 1),
    'LTV',
    x_start=0,
    x_end=0.01,
)
然后按如下方式计算该区间的概率:
def get_probability(start_value,end_value,eval_points,kd):
    """Approximate the probability mass of a fitted density on an interval.

    The density is sampled on an evenly spaced grid from ``start_value`` to
    ``end_value`` (both included) and integrated with the trapezoidal rule.

    Args:
        start_value: Lower bound of the interval.
        end_value: Upper bound of the interval.
        eval_points: Number of grid points used for the integration (>= 2).
        kd: Fitted estimator exposing ``score_samples(X)``; its output is
            exponentiated to obtain density values (as with scikit-learn's
            ``KernelDensity``).

    Returns:
        The approximate integral of the density over the interval, rounded
        to 4 decimal places.

    Raises:
        ValueError: If ``eval_points`` is smaller than 2.
    """
    if eval_points < 2:
        raise ValueError('eval_points must be at least 2')

    # Spacing between neighbouring grid points.
    step = (end_value - start_value) / (eval_points - 1)

    # The grid has to stop at end_value with eval_points samples; passing the
    # point count as the stop value would integrate over the wrong range.
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]

    # Density values at each grid point.
    kd_vals = np.exp(kd.score_samples(x))

    # Trapezoidal rule: the two end points carry half weight, so N samples
    # cover exactly N - 1 steps instead of over-counting by one step.
    probability = step * (np.sum(kd_vals) - 0.5 * (kd_vals[0] + kd_vals[-1]))
    return probability.round(4)
# Report the estimated probability that LTV falls in [0, 0.01]; the label now
# names the interval that is actually passed to get_probability.
print('Probability of LTV in [0, 0.01]: {}\n'
      .format(get_probability(start_value=0, end_value=0.01, eval_points=100, kd=kd_lunch)))
但是,这种方法无法得到我们想要的合理的PDF值。如果有其他方法的建议,将不胜感激。
地点:
解决方法
我在工作中用过一个大致相似的脚本,贴在这里,可能对你有帮助。
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
# Sample to analyse: element 0 of `beta_95`, which is not defined in this snippet.
data1 = beta_95[0]
def plot_prob_density(data1,x_start,x_end):
    """Plot a normalised histogram of ``data1`` with a Gaussian KDE curve on top.

    Args:
        data1: Sample values, handed unchanged to ``plt.hist`` and to
            ``KernelDensity.fit``; the caller in this file passes an
            ``(n, 1)`` array.
        x_start: Currently unused; kept so existing callers keep working.
        x_end: x position of the red vertical line.

    Returns:
        The fitted ``KernelDensity`` estimator.
    """
    plt.figure(figsize=(4, 3.5))

    # Fixed evaluation grid matching the x-limits set below.
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]

    # Plot the data using a normalized histogram.
    plt.hist(data1, bins=np.linspace(-20, 40), density=True, color='r', alpha=0.4)

    # Do kernel density estimation.
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)

    # Plot the estimated density (score_samples output is exponentiated).
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    plt.plot(x, kd_vals_data1, label='$N_a$', linewidth=2)

    # Reference lines: two fixed positions and the caller-supplied x_end.
    plt.axvline(x=9.95, color='green', linestyle='dashed', linewidth=2.0, label='$β_o$')
    plt.axvline(x=1.9, color='black', label='$β_b$')
    # Raw string: '\%' is not a valid Python escape sequence, so the plain
    # literal triggers a warning; the text handed to matplotlib is unchanged.
    plt.axvline(x=x_end, color='red', linewidth=2, label=r'$β_{95\%}$')

    # Show the plots.
    plt.xlabel('Beta', fontsize=10)
    plt.ylabel('Probability Density', fontsize=10)
    plt.title('02 hours window', fontsize=12)
    plt.xlim(-20, 20)
    plt.ylim(0, 0.3)
    plt.yticks([0, 0.1, 0.2, 0.3])
    plt.legend(fontsize=12, loc='upper left', frameon=False)
    plt.show()

    gc.collect()
    return kd_data1
def get_probability(start_value,end_value,eval_points,kd):
    """Approximate the probability mass of a fitted density on an interval.

    The density is sampled on an evenly spaced grid from ``start_value`` to
    ``end_value`` (both included) and integrated with the trapezoidal rule.

    Args:
        start_value: Lower bound of the interval.
        end_value: Upper bound of the interval.
        eval_points: Number of grid points used for the integration (>= 2).
        kd: Fitted estimator exposing ``score_samples(X)``; its output is
            exponentiated to obtain density values (as with scikit-learn's
            ``KernelDensity``).

    Returns:
        The approximate integral of the density over the interval, rounded
        to 4 decimal places.

    Raises:
        ValueError: If ``eval_points`` is smaller than 2.
    """
    if eval_points < 2:
        raise ValueError('eval_points must be at least 2')

    # Spacing between neighbouring grid points.
    step = (end_value - start_value) / (eval_points - 1)

    # The grid has to stop at end_value with eval_points samples; passing the
    # point count as the stop value would integrate over the wrong range.
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]

    # Density values at each grid point.
    kd_vals = np.exp(kd.score_samples(x))

    # Trapezoidal rule: the two end points carry half weight, so N samples
    # cover exactly N - 1 steps instead of over-counting by one step.
    probability = step * (np.sum(kd_vals) - 0.5 * (kd_vals[0] + kd_vals[-1]))
    return probability.round(4)
# Turn the sample into a single-feature column array, fit and plot the KDE,
# then print the value get_probability returns for the interval [-10, 13].
data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print(f'Beta-95%: {get_probability(start_value=-10, end_value=13, eval_points=1000, kd=kd_data1)}\n')
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。