给定直方图来计算PDF

如何根据给定的直方图计算PDF（概率密度函数）

我有一个严重偏斜的直方图，想计算生命周期价值（Lifetime Value，LTV）落在某个区间内的概率（即PDF曲线下对应区间的面积）。例如，LTV处于区间（0, 0.01）内的概率。

由LTV组成的数据帧，该数据是通过累计收入/累计安装次数来计算的：

df['LTV']是

(0,0.208125,0.0558879,0.608348,0.212553,0.0865896,0.728542,0.609512,0.0801339,0.140657,0.0194118,0.0634682,0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0.342055,0.42781,0.192115,1.6473,0.232329,2.21329,0.748,0.0424286,0.455439,0.210282,5.56453,0.427959,0.352059,0.567059,0.384462,1.29476,0.0103125,0.0126923,1.03356,0.289785,0)

我曾尝试使用scikit-learn的KernelDensity，但是在用这些数据拟合之后，它并没有捕捉到数据中大量出现的0值。

import gc
from sklearn.neighbors import KernelDensity

def plot_prob_density(df_lunch,field,x_start,x_end):
    """Plot a normalized histogram of the samples with a Gaussian KDE overlay.

    Args:
        df_lunch: Samples to plot and to fit; passed unchanged to
            ``plt.hist`` and ``KernelDensity.fit``.
        field: Text used as the x-axis label.
        x_start: x position of the red dashed vertical marker.
        x_end: x position of the second dashed vertical marker.

    Returns:
        The fitted ``KernelDensity`` estimator, so callers can evaluate
        the density afterwards.
    """
    plt.figure(figsize=(10, 7))

    # Evaluation grid spanning the observed range, shaped as a column vector.
    padding = 0
    lower = df_lunch.min() - padding
    upper = df_lunch.max() + padding
    grid = np.linspace(lower, upper, 1000)[:, np.newaxis]

    # Normalized histogram of the raw samples.
    plt.hist(
        df_lunch,
        bins=200,
        density=True,
        label='LTV',
        color='blue',
        alpha=0.2,
    )

    # Fit the Gaussian kernel density estimate on the samples.
    estimator = KernelDensity(kernel='gaussian', bandwidth=0.00187)
    estimator.fit(df_lunch)

    # The scores are exponentiated before being drawn as the density curve.
    density = np.exp(estimator.score_samples(grid))
    plt.plot(grid, density, color='orange')

    # Mark the interval of interest.
    plt.axvline(x=x_start, color='red', linestyle='dashed')
    plt.axvline(x=x_end, linestyle='dashed')

    # Axis labels, legend, and display.
    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()

    gc.collect()
    return estimator
# Fit and plot the density of the LTV column (reshaped to a single column),
# marking the interval [0, 0.01]; keep the fitted estimator for later use.
kd_lunch = plot_prob_density(
    final_df['LTV'].values.reshape(-1, 1),
    'LTV',
    x_start=0,
    x_end=0.01,
)

然后找到这样的概率：

def get_probability(start_value,end_value,eval_points,kd):
    """Approximate the probability mass of a fitted density on an interval.

    The density is evaluated on ``eval_points`` evenly spaced values between
    ``start_value`` and ``end_value`` (both included) and integrated with the
    trapezoidal rule.

    Args:
        start_value: Lower bound of the interval.
        end_value: Upper bound of the interval.
        eval_points: Number of evaluation points; must be at least 2.
        kd: Fitted estimator exposing ``score_samples(x)``, which is expected
            to return the log-density for each row of the column vector ``x``.

    Returns:
        The approximate integral of the density over the interval, rounded
        to 4 decimal places.

    Raises:
        ValueError: If ``eval_points`` is smaller than 2.
    """
    N = eval_points
    if N < 2:
        raise ValueError("eval_points must be at least 2")

    step = (end_value - start_value) / (N - 1)  # Spacing between grid points

    # The grid must stop at end_value; N is the number of samples, not the stop.
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]
    kd_vals = np.exp(np.asarray(kd.score_samples(x)).ravel())  # PDF at each x

    # Trapezoidal rule: the two end points carry half weight, so N points
    # cover exactly (N - 1) steps instead of over-counting by one step.
    probability = step * (np.sum(kd_vals) - 0.5 * (kd_vals[0] + kd_vals[-1]))
    return np.round(probability, 4)


# Report the estimated probability that LTV falls inside [0, 0.01]; the label
# now matches the interval that is actually integrated below.
print('Probability of LTV in [0, 0.01]: {}\n'
      .format(get_probability(start_value = 0,end_value = 0.01,eval_points = 100,kd = kd_lunch)))

但是，这种方法不能产生我们想要的适当的PDF值。对于其他方法的任何建议将不胜感激。

地点：

解决方法

我在工作中使用了或多或少相似的脚本，这里是我的脚本，可能对你有帮助。

import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
# NOTE(review): `beta_95` is not defined anywhere in this snippet; it is
# presumably a collection whose first element holds the samples to analyse.
# TODO confirm against the original script.
data1 = beta_95[0]

def plot_prob_density(data1,x_start,x_end):
    """Plot a normalized histogram of ``data1`` with a Gaussian KDE overlay.

    The density is evaluated on a fixed grid over [-20, 20]. Fixed reference
    lines are drawn at x=9.95 and x=1.9, and a red line at ``x_end``.

    Args:
        data1: Samples to plot and to fit; passed unchanged to ``plt.hist``
            and ``KernelDensity.fit``.
        x_start: Accepted for interface compatibility; not used by the plot.
        x_end: x position of the red vertical marker.

    Returns:
        The fitted ``KernelDensity`` estimator.
    """
    plt.figure(figsize = (4,3.5))

    # Fixed evaluation grid, shaped as a column vector for score_samples.
    x = np.linspace(-20,20,1000)[:,np.newaxis]

    # Plot the data using a normalized histogram
    plt.hist(data1,bins=np.linspace(-20,40),density=True,color='r',alpha=0.4)

    # Do kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian',bandwidth=1.8).fit(data1)

    # Plot the estimated density (the scores are exponentiated first)
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))

    plt.plot(x,kd_vals_data1,label='$N_a$',linewidth = 2)

    plt.axvline(x=9.95,color='green',linestyle='dashed',linewidth = 2.0,label='$β_o$')
    plt.axvline(x=1.9,color='black',label='$β_b$')

    # Raw string: same label text, but `\%` no longer triggers Python's
    # invalid-escape-sequence warning.
    plt.axvline(x=x_end,color='red',linewidth = 2,label=r'$β_{95\%}$')

    # Show the plots
    plt.xlabel('Beta',fontsize=10)
    plt.ylabel('Probability Density',fontsize=10)
    plt.title('02 hours window',fontsize=12)
    plt.xlim(-20,20)
    plt.ylim(0,0.3)
    plt.yticks([0,0.1,0.2,0.3])
    plt.legend(fontsize=12,loc='upper left',frameon=False)
    plt.show()
    gc.collect()
    return kd_data1

def get_probability(start_value,end_value,eval_points,kd):
    """Approximate the probability mass of a fitted density on an interval.

    The density is evaluated on ``eval_points`` evenly spaced values between
    ``start_value`` and ``end_value`` (both included) and integrated with the
    trapezoidal rule.

    Args:
        start_value: Lower bound of the interval.
        end_value: Upper bound of the interval.
        eval_points: Number of evaluation points; must be at least 2.
        kd: Fitted estimator exposing ``score_samples(x)``, which is expected
            to return the log-density for each row of the column vector ``x``.

    Returns:
        The approximate integral of the density over the interval, rounded
        to 4 decimal places.

    Raises:
        ValueError: If ``eval_points`` is smaller than 2.
    """
    N = eval_points
    if N < 2:
        raise ValueError("eval_points must be at least 2")

    step = (end_value - start_value) / (N - 1)  # Spacing between grid points

    # The grid must stop at end_value; N is the number of samples, not the stop.
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]
    kd_vals = np.exp(np.asarray(kd.score_samples(x)).ravel())  # PDF at each x

    # Trapezoidal rule: the two end points carry half weight, so N points
    # cover exactly (N - 1) steps instead of over-counting by one step.
    probability = step * (np.sum(kd_vals) - 0.5 * (kd_vals[0] + kd_vals[-1]))
    return np.round(probability, 4)

# Reshape the samples into a single-column 2-D array, the layout handed to
# KernelDensity.fit inside plot_prob_density.
data1 = np.array(data1).reshape(-1, 1)

# Fit and plot the density, keeping the fitted estimator.
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)

# Integrate the fitted density over [-10, 13] and report the result.
print(
    'Beta-95%: {}\n'.format(
        get_probability(start_value=-10, end_value=13, eval_points=1000, kd=kd_data1)
    )
)