如何解决无法在 Databricks 上运行 Pandas Profiling 的问题
我正在尝试在 Databricks 环境中对一个示例数据框(DataFrame)运行 Pandas Profiling,但遇到了与 matplotlib 相关的错误,不确定这个问题是出在 Matplotlib 还是 pandas-profiling 上。任何帮助将不胜感激。
Databricks 运行时配置: 7.4 ML(包括 Apache Spark 3.0.1、Scala 2.12)
安装方式及运行的代码如下:
# Install pandas-profiling with its "notebook" extras from inside the notebook
# (the leading "!" is notebook shell-escape syntax, not plain Python).
!pip install pandas-profiling[notebook]
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
# NOTE(review): `df` is not defined in this snippet — presumably the sample
# DataFrame was created earlier in the notebook; confirm before re-running.
profile = ProfileReport(df,title='EDA Report',explorative=True)
# Write the report as an HTML file under the mounted DBFS path. Per the
# traceback in this post, the failure is raised from inside this to_file() call.
profile.to_file("/dbfs/mnt/sb2/EDA_Reports/EDA.html")
错误日志跟踪
Summarize dataset: 93%|█████████▎| 106/114 [11:30<07:27,55.91s/it,Calculate cramers correlation]/databricks/python/lib/python3.7/site-packages/pandas_profiling/model/correlations.py:139: UserWarning: There was an attempt to calculate the cramers correlation,but this failed.
To hide this warning,disable the calculation
(using `df.profile_report(correlations={"cramers": {"calculate": False}})`
If this is problematic for your use case,please report this as an issue:
https://github.com/pandas-profiling/pandas-profiling/issues
(include the error message: 'No data; `observed` has size 0.')
(include the error message: '{error}')"""
Summarize dataset: 94%|█████████▍| 107/114 [11:56<00:46,6.69s/it,Get scatter matrix]
RuntimeError: "/databricks/python/lib/python3.7/site-packages/matplotlib/mpl-data" should be a path but it does not exist
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/context.py in manage_matplotlib_context()
79 sns.set_style(style="white")
---> 80 yield
81 finally:
/databricks/python/lib/python3.7/contextlib.py in inner(*args,**kwds)
73 with self._recreate_cm():
---> 74 return func(*args,**kwds)
75 return inner
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/plot.py in scatter_pairwise(series1,series2,x_label,y_label)
276 plt.scatter(series1,color=color)
--> 277 return plot_360_n0sc0pe(plt)
278
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/utils.py in plot_360_n0sc0pe(plt,image_format,attempts)
67 image_str = StringIO()
---> 68 plt.savefig(image_str,format=image_format)
69 image_str.seek(0)
/databricks/python/lib/python3.7/site-packages/matplotlib/pyplot.py in savefig(*args,**kwargs)
/databricks/python/lib/python3.7/site-packages/matplotlib/figure.py in savefig(self,fname,transparent,**kwargs)
/databricks/python/lib/python3.7/site-packages/matplotlib/backend_bases.py in print_figure(self,filename,dpi,facecolor,edgecolor,orientation,format,bbox_inches,**kwargs)
/databricks/python/lib/python3.7/site-packages/matplotlib/backend_bases.py in _get_output_canvas(self,fmt)
/databricks/python/lib/python3.7/site-packages/matplotlib/backend_bases.py in get_registered_canvas_class(format)
/databricks/python/lib/python3.7/importlib/__init__.py in import_module(name,package)
126 level += 1
--> 127 return _bootstrap._gcd_import(name[level:],package,level)
128
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _gcd_import(name,level)
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _find_and_load(name,import_)
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _find_and_load_unlocked(name,import_)
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _find_spec(name,path,target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in find_spec(cls,fullname,target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in _get_spec(cls,target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in find_spec(self,target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in _fill_cache(self)
OSError: [Errno 116] Stale file handle: '/databricks/python/lib/python3.7/site-packages/matplotlib/backends'
During handling of the above exception,another exception occurred:
RuntimeError Traceback (most recent call last)
<command-3404575914441933> in <module>
1 profile = ProfileReport(df,explorative=True)
----> 2 profile.to_file("/dbfs/mnt/sb2/naga/dataset/EDA_Reports/Digital_HO_New_Features_EDA.html")
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in to_file(self,output_file,silent)
272 create_html_assets(output_file)
273
--> 274 data = self.to_html()
275
276 if output_file.suffix != ".html":
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in to_html(self)
376
377 """
--> 378 return self.html
379
380 def to_json(self) -> str:
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in html(self)
195 def html(self):
196 if self._html is None:
--> 197 self._html = self._render_html()
198 return self._html
199
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in _render_html(self)
302 from pandas_profiling.report.presentation.flavours import HTMLReport
303
--> 304 report = self.report
305
306 disable_progress_bar = not config["progress_bar"].get(bool)
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in report(self)
189 def report(self):
190 if self._report is None:
--> 191 self._report = get_report_structure(self.description_set)
192 return self._report
193
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in description_set(self)
169 if self._description_set is None:
170 self._description_set = describe_df(
--> 171 self.title,self.df,self.summarizer,self.typeset,self._sample
172 )
173 return self._description_set
/databricks/python/lib/python3.7/site-packages/pandas_profiling/model/describe.py in describe(title,df,summarizer,typeset,sample)
105 # Scatter matrix
106 pbar.set_postfix_str("Get scatter matrix")
--> 107 scatter_matrix = get_scatter_matrix(df,interval_columns)
108 pbar.update()
109
/databricks/python/lib/python3.7/site-packages/pandas_profiling/model/summary.py in get_scatter_matrix(df,continuous_variables)
283 df_temp = df[[x,y]].dropna()
284 scatter_matrix[x][y] = scatter_pairwise(
--> 285 df_temp[x],df_temp[y],x,y
286 )
287 else:
/databricks/python/lib/python3.7/contextlib.py in inner(*args,**kwds)
72 def inner(*args,**kwds):
73 with self._recreate_cm():
---> 74 return func(*args,**kwds)
75 return inner
76
/databricks/python/lib/python3.7/contextlib.py in __exit__(self,type,value,traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type,value,traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/context.py in manage_matplotlib_context()
83 with warnings.catch_warnings():
84 warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
---> 85 matplotlib.rcParams.update(originalRcParams) # revert to original rcParams
/databricks/python/lib/python3.7/_collections_abc.py in update(*args,**kwds)
839 if isinstance(other,Mapping):
840 for key in other:
--> 841 self[key] = other[key]
842 elif hasattr(other,"keys"):
843 for key in other.keys():
/databricks/python/lib/python3.7/site-packages/matplotlib/__init__.py in __setitem__(self,key,val)
/databricks/python/lib/python3.7/site-packages/matplotlib/rcsetup.py in validate_path_exists(s)
RuntimeError: "/databricks/python/lib/python3.7/site-packages/matplotlib/mpl-data" should be a path but it does not exist
解决方法
以下代码应该可以在 Databricks 上正常运行:
# Install pandas-profiling.
# NOTE(review): a bare "pip install" line is not valid Python; presumably it is
# meant to run in its own notebook cell (e.g. as a pip magic) — confirm.
pip install pandas-profiling
# Import pandas and the ProfileReport entry point.
import pandas as pd
import pandas_profiling
from pandas_profiling import ProfileReport
# Small sample dataset: column name -> {row index -> value}.
dct = {'ID': {0: 23,1: 43,2: 12,3: 13,4: 67,5: 89,6: 90,7: 56,8: 34},'Name': {0: 'Ram',1: 'Deep',2: 'Yash',3: 'Aman',4: 'Arjun',5: 'Aditya',6: 'Divya',7: 'Chalsea',8: 'Akash' },'Marks': {0: 89,1: 97,2: 45,3: 78,4: 56,5: 76,6: 100,7: 87,8: 81},'Grade': {0: 'B',1: 'A',2: 'F',3: 'C',4: 'E',5: 'C',6: 'A',7: 'B',8: 'B'}
}
# Build the DataFrame from the dictionary and print it.
data = pd.DataFrame(dct)
print(data)
# Build the profile report for the sample DataFrame.
# Saving to a file is left disabled here; the report is rendered inline instead.
profile = ProfileReport(data)
# Disabled alternative: profile.to_file("/dbfs/tmp/output.html")
# Disabled alternative: text_raw = profile.to_html()
# Render the report to an HTML string and display it in the notebook output.
# NOTE(review): displayHTML is not defined in this snippet — presumably a
# built-in provided by the Databricks notebook environment; confirm.
p = profile.to_html()
displayHTML(p)
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。