I am trying to execute the following code in a notebook on databricks:
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2019-01.csv.gz")
)
So far, so good: the df populates fine.
Then things go wrong:
from ydata_profiling import ProfileReport

report = ProfileReport(df, title="Pandas Profiling Report")
report_html = report.to_html()  # --> this call raises the DispatchError
displayHTML(report_html)
Has anybody had a similar problem, and does anyone know how to solve it?
Thanks in advance for your response.
Some details on the dispatch error:
---------------------------------------------------------------------------
DispatchError Traceback (most recent call last)
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
327 try:
--> 328 return func(*args, **kwargs)
329 except TypeError as ex:
/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py in spark_describe_1d(config, series, summarizer, typeset)
40 # Infer variable types
---> 41 vtype = typeset.infer_type(series)
42 series = typeset.cast_to_inferred(series)
/databricks/python/lib/python3.8/site-packages/visions/typesets/typeset.py in infer_type(self, data)
310 """
--> 311 _, paths, _ = self.infer(data)
312 return get_type_from_path(paths)
/databricks/python/lib/python3.8/site-packages/visions/typesets/typeset.py in infer(self, data)
298 """
--> 299 return traverse_graph(data, self.root_node, self.relation_graph)
300
/usr/lib/python3.8/functools.py in wrapper(*args, **kw)
874
--> 875 return dispatch(args[0].__class__)(*args, **kw)
876
/databricks/python/lib/python3.8/site-packages/visions/backends/spark/traversal.py in _traverse_graph_spark_dataframe(df, root_node, graph)
16 ) -> Tuple[DataFrame, Dict[str, List[T]], Dict[str, dict]]:
---> 17 inferred_values = {
18 col: traverse_graph_with_series(root_node, df.select(col), graph)
/databricks/python/lib/python3.8/site-packages/visions/backends/spark/traversal.py in <dictcomp>(.0)
17 inferred_values = {
---> 18 col: traverse_graph_with_series(root_node, df.select(col), graph)
19 for col in df.columns
/databricks/python/lib/python3.8/site-packages/visions/typesets/typeset.py in traverse_graph_with_series(base_type, series, graph, path, state)
145 relation = graph[base_type][vision_type]["relationship"]
--> 146 if relation.is_relation(series, state):
147 series = relation.transform(series, state)
/databricks/python/lib/python3.8/site-packages/visions/relations/relations.py in is_relation(self, series, state)
59 state = {}
---> 60 return self.relationship(series, state)
61
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
325 self.evaluate()
--> 326 func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
327 try:
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __missing__(self, types)
319 msg = f"{self.__name__}: {len(keys)} methods found" # type: ignore
--> 320 raise DispatchError(msg, types, keys)
321
DispatchError: ('contains_op: 0 methods found', (<class 'pyspark.sql.dataframe.DataFrame'>, <class 'dict'>), [])
The above exception was the direct cause of the following exception:
DispatchError Traceback (most recent call last)
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
327 try:
--> 328 return func(*args, **kwargs)
329 except TypeError as ex:
/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py in spark_get_series_descriptions(config, df, summarizer, typeset, pbar)
91 with multiprocessing.pool.ThreadPool(12) as executor:
---> 92 for i, (column, description) in enumerate(
93 executor.imap_unordered(multiprocess_1d, args)
/usr/lib/python3.8/multiprocessing/pool.py in next(self, timeout)
867 return value
--> 868 raise value
869
/usr/lib/python3.8/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py in multiprocess_1d(args)
87 column, df = args
---> 88 return column, describe_1d(config, df.select(column), summarizer, typeset)
89
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
329 except TypeError as ex:
--> 330 raise DispatchError(f"Function {func.__code__}") from ex
331
DispatchError: Function <code object spark_describe_1d at 0x7fea05eebbe0, file "/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py", line 16>
The above exception was the direct cause of the following exception:
DispatchError Traceback (most recent call last)
<command-4004046284364597> in <module>
1 df.printSchema()
2 report = ProfileReport(df, title="Pandas Profiling Report")
----> 3 report.to_widgets()
4 #report.to_file('profile.html')
/databricks/python/lib/python3.8/site-packages/ydata_profiling/profile_report.py in to_widgets(self)
514 from IPython.core.display import display
515
--> 516 display(self.widgets)
517
518 def _repr_html_(self) -> None:
DispatchError: Function <code object spark_get_series_descriptions at 0x7fea05eebea0, file "/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py", line 67>
---------------------------------------------------------------------------
DispatchError Traceback (most recent call last)
/databricks/python/lib/python3.8/site-packages/ydata_profiling/profile_report.py in widgets(self)
287 def widgets(self) -> Any:
288 if (
--> 289 isinstance(self.description_set.table["n"], list)
290 and len(self.description_set.table["n"]) > 1
291 ):
/databricks/python/lib/python3.8/site-packages/ydata_profiling/profile_report.py in description_set(self)
251 def description_set(self) -> BaseDescription:
252 if self._description_set is None:
--> 253 self._description_set = describe_df(
254 self.config,
255 self.df,
/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/describe.py in describe(config, df, summarizer, typeset, sample)
72 # Variable-specific
73 pbar.total += len(df.columns)
---> 74 series_description = get_series_descriptions(
75 config, df, summarizer, typeset, pbar
76 )
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
328 return func(*args, **kwargs)
329 except TypeError as ex:
--> 330 raise DispatchError(f"Function {func.__code__}") from ex
331
332 def evaluate(self):
DispatchError: Function <code object spark_get_series_descriptions at 0x7fea05eebea0, file "/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py", line 67>
Any idea what is going wrong here, or what I should do?
Removing the 'title' parameter worked for me.
Replace
report = ProfileReport(df, title="Pandas Profiling Report")
with
report = ProfileReport(df)
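For reference, here is a minimal sketch of the full sequence with the title argument dropped, reusing the path and calls from the original post (untested on my side, so treat it as a sketch of the workaround rather than a confirmed fix):

from ydata_profiling import ProfileReport

df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2019-01.csv.gz")
)

# Build the profile without the title keyword, per the workaround above
report = ProfileReport(df)
report_html = report.to_html()
displayHTML(report_html)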