How can I solve a DispatchError when using `to_widgets()`, `to_file()`, etc. on a ProfileReport from ydata-profiling?

371 views

I am trying to execute the following code in a notebook on databricks:

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2019-01.csv.gz")

So far, so good, the df populates ok...

Then things go wrong:

report = ProfileReport(df, title="Pandas Profiling Report")
report_html = report.to_html()  # --> causes the DispatchError
displayHTML(report_html)

Has anybody had a similar problem, and does anyone know how to solve this?

Thanks in advance for your response.

Some details on the dispatch error:

---------------------------------------------------------------------------
DispatchError                             Traceback (most recent call last)
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
    327         try:
--> 328             return func(*args, **kwargs)
    329         except TypeError as ex:

/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py in spark_describe_1d(config, series, summarizer, typeset)
     40         # Infer variable types
---> 41         vtype = typeset.infer_type(series)
     42         series = typeset.cast_to_inferred(series)

/databricks/python/lib/python3.8/site-packages/visions/typesets/typeset.py in infer_type(self, data)
    310         """
--> 311         _, paths, _ = self.infer(data)
    312         return get_type_from_path(paths)

/databricks/python/lib/python3.8/site-packages/visions/typesets/typeset.py in infer(self, data)
    298         """
--> 299         return traverse_graph(data, self.root_node, self.relation_graph)
    300 

/usr/lib/python3.8/functools.py in wrapper(*args, **kw)
    874 
--> 875         return dispatch(args[0].__class__)(*args, **kw)
    876 

/databricks/python/lib/python3.8/site-packages/visions/backends/spark/traversal.py in _traverse_graph_spark_dataframe(df, root_node, graph)
     16 ) -> Tuple[DataFrame, Dict[str, List[T]], Dict[str, dict]]:
---> 17     inferred_values = {
     18         col: traverse_graph_with_series(root_node, df.select(col), graph)

/databricks/python/lib/python3.8/site-packages/visions/backends/spark/traversal.py in <dictcomp>(.0)
     17     inferred_values = {
---> 18         col: traverse_graph_with_series(root_node, df.select(col), graph)
     19         for col in df.columns

/databricks/python/lib/python3.8/site-packages/visions/typesets/typeset.py in traverse_graph_with_series(base_type, series, graph, path, state)
    145         relation = graph[base_type][vision_type]["relationship"]
--> 146         if relation.is_relation(series, state):
    147             series = relation.transform(series, state)

/databricks/python/lib/python3.8/site-packages/visions/relations/relations.py in is_relation(self, series, state)
     59             state = {}
---> 60         return self.relationship(series, state)
     61 

/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
    325             self.evaluate()
--> 326         func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
    327         try:

/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __missing__(self, types)
    319         msg = f"{self.__name__}: {len(keys)} methods found"  # type: ignore
--> 320         raise DispatchError(msg, types, keys)
    321 

DispatchError: ('contains_op: 0 methods found', (<class 'pyspark.sql.dataframe.DataFrame'>, <class 'dict'>), [])

The above exception was the direct cause of the following exception:

DispatchError                             Traceback (most recent call last)
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
    327         try:
--> 328             return func(*args, **kwargs)
    329         except TypeError as ex:

/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py in spark_get_series_descriptions(config, df, summarizer, typeset, pbar)
     91     with multiprocessing.pool.ThreadPool(12) as executor:
---> 92         for i, (column, description) in enumerate(
     93             executor.imap_unordered(multiprocess_1d, args)

/usr/lib/python3.8/multiprocessing/pool.py in next(self, timeout)
    867             return value
--> 868         raise value
    869 

/usr/lib/python3.8/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
    124         try:
--> 125             result = (True, func(*args, **kwds))
    126         except Exception as e:

/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py in multiprocess_1d(args)
     87         column, df = args
---> 88         return column, describe_1d(config, df.select(column), summarizer, typeset)
     89 

/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
    329         except TypeError as ex:
--> 330             raise DispatchError(f"Function {func.__code__}") from ex
    331 

DispatchError: Function <code object spark_describe_1d at 0x7fea05eebbe0, file "/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py", line 16>

The above exception was the direct cause of the following exception:

DispatchError                             Traceback (most recent call last)
<command-4004046284364597> in <module>
      1 df.printSchema()
      2 report = ProfileReport(df, title="Pandas Profiling Report")
----> 3 report.to_widgets()
      4 #report.to_file('profile.html')

/databricks/python/lib/python3.8/site-packages/ydata_profiling/profile_report.py in to_widgets(self)
    514         from IPython.core.display import display
    515 
--> 516         display(self.widgets)
    517 
    518     def _repr_html_(self) -> None:


DispatchError: Function <code object spark_get_series_descriptions at 0x7fea05eebea0, file "/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py", line 67>
---------------------------------------------------------------------------
DispatchError                             Traceback (most recent call last)
/databricks/python/lib/python3.8/site-packages/ydata_profiling/profile_report.py in widgets(self)
    287     def widgets(self) -> Any:
    288         if (
--> 289             isinstance(self.description_set.table["n"], list)
    290             and len(self.description_set.table["n"]) > 1
    291         ):

/databricks/python/lib/python3.8/site-packages/ydata_profiling/profile_report.py in description_set(self)
    251     def description_set(self) -> BaseDescription:
    252         if self._description_set is None:
--> 253             self._description_set = describe_df(
    254                 self.config,
    255                 self.df,

/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/describe.py in describe(config, df, summarizer, typeset, sample)
     72         # Variable-specific
     73         pbar.total += len(df.columns)
---> 74         series_description = get_series_descriptions(
     75             config, df, summarizer, typeset, pbar
     76         )

/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
    328             return func(*args, **kwargs)
    329         except TypeError as ex:
--> 330             raise DispatchError(f"Function {func.__code__}") from ex
    331 
    332     def evaluate(self):

DispatchError: Function <code object spark_get_series_descriptions at 0x7fea05eebea0, file "/databricks/python/lib/python3.8/site-packages/ydata_profiling/model/spark/summary_spark.py", line 67>

Any idea what is going wrong here, or what I should do?

1

There is 1 answer

0
Gaurang On

Removing the 'title' parameter worked for me.

Replace

report = ProfileReport(df, title="Pandas Profiling Report") 

with

report = ProfileReport(df)