Saving Concatenated Awkward Arrays with Different Fields


So I have been working with some data that includes both electrons and muons. For this analysis I have to find opposite-sign, same-flavour dilepton pairs, i.e. e+e- or mu+mu-. However, I run into problems after I find these pairs, concatenate them into a single dilepton array, and then try to save the resulting array to a Parquet file. Note that I am using Awkward 1.10.3.

A simple example shows the problem that I am running into:

import awkward as ak
import vector
vector.register_awkward()


elec = ak.Array([[{'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True},
                  {'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True}]])

mu = ak.Array([[{'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True},
                {'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True}]])

electrons_4V = ak.Array(elec, with_name="Momentum4D")
ee_pairs = ak.combinations(electrons_4V, 2, fields=["LeadLepton", "SubleadLepton"])

muons_4V = ak.Array(mu, with_name="Momentum4D")
mm_pairs = ak.combinations(muons_4V, 2, fields=["LeadLepton", "SubleadLepton"])

dileptons = ak.concatenate([ee_pairs, mm_pairs], axis=1)
dileptons['Dilepton'] = dileptons.LeadLepton + dileptons.SubleadLepton

ak.to_parquet(dileptons, 'test.parquet')

This then leads to the error:

---------------------------------------------------------------------------
ArrowNotImplementedError                  Traceback (most recent call last)
Cell In[1], line 20
     17 dileptons = ak.concatenate([ee_pairs, mm_pairs], axis = 1)
     18 dileptons['Dilepton'] = dileptons.LeadLepton + dileptons.SubleadLepton
---> 20 ak.to_parquet(dileptons, 'test.parquet')

File ~/miniconda3/envs/idm/lib/python3.10/site-packages/awkward/operations/ak_to_parquet.py:295, in to_parquet(array, destination, list_to32, string_to32, bytestring_to32, emptyarray_to, categorical_as_dictionary, extensionarray, count_nulls, compression, compression_level, row_group_size, data_page_size, parquet_flavor, parquet_version, parquet_page_version, parquet_metadata_statistics, parquet_dictionary_encoding, parquet_byte_stream_split, parquet_coerce_timestamps, parquet_old_int96_timestamps, parquet_compliant_nested, parquet_extra_options, storage_options)
    293 fs, destination = fsspec.core.url_to_fs(destination, **(storage_options or {}))
    294 metalist = []
--> 295 with pyarrow_parquet.ParquetWriter(
    296     destination,
    297     table.schema,
    298     filesystem=fs,
    299     flavor=parquet_flavor,
    300     version=parquet_version,
    301     use_dictionary=parquet_dictionary_encoding,
    302     compression=compression,
    303     write_statistics=parquet_metadata_statistics,
    304     use_deprecated_int96_timestamps=parquet_old_int96_timestamps,
    305     compression_level=compression_level,
    306     use_byte_stream_split=parquet_byte_stream_split,
    307     data_page_version=parquet_page_version,
    308     use_compliant_nested_type=parquet_compliant_nested,
    309     data_page_size=data_page_size,
    310     coerce_timestamps=parquet_coerce_timestamps,
    311     metadata_collector=metalist,
    312     **parquet_extra_options,
    313 ) as writer:
    314     writer.write_table(table, row_group_size=row_group_size)
    315 meta = metalist[0]

File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/parquet/core.py:1001, in ParquetWriter.__init__(self, where, schema, filesystem, flavor, version, use_dictionary, compression, write_statistics, use_deprecated_int96_timestamps, compression_level, use_byte_stream_split, column_encoding, writer_engine_version, data_page_version, use_compliant_nested_type, encryption_properties, write_batch_size, dictionary_pagesize_limit, store_schema, **options)
    999 self._metadata_collector = options.pop('metadata_collector', None)
   1000 engine_version = 'V2'
-> 1001 self.writer = _parquet.ParquetWriter(
   1002     sink, schema,
   1003     version=version,
   1004     compression=compression,
   1005     use_dictionary=use_dictionary,
   1006     write_statistics=write_statistics,
   1007     use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
   1008     compression_level=compression_level,
   1009     use_byte_stream_split=use_byte_stream_split,
   1010     column_encoding=column_encoding,
   1011     writer_engine_version=engine_version,
   1012     data_page_version=data_page_version,
   1013     use_compliant_nested_type=use_compliant_nested_type,
   1014     encryption_properties=encryption_properties,
   1015     write_batch_size=write_batch_size,
   1016     dictionary_pagesize_limit=dictionary_pagesize_limit,
   1017     store_schema=store_schema,
   1018     **options)
   1019 self.is_open = True

File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/_parquet.pyx:1754, in pyarrow._parquet.ParquetWriter.__cinit__()

File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/error.pxi:121, in pyarrow.lib.check_status()

ArrowNotImplementedError: Unhandled type for Arrow to Parquet schema conversion: dense_union<0: extension> not null=0, 1: extension> not null=1>
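
For what it's worth, printing the type of the concatenated array confirms that the two different record layouts have been merged into a union:

# Shows a union of two record types (one with MVAid, one with tightId):
print(ak.type(dileptons))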

This seems like it could be an Arrow issue rather than an Awkward one. It also seems that the root of the problem is that the elec and mu arrays have different fields (MVAid for the electrons, tightId for the muons), which makes dileptons a union array. I'm unsure whether there is any way forward to get this to save, or whether I should just keep the two flavours separated.
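
The only workaround I have found so far is to keep just the fields that both collections share before building the pairs, so the concatenation no longer produces a union. A sketch of that (it drops the MVAid/tightId flags, which I would rather keep):

import awkward as ak
import vector
vector.register_awkward()

# Select only the fields present in both collections, so the two
# record types become identical and the concatenation is not a union:
common = [f for f in ak.fields(elec) if f in ak.fields(mu)]
electrons_4V = ak.Array(elec[common], with_name="Momentum4D")
muons_4V = ak.Array(mu[common], with_name="Momentum4D")

ee_pairs = ak.combinations(electrons_4V, 2, fields=["LeadLepton", "SubleadLepton"])
mm_pairs = ak.combinations(muons_4V, 2, fields=["LeadLepton", "SubleadLepton"])

# Both inputs now have the same record type, so this writes fine:
dileptons = ak.concatenate([ee_pairs, mm_pairs], axis=1)
ak.to_parquet(dileptons, 'test.parquet')

Going the other way and padding each collection with the field it is missing (e.g. with ak.with_field) should presumably also avoid the union, but inventing values for flags that were never measured feels wrong.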

Also, if I don't include any of the vector-related code, I still get the same error:

import awkward as ak

elec = ak.Array([[{'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True},
                  {'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True}]])

mu = ak.Array([[{'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True},
                {'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True}]])

ee_pairs = ak.combinations(elec, 2, fields=["LeadLepton", "SubleadLepton"])
mm_pairs = ak.combinations(mu, 2, fields=["LeadLepton", "SubleadLepton"])

dileptons = ak.concatenate([ee_pairs, mm_pairs], axis=1)

ak.to_parquet(dileptons, 'test.parquet')
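
For now my fallback is to keep the two flavours separated and write each pair collection to its own file, which works without any problem:

# Writing each flavour to its own Parquet file avoids the union entirely:
ak.to_parquet(ee_pairs, 'ee_pairs.parquet')
ak.to_parquet(mm_pairs, 'mm_pairs.parquet')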

Thanks!
