Extracting data from HDF5 files to Pandas Dataframe


First of all, thank you for the space.

I came here looking for help because I need to extract data from multiple .HDF5 files that are in a folder and combine them into a single Pandas dataframe. The columns are ['H1', 'L1', 'frequency_Hz'].

I shared the files on my Google Drive so they are easy to access: https://drive.google.com/drive/folders/1GwocMZeqZGyikZYgwvGvnnsVNn0B7aNW?usp=sharing

My code and my reasoning are below, but it raises the error shown after it. I appreciate any help!

# Imports
import glob
import h5py
import numpy as np
import pandas as pd

# Create a list to store the DataFrames of each HDF5 file:
dfs = []

# Get the list of training HDF5 files using glob:
arquivos_hdf5_treino = glob.glob('../Data/OriginalDatasets/train/*.hdf5')

# View the file list:
arquivos_hdf5_treino
['../Data/OriginalDatasets/train\\001121a05.hdf5',
 '../Data/OriginalDatasets/train\\00a6db666.hdf5',
 '../Data/OriginalDatasets/train\\00f36a6ac.hdf5',
 '../Data/OriginalDatasets/train\\0197bacf8.hdf5',
 '../Data/OriginalDatasets/train\\01b8b67f3.hdf5',
 '../Data/OriginalDatasets/train\\01dba9731.hdf5',
 '../Data/OriginalDatasets/train\\02887d232.hdf5',
 '../Data/OriginalDatasets/train\\02c8f43f3.hdf5',
 '../Data/OriginalDatasets/train\\0367dc82c.hdf5',
 '../Data/OriginalDatasets/train\\0517ef7fe.hdf5',
 '../Data/OriginalDatasets/train\\05c0675fe.hdf5',
 '../Data/OriginalDatasets/train\\05cdc0769.hdf5',
 '../Data/OriginalDatasets/train\\05f0aef12.hdf5',
 '../Data/OriginalDatasets/train\\067b3fb4b.hdf5',
 '../Data/OriginalDatasets/train\\06e321c6e.hdf5',
 '../Data/OriginalDatasets/train\\08a060dad.hdf5',
 '../Data/OriginalDatasets/train\\08c444d66.hdf5',
 '../Data/OriginalDatasets/train\\0920a4276.hdf5',
 '../Data/OriginalDatasets/train\\09531cde3.hdf5',
 '../Data/OriginalDatasets/train\\097370861.hdf5',
 '../Data/OriginalDatasets/train\\09e55aeba.hdf5',
 '../Data/OriginalDatasets/train\\09ecddbba.hdf5',
 '../Data/OriginalDatasets/train\\0ba188c57.hdf5',
 '../Data/OriginalDatasets/train\\0bc8216f2.hdf5',
 '../Data/OriginalDatasets/train\\0c55d030c.hdf5',
 '../Data/OriginalDatasets/train\\0d0ad0b19.hdf5',
 '../Data/OriginalDatasets/train\\0dc4c8ed0.hdf5',
 '../Data/OriginalDatasets/train\\0e39a18bf.hdf5',
 '../Data/OriginalDatasets/train\\0e60d4893.hdf5',
 '../Data/OriginalDatasets/train\\0e66d0460.hdf5',
 '../Data/OriginalDatasets/train\\0eb30f7c4.hdf5',
 '../Data/OriginalDatasets/train\\0ebe28dd5.hdf5',
 '../Data/OriginalDatasets/train\\0f53d8b96.hdf5',
 '../Data/OriginalDatasets/train\\10dfa2ed6.hdf5',
 '../Data/OriginalDatasets/train\\10eaa1cb2.hdf5',
 '../Data/OriginalDatasets/train\\1185806d8.hdf5',
 '../Data/OriginalDatasets/train\\119610501.hdf5',
 '../Data/OriginalDatasets/train\\123594dc7.hdf5',
 '../Data/OriginalDatasets/train\\1282f6c1f.hdf5',
 '../Data/OriginalDatasets/train\\12f0fd6fd.hdf5',
 '../Data/OriginalDatasets/train\\12f9824fa.hdf5',
 '../Data/OriginalDatasets/train\\13a23148f.hdf5',
 '../Data/OriginalDatasets/train\\13df1746e.hdf5',
 '../Data/OriginalDatasets/train\\147cc5f92.hdf5',
 '../Data/OriginalDatasets/train\\1510f75f9.hdf5',
 '../Data/OriginalDatasets/train\\1523dcd0c.hdf5',
 '../Data/OriginalDatasets/train\\1607fd753.hdf5',
 '../Data/OriginalDatasets/train\\1748ad051.hdf5',
 '../Data/OriginalDatasets/train\\177d1a100.hdf5',
 '../Data/OriginalDatasets/train\\1796d0836.hdf5']


# Initializing the count of the number of hdf5 files:
numArquivo = 1

# Iterating over Training hdf5 files and extracting data:
for arquivo_hdf5 in arquivos_hdf5_treino:
    with h5py.File(arquivo_hdf5, 'r') as arquivo:
       
        # Printing the count of the number of hdf5 files on the screen:
        print(f'Arquivo {numArquivo}')

        # Creating the key list in HDF5 files:
        keyList = list(arquivo.keys())[0]

        # Creating the list of variables in HDF5 files:
        varList = list(arquivo[keyList])

        # Printing all datasets, also known as "keys":
        print(f'Chave em {arquivo_hdf5}: {keyList}')
           
        # Iterating over the datasets in the file:
        for key in arquivo.keys():
           
            # Printing the variables in the keys on the screen:
            print(f'Variáveis na chave {key}: {varList}')

            # Extracting the datasets:
            dados = arquivo[key][:]

            # Printing the dataset to the screen:
            print(f'Dados no conjunto de dados {key}: {dados}')

            # Converting data to a Pandas DataFrame:
            df = pd.DataFrame(dados)
                               
            # Adding the DataFrame to the list:
            dfs.append(df)
       
        # Printing a blank line on the screen:
        print()
       
        # Incrementing the number of files:
        numArquivo += 1

# Concatenating DataFrames into a single DataFrame:
resultado_final = pd.concat(dfs, ignore_index=True)

# Viewing the first lines:
print(resultado_final.head())

Arquivo 1
Chave em ../Data/OriginalDatasets/train\001121a05.hdf5: 001121a05
Variáveis na chave 001121a05: ['H1', 'L1', 'frequency_Hz']

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[7], line 27
     24 print(f'Variáveis na chave {key}: {varList}')
     26 # Extraindo os conjuntos de dados:
---> 27 dados = arquivo[key][:]
     29 # Imprimindo na tela o conjunto de dados:
     30 print(f'Dados no conjunto de dados {key}: {dados}')

File h5py\_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py\_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File c:\Opt\Anaconda3\Lib\site-packages\h5py\_hl\group.py:330, in Group.__getitem__(self, name)
    328     oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
    329 else:
--> 330     raise TypeError("Accessing a group is done with bytes or str, "
    331                     " not {}".format(type(name)))
    333 otype = h5i.get_type(oid)
    334 if otype == h5i.GROUP:

TypeError: Accessing a group is done with bytes or str,  not <class 'slice'> 


There are 2 answers

kcw78 (BEST ANSWER)

Let's start with the file schema. (You have to understand the schema first; then you can read the data correctly.) Be careful with keys: h5py uses dictionary syntax for HDF5 objects, so a key can refer to either a Dataset or a Group. Each key is the object's name and its value is the object. If you don't know the object type, you can test it with isinstance().
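For example, a minimal sketch of that test (the file name is one of the files from the question, assumed to be in the current directory):

import h5py

with h5py.File('001121a05.hdf5', 'r') as f:
    # Iterate over the root-level (name, object) pairs:
    for name, obj in f.items():
        if isinstance(obj, h5py.Dataset):
            print(f'{name} is a Dataset, shape {obj.shape}')
        elif isinstance(obj, h5py.Group):
            print(f'{name} is a Group with members: {list(obj.keys())}')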

You have a slight misunderstanding of the file schema. I checked 3 files, and each has this schema (you can verify it with the sketch after this list):

  • Top level Group with the same name as the file (e.g. 001121a05 for 001121a05.hdf5)
  • Under this group are 3 more objects: H1 is a Group, L1 is a Group, and frequency_Hz is a Dataset.
  • The H1 and L1 Groups have 2 datasets each, named SFTs and timestamps_GPS.
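To confirm the schema yourself, here is a minimal sketch that walks one file with Group.visititems() and prints every object (again using one of the question's file names for illustration):

import h5py

def print_item(name, obj):
    # visititems() calls this with each object's path-like name:
    if isinstance(obj, h5py.Dataset):
        print(f'{name}: Dataset, shape {obj.shape}')
    else:
        print(f'{name}: Group')

with h5py.File('001121a05.hdf5', 'r') as f:
    f.visititems(print_item)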

There are 2-3 issues to address:

  1. When you loop on arquivo.keys(), you are looping over the root-level objects (only a group named 001121a05). That's why you get the TypeError about accessing a group.
  2. Once you fix that and loop over the 2nd-level objects with arquivo[keyList].keys(), you will get another error when you try to read H1 and L1 as datasets (because they are groups).
  3. You will need more logic to read the data in the datasets under H1 and L1.

I modified your code to read the data in frequency_Hz and load it into a dataframe. It should point you in the right direction. If you want the H1 and L1 data, you will need to load [H1][SFTs] and [L1][SFTs] as appropriate; a minimal sketch of that follows.
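For example, a hedged sketch of reading one SFTs dataset (I have not verified the dtype; the file name is a placeholder for any training file):

import h5py

arquivo_hdf5 = '001121a05.hdf5'  # any one of the training files
with h5py.File(arquivo_hdf5, 'r') as arquivo:
    rootkey = list(arquivo.keys())[0]
    # 2-D array whose rows line up with frequency_Hz:
    h1_sfts = arquivo[rootkey]['H1/SFTs'][:]
    print(h1_sfts.shape, h1_sfts.dtype)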

I also made a few other minor changes to simplify the code (for example, I used glob.iglob() and enumerate()) and renamed some variables to clarify their meaning.

# Imports
import glob
import h5py
import pandas as pd

# Create a list to store the DataFrames of each HDF5 file:
dfs = []

# Iterating over Training hdf5 files and extracting data:
for Arquivo_cnt, arquivo_hdf5 in enumerate(glob.iglob('*.hdf5')):
    with h5py.File(arquivo_hdf5, 'r') as arquivo:
       
        # Printing the count of the number of hdf5 files on the screen:
        print(f'Arquivo {Arquivo_cnt+1}')

        # Get the root level key (there is only one) and print:
        rootkey = list(arquivo.keys())[0]
        print(f'Chave em {arquivo_hdf5}: {rootkey}')
        
        # Get a list of objects below the root level and print:
        varList = list(arquivo[rootkey].keys())
        print(f'Variáveis na chave {rootkey}: {varList}')
           
        # Iterating over the objects under rootkey (groups and datasets):
        for key in arquivo[rootkey].keys():
            print(f'For object name {key}, object type: {type(arquivo[rootkey][key])}')

            # only process datasets; skip groups
            if isinstance(arquivo[rootkey][key], h5py.Dataset):
               
                # Extract the dataset to an np.array:
                dados = arquivo[rootkey][key][:]
    
                # Print the data to the screen:
                print(f'Dados no conjunto de dados {key}: \n{dados}')
    
                # Load data to a Pandas DataFrame:
                df = pd.DataFrame(dados)
                                   
                # Add the DataFrame to the list:
                dfs.append(df)
           
            # Printing a blank line on the screen:
            print()       

# Concatenating DataFrames into a single DataFrame:
resultado_final = pd.concat(dfs, ignore_index=True)

# Viewing the first lines:
print(resultado_final.head())
kcw78 (second answer)

I've been studying your data and I'm pretty sure my previous answer doesn't really solve your problem. It "works" in the sense that it loads all of the frequency_Hz datasets into a dataframe (and it can easily be modified to load the other datasets). However, after inspecting the data more closely, I'm 99% sure that is NOT what you want, and that concatenating all of the files into a single Pandas dataframe is incorrect. Let me explain using values from 3 HDF5 files.

As noted in my previous answer, each file has 3 datasets named H1/SFTs, L1/SFTs, and frequency_Hz, plus 2 more datasets: H1/timestamps_GPS and L1/timestamps_GPS. As I studied the data, I concluded that you need to read all of the datasets and organize them to create the appropriate dataframes. And you need a different dataframe for each file and each H1/L1 dataset.

Here is a summary of dataset shapes for 3 files:

Dataset              00a6db666 Shape    00f36a6ac Shape    001121a05 Shape
frequency_Hz         360 rows           360 rows           360 rows
H1/timestamps_GPS    4525 rows          4609 rows          4612 rows
H1/SFTs              360 x 4525         360 x 4609         360 x 4612
L1/timestamps_GPS    4598 rows          4566 rows          4653 rows
L1/SFTs              360 x 4598         360 x 4566         360 x 4653

See the pattern? Each H1/SFTs dataset has shape (frequency_Hz, H1/timestamps_GPS), and each L1/SFTs has shape (frequency_Hz, L1/timestamps_GPS). And all of the H1/L1 timestamps_GPS datasets have different lengths. Conclusion: you will have to create separate dataframes for each file AND each set of H1/L1 data. frequency_Hz will be your row labels and timestamps_GPS will be your column labels.

The code below iterates over the HDF5 files and datasets to assemble "logically correct" dataframes (IMHO) of shape (frequency_Hz rows/index x timestamps_GPS columns). It creates 1 dataframe for each file and each H1/L1 dataset. I don't know how all of the dataframes can be combined (given the different number of columns for each file/dataset); one way to at least keep them organized is sketched after the code.

# Imports
import glob
import h5py
import pandas as pd

# Iterate over Training hdf5 files and extract data:
for Arquivo_cnt, arquivo_hdf5 in enumerate(glob.iglob('*.hdf5')):
    with h5py.File(arquivo_hdf5, 'r') as arquivo:
       
        # Get the root level key (there is only one) and print:
        rootkey = list(arquivo.keys())[0]
        print(f'\nArquivo {Arquivo_cnt+1}; Chave em {arquivo_hdf5}: {rootkey}')     
             
        # Extract the 'frequency_Hz' dataset to an np.array:
        # It is used for both dataframes H1/SFTs and L1/SFTs
        freq_dados = arquivo[rootkey]['frequency_Hz'][:]
        
        # Extract the 'H1' datasets: timestamps_GPS and SFTs:
        H1_time_GPS_dados = arquivo[rootkey]['H1/timestamps_GPS'][:]       
        H1_SFTs_dados = arquivo[rootkey]['H1/SFTs'][:]
        
        # Load data to a Pandas DataFrame:
        H1_SFTs_df = pd.DataFrame(data=H1_SFTs_dados, 
                                  index=freq_dados, columns=H1_time_GPS_dados)

        # View the first lines:
        print(f'\nHead of {arquivo_hdf5} H1_SFTs_df:')
        print(H1_SFTs_df.head())

        # Extract the 'L1' datasets: timestamps_GPS and SFTs:
        L1_time_GPS_dados = arquivo[rootkey]['L1/timestamps_GPS'][:]       
        L1_SFTs_dados = arquivo[rootkey]['L1/SFTs'][:]
        
        # Load data to a Pandas DataFrame:
        L1_SFTs_df = pd.DataFrame(data=L1_SFTs_dados, 
                                  index=freq_dados, columns=L1_time_GPS_dados)

        # View the first lines:
        print(f'\nHead of {arquivo_hdf5} L1_SFTs_df:')
        print(L1_SFTs_df.head())
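
A hedged follow-up, not part of the answer's code above: since the column counts differ per file and detector, one way to keep all of these dataframes organized without forcing a concatenation is a dict keyed by (file, detector). The layout below is an assumption, just one reasonable way to structure it:

# Imports
import glob
import h5py
import pandas as pd

# Dict of dataframes keyed by (root group name, detector name):
dfs_por_arquivo = {}

for arquivo_hdf5 in glob.iglob('*.hdf5'):
    with h5py.File(arquivo_hdf5, 'r') as arquivo:
        rootkey = list(arquivo.keys())[0]
        freq = arquivo[rootkey]['frequency_Hz'][:]
        for det in ('H1', 'L1'):
            sfts = arquivo[rootkey][f'{det}/SFTs'][:]
            ts = arquivo[rootkey][f'{det}/timestamps_GPS'][:]
            # frequency_Hz as row index, timestamps_GPS as columns:
            dfs_por_arquivo[(rootkey, det)] = pd.DataFrame(
                data=sfts, index=freq, columns=ts)

# Example lookup (root group names match the file stems, e.g. '001121a05'):
# df_h1 = dfs_por_arquivo[('001121a05', 'H1')]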