How to read AWS CloudTrail JSON Logs into a pandas dataframe

1.2k views Asked by At

I have a problem as I am loading my data to pandas using Jupyterlab running with Anaconda3 as my VM suddenly went down. After it was up, I found that my code doesn't work anymore for some reason. Here is my code:

awsc = pd.DataFrame()
json_pattern = os.path.join('logs_old/AWSCloudtrailLog/','*')
file_list = glob.glob(json_pattern)
for file in file_list:
    data = pd.read_json(file, lines=True)
    awsc = awsc.append(data, ignore_index = True)
awsc = pd.concat([awsc, pd.json_normalize(awsc['userIdentity'])], axis=1).drop('userIdentity', 1)
awsc.rename(columns={'type':'userIdentity_type',
                     'principalId':'userIdentity_principalId',
                     'arn':'userIdentity_arn',
                     'accountId':'userIdentity_accountId',
                     'accessKeyId':'userIdentity_accessKeyId',
                     'userName':'userIdentity_userName',}, inplace=True)

When I run the code it gave me the KeyError message like this:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/envs/environment/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2888             try:
-> 2889                 return self._engine.get_loc(casted_key)
   2890             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'userIdentity'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-9-efd1d2e600a5> in <module>
      1 # unpack nested json
      2 
----> 3 awsc = pd.concat([awsc, pd.json_normalize(awsc['userIdentity'])], axis=1).drop('userIdentity', 1)
      4 awsc.rename(columns={'type':'userIdentity_type',
      5                      'principalId':'userIdentity_principalId',

~/anaconda3/envs/environment/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2900             if self.columns.nlevels > 1:
   2901                 return self._getitem_multilevel(key)
-> 2902             indexer = self.columns.get_loc(key)
   2903             if is_integer(indexer):
   2904                 indexer = [indexer]

~/anaconda3/envs/environment/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2889                 return self._engine.get_loc(casted_key)
   2890             except KeyError as err:
-> 2891                 raise KeyError(key) from err
   2892 
   2893         if tolerance is not None:

KeyError: 'userIdentity'

The output of the dataframe awsc as i run print(awss.info()) or print(awsc.info()):

 <class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrameNone

Any solution to solve this issue? Does the problem come from the Pandas or Anaconda?

1

There are 1 answers

1
Trenton McKinney On BEST ANSWER

Using Code from OP

  • The method for creating the dataframe is not correct, which is way awsc is empty.
  • Without seeing a file, it's not possible to know if pd.read_json(file, lines=True) is the correct method to use.
  • pd.json_normalize(awsc['userIdentity']) will work on a column of dicts. It's more than likely the column is strings though.
    • If the dicts are str type, use ast.literal_eval to convert them to dict type.
import pandas as pd
from ast import literal_eval

# crate a list to add dataframes to
awsc_list = list()

# iterate through the list of and append them to awsc_list
for file in file_list:
    awsc_list.append(pd.read_json(file, lines=True))
    
# concat the files into a single dataframe
awsc = pd.concat(awsc_list).reset_index(drop=True)

# convert the userIdentity column to dict type, if it contains str type
awsc.userIdentity = awsc.userIdentity.apply(literal_eval)

# normalize userIdentity
normalized = pd.json_normalize(awsc['userIdentity'], sep='_')

# join awsc with normalized and drop the userIdentity column
awsc = awsc.join(normalized).drop('userIdentity', 1)

# rename the columns
awsc.rename(columns={'type':'userIdentity_type',
                     'principalId':'userIdentity_principalId',
                     'arn':'userIdentity_arn',
                     'accountId':'userIdentity_accountId',
                     'accessKeyId':'userIdentity_accessKeyId',
                     'userName':'userIdentity_userName',}, inplace=True)

New Code with Sample Data

import json
import pandas as pd

# crate a list to add dataframes to
awsc_list = list()

# list of files
files_list = ['test.json', 'test2.json']

# read the filess
for file in files_list:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    
    # normalize the file and append it to the list of dataframe
    awsc_list.append(pd.json_normalize(data, 'Records', sep='_'))
    
# concat the files into a single dataframe
awsc = pd.concat(awsc_list).reset_index(drop=True)

# display(awsc)
  eventVersion             eventTime        eventSource       eventName  awsRegion  sourceIPAddress                                                                                 userAgent userIdentity_type userIdentity_principalId                      userIdentity_arn userIdentity_accessKeyId userIdentity_accountId userIdentity_userName requestParameters_instancesSet_items                                                                                                 responseElements_instancesSet_items requestParameters_force userIdentity_sessionContext_attributes_mfaAuthenticated userIdentity_sessionContext_attributes_creationDate requestParameters_keyName responseElements_keyName                              responseElements_keyFingerprint responseElements_keyMaterial
0          1.0  2014-03-06T21:22:54Z  ec2.amazonaws.com  StartInstances  us-east-2  205.251.233.176                                                                    ec2-api-tools 1.6.12.2           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice       [{'instanceId': 'i-ebeaf9e2'}]    [{'instanceId': 'i-ebeaf9e2', 'currentState': {'code': 0, 'name': 'pending'}, 'previousState': {'code': 80, 'name': 'stopped'}}]                     NaN                                                     NaN                                                 NaN                       NaN                      NaN                                                          NaN                          NaN
1          1.0  2014-03-06T21:01:59Z  ec2.amazonaws.com   StopInstances  us-east-2  205.251.233.176                                                                    ec2-api-tools 1.6.12.2           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice       [{'instanceId': 'i-ebeaf9e2'}]  [{'instanceId': 'i-ebeaf9e2', 'currentState': {'code': 64, 'name': 'stopping'}, 'previousState': {'code': 16, 'name': 'running'}}]                   False                                                     NaN                                                 NaN                       NaN                      NaN                                                          NaN                          NaN
2          1.0  2014-03-06T17:10:34Z  ec2.amazonaws.com   CreateKeyPair  us-east-2     72.21.198.64  EC2ConsoleBackend, aws-sdk-java/Linux/x.xx.fleetxen Java_HotSpot(TM)_64-Bit_Server_VM/xx           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice                                  NaN                                                                                                                                 NaN                     NaN                                                   false                                2014-03-06T15:15:06Z                 mykeypair                mykeypair  30:1d:46:d0:5b:ad:7e:1b:b6:70:62:8b:ff:38:b5:e9:ab:5d:b8:21       <sensitiveDataRemoved>
3          1.0  2014-03-06T21:22:54Z  ec2.amazonaws.com  StartInstances  us-east-2  205.251.233.176                                                                    ec2-api-tools 1.6.12.2           IAMUser          EX_PRINCIPAL_ID  arn:aws:iam::123456789012:user/Alice           EXAMPLE_KEY_ID           123456789012                 Alice       [{'instanceId': 'i-ebeaf9e2'}]    [{'instanceId': 'i-ebeaf9e2', 'currentState': {'code': 0, 'name': 'pending'}, 'previousState': {'code': 80, 'name': 'stopped'}}]                     NaN                                                     NaN                                                 NaN                       NaN                      NaN                                                          NaN                          NaN

Sample Data

test.json

  • List of JSONs
[{
        "Records": [{
                "eventVersion": "1.0",
                "userIdentity": {
                    "type": "IAMUser",
                    "principalId": "EX_PRINCIPAL_ID",
                    "arn": "arn:aws:iam::123456789012:user/Alice",
                    "accessKeyId": "EXAMPLE_KEY_ID",
                    "accountId": "123456789012",
                    "userName": "Alice"
                },
                "eventTime": "2014-03-06T21:22:54Z",
                "eventSource": "ec2.amazonaws.com",
                "eventName": "StartInstances",
                "awsRegion": "us-east-2",
                "sourceIPAddress": "205.251.233.176",
                "userAgent": "ec2-api-tools 1.6.12.2",
                "requestParameters": {
                    "instancesSet": {
                        "items": [{
                                "instanceId": "i-ebeaf9e2"
                            }
                        ]
                    }
                },
                "responseElements": {
                    "instancesSet": {
                        "items": [{
                                "instanceId": "i-ebeaf9e2",
                                "currentState": {
                                    "code": 0,
                                    "name": "pending"
                                },
                                "previousState": {
                                    "code": 80,
                                    "name": "stopped"
                                }
                            }
                        ]
                    }
                }
            }
        ]
    }, {
        "Records": [{
                "eventVersion": "1.0",
                "userIdentity": {
                    "type": "IAMUser",
                    "principalId": "EX_PRINCIPAL_ID",
                    "arn": "arn:aws:iam::123456789012:user/Alice",
                    "accountId": "123456789012",
                    "accessKeyId": "EXAMPLE_KEY_ID",
                    "userName": "Alice"
                },
                "eventTime": "2014-03-06T21:01:59Z",
                "eventSource": "ec2.amazonaws.com",
                "eventName": "StopInstances",
                "awsRegion": "us-east-2",
                "sourceIPAddress": "205.251.233.176",
                "userAgent": "ec2-api-tools 1.6.12.2",
                "requestParameters": {
                    "instancesSet": {
                        "items": [{
                                "instanceId": "i-ebeaf9e2"
                            }
                        ]
                    },
                    "force": false
                },
                "responseElements": {
                    "instancesSet": {
                        "items": [{
                                "instanceId": "i-ebeaf9e2",
                                "currentState": {
                                    "code": 64,
                                    "name": "stopping"
                                },
                                "previousState": {
                                    "code": 16,
                                    "name": "running"
                                }
                            }
                        ]
                    }
                }
            }
        ]
    }, {
        "Records": [{
                "eventVersion": "1.0",
                "userIdentity": {
                    "type": "IAMUser",
                    "principalId": "EX_PRINCIPAL_ID",
                    "arn": "arn:aws:iam::123456789012:user/Alice",
                    "accountId": "123456789012",
                    "accessKeyId": "EXAMPLE_KEY_ID",
                    "userName": "Alice",
                    "sessionContext": {
                        "attributes": {
                            "mfaAuthenticated": "false",
                            "creationDate": "2014-03-06T15:15:06Z"
                        }
                    }
                },
                "eventTime": "2014-03-06T17:10:34Z",
                "eventSource": "ec2.amazonaws.com",
                "eventName": "CreateKeyPair",
                "awsRegion": "us-east-2",
                "sourceIPAddress": "72.21.198.64",
                "userAgent": "EC2ConsoleBackend, aws-sdk-java/Linux/x.xx.fleetxen Java_HotSpot(TM)_64-Bit_Server_VM/xx",
                "requestParameters": {
                    "keyName": "mykeypair"
                },
                "responseElements": {
                    "keyName": "mykeypair",
                    "keyFingerprint": "30:1d:46:d0:5b:ad:7e:1b:b6:70:62:8b:ff:38:b5:e9:ab:5d:b8:21",
                    "keyMaterial": "\u003csensitiveDataRemoved\u003e"
                }
            }
        ]
    }
]

test2.json

  • One JSON
{
    "Records": [{
            "eventVersion": "1.0",
            "userIdentity": {
                "type": "IAMUser",
                "principalId": "EX_PRINCIPAL_ID",
                "arn": "arn:aws:iam::123456789012:user/Alice",
                "accessKeyId": "EXAMPLE_KEY_ID",
                "accountId": "123456789012",
                "userName": "Alice"
            },
            "eventTime": "2014-03-06T21:22:54Z",
            "eventSource": "ec2.amazonaws.com",
            "eventName": "StartInstances",
            "awsRegion": "us-east-2",
            "sourceIPAddress": "205.251.233.176",
            "userAgent": "ec2-api-tools 1.6.12.2",
            "requestParameters": {
                "instancesSet": {
                    "items": [{
                            "instanceId": "i-ebeaf9e2"
                        }
                    ]
                }
            },
            "responseElements": {
                "instancesSet": {
                    "items": [{
                            "instanceId": "i-ebeaf9e2",
                            "currentState": {
                                "code": 0,
                                "name": "pending"
                            },
                            "previousState": {
                                "code": 80,
                                "name": "stopped"
                            }
                        }
                    ]
                }
            }
        }
    ]
}