This is basically just me trying to normalize my data so my model performs better. It worked without but I am trying things out to improve performance. It is a timeseries in a LSTM so has to be sequenced which works perfectly fine as well as the first block of code below that is separating my data into train test split based on year.
test = {}
tracks = incident_data["geoid10"].unique().tolist()
train_x, train_y = creatsequence(
incident_data[incident_data["geoid10"] == tracks[0]], 4
)
tracks.pop(0)
for track in tracks:
test_data = incident_data[incident_data["geoid10"] == track]
if test_data[test_data["Year"] < 2016].shape[0] > 4:
trainx, trainy = creatsequence(test_data[test_data["Year"] < 2016], 4)
train_x = np.concatenate((train_x, trainx))
train_y = np.concatenate((train_y, trainy))
if test_data[test_data["Year"] >= 2015].shape[0] > 4:
test_x, test_y = creatsequence(test_data[test_data["Year"] >= 2015], 4)
test[track] = {"X": test_x, "y": test_y}
def creatsequence(data, length):
x = []
y = []
for column in ["All Other Thefts_y", "Simple Assault_y", "Theft From Motor Vehicle_y"]:
data[column] = normalizeSeries(data[column], 4)
for i in range(len(data) - length):
x.append(data.drop(columns=['geoid10',
'Year','Quarter'])[i:i+length])
y.append(np.array(data[["All Other Thefts_y", "Simple Assault_y", "Theft From Motor Vehicle_y"]])[i+length])
return(np.array(x), np.array(y))
def normalizeSeries(x, priorDays):
new_x = []
start = len(x) - 1
while start > 0:
print(x)
if start - priorDays > 4:
subarray = x[start - 2 * priorDays : start - priorDays]
else:
subarray = x[0:4]
max_value = max([max(subarray),0.001])
for i in range(4):
value = x[start]
new_x.append(value / max_value)
start -= 1
return new_x
The error:
KeyError: 11
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/tmp/ipykernel_12/1047915744.py in <cell line: 11>()
12 test_data = incident_data[incident_data["geoid10"] == track]
13 if test_data[test_data["Year"] < 2016].shape[0] > 4:
---> 14 trainx, trainy = creatsequence(test_data[test_data["Year"] < 2016], 4)
15 train_x = np.concatenate((train_x, trainx))
16 train_y = np.concatenate((train_y, trainy))
/tmp/ipykernel_12/2018859376.py in creatsequence(data, length)
8 y = []
9 for column in ["All Other Thefts_y", "Simple Assault_y", "Theft From Motor Vehicle_y"]:
---> 10 data[column] = normalizeSeries(data[column], 4)
11 for i in range(len(data) - length):
12 x.append(data.drop(columns=['geoid10',
/tmp/ipykernel_12/2642226622.py in normalizeSeries(x, priorDays)
17 max_value = max([max(subarray),0.001])
18 for i in range(4):
---> 19 value = x[start]
20 new_x.append(value / max_value)
21 start -= 1
~/.cache/pypoetry/virtualenvs/python-kernel-OtKFaj5M-py3.9/lib/python3.9/site-packages/pandas/core/series.py in __getitem__(self, key)
979
980 elif key_is_scalar:
--> 981 return self._get_value(key)
982
983 if is_hashable(key):
~/.cache/pypoetry/virtualenvs/python-kernel-OtKFaj5M-py3.9/lib/python3.9/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
1087
1088 # Similar to Index.get_value, but we do not fall back to positional
-> 1089 loc = self.index.get_loc(label)
1090 return self.index._get_values_for_loc(self, loc, label)
1091
~/.cache/pypoetry/virtualenvs/python-kernel-OtKFaj5M-py3.9/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will raise
I am trying to get my data normalized the sequenced and thrown into a train test split. The first few tables work in the first track but once it hits the second it throws the index error. I checked the track and it has data and does have that length.
You are indexing from a series using the index 11. In this line:
The only way I can explain this is that this index exceeds the length of the pd.Series.
The fact that this works the first iteration but not the second, tells me that you are overwriting/changing some data in a meaningful way. And in fact there are variables outside the loop, for example:
These variables are then changed in the loop:
Note, that when you are initialising these variables outside the loop they likely do NOT make a copy of the
incident_data
but you are likely returning a slice. Therefore overwriting train_x and train_y might change incident_data, thereby likely breaking this:This is of course only based on the logic of the program as I can read it, but it is difficult to debug without the runnable example. But I would assume that your problem is somewhere along these lines.