I want to train a ConvNet model on a set of temporal and spatial features extracted from a video dataset, adding an attention mechanism to the temporal features. The feature sizes are as below: spatial size: (128, 128), temporal size: (240, 320). They are obtained from this code:
def extract_spatial_features(frame_path):
    frame = cv2.imread(frame_path)
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (128, 128))
    features = np.reshape(resized, (128, 128))  # Already (128, 128); kept as an explicit shape check
    return features
def extract_temporal_features(frame_path1, frame_path2):
    frame1 = cv2.imread(frame_path1, cv2.IMREAD_GRAYSCALE)
    frame2 = cv2.imread(frame_path2, cv2.IMREAD_GRAYSCALE)
    frame_diff = cv2.absdiff(frame1, frame2)
    return frame_diff
def extract_temporal_features_from_meis(gop_folder_path):
    frame_files = [file for file in os.listdir(gop_folder_path) if file.endswith(".jpg")]
    frame_files.sort()  # Ensure frames are in the correct order
    frame_differences = []
    # Check if the number of frames is less than the minimum required (2 or 3)
    min_required_frames = 2  # You can adjust this to 3 if you find it more appropriate
    if len(frame_files) < min_required_frames:
        for i in range(len(frame_files)):
            frame_path1 = os.path.join(gop_folder_path, frame_files[i])
            frame_path2 = os.path.join(gop_folder_path, frame_files[i])
            frame_diff = extract_temporal_features(frame_path1, frame_path2)
            frame_differences.append(frame_diff)
    else:
        for i in range(len(frame_files) - 1):
            frame_path1 = os.path.join(gop_folder_path, frame_files[i])
            frame_path2 = os.path.join(gop_folder_path, frame_files[i + 1])
            frame_diff = extract_temporal_features(frame_path1, frame_path2)
            frame_differences.append(frame_diff)
    # Convert to a 3D numpy array
    g_temporal_features = np.array(frame_differences)
    return g_temporal_features
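# (note) g_temporal_features ends up with shape (num_frame_pairs, 240, 320), assuming
# the MEI frames are 240x320 grayscale images; the (240, 320) "temporal size" mentioned
# above comes from reducing this first axis with np.std further down.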
def aggregate_features_std(features_list):
    return np.std(features_list, axis=0)
def extract_spatial_temporal_features_from_keyframes(process_folder_path):
    global gop_temporal_features, video_temporal_features, video_spatial_features, video_folder_path, temporal_folder_path, spatial_folder_path
    activity_folders = [file for file in os.listdir(process_folder_path) if
                        os.path.isdir(os.path.join(process_folder_path, file))]
    gop_spatial_features = []  # List to store all spatial feature vectors of each video file
    for activity_folder in activity_folders:
        print(activity_folder)
        activity_folder_path = os.path.join(process_folder_path, activity_folder)
        video_folders = [file for file in os.listdir(activity_folder_path) if
                         os.path.isdir(os.path.join(activity_folder_path, file))]
        spatial_folder_path = os.path.join(process_folder_path, "spatial_features")
        os.makedirs(spatial_folder_path, exist_ok=True)
        temporal_folder_path = os.path.join(process_folder_path, "temporal_features")
        os.makedirs(temporal_folder_path, exist_ok=True)
        for video_folder in video_folders:
            video_folder_path = os.path.join(activity_folder_path, video_folder)
            gop_folders = [file for file in os.listdir(video_folder_path)]
            i = 0
            for gop_folder in gop_folders:
                gop_folder_path = os.path.join(video_folder_path, gop_folder)
                spatial_path = os.path.join(video_folder_path, f'spatial_{i}')
                temporal_path = os.path.join(video_folder_path, f'temporal_{i}')
                keyframe_file = [file for file in os.listdir(gop_folder_path) if
                                 file.endswith("_keyframe.jpg")]
                keyframe_path = os.path.join(gop_folder_path, keyframe_file[0])
                gop_spatial_feature = extract_spatial_features(keyframe_path)
                gop_spatial_features.append(gop_spatial_feature)
                gop_temporal_features = extract_temporal_features_from_meis(gop_folder_path)
                i += 1
            video_spatial_features = aggregate_features_std(gop_spatial_features)
            np.save(os.path.join(spatial_folder_path, f"spatial_features_{video_folder}"),
                    video_spatial_features)
            video_temporal_features = aggregate_features_std(gop_temporal_features)
            np.save(os.path.join(temporal_folder_path, f"temporal_features_{video_folder}"),
                    video_temporal_features)
    return video_spatial_features, video_temporal_features
# Extract and save spatial and temporal features for training and testing videos
extract_spatial_temporal_features_from_keyframes(train_process_path)
extract_spatial_temporal_features_from_keyframes(test_process_path)
print(f"spatial size: {video_spatial_features.shape}")
print(f"temporal size: {video_temporal_features.shape}")
and the code of the model is as below:
# ConvNet for Spatial Features
class SpatialConvNet(nn.Module):
    def __init__(self, num_classes):
        super(SpatialConvNet, self).__init__()
        self.layer1 = self._make_layer(1, 64)  # 1 input channel for grayscale
        self.layer2 = self._make_layer(64, 128)
        self.layer3 = self._make_layer(128, 256)
        self.fc = nn.Linear(256, num_classes)  # Output features based on num_classes

    def _make_layer(self, in_channels, out_channels):
        layer = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )
        return layer

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out
# ConvNet for Temporal Features
class TemporalConvNet(nn.Module):
    def __init__(self, num_classes):
        super(TemporalConvNet, self).__init__()
        self.layer1 = self._make_layer(1, 64)  # 1 input channel for grayscale
        self.layer2 = self._make_layer(64, 128)
        self.layer3 = self._make_layer(128, 256)
        self.fc = nn.Linear(256, num_classes)  # Output features based on num_classes
        # self.fc = nn.Linear(256, 5)  # Adjust the output features to 5

    def _make_layer(self, in_channels, out_channels):
        layer = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )
        return layer

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out
# (new) Attention Mechanism for Temporal Features
class Attention(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Attention, self).__init__()
        self.W_query = nn.Linear(input_size, hidden_size)
        self.W_key = nn.Linear(input_size, hidden_size)
        self.W_value = nn.Linear(input_size, hidden_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, temporal_outputs, spatial_outputs):
        query = self.W_query(temporal_outputs)
        key = self.W_key(spatial_outputs)
        value = self.W_value(spatial_outputs)
        # Calculate attention scores
        attention_scores = torch.matmul(query, key.transpose(-2, -1))
        attention_scores = F.softmax(attention_scores, dim=-1)
        # Apply attention to values
        attended_values = torch.matmul(attention_scores, value)
        return attended_values
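# (note) My rough understanding of the shapes flowing through the attention, assuming
# hidden_size == num_classes so the Linear layers accept the ConvNet outputs (this is
# only a shape sketch, not my actual training loop):
#     spatial_out  = spatial_conv_net(spatial_batch)        # (batch, num_classes)
#     temporal_out = temporal_conv_net(temporal_batch)      # (batch, num_classes)
#     attended     = attention(temporal_out, spatial_out)   # (batch, hidden_size)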
# Dataset Class for Loading Features and Labels
class VideoDataset(Dataset):
    def __init__(self, spatial_features_folder, temporal_features_folder, label_mapping):
        self.spatial_features = []
        self.temporal_features = []
        self.labels = []
        # Ensure the same ordering of files in both folders
        spatial_files = sorted(os.listdir(spatial_features_folder))
        temporal_files = sorted(os.listdir(temporal_features_folder))
        for spatial_file, temporal_file in zip(spatial_files, temporal_files):
            if spatial_file.endswith('.npy') and temporal_file.endswith('.npy'):
                # Load features
                spatial_feature = np.load(os.path.join(spatial_features_folder, spatial_file))
                temporal_feature = np.load(os.path.join(temporal_features_folder, temporal_file))
                # Normalize features
                spatial_feature = spatial_feature.astype(np.float32) / 255.0
                temporal_feature = temporal_feature.astype(np.float32) / 255.0
                # Extract label from file name and map to integer
                label_name = spatial_file.split('_')[3]  # Adjust based on how your file names are structured
                label_index = label_mapping.get(label_name, -1)  # Default to -1 for unknown labels
                # Append to lists if label found
                if label_index != -1:
                    self.spatial_features.append(spatial_feature)
                    self.temporal_features.append(temporal_feature)
                    self.labels.append(label_index)
        # Convert lists to tensors
        self.spatial_features = torch.tensor(self.spatial_features).unsqueeze(1)
        self.temporal_features = torch.tensor(self.temporal_features).unsqueeze(1)
        self.labels = torch.tensor(self.labels, dtype=torch.long)

    def __len__(self):
        return len(self.spatial_features)

    def __getitem__(self, idx):
        return self.spatial_features[idx], self.temporal_features[idx], self.labels[idx]
# Assuming hidden_size is the same as the size of spatial_outputs and temporal_outputs
hidden_size = 5
# Initialize Attention mechanism
attention = Attention(input_size=hidden_size, hidden_size=hidden_size)
# Instantiate the Networks
activity_folders = [file for file in os.listdir(train_folder_path)]
activity_labels = [file.split("_")[0] for file in activity_folders]
print(f'Activity_labels: {activity_labels}')
label_mapping = {label: idx for idx, label in enumerate(set(activity_labels))}
num_classes = len(label_mapping)
print(f'num_classes: {num_classes}')
spatial_conv_net = SpatialConvNet(num_classes=num_classes)
temporal_conv_net = TemporalConvNet(num_classes=num_classes)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    list(spatial_conv_net.parameters()) +
    list(temporal_conv_net.parameters()) +
    list(attention.parameters()),
    lr=0.001
)
spatial_folder_path = os.path.join(train_process_path, 'spatial_features')
temporal_folder_path = os.path.join(train_process_path, 'temporal_features')
train_dataset = VideoDataset(spatial_folder_path, temporal_folder_path, label_mapping)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
When I run the model part of the code, I get this warning:

/tmp/ipykernel_42/2117318364.py:114: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /usr/local/src/pytorch/torch/csrc/utils/tensor_new.cpp:245.)
self.spatial_features = torch.tensor(self.spatial_features).unsqueeze(1)

While writing the code, I had to change the input and output sizes of the model several times to make the attention mechanism fit the rest of the model (i.e. the 3-layer ConvNets). The problem may come from these changes, or I may need to merge or aggregate some of the features during extraction.

How can I address this issue? By the way, you can find all the code on Kaggle at this link: https://www.kaggle.com/code/mohsendelaavar/sthar-video-training-retrieval-video-query-new
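In case it is relevant: I was wondering whether stacking each feature list into a single numpy array before converting to a tensor is the right direction. A minimal sketch of what I mean, inside VideoDataset.__init__ (I have not verified that it also fixes the size problem):

# Hypothetical change: stack the per-video arrays into one ndarray first,
# then convert once, instead of building the tensor from a Python list.
self.spatial_features = torch.from_numpy(np.stack(self.spatial_features)).unsqueeze(1)    # (N, 1, 128, 128)
self.temporal_features = torch.from_numpy(np.stack(self.temporal_features)).unsqueeze(1)  # (N, 1, 240, 320)

Would that be the correct way to silence the warning, and does it interact with the attention sizing at all?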