Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array()


I want to train a ConvNet model on a set of spatial and temporal features extracted from a video dataset, adding an attention mechanism to the temporal features. The feature sizes are as below: spatial size: (128, 128), temporal size: (240, 320). They are obtained with this code:

def extract_spatial_features(frame_path):
    frame = cv2.imread(frame_path)
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (128, 128))
    features = np.reshape(resized, (128, 128))  # Already (128, 128); returned as a 2D array
    return features


def extract_temporal_features(frame_path1, frame_path2):
    frame1 = cv2.imread(frame_path1, cv2.IMREAD_GRAYSCALE)
    frame2 = cv2.imread(frame_path2, cv2.IMREAD_GRAYSCALE)
    frame_diff = cv2.absdiff(frame1, frame2)
    return frame_diff


def extract_temporal_features_from_meis(gop_folder_path):
    frame_files = [file for file in os.listdir(gop_folder_path) if file.endswith(".jpg")]
    frame_files.sort()  # Ensure frames are in the correct order
    frame_differences = []
    # Check if the number of frames is less than the minimum required (2 or 3)
    min_required_frames = 2  # You can adjust this to 3 if you find it more appropriate
    if len(frame_files) < min_required_frames:
        for i in range(len(frame_files)):
            frame_path1 = os.path.join(gop_folder_path, frame_files[i])
            frame_path2 = os.path.join(gop_folder_path, frame_files[i])
            frame_diff = extract_temporal_features(frame_path1, frame_path2)
            frame_differences.append(frame_diff)
    else:
        for i in range(len(frame_files) - 1):
            frame_path1 = os.path.join(gop_folder_path, frame_files[i])
            frame_path2 = os.path.join(gop_folder_path, frame_files[i + 1])
            frame_diff = extract_temporal_features(frame_path1, frame_path2)
            frame_differences.append(frame_diff)

    # Convert to a 3D numpy array
    g_temporal_features = np.array(frame_differences)
    return g_temporal_features


def aggregate_features_std(features_list):
    return np.std(features_list, axis=0)


def extract_spatial_temporal_features_from_keyframes(process_folder_path):
    global gop_temporal_features, video_temporal_features, video_spatial_features, video_folder_path, temporal_folder_path, spatial_folder_path
    activity_folders = [file for file in os.listdir(process_folder_path) if
                        os.path.isdir(os.path.join(process_folder_path, file))]
    gop_spatial_features = []  # List to store all spatial feature vectors of each video file

    for activity_folder in activity_folders:
        print(activity_folder)
        activity_folder_path = os.path.join(process_folder_path, activity_folder)
        video_folders = [file for file in os.listdir(activity_folder_path) if
                         os.path.isdir(os.path.join(activity_folder_path, file))]
        spatial_folder_path = os.path.join(process_folder_path, "spatial_features")
        os.makedirs(spatial_folder_path, exist_ok=True)
        temporal_folder_path = os.path.join(process_folder_path, "temporal_features")
        os.makedirs(temporal_folder_path, exist_ok=True)
        for video_folder in video_folders:
            video_folder_path = os.path.join(activity_folder_path, video_folder)
            gop_folders = [file for file in os.listdir(video_folder_path)]
            i = 0
            for gop_folder in gop_folders:
                gop_folder_path = os.path.join(video_folder_path, gop_folder)
                spatial_path = os.path.join(video_folder_path, f'spatial_{i}')
                temporal_path = os.path.join(video_folder_path, f'temporal_{i}')
                keyframe_file = [file for file in os.listdir(gop_folder_path) if
                                 file.endswith("_keyframe.jpg")]
                keyframe_path = os.path.join(gop_folder_path, keyframe_file[0])
                gop_spatial_feature = extract_spatial_features(keyframe_path)
                gop_spatial_features.append(gop_spatial_feature)
                gop_temporal_features = extract_temporal_features_from_meis(gop_folder_path)
                i += 1
            video_spatial_features = aggregate_features_std(gop_spatial_features)
            np.save(os.path.join(spatial_folder_path, f"spatial_features_{video_folder}"),
                    video_spatial_features)
            video_temporal_features = aggregate_features_std(gop_temporal_features)
            np.save(os.path.join(temporal_folder_path, f"temporal_features_{video_folder}"),
                    video_temporal_features)
    return video_spatial_features, video_temporal_features


# Extract and save spatial and temporal features for training and testing videos
extract_spatial_temporal_features_from_keyframes(train_process_path)
extract_spatial_temporal_features_from_keyframes(test_process_path)

print(f"spatial size: {video_spatial_features.shape}")
print(f"temporal size: {video_temporal_features.shape}")

and the code for the model is as below:

# ConvNet for Spatial Features
class SpatialConvNet(nn.Module):
    def __init__(self, num_classes):
        super(SpatialConvNet, self).__init__()
        self.layer1 = self._make_layer(1, 64)  # 1 input channel for grayscale
        self.layer2 = self._make_layer(64, 128)
        self.layer3 = self._make_layer(128, 256)
        self.fc = nn.Linear(256, num_classes)  # Output features based on num_classes

    def _make_layer(self, in_channels, out_channels):
        layer = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True)
        )
        return layer

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


# ConvNet for Temporal Features
class TemporalConvNet(nn.Module):
    def __init__(self, num_classes):
        super(TemporalConvNet, self).__init__()
        self.layer1 = self._make_layer(1, 64)  # 1 input channel for grayscale
        self.layer2 = self._make_layer(64, 128)
        self.layer3 = self._make_layer(128, 256)
        self.fc = nn.Linear(256, num_classes)  # Output features based on num_classes

    # self.fc = nn.Linear(256, 5)  # Adjust the output features to 5

    def _make_layer(self, in_channels, out_channels):
        layer = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True)
        )
        return layer

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.size(0), -1)
        out = self.fc(out)

        return out


# (new) Attention Mechanism for Temporal Features
class Attention(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Attention, self).__init__()
        self.W_query = nn.Linear(input_size, hidden_size)
        self.W_key = nn.Linear(input_size, hidden_size)
        self.W_value = nn.Linear(input_size, hidden_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, temporal_outputs, spatial_outputs):
        query = self.W_query(temporal_outputs)
        key = self.W_key(spatial_outputs)
        value = self.W_value(spatial_outputs)

        # Calculate attention scores
        attention_scores = torch.matmul(query, key.transpose(-2, -1))
        attention_scores = F.softmax(attention_scores, dim=-1)

        # Apply attention to values
        attended_values = torch.matmul(attention_scores, value)

        return attended_values


# Dataset Class for Loading Features and Labels
class VideoDataset(Dataset):
    def __init__(self, spatial_features_folder, temporal_features_folder, label_mapping):
        self.spatial_features = []
        self.temporal_features = []
        self.labels = []

        # Ensure the same ordering of files in both folders
        spatial_files = sorted(os.listdir(spatial_features_folder))
        temporal_files = sorted(os.listdir(temporal_features_folder))

        for spatial_file, temporal_file in zip(spatial_files, temporal_files):
            if spatial_file.endswith('.npy') and temporal_file.endswith('.npy'):
                # Load features
                spatial_feature = np.load(os.path.join(spatial_features_folder, spatial_file))
                temporal_feature = np.load(os.path.join(temporal_features_folder, temporal_file))

                # Normalize features
                spatial_feature = spatial_feature.astype(np.float32) / 255.0
                temporal_feature = temporal_feature.astype(np.float32) / 255.0

                # Extract label from file name and map to integer
                label_name = spatial_file.split('_')[3]  # Adjust based on how your file names are structured
                label_index = label_mapping.get(label_name, -1)  # Default to -1 for unknown labels

                # Append to lists if label found
                if label_index != -1:
                    self.spatial_features.append(spatial_feature)
                    self.temporal_features.append(temporal_feature)
                    self.labels.append(label_index)

        # Convert lists to tensors
        self.spatial_features = torch.tensor(self.spatial_features).unsqueeze(1)
        self.temporal_features = torch.tensor(self.temporal_features).unsqueeze(1)
        self.labels = torch.tensor(self.labels, dtype=torch.long)

    def __len__(self):
        return len(self.spatial_features)

    def __getitem__(self, idx):
        return self.spatial_features[idx], self.temporal_features[idx], self.labels[idx]


# Assuming hidden_size is the same as the size of spatial_outputs and temporal_outputs
hidden_size = 5

# Initialize Attention mechanism
attention = Attention(input_size=hidden_size, hidden_size=hidden_size)

# Instantiate the Networks
activity_folders = [file for file in os.listdir(train_folder_path)]
activity_labels = [file.split("_")[0] for file in activity_folders]
print(f'Activity_labels: {activity_labels}')
label_mapping = {label: idx for idx, label in enumerate(set(activity_labels))}
num_classes = len(label_mapping)
print(f'num_classes: {num_classes}')
spatial_conv_net = SpatialConvNet(num_classes=num_classes)

temporal_conv_net = TemporalConvNet(num_classes=num_classes)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
        list(spatial_conv_net.parameters()) +
        list(temporal_conv_net.parameters()) +
        list(attention.parameters()),
        lr=0.001
)


spatial_folder_path = os.path.join(train_process_path, 'spatial_features')
temporal_folder_path = os.path.join(train_process_path, 'temporal_features') 
train_dataset = VideoDataset(spatial_folder_path, temporal_folder_path, label_mapping)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


How can I address this issue?

By the way, you can find all the code on Kaggle at this link: https://www.kaggle.com/code/mohsendelaavar/sthar-video-training-retrieval-video-query-new

When I run the model part of the code, this warning is raised:

/tmp/ipykernel_42/2117318364.py:114: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /usr/local/src/pytorch/torch/csrc/utils/tensor_new.cpp:245.)
  self.spatial_features = torch.tensor(self.spatial_features).unsqueeze(1)

While writing the code, I had to change the input and output sizes of the model several times in order to fit the attention mechanism to the other parts of the model (i.e. the 3-layer ConvNets). The problem may be caused by these changes, or I suspect that I should merge or aggregate some of the features during extraction.
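
From the warning text, I think the change it suggests inside VideoDataset.__init__ would look roughly like this (a sketch, assuming all the saved spatial arrays share one shape and all the temporal arrays share another so they can be stacked):

# Sketch of the conversion the warning suggests: combine the list of ndarrays
# into a single ndarray first, then convert to a tensor in one step.
spatial_array = np.stack(self.spatial_features)    # e.g. (N, 128, 128), float32
temporal_array = np.stack(self.temporal_features)  # e.g. (N, 240, 320), float32

self.spatial_features = torch.from_numpy(spatial_array).unsqueeze(1)    # (N, 1, 128, 128)
self.temporal_features = torch.from_numpy(temporal_array).unsqueeze(1)  # (N, 1, 240, 320)
self.labels = torch.tensor(self.labels, dtype=torch.long)

Is something like this the right way to address the warning, or does the mismatch between the spatial (128, 128) and temporal (240, 320) sizes mean I need to aggregate the features differently before building the dataset?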

