BPM detection from pyAudioAnalysis is producing the wrong number of beats for any signal

Any help would be much appreciated. I am trying to extract the BPM from any .wav file loaded into the Python script using the pyAudioAnalysis library, but for some reason it does not output the correct BPM. I tried changing the window size passed to the beat_extraction() function, but it only allows values under 1 second, and the reported BPM changes along with the window size. When kept at a 1-second window it outputs 30 every time.

The following is my code:

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures
from pyAudioAnalysis import MidTermFeatures
import matplotlib.pyplot as plt
import numpy as np
import warnings

file_name = "unity_alan.wav"

# Extract the Sampling Rate (Fs) and the Raw Signal Data (signal)
[Fs, signal] = audioBasicIO.read_audio_file(file_name)

# Uncomment if signal has two channels
# signal = signal[:,0]

# Function to Normalize the Signal
def normalize_signal(signal):
  signal = np.double(signal)
  return (signal - signal.mean()) / ((np.abs(signal)).max() + 0.0000000001)

# Short Term Features
# For each short-term window a set of features is extracted. This results in a sequence of feature
# vectors stored in a np matrix (in this case Features_shortTerm)

signal = normalize_signal(signal)

# Fs - frequency sampling Rate
print("The Sampling Freq: ",Fs)

# Total Signal Len
signal_len = len(signal)
print("Total Signal Length: ",signal_len)

# Total Time Len of Song in Seconds
len_of_song = signal_len / Fs
print("Total Song Time (s): ",len_of_song)

# Window Size in mseconds
windowSize_in_ms = 50
windowSize_in_s = windowSize_in_ms/1000
# Window Size in Samples
windowSize_in_samples = Fs * (windowSize_in_ms / 1000) #divide by 1000 to turn to seconds
print("Window Size (samples): ",windowSize_in_samples)

# Window Step in mseconds
wStep_in_ms = 25
# Window Step in Samples
wStep_in_samples = Fs * (wStep_in_ms / 1000) #divide by 1000 to turn to seconds
print("Window Step in Samples: ", wStep_in_samples)

# Oversampling Percentage
oversampling_Percentage = (wStep_in_ms / windowSize_in_ms) * 100
print("Oversampling Percentage (overlap of windows): ",oversampling_Percentage)

# Total Number of Windows in Signal
total_number_windows = signal_len / windowSize_in_samples
print("Total Number of Windows in Signal: ",total_number_windows)

# Total number of feature samples produced (windows/size * total windows)
feature_samples_points_total_cal = int(total_number_windows * (windowSize_in_ms/wStep_in_ms))
print("Calculated Total of Points Produced per Short Term Feature: ",feature_samples_points_total_cal)

# Extract features and their names. Each index has its own vector of features. The total number should be the same
# as the calculated total of points produced per feature.
Features_shortTerm, feature_names = ShortTermFeatures.feature_extraction(signal, Fs, windowSize_in_samples, wStep_in_samples)

# Exact Number of points in the Features
feature_samples_points_total_exact = len(Features_shortTerm[0])
print("Exact Total of Points Produced per Short Term Feature: ",feature_samples_points_total_exact)

# Mid-term window (1 second, expressed in samples)
mid_window_seconds = int(1 * Fs)

# Mid-term step (1 second, expressed in samples)
mid_step_seconds = int(1 * Fs)

# MID FEATURE Extraction
Features_midTerm, short_Features_ignore, mid_feature_names = MidTermFeatures.mid_feature_extraction(signal,Fs,mid_window_seconds,mid_step_seconds,windowSize_in_samples,wStep_in_samples)

# Number of mid-term feature rows (len() of the 2D feature matrix returns its rows)
midTerm_features_total_points = len(Features_midTerm)
print("Exact Mid-Term Total Number of Feature Points: ",midTerm_features_total_points)

# Beats per min
# The tempo of music determines the speed at which it is played (measured in BPM)
bpm,confidence_ratio = MidTermFeatures.beat_extraction(Features_shortTerm,1)
print("Beats per Minute (bpm): ",bpm)
print("Confidence ratio for BPM: ", confidence_ratio)

# Figure out why the BPM does not match the actual reading
# of 115 BPM. It is showing 30 BPM which is for sure wrong.

The output of my script is as follows:

The Sampling Freq:  48000
Total Signal Length:  10992884
Total Song Time (s):  229.01841666666667
Window Size (samples):  2400.0
Window Step in Samples:  1200.0
Oversampling Percentage (overlap of windows):  50.0
Total Number of Windows in Signal:  4580.368333333333
Calculated Total of Points Produced per Short Term Feature:  9160
Exact Total of Points Produced per Short Term Feature:  9159
Exact Mid-Term Total Number of Feature Points:  136
Beats per Minute (bpm):  30.0
Confidence ratio for BPM:  1.0
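
As a sanity check, the printed frame count is consistent with the usual framing formula floor((N - window) / step) + 1 (an assumption on my part; the library may count frames slightly differently internally):

n_samples = 10992884                   # Total Signal Length from the output above
win = 2400                             # 50 ms window at 48 kHz
step = 1200                            # 25 ms step at 48 kHz
print((n_samples - win) // step + 1)   # -> 9159, matching the "Exact Total" line above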

The library's definition of the function is as follows:

def beat_extraction(short_features, window_size, plot=False):
    """
    This function extracts an estimate of the beat rate for a musical signal.
    ARGUMENTS:
     - short_features:     a np array (n_feats x numOfShortTermWindows)
     - window_size:        window size in seconds
    RETURNS:
     - bpm:            estimates of beats per minute
     - ratio:          a confidence measure
    """

    # Features that are related to the beat tracking task:
    selected_features = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10,
                         11, 12, 13, 14, 15, 16, 17, 18]

    max_beat_time = int(round(2.0 / window_size))
    hist_all = np.zeros((max_beat_time,))
    # for each feature
    for ii, i in enumerate(selected_features):
        # dif threshold (3 x Mean of Difs)
        dif_threshold = 2.0 * (np.abs(short_features[i, 0:-1] -
                                      short_features[i, 1::])).mean()
        if dif_threshold <= 0:
            dif_threshold = 0.0000000000000001
        # detect local maxima
        [pos1, _] = utilities.peakdet(short_features[i, :], dif_threshold)
        position_diffs = []
        # compute histograms of local maxima changes
        for j in range(len(pos1)-1):
            position_diffs.append(pos1[j+1]-pos1[j])
        histogram_times, histogram_edges = \
            np.histogram(position_diffs, np.arange(0.5, max_beat_time + 1.5))
        hist_centers = (histogram_edges[0:-1] + histogram_edges[1::]) / 2.0
        histogram_times = \
            histogram_times.astype(float) / short_features.shape[1]
        hist_all += histogram_times
        if plot:
            plt.subplot(9, 2, ii + 1)
            plt.plot(short_features[i, :], 'k')
            for k in pos1:
                plt.plot(k, short_features[i, k], 'k*')
            f1 = plt.gca()
            f1.axes.get_xaxis().set_ticks([])
            f1.axes.get_yaxis().set_ticks([])

    if plot:
        plt.show(block=False)
        plt.figure()

    # Get beat as the argmax of the agregated histogram:
    max_indices = np.argmax(hist_all)
    bpms = 60 / (hist_centers * window_size)
    bpm = bpms[max_indices]
    # ... and the beat ratio:
    ratio = hist_all[max_indices] / hist_all.sum()

    if plot:
        # filter out >500 beats from plotting:
        hist_all = hist_all[bpms < 500]
        bpms = bpms[bpms < 500]

        plt.plot(bpms, hist_all, 'k')
        plt.xlabel('Beats per minute')
        plt.ylabel('Freq Count')
        plt.show(block=True)

    return bpm, ratio
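
Tracing the arithmetic in this function (a minimal sketch; the values follow directly from the code above), the window_size argument sets both the histogram length and the grid of candidate tempos. With window_size=1 the only candidate tempos are 60 and 30 BPM, which would explain the constant 30 above. From how the library uses it elsewhere, window_size appears to be meant as the short-term step in seconds (0.025 in my script), but I am treating that as an assumption rather than documented behaviour:

import numpy as np

def candidate_bpms(window_size):
    # Hypothetical helper: reproduces only the BPM grid of beat_extraction(), nothing else
    max_beat_time = int(round(2.0 / window_size))
    histogram_edges = np.arange(0.5, max_beat_time + 1.5)
    hist_centers = (histogram_edges[0:-1] + histogram_edges[1::]) / 2.0
    return 60 / (hist_centers * window_size)

print(candidate_bpms(1))      # [60. 30.] -> only 30 or 60 BPM can ever be returned
print(candidate_bpms(0.025))  # 80 candidates, from 2400 BPM down to 30 BPM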

There are 0 answers