Background

Here's a video of a clip from an electronic song. At the beginning of the video, the song plays at full speed. When you slow the song down, you can hear all the unique sounds the song uses. Some of these sounds repeat.

Problem Description

What I am trying to do is create a visual like the one below, where a horizontal track/row is created for each unique sound, with a colored block on that track for each timeframe in the song where that sound is playing. The tracks/rows should be sorted by how similar the sounds are to each other, with more similar sounds placed closer together. If sounds are so similar that a human couldn't tell them apart, they should be considered the same sound.

  • I'll accept an imperfect solution if it can generally do what I'm asking

[image: the grid visualization I'm trying to produce]

  • Watch the video linked above for a demonstration of what I mean. It includes a visual grid that I created manually which almost matches the grid I am trying to produce.

If, for example, each of the 5 waves below represents the waveform a sound makes, each of these sounds would be considered similar and would be placed close to each other vertically on the grid.

[image: five similar example waveforms]
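For what it's worth, once each timeframe has been assigned a sound label, drawing this kind of grid is straightforward with matplotlib's barh. A minimal sketch, where the (sound_id, start, end) triples are hypothetical placeholder data standing in for the real detection output:

import matplotlib.pyplot as plt

# Hypothetical (sound_id, start_sec, end_sec) triples; rows are assumed
# to be ordered already so that similar sounds get adjacent sound_ids.
events = [(0, 0.0, 0.5), (0, 2.0, 2.5), (1, 0.5, 1.0), (2, 1.0, 2.0)]

fig, ax = plt.subplots()
cmap = plt.get_cmap('tab10')
for sound_id, start, end in events:
    # one horizontal bar per occurrence, on the row for that sound
    ax.barh(y=sound_id, width=end - start, left=start,
            height=0.8, color=cmap(sound_id))
ax.set(xlabel='time (s)', ylabel='sound track')
plt.show()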


Attempts

I've been looking at the example for Laplacian segmentation in librosa. The graph labelled "structure components" looks like it might be what I need. From reading the paper, it looks like they are trying to break the song into segments like chorus, verse, bridge, etc., but I am essentially trying to break the song into fragments of 1 or 2 beats.


Here is the code for the Laplacian segmentation (there's a Jupyter notebook as well if you would prefer that).

# -*- coding: utf-8 -*-
"""
======================
Laplacian segmentation
======================

This notebook implements the laplacian segmentation method of
`McFee and Ellis, 2014 <http://bmcfee.github.io/papers/ismir2014_spectral.pdf>`_,
with a couple of minor stability improvements.

Throughout the example, we will refer to equations in the paper by number, so it will be
helpful to read along.
"""

# Code source: Brian McFee
# License: ISC


###################################
# Imports
#   - numpy for basic functionality
#   - scipy for graph Laplacian
#   - matplotlib for visualization
#   - sklearn.cluster for K-Means
#
import numpy as np
import scipy
import matplotlib.pyplot as plt

import sklearn.cluster

import librosa
import librosa.display
import matplotlib.patches as patches

#############################
# First, we'll load in a song
def laplacianSegmentation(fileName):
    # load the file passed in (the original example loaded librosa.ex('fishin'))
    y, sr = librosa.load(fileName)


    ##############################################
    # Next, we'll compute and plot a log-power CQT
    BINS_PER_OCTAVE = 12 * 3
    N_OCTAVES = 7
    C = librosa.amplitude_to_db(np.abs(librosa.cqt(y=y, sr=sr,
                                            bins_per_octave=BINS_PER_OCTAVE,
                                            n_bins=N_OCTAVES * BINS_PER_OCTAVE)),
                                ref=np.max)

    fig, ax = plt.subplots()
    librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                            bins_per_octave=BINS_PER_OCTAVE,
                            x_axis='time', ax=ax)


    ##########################################################
    # To reduce dimensionality, we'll beat-synchronize the CQT
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False)
    Csync = librosa.util.sync(C, beats, aggregate=np.median)

    # For plotting purposes, we'll need the timing of the beats
    # we fix_frames to include non-beat frames 0 and C.shape[1] (final frame)
    beat_times = librosa.frames_to_time(librosa.util.fix_frames(beats,
                                                                x_min=0,
                                                                x_max=C.shape[1]),
                                        sr=sr)

    fig, ax = plt.subplots()
    librosa.display.specshow(Csync, bins_per_octave=12*3,
                            y_axis='cqt_hz', x_axis='time',
                            x_coords=beat_times, ax=ax)


    #####################################################################
    # Let's build a weighted recurrence matrix using beat-synchronous CQT
    # (Equation 1)
    # width=3 prevents links within the same bar
    # mode='affinity' here implements S_rep (after Eq. 8)
    R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity',
                                        sym=True)

    # Enhance diagonals with a median filter (Equation 2)
    df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
    Rf = df(R, size=(1, 7))


    ###################################################################
    # Now let's build the sequence matrix (S_loc) using mfcc-similarity
    #
    #   :math:`R_\text{path}[i, i\pm 1] = \exp(-\|C_i - C_{i\pm 1}\|^2 / \sigma^2)`
    #
    # Here, we take :math:`\sigma` to be the median distance between successive beats.
    #
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    Msync = librosa.util.sync(mfcc, beats)

    path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
    sigma = np.median(path_distance)
    path_sim = np.exp(-path_distance / sigma)

    R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)


    ##########################################################
    # And compute the balanced combination (Equations 6, 7, 9)

    deg_path = np.sum(R_path, axis=1)
    deg_rec = np.sum(Rf, axis=1)

    mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)

    A = mu * Rf + (1 - mu) * R_path


    ###########################################################
    # Plot the resulting graphs (Figure 1, left and center)
    fig, ax = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(10, 4))
    librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time', x_axis='s',
                            y_coords=beat_times, x_coords=beat_times, ax=ax[0])
    ax[0].set(title='Recurrence similarity')
    ax[0].label_outer()
    librosa.display.specshow(R_path, cmap='inferno_r', y_axis='time', x_axis='s',
                            y_coords=beat_times, x_coords=beat_times, ax=ax[1])
    ax[1].set(title='Path similarity')
    ax[1].label_outer()
    librosa.display.specshow(A, cmap='inferno_r', y_axis='time', x_axis='s',
                            y_coords=beat_times, x_coords=beat_times, ax=ax[2])
    ax[2].set(title='Combined graph')
    ax[2].label_outer()


    #####################################################
    # Now let's compute the normalized Laplacian (Eq. 10)
    L = scipy.sparse.csgraph.laplacian(A, normed=True)


    # and its spectral decomposition
    evals, evecs = scipy.linalg.eigh(L)


    # We can clean this up further with a median filter.
    # This can help smooth over small discontinuities
    evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))


    # cumulative normalization is needed for symmetric normalized Laplacian eigenvectors
    Cnorm = np.cumsum(evecs**2, axis=1)**0.5

    # If we want k clusters, use the first k normalized eigenvectors.
    # Fun exercise: see how the segmentation changes as you vary k

    k = 5

    X = evecs[:, :k] / Cnorm[:, k-1:k]


    # Plot the resulting representation (Figure 1, center and right)

    fig, ax = plt.subplots(ncols=2, sharey=True, figsize=(10, 5))
    librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time', x_axis='time',
                            y_coords=beat_times, x_coords=beat_times, ax=ax[1])
    ax[1].set(title='Recurrence similarity')
    ax[1].label_outer()

    librosa.display.specshow(X,
                            y_axis='time',
                            y_coords=beat_times, ax=ax[0])
    ax[0].set(title='Structure components')


    #############################################################
    # Let's use these k components to cluster beats into segments
    # (Algorithm 1)
    KM = sklearn.cluster.KMeans(n_clusters=k)

    seg_ids = KM.fit_predict(X)


    # and plot the results
    fig, ax = plt.subplots(ncols=3, sharey=True, figsize=(10, 4))
    colors = plt.get_cmap('Paired', k)

    librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time',
                            y_coords=beat_times, ax=ax[1])
    ax[1].set(title='Recurrence matrix')
    ax[1].label_outer()

    librosa.display.specshow(X,
                            y_axis='time',
                            y_coords=beat_times, ax=ax[0])
    ax[0].set(title='Structure components')

    img = librosa.display.specshow(np.atleast_2d(seg_ids).T, cmap=colors,
                            y_axis='time', y_coords=beat_times, ax=ax[2])
    ax[2].set(title='Estimated segments')
    ax[2].label_outer()
    fig.colorbar(img, ax=[ax[2]], ticks=range(k))


    ###############################################################
    # Locate segment boundaries from the label sequence
    bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])

    # Count beat 0 as a boundary
    bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)

    # Compute the segment label for each boundary
    bound_segs = list(seg_ids[bound_beats])

    # Convert beat indices to frames
    bound_frames = beats[bound_beats]

    # Make sure we cover to the end of the track
    bound_frames = librosa.util.fix_frames(bound_frames,
                                        x_min=None,
                                        x_max=C.shape[1]-1)

    ###################################################
    # And plot the final segmentation over original CQT


    # sphinx_gallery_thumbnail_number = 5

    bound_times = librosa.frames_to_time(bound_frames)
    freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
                                    fmin=librosa.note_to_hz('C1'),
                                    bins_per_octave=BINS_PER_OCTAVE)

    fig, ax = plt.subplots()
    librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                            bins_per_octave=BINS_PER_OCTAVE,
                            x_axis='time', ax=ax)

    for interval, label in zip(zip(bound_times, bound_times[1:]), bound_segs):
        ax.add_patch(patches.Rectangle((interval[0], freqs[0]),
                                    interval[1] - interval[0],
                                    freqs[-1],
                                    facecolor=colors(label),
                                    alpha=0.50))


One major thing that I believe would have to change is the number of clusters. They use 5 in the example, but I don't know what I would want it to be, because I don't know how many sounds there are. I set it to 400, producing the following result, which didn't really feel like something I could work with. Ideally I would want all the blocks to be solid colors: not colors in between the max red and blue values.

[image: segmentation result with k = 400]

(I turned it sideways to look more like my examples above and more like the output I'm trying to produce)
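One way to avoid hand-picking the number of clusters might be the eigengap heuristic: look at the eigenvalues of the normalized Laplacian (which the script above already computes) and choose k where the jump between successive eigenvalues is largest. A minimal sketch, assuming A is the combined affinity matrix from the script; estimate_k and the k_max bound are my own hypothetical choices, not part of the librosa example:

import numpy as np
import scipy.sparse.csgraph

def estimate_k(A, k_min=2, k_max=64):
    L = scipy.sparse.csgraph.laplacian(A, normed=True)
    evals = np.linalg.eigvalsh(L)        # eigenvalues in ascending order
    gaps = np.diff(evals[:k_max + 1])    # gap between successive eigenvalues
    k = int(np.argmax(gaps)) + 1         # largest gap suggests the cluster count
    return max(k, k_min)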


Additional Info

There may also be a drum track in the background and sometimes multiple sounds are played at the same time. If these multiple sound groups get interpreted as one unique sound that's ok, but I'd obviously prefer if they could be distinguished as separate sounds.

If it makes it easier you can remove a drum loop using

y, sr = librosa.load('exampleSong.mp3')
y_harmonic, y_percussive = librosa.effects.hpss(y)

Update

I was able to separate the sounds by transients. This kind of works, but it splits the audio into too many sounds; from what I could tell, it was mostly just splitting some sounds into two. I can also create a MIDI file from the software I'm using and use that to determine the transient times, but I would like to solve this problem without the MIDI file if I can. The MIDI file was pretty accurate: it split the sound file into 33 sections, whereas the transient code split it into 40 sections. Here's a visualization of the MIDI:

[image: visualization of the MIDI]

So the parts that still need to be solved are (see the sketch below for a possible starting point on both):

  • Better transient separation
  • Sorting the sounds
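A hedged sketch of both steps using librosa's onset detector and a similarity-based ordering. The file name and the peak-picking parameters are assumptions; raising delta and wait makes peak picking more conservative, which may merge the over-split sounds mentioned above:

import numpy as np
import librosa
import scipy.cluster.hierarchy as hierarchy

y, sr = librosa.load('song.wav')  # hypothetical file name

# 1) Transient separation: backtrack each onset to the preceding energy
#    minimum so the splits land at the start of each sound.
onsets = librosa.onset.onset_detect(y=y, sr=sr, backtrack=True,
                                    delta=0.1, wait=4)
chunks = [c for c in np.split(y, librosa.frames_to_samples(onsets))
          if len(c) > 0]

# 2) Sorting: summarize each chunk by its mean MFCC vector, then order the
#    chunks so that similar ones end up adjacent (dendrogram leaf order).
feats = np.array([librosa.feature.mfcc(y=c, sr=sr).mean(axis=1)
                  for c in chunks])
order = hierarchy.leaves_list(hierarchy.linkage(feats, method='average'))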
1 Answer

Answer from Jon Nordby:

Below is a script that uses Non-negative Matrix Factorization (NMF) on mel-spectrograms to decompose the input audio. I took the first few seconds of complete audio from your uploaded WAV and ran the code to get the following output. Both the code and the audio clip can be found in the GitHub repository.

[image: NMF decomposition output]

This approach seems to work reasonably well on short audio clips when the BPM is known (around 130 in the given example) and the input audio is roughly aligned to the beat. There is no guarantee it will work as well on a whole song, or on other songs.

There are many ways it could be improved:

  • Using a more compact and perceptual feature vector than the mel-spectrogram as NMF input. Possibly a transformation learned from music, such as an embedding or an autoencoder.
  • De-duplicating NMF components into "primary" components (see the sketch after the script below).
  • Adding constraints to the NMF, such as temporal continuity. There are lots of research papers out there.
  • Automatically detecting the BPM and doing the alignment.
  • Better perceptual sorting. Might want to have groups, such as chords, single tones, and percussive sounds.
import librosa
import numpy
import sklearn.decomposition
import skimage.color

from matplotlib import pyplot as plt
import librosa.display
import seaborn



def decompose_audio(y, sr, bpm, per_beat=8,
                    n_components=16, n_mels=128, fmin=100, fmax=6000):
    """
    Decompose audio using NMF spectrogram decomposition,
    using a fixed number of frames per beat (@per_beat) for a given @bpm
    NOTE: assumes audio to be aligned to the beat
    """
    
    interval = (60/bpm)/per_beat
    T = sklearn.decomposition.NMF(n_components)
    # pass the function's own parameters through (the original hard-coded them)
    S = numpy.abs(librosa.feature.melspectrogram(y=y, sr=sr,
                                                 hop_length=int(sr*interval),
                                                 n_mels=n_mels, fmin=fmin, fmax=fmax))
    
    comps, acts = librosa.decompose.decompose(S, transformer=T, sort=False)
    
    # compute feature to sort components by
    ind = numpy.apply_along_axis(numpy.argmax, 0, comps)
    #ind = librosa.feature.spectral_rolloff(S=comps)[0]
    #ind = librosa.feature.spectral_centroid(S=comps)[0]

    # apply sorting
    order_idx = numpy.argsort(ind)
    ordered_comps = comps[:,order_idx]
    ordered_acts = acts[order_idx,:]
    
    # plot components
    librosa.display.specshow(librosa.amplitude_to_db(ordered_comps,
                                                  ref=numpy.max),y_axis='mel', sr=sr)
    
    return S, ordered_comps, ordered_acts



def plot_colorized_activations(acts, ax, hop_length=None, sr=None, value_mod=1.0):

    hsv = numpy.stack([
        numpy.ones(shape=acts.shape),
        numpy.ones(shape=acts.shape),
        acts,
    ], axis=-1)

    # Set hue based on a palette
    colors = seaborn.color_palette("husl", hsv.shape[0])
    for row_no in range(hsv.shape[0]):
        c = colors[row_no]
        c = skimage.color.rgb2hsv(numpy.stack([c]))[0]
        hsv[row_no, :, 0] = c[0]
        hsv[row_no, :, 1] = c[1]
        hsv[row_no, :, 2] *= value_mod

    colored = skimage.color.hsv2rgb(hsv)
    
    # use same kind of order as librosa.specshow
    flipped = colored[::-1, :, :]

    ax.imshow(flipped)
    ax.set(aspect='auto')
    
    ax.tick_params(axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False)
    
    ax.tick_params(axis='both',
        which='both',
        bottom=False,
        left=False,
        top=False,
        labelbottom=False)
    

def plot_activations(S, acts):
    fig, ax = plt.subplots(nrows=4, ncols=1, figsize=(25, 15), sharex=False)
    
    # spectrogram
    db = librosa.amplitude_to_db(S, ref=numpy.max)
    librosa.display.specshow(db, ax=ax[0], y_axis='mel')

    # original activations
    librosa.display.specshow(acts, x_axis='time', ax=ax[1])

    # colorize
    plot_colorized_activations(acts, ax=ax[2], value_mod=3.0)

    # thresholded
    q = numpy.quantile(acts, 0.90, axis=0, keepdims=True) + 1e-9
    norm = acts / q
    threshold = numpy.quantile(norm, 0.93)
    plot_colorized_activations((norm > threshold).astype(float), ax=ax[3], value_mod=1.0)
    return fig

def main():
    audio_file = 'silence-end.wav'
    audio_bpm = 130
    sr = 22050
    audio, sr = librosa.load(audio_file, sr=sr)
    S, comps, acts = decompose_audio(y=audio, sr=sr, bpm=audio_bpm)
    fig = plot_activations(S, acts)
    fig.savefig('plot.png', transparent=False)

main()
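A possible starting point for the "de-duplicating components" item above: merge NMF components whose spectra are nearly parallel, summing their activations. This is my own hypothetical post-processing, not part of the script, and the 0.95 threshold is arbitrary. The bpm argument could likewise be estimated automatically with librosa.beat.beat_track.

import numpy

def merge_similar_components(comps, acts, threshold=0.95):
    """Merge NMF components whose spectra have cosine similarity > threshold."""
    norm = comps / (numpy.linalg.norm(comps, axis=0, keepdims=True) + 1e-9)
    sim = norm.T @ norm                   # cosine similarity between component spectra
    merged = numpy.zeros(acts.shape[0], dtype=bool)
    keep = []
    for i in range(sim.shape[0]):
        if merged[i]:
            continue
        dupes = numpy.flatnonzero((sim[i] > threshold) & ~merged)
        acts[i] = acts[dupes].sum(axis=0)  # pool activations of the duplicates
        merged[dupes] = True
        keep.append(i)
    return comps[:, keep], acts[keep]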