
python - Plot the time range of each unique sound loop in a song, with rows sorted by sound similarity, using Python Librosa


Background
Here's a video of a clip from an electronic song. At the start of the video the song plays at full speed; when you slow the song down, you can hear all the unique sounds it uses. Some of those sounds repeat.

  • Mp3, Wav and MIDI of the audio in the video

Problem
What I want to do is create a visual like the one below, where a horizontal track/row is created for each unique sound, with a colored block on that track for each time frame of the song in which the sound plays. The tracks/rows should be sorted by how similar the sounds are to each other, with more similar sounds placed closer together. If two sounds are so identical that a human couldn't tell them apart, they should be treated as the same sound.

I'll accept an imperfect solution if it generally does what I'm asking for.

[image: the target grid visualization]

Watch the video linked above to see what I mean. It includes a visual grid that I created manually, which almost matches the grid I'm trying to generate.

For example, if each of the 5 waves below represented the sound wave a sound produces, each of those sounds would be considered similar and would be placed close together vertically on the grid.

[image: five similar waveforms]

Attempts
I've been looking at the Laplacian segmentation example in librosa. The graph labeled structure components looks like it might be what I need. From reading the paper, it looks like they're trying to split the song into segments such as chorus, verse, and bridge, but I essentially want to split the song into 1- or 2-beat segments.

Here's the code for the Laplacian segmentation (there's also a Jupyter Notebook, if you prefer).

    # -*- coding: utf-8 -*-
    """
    ======================
    Laplacian segmentation
    ======================

    This notebook implements the laplacian segmentation method of
    `McFee and Ellis, 2014 <http://bmcfee.github.io/papers/ismir2014_spectral.pdf>`_,
    with a couple of minor stability improvements.

    Throughout the example, we will refer to equations in the paper by number, so it will be
    helpful to read along.
    """

    # Code source: Brian McFee
    # License: ISC


    ###################################
    # Imports
    # - numpy for basic functionality
    # - scipy for graph Laplacian
    # - matplotlib for visualization
    # - sklearn.cluster for K-Means
    #
import numpy as np
import scipy.linalg
import scipy.ndimage
import scipy.sparse.csgraph
import matplotlib.pyplot as plt

import sklearn.cluster

    import librosa
    import librosa.display
    import matplotlib.patches as patches

#############################
# First, we'll load in a song
y, sr = librosa.load(librosa.ex('fishin'))


    ##############################################
    # Next, we'll compute and plot a log-power CQT
    BINS_PER_OCTAVE = 12 * 3
    N_OCTAVES = 7
C = librosa.amplitude_to_db(np.abs(librosa.cqt(y=y, sr=sr,
                                               bins_per_octave=BINS_PER_OCTAVE,
                                               n_bins=N_OCTAVES * BINS_PER_OCTAVE)),
                            ref=np.max)

fig, ax = plt.subplots()
librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                         bins_per_octave=BINS_PER_OCTAVE,
                         x_axis='time', ax=ax)


    ##########################################################
# To reduce dimensionality, we'll beat-synchronize the CQT
tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False)
Csync = librosa.util.sync(C, beats, aggregate=np.median)

# For plotting purposes, we'll need the timing of the beats
# we use fix_frames to include non-beat frames 0 and C.shape[1] (final frame)
beat_times = librosa.frames_to_time(librosa.util.fix_frames(beats,
                                                            x_min=0,
                                                            x_max=C.shape[1]),
                                    sr=sr)

fig, ax = plt.subplots()
librosa.display.specshow(Csync, bins_per_octave=12*3,
                         y_axis='cqt_hz', x_axis='time',
                         x_coords=beat_times, ax=ax)


    #####################################################################
    # Let's build a weighted recurrence matrix using beat-synchronous CQT
    # (Equation 1)
    # width=3 prevents links within the same bar
    # mode='affinity' here implements S_rep (after Eq. 8)
R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity',
                                      sym=True)

    # Enhance diagonals with a median filter (Equation 2)
    df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
    Rf = df(R, size=(1, 7))


    ###################################################################
    # Now let's build the sequence matrix (S_loc) using mfcc-similarity
    #
    # :math:`R_\text{path}[i, i\pm 1] = \exp(-\|C_i - C_{i\pm 1}\|^2 / \sigma^2)`
    #
    # Here, we take :math:`\sigma` to be the median distance between successive beats.
    #
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    Msync = librosa.util.sync(mfcc, beats)

    path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
    sigma = np.median(path_distance)
    path_sim = np.exp(-path_distance / sigma)

    R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)


    ##########################################################
    # And compute the balanced combination (Equations 6, 7, 9)

    deg_path = np.sum(R_path, axis=1)
    deg_rec = np.sum(Rf, axis=1)

    mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)

    A = mu * Rf + (1 - mu) * R_path


    ###########################################################
    # Plot the resulting graphs (Figure 1, left and center)
    fig, ax = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(10, 4))
librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time', x_axis='s',
                         y_coords=beat_times, x_coords=beat_times, ax=ax[0])
ax[0].set(title='Recurrence similarity')
ax[0].label_outer()
librosa.display.specshow(R_path, cmap='inferno_r', y_axis='time', x_axis='s',
                         y_coords=beat_times, x_coords=beat_times, ax=ax[1])
ax[1].set(title='Path similarity')
ax[1].label_outer()
librosa.display.specshow(A, cmap='inferno_r', y_axis='time', x_axis='s',
                         y_coords=beat_times, x_coords=beat_times, ax=ax[2])
ax[2].set(title='Combined graph')
ax[2].label_outer()


    #####################################################
    # Now let's compute the normalized Laplacian (Eq. 10)
    L = scipy.sparse.csgraph.laplacian(A, normed=True)


    # and its spectral decomposition
    evals, evecs = scipy.linalg.eigh(L)


    # We can clean this up further with a median filter.
    # This can help smooth over small discontinuities
    evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))


# Cumulative normalization is needed for symmetrically normalized Laplacian eigenvectors
    Cnorm = np.cumsum(evecs**2, axis=1)**0.5

    # If we want k clusters, use the first k normalized eigenvectors.
    # Fun exercise: see how the segmentation changes as you vary k

    k = 5

    X = evecs[:, :k] / Cnorm[:, k-1:k]


    # Plot the resulting representation (Figure 1, center and right)

    fig, ax = plt.subplots(ncols=2, sharey=True, figsize=(10, 5))
librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time', x_axis='time',
                         y_coords=beat_times, x_coords=beat_times, ax=ax[1])
ax[1].set(title='Recurrence similarity')
ax[1].label_outer()

librosa.display.specshow(X,
                         y_axis='time',
                         y_coords=beat_times, ax=ax[0])
ax[0].set(title='Structure components')


    #############################################################
    # Let's use these k components to cluster beats into segments
    # (Algorithm 1)
    KM = sklearn.cluster.KMeans(n_clusters=k)

    seg_ids = KM.fit_predict(X)


    # and plot the results
    fig, ax = plt.subplots(ncols=3, sharey=True, figsize=(10, 4))
    colors = plt.get_cmap('Paired', k)

librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time',
                         y_coords=beat_times, ax=ax[1])
ax[1].set(title='Recurrence matrix')
ax[1].label_outer()

librosa.display.specshow(X,
                         y_axis='time',
                         y_coords=beat_times, ax=ax[0])
ax[0].set(title='Structure components')

img = librosa.display.specshow(np.atleast_2d(seg_ids).T, cmap=colors,
                               y_axis='time', y_coords=beat_times, ax=ax[2])
ax[2].set(title='Estimated segments')
ax[2].label_outer()
fig.colorbar(img, ax=[ax[2]], ticks=range(k))


    ###############################################################
    # Locate segment boundaries from the label sequence
    bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])

    # Count beat 0 as a boundary
    bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)

    # Compute the segment label for each boundary
    bound_segs = list(seg_ids[bound_beats])

    # Convert beat indices to frames
    bound_frames = beats[bound_beats]

    # Make sure we cover to the end of the track
bound_frames = librosa.util.fix_frames(bound_frames,
                                       x_min=None,
                                       x_max=C.shape[1]-1)

    ###################################################
    # And plot the final segmentation over original CQT


    # sphinx_gallery_thumbnail_number = 5

    bound_times = librosa.frames_to_time(bound_frames)
freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
                                fmin=librosa.note_to_hz('C1'),
                                bins_per_octave=BINS_PER_OCTAVE)

    fig, ax = plt.subplots()
librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                         bins_per_octave=BINS_PER_OCTAVE,
                         x_axis='time', ax=ax)

for interval, label in zip(zip(bound_times, bound_times[1:]), bound_segs):
    ax.add_patch(patches.Rectangle((interval[0], freqs[0]),
                                   interval[1] - interval[0],
                                   freqs[-1],
                                   facecolor=colors(label),
                                   alpha=0.50))
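
As a rough illustration, and not part of the librosa example, the per-beat labels seg_ids and the beat_times computed above could be turned into the one-row-per-sound grid described in the problem statement: one row per cluster, with a colored block at every beat assigned to that cluster. Rows here are ordered by cluster label, not yet by similarity.

# Sketch only: build a per-sound grid from the segmentation output.
# Uses seg_ids, beat_times, colors and k defined above.
fig, ax = plt.subplots(figsize=(12, 4))
for row, seg in enumerate(sorted(set(seg_ids))):
    for i in np.flatnonzero(seg_ids == seg):
        ax.add_patch(patches.Rectangle((beat_times[i], row),
                                       beat_times[i + 1] - beat_times[i], 0.9,
                                       facecolor=colors(seg)))
ax.set(xlim=(0, beat_times[-1]), ylim=(0, k),
       xlabel='Time (s)', ylabel='Row (cluster)')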


One major thing I think would have to change is the number of clusters; in the example they use 5, but I don't know what I'd want it to be since I don't know how many sounds there are. I set it to 400, which produced the following result, and that doesn't really feel like something I can work with. Ideally I would want all blocks to be solid colors: not colors between the max red and blue values.

[image: clustering result with k = 400]

(I turned it sideways so it looks more like my example above and more like the output I'm trying to produce.)
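
One way to avoid hand-picking the cluster count, sketched below assuming the eigendecomposition from the example above is available, is the eigengap heuristic: the largest gap in the sorted Laplacian eigenvalues often suggests a natural number of clusters. The max_k cutoff here is a made-up tuning knob, not something from the example.

# Sketch of the eigengap heuristic, using `evals` from scipy.linalg.eigh(L) above.
def estimate_k(evals, max_k=64):
    gaps = np.diff(evals[:max_k])    # spacing between successive eigenvalues
    return int(np.argmax(gaps)) + 1  # a large gap after k eigenvalues suggests k clusters

k = estimate_k(evals)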

Additional information
There may also be a drum track in the background, and sometimes multiple sounds play at the same time. If those groups of simultaneous sounds get interpreted as one unique sound, that's okay, but obviously I'd prefer if they could be distinguished as separate sounds.

If it makes things easier, you can use

y, sr = librosa.load('exampleSong.mp3')
y_harmonic, y_percussive = librosa.effects.hpss(y)
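
For example, a minimal sketch (assuming the Laplacian-segmentation pipeline above) that computes the CQT from only the harmonic component, so the drums don't dominate the similarity comparisons:

# Sketch only: run the CQT step on the harmonic part of the signal.
C = librosa.amplitude_to_db(np.abs(librosa.cqt(y=y_harmonic, sr=sr,
                                               bins_per_octave=BINS_PER_OCTAVE,
                                               n_bins=N_OCTAVES * BINS_PER_OCTAVE)),
                            ref=np.max)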

Update
I was able to separate the sounds by transients. This sort of works at the moment, but it splits the audio into too many sounds, and from what I can tell it mostly just splits some sounds in two. I can also create a MIDI file from the software I'm using and use that to determine the transient times, but I'd like to solve this without the MIDI file if I can. The MIDI file was very accurate, splitting the sound file into 33 sections, whereas the transient code split it into 40 sections. Here's a visualization of the MIDI:
[image: MIDI visualization]
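
For reference, a minimal sketch of the kind of transient-based splitting described above, using librosa's onset detector; the file name is a placeholder, and backtrack=True moves each detected onset back to the preceding energy minimum so each slice starts at the attack:

import librosa

y, sr = librosa.load('clip.wav')  # placeholder file name
onsets = librosa.onset.onset_detect(y=y, sr=sr, units='samples', backtrack=True)
slices = [y[start:end] for start, end in zip(onsets, list(onsets[1:]) + [len(y)])]
print(f"split into {len(slices)} sections")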
So the parts still to be solved are:

  • better transient separation
  • sorting the sounds
Best Answer

Below is a script that uses Non-negative Matrix Factorization (NMF) on the mel-spectrogram to decompose the input audio. I took the first few seconds of the full audio from the WAV you uploaded and ran the code on it to get the output below.
The code and the audio clip can both be found in the Github repository.

[image: NMF decomposition output]

The approach seems to perform quite reasonably on short audio clips when the BPM is known (it seems to be around 130 in the given example) and the input audio is roughly aligned to the beat. There's no guarantee it would work as well on a whole song, or on other songs.
There are many ways it could be improved:

  • Use a more compact, perceptually motivated vector than the mel-spectrogram as the NMF input. Possibly a transformation learned from music, or an autoencoder embedding.
  • De-duplicate the NMF components into "primary" components.
  • Add constraints to the NMF, such as temporal ones. There are lots of research papers on this.
  • Automatically detect the BPM and do the alignment (see the sketch after this list).
  • Better perceptual sorting. Some grouping might be needed, e.g. chords, single tones, percussion.
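
As a minimal sketch of the BPM-detection item, hedged on librosa's beat tracker giving a usable estimate for this material:

# Sketch only: estimate the tempo instead of hard-coding audio_bpm=130.
import librosa

y, sr = librosa.load('silence-end.wav')  # file name from the script below
tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
print(f"estimated BPM: {float(tempo):.1f}")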

import os.path
    import sys

    import librosa
    import pandas
    import numpy
    import sklearn.decomposition
    import skimage.color

    from matplotlib import pyplot as plt
    import librosa.display
    import seaborn



def decompose_audio(y, sr, bpm, per_beat=8,
                    n_components=16, n_mels=128, fmin=100, fmax=6000):
    """
    Decompose audio using NMF spectrogram decomposition,
    using a fixed number of frames per beat (@per_beat) for a given @bpm
    NOTE: assumes audio to be aligned to the beat
    """
    interval = (60/bpm)/per_beat
    T = sklearn.decomposition.NMF(n_components)
    S = numpy.abs(librosa.feature.melspectrogram(y=y, hop_length=int(sr*interval),
                                                 n_mels=n_mels, fmin=fmin, fmax=fmax))

    comps, acts = librosa.decompose.decompose(S, transformer=T, sort=False)

    # compute feature to sort components by
    ind = numpy.apply_along_axis(numpy.argmax, 0, comps)
    #ind = librosa.feature.spectral_rolloff(S=comps)[0]
    #ind = librosa.feature.spectral_centroid(S=comps)[0]

    # apply sorting
    order_idx = numpy.argsort(ind)
    ordered_comps = comps[:, order_idx]
    ordered_acts = acts[order_idx, :]

    # plot components
    librosa.display.specshow(librosa.amplitude_to_db(ordered_comps, ref=numpy.max),
                             y_axis='mel', sr=sr)

    return S, ordered_comps, ordered_acts



def plot_colorized_activations(acts, ax, hop_length=None, sr=None, value_mod=1.0):

    hsv = numpy.stack([
        numpy.ones(shape=acts.shape),
        numpy.ones(shape=acts.shape),
        acts,
    ], axis=-1)

    # Set hue based on a palette
    colors = seaborn.color_palette("husl", hsv.shape[0])
    for row_no in range(hsv.shape[0]):
        c = colors[row_no]
        c = skimage.color.rgb2hsv(numpy.stack([c]))[0]
        hsv[row_no, :, 0] = c[0]
        hsv[row_no, :, 1] = c[1]
        hsv[row_no, :, 2] *= value_mod

    colored = skimage.color.hsv2rgb(hsv)

    # use same kind of order as librosa.specshow
    flipped = colored[::-1, :, :]

    ax.imshow(flipped)
    ax.set(aspect='auto')

    ax.tick_params(axis='x',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False)

    ax.tick_params(axis='both',
                   which='both',
                   bottom=False,
                   left=False,
                   top=False,
                   labelbottom=False)


def plot_activations(S, acts):
    fig, ax = plt.subplots(nrows=4, ncols=1, figsize=(25, 15), sharex=False)

    # spectrogram
    db = librosa.amplitude_to_db(S, ref=numpy.max)
    librosa.display.specshow(db, ax=ax[0], y_axis='mel')

    # original activations
    librosa.display.specshow(acts, x_axis='time', ax=ax[1])

    # colorize
    plot_colorized_activations(acts, ax=ax[2], value_mod=3.0)

    # thresholded
    q = numpy.quantile(acts, 0.90, axis=0, keepdims=True) + 1e-9
    norm = acts / q
    threshold = numpy.quantile(norm, 0.93)
    plot_colorized_activations((norm > threshold).astype(float), ax=ax[3], value_mod=1.0)
    return fig

def main():
    audio_file = 'silence-end.wav'
    audio_bpm = 130
    sr = 22050
    audio, sr = librosa.load(audio_file, sr=sr)
    S, comps, acts = decompose_audio(y=audio, sr=sr, bpm=audio_bpm)
    fig = plot_activations(S, acts)
    fig.savefig('plot.png', transparent=False)

main()

Regarding python - plotting the time range of each unique sound loop in a song, with rows sorted by sound similarity using Python Librosa, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/65247230/
