UPDATE

# update audio_operation: # Add normalize_0_1 and denormalize_0_1 function # Add preemphasis and inv_preemphasis function # Add spec2wav function # Optimize audio waveform preprocessing steps
2021-04-25 18:04:53 +08:00 · 2021-04-25 18:04:53 +08:00 · d63c76db75
parent eec6b3e752
commit d63c76db75
1 changed files with 115 additions and 2 deletions
--- a/audio_operation.py
+++ b/audio_operation.py
@ -2,6 +2,7 @@ import librosa
 import scipy
 import scipy.fftpack
 import numpy as np
+from scipy import signal

 import hparams

@ -24,6 +25,11 @@ def read_wav(path, sr, duration=None, mono=True):
    return wav


+def save_wav(path, wav, sr):
+    librosa.output.write_wav(path=path, y=wav, sr=sr)
+    pass
+
+
 def amp2db(amp):
    return librosa.amplitude_to_db(amp)

@ -32,6 +38,58 @@ def db2amp(db):
    return librosa.db_to_amplitude(db)


+def normalize_0_1(values, max_db, min_db):
+    normalized = np.clip((values - min_db) / (max_db - min_db), 0, 1)
+    return normalized
+
+
+def denormalize_0_1(normalized, max_db, min_db):
+    values = np.clip(normalized, 0, 1) * (max_db - min_db) + min_db
+    return values
+
+
+def preemphasis(wav, coeff=0.97):
+    """
+    Emphasize high frequency range of the waveform by increasing power(squared amplitude).
+
+    Parameters
+    ----------
+    wav : np.ndarray [shape=(n,)]
+        Real-valued the waveform.
+
+    coeff: float <= 1 [scalar]
+        Coefficient of pre-emphasis.
+
+    Returns
+    -------
+    preem_wav : np.ndarray [shape=(n,)]
+        The pre-emphasized waveform.
+    """
+    preem_wav = signal.lfilter([1, -coeff], [1], wav)
+    return preem_wav
+
+
+def inv_preemphasis(preem_wav, coeff=0.97):
+    """
+    Invert the pre-emphasized waveform to the original waveform.
+
+    Parameters
+    ----------
+    preem_wav : np.ndarray [shape=(n,)]
+        The pre-emphasized waveform.
+
+    coeff: float <= 1 [scalar]
+        Coefficient of pre-emphasis.
+
+    Returns
+    -------
+    wav : np.ndarray [shape=(n,)]
+        Real-valued the waveform.
+    """
+    wav = signal.lfilter([1], [1, -coeff], preem_wav)
+    return wav
+
+
 def get_random_crop(length, crop_length):
    start = np.random.choice(range(np.maximum(1, length - crop_length)), 1)[0]
    end = start + crop_length
@ -40,6 +98,9 @@ def get_random_crop(length, crop_length):


 def _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc):
+    # Pre-emphasis
+    wav = preemphasis(wav, coeff=hparams.timit_preemphasis)
+
    # Get spectrogram
    # (1 + n_fft/2, t)
    spec = librosa.stft(y=wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
@ -58,6 +119,9 @@ def _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc):
    # Get mfccs
    mfccs = scipy.fftpack.dct(mel_db, axis=0, type=2, norm='ortho')[:n_mfcc]

+    mag_db = normalize_0_1(mag_db, hparams.timit_max_db, hparams.timit_min_db)
+    mel_db = normalize_0_1(mel_db, hparams.timit_max_db, hparams.timit_min_db)
+
    debug = False
    if debug:
        print("wav.shape:" + str(wav.shape))
@ -76,7 +140,7 @@ def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
    sr = hparams.timit_sr
    n_fft = hparams.timit_n_fft
    hop_length = hparams.timit_hop_length
-    win_length = hparams.timit_wim_length
+    win_length = hparams.timit_win_length
    n_mels = hparams.timit_n_mels
    n_mfcc = hparams.timit_n_mfcc
    default_duration = hparams.timit_default_duration
@ -136,7 +200,7 @@ def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
 def get_mfccs_and_spectrogram(wav_file, trim=True, random_crop=False):
    sr = hparams.timit_sr
    hop_length = hparams.timit_hop_length
-    win_length = hparams.timit_wim_length
+    win_length = hparams.timit_win_length
    n_fft = hparams.timit_n_fft
    n_mels = hparams.timit_n_mels
    n_mfcc = hparams.timit_n_mfcc
@ -163,3 +227,52 @@ def get_mfccs_and_spectrogram(wav_file, trim=True, random_crop=False):
        print("wav.shape : " + str(wav.shape))

    return _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc)
+
+
+def spec2wav(mag, n_fft, win_length, hop_length, num_iters, phase=None):
+    """
+        Get a waveform from the magnitude spectrogram by Griffin-Lim Algorithm.
+
+        Parameters
+        ----------
+        mag : np.ndarray [shape=(1 + n_fft/2, t)]
+            Magnitude spectrogram.
+
+        n_fft : int > 0 [scalar]
+            FFT window size.
+
+        win_length  : int <= n_fft [scalar]
+            The window will be of length `win_length` and then padded
+            with zeros to match `n_fft`.
+
+        hop_length : int > 0 [scalar]
+            Number audio of frames between STFT columns.
+
+        num_iters: int > 0 [scalar]
+            Number of iterations of Griffin-Lim Algorithm.
+
+        phase : np.ndarray [shape=(1 + n_fft/2, t)]
+            Initial phase spectrogram.
+
+        Returns
+        -------
+        wav : np.ndarray [shape=(n,)]
+            The real-valued waveform.
+
+        """
+    assert (num_iters > 0)
+    if phase is None:
+        phase = np.pi * np.random.rand(*mag.shape)
+    stft = mag * np.exp(1.j * phase)
+    wav = None
+    for i in range(num_iters):
+        wav = librosa.istft(stft, win_length=win_length, hop_length=hop_length)
+        if i != num_iters - 1:
+            stft = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
+            _, phase = librosa.magphase(stft)
+            phase = np.angle(phase)
+            a, b = phase.shape
+            # phase = phase.reshape(a, b, 1)
+            phase = phase.reshape(a, b)
+            stft = mag * np.exp(1.j * phase)
+    return wav