UPDATE
# update audio_operation: # Add normalize_0_1 and denormalize_0_1 function # Add preemphasis and inv_preemphasis function # Add spec2wav function # Optimize audio waveform preprocessing steps
This commit is contained in:
parent
eec6b3e752
commit
d63c76db75
|
@ -2,6 +2,7 @@ import librosa
|
||||||
import scipy
|
import scipy
|
||||||
import scipy.fftpack
|
import scipy.fftpack
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from scipy import signal
|
||||||
|
|
||||||
import hparams
|
import hparams
|
||||||
|
|
||||||
|
@ -24,6 +25,11 @@ def read_wav(path, sr, duration=None, mono=True):
|
||||||
return wav
|
return wav
|
||||||
|
|
||||||
|
|
||||||
|
def save_wav(path, wav, sr):
    """
    Write a waveform to an audio file through librosa.

    Parameters
    ----------
    path : destination passed to ``librosa.output.write_wav`` as ``path``.

    wav : waveform passed to ``librosa.output.write_wav`` as ``y``.

    sr : sampling rate passed to ``librosa.output.write_wav`` as ``sr``.
    """
    # NOTE(review): the `librosa.output` module is not present in
    # librosa >= 0.8 -- confirm the pinned librosa version.
    write = librosa.output.write_wav
    write(path=path, y=wav, sr=sr)
|
||||||
|
|
||||||
|
|
||||||
def amp2db(amp):
|
def amp2db(amp):
|
||||||
return librosa.amplitude_to_db(amp)
|
return librosa.amplitude_to_db(amp)
|
||||||
|
|
||||||
|
@ -32,6 +38,58 @@ def db2amp(db):
|
||||||
return librosa.db_to_amplitude(db)
|
return librosa.db_to_amplitude(db)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_0_1(values, max_db, min_db):
    """
    Linearly rescale values from [min_db, max_db] to [0, 1], clipping the rest.

    Parameters
    ----------
    values : scalar or np.ndarray
        Values to rescale.

    max_db : scalar
        Input value that maps to 1.

    min_db : scalar
        Input value that maps to 0.

    Returns
    -------
    normalized : same shape as `values`
        Rescaled values, clipped to the closed interval [0, 1].
    """
    span = max_db - min_db
    scaled = (values - min_db) / span
    # Anything outside [min_db, max_db] saturates at the nearest bound.
    return np.clip(scaled, 0, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def denormalize_0_1(normalized, max_db, min_db):
    """
    Map values from [0, 1] back onto the range [min_db, max_db].

    Parameters
    ----------
    normalized : scalar or np.ndarray
        Values expected in [0, 1]; anything outside is clipped first.

    max_db : scalar
        Output value produced for an input of 1.

    min_db : scalar
        Output value produced for an input of 0.

    Returns
    -------
    values : same shape as `normalized`
        Rescaled values lying in [min_db, max_db].
    """
    # Clip before scaling so the output never leaves [min_db, max_db].
    bounded = np.clip(normalized, 0, 1)
    return bounded * (max_db - min_db) + min_db
|
||||||
|
|
||||||
|
|
||||||
|
def preemphasis(wav, coeff=0.97):
    """
    Apply a first-order pre-emphasis filter to a waveform.

    Each output sample is ``wav[n] - coeff * wav[n - 1]``, which boosts the
    high-frequency content relative to the low-frequency content.

    Parameters
    ----------
    wav : np.ndarray [shape=(n,)]
        The real-valued waveform.

    coeff : float <= 1 [scalar]
        Pre-emphasis coefficient.

    Returns
    -------
    preem_wav : np.ndarray [shape=(n,)]
        The pre-emphasized waveform.
    """
    # FIR filter: numerator taps [1, -coeff], trivial denominator [1].
    numerator = [1, -coeff]
    denominator = [1]
    return signal.lfilter(numerator, denominator, wav)
|
||||||
|
|
||||||
|
|
||||||
|
def inv_preemphasis(preem_wav, coeff=0.97):
    """
    Undo pre-emphasis and recover the original waveform.

    Each output sample is ``preem_wav[n] + coeff * wav[n - 1]``, the
    recursive inverse of the filter ``wav[n] - coeff * wav[n - 1]``.

    Parameters
    ----------
    preem_wav : np.ndarray [shape=(n,)]
        The pre-emphasized waveform.

    coeff : float <= 1 [scalar]
        Pre-emphasis coefficient that was used to produce `preem_wav`.

    Returns
    -------
    wav : np.ndarray [shape=(n,)]
        The real-valued waveform.
    """
    # IIR filter: trivial numerator [1], denominator taps [1, -coeff].
    numerator = [1]
    denominator = [1, -coeff]
    return signal.lfilter(numerator, denominator, preem_wav)
|
||||||
|
|
||||||
|
|
||||||
def get_random_crop(length, crop_length):
|
def get_random_crop(length, crop_length):
|
||||||
start = np.random.choice(range(np.maximum(1, length - crop_length)), 1)[0]
|
start = np.random.choice(range(np.maximum(1, length - crop_length)), 1)[0]
|
||||||
end = start + crop_length
|
end = start + crop_length
|
||||||
|
@ -40,6 +98,9 @@ def get_random_crop(length, crop_length):
|
||||||
|
|
||||||
|
|
||||||
def _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc):
|
def _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc):
|
||||||
|
# Pre-emphasis
|
||||||
|
wav = preemphasis(wav, coeff=hparams.timit_preemphasis)
|
||||||
|
|
||||||
# Get spectrogram
|
# Get spectrogram
|
||||||
# (1 + n_fft/2, t)
|
# (1 + n_fft/2, t)
|
||||||
spec = librosa.stft(y=wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
|
spec = librosa.stft(y=wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
|
||||||
|
@ -58,6 +119,9 @@ def _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc):
|
||||||
# Get mfccs
|
# Get mfccs
|
||||||
mfccs = scipy.fftpack.dct(mel_db, axis=0, type=2, norm='ortho')[:n_mfcc]
|
mfccs = scipy.fftpack.dct(mel_db, axis=0, type=2, norm='ortho')[:n_mfcc]
|
||||||
|
|
||||||
|
mag_db = normalize_0_1(mag_db, hparams.timit_max_db, hparams.timit_min_db)
|
||||||
|
mel_db = normalize_0_1(mel_db, hparams.timit_max_db, hparams.timit_min_db)
|
||||||
|
|
||||||
debug = False
|
debug = False
|
||||||
if debug:
|
if debug:
|
||||||
print("wav.shape:" + str(wav.shape))
|
print("wav.shape:" + str(wav.shape))
|
||||||
|
@ -76,7 +140,7 @@ def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
|
||||||
sr = hparams.timit_sr
|
sr = hparams.timit_sr
|
||||||
n_fft = hparams.timit_n_fft
|
n_fft = hparams.timit_n_fft
|
||||||
hop_length = hparams.timit_hop_length
|
hop_length = hparams.timit_hop_length
|
||||||
win_length = hparams.timit_wim_length
|
win_length = hparams.timit_win_length
|
||||||
n_mels = hparams.timit_n_mels
|
n_mels = hparams.timit_n_mels
|
||||||
n_mfcc = hparams.timit_n_mfcc
|
n_mfcc = hparams.timit_n_mfcc
|
||||||
default_duration = hparams.timit_default_duration
|
default_duration = hparams.timit_default_duration
|
||||||
|
@ -136,7 +200,7 @@ def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
|
||||||
def get_mfccs_and_spectrogram(wav_file, trim=True, random_crop=False):
|
def get_mfccs_and_spectrogram(wav_file, trim=True, random_crop=False):
|
||||||
sr = hparams.timit_sr
|
sr = hparams.timit_sr
|
||||||
hop_length = hparams.timit_hop_length
|
hop_length = hparams.timit_hop_length
|
||||||
win_length = hparams.timit_wim_length
|
win_length = hparams.timit_win_length
|
||||||
n_fft = hparams.timit_n_fft
|
n_fft = hparams.timit_n_fft
|
||||||
n_mels = hparams.timit_n_mels
|
n_mels = hparams.timit_n_mels
|
||||||
n_mfcc = hparams.timit_n_mfcc
|
n_mfcc = hparams.timit_n_mfcc
|
||||||
|
@ -163,3 +227,52 @@ def get_mfccs_and_spectrogram(wav_file, trim=True, random_crop=False):
|
||||||
print("wav.shape : " + str(wav.shape))
|
print("wav.shape : " + str(wav.shape))
|
||||||
|
|
||||||
return _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc)
|
return _get_mfcc_and_spec(wav, sr, n_fft, hop_length, win_length, n_mels, n_mfcc)
|
||||||
|
|
||||||
|
|
||||||
|
def spec2wav(mag, n_fft, win_length, hop_length, num_iters, phase=None):
    """
    Reconstruct a waveform from a magnitude spectrogram (Griffin-Lim Algorithm).

    The magnitude is held fixed while the phase estimate is refined by
    repeatedly inverting to a waveform and re-analysing that waveform.

    Parameters
    ----------
    mag : np.ndarray [shape=(1 + n_fft/2, t)]
        Magnitude spectrogram.

    n_fft : int > 0 [scalar]
        FFT window size.

    win_length : int <= n_fft [scalar]
        The window will be of length `win_length` and then padded
        with zeros to match `n_fft`.

    hop_length : int > 0 [scalar]
        Number of audio frames between STFT columns.

    num_iters : int > 0 [scalar]
        Number of iterations of the Griffin-Lim Algorithm.

    phase : np.ndarray [shape=(1 + n_fft/2, t)]
        Initial phase spectrogram. When omitted, a random phase drawn
        uniformly from [0, pi) is used.

    Returns
    -------
    wav : np.ndarray [shape=(n,)]
        The real-valued waveform.
    """
    assert (num_iters > 0)

    # No starting phase supplied: draw one at random.
    if phase is None:
        phase = np.pi * np.random.rand(*mag.shape)

    # Complex spectrogram = fixed magnitude * current phase estimate.
    stft = mag * np.exp(1.j * phase)
    wav = None
    final_step = num_iters - 1
    for step in range(num_iters):
        wav = librosa.istft(stft, win_length=win_length, hop_length=hop_length)
        if step == final_step:
            # The last inversion is the result; no further refinement.
            break

        # Re-analyse the estimate and keep only its phase.
        stft = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
        _, unit_phasor = librosa.magphase(stft)
        phase = np.angle(unit_phasor)
        rows, cols = phase.shape
        phase = phase.reshape(rows, cols)

        # Pair the refined phase with the original magnitude again.
        stft = mag * np.exp(1.j * phase)

    return wav
|
||||||
|
|
Loading…
Reference in New Issue