Python source code examples: hparams.hparams.sample_rate()
Example 1
def main():
    data_folder = "data"
    wavs = [os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder) if file.endswith(".wav")]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs]
    lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift
    for i, x in enumerate(wavs):
        X = lws_processor.stft(x)  # x is a single-channel waveform
        X0 = np.abs(X)  # magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))
        # Reconstruct from the magnitude (in general, one can reconstruct from
        # an initial complex spectrogram).
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))
        wav = lws_processor.istft(X1).astype(np.float32)
        audio.save_wav(wav, outputs_lws[i])
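All of these snippets read their audio settings off a project-wide hparams object. A minimal sketch of what they assume is below; real projects typically build this with tf.contrib.training.HParams and many more fields, and the values here are illustrative rather than taken from any one repository.

class HParams:
    sample_rate = 22050      # audio sample rate in Hz
    frame_shift_ms = 12.5    # hop between STFT frames, in milliseconds
    frame_length_ms = 50     # STFT window length, in milliseconds
    num_freq = 1025          # linear-spectrogram bins, so n_fft = (num_freq - 1) * 2
    num_mels = 80            # mel filterbank channels
    hop_size = None          # if None, derived from frame_shift_ms (see get_hop_size below)

hparams = HParams()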
Example 2
def _process_utterance(out_dir, index, wav_path, labels_path, text):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
    wav = wav[start:end]
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text)
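Here _max_out_length is a module-level frame limit that the snippet does not show. As a worked example with assumed values (_max_out_length = 700 frames, frame_shift_ms = 12.5, sample_rate = 22050):

max_samples = 700 * 12.5 / 1000 * 22050  # = 192937.5 samples, about 8.75 s of audio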
Example 3
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args
    audio_path = os.path.join(log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))
    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)
    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: use jamo')
        plot.plot_alignment(align, align_path, info=info_text,
                            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True),
                            isKorean=True)
    else:
        log('Training non-Korean: do not use jamo')
        plot.plot_alignment(align, align_path, info=info_text,
                            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False),
                            isKorean=False)
Example 4
def create_seed(filename, sample_rate, quantization_channels, window_size, scalar_input):
    # Use only the beginning of the seed audio.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else:
            return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)
        # If the seed is shorter than window_size it is returned as-is;
        # arguably it should at least be padded.
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                            lambda: tf.size(quantized),
                            lambda: tf.constant(window_size))
        return quantized[:cut_index]
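One way to address the question in that comment is to left-pad a short seed up to window_size. A sketch (the helper name is ours; pad_value=128 assumes 256 quantization channels, whose mu-law encoding of silence sits near mid-scale):

def pad_seed(quantized, window_size, pad_value=128):
    # Pad on the left so the most recent samples stay at the end of the window.
    shortfall = tf.maximum(tf.constant(window_size) - tf.size(quantized), 0)
    return tf.pad(quantized, [[shortfall, 0]], constant_values=pad_value)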
Example 5
def evaluate_model(model, data_loader, checkpoint_dir, limit_eval_to=5):
    """Evaluate the model, saving each generated wav and a plot of it."""
    test_path = data_loader.dataset.test_path
    test_files = os.listdir(test_path)
    counter = 0
    output_dir = os.path.join(checkpoint_dir, 'eval')
    os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists
    for f in test_files:
        if f.endswith("mel.npy"):
            mel = np.load(os.path.join(test_path, f))
            wav = model.generate(mel)
            # Save the wav
            wav_path = os.path.join(output_dir, "checkpoint_step{:09d}_wav_{}.wav".format(global_step, counter))
            librosa.output.write_wav(wav_path, wav, sr=hp.sample_rate)
            # Save a plot of the wav
            fig_path = os.path.join(output_dir, "checkpoint_step{:09d}_wav_{}.png".format(global_step, counter))
            plt.plot(wav.reshape(-1))
            plt.savefig(fig_path)
            # Clear the figure so the next plot doesn't draw on top of this one
            plt.clf()
            counter += 1
        # Stop evaluation early via limit_eval_to
        if counter >= limit_eval_to:
            break
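Note that librosa.output.write_wav was removed in librosa 0.8. On current librosa versions, a drop-in replacement using the soundfile package (assumed to be installed) is:

import soundfile as sf
sf.write(wav_path, wav, hp.sample_rate)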
Example 6
def mel_synthesis(out_dir='wav_griffi_syn', in_dir='mel'):
    os.makedirs(out_dir, exist_ok=True)
    mel_filenames = [x.split('.')[0] for x in os.listdir(in_dir)]
    start_time = time.time()
    for mel_file in mel_filenames:
        try:
            print('process {}'.format(mel_file))
            mel_file_path = os.path.join('training_data/mels', 'mel-{}.wav.npy'.format(mel_file))
            mel_spectro = np.load(mel_file_path)
            wav = inv_mel_spectrogram(mel_spectro.T, hparams)
            # Save the wav under test_mel_<file>
            save_wav(wav,
                     os.path.join(out_dir, 'test_mel_{}.wav'.format(
                         mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
                     sr=hparams.sample_rate)
        except Exception as e:
            print('{} error: {}'.format(mel_file, e))
    print('griffin-lim: {}'.format(time.time() - start_time))
Example 7
def save_wav(wav, path):
    # Scale the peak to the int16 range; the 0.01 floor avoids dividing by ~0 on silent clips.
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
Example 8
def _build_mel_basis():
    n_fft = (hparams.num_freq - 1) * 2
    return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
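This positional call matches librosa versions before 0.10; since librosa 0.10 the filterbank arguments are keyword-only, so the equivalent call would be:

librosa.filters.mel(sr=hparams.sample_rate, n_fft=n_fft, n_mels=hparams.num_mels)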
Example 9
def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size
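As a worked example with the assumed values from the hparams sketch above (frame_shift_ms = 12.5, sample_rate = 22050):

int(12.5 / 1000 * 22050)  # = int(275.625) = 275 samples per hop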
Example 10
def spectrogram2wav(spectrogram, n_iter=hparams.griffin_lim_iters, n_fft=(hparams.num_freq - 1) * 2,
                    win_length=int(hparams.frame_length_ms / 1000 * hparams.sample_rate),
                    hop_length=int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)):
    '''Converts a magnitude spectrogram into a waveform using Griffin-Lim's algorithm.'''
    def invert_spectrogram(spectrogram):
        '''spectrogram: [t, f]'''
        spectrogram = tf.expand_dims(spectrogram, 0)
        inversed = tf.contrib.signal.inverse_stft(spectrogram, win_length, hop_length, n_fft)
        squeezed = tf.squeeze(inversed, 0)
        return squeezed

    spectrogram = tf.transpose(spectrogram)
    spectrogram = tf.cast(spectrogram, dtype=tf.complex64)  # [t, f]
    X_best = tf.identity(spectrogram)
    for i in range(n_iter):
        # Invert the current estimate, re-analyze it, and keep only its phase.
        X_t = invert_spectrogram(X_best)
        est = tf.contrib.signal.stft(X_t, win_length, hop_length, n_fft, pad_end=False)  # [t, n_fft // 2 + 1]
        phase = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)  # [t, f]
        X_best = spectrogram * phase  # known magnitude, estimated phase; [t, f]
    X_t = invert_spectrogram(X_best)
    y = tf.real(X_t)
    return y
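A minimal usage sketch, assuming TensorFlow 1.x (tf.contrib was removed in TF 2.x) and a [f, t] magnitude spectrogram as input; the data here is random, just to show the shapes:

mag = np.abs(np.random.randn(hparams.num_freq, 100)).astype(np.float32)  # [f, t]
with tf.Session() as sess:
    wav = sess.run(spectrogram2wav(tf.constant(mag)))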
Example 11
def load_wav(path):
    return librosa.core.load(path, sr=hparams.sample_rate)[0]
Example 12
def save_wav(wav, path):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    scipy.io.wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
Example 13
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
    window_length = int(hparams.sample_rate * min_silence_sec)
    hop_length = int(window_length / 4)
    threshold = _db_to_amp(threshold_db)
    # Slide a window across the waveform; the first window that stays entirely
    # below the threshold marks the onset of trailing silence.
    for x in range(hop_length, len(wav) - window_length, hop_length):
        if np.max(wav[x:x + window_length]) < threshold:
            return x + hop_length
    return len(wav)
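A typical use is to cut synthesized audio at the first long stretch of silence:

wav = wav[:find_endpoint(wav)]  # drop everything after the detected endpoint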
Example 14
def _stft_parameters():
    n_fft = (hparams.num_freq - 1) * 2
    hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
    return n_fft, hop_length, win_length
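With the assumed hparams from the sketch above (num_freq = 1025, 50 ms windows, 12.5 ms shift, 22050 Hz) this returns:

n_fft = (1025 - 1) * 2                 # = 2048
hop_length = int(12.5 / 1000 * 22050)  # = 275
win_length = int(50 / 1000 * 22050)    # = 1102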
Example 15
def write_metadata(metadata, out_dir):
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sr = hparams.sample_rate
    hours = timesteps / sr / 3600
    print('Wrote {} utterances, {} mel frames, {} audio timesteps ({:.2f} hours)'.format(
        len(metadata), mel_frames, timesteps, hours))
    print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
    print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
    # Cast to int so the max is numeric rather than lexicographic over strings.
    print('Max audio timesteps length: {}'.format(max(int(m[3]) for m in metadata)))
Example 16
def save_wav(wav, path):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    # proposed by @dsmiller
    wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
Example 17
def _build_mel_basis():
    # fmax must not exceed the Nyquist frequency.
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels,
                               fmin=hparams.fmin, fmax=hparams.fmax)
Example 18
def save_wav(wav, path):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
Example 19
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
    self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]
    frame_shift_sec = hparams.hop_size / hparams.sample_rate  # seconds per mel frame
    hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_sec / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want
    # to be able to feed different batch sizes at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
        tf.placeholder(tf.int32, shape=(None,), name='input_lengths'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
        tf.placeholder(tf.int32, shape=(None,), name='mel_lengths'),
        tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
    ]

    # Create a queue for buffering data
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32],
                         name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    (self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths,
     self.token_targets, self.linear_targets) = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.mel_lengths.set_shape(self._placeholders[3].shape)
    self.token_targets.set_shape(self._placeholders[4].shape)
    self.linear_targets.set_shape(self._placeholders[5].shape)
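As a worked check of the duration bookkeeping above: with the assumed hop_size = 275 at 22050 Hz, each mel frame covers 275 / 22050 ≈ 12.47 ms, so 1,000,000 logged mel frames come to roughly 1e6 * 275 / 22050 / 3600 ≈ 3.46 hours.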
Example 20
def save_wav(wav, path):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    # librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
    wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))