Python source code examples: hparams.sample_rate
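All of the snippets below come from Tacotron- and WaveNet-style speech synthesis projects, where hparams is a module-level hyperparameter object and hparams.sample_rate is the audio sampling rate in Hz (an attribute, not a method). As a minimal sketch of what such an object looks like, assuming the TensorFlow 1.x HParams container these projects typically use (the field values here are purely illustrative):

import tensorflow as tf  # TensorFlow 1.x

# Illustrative hyperparameters; real projects define many more fields.
hparams = tf.contrib.training.HParams(
    sample_rate=22050,    # audio sampling rate in Hz
    num_mels=80,          # number of mel bands
    num_freq=1025,        # linear-spectrogram bins, i.e. n_fft // 2 + 1
    frame_shift_ms=12.5,  # hop between analysis frames, in milliseconds
    frame_length_ms=50,   # analysis window length, in milliseconds
)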

Example 1
def main():
    data_folder = "data"
    wavs = [os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder) if file.endswith(".wav")]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs]

    lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift
    for i, x in enumerate(wavs):
        X = lws_processor.stft(x)  # x is a single-channel waveform
        X0 = np.abs(X)  # magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))
        # Reconstruct from magnitude (in general, one can reconstruct from an
        # initial complex spectrogram).
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))
        wav = lws_processor.istft(X1).astype(np.float32)

        audio.save_wav(wav, outputs_lws[i])
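For reference, the lws parameters above fix the analysis geometry in samples; the corresponding durations depend on hparams.sample_rate. A quick illustration, assuming a 16 kHz rate (an assumption for this arithmetic only):

sample_rate = 16000                        # illustrative; the real value is hparams.sample_rate
window_length, window_shift = 512, 128     # the lws parameters used above
print(window_length / sample_rate * 1000)  # 32.0 ms analysis window
print(window_shift / sample_rate * 1000)   # 8.0 ms frame shift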
Example 2
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text) 
Example 3
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: use jamo')
        plot.plot_alignment(align, align_path, info=info_text,
                            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True), isKorean=True)
    else:
        log('Training non-Korean: do not use jamo')
        plot.plot_alignment(align, align_path, info=info_text,
                            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False), isKorean=False)
Example 4
def create_seed(filename, sample_rate, quantization_channels, window_size, scalar_input):
    # Use only the leading portion of the seed audio.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else:
            return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)

        # If the seed is shorter than the window it is returned as-is;
        # arguably it should be padded to window_size instead.
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                            lambda: tf.size(quantized),
                            lambda: tf.constant(window_size))

        return quantized[:cut_index]
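The tf.cond above merely clamps the slice length to window_size; an equivalent and more direct formulation uses TensorFlow's element-wise minimum (a sketch, not code from the original project):

# Equivalent to the tf.cond above: take at most window_size samples.
cut_index = tf.minimum(tf.size(quantized), window_size)
seed = quantized[:cut_index]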
Example 5
def evaluate_model(model, data_loader, checkpoint_dir, limit_eval_to=5):
    """Evaluate the model, saving the generated wav and a waveform plot."""
    test_path = data_loader.dataset.test_path
    test_files = os.listdir(test_path)
    counter = 0
    output_dir = os.path.join(checkpoint_dir, 'eval')
    for f in test_files:
        if f.endswith("mel.npy"):
            mel = np.load(os.path.join(test_path, f))
            wav = model.generate(mel)
            # Save the wav.
            wav_path = os.path.join(output_dir, "checkpoint_step{:09d}_wav_{}.wav".format(global_step, counter))
            librosa.output.write_wav(wav_path, wav, sr=hp.sample_rate)
            # Save the waveform plot.
            fig_path = os.path.join(output_dir, "checkpoint_step{:09d}_wav_{}.png".format(global_step, counter))
            plt.plot(wav.reshape(-1))
            plt.savefig(fig_path)
            # Clear the figure so the next iteration does not draw onto the same plot.
            plt.clf()
            counter += 1
        # Stop evaluation early via limit_eval_to.
        if counter >= limit_eval_to:
            break
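Note that librosa.output.write_wav was removed in librosa 0.8.0, so this example requires an older librosa. On current versions the usual replacement is the soundfile package (a sketch; wav, wav_path, and hp.sample_rate as above):

import soundfile as sf

# Equivalent write on librosa >= 0.8, where librosa.output no longer exists.
sf.write(wav_path, wav, hp.sample_rate)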
Example 6
def mel_synthesis(out_dir='wav_griffi_syn', in_dir='mel'):
    os.makedirs(out_dir, exist_ok=True)

    mel_filenames = [x.split('.')[0] for x in os.listdir(in_dir)]
    start_time = time.time()
    for mel_file in mel_filenames:
        try:
            print('process {}'.format(mel_file))
            mel_file_path = os.path.join('training_data/mels', 'mel-{}.wav.npy'.format(mel_file))
            mel_spectro = np.load(mel_file_path)
            wav = inv_mel_spectrogram(mel_spectro.T, hparams)
            # Save the wav under test_<folder>_<file>.
            save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(
                mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
                     sr=hparams.sample_rate)
        except Exception:
            print('{} error'.format(mel_file))

    print('griffin-lim: {}'.format(time.time() - start_time))
Example 7
def save_wav(wav, path):
    # Normalize to int16 full scale; max(0.01, ...) guards against division by near-zero peaks.
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
Example 8
def _build_mel_basis():
    n_fft = (hparams.num_freq - 1) * 2
    return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) 
Example 9
def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size 
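When hop_size is unset, it is derived from the frame shift in milliseconds. A worked example with the common Tacotron-style values frame_shift_ms=12.5 and sample_rate=22050 (illustrative; the real values come from hparams):

frame_shift_ms, sample_rate = 12.5, 22050  # illustrative values
hop_size = int(frame_shift_ms / 1000 * sample_rate)
print(hop_size)  # 275 samples per frame shift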
Example 10
def spectrogram2wav(spectrogram, n_iter=hparams.griffin_lim_iters, n_fft=(hparams.num_freq - 1) * 2,
                    win_length=int(hparams.frame_length_ms / 1000 * hparams.sample_rate),
                    hop_length=int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)):
    '''Converts a spectrogram into a waveform using the Griffin-Lim algorithm.
    '''

    def invert_spectrogram(spectrogram):
        '''
        spectrogram: [t, f]
        '''
        spectrogram = tf.expand_dims(spectrogram, 0)
        inversed = tf.contrib.signal.inverse_stft(spectrogram, win_length, hop_length, n_fft)
        squeezed = tf.squeeze(inversed, 0)
        return squeezed

    spectrogram = tf.transpose(spectrogram)

    spectrogram = tf.cast(spectrogram, dtype=tf.complex64)  # [t, f]
    X_best = tf.identity(spectrogram)
    for i in range(n_iter):
        X_t = invert_spectrogram(X_best)
        est = tf.contrib.signal.stft(X_t, win_length, hop_length, n_fft, pad_end=False)  # (t, n_fft/2 + 1)
        phase = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)  # [t, f]
        X_best = spectrogram * phase  # [t, f]
    X_t = invert_spectrogram(X_best)
    y = tf.real(X_t)

    return y
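The loop above is the classic Griffin-Lim iteration: invert the current complex estimate, re-analyze, keep only the phase, and reimpose the known magnitude. Outside a TensorFlow graph, librosa ships the same algorithm as librosa.griffinlim (a sketch under that assumption; S is the [freq, time] magnitude spectrogram and the parameters mirror the defaults above):

import librosa

# Griffin-Lim phase reconstruction; S is a magnitude spectrogram of shape [freq, time].
wav = librosa.griffinlim(S, n_iter=hparams.griffin_lim_iters,
                         hop_length=hop_length, win_length=win_length)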
Example 11
def load_wav(path):
  return librosa.core.load(path, sr=hparams.sample_rate)[0] 
Example 12
def save_wav(wav, path):
  wav *= 32767 / max(0.01, np.max(np.abs(wav)))
  scipy.io.wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example 13
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
  window_length = int(hparams.sample_rate * min_silence_sec)
  hop_length = int(window_length / 4)
  threshold = _db_to_amp(threshold_db)
  for x in range(hop_length, len(wav) - window_length, hop_length):
    if np.max(wav[x:x+window_length]) < threshold:
      return x + hop_length
  return len(wav) 
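find_endpoint returns the first position after which a full window of min_silence_sec stays below threshold_db, i.e. where trailing silence begins. Worked numbers, assuming sample_rate=22050 (illustrative):

sample_rate, min_silence_sec = 22050, 0.8           # illustrative values
window_length = int(sample_rate * min_silence_sec)  # 17640 samples (0.8 s)
hop_length = int(window_length / 4)                 # 4410 samples (0.2 s between tested positions)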
Example 14
def _stft_parameters():
  n_fft = (hparams.num_freq - 1) * 2
  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
  return n_fft, hop_length, win_length
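The n_fft expression inverts the relation num_freq = n_fft // 2 + 1, since a real FFT of length n_fft yields that many non-redundant frequency bins. For example, with the common value num_freq=1025 (illustrative):

num_freq = 1025             # illustrative; the real value is hparams.num_freq
n_fft = (num_freq - 1) * 2  # 2048
assert n_fft // 2 + 1 == num_freq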


Example 15
def write_metadata(metadata, out_dir):
	with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
		for m in metadata:
			f.write('|'.join([str(x) for x in m]) + '\n')
	mel_frames = sum([int(m[4]) for m in metadata])
	timesteps = sum([int(m[3]) for m in metadata])
	sr = hparams.sample_rate
	hours = timesteps / sr / 3600
	print('Wrote {} utterances, {} mel frames, {} audio timesteps ({:.2f} hours)'.format(
		len(metadata), mel_frames, timesteps, hours))
	print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
	print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
	print('Max audio timesteps length: {}'.format(max(int(m[3]) for m in metadata)))
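The hours figure is simply total samples divided by the sample rate and by 3600. For example, with illustrative totals of 158,400,000 audio timesteps at 22050 Hz:

timesteps, sr = 158400000, 22050     # illustrative totals
hours = timesteps / sr / 3600
print('{:.2f} hours'.format(hours))  # 2.00 hours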
Example 16
def save_wav(wav, path):
	wav *= 32767 / max(0.01, np.max(np.abs(wav))) 
	#proposed by @dsmiller
	wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example 17
def _build_mel_basis():
	assert hparams.fmax <= hparams.sample_rate // 2
	return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels,
							   fmin=hparams.fmin, fmax=hparams.fmax) 
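The assertion enforces the Nyquist limit: mel filter edges cannot exceed sample_rate / 2. Also note that librosa 0.10 made the arguments of librosa.filters.mel keyword-only, so on current librosa the call would read as follows (a sketch under that assumption; the values come from hparams as above):

import librosa

# librosa >= 0.10 requires keyword arguments here.
mel_basis = librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.fft_size,
                                n_mels=hparams.num_mels, fmin=hparams.fmin, fmax=hparams.fmax)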
Example 18
def save_wav(wav, path):
	wav *= 32767 / max(0.01, np.max(np.abs(wav)))
	librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate) 
Example 19
def __init__(self, coordinator, metadata_filename, hparams):
	super(Feeder, self).__init__()
	self._coord = coordinator
	self._hparams = hparams
	self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
	self._offset = 0

	# Load metadata
	self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
	self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
	with open(metadata_filename, encoding='utf-8') as f:
		self._metadata = [line.strip().split('|') for line in f]
		# NB: despite the name, this is the frame shift in seconds, not milliseconds.
		frame_shift_ms = hparams.hop_size / hparams.sample_rate
		hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / 3600
		log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

	# Create placeholders for inputs and targets. Don't specify batch size because we want
	# to be able to feed different batch sizes at eval time.
	self._placeholders = [
		tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
		tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
		tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
		tf.placeholder(tf.int32, shape=(None, ), name='mel_lengths'),
		tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
		tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
	]

	# Create a queue for buffering data.
	queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32], name='input_queue')
	self._enqueue_op = queue.enqueue(self._placeholders)
	self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths, self.token_targets, self.linear_targets = queue.dequeue()
	self.inputs.set_shape(self._placeholders[0].shape)
	self.input_lengths.set_shape(self._placeholders[1].shape)
	self.mel_targets.set_shape(self._placeholders[2].shape)
	self.mel_lengths.set_shape(self._placeholders[3].shape)
	self.token_targets.set_shape(self._placeholders[4].shape)
	self.linear_targets.set_shape(self._placeholders[5].shape)
Example 20
def save_wav(wav, path):
  wav *= 32767 / max(0.01, np.max(np.abs(wav)))
  # librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
  wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))