Python源码示例:tarfile.extract()

示例1
def sandboxed_tar_extract(self, sandbox, tarfile, first=None):
        """Extract only those members that are below the tarfile path 'sandbox'.

        The tarfile module official doc warns against attacks with .. in tar.
        Member names are therefore normalized before being compared with the
        sandbox, so that a name such as 'sandbox/../../etc/passwd' is ignored
        instead of being written outside of the sandbox.

        The option to start with a first member is useful for this case, since
        the recipe consumes a first member in the tar file to get the odoo
        main directory in parts.
        It is taken for granted that this first member has already been
        checked.

        NOTE: only member names are checked here; link members whose target
        points outside of the sandbox are not detected.

        :param sandbox: directory path inside the archive (without trailing
                        slash) below which members must be located.
        :param tarfile: open archive object; it is iterated over to get the
                        members and its ``extract`` method is called on them.
        :param first: optional member that is extracted unconditionally,
                      before any other.
        """
        # Local import: tar member names use '/' as separator, whatever the
        # platform, hence posixpath rather than os.path.
        import posixpath

        if first is not None:
            tarfile.extract(first)

        raw_prefix = sandbox + '/'
        normalized_prefix = posixpath.normpath(sandbox) + '/'

        for tinfo in tarfile:
            name = tinfo.name
            # A '/' is appended to the normalized name so that comparing
            # prefixes cannot match a sibling such as 'sandbox-evil'.
            normalized_name = posixpath.normpath(name) + '/'
            if (name.startswith(raw_prefix) and
                    normalized_name.startswith(normalized_prefix)):
                tarfile.extract(tinfo)
            else:
                logger.warning('Tarball member %r is outside of %r. Ignored.',
                               tinfo, sandbox)
示例2
def generator(self, data_dir, tmp_dir, datasets,
                eos_list=None, start_from=0, how_many=0):
    """Download, unpack and encode LibriSpeech archives, yielding examples.

    Args:
      data_dir: Not used by this variant; the encoders are requested with
        `self.feature_encoders(None)`.
      tmp_dir: Directory in which archives are downloaded and extracted.
      datasets: Iterable of `(url, subdir)` pairs. `subdir` is the folder
        below `<tmp_dir>/LibriSpeech` that is scanned after extraction.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading utterances, in sorted order, skipped in
        each dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id".
    """
    del eos_list
    i = 0
    # Prefix (with trailing separator) every extracted path must start with.
    extract_prefix = os.path.join(os.path.abspath(tmp_dir), "")
    for url, subdir in datasets:
      filename = os.path.basename(url)
      compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

      read_type = "r:gz" if filename.endswith("tgz") else "r"
      with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        #   tarfile.extractall errors when encountering an existing file
        #   and tarfile.extract is extremely slow
        members = []
        for f in corpus_tar:
          target = os.path.abspath(os.path.join(tmp_dir, f.name))
          # The archive is downloaded content: skip members whose name
          # (absolute path or "..") would land outside of tmp_dir.
          if not os.path.join(target, "").startswith(extract_prefix):
            continue
          if not os.path.isfile(os.path.join(tmp_dir, f.name)):
            members.append(f)
        corpus_tar.extractall(tmp_dir, members=members)

      # Kept in its own local so the `data_dir` argument is not clobbered.
      raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
      data_files = _collect_data(raw_data_dir, "flac", "txt")
      data_pairs = data_files.values()

      encoders = self.feature_encoders(None)
      audio_encoder = encoders["waveforms"]
      text_encoder = encoders["targets"]

      for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        # utt_id must hold exactly three "-"-separated fields; the first one
        # is reported as the speaker id.
        spk_id, unused_book_id, _ = utt_id.split("-")
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": [spk_id],
        }
示例3
def generator(self, data_dir, tmp_dir, datasets,
                eos_list=None, start_from=0, how_many=0):
    """Download, unpack and encode LibriSpeech archives, yielding examples.

    Args:
      data_dir: Passed to `self.feature_encoders` to obtain the encoders.
      tmp_dir: Directory in which archives are downloaded and extracted.
      datasets: Iterable of `(url, subdir)` pairs. `subdir` is the folder
        below `<tmp_dir>/LibriSpeech` that is scanned after extraction.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading utterances, in sorted order, skipped in
        each dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id".
    """
    del eos_list
    i = 0
    # Prefix (with trailing separator) every extracted path must start with.
    extract_prefix = os.path.join(os.path.abspath(tmp_dir), "")
    for url, subdir in datasets:
      filename = os.path.basename(url)
      compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

      read_type = "r:gz" if filename.endswith("tgz") else "r"
      with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        #   tarfile.extractall errors when encountering an existing file
        #   and tarfile.extract is extremely slow
        members = []
        for f in corpus_tar:
          target = os.path.abspath(os.path.join(tmp_dir, f.name))
          # The archive is downloaded content: skip members whose name
          # (absolute path or "..") would land outside of tmp_dir.
          if not os.path.join(target, "").startswith(extract_prefix):
            continue
          if not os.path.isfile(os.path.join(tmp_dir, f.name)):
            members.append(f)
        corpus_tar.extractall(tmp_dir, members=members)

      raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
      data_files = _collect_data(raw_data_dir, "flac", "txt")
      data_pairs = data_files.values()

      encoders = self.feature_encoders(data_dir)
      audio_encoder = encoders["waveforms"]
      text_encoder = encoders["targets"]

      for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        # utt_id must hold exactly three "-"-separated fields; the first one
        # is reported as the speaker id.
        spk_id, unused_book_id, _ = utt_id.split("-")
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": [spk_id],
        }
示例4
def generator(self, data_dir, tmp_dir, datasets,
                eos_list=None, start_from=0, how_many=0):
    """Download, unpack and encode LibriSpeech archives, yielding examples.

    Args:
      data_dir: Passed to `self.feature_encoders` to obtain the encoders.
      tmp_dir: Directory in which archives are downloaded and extracted.
      datasets: Iterable of `(url, subdir)` pairs. `subdir` is the folder
        below `<tmp_dir>/LibriSpeech` that is scanned after extraction.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading utterances, in sorted order, skipped in
        each dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id".
    """
    del eos_list
    i = 0
    # Prefix (with trailing separator) every extracted path must start with.
    extract_prefix = os.path.join(os.path.abspath(tmp_dir), "")
    for url, subdir in datasets:
      filename = os.path.basename(url)
      compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

      read_type = "r:gz" if filename.endswith("tgz") else "r"
      with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        #   tarfile.extractall errors when encountering an existing file
        #   and tarfile.extract is extremely slow
        members = []
        for f in corpus_tar:
          target = os.path.abspath(os.path.join(tmp_dir, f.name))
          # The archive is downloaded content: skip members whose name
          # (absolute path or "..") would land outside of tmp_dir.
          if not os.path.join(target, "").startswith(extract_prefix):
            continue
          if not os.path.isfile(os.path.join(tmp_dir, f.name)):
            members.append(f)
        corpus_tar.extractall(tmp_dir, members=members)

      raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
      data_files = _collect_data(raw_data_dir, "flac", "txt")
      data_pairs = data_files.values()

      encoders = self.feature_encoders(data_dir)
      audio_encoder = encoders["waveforms"]
      text_encoder = encoders["targets"]

      for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        # utt_id must hold exactly three "-"-separated fields; the first one
        # is reported as the speaker id.
        spk_id, unused_book_id, _ = utt_id.split("-")
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": [spk_id],
        }
示例5
def generator(self, data_dir, tmp_dir, datasets,
                eos_list=None, start_from=0, how_many=0):
    """Download, unpack and encode LibriSpeech archives, yielding examples.

    Args:
      data_dir: Passed to `self.feature_encoders` to obtain the encoders.
      tmp_dir: Directory in which archives are downloaded and extracted.
      datasets: Iterable of `(url, subdir)` pairs. `subdir` is the folder
        below `<tmp_dir>/LibriSpeech` that is scanned after extraction.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading utterances, in sorted order, skipped in
        each dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id".
    """
    del eos_list
    i = 0
    # Prefix (with trailing separator) every extracted path must start with.
    extract_prefix = os.path.join(os.path.abspath(tmp_dir), "")
    for url, subdir in datasets:
      filename = os.path.basename(url)
      compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

      read_type = "r:gz" if filename.endswith("tgz") else "r"
      with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        #   tarfile.extractall errors when encountering an existing file
        #   and tarfile.extract is extremely slow
        members = []
        for f in corpus_tar:
          target = os.path.abspath(os.path.join(tmp_dir, f.name))
          # The archive is downloaded content: skip members whose name
          # (absolute path or "..") would land outside of tmp_dir.
          if not os.path.join(target, "").startswith(extract_prefix):
            continue
          if not os.path.isfile(os.path.join(tmp_dir, f.name)):
            members.append(f)
        corpus_tar.extractall(tmp_dir, members=members)

      raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
      data_files = _collect_data(raw_data_dir, "flac", "txt")
      data_pairs = data_files.values()

      encoders = self.feature_encoders(data_dir)
      audio_encoder = encoders["waveforms"]
      text_encoder = encoders["targets"]

      for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        # utt_id must hold exactly three "-"-separated fields; the first one
        # is reported as the speaker id.
        spk_id, unused_book_id, _ = utt_id.split("-")
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": [spk_id],
        }
示例6
def generator(self, data_dir, tmp_dir, datasets,
                eos_list=None, start_from=0, how_many=0):
    """Download, unpack and encode LibriSpeech archives, yielding examples.

    Args:
      data_dir: Passed to `self.feature_encoders` to obtain the encoders.
      tmp_dir: Directory in which archives are downloaded and extracted.
      datasets: Iterable of `(url, subdir)` pairs. `subdir` is the folder
        below `<tmp_dir>/LibriSpeech` that is scanned after extraction.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading utterances, in sorted order, skipped in
        each dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id".
    """
    del eos_list
    i = 0
    # Prefix (with trailing separator) every extracted path must start with.
    extract_prefix = os.path.join(os.path.abspath(tmp_dir), "")
    for url, subdir in datasets:
      filename = os.path.basename(url)
      compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

      read_type = "r:gz" if filename.endswith("tgz") else "r"
      with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        #   tarfile.extractall errors when encountering an existing file
        #   and tarfile.extract is extremely slow
        members = []
        for f in corpus_tar:
          target = os.path.abspath(os.path.join(tmp_dir, f.name))
          # The archive is downloaded content: skip members whose name
          # (absolute path or "..") would land outside of tmp_dir.
          if not os.path.join(target, "").startswith(extract_prefix):
            continue
          if not os.path.isfile(os.path.join(tmp_dir, f.name)):
            members.append(f)
        corpus_tar.extractall(tmp_dir, members=members)

      raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
      data_files = _collect_data(raw_data_dir, "flac", "txt")
      data_pairs = data_files.values()

      encoders = self.feature_encoders(data_dir)
      audio_encoder = encoders["waveforms"]
      text_encoder = encoders["targets"]

      for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        # utt_id must hold exactly three "-"-separated fields; the first one
        # is reported as the speaker id.
        spk_id, unused_book_id, _ = utt_id.split("-")
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": [spk_id],
        }
示例7
def cleanup_odoo_dir(self):
        """Remove the egg-info directories created inside ``self.odoo_dir``.

        This reverts local modifications made during installation, which can
        be, e.g., forbidden by the freeze process.
        """
        # From here we can't guess whether the project is named 'openerp' or
        # 'odoo', so both candidates are considered.
        # Nothing guarantees that this method is called after develop().
        # It is in practice now, but one day, the extraction as a separate
        # script of freeze/extract will become a reality.
        candidates = [join(self.odoo_dir, '%s.egg-info' % project)
                      for project in ('openerp', 'odoo')]
        for candidate in candidates:
            if not os.path.exists(candidate):
                continue
            shutil.rmtree(candidate)
示例8
def generator(self,
                data_dir,
                tmp_dir,
                datasets,
                eos_list=None,
                start_from=0,
                how_many=0):
    """Download, unpack and encode the Common Voice corpus, yielding examples.

    Args:
      data_dir: Not used by this variant; the encoders are requested with
        `self.feature_encoders(None)`.
      tmp_dir: Directory in which the archive is downloaded and extracted.
      datasets: Iterable of prefixes; for each one, the tuples whose first
        field starts with that prefix are processed.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading tuples, in sorted order, skipped for each
        dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id" (always ["unknown"]).
    """
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      # Create a subset of files that don't already exist.
      #   tarfile.extractall errors when encountering an existing file
      #   and tarfile.extract is extremely slow. For security, check that all
      #   paths are relative.
      members = [
          f for f in corpus_tar if _is_relative(tmp_dir, f.name) and
          not _file_exists(tmp_dir, f.name)
      ]
      corpus_tar.extractall(tmp_dir, members=members)

    # Kept in its own local so the `data_dir` argument is not clobbered.
    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    # Materialized once so that every dataset filters the full collection.
    all_tuples = list(_collect_data(raw_data_dir))
    encoders = self.feature_encoders(None)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]
    for dataset in datasets:
      # A fresh list per dataset: rebinding the shared collection to a
      # generator here would leave it exhausted after the first dataset, and
      # later datasets would silently yield nothing.
      dataset_tuples = [
          tup for tup in all_tuples if tup[0].startswith(dataset)
      ]
      for utt_id, media_file, text_data in tqdm.tqdm(
          sorted(dataset_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
示例9
def generator(self,
                data_dir,
                tmp_dir,
                datasets,
                eos_list=None,
                start_from=0,
                how_many=0):
    """Download, unpack and encode the Common Voice corpus, yielding examples.

    Args:
      data_dir: Passed to `self.feature_encoders` to obtain the encoders.
      tmp_dir: Directory in which the archive is downloaded and extracted.
      datasets: Iterable of prefixes; for each one, the tuples whose first
        field starts with that prefix are processed.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading tuples, in sorted order, skipped for each
        dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id" (always ["unknown"]).
    """
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      # Create a subset of files that don't already exist.
      #   tarfile.extractall errors when encountering an existing file
      #   and tarfile.extract is extremely slow. For security, check that all
      #   paths are relative.
      members = [
          f for f in corpus_tar if _is_relative(tmp_dir, f.name) and
          not _file_exists(tmp_dir, f.name)
      ]
      corpus_tar.extractall(tmp_dir, members=members)

    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    # Materialized once so that every dataset filters the full collection.
    all_tuples = list(_collect_data(raw_data_dir))
    encoders = self.feature_encoders(data_dir)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]
    for dataset in datasets:
      # A fresh list per dataset: rebinding the shared collection to a
      # generator here would leave it exhausted after the first dataset, and
      # later datasets would silently yield nothing.
      dataset_tuples = [
          tup for tup in all_tuples if tup[0].startswith(dataset)
      ]
      for utt_id, media_file, text_data in tqdm.tqdm(
          sorted(dataset_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
示例10
def generator(self,
                data_dir,
                tmp_dir,
                datasets,
                eos_list=None,
                start_from=0,
                how_many=0):
    """Download, unpack and encode the Common Voice corpus, yielding examples.

    Args:
      data_dir: Passed to `self.feature_encoders` to obtain the encoders.
      tmp_dir: Directory in which the archive is downloaded and extracted.
      datasets: Iterable of prefixes; for each one, the tuples whose first
        field starts with that prefix are processed.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading tuples, in sorted order, skipped for each
        dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id" (always ["unknown"]).
    """
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      # Create a subset of files that don't already exist.
      #   tarfile.extractall errors when encountering an existing file
      #   and tarfile.extract is extremely slow. For security, check that all
      #   paths are relative.
      members = [
          f for f in corpus_tar if _is_relative(tmp_dir, f.name) and
          not _file_exists(tmp_dir, f.name)
      ]
      corpus_tar.extractall(tmp_dir, members=members)

    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    # Materialized once so that every dataset filters the full collection.
    all_tuples = list(_collect_data(raw_data_dir))
    encoders = self.feature_encoders(data_dir)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]
    for dataset in datasets:
      # A fresh list per dataset: rebinding the shared collection to a
      # generator here would leave it exhausted after the first dataset, and
      # later datasets would silently yield nothing.
      dataset_tuples = [
          tup for tup in all_tuples if tup[0].startswith(dataset)
      ]
      for utt_id, media_file, text_data in tqdm.tqdm(
          sorted(dataset_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
示例11
def generator(self,
                data_dir,
                tmp_dir,
                datasets,
                eos_list=None,
                start_from=0,
                how_many=0):
    """Download, unpack and encode the Common Voice corpus, yielding examples.

    Args:
      data_dir: Passed to `self.feature_encoders` to obtain the encoders.
      tmp_dir: Directory in which the archive is downloaded and extracted.
      datasets: Iterable of prefixes; for each one, the tuples whose first
        field starts with that prefix are processed.
      eos_list: Ignored (deleted immediately).
      start_from: Number of leading tuples, in sorted order, skipped for each
        dataset.
      how_many: When positive, generation stops once this many examples have
        been yielded in total.

    Yields:
      Dicts with the keys "waveforms", "waveform_lens", "targets",
      "raw_transcript", "utt_id" and "spk_id" (always ["unknown"]).
    """
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      # Create a subset of files that don't already exist.
      #   tarfile.extractall errors when encountering an existing file
      #   and tarfile.extract is extremely slow. For security, check that all
      #   paths are relative.
      members = [
          f for f in corpus_tar if _is_relative(tmp_dir, f.name) and
          not _file_exists(tmp_dir, f.name)
      ]
      corpus_tar.extractall(tmp_dir, members=members)

    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    # Materialized once so that every dataset filters the full collection.
    all_tuples = list(_collect_data(raw_data_dir))
    encoders = self.feature_encoders(data_dir)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]
    for dataset in datasets:
      # A fresh list per dataset: rebinding the shared collection to a
      # generator here would leave it exhausted after the first dataset, and
      # later datasets would silently yield nothing.
      dataset_tuples = [
          tup for tup in all_tuples if tup[0].startswith(dataset)
      ]
      for utt_id, media_file, text_data in tqdm.tqdm(
          sorted(dataset_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
示例12
def read_odoo_setup(self):
        """Ugly method to extract requirements & version from ugly setup.py.

        Primarily designed for 6.0, but works with 6.1 as well.

        The ``setup()`` functions of setuptools and distutils are temporarily
        replaced by a recorder that stores ``install_requires`` in
        ``self.requirements`` and ``version`` in ``self.version_detected``;
        ``setup.py`` from ``self.odoo_dir`` is then loaded as a module.

        The global state touched on the way (``setuptools.setup``,
        ``distutils.core.setup``, ``sys.argv`` and ``sys.path``) is restored
        in all cases, including when an exception is raised.

        :raises EnvironmentError: if loading ``setup.py`` fails, except for
                                  an ImportError that does not mention babel,
                                  which is re-raised as is.
        """
        old_setup = setuptools.setup
        old_distutils_setup = distutils.core.setup  # 5.0 directly imports this

        def new_setup(*args, **kw):
            self.requirements.extend(kw.get('install_requires', ()))
            self.version_detected = kw['version']

        saved_argv = sys.argv
        setuptools.setup = new_setup
        distutils.core.setup = new_setup
        sys.path.insert(0, '.')
        try:
            with open(join(self.odoo_dir, 'setup.py'), 'rb') as f:
                sys.argv = ['setup.py', 'develop']
                try:
                    imp.load_module('setup', f, 'setup.py',
                                    ('.py', 'r', imp.PY_SOURCE))
                except SystemExit as exception:
                    if 'dsextras' in unicode(exception):
                        raise EnvironmentError(
                            'Please first install PyGObject and PyGTK !')
                    else:
                        # setup.py bailed out: fall back on release.py
                        try:
                            self.read_release()
                        except Exception as exc:
                            raise EnvironmentError(
                                'Problem while reading Odoo release.py: %s'
                                % exc)
                except ImportError as exception:
                    if 'babel' in unicode(exception):
                        raise EnvironmentError(
                            'OpenERP setup.py has an unwanted import Babel.\n'
                            '=> First install Babel on your system or '
                            'virtualenv :(\n'
                            '(sudo aptitude install python-babel, '
                            'or pip install babel)')
                    else:
                        # bare raise keeps the original traceback
                        raise
                except Exception as exception:
                    raise EnvironmentError('Problem while reading Odoo '
                                           'setup.py: %s' % exception)
        finally:
            # Undo the monkey-patching even on error, so that a failure
            # does not leave the interpreter with a fake setup() function.
            sys.argv = saved_argv
            sys.path.pop(0)
            setuptools.setup = old_setup
            distutils.core.setup = old_distutils_setup
        self.apply_version_dependent_decisions()