Python source code examples: joblib.delayed()
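joblib.delayed() wraps a function call so that the function and its arguments are captured as a (function, args, kwargs) triple instead of being executed on the spot; joblib.Parallel then consumes a generator of such triples and dispatches the calls across worker processes or threads. The 30 examples below come from real open-source projects. As a minimal self-contained sketch of the pattern (the square function here is purely illustrative):

from joblib import Parallel, delayed

def square(x):
    # trivial stand-in for real per-item work
    return x * x

# delayed(square)(i) only records the call; Parallel executes the batch
results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(10))
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]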

Example 1
def get_graph_stats(graph_obj_handle, prop='degrees'):
    # if prop == 'degrees':
    num_cores = multiprocessing.cpu_count()
    inputs = [int(i*len(graph_obj_handle)/num_cores) for i in range(num_cores)] + [len(graph_obj_handle)]
    res = Parallel(n_jobs=num_cores)(delayed(get_values)(graph_obj_handle, inputs[i], inputs[i+1], prop) for i in range(num_cores))

    stat_dict = {}

    if 'degrees' in prop:
        stat_dict['degrees'] = list(set([d for core_res in res for file_res in core_res for d in file_res['degrees']]))
    if 'edge_labels' in prop:
        stat_dict['edge_labels'] = list(set([d for core_res in res for file_res in core_res for d in file_res['edge_labels']]))
    if 'target_mean' in prop or 'target_std' in prop:
        param = np.array([file_res['params'] for core_res in res for file_res in core_res])
    if 'target_mean' in prop:
        stat_dict['target_mean'] = np.mean(param, axis=0)
    if 'target_std' in prop:
        stat_dict['target_std'] = np.std(param, axis=0)

    return stat_dict 
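
The index arithmetic in Example 1 is a common manual-chunking idiom: split the range 0..len(data) into cpu_count() contiguous slices and hand one slice to each worker. A stripped-down sketch of the idiom, assuming a hypothetical process_slice worker:

import multiprocessing
from joblib import Parallel, delayed

def process_slice(data, start, end):
    # hypothetical worker: handle one contiguous slice
    return [x * 2 for x in data[start:end]]

data = list(range(100))
n = multiprocessing.cpu_count()
bounds = [int(i * len(data) / n) for i in range(n)] + [len(data)]
chunks = Parallel(n_jobs=n)(
    delayed(process_slice)(data, bounds[i], bounds[i + 1]) for i in range(n))
flat = [x for chunk in chunks for x in chunk]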
Example 2
def __init__(self, path, split, tokenizer, bucket_size, ascending=False):
        # Setup
        self.path = path
        self.bucket_size = bucket_size

        # List all wave files
        file_list = []
        for s in split:
            split_list = list(Path(join(path, s)).rglob("*.flac"))
            assert len(split_list) > 0, "No data found @ {}".format(join(path,s))
            file_list += split_list
        # Read text
        text = Parallel(n_jobs=READ_FILE_THREADS)(
            delayed(read_text)(str(f)) for f in file_list)
        #text = Parallel(n_jobs=-1)(delayed(tokenizer.encode)(txt) for txt in text)
        text = [tokenizer.encode(txt) for txt in text]

        # Sort dataset by text length
        #file_len = Parallel(n_jobs=READ_FILE_THREADS)(delayed(getsize)(f) for f in file_list)
        self.file_list, self.text = zip(*[(f_name, txt)
                                          for f_name, txt in sorted(zip(file_list, text), reverse=not ascending, key=lambda x:len(x[1]))]) 
Example 3
def partial_fit(self, X, y, classes=None):
        if self.partial_method == "gamma":
            w_all = -np.log(self
                            .random_state
                            .random(size=(X.shape[0], self.nsamples))
                            .clip(min=1e-12, max=None))
            appear_times = None
            rng = None
        elif self.partial_method == "poisson":
            w_all = None
            appear_times = self.random_state.poisson(1, size = (X.shape[0], self.nsamples))
            rng = np.arange(X.shape[0])
        else:
            raise ValueError(_unexpected_err_msg)
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._partial_fit_single)\
                    (sample, w_all, appear_times, rng, X, y) \
                        for sample in range(self.nsamples)) 
Example 4
def next_minibatch(self):

        image_filenames_minibatch = self.image_filenames[self.current_index: self.current_index + self.minibatch_size]
        label_filenames_minibatch = self.label_filenames[self.current_index: self.current_index + self.minibatch_size]
        self.current_index += self.minibatch_size
        if self.current_index >= self.dataset_size:
            self.current_index = 0

        # Multithread image processing
        # Reference: https://www.kaggle.com/inoryy/fast-image-pre-process-in-parallel

        results = Parallel(n_jobs=self.num_jobs)(
            delayed(self.process_func)(image_filename, label_filename)
            for image_filename, label_filename in zip(image_filenames_minibatch, label_filenames_minibatch))
        images, labels = zip(*results)

        images = np.asarray(images)
        labels = np.asarray(labels)

        return images, labels 
Example 5
def main_kinetics400(v_root, f_root, dim=150):
    print('extracting Kinetics400 ... ')
    for basename in ['train_split', 'val_split']:
        v_root_real = v_root + '/' + basename
        if not os.path.exists(v_root_real):
            print('Wrong v_root'); sys.exit()
        # NOTE: the f_root argument is unused here; frames are written under a hard-coded scratch path
        f_root_real = '/scratch/local/ssd/htd/kinetics400/frame_full' + '/' + basename 
        print('Extract to: \nframe: %s' % f_root_real)
        if not os.path.exists(f_root_real): os.makedirs(f_root_real)
        v_act_root = glob.glob(os.path.join(v_root_real, '*/'))
        v_act_root = sorted(v_act_root)

        # if resume, remember to delete the last video folder
        for i, j in tqdm(enumerate(v_act_root), total=len(v_act_root)):
            v_paths = glob.glob(os.path.join(j, '*.mp4'))
            v_paths = sorted(v_paths)
            # for resume:
            v_class = j.split('/')[-2]
            out_dir = os.path.join(f_root_real, v_class)
            if os.path.exists(out_dir): print(out_dir, 'exists!'); continue
            print('extracting: %s' % v_class)
            # dim = 150 (crop to 128 later) or 256 (crop to 224 later)
            Parallel(n_jobs=32)(delayed(extract_video_opencv)(p, f_root_real, dim=dim) for p in tqdm(v_paths, total=len(v_paths))) 
Example 6
def fit(self,X):
        def func(ss):
            length = len(ss.unique())
            if length <= 1:
                return True
            else:
                return False
            
        df = X.data
        todo_cols = X.cat_cols + X.multi_cat_cols + X.num_cols + X.time_cols + X.binary_cols
        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
        
        drop_cols = []
        for col,unique in zip(todo_cols,res):
            if unique:
                drop_cols.append(col)
        
        self.drop_cols = drop_cols 
Example 7
def fit(self,X):
        def func(ss):
            length = len(ss.unique())
            if length >= len(ss)-10:
                return True
            else:  
                return False
        
        df = X.data
        todo_cols = X.cat_cols
        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
        
        drop_cols = []
        for col,all_diff in zip(todo_cols,res):
            if all_diff:
                drop_cols.append(col)
        
        self.drop_cols = drop_cols 
Example 8
def recognize_binary_col(self,data,cat_cols):
        def func(ss):
            ss = ss.unique()
            if len(ss) == 3:
                if pd.isna(ss).sum() == 1:
                    return True
            if len(ss) == 2:
                return True
            return False
        
        binary_cols = []
        
        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[col]) for col in cat_cols)
        
        for col,is_binary in zip(cat_cols,res):
            if is_binary:
                binary_cols.append(col)
        
        return binary_cols 
Example 9
def prefer_parallel_execution(functions_to_be_called):  # pragma: no cover
    try:
        import joblib
        import multiprocessing
    except ImportError:
        print('Joblib not installed, switching to serial execution')
        for fn in functions_to_be_called:
            run_function(fn)
    else:
        try:
            import tqdm
        except ImportError:
            inputs = functions_to_be_called
        else:
            inputs = tqdm.tqdm(functions_to_be_called)
        n_jobs = multiprocessing.cpu_count()
        print('Parallelizing execution using Joblib')
        joblib.Parallel(n_jobs=n_jobs)(
                joblib.delayed(run_function)(fn) for fn in inputs) 
Example 10
def parallelize(bucket, only, _except, fn, args=(), versions=False):
    bucket = s3().Bucket(bucket)

    # use prefix for performance
    prefix = None
    if only:
        # get the first prefix before wildcard
        prefix = '/'.join(only.split('*')[0].split('/')[:-1])
        if prefix:
            prefix = prefix + '/'

    if versions:
        object_versions = bucket.object_versions.filter(Prefix=prefix) if prefix else bucket.object_versions.all()
        # delete markers have no size
        return Parallel(n_jobs=24)(delayed(fn)(bucket.name, ov.object_key, ov.id, *args) for ov in object_versions if object_matches(ov.object_key, only, _except) and not ov.is_latest and ov.size is not None)
    else:
        objects = bucket.objects.filter(Prefix=prefix) if prefix else bucket.objects.all()

        if only and '*' not in only:
            objects = [s3().Object(bucket.name, only)]

        return Parallel(n_jobs=24)(delayed(fn)(bucket.name, os.key, *args) for os in objects if object_matches(os.key, only, _except)) 
Example 11
def recompute_factors_batched(Y, S, lambda_reg, W=None, X=None,
                              dtype='float32', batch_size=10000, n_jobs=4):
    m = S.shape[0]  # m = number of users
    f = Y.shape[1]  # f = number of factors

    YTY = np.dot(Y.T, Y)  # precompute this
    YTYpR = YTY + lambda_reg * np.eye(f)
    if W is not None:
        WX = lambda_reg * (X.dot(W)).T
    else:
        WX = None

    num_batches = int(np.ceil(m / float(batch_size)))

    res = Parallel(n_jobs=n_jobs)(delayed(solve_batch)(b, S, Y, WX, YTYpR,
                                                       batch_size, m, f, dtype)
                                  for b in range(num_batches))
    X_new = np.concatenate(res, axis=0)

    return X_new 
Example 12
def convert_video_wapper(src_videos, 
                         dst_videos, 
                         cmd_format,
                         in_parallel=True):
    commands = []
    for src, dst in zip(src_videos, dst_videos):
        cmd = cmd_format.format(src, dst)
        commands.append(cmd)

    logging.info("- {} commands to execute".format(len(commands)))

    if not in_parallel:
        for i, cmd in enumerate(commands):
            # if i % 100 == 0:
            #     logging.info("{} / {}: '{}'".format(i, len(commands), cmd))
            exe_cmd(cmd=cmd)
    else:
        num_jobs = 24
        logging.info("processing videos in parallel, num_jobs={}".format(num_jobs))
        Parallel(n_jobs=num_jobs)(delayed(exe_cmd)(cmd) for cmd in commands) 
Example 13
def fitEnsemble(self, normMean, samples, factor):
        minWindowLength = 5
        maxWindowLength = getMax(samples, self.MAX_WINDOW_LENGTH)
        windows = self.getWindowsBetween(minWindowLength, maxWindowLength)
        self.logger.Log("Windows: %s" % windows)

        correctTraining = 0
        self.results = []

        self.logger.Log("%s  Fitting for a norm of %s" % (self.NAME, str(normMean)))
        # note: the check_pickle argument to delayed() was deprecated and later removed in newer joblib releases
        Parallel(n_jobs=1, backend="threading")(delayed(self.fitIndividual, check_pickle=False)(normMean, samples, windows, i) for i in range(len(windows)))

        # Find best correctTraining
        for i in range(len(self.results)):
            if self.results[i].correct > correctTraining:
                correctTraining = self.results[i].correct

        # Remove Results that are no longer satisfactory
        new_results = []
        for i in range(len(self.results)):
            if self.results[i].correct >= (correctTraining * factor):
                new_results.append(self.results[i])

        return new_results, correctTraining 
Example 14
def fitEnsemble(self, windows, normMean, samples):
        correctTraining = 0
        self.results = []

        self.logger.Log("%s  Fitting for a norm of %s" % (self.NAME, str(normMean)))
        Parallel(n_jobs=1, backend="threading")(delayed(self.fitIndividual, check_pickle=False)(normMean, samples, windows, i) for i in range(len(windows)))

        # Find best correctTraining
        for i in range(len(self.results)):
            if self.results[i].score > correctTraining:
                correctTraining = self.results[i].score

        # Remove Results that are no longer satisfactory
        new_results = []
        self.logger.Log("Stored Models for Norm=%s" % normMean)
        for i in range(len(self.results)):
            if self.results[i].score >= (correctTraining * self.factor):
                self.logger.Log("WindowLength:%s  Features:%s  TrainScore:%s" % (self.results[i].windowLength, self.results[i].features, self.results[i].score))
                new_results.append(self.results[i])

        return new_results 
Example 15
def fitEnsemble(self, normMean, samples, factor):
        minWindowLength = 5
        maxWindowLength = getMax(samples, self.MAX_WINDOW_LENGTH)
        windows = self.getWindowsBetween(minWindowLength, maxWindowLength)
        self.logger.Log("Windows: %s" % windows)

        correctTraining = 0
        self.results = []

        self.logger.Log("%s  Fitting for a norm of %s" % (self.NAME, str(normMean)))
        Parallel(n_jobs=-1, backend="threading")(delayed(self.fitIndividual, check_pickle=False)(normMean, samples, windows, i) for i in range(len(windows)))

        # Find best correctTraining
        for i in range(len(self.results)):
            if self.results[i].correct > correctTraining:
                correctTraining = self.results[i].correct

        # Remove Results that are no longer satisfactory
        new_results = []
        for i in range(len(self.results)):
            if self.results[i].correct >= (correctTraining * factor):
                new_results.append(self.results[i])

        return new_results, correctTraining 
Example 16
def fitEnsemble(self, NormMean, samples):
        correctTraining = 0
        self.results = []
        self.logger.Log("%s  Fitting for a norm of %s" % (self.NAME, str(NormMean)))

        Parallel(n_jobs=1, backend="threading")(delayed(self.fitIndividual, check_pickle=False)(NormMean, samples, i) for i in range(len(self.windows)))

        #Find best correctTraining
        for i in range(len(self.results)):
            if self.results[i].score > correctTraining:
                correctTraining = self.results[i].score

        self.logger.Log("CorrectTrain for a norm of %s" % (correctTraining))
        # Remove Results that are no longer satisfactory
        new_results = []
        self.logger.Log("Stored Models for Norm=%s" % NormMean)
        for i in range(len(self.results)):
            if self.results[i].score >= (correctTraining * self.factor):
                self.logger.Log("WindowLength:%s  Features:%s  TrainScore:%s" % (self.results[i].windowLength, self.results[i].features, self.results[i].score))
                new_results.append(self.results[i])

        return new_results, correctTraining 
Example 17
def build_strain_specific_models(self, joblib=False, cores=1, force_rerun=False):
        """Wrapper function for _build_strain_specific_model"""
        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix, please calculate first!')
        ref_functional_genes = [g.id for g in self.reference_gempro.functional_genes]
        log.info('Building strain specific models...')
        if joblib:
            result = DictList(Parallel(n_jobs=cores)(
                delayed(self._build_strain_specific_model)(s, ref_functional_genes, self.df_orthology_matrix, force_rerun=force_rerun)
                for s in self.strain_ids))
        # if sc:
        #     strains_rdd = sc.parallelize(self.strain_ids)
        #     result = strains_rdd.map(self._build_strain_specific_model).collect()
        else:
            result = []
            for s in tqdm(self.strain_ids):
                result.append(self._build_strain_specific_model(s, ref_functional_genes, self.df_orthology_matrix, force_rerun=force_rerun))

        for strain_id, gp_noseqs_path in result:
            self.strain_infodict[strain_id]['gp_noseqs_path'] = gp_noseqs_path 
Example 18
def transform(self, waveforms):
        #~ print('ici', waveforms.shape, self.ind_peak)
        features = waveforms[:, self.ind_peak, : ].copy()
        return features



#~ Parallel(n_jobs=n_jobs)(delayed(count_match_spikes)(sorting1.get_unit_spike_train(u1),
#~                                                     s2_spiketrains, delta_frames)
#~                         for i1, u1 in enumerate(unit1_ids))

#~ def get_pca_one_channel(wf_chan, chan, thresh, n_left, n_components_by_channel, params):
    #~ print(chan)
    #~ pca = sklearn.decomposition.IncrementalPCA(n_components=n_components_by_channel, **params)
    #~ wf_chan = waveforms[:,:,chan]
    #~ print(wf_chan.shape)
    #~ print(wf_chan[:, -n_left].shape)
    #~ keep = np.any((wf_chan>thresh) | (wf_chan<-thresh))
    #~ keep = (wf_chan[:, -n_left]>thresh) | (wf_chan[:, -n_left]<-thresh)

    #~ if keep.sum() >=n_components_by_channel:
        #~ pca.fit(wf_chan[keep, :])
        #~ return pca
    #~ else:
        #~ return None 
Example 19
def write_data_csv(fname, frames, preproc):
    """Write data to csv file"""
    fdata = open(fname, "w")
    dr = Parallel()(delayed(get_data)(lst, preproc) for lst in frames)
    data, result = zip(*dr)
    for entry in data:
        fdata.write(','.join(entry) + '\r\n')
    print("All finished, %d slices in total" % len(data))
    fdata.close()
    result = np.ravel(result)
    return result 
Example 20
def __init__(self, path, split, tokenizer, bucket_size):
        # Setup
        self.path = path
        self.bucket_size = bucket_size
        self.encode_on_fly = False
        read_txt_src = []

        # List all wave files
        file_list, all_sent = [], []

        for s in split:
            if s in OFFICIAL_TXT_SRC:
                self.encode_on_fly = True
                with open(join(path, s), 'r') as f:
                    all_sent += f.readlines()
            file_list += list(Path(join(path, s)).rglob("*.flac"))
        assert (len(file_list) > 0) or (len(all_sent)
                                        > 0), "No data found @ {}".format(path)

        # Read text
        text = Parallel(n_jobs=READ_FILE_THREADS)(
            delayed(read_text)(str(f)) for f in file_list)
        all_sent.extend(text)
        del text

        # Encode text
        if self.encode_on_fly:
            self.tokenizer = tokenizer
            self.text = all_sent
        else:
            self.text = [tokenizer.encode(txt) for txt in tqdm(all_sent)]
        del all_sent

        # Read file size and sort dataset by file size (Note: feature len. may be different)
        self.text = sorted(self.text, reverse=True, key=lambda x: len(x))
        if self.encode_on_fly:
            del self.text[:REMOVE_TOP_N_TXT] 
Example 21
def fit(self, X, y):
        ix_take_all = self.random_state.integers(X.shape[0], size = (X.shape[0], self.nsamples))
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._fit_single)(sample, ix_take_all, X, y) \
                    for sample in range(self.nsamples)) 
Example 22
def _pred_by_sample(self, X):
        pred = np.empty((X.shape[0], self.nsamples))
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._assign_score)(sample, pred, X) \
                    for sample in range(self.nsamples))
        return pred 
Example 23
def partial_fit(self, X, a, r):
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._partial_fit_single)(choice, X, a, r) \
                    for choice in range(self.n)) 
Example 24
def decision_function(self, X):
        preds = np.zeros((X.shape[0], self.n))
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._decision_function_single)(choice, X, preds, 1) \
                    for choice in range(self.n))
        _apply_smoothing(preds, self.smooth, self.counters,
                         self.noise_to_smooth, self.random_state)
        return preds 
Example 25
def predict_proba_raw(self,X):
        preds = np.zeros((X.shape[0], self.n))
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._decision_function_single)(choice, X, preds, 0) \
                    for choice in range(self.n))
        _apply_smoothing(preds, self.smooth, self.counters,
                         self.noise_to_smooth, self.random_state)
        return preds 
Example 26
def exploit(self, X):
        ### only usable within some policies
        pred = np.empty((X.shape[0], self.n))
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._exploit_single)(choice, pred, X) \
                    for choice in range(self.n))
        return pred 
Example 27
def fit(self, X, a, r, p):
        """
        Fits the Offset Tree estimator to partially-labeled data collected from a different policy.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            Matrix of covariates for the available data.
        a : array (n_samples), int type
            Arms or actions that were chosen for each observations.
        r : array (n_samples), {0,1}
            Rewards that were observed for the chosen actions. Must be binary rewards 0/1.
        p : array (n_samples)
            Reward estimates for the actions that were chosen by the policy.
        """
        X, a, r = _check_fit_input(X, a, r)
        p = _check_1d_inp(p)
        assert p.shape[0] == X.shape[0]
        
        if self.c is not None:
            p = self.c * p
        if self.pmin is not None:
            p = np.clip(p, a_min = self.pmin, a_max = None)
        
        self._oracles = [deepcopy(self.base_algorithm) for c in range(self.nchoices - 1)]
        rs = self.random_state.integers(np.iinfo(np.int32).max, size=self.nchoices)
        Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")\
                (delayed(self._fit)(classif, X, a, r, p, rs) \
                    for classif in range(len(self._oracles)))
        self.is_fitted = True 
Example 28
def parallel_cacher(cache, path_asset_id, n_jobs=-1):
    from joblib import Parallel, delayed
    Parallel(n_jobs=n_jobs, verbose=5)(delayed(cache)(path, asset_id)
                                       for path, asset_id in path_asset_id) 
Example 29
def process_files(files):
    Parallel(n_jobs=config['num_process'])(delayed(do_process)(id, audio_file, spectro_file)
                                           for id, audio_file, spectro_file in files)
Example 30
def get_top_300(model_id, seed_index, song_index, th, with_wl):
    fw = open('rec/rec_%s_top300_th_std.tsv' % model_id, 'w')
    Parallel(n_jobs=20)(delayed(do_process_rank)(model_id, seed_index, song_index, th, i, with_wl)
                        for i in range(len(seed_index)))
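
Examples 3 and 21 through 27 all pass require="sharedmem" to Parallel. This forces the thread-based backend so that workers can write results in place into a preallocated NumPy array instead of returning them. A minimal sketch of that in-place pattern (fill_row and out are illustrative names, not taken from the examples above):

import numpy as np
from joblib import Parallel, delayed

def fill_row(i, out, X):
    # in-place write; safe only because sharedmem forces threads, not processes
    out[i] = X[i] * 2

X = np.arange(12, dtype=float).reshape(4, 3)
out = np.empty_like(X)
Parallel(n_jobs=2, require="sharedmem")(
    delayed(fill_row)(i, out, X) for i in range(X.shape[0]))
print(out)  # each row of X doubled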