Python source code examples: sklearn.pipeline.Pipeline()

Example 1
def run_logreg(X_train, y_train, X_test, y_test, selection_threshold=0.2):
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))
    pipe = Pipeline(
        [
            (
                "feature_selection",
                RandomizedLogisticRegression(selection_threshold=selection_threshold),
            ),
            ("classification", LogisticRegression()),
        ]
    )
    pipe.fit(X_train, y_train)
    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
    return pipe 
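Note that RandomizedLogisticRegression was deprecated in scikit-learn 0.19 and removed in 0.21, so the example above only runs on old versions. Below is a minimal sketch of the same select-then-classify pattern on current scikit-learn, using SelectFromModel instead; the synthetic data is only for illustration.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe = Pipeline([
    # L1-penalised logistic regression zeroes out weak features;
    # SelectFromModel keeps only the non-zero ones.
    ("feature_selection", SelectFromModel(
        LogisticRegression(penalty="l1", solver="liblinear"))),
    ("classification", LogisticRegression()),
])
pipe.fit(X_train, y_train)
print("testing accuracy : {}".format(pipe.score(X_test, y_test)))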
Example 2
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)]) 
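A quick usage sketch, with toy sentiment data assumed and the imports this function relies on:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = create_logistic_vectorizer()
pipe.fit(["good movie", "great film", "bad movie", "awful film"], [1, 1, 0, 0])
print(pipe.predict(["good film"]))  # expected to predict the positive class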
Example 3
def pca(self, **kwargs):
        if 'n_components' in kwargs:
            nComp = kwargs['n_components']
        else:
            nComp = 0.995

        if 'dates' in kwargs:
            mat = self.to_matrix(kwargs['dates'])
        else:
            mat = self.to_matrix()
        scaler = StandardScaler()
        pca = PCA(n_components=nComp)
        self._pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
        self._pipeline.fit(mat)
        
        if 'file' in kwargs:
            tofile(kwargs['file'], self._pipeline)
        
        return self._pipeline 
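When n_components is a float between 0 and 1, PCA keeps however many components are needed to explain that fraction of the variance (99.5% by default here). A standalone sketch of the scaler-plus-PCA pipeline, with random data assumed:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

mat = np.random.RandomState(0).normal(size=(100, 10))
pipeline = Pipeline([("scaler", StandardScaler()), ("pca", PCA(n_components=0.995))])
pipeline.fit(mat)
# n_components_ is the number of components actually kept after fitting
print(pipeline.named_steps["pca"].n_components_)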
Example 4
def run(self):
        '''
        Runs the model with the stored parameters (self.params).
        '''
        self.clf.set_params(**self.params)
        # f = get_feature_transformer(self.parser)
        # self.X_train_fts = f.fit_transform(self.X_train)
        # self.X_test_fts = f.transform(self.X_test)
        self.pipeline = Pipeline([
            # ('feature_gen', f),
            ('clf', self.clf),
        ])
        self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]
        if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
            self.importances = self.clf.feature_importances_
        elif self.model_type in ['SVM', 'LR', 'SGD']:
            self.importances = self.clf.coef_[0] 
Example 5
def _iwp_model(self, processes, cv_folds):
        """Return the default model for the IWP regressor
        """
        # Estimators are normally objects that have a fit and predict method
        # (e.g. MLPRegressor from sklearn). To make their training easier we
        # scale the input data in advance. With Pipeline objects from sklearn
        # we can combine such steps easily since they behave like an
        # estimator object as well.
        estimator = Pipeline([
            # SVM or NN work better if we have scaled the data in the first
            # place. MinMaxScaler is the simplest one. RobustScaler or
            # StandardScaler could be an alternative.
            ("scaler", RobustScaler(quantile_range=(15, 85))),
            # The "real" estimator:
            ("estimator", MLPRegressor(max_iter=6000, early_stopping=True)),
        ])

        # To optimize the results, we try different hyper parameters by
        # using a grid search
        hidden_layer_sizes = [
            (15, 10, 3),
            #(50, 20),
        ]
        hyper_parameter = [
            {   # Hyper parameter for lbfgs solver
                'estimator__solver': ['lbfgs'],
                'estimator__activation': ['tanh'],
                'estimator__hidden_layer_sizes': hidden_layer_sizes,
                'estimator__random_state': [0, 42, 100, 3452],
                'estimator__alpha': [0.1, 0.001, 0.0001],
            },
        ]

        return GridSearchCV(
            estimator, hyper_parameter, refit=True,
            n_jobs=processes, cv=cv_folds, verbose=self.verbose,
        ) 
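The estimator__ prefixes above follow scikit-learn's <step>__<parameter> convention for addressing parameters nested inside a Pipeline. A minimal self-contained illustration of the same convention with a simpler model (toy data assumed):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([("scaler", StandardScaler()), ("estimator", Ridge())])
# "estimator__alpha" reaches the alpha parameter of the Ridge step
search = GridSearchCV(pipe, {"estimator__alpha": [0.1, 1.0, 10.0]}, cv=3)

rng = np.random.RandomState(0)
search.fit(rng.normal(size=(30, 4)), rng.normal(size=30))
print(search.best_params_)  # e.g. {'estimator__alpha': 10.0}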
Example 6
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True, imputeStrategy='MeanMode', tunedRandomForest=False, numeric_columns_as_categorical=None):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and filters to clean and prepare the data.
    
    Note advanced users may wish to use their own custom pipeline.
    """

    # Note: this could be done more elegantly using FeatureUnions _if_ you are not using pandas dataframes for
    #   inputs of the later pipelines as FeatureUnion intrinsically converts outputs to numpy arrays.
    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(impute=impute, verbose=verbose, imputeStrategy=imputeStrategy, tunedRandomForest=tunedRandomForest, numeric_columns_as_categorical=numeric_columns_as_categorical)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])
    return pipeline 
Example 7
def test_stability_selection_regression():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas) 
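Examples 7 through 10 are tests from the third-party stability-selection package (scikit-learn-contrib); the imports they assume are roughly the following, and the helper _generate_dummy_regression_data is defined in that test module:

import numpy as np
from numpy.testing import assert_almost_equal
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from stability_selection import StabilitySelection  # third-party package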
Example 8
def test_with_complementary_pairs_bootstrap():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid,
                                  bootstrap_func='complementary_pairs')
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas) 
Example 9
def test_different_shape():
    n, p, k = 100, 200, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)
    # Transforming with a different number of features than seen during fit
    # raises a ValueError; the upstream test presumably asserts this error.
    selector.transform(X[:, :-2]) 
Example 10
def test_no_features():
    n, p, k = 100, 200, 0

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    assert_almost_equal(selector.transform(X),
                        np.empty(0).reshape((X.shape[0], 0))) 
Example 11
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])
    return pipeline


#########################################################################
# Fitting each encoding method with a RidgeCV
# --------------------------------------------
# Finally, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score: 
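(The loop itself is not shown in this excerpt.) The function above leans on encoders_dict, clean_columns and dirty_column defined elsewhere in the original script. A self-contained sketch of the same ColumnTransformer-inside-Pipeline pattern, with a toy DataFrame assumed:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.DataFrame({"city": ["a", "b", "a", "c", "b", "a"],
                   "age": [20, 30, 40, 25, 35, 45]})
y = [0.0, 1.0, 2.0, 0.5, 1.5, 2.5]

pipe = Pipeline([
    ("union", ColumnTransformer(
        [("onehot", OneHotEncoder(), ["city"]),   # encode the categorical column
         ("num", "passthrough", ["age"])],        # keep the numeric column as-is
        remainder="drop")),
    ("scaler", StandardScaler(with_mean=False)),  # with_mean=False also works on sparse output
    ("clf", RidgeCV()),
])
pipe.fit(df, y)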
Example 12
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])

    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over the encoding methods, scoring each pipeline's
# predictions with a cross-validation score: 
Example 13
def test_keras_autoencoder_scoring(model, kind, n_features_out):
    """
    Test the KerasAutoEncoder and KerasLSTMAutoEncoder have a working scoring function
    """
    Model = pydoc.locate(f"gordo.machine.model.models.{model}")
    model = Pipeline([("model", Model(kind=kind))])
    X = np.random.random((8, 2))

    # Should be able to deal with y output different than X input features
    y = np.random.random((8, n_features_out))

    with pytest.raises(NotFittedError):
        model.score(X, y)

    model.fit(X, y)
    score = model.score(X, y)
    logger.info(f"Score: {score:.4f}") 
Example 14
def load(source_dir: Union[os.PathLike, str]) -> Any:
    """
    Load an object from a directory, saved by
    ``gordo.serializer.pipeline_serializer.dump``

    This takes a directory which is either top-level, meaning it contains
    a sub-directory in the naming scheme "n_step=<int>-class=<path.to.Class>",
    or that naming-scheme directory directly. Returns the
    deserialized object.


    Parameters
    ----------
    source_dir: Union[os.PathLike, str]
        Location of the top level dir the pipeline was saved

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
    """
    # This source dir should have a single pipeline entry directory.
    # may have been passed a top level dir, containing such an entry:
    with open(os.path.join(source_dir, "model.pkl"), "rb") as f:
        return pickle.load(f) 
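For context, the dump counterpart would have written the pickle this loader expects. A minimal sketch mirroring that file layout (gordo's real serializer.dump does more than this; the function name here is illustrative):

import os
import pickle

def dump_sketch(obj, dest_dir):
    # Persist any fitted pipeline/estimator as <dest_dir>/model.pkl,
    # the file name that load() opens above.
    with open(os.path.join(dest_dir, "model.pkl"), "wb") as f:
        pickle.dump(obj, f)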
Example 15
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # .iteritems() is Python 2 only
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline 
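The params argument, if given, is forwarded to Pipeline.set_params, so keys use the nested step__param naming. A hypothetical call:

# Hypothetical parameter names, following the step names above
# ('all' is the FeatureUnion, 'tfidf' the vectorizer, 'clf' the NB classifier):
pipeline = create_union_model({
    "all__tfidf__ngram_range": (1, 3),
    "clf__alpha": 0.05,
})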
Example 16
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    X = sparse_random_matrix(100, 100, density=0.10)
    missing_values = X.data[0]

    pipeline = Pipeline([('imputer',
                          SimpleImputer(missing_values=missing_values)),
                         ('tree',
                          tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"]
    }

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y) 
Example 17
def test_set_params_passes_all_parameters():
    # Make sure all parameters are passed together to set_params
    # of nested estimator. Regression test for #9944

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}
    for est in [Pipeline([('estimator', TestDecisionTree())]),
                GridSearchCV(TestDecisionTree(), {})]:
        est.set_params(estimator__max_depth=5,
                       estimator__min_samples_leaf=2) 
Example 18
def test_gridsearch_pipeline_precomputed():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model using a precomputed kernel.
    X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                        random_state=0)
    kpca = KernelPCA(kernel="precomputed", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca),
                         ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    X_kernel = rbf_kernel(X, gamma=2.)
    grid_search.fit(X_kernel, y)
    assert_equal(grid_search.best_score_, 1)


Example 19
def test_cv_pipeline_precomputed():
    # Cross-validate a regression on four coplanar points with the same
    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
    # is treated as a _pairwise operation.
    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    y_true = np.ones((4,))
    K = X.dot(X.T)
    kcent = KernelCenterer()
    pipeline = Pipeline([("kernel_centerer", kcent), ("svr",
                        SVR(gamma='scale'))])

    # did the pipeline set the _pairwise attribute?
    assert pipeline._pairwise

    # test cross-validation, score should be almost perfect
    # NB: this test is pretty vacuous -- it's mainly to test integration
    #     of Pipeline and KernelCenterer
    y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
    assert_array_almost_equal(y_true, y_pred) 
Example 20
def create_random_forest_tfidf():
    vectorizer = TfidfVectorizer(lowercase=False)
    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("rf", rf)]) 
Example 21
def create_random_forest_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("rf", rf)]) 
Example 22
def create_linear_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LinearRegression()
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)]) 
Example 23
def create_sklearn_linear_regressor(X, y, pipeline=False):
    lin = linear_model.LinearRegression(normalize=True)  # note: normalize was removed in scikit-learn 1.2
    if pipeline:
        lin = Pipeline([("lin", lin)])
    model = lin.fit(X, y)
    return model 
Example 24
def create_sklearn_logistic_regressor(X, y, pipeline=False):
    lin = linear_model.LogisticRegression()
    if pipeline:
        lin = Pipeline([("lin", lin)])
    model = lin.fit(X, y)
    return model 
Example 25
def fit(self, X, y=None, **fit_params):
        # SkLearn Pipeline expects all transformers to have a fit and transform method, but we are only using transform.
        return self 
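This is only the fit half of a stateless transformer. A minimal sketch of a complete one (illustrative class name), using scikit-learn's mixins so it drops straight into a Pipeline:

from sklearn.base import BaseEstimator, TransformerMixin

class NoOpTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer: fit learns nothing, transform does the work."""

    def fit(self, X, y=None, **fit_params):
        # Pipeline requires fit to exist even when there is nothing to learn.
        return self

    def transform(self, X):
        # Replace with the real transformation; identity here.
        return X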
Example 26
def fit(self, X, y=None, **fit_params):
        # SkLearn Pipeline expects all transformers to have a fit and transform method, but we are only using transform.
        return self 
Example 27
def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Creates a transformer object that will take a text series and generate TFIDF counts and frequency of syntactical structures.
    Suitable for use as a step in a SKLearn Pipeline.

    inputs:
        parser: a Spacy pipeline object
    returns:
        feature transformer: FeatureUnion
    '''
    tfidf = Pipeline([
            ('cln', CleanTextTransformer()),
            ('pre', PreTokenizer(parser=parser)),
            ('vect', TfidfVectorizer(
                         max_features=3000, decode_error='replace')),
            ('clf', None)
        ])
    grammar_counter = Pipeline([
            ('cln', CleanTextTransformer()),
            ('grm', GrammarTransformer(parser=parser)),
            ('to_dict', DictVectorizer()),
            ('clf', None)
        ])
    if run_grammar and run_tfidf:
        print('Running both feature sets.')
        feature_transformer = FeatureUnion([("tfidf", tfidf), ('grammar_counter', grammar_counter)])
    elif not run_grammar:
        print('Running only TFIDF.')
        feature_transformer = FeatureUnion([("tfidf", tfidf)])
    elif not run_tfidf:
        print('Running only PCFGs.')
        feature_transformer = FeatureUnion([('grammar_counter', grammar_counter)])
    return feature_transformer 
Example 28
def make_linear_reg_pipeline():
    steps = []

    steps.append(("standardize", StandardScaler()))
    steps.append(("linear regression", LinearRegression()))

    return Pipeline(steps) 
Example 29
def test_non_serializable_parameters(self):
        pipeline = Pipeline([('pca', PCA()), ('rf', RandomForestClassifier())])
        performance_dict, hyperparameters = functions.verify_estimator_class(
            pipeline,
            'predict_proba',
            dict(Accuracy=self.source),
            self.dataset_properties
        )
        assert functions.is_valid_json(hyperparameters) 
Example 30
def _pipeline_from_dict(dictionary):
        """Create a pipeline object from a dictionary"""
        all_steps = []
        for name, step in dictionary.items():
            model = RetrievalProduct._model_from_dict(step)
            all_steps.append((name, model))

        return Pipeline(all_steps)