Python源码示例:sklearn.pipeline.Pipeline()
示例1
def run_logreg(X_train, y_train, selection_threshold=0.2):
    """Fit a feature-selection + logistic-regression pipeline and print its accuracy.

    Parameters
    ----------
    X_train, y_train
        Training samples and labels handed to ``Pipeline.fit``.
    selection_threshold : float, default 0.2
        Forwarded to ``RandomizedLogisticRegression``.

    Returns
    -------
    Pipeline
        The fitted two-step pipeline.

    Notes
    -----
    ``X_test`` and ``y_test`` are not parameters of this function; they are
    resolved from the enclosing scope when the testing accuracy is printed.
    """
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))

    selector = RandomizedLogisticRegression(selection_threshold=selection_threshold)
    steps = [
        ("feature_selection", selector),
        ("classification", LogisticRegression()),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X_train, y_train)

    train_accuracy = pipe.score(X_train, y_train)
    print("training accuracy : {}".format(train_accuracy))
    test_accuracy = pipe.score(X_test, y_test)
    print("testing accuracy : {}".format(test_accuracy))
    return pipe
示例2
def create_logistic_vectorizer():
    """Return an unfitted pipeline: binary count vectorizer -> logistic regression.

    The vectorizer keeps the original casing (``lowercase=False``) and the
    classifier is seeded with ``random_state=777``.
    """
    steps = [
        ("vectorizer", CountVectorizer(lowercase=False, min_df=0.0, binary=True)),
        ("lr", LogisticRegression(random_state=777)),
    ]
    return Pipeline(steps)
示例3
def pca(self, **kwargs):
    """Fit a StandardScaler + PCA pipeline on this object's matrix.

    Keyword Args:
        n_components: forwarded to ``PCA``; defaults to ``0.995`` when absent.
        dates: when present, passed to ``self.to_matrix``; otherwise
            ``self.to_matrix()`` is called without arguments.
        file: when present, the fitted pipeline is handed to ``tofile``
            together with this value.

    Returns:
        The fitted pipeline, which is also stored on ``self._pipeline``.
    """
    n_components = kwargs.get('n_components', 0.995)
    matrix = self.to_matrix(kwargs['dates']) if 'dates' in kwargs else self.to_matrix()

    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
    ]
    self._pipeline = Pipeline(steps)
    self._pipeline.fit(matrix)

    if 'file' in kwargs:
        tofile(kwargs['file'], self._pipeline)
    return self._pipeline
示例4
def run(self):
    """Fit the configured classifier and score the test split.

    Applies ``self.params`` to ``self.clf``, wraps the classifier in a
    one-step pipeline, fits it on the training data and stores the second
    column of ``predict_proba`` for the test data in ``self.y_pred_probs``.
    ``self.importances`` is set only for the model types listed below.
    """
    self.clf.set_params(**self.params)

    # The feature-generation step is currently disabled, so the pipeline
    # consists of the bare classifier.
    self.pipeline = Pipeline([('clf', self.clf)])
    fitted = self.pipeline.fit(self.X_train, self.y_train)
    self.y_pred_probs = fitted.predict_proba(self.X_test)[:, 1]

    uses_feature_importances = ('RF', 'ET', 'AB', 'GB', 'DT')
    uses_coefficients = ('SVM', 'LR', 'SGD')
    if self.model_type in uses_feature_importances:
        self.importances = self.clf.feature_importances_
    elif self.model_type in uses_coefficients:
        self.importances = self.clf.coef_[0]
示例5
def _iwp_model(self, processes, cv_folds):
    """Return the default model for the IWP regressor.

    The model is a ``GridSearchCV`` over a two-step pipeline (robust scaling
    followed by an ``MLPRegressor``).

    Args:
        processes: forwarded to ``GridSearchCV`` as ``n_jobs``.
        cv_folds: forwarded to ``GridSearchCV`` as ``cv``.

    Returns:
        An unfitted ``GridSearchCV`` instance with ``refit=True``.
    """
    # A Pipeline behaves like a single estimator, so scaling and regression
    # can be tuned and fitted together.
    scaler = RobustScaler(quantile_range=(15, 85))
    # NOTE(review): scikit-learn documents early_stopping as effective only
    # for the 'sgd' and 'adam' solvers, while the grid below uses 'lbfgs' --
    # confirm this is intended.
    regressor = MLPRegressor(max_iter=6000, early_stopping=True)
    estimator = Pipeline([
        ("scaler", scaler),
        ("estimator", regressor),
    ])

    # Candidate network layouts explored by the grid search.
    hidden_layer_sizes = [
        (15, 10, 3),
    ]

    # One grid: lbfgs solver with tanh activation, varying seed and alpha.
    lbfgs_grid = dict(
        estimator__solver=['lbfgs'],
        estimator__activation=['tanh'],
        estimator__hidden_layer_sizes=hidden_layer_sizes,
        estimator__random_state=[0, 42, 100, 3452],
        estimator__alpha=[0.1, 0.001, 0.0001],
    )
    hyper_parameter = [lbfgs_grid]

    return GridSearchCV(
        estimator,
        hyper_parameter,
        refit=True,
        n_jobs=processes,
        cv=cv_folds,
        verbose=self.verbose,
    )
示例6
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True, imputeStrategy='MeanMode', tunedRandomForest=False, numeric_columns_as_categorical=None):
    """
    Build the data preparation pipeline.

    The returned pipeline runs, in order: column-suffix filtering, removal of
    the grain column, imputation, null-row filtering, conversion of the
    target to binary, conversion of the predicted column to numeric, and
    dummy-variable creation (excluding the predicted column).

    Advanced users may prefer to assemble their own custom pipeline.
    """
    # A FeatureUnion would be more compact, but it converts outputs to numpy
    # arrays, which does not suit later steps that expect pandas dataframes.
    suffix_filter = hcai_filters.DataframeColumnSuffixFilter()
    grain_remover = hcai_filters.DataframeColumnRemover(grain_column)
    # TODO: consider making imputation optional to handle rare but very
    # predictive values.
    imputer = hcai_transformers.DataFrameImputer(
        impute=impute,
        verbose=verbose,
        imputeStrategy=imputeStrategy,
        tunedRandomForest=tunedRandomForest,
        numeric_columns_as_categorical=numeric_columns_as_categorical,
    )
    null_filter = hcai_filters.DataframeNullValueFilter(excluded_columns=None)
    target_to_binary = hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)
    target_to_numeric = hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)
    dummies = hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])

    steps = [
        ('remove_DTS_columns', suffix_filter),
        ('remove_grain_column', grain_remover),
        ('imputation', imputer),
        ('null_row_filter', null_filter),
        ('convert_target_to_binary', target_to_binary),
        ('prediction_to_numeric', target_to_numeric),
        ('create_dummy_variables', dummies),
    ]
    return Pipeline(steps)
示例7
def test_stability_selection_regression():
    """Stability selection over a scaled Lasso recovers the important betas."""
    n, _p, k = 500, 1000, 5
    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    steps = [('scaler', StandardScaler()), ('model', Lasso())]
    selector = StabilitySelection(
        base_estimator=Pipeline(steps),
        lambda_name='model__alpha',
        lambda_grid=np.logspace(-1, 1, num=10),
    )
    selector.fit(X, y)

    # The selected column indices must match the generated important ones.
    assert_almost_equal(important_betas, selector.get_support(indices=True))
示例8
def test_with_complementary_pairs_bootstrap():
    """Same recovery check, using the 'complementary_pairs' bootstrap function."""
    n, _p, k = 500, 1000, 5
    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    steps = [('scaler', StandardScaler()), ('model', Lasso())]
    selector = StabilitySelection(
        base_estimator=Pipeline(steps),
        lambda_name='model__alpha',
        lambda_grid=np.logspace(-1, 1, num=10),
        bootstrap_func='complementary_pairs',
    )
    selector.fit(X, y)

    # The selected column indices must match the generated important ones.
    assert_almost_equal(important_betas, selector.get_support(indices=True))
示例9
def test_different_shape():
    """Fit a selector, then call transform with two trailing columns removed.

    NOTE(review): no assertion is visible here; presumably an expected-error
    decorator sits above this function in the original file -- confirm.
    """
    n, _p, k = 100, 200, 5
    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    steps = [('scaler', StandardScaler()), ('model', Lasso())]
    selector = StabilitySelection(
        base_estimator=Pipeline(steps),
        lambda_name='model__alpha',
        lambda_grid=np.logspace(-1, 1, num=10),
    )
    selector.fit(X, y)

    # Input with a column count different from the one seen during fit.
    narrower_X = X[:, :-2]
    selector.transform(narrower_X)
示例10
def test_no_features():
    """With k=0 the transform output must have zero columns."""
    n, _p, k = 100, 200, 0
    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    steps = [('scaler', StandardScaler()), ('model', Lasso())]
    selector = StabilitySelection(
        base_estimator=Pipeline(steps),
        lambda_name='model__alpha',
        lambda_grid=np.logspace(-1, 1, num=10),
    )
    selector.fit(X, y)

    transformed = selector.transform(X)
    # Expected result: one row per sample, no columns.
    expected = np.empty(0).reshape((X.shape[0], 0))
    assert_almost_equal(transformed, expected)
示例11
def make_pipeline(encoding_method):
    """Build a ColumnTransformer -> StandardScaler -> RidgeCV pipeline.

    Each entry of ``clean_columns`` gets the encoder named by its value in
    ``encoders_dict``; ``dirty_column`` gets the encoder selected by
    ``encoding_method``. Columns not listed are dropped.
    """
    # Fixed encoders for the clean columns.
    transformers = []
    for col, enc in clean_columns.items():
        transformers.append((enc + '_' + col, encoders_dict[enc], [col]))

    # Encoder under evaluation, applied to the dirty column.
    transformers.append(
        (encoding_method, encoders_dict[encoding_method], [dirty_column])
    )

    # ColumnTransformer combines the per-column features.
    union = ColumnTransformer(transformers=transformers, remainder='drop')
    return Pipeline([
        ('union', union),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV()),
    ])
#########################################################################
# Fitting each encoding method with a RidgeCV
# -------------------------------------------
# Finally, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score:
示例12
def make_pipeline(encoding_method):
    """Build a ColumnTransformer -> StandardScaler -> RandomForest pipeline.

    ``clean_columns`` are encoded with ``encoder_dict['one-hot']`` and
    ``dirty_column`` with the encoder selected by ``encoding_method``.
    Columns not listed are dropped.
    """
    # Fixed one-hot encoding for the clean columns.
    clean_step = ('one-hot-clean', encoder_dict['one-hot'], clean_columns)
    # Encoder under evaluation, applied to the dirty column.
    dirty_step = (
        encoding_method + '-dirty',
        encoder_dict[encoding_method],
        [dirty_column],
    )
    transformers = [clean_step, dirty_step]

    # ColumnTransformer combines the per-column features.
    union = ColumnTransformer(transformers=transformers, remainder='drop')
    return Pipeline([
        ('union', union),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5)),
    ])
###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over encoding methods, scoring the different pipeline predictions
# using a cross validation score:
示例13
def test_keras_autoencoder_scoring(model, kind, n_features_out):
    """
    Check that the model class named by ``model`` has a working ``score``.

    Scoring before fitting must raise ``NotFittedError``; after fitting it
    must return a value that can be formatted as a float.
    """
    model_cls = pydoc.locate(f"gordo.machine.model.models.{model}")
    pipeline = Pipeline([("model", model_cls(kind=kind))])

    X = np.random.random((8, 2))
    # The target may have a different number of features than the input.
    y = np.random.random((8, n_features_out))

    with pytest.raises(NotFittedError):
        pipeline.score(X, y)

    pipeline.fit(X, y)
    score = pipeline.score(X, y)
    logger.info(f"Score: {score:.4f}")
示例14
def load(source_dir: Union[os.PathLike, str]) -> Any:
    """
    Load the object stored as ``model.pkl`` inside ``source_dir``.

    Parameters
    ----------
    source_dir: Union[os.PathLike, str]
        Directory that contains the ``model.pkl`` file.

    Returns
    -------
    Any
        Whatever object was pickled into ``model.pkl``.

    Notes
    -----
    The file is read with ``pickle.load``, which can execute arbitrary code;
    only call this on directories from a trusted source.
    """
    model_path = os.path.join(source_dir, "model.pkl")
    with open(model_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded
示例15
def create_union_model(params=None):
    """Build a FeatureUnion(linguistic stats, TF-IDF) -> MultinomialNB pipeline.

    Args:
        params: optional mapping of pipeline parameters; when truthy it is
            applied with ``pipeline.set_params(**params)``.

    Returns:
        The unfitted pipeline.
    """
    def preprocessor(tweet):
        """Lower-case a tweet and apply the emoticon and regex replacements."""
        tweet = tweet.lower()
        # Emoticons are replaced in the order given by emo_repl_order.
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        # dict.iteritems() exists only on Python 2 and raises AttributeError
        # on Python 3; items() works on both.
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)
    return pipeline
示例16
def test_imputation_pipeline_grid_search():
    """Grid-search the imputer strategy inside an imputer -> tree pipeline."""
    X = sparse_random_matrix(100, 100, density=0.10)
    # Treat the first stored value of the sparse matrix as the missing marker.
    missing_values = X.data[0]

    imputer = SimpleImputer(missing_values=missing_values)
    regressor = tree.DecisionTreeRegressor(random_state=0)
    pipeline = Pipeline([('imputer', imputer), ('tree', regressor)])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"],
    }
    Y = sparse_random_matrix(100, 1, density=0.10).toarray()

    GridSearchCV(pipeline, parameters).fit(X, Y)
示例17
def test_set_params_passes_all_parameters():
    """Nested estimators must receive all parameters in one set_params call.

    Regression test for #9944.
    """
    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is captured from the enclosing test function.
            assert kwargs == expected_kwargs
            return self

    wrappers = (
        Pipeline([('estimator', TestDecisionTree())]),
        GridSearchCV(TestDecisionTree(), {}),
    )
    for est in wrappers:
        est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2)
示例18
def test_gridsearch_pipeline_precomputed():
    """Grid-search a KernelPCA -> Perceptron pipeline on a precomputed kernel.

    The data are two concentric circles; the best cross-validated score is
    expected to equal 1.
    """
    X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0)

    steps = [
        ("kernel_pca", KernelPCA(kernel="precomputed", n_components=2)),
        ("Perceptron", Perceptron(max_iter=5)),
    ]
    pipeline = Pipeline(steps)

    param_grid = {"Perceptron__max_iter": np.arange(1, 5)}
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)

    # The pipeline is fed the RBF kernel matrix rather than the raw samples.
    X_kernel = rbf_kernel(X, gamma=2.)
    grid_search.fit(X_kernel, y)
    assert_equal(grid_search.best_score_, 1)
# 0.23. warning about tol not having its correct default value.
示例19
def test_cv_pipeline_precomputed():
    """Cross-validate a KernelCenterer -> SVR pipeline on a precomputed kernel.

    Four coplanar points share the same target value, so the predictions
    should almost match the targets. This mainly exercises the integration
    of Pipeline and KernelCenterer.
    """
    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    y_true = np.ones((4,))
    K = X.dot(X.T)

    steps = [
        ("kernel_centerer", KernelCenterer()),
        ("svr", SVR(gamma='scale')),
    ]
    pipeline = Pipeline(steps)

    # The pipeline must expose the _pairwise attribute as truthy.
    assert pipeline._pairwise

    y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
    assert_array_almost_equal(y_true, y_pred)
示例20
def create_random_forest_tfidf():
    """Return an unfitted pipeline: TF-IDF vectorizer -> random forest.

    The vectorizer keeps the original casing; the forest uses 500 trees and
    ``random_state=777``.
    """
    steps = [
        ("vectorizer", TfidfVectorizer(lowercase=False)),
        ("rf", RandomForestClassifier(n_estimators=500, random_state=777)),
    ]
    return Pipeline(steps)
示例21
def create_random_forest_vectorizer():
    """Return an unfitted pipeline: binary count vectorizer -> random forest.

    The vectorizer keeps the original casing; the forest uses 500 trees and
    ``random_state=777``.
    """
    steps = [
        ("vectorizer", CountVectorizer(lowercase=False, min_df=0.0, binary=True)),
        ("rf", RandomForestClassifier(n_estimators=500, random_state=777)),
    ]
    return Pipeline(steps)
示例22
def create_linear_vectorizer():
    """Return an unfitted pipeline: binary count vectorizer -> linear regression.

    The vectorizer keeps the original casing (``lowercase=False``).
    """
    steps = [
        ("vectorizer", CountVectorizer(lowercase=False, min_df=0.0, binary=True)),
        ("lr", LinearRegression()),
    ]
    return Pipeline(steps)
示例23
def create_sklearn_linear_regressor(X, y, pipeline=False):
    """Fit a linear regression on ``X``/``y`` and return the fitted model.

    When ``pipeline`` is truthy the regressor is wrapped in a one-step
    ``Pipeline`` (step name ``"lin"``) before fitting.

    NOTE(review): the ``normalize`` argument was removed from
    ``LinearRegression`` in recent scikit-learn releases -- confirm the
    version this code targets.
    """
    estimator = linear_model.LinearRegression(normalize=True)
    candidate = Pipeline([("lin", estimator)]) if pipeline else estimator
    return candidate.fit(X, y)
示例24
def create_sklearn_logistic_regressor(X, y, pipeline=False):
    """Fit a logistic regression on ``X``/``y`` and return the fitted model.

    When ``pipeline`` is truthy the classifier is wrapped in a one-step
    ``Pipeline`` (step name ``"lin"``) before fitting.
    """
    estimator = linear_model.LogisticRegression()
    candidate = Pipeline([("lin", estimator)]) if pipeline else estimator
    return candidate.fit(X, y)
示例25
def fit(self, X, y=None, **fit_params):
    """No-op fit that returns ``self`` unchanged.

    A scikit-learn Pipeline expects every transformer to provide both
    ``fit`` and ``transform``; only ``transform`` does real work here, so
    all arguments are ignored.
    """
    return self
示例26
def fit(self, X, y=None, **fit_params):
    """No-op fit that returns ``self`` unchanged.

    A scikit-learn Pipeline expects every transformer to provide both
    ``fit`` and ``transform``; only ``transform`` does real work here, so
    all arguments are ignored.
    """
    return self
示例27
def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Build a FeatureUnion of text feature pipelines for use as a Pipeline step.

    inputs:
        parser: a Spacy pipeline object, passed to PreTokenizer and
            GrammarTransformer
        run_grammar: include the grammar-count features
        run_tfidf: include the TFIDF features
    returns:
        feature transformer: FeatureUnion

    When run_grammar is falsy only the TFIDF branch is used, regardless of
    run_tfidf.
    '''
    # Both sub-pipelines are always constructed; the flags only decide which
    # of them end up in the union.
    tfidf_steps = [
        ('cln', CleanTextTransformer()),
        ('pre', PreTokenizer(parser=parser)),
        ('vect', TfidfVectorizer(max_features=3000, decode_error='replace')),
        ('clf', None),
    ]
    tfidf = Pipeline(tfidf_steps)

    grammar_steps = [
        ('cln', CleanTextTransformer()),
        ('grm', GrammarTransformer(parser=parser)),
        ('to_dict', DictVectorizer()),
        ('clf', None),
    ]
    grammar_counter = Pipeline(grammar_steps)

    if not run_grammar:
        print('Running only TFIDF.')
        members = [("tfidf", tfidf)]
    elif not run_tfidf:
        print('Running only PCFGs.')
        members = [('grammar_counter', grammar_counter)]
    else:
        print('Running both feature sets.')
        members = [("tfidf", tfidf), ('grammar_counter', grammar_counter)]
    return FeatureUnion(members)
示例28
def make_linear_reg_pipeline():
    """Return an unfitted pipeline: StandardScaler -> LinearRegression."""
    return Pipeline([
        ("standardize", StandardScaler()),
        ("linear regression", LinearRegression()),
    ])
示例29
def test_non_serializable_parameters(self):
    """Hyperparameters reported for a PCA -> random forest pipeline are valid JSON."""
    estimator = Pipeline([('pca', PCA()), ('rf', RandomForestClassifier())])
    metrics = {'Accuracy': self.source}

    # Only the second element of the returned pair is checked.
    _performance, hyperparameters = functions.verify_estimator_class(
        estimator,
        'predict_proba',
        metrics,
        self.dataset_properties,
    )
    assert functions.is_valid_json(hyperparameters)
示例30
def _pipeline_from_dict(dictionary):
    """Create a pipeline object from a dictionary.

    Each key becomes a step name and each value is converted to a model with
    ``RetrievalProduct._model_from_dict``; steps keep the dictionary's
    iteration order.
    """
    all_steps = [
        [name, RetrievalProduct._model_from_dict(step)]
        for name, step in dictionary.items()
    ]
    return Pipeline(all_steps)