Python source examples: sklearn.pipeline.make_pipeline()
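make_pipeline constructs a Pipeline from the estimators passed to it, generating the step names automatically from the lowercased class names instead of requiring explicit (name, estimator) tuples. A minimal sketch of this naming behavior, using the standard scikit-learn API:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression())
# step names are derived from the class names
print([name for name, _ in pipe.steps])
# ['standardscaler', 'logisticregression']

The examples below show make_pipeline in use across a range of projects.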
Example 1
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    pipeline = FeatureUnion([
        ("1", make_pipeline(
            FunctionTransformer(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            FunctionTransformer(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])
    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example 2
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    data.loc[:, "class"] = raw_data["target"]
    pipeline = PandasFeatureUnion([
        ("1", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])
    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example 3
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    data.loc[:, "class"] = raw_data["target"]
    pipeline = FeatureUnion([
        ("1", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])
    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example 4
def test_gradient_boosting_with_init_pipeline():
    # Check that the init estimator can be a pipeline (see issue #13466)
    X, y = make_regression(random_state=0)
    init = make_pipeline(LinearRegression())
    gb = GradientBoostingRegressor(init=init)
    gb.fit(X, y)  # pipeline without sample_weight works fine

    with pytest.raises(
            ValueError,
            match='The initial estimator Pipeline does not support sample '
                  'weights'):
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))

    # Passing sample_weight to a pipeline raises a ValueError. This test makes
    # sure we make the distinction between a ValueError raised by a pipeline
    # that was passed sample_weight, and a ValueError raised by a regular
    # estimator whose input checking failed.
    with pytest.raises(
            ValueError,
            match='nu <= 0 or nu > 1'):
        # Note that NuSVR properly supports sample_weight
        init = NuSVR(gamma='auto', nu=1.5)
        gb = GradientBoostingRegressor(init=init)
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))
Example 5
def test_pipeline():
    # Render a pipeline object
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
    expected = """
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=999, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)"""
    expected = expected[1:]  # remove first \n
    assert pipeline.__repr__() == expected
Example 6
def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()

    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")

    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")
    assert_equal(pipe.steps[2][0], "fitparamt")

    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "random_parameter"',
        make_pipeline, t1, t2, random_parameter='rnd'
    )
Example 7
def test_lasso_cv_with_some_model_selection():
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import StratifiedKFold
    from sklearn import datasets
    from sklearn.linear_model import LassoCV

    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target

    pipe = make_pipeline(
        StandardScaler(),
        LassoCV(cv=StratifiedKFold(n_splits=5))
    )
    pipe.fit(X, y)
Example 8
def build_language_classifier(texts, labels, verbose=False, random_state=None):
    """Train a text classifier with scikit-learn

    The text classifier is composed of two elements assembled in a pipeline:

    - A text feature extractor (`TfidfVectorizer`) that extracts the relative
      frequencies of unigrams, bigrams and trigrams of characters in the text.

    - An instance of `SGDClassifier` for the classification itself. To speed
      up training it is recommended to enable early stopping.

    `random_state` is passed to the underlying `SGDClassifier` instance.
    """
    language_classifier = make_pipeline(
        TfidfVectorizer(analyzer="char", ngram_range=(1, 3),
                        min_df=2, max_df=0.9, norm="l2", dtype=np.float32),
        SGDClassifier(early_stopping=True, validation_fraction=0.2,
                      n_iter_no_change=3, max_iter=1000, tol=1e-3,
                      alpha=1e-5, penalty="l2", verbose=verbose,
                      random_state=random_state)
    )
    return language_classifier.fit(texts, labels)
Example 9
def test_time(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())
    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())
    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    starttime = datetime.datetime.now()
    test_benchmark.run_test(pipeline, name, path)
    endtime = datetime.datetime.now()
    # timedelta.total_seconds() covers the whole elapsed interval;
    # .microseconds alone would only report the sub-second component
    print("Used time (ms): ", (endtime - starttime).total_seconds() * 1000)
    print("")
Example 10
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
    f_svm.write(data.decode('utf-8'))
    f_svm.close()

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train))
Example 11
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [1, 1],
                         [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.
Example 12
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([[2, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 1],
                         [0, 0],
                         [0, 0],
                         [0, 0],
                         [1, 1],
                         [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1)
    assert np.mean(cv_scores) > 0.
Example 13
def test_compare_with_sklearn(self):
    from lale.operators import make_pipeline
    tfm = PCA()
    clf = LogisticRegression(LogisticRegression.solver.lbfgs, LogisticRegression.multi_class.auto)
    trainable = make_pipeline(tfm, clf)
    digits = sklearn.datasets.load_digits()
    trained = trainable.fit(digits.data, digits.target)
    predicted = trained.predict(digits.data)

    from sklearn.pipeline import make_pipeline as scikit_make_pipeline
    from sklearn.decomposition import PCA as SklearnPCA
    from sklearn.linear_model import LogisticRegression as SklearnLR
    sklearn_pipeline = scikit_make_pipeline(SklearnPCA(), SklearnLR(solver="lbfgs", multi_class="auto"))
    sklearn_pipeline.fit(digits.data, digits.target)
    predicted_sklearn = sklearn_pipeline.predict(digits.data)

    from sklearn.metrics import accuracy_score
    lale_score = accuracy_score(digits.target, predicted)
    scikit_score = accuracy_score(digits.target, predicted_sklearn)
    self.assertEqual(lale_score, scikit_score)
Example 14
def test_import_from_sklearn_pipeline_feature_union(self):
    from sklearn.pipeline import FeatureUnion
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline
    union = FeatureUnion([("pca", PCA(n_components=1)), ("nys", Nystroem(n_components=2, random_state=42))])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 3)
    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    self.assertEqual(lale_pipeline.edges()[0][0]._impl_class(), PCAImpl)
    self.assertEqual(lale_pipeline.edges()[0][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[1][0]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[1][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[2][0]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[2][1]._impl_class(), KNeighborsClassifierImpl)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Example 15
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert isinstance(estimator[0].steps[-1][1].random_state, int)
Example 16
def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes. Check issue #9524 for full
    # discussion.
    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
Example 17
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ])
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(
            FunctionTransformer(replace, validate=False),
            regressor
        )
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert_equal(y.shape, y_hat.shape)

    # Verify that exceptions can be raised by wrapper regressor
    regressor = DecisionTreeRegressor()
    pipeline = make_pipeline(regressor)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_regressor = BaggingRegressor(pipeline)
    assert_raises(ValueError, bagging_regressor.fit, X, y)
Example 18
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
Example 19
def test_pipeline_ducktyping():
    pipeline = make_pipeline(Mult(5))
    pipeline.predict
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline('passthrough')
    assert pipeline.steps[0] == ('passthrough', 'passthrough')
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf(), NoInvTransf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')

    pipeline = make_pipeline(NoInvTransf(), Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')
Example 20
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
Example 21
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    if LooseVersion(joblib_version) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        memory = Memory(cachedir=cachedir, verbose=10)
    else:
        memory = Memory(location=cachedir, verbose=10)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert pipeline.memory is memory
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert pipeline.memory is None
    assert len(pipeline) == 2

    shutil.rmtree(cachedir)
Example 22
def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params, cv=5)
    search.fit(X)
    assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)
Example 23
def get_pipeline(clf=RandomForestClassifier(n_estimators=100, class_weight="balanced")):
    return make_pipeline(DictVectorizer(sparse=False), clf)
Example 24
def create_features_from_dataframe(self, df_train: pd.DataFrame, df_test: pd.DataFrame):
    train_length = len(df_train)
    n_components = 30
    df_data: pd.DataFrame = pd.concat([df_train, df_test])
    pipeline = make_pipeline(
        OneHotEncoder(),
        TruncatedSVD(n_components=n_components, random_state=71)
    )
    features = pipeline.fit_transform(df_data[['ip', 'app', 'os', 'device', 'channel']].values).astype(np.float32)
    feature_columns = []
    for i in range(n_components):
        feature_columns.append(self.name + '_{}'.format(i))
    return pd.DataFrame(data=features[:train_length], columns=feature_columns), \
           pd.DataFrame(data=features[train_length:], columns=feature_columns)
Example 25
def create_features_from_dataframe(self, df_train: pd.DataFrame, df_test: pd.DataFrame):
    train_length = len(df_train)
    n_components = 30
    df_data: pd.DataFrame = pd.concat([df_train, df_test])
    pipeline = make_pipeline(
        OneHotEncoder(),
        TfidfTransformer(),
        TruncatedSVD(n_components=30, random_state=71)
    )
    features = pipeline.fit_transform(df_data[['ip', 'app', 'os', 'device', 'channel']].values).astype(np.float32)
    feature_columns = []
    for i in range(n_components):
        feature_columns.append(self.name + '_{}'.format(i))
    return pd.DataFrame(data=features[:train_length], columns=feature_columns), \
           pd.DataFrame(data=features[train_length:], columns=feature_columns)
Example 26
def test_pipeline():
    """Check that SymbolicRegressor/Transformer can work in a pipeline"""
    # Check the regressor
    est = make_pipeline(StandardScaler(),
                        SymbolicRegressor(population_size=50,
                                          generations=5,
                                          tournament_size=5,
                                          random_state=0))
    est.fit(boston.data, boston.target)
    assert_almost_equal(est.score(boston.data, boston.target), -4.00270923)

    # Check the classifier
    est = make_pipeline(StandardScaler(),
                        SymbolicClassifier(population_size=50,
                                           generations=5,
                                           tournament_size=5,
                                           random_state=0))
    est.fit(cancer.data, cancer.target)
    assert_almost_equal(est.score(cancer.data, cancer.target), 0.934973637961)

    # Check the transformer
    est = make_pipeline(SymbolicTransformer(population_size=50,
                                            hall_of_fame=20,
                                            generations=5,
                                            tournament_size=5,
                                            random_state=0),
                        DecisionTreeRegressor())
    est.fit(boston.data, boston.target)
    assert_almost_equal(est.score(boston.data, boston.target), 1.0)
Example 27
def test_relieff_pipeline():
    """Check: Data (Binary Endpoint, Discrete Features): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3, n_jobs=-1)) > 0.7
Example 28
def test_relieff_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): ReliefF works in a sklearn pipeline when ReliefF is parallelized"""
    # Note that the ReBATE algorithm cannot be parallelized together with both
    # the random forest and the cross-validation all at once. If the ReBATE
    # algorithm is parallelized, the cross-validation scoring cannot be.
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
Example 29
def test_relieffpercent_pipeline():
    """Check: Data (Binary Endpoint, Discrete Features): ReliefF with % neighbors works in a sklearn pipeline"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=0.1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3, n_jobs=-1)) > 0.7
Example 30
def test_surf_pipeline():
    """Check: Data (Binary Endpoint, Discrete Features): SURF works in a sklearn pipeline"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3, n_jobs=-1)) > 0.7