Python source code examples: sklearn.pipeline.FeatureUnion()
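FeatureUnion fits several transformers on the same input and concatenates their outputs column-wise. Before the collected examples below, here is a minimal sketch of that basic behaviour using only stock scikit-learn estimators:

    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.pipeline import FeatureUnion
    from sklearn.preprocessing import StandardScaler

    X = load_iris().data                      # shape (150, 4)
    union = FeatureUnion([
        ("scaled", StandardScaler()),         # 4 standardized columns
        ("pca", PCA(n_components=2)),         # 2 principal components
    ])
    Xt = union.fit_transform(X)
    print(Xt.shape)                           # (150, 6): the outputs are hstacked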
Example 1
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    pipeline = FeatureUnion([
        ("1", make_pipeline(
            FunctionTransformer(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            FunctionTransformer(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])
    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example 2
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    data.loc[:, "class"] = raw_data["target"]
    pipeline = FeatureUnion([
        ("1", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])
    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example 3
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # .iteritems() is Python 2 only
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
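The params argument in Example 3 uses scikit-learn's nested naming scheme, where each level contributes a <step>__ prefix. A minimal sketch of the same naming pattern, with the project-specific LinguisticVectorizer and preprocessing left out:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import FeatureUnion, Pipeline

    # same step names as create_union_model above ('all' -> 'tfidf', 'clf')
    pipeline = Pipeline([
        ("all", FeatureUnion([("tfidf", TfidfVectorizer(analyzer="word"))])),
        ("clf", MultinomialNB()),
    ])
    pipeline.set_params(
        all__tfidf__ngram_range=(1, 2),   # pipeline step -> union step -> parameter
        clf__alpha=0.5,
    )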
Example 4
def _apply_extractor(extractor, X, return_as_df):
    """Utility function to apply features extractor to ndarray X.

    Parameters
    ----------
    extractor : Instance of :class:`~sklearn.pipeline.FeatureUnion` or
        :class:`~sklearn.pipeline.Pipeline`.
    X : ndarray, shape (n_channels, n_times)
    return_as_df : bool

    Returns
    -------
    X : ndarray, shape (n_features,)
    feature_names : list of str | None
        Not None, only if ``return_as_df`` is True.
    """
    X = extractor.fit_transform(X)
    feature_names = None
    if return_as_df:
        feature_names = extractor.get_feature_names()
    return X, feature_names
Example 5
def __add__(self, other):
    """
    Returns:
        :py:class:`ibex.sklearn.pipeline.FeatureUnion`
    """
    if isinstance(self, FeatureUnion):
        self_features = [e[1] for e in self.transformer_list]
    else:
        self_features = [self]

    if isinstance(other, FeatureUnion):
        other_features = [e[1] for e in other.transformer_list]
    else:
        other_features = [other]

    combined = self_features + other_features
    return FeatureUnion(_make_pipeline_steps(combined))
Example 6
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.
    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model
Example 7
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    vect_numerator = vectorizers.NumberVectorizer()
    vect_denominator = vectorizers.NumberVectorizer()

    def get_feature_names_(vect_numerator, vect_denominator):
        def res():
            return ['numerator_' + str(c) for c in vect_numerator.get_feature_names()] \
                   + ['denominator_' + str(c) for c in vect_denominator.get_feature_names()]
        return res

    return [
        ('vect', FeatureUnion(transformer_list=[
            ('numerator', Pipeline([
                ('selector', vectorizers.DictItemSelector(item='numerator')),
                ('vect', vect_numerator),
            ])),
            ('denominator', Pipeline([
                ('selector', vectorizers.DictItemSelector(item='denominator')),
                ('vect', vect_denominator),
            ]))
        ]))
    ], get_feature_names_(vect_numerator, vect_denominator)
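vectorizers.DictItemSelector in Example 7 is a project-specific helper; the general pattern is a tiny transformer that picks one field out of dict-like samples so that each FeatureUnion branch only sees its own input. A minimal illustrative sketch of such a selector (not the actual DictItemSelector implementation):

    from sklearn.base import BaseEstimator, TransformerMixin

    class ItemSelector(BaseEstimator, TransformerMixin):
        """Select a single key from dict-like samples (illustrative sketch)."""

        def __init__(self, item):
            self.item = item

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X[self.item]

    # usage inside a branch: ('selector', ItemSelector(item='numerator'))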
Example 8
def fit(self, X, y=None):
    """Fit all transformers using X.

    Parameters
    ----------
    X : iterable or array-like, depending on transformers
        Input data, used to fit transformers.
    y : array-like, shape (n_samples, ...), optional
        Targets for supervised learning.

    Returns
    -------
    self : FeatureUnion
        This estimator.
    """
    self.transformer_list = list(self.transformer_list)
    self._validate_transformers()
    with Pool(self.n_jobs) as pool:
        transformers = pool.starmap(
            _fit_one_transformer,
            ((trans, X[trans.steps[0][1].columns], y) for _, trans, _ in self._iter()))
    self._update_transformer_list(transformers)
    return self
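Example 8 overrides fit to fan the sub-pipelines out over a multiprocessing.Pool and to hand each one only its own columns. For comparison, the stock FeatureUnion already parallelises fitting via its n_jobs argument (joblib under the hood), although it passes the full X to every branch:

    from sklearn.decomposition import PCA
    from sklearn.pipeline import FeatureUnion
    from sklearn.preprocessing import StandardScaler

    union = FeatureUnion(
        [("scaled", StandardScaler()), ("pca", PCA(n_components=2))],
        n_jobs=2,   # fit the two transformers in parallel with joblib
    )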
Example 9
def test_import_from_sklearn_pipeline_feature_union(self):
    from sklearn.pipeline import FeatureUnion
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline

    union = FeatureUnion([("pca", PCA(n_components=1)),
                          ("nys", Nystroem(n_components=2, random_state=42))])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 3)

    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl

    self.assertEqual(lale_pipeline.edges()[0][0]._impl_class(), PCAImpl)
    self.assertEqual(lale_pipeline.edges()[0][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[1][0]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[1][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[2][0]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[2][1]._impl_class(), KNeighborsClassifierImpl)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Example 10
def test_export_to_sklearn_pipeline3(self):
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression, SVC
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    lale_pipeline = ((PCA() >> SelectKBest(k=2)) &
                     (Nystroem(random_state=42) >> SelectKBest(k=3)) &
                     (SelectKBest(k=3))) >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    self.assertIsInstance(sklearn_pipeline.named_steps['featureunion'], FeatureUnion)
    self.assertIsInstance(sklearn_pipeline.named_steps['selectkbest'], SelectKBest)
    from sklearn.linear_model import LogisticRegression
    self.assertIsInstance(sklearn_pipeline.named_steps['logisticregression'], LogisticRegression)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
Example 11
def test_multiouput_prediction(self):
    # TODO: Make this a real test
    steps = [
        ("pre_horizon", HorizonTransformer(horizon=4)),
        ("pre_imputer", ReversibleImputer(y_only=True)),
        ("features", FeatureUnion(
            [("ar_transformer", AutoregressiveTransformer(num_lags=3))]
        )),
        ("post_lag_imputer", ReversibleImputer()),
        ("regressor", LinearRegression()),
    ]
    pipeline = ForecasterPipeline(steps)

    l = np.linspace(0, 1, 100)
    y = np.sin(2 * np.pi * 5 * l) + np.random.normal(0, 0.1, size=100)
    pipeline.fit(y[:, np.newaxis], y)
    pipeline.predict(y[:, np.newaxis], to_scale=True, refit=True)
Example 12
def test_multiouput_forecast(self):
    # TODO: Make this a real test
    steps = [
        ("pre_horizon", HorizonTransformer(horizon=4)),
        ("pre_imputer", ReversibleImputer(y_only=True)),
        ("features", FeatureUnion(
            [("ar_transformer", AutoregressiveTransformer(num_lags=3))]
        )),
        ("post_lag_imputer", ReversibleImputer()),
        ("regressor", LinearRegression()),
    ]
    pipeline = ForecasterPipeline(steps)

    l = np.linspace(0, 1, 100)
    y = np.sin(2 * np.pi * 5 * l) + np.random.normal(0, 0.1, size=100)
    pipeline.fit(y[:, np.newaxis], y)
    pipeline.forecast(y[:, np.newaxis], 20)
Example 13
def transform(self, X):
    if self.func is None:
        return X
    if self.signature:
        input_dims, output_dims = _parse_gufunc_signature(
            signature=self.signature)
    else:
        input_dims, output_dims = [()], [()]
    # This ensures FeatureUnion's concatenation (hstack) does not fail
    # because the resulting arrays have different numbers of dimensions
    if len(input_dims[0]) == 1 and len(output_dims[0]) == 0:
        X = np.expand_dims(X, axis=1)  # Add one extra dimension if (n)->()
    elif len(input_dims[0]) == 0 and len(output_dims[0]) == 1:
        X = np.squeeze(X, axis=1)  # Remove singleton dimension if ()->(n)
    return np.vectorize(self.func, otypes=[float],  # np.float is removed in NumPy >= 1.24
                        signature=self.signature)(X)
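The expand_dims/squeeze juggling in Example 13 exists because FeatureUnion concatenates its branches with a horizontal stack, so every branch has to return a 2-D array of shape (n_samples, n_features). A small sketch of the failure mode being avoided:

    import numpy as np

    a = np.arange(4)                  # shape (4,): 1-D output from an (n)->() function
    b = np.arange(8).reshape(4, 2)    # shape (4, 2)
    # np.hstack([a, b]) would raise: the non-concatenation dimensions do not match
    print(np.hstack([a[:, np.newaxis], b]).shape)   # (4, 3) once 'a' is made 2-D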
Example 14
def test_feature_union(caplog, named_steps):
    pipe_w_default_log_callback = DebugPipeline(named_steps, log_callback="default")
    pipe_w_custom_log_callback = DebugPipeline(named_steps, log_callback=custom_log_callback)

    pipe_union = FeatureUnion(
        [
            ("pipe_w_default_log_callback", pipe_w_default_log_callback),
            ("pipe_w_custom_log_callback", pipe_w_custom_log_callback),
        ]
    )

    caplog.clear()
    with caplog.at_level(logging.INFO):
        pipe_union.fit(IRIS.data, IRIS.target)
    assert caplog.text, f"Log should be non-empty: {caplog.text}"
    for pipe in [pipe_w_default_log_callback, pipe_w_custom_log_callback]:
        for _, step in pipe.steps[:-1]:
            assert str(step) in caplog.text, f"{step} should be in: {caplog.text}"
            assert (
                caplog.text.count(str(step)) == 2
            ), f"{step} should appear exactly twice in {caplog.text}"
Example 15
def test_FeatureUnion_pipeline():
    # pipeline with segmentation plus multiple feature extraction
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=3)),
        ('transform', FeatureUnion([
            ('mean', RowTransformer(
                FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowTransformer(
                FunctionTransformer(func=np.std, validate=False)))
        ])),
        ('clf', DecisionTreeClassifier())
    ]
    clf = Pipeline(steps)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    assert y_pred.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(y_pred), np.unique(y_test))
Example 16
def _filter(obj):
    if isinstance(obj, DataFrameMapper):
        obj.features = _filter_steps(obj.features)
        if hasattr(obj, "built_features"):
            if obj.built_features is not None:
                obj.built_features = _filter_steps(obj.built_features)
    elif isinstance(obj, ColumnTransformer):
        obj.transformers = _filter_steps(obj.transformers)
        obj.remainder = _filter(obj.remainder)
        if hasattr(obj, "transformers_"):
            obj.transformers_ = _filter_steps(obj.transformers_)
    elif isinstance(obj, FeatureUnion):
        obj.transformer_list = _filter_steps(obj.transformer_list)
    elif isinstance(obj, Pipeline):
        obj.steps = _filter_steps(obj.steps)
    elif isinstance(obj, SelectorMixin):
        return SelectorProxy(obj)
    elif isinstance(obj, list):
        return [_filter(e) for e in obj]
    return obj
Example 17
def test_simple_feature_union(self):
    data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                       dtype=numpy.float32)
    model = FeatureUnion([("scaler1", StandardScaler()),
                          ("scaler2", RobustScaler())])
    model.fit(data)
    all_models = list(enumerate_pipeline_models(model))
    steps = collect_intermediate_steps(model, "feature union",
                                       [("input", FloatTensorType([None, 2]))])

    assert len(steps) == 2
    assert len(all_models) == 3

    model.transform(data)
    for step in steps:
        onnx_step = step['onnx_step']
        sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
        onnx_outputs = sess.run(None, {'input': data})
        onnx_output = onnx_outputs[0]
        skl_outputs = step['model']._debug.outputs['transform']
        assert_almost_equal(onnx_output, skl_outputs)
        compare_objects(onnx_output, skl_outputs)
Example 18
def test_feature_union_default(self):
    data = load_iris()
    X, y = data.data, data.target
    X = X.astype(np.float32)
    X_train, X_test, *_ = train_test_split(X, y, test_size=0.5,
                                           random_state=42)
    model = FeatureUnion([('standard', StandardScaler()),
                          ('minmax', MinMaxScaler())]).fit(X_train)
    model_onnx = convert_sklearn(
        model, 'feature union',
        [('input', FloatTensorType([None, X_test.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(X_test,
                        model,
                        model_onnx,
                        basename="SklearnFeatureUnionDefault")
Example 19
def test_feature_union_transformer_weights_1(self):
    data = load_digits()
    X, y = data.data, data.target
    X = X.astype(np.int64)
    X_train, X_test, *_ = train_test_split(X, y, test_size=0.5,
                                           random_state=42)
    model = FeatureUnion([('pca', PCA()),
                          ('svd', TruncatedSVD())],
                         transformer_weights={'pca': 10, 'svd': 3}
                         ).fit(X_train)
    model_onnx = convert_sklearn(
        model, 'feature union',
        [('input', Int64TensorType([None, X_test.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnFeatureUnionTransformerWeights1-Dec4",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 20
def test_feature_union_transformer_weights_2(self):
    data = load_digits()
    X, y = data.data, data.target
    X = X.astype(np.float32)
    X_train, X_test, *_ = train_test_split(X, y, test_size=0.5,
                                           random_state=42)
    model = FeatureUnion([('pca', PCA()),
                          ('svd', TruncatedSVD())],
                         transformer_weights={'pca1': 10, 'svd2': 3}
                         ).fit(X_train)
    model_onnx = convert_sklearn(
        model, 'feature union',
        [('input', FloatTensorType([None, X_test.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnFeatureUnionTransformerWeights2-Dec4",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 21
def test_same_result_weight(self):
    X, Z = self.make_text_rdd(2)

    loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    loc_word = CountVectorizer(analyzer="word")
    dist_word = SparkCountVectorizer(analyzer="word")

    loc_union = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word)
    ], transformer_weights={"words": 10})
    dist_union = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word)
    ], transformer_weights={"words": 10})

    loc_union.fit(X)
    dist_union.fit(Z)

    X_transformed = loc_union.transform(X)
    Z_transformed = sp.vstack(dist_union.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example 22
def make_sparkunion(*transformers):
    """Construct a FeatureUnion from the given transformers.

    This is a shorthand for the FeatureUnion constructor; it does not require,
    and does not permit, naming the transformers. Instead, they will be given
    names automatically based on their types. It also does not allow weighting.

    Examples
    --------
    >>> from sklearn.decomposition import PCA, TruncatedSVD
    >>> make_union(PCA(), TruncatedSVD())  # doctest: +NORMALIZE_WHITESPACE
    FeatureUnion(n_jobs=1,
                 transformer_list=[('pca', PCA(copy=True, n_components=None,
                                               whiten=False)),
                                   ('truncatedsvd',
                                    TruncatedSVD(algorithm='randomized',
                                                 n_components=2, n_iter=5,
                                                 random_state=None, tol=0.0))],
                 transformer_weights=None)

    Returns
    -------
    f : FeatureUnion
    """
    return SparkFeatureUnion(_name_estimators(transformers))
Example 23
def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Creates a transformer object that will take a text series and generate
    TFIDF counts and frequency of syntactical structures.
    Suitable for use as a step in a SKLearn Pipeline.

    inputs:
        parser: a Spacy pipeline object
    returns:
        feature transformer: FeatureUnion
    '''
    tfidf = Pipeline([
        ('cln', CleanTextTransformer()),
        ('pre', PreTokenizer(parser=parser)),
        ('vect', TfidfVectorizer(
            max_features=3000, decode_error='replace')),
        ('clf', None)
    ])
    grammar_counter = Pipeline([
        ('cln', CleanTextTransformer()),
        ('grm', GrammarTransformer(parser=parser)),
        ('to_dict', DictVectorizer()),
        ('clf', None)
    ])
    if run_grammar and run_tfidf:
        print('Running both feature sets.')
        feature_transformer = FeatureUnion([("tfidf", tfidf), ('grammar_counter', grammar_counter)])
    elif not run_grammar:
        print('Running only TFIDF.')
        feature_transformer = FeatureUnion([("tfidf", tfidf)])
    elif not run_tfidf:
        print('Running only PCFGs.')
        feature_transformer = FeatureUnion([('grammar_counter', grammar_counter)])
    return feature_transformer
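As the docstring says, the returned FeatureUnion is meant to be dropped into a larger Pipeline. A hedged usage sketch; the spaCy model name and the classifier are illustrative, and CleanTextTransformer, PreTokenizer and GrammarTransformer come from the original project:

    import spacy
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    parser = spacy.load("en_core_web_sm")          # assumed spaCy pipeline
    model = Pipeline([
        ("features", get_feature_transformer(parser)),
        ("clf", LogisticRegression(max_iter=1000)),
    ])
    # model.fit(text_series, labels)               # text_series: a pandas Series of documents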
Example 24
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
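transformer_weights simply multiplies each branch's output before the horizontal stack, which is what the assertions at the end of Example 24 spell out. A sketch of the same check written out by hand (it should reproduce the union's output, assuming fixed random_state keeps the randomized PCA deterministic):

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import FeatureUnion

    X, y = load_iris(return_X_y=True)
    fs = FeatureUnion([("pca", PCA(n_components=2, svd_solver='randomized', random_state=0)),
                       ("select", SelectKBest(k=1))],
                      transformer_weights={"pca": 10})
    manual = np.hstack([
        10 * PCA(n_components=2, svd_solver='randomized', random_state=0).fit_transform(X),
        SelectKBest(k=1).fit_transform(X, y),
    ])
    np.testing.assert_array_almost_equal(fs.fit_transform(X, y), manual)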
Example 25
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(AttributeError,
                         'Transformer tr1 (type Transf) does not provide '
                         'get_feature_names', ft.get_feature_names)
Example 26
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ['x5']

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x5'], ft.get_feature_names())
Example 27
def test_step_name_validation():
    bad_steps1 = [('a__q', Mult(2)), ('b', Mult(3))]
    bad_steps2 = [('a', Mult(2)), ('a', Mult(3))]
    for cls, param in [(Pipeline, 'steps'),
                       (FeatureUnion, 'transformer_list')]:
        # we validate in construction (despite scikit-learn convention)
        bad_steps3 = [('a', Mult(2)), (param, Mult(3))]
        for bad_steps, message in [
            (bad_steps1, "Estimator names must not contain __: got ['a__q']"),
            (bad_steps2, "Names provided are not unique: ['a', 'a']"),
            (bad_steps3, "Estimator names conflict with constructor "
                         "arguments: ['%s']" % param),
        ]:
            # three ways to make invalid:
            # - construction
            assert_raise_message(ValueError, message, cls,
                                 **{param: bad_steps})

            # - setattr
            est = cls(**{param: [('a', Mult(1))]})
            setattr(est, param, bad_steps)
            assert_raise_message(ValueError, message, est.fit, [[1]], [1])
            assert_raise_message(ValueError, message, est.fit_transform,
                                 [[1]], [1])

            # - set_params
            est = cls(**{param: [('a', Mult(1))]})
            est.set_params(**{param: bad_steps})
            assert_raise_message(ValueError, message, est.fit, [[1]], [1])
            assert_raise_message(ValueError, message, est.fit_transform,
                                 [[1]], [1])
Example 28
def construct_pipeline(classifier):
    """
    This function creates a feature extraction pipeline that accepts data
    from a CorpusLoader and appends the classification model to the end of
    the pipeline, returning a newly constructed Pipeline object that is
    ready to be fit and trained!
    """
    return Pipeline([
        # Create a Feature Union of Text Stats and Bag of Words
        ('union', FeatureUnion(
            transformer_list=[
                # Pipeline for pulling document structure features
                ('stats', Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer()),
                ])),
                # Pipeline for creating a bag of words TF-IDF vector
                ('bow', Pipeline([
                    ('tokens', TextNormalizer()),
                    ('tfidf', TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                    ('best', TruncatedSVD(n_components=1000)),
                ])),
            ],
            # weight components in feature union
            transformer_weights={
                'stats': 0.15,
                'bow': 0.85,
            },
        )),
        # Append the estimator to the end of the pipeline
        ('classifier', classifier),
    ])
Example 29
def __init__(self, transformer_list, n_jobs=1, transformer_weights=None, as_index=True):
    pipeline.FeatureUnion.__init__(
        self,
        transformer_list,
        n_jobs,
        transformer_weights)
    FrameMixin.__init__(self)

    self._as_index = as_index

    # Tmp Ami - get docstrings from sklearn.
Example 30
def _yield_preproc_steps(model):
    if not isinstance(model, Pipeline):
        return

    for key, val in model.get_params().items():
        if isinstance(val, BaseEstimator):
            if not isinstance(val, (Pipeline, FeatureUnion)):
                yield key, val