Python source code examples: sklearn.base.TransformerMixin()
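The examples below are collected from open-source projects that use sklearn.base.TransformerMixin. As background before the collected examples: inheriting from TransformerMixin gives a class a default fit_transform, implemented as fit(X, y).transform(X), so a custom transformer only has to define fit and transform. A minimal sketch (not from any of the projects below):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MeanCenterer(BaseEstimator, TransformerMixin):
    """Toy transformer: subtracts the per-column mean learned in fit."""
    def fit(self, X, y=None):
        self.mean_ = np.asarray(X).mean(axis=0)  # state learned from the data
        return self  # fit must return self so fit_transform can chain

    def transform(self, X):
        return np.asarray(X) - self.mean_

X = np.array([[1.0, 2.0], [3.0, 4.0]])
print(MeanCenterer().fit_transform(X))  # fit_transform comes free from TransformerMixin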
Example 1
def create_pandas_only_svm_classifier(X, y, probability=True):
    class PandasOnlyEstimator(TransformerMixin):
        def fit(self, X, y=None, **fitparams):
            return self

        def transform(self, X, **transformparams):
            dataset_is_df = isinstance(X, pd.DataFrame)
            if not dataset_is_df:
                raise Exception("Dataset must be a pandas dataframe!")
            return X

    pandas_only = PandasOnlyEstimator()
    clf = svm.SVC(gamma=0.001, C=100.0, probability=probability, random_state=777)
    pipeline = Pipeline([("pandas_only", pandas_only), ("clf", clf)])
    return pipeline.fit(X, y)
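A hedged usage sketch for the function above, assuming the pandas/scikit-learn imports the snippet relies on (pd, svm, Pipeline, TransformerMixin) are in scope; the load_iris dataset and variable names here are illustrative, not from the source project. The guard step lets DataFrames through and rejects anything else, including the underlying numpy array:

import pandas as pd
from sklearn.datasets import load_iris

data = load_iris()
X_df = pd.DataFrame(data.data, columns=data.feature_names)
model = create_pandas_only_svm_classifier(X_df, data.target)
model.predict(X_df.iloc[:3])   # fine: the input is a pandas DataFrame
# model.predict(X_df.values)   # would raise: "Dataset must be a pandas dataframe!"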
Example 2
def test_template_1():
    """Assert that TPOT template option generates pipeline when each step is a type of operator."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        verbosity=0,
        template='Selector-Transformer-Classifier'
    )
    tpot_obj._fit_init()
    pop = tpot_obj._toolbox.population(n=10)
    for deap_pipeline in pop:
        operator_count = tpot_obj._operator_count(deap_pipeline)
        sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
        assert operator_count == 3
        assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin)
        assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin)
Example 3
def test_template_2():
    """Assert that TPOT template option generates pipeline when each step is operator type with a duplicate main type."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        verbosity=0,
        template='Selector-Selector-Transformer-Classifier'
    )
    tpot_obj._fit_init()
    pop = tpot_obj._toolbox.population(n=10)
    for deap_pipeline in pop:
        operator_count = tpot_obj._operator_count(deap_pipeline)
        sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
        assert operator_count == 4
        assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[1][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[2][1].__class__, TransformerMixin)
        assert issubclass(sklearn_pipeline.steps[3][1].__class__, ClassifierMixin)
Example 4
def test_template_3():
    """Assert that TPOT template option generates pipeline when one of steps is a specific operator."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        verbosity=0,
        template='SelectPercentile-Transformer-Classifier'
    )
    tpot_obj._fit_init()
    pop = tpot_obj._toolbox.population(n=10)
    for deap_pipeline in pop:
        operator_count = tpot_obj._operator_count(deap_pipeline)
        sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
        assert operator_count == 3
        assert sklearn_pipeline.steps[0][0] == 'SelectPercentile'.lower()
        assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
        assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin)
        assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin)
Example 5
def test_template_4():
    """Assert that TPOT template option generates pipeline when one of steps is a specific operator."""
    tpot_obj = TPOTClassifier(
        population_size=5,
        generations=2,
        random_state=42,
        verbosity=0,
        config_dict='TPOT light',
        template='SelectPercentile-Transformer-Classifier'
    )
    tpot_obj.fit(pretest_X, pretest_y)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None

    sklearn_pipeline = tpot_obj.fitted_pipeline_
    operator_count = tpot_obj._operator_count(tpot_obj._optimized_pipeline)
    assert operator_count == 3
    assert sklearn_pipeline.steps[0][0] == 'SelectPercentile'.lower()
    assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin)
    assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin)
    assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin)
Example 6
def track_selected_features(pipeline_stages, num_features):
    """
    Args:
        pipeline_stages (list[tuple[str, TransformerMixin]]): list of steps; each step is a
            tuple of name and transformer object.
        num_features (int): number of features in the original (pre-selection) dataset.

    Returns:
        np.ndarray: indices of the original features that survive every selection stage.
    """
    selected_features = np.arange(num_features)
    for p_name, p in pipeline_stages:
        if not isinstance(p, BaseFeatureSelector):
            continue
        p_features = p.selected_features
        selected_features = selected_features[p_features]
    return selected_features
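The function composes selector stages by repeated fancy indexing: each stage's selected_features indexes into whatever survived the previous stage, so the result stays in original-column coordinates. A minimal numpy sketch of that chaining (the stage index arrays are hypothetical stand-ins for BaseFeatureSelector.selected_features):

import numpy as np

selected = np.arange(5)        # original features 0..4
stage1 = np.array([0, 2, 4])   # first selector keeps its columns 0, 2, 4
selected = selected[stage1]    # -> [0, 2, 4] in original coordinates
stage2 = np.array([1, 2])      # second selector keeps its columns 1 and 2
selected = selected[stage2]
print(selected)                # [2 4]: original indices surviving both stages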
Example 7
def metric_wrapper(metric, scaler: Optional[TransformerMixin] = None):
    """
    Ensures that a given metric works properly when the model itself returns
    a y which is shorter than the target y, and allows scaling the data
    before applying the metric.

    Parameters
    ----------
    metric
        Metric which must accept y_true and y_pred of the same length.
    scaler : Optional[TransformerMixin]
        Transformer which will be applied to y and y_pred before the metric is
        calculated. Must have a `transform` method, so for most scalers it must
        already be fitted on `y`.
    """
    @functools.wraps(metric)
    def _wrapper(y_true, y_pred, *args, **kwargs):
        if scaler:
            logger.debug(
                "Transformer provided to metrics wrapper, scaling y and y_pred before "
                "passing to metrics"
            )
            y_true = scaler.transform(y_true)
            y_pred = scaler.transform(y_pred)
        return metric(y_true[-len(y_pred):], y_pred, *args, **kwargs)
    return _wrapper
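A hedged usage sketch, assuming metric_wrapper (together with the functools and logger names it relies on) is importable from its module and that the scaler was fitted on y beforehand, as the docstring requires:

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

y_true = np.arange(10, dtype=float).reshape(-1, 1)
y_pred = y_true[3:] * 1.1                # model output shorter than the target

scaler = MinMaxScaler().fit(y_true)      # fitted on y before wrapping
wrapped_mse = metric_wrapper(mean_squared_error, scaler=scaler)
print(wrapped_mse(y_true, y_pred))       # both arrays scaled; y_true truncated to len(y_pred)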
Example 8
def __init__(
    self,
    base_estimator: BaseEstimator = KerasAutoEncoder(kind="feedforward_hourglass"),
    scaler: TransformerMixin = RobustScaler(),
    require_thresholds: bool = True,
    window=None,
):
    """
    Classifier which wraps a ``base_estimator`` and provides a diff-error-based
    approach to anomaly detection.

    It trains a ``scaler`` on the target **after** training, purely for
    error calculations. The underlying ``base_estimator`` is trained
    with the original, unscaled, ``y``.

    Parameters
    ----------
    base_estimator: sklearn.base.BaseEstimator
        The model whose normal ``.fit`` and ``.predict`` methods will be used;
        defaults to :py:class:`gordo.machine.model.models.KerasAutoEncoder` with
        ``kind='feedforward_hourglass'``.
    scaler: sklearn.base.TransformerMixin
        Defaults to ``sklearn.preprocessing.RobustScaler``.
        Used for transforming model output and the original ``y`` to calculate
        the difference/error between model output and expected output.
    require_thresholds: bool
        Requires calculating ``thresholds_`` via a call to
        :func:`~DiffBasedAnomalyDetector.cross_validate`. If this is set
        (default True) but :func:`~DiffBasedAnomalyDetector.cross_validate`
        was not called before calling :func:`~DiffBasedAnomalyDetector.anomaly`,
        an ``AttributeError`` will be raised.
    window: int
        Window size for smoothed thresholds.
    """
    self.base_estimator = base_estimator
    self.scaler = scaler
    self.require_thresholds = require_thresholds
    self.window = window
Example 9
def test_clone_pandas_dataframe():
    class DummyEstimator(BaseEstimator, TransformerMixin):
        """This is a dummy class for generating numerical features.

        This feature extractor extracts numerical features from a pandas
        data frame.

        Parameters
        ----------
        df: pandas data frame
            The pandas data frame parameter.

        Notes
        -----
        """
        def __init__(self, df=None, scalar_param=1):
            self.df = df
            self.scalar_param = scalar_param

        def fit(self, X, y=None):
            pass

        def transform(self, X):
            pass

    # build and clone estimator
    d = np.arange(10)
    df = MockDataFrame(d)
    e = DummyEstimator(df, scalar_param=1)
    cloned_e = clone(e)

    # the test
    assert (e.df == cloned_e.df).values.all()
    assert_equal(e.scalar_param, cloned_e.scalar_param)
Example 10
def vectorizer_factory(self) -> TransformerMixin:
    raise NotImplementedError
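This vectorizer_factory is an abstract hook: subclasses return a concrete transformer, in the same style as the transformer_factory overrides in the next examples. A hypothetical override (TfidfVectorizer and max_features are this sketch's choice, not the source project's):

from sklearn.feature_extraction.text import TfidfVectorizer

def vectorizer_factory(self) -> TransformerMixin:
    # Hypothetical: self.width is assumed to cap the vocabulary size here.
    return TfidfVectorizer(max_features=self.width)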
Example 11
def transformer_factory(self) -> TransformerMixin:
    return LatentDirichletAllocation(n_components=self.width, learning_method='online', random_state=71)
Example 12
def transformer_factory(self) -> TransformerMixin:
    return TruncatedSVD(n_components=self.width, random_state=71)
Example 13
def transformer_factory(self) -> TransformerMixin:
    return NMF(n_components=self.width, random_state=71)
Example 14
def _generate_bases_test(est, pd_est):
    def test(self):
        self.assertTrue(isinstance(pd_est, FrameMixin), pd_est)
        self.assertFalse(isinstance(est, FrameMixin))
        self.assertTrue(isinstance(pd_est, base.BaseEstimator))
        try:
            mixins = [
                base.ClassifierMixin,
                base.ClusterMixin,
                base.BiclusterMixin,
                base.TransformerMixin,
                base.DensityMixin,
                base.MetaEstimatorMixin,
                base.ClassifierMixin,
                base.RegressorMixin]
        except AttributeError:  # base.DensityMixin is missing in older sklearn versions
            if _sklearn_ver > 17:
                raise
            mixins = [
                base.ClassifierMixin,
                base.ClusterMixin,
                base.BiclusterMixin,
                base.TransformerMixin,
                base.MetaEstimatorMixin,
                base.ClassifierMixin,
                base.RegressorMixin]
        for mixin in mixins:
            self.assertEqual(
                isinstance(pd_est, mixin),
                isinstance(est, mixin),
                mixin)
    return test
Example 15
def test_get_params_without_init(self, teardown):
    """Test edge case where the base class does not define
    an __init__ method. get_params should resolve to object.__init__,
    which results in an empty dict.
    """
    class TransformerWithoutInit(TransformerMixin, BaseEstimator):
        pass

    class TransformerWithoutInitStep(Step, TransformerWithoutInit):
        pass

    step = TransformerWithoutInitStep()
    assert step.get_params() == {}
Example 16
def test_basic():
    bags = [np.random.normal(5, 3, size=(np.random.randint(10, 100), 20))
            for _ in range(50)]
    feats = Features(bags, stack=True)

    stder = BagStandardizer()
    stdized = stder.fit_transform(bags)
    stdized.make_stacked()
    assert np.allclose(np.mean(stdized.stacked_features), 0)
    assert np.allclose(np.std(stdized.stacked_features), 1)

    first_five = stder.transform(bags[:5])
    assert first_five == stdized[:5]

    minmaxer = BagMinMaxScaler([3, 7])
    minmaxed = minmaxer.fit_transform(feats)
    minmaxed.make_stacked()
    assert np.allclose(np.min(minmaxed.stacked_features, 0), 3)
    assert np.allclose(np.max(minmaxed.stacked_features, 0), 7)

    normer = BagNormalizer('l1')
    normed = normer.fit_transform(Features(bags))
    normed.make_stacked()
    assert np.allclose(np.sum(np.abs(normed.stacked_features), 1), 1)

    class GetMean(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X.mean(axis=1)[None, :]

    m = BagPreprocesser(GetMean())
    assert_raises(ValueError, lambda: m.transform(bags))
Example 17
def transform(self, X):
    """Inherited from ``TransformerMixin``. Pass the ``X`` array
    through the inferential MLP layers.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The array of samples that will be encoded into the new
        hidden layer space.
    """
    return self.encode(X)
Example 18
def fit_transform(self, **kwargs):
    """
    Decorated with @entry_wrapper(); with the default arguments it supports both
    supervised and unsupervised learning. Internally it checks
    isinstance(fiter, TransformerMixin) or hasattr(fiter, 'fit_transform')
    to decide whether fit_transform is applicable.

    eg:
        input:  ttn_abu.x.shape
        output: (891, 14)

        input:  ttn_abu.fit_transform(fiter_type=ml.EMLFitType.E_FIT_PCA).shape
        output: (891, 4)

        input:  ttn_abu.fit_transform(fiter_type=ml.EMLFitType.E_FIT_KMEAN).shape
        output: (891, 2)

    :param kwargs: callers may pass x and y, retrieved via
                       x = kwargs.pop('x', self.x)
                       y = kwargs.pop('y', self.y)
                   as well as the fiter_type used by the decorator,
                   eg: ttn_abu.fit_transform(fiter_type=ml.EMLFitType.E_FIT_CLF)
    :return: the transformed result matrix produced by fit_transform
    """
    fiter = self.get_fiter()
    if isinstance(fiter, TransformerMixin) or hasattr(fiter, 'fit_transform'):
        x = kwargs.pop('x', self.x)
        y = kwargs.pop('y', self.y)
        if self.is_supervised_learning():
            trans = fiter.fit_transform(x, y)
        else:
            trans = fiter.fit_transform(x)
        return trans
    else:
        self.log_func('{} does not support fit_transform'.format(fiter))
Example 19
def split(self, X, y, groups):
    n_groups = self.get_n_splits(groups=groups)
    # print('n_groups', n_groups)
    lpgo = ms.LeavePGroupsOut(n_groups=n_groups - 1)
    return lpgo.split(X, y, groups)

# class WithoutElement(BaseEstimator, TransformerMixin):
#     "Train the model without each element, then test on the rows with that element"
#     pass
Example 20
def add_normalization_strategy(self, name, normalization_type, is_default_normalization_strategy=False):
    """Add a normalization strategy.
    Will be called with {pipeline_config, X, Y}

    Arguments:
        name {string} -- name of the normalization strategy, used to refer to it in the config
        normalization_strategy {function} -- callable with {pipeline_config, X}
        is_default_normalization_strategy {bool} -- whether the given normalization strategy should be the default if none is specified in the config
    """
    if not issubclass(normalization_type, BaseEstimator) and not issubclass(normalization_type, TransformerMixin):
        raise ValueError("normalization_type must be a subclass of BaseEstimator or TransformerMixin")
    self.normalization_strategies[name] = normalization_type
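A hedged usage sketch; the selector object here is hypothetical, standing in for whatever the source project instantiates around this method. Note that classes, not instances, are registered, since the guard uses issubclass:

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Both classes derive from BaseEstimator and TransformerMixin, so they pass the check.
selector.add_normalization_strategy("minmax", MinMaxScaler)
selector.add_normalization_strategy("standardize", StandardScaler, is_default_normalization_strategy=True)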
Example 21
def enumerate_pipeline_models(pipe, coor=None, vs=None):
    """
    Enumerates all the models within a pipeline.
    """
    if coor is None:
        coor = (0,)
    yield coor, pipe, vs
    if hasattr(pipe, 'transformer_and_mapper_list') and len(
            pipe.transformer_and_mapper_list):
        # azureml DataTransformer
        raise NotImplementedError("Unable to handle this specific case.")
    elif hasattr(pipe, 'mapper') and pipe.mapper:
        # azureml DataTransformer
        for couple in enumerate_pipeline_models(pipe.mapper, coor + (0,)):
            yield couple
    elif hasattr(pipe, 'built_features'):
        # sklearn_pandas.dataframe_mapper.DataFrameMapper
        for i, (columns, transformers, _) in enumerate(pipe.built_features):
            if isinstance(columns, str):
                columns = (columns,)
            if transformers is None:
                yield (coor + (i,)), None, columns
            else:
                for couple in enumerate_pipeline_models(transformers,
                                                        coor + (i,),
                                                        columns):
                    yield couple
    elif isinstance(pipe, Pipeline):
        for i, (_, model) in enumerate(pipe.steps):
            for couple in enumerate_pipeline_models(model, coor + (i,)):
                yield couple
    elif ColumnTransformer is not None and isinstance(pipe, ColumnTransformer):
        for i, (_, fitted_transformer, column) in enumerate(pipe.transformers):
            for couple in enumerate_pipeline_models(
                    fitted_transformer, coor + (i,), column):
                yield couple
    elif isinstance(pipe, FeatureUnion):
        for i, (_, model) in enumerate(pipe.transformer_list):
            for couple in enumerate_pipeline_models(model, coor + (i,)):
                yield couple
    elif TransformedTargetRegressor is not None and isinstance(
            pipe, TransformedTargetRegressor):
        raise NotImplementedError(
            "Not yet implemented for TransformedTargetRegressor.")
    elif isinstance(pipe, (TransformerMixin, ClassifierMixin, RegressorMixin)):
        pass
    elif isinstance(pipe, BaseEstimator):
        pass
    else:
        raise TypeError(
            "Parameter pipe is not a scikit-learn object: {}\n{}".format(
                type(pipe), pipe))
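A usage sketch, assuming enumerate_pipeline_models is in scope: walking a two-step pipeline yields a coordinate tuple for every nesting level.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()), ('clf', LogisticRegression())])
for coor, model, vs in enumerate_pipeline_models(pipe):
    print(coor, type(model).__name__)
# (0,) Pipeline
# (0, 0) StandardScaler
# (0, 1) LogisticRegression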
Example 22
def test_sub(self):
    class CustomOpTransformer(BaseEstimator, TransformerMixin):
        def __init__(self, op_version=None):
            self.op_version = op_version

        def fit(self, X, y=None):
            self.W = np.mean(X, axis=0)
            return self

        def transform(self, X):
            return X - self.W

    mat = np.array([[0., 1.], [1., 2.], [3., 4.]])
    tr = CustomOpTransformer(op_version=None)
    tr.fit(mat)
    z = tr.transform(mat)

    def conv(scope, operator, container):
        W = operator.raw_operator.W.astype(container.dtype)
        op = OnnxSub(
            operator.inputs[0], W, output_names=operator.outputs,
            op_version=TARGET_OPSET)
        op.add_to(scope, container)
        text = str(container)
        if 'name:"Su_Sub"' not in text:
            raise AssertionError(
                "Unnamed operator: '{}'".format(text))
        nin = list(op.enumerate_initial_types())
        nno = list(op.enumerate_nodes())
        nva = list(op.enumerate_variables())
        assert len(nin) == 1
        assert nin[0][0] == 'input'
        assert nin[0][1].shape == [None, 2]
        assert len(nno) == 1
        assert nno[0].output_names == ['variable']
        assert len(nva) == 1
        assert isinstance(nva[0], tuple)
        assert nva[0][1] == 0

    def shape(operator):
        N = operator.inputs[0].type.shape[0]
        W = operator.raw_operator.W
        operator.outputs[0].type.shape = [N, W.shape[0]]

    model_onnx = convert_sklearn(
        tr, 'a-sub', [('input', FloatTensorType([None, 2]))],
        custom_shape_calculators={CustomOpTransformer: shape},
        custom_conversion_functions={CustomOpTransformer: conv})
    sess = InferenceSession(model_onnx.SerializeToString())
    z2 = sess.run(None, {'input': mat.astype(np.float32)})[0]
    assert_almost_equal(z, z2)
Example 23
def test_sub_div(self):
    class CustomOpTransformer(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass

        def fit(self, X, y=None):
            self.W = np.mean(X, axis=0)
            self.S = np.std(X, axis=0)
            return self

        def transform(self, X):
            return (X - self.W) / self.S

    mat = np.array([[0., 1.], [0., 1.], [2., 2.]])
    tr = CustomOpTransformer()
    tr.fit(mat)
    z = tr.transform(mat)

    def conv(scope, operator, container):
        W = operator.raw_operator.W.astype(np.float32)
        S = operator.raw_operator.S.astype(np.float32)
        X = operator.inputs[0]
        out = operator.outputs
        op = OnnxDiv(
            OnnxSub(X, W, op_version=container.target_opset),
            S, output_names=out,
            op_version=container.target_opset)
        op.add_to(scope, container)

    def shape(operator):
        N = operator.inputs[0].type.shape[0]
        W = operator.raw_operator.W
        operator.outputs[0].type.shape = [N, W.shape[0]]

    model_onnx = convert_sklearn(
        tr, 'a-sub-div', [('input', FloatTensorType([None, 2]))],
        custom_shape_calculators={CustomOpTransformer: shape},
        custom_conversion_functions={CustomOpTransformer: conv},
        target_opset=None)
    try:
        sess = InferenceSession(model_onnx.SerializeToString())
    except RuntimeError as e:
        raise AssertionError(
            "Cannot load model\n---\n{}\n---".format(model_onnx)) from e
    z2 = sess.run(None, {'input': mat.astype(np.float32)})[0]
    assert_almost_equal(z, z2)
Example 24
def verify_explain_model_categorical(self, pass_categoricals=False):
    headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
               "num_doors", "body_style", "drive_wheels", "engine_location",
               "wheel_base", "length", "width", "height", "curb_weight",
               "engine_type", "num_cylinders", "engine_size", "fuel_system",
               "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
               "city_mpg", "highway_mpg", "price"]
    df = retrieve_dataset('imports-85.csv', header=None, names=headers, na_values="?")
    df_y = df['price']
    df_X = df.drop(columns='price')
    df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(df_X, df_y, test_size=0.2, random_state=7)
    # Encode strings to ordinal values
    categorical_col_names = list(df_train_X.select_dtypes(include='object').columns)
    categorical_col_indices = [df_train_X.columns.get_loc(col_name) for col_name in categorical_col_names]
    kwargs = {'num_leaves': 31, 'num_trees': 100, 'objective': 'regression',
              'categorical_feature': categorical_col_indices}
    lgbm_regressor = LGBMRegressor(**kwargs)
    # Impute the x and y values
    imp_X = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp_y = SimpleImputer(missing_values=np.nan, strategy='mean')
    # reshape to a 2D array since SimpleImputer can't work on a 1D array
    df_train_y = df_train_y.values.reshape(df_train_y.shape[0], 1)
    imp_y.fit(df_train_y)
    imp_df_y = imp_y.transform(df_train_y)
    imp_X.fit(df_train_X)
    imp_train_X = pd.DataFrame(imp_X.transform(df_train_X))

    class CustomTextTransformer(BaseEstimator, TransformerMixin):
        def __init__(self):
            return

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X.astype('U')

    custom_text = CustomTextTransformer()
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    ct1 = ColumnTransformer([('cu', custom_text, categorical_col_indices)], remainder='passthrough')
    ct2 = ColumnTransformer([('ord', encoder, slice(0, len(categorical_col_indices)))], remainder='passthrough')
    pipeline = Pipeline([('cu', ct1), ('ct', ct2), ('lgbm', lgbm_regressor)])
    pipeline.fit(imp_train_X, imp_df_y[:, 0])
    if pass_categoricals:
        explainer = self.create_explainer(pipeline, imp_train_X, categorical_features=categorical_col_indices)
    else:
        explainer = self.create_explainer(pipeline, imp_train_X)
    explanation = explainer.explain_global(imp_X.transform(df_test_X))
    verify_serialization(explanation, exist_ok=True)