Python源码示例:sklearn.base.BaseEstimator()
示例1
def test_get_metadata_helper(model: BaseEstimator, expect_empty_dict: bool):
    """
    Fit ``model`` on random data and check the metadata the builder extracts.

    Parameters
    ----------
    model: BaseEstimator
        Model that is fitted on a random (1000, 4) input and (1000,) target.
    expect_empty_dict: bool
        When True the extracted metadata must equal an empty dict. Otherwise
        it must contain a "history" entry holding "params", "loss" and
        "accuracy".
    """
    features = np.random.random((1000, 4))
    targets = np.random.random((1000,))
    model.fit(features, targets)
    extracted = ModelBuilder._extract_metadata_from_model(model)

    if expect_empty_dict:
        assert dict() == extracted
        return

    # 'history' is the only metadata section this test inspects.
    assert "history" in extracted
    history = extracted["history"]
    for name in ("params", "loss", "accuracy"):
        assert name in history
示例2
def load_model(directory: str, name: str) -> BaseEstimator:
    """
    Load the model stored under ``directory/name`` and log how long it took.

    Parameters
    ----------
    directory: str
        Directory that contains the model sub directory.
    name: str
        Name of the model; joined onto ``directory`` to form the path handed
        to ``serializer.load``.

    Returns
    -------
    BaseEstimator
        Whatever ``serializer.load`` returns for that path.
    """
    started = timeit.default_timer()
    model_path = os.path.join(directory, name)
    loaded = serializer.load(model_path)
    elapsed = timeit.default_timer() - started
    logger.debug(f"Time to load model: {elapsed}s")
    return loaded
示例3
def _determine_offset(
model: BaseEstimator, X: Union[np.ndarray, pd.DataFrame]
) -> int:
"""
Determine the model's offset. How much does the output of the model differ
from its input?
Parameters
----------
model: sklearn.base.BaseEstimator
Trained model with either ``predict`` or ``transform`` method, preference
given to ``predict``.
X: Union[np.ndarray, pd.DataFrame]
Data to pass to the model's ``predict`` or ``transform`` method.
Returns
-------
int
The difference between X and the model's output lengths.
"""
out = model.predict(X) if hasattr(model, "predict") else model.transform(X)
return len(X) - len(out)
示例4
def _dispatch_models(algorithm_type: Union[str, Type[BaseEstimator]],
                     target_type: str, custom_eval: Optional[Callable] = None):
    """
    Resolve the model class, evaluation function and categorical-feature key.

    Args:
        algorithm_type: Either a string key ('lgbm', 'cat' or 'xgb' are the
            keys of the lookup table below) or a ``BaseEstimator`` subclass.
        target_type: Forwarded to the ``_dispatch_*`` helpers.
        custom_eval: Optional callable forwarded to ``_dispatch_eval_func``.

    Returns:
        A 3-tuple ``(model_class, eval_func, cat_key)``. ``cat_key`` is the
        table entry for a string ``algorithm_type`` and ``None`` when a class
        was passed in (presumably the keyword name used to hand categorical
        features to the model; confirm against the caller).
    """
    if isinstance(algorithm_type, str):
        model_class = _dispatch_gbdt_class(algorithm_type, target_type)
        evaluator = _dispatch_eval_func(target_type, custom_eval)
        cat_keys = {
            'lgbm': 'categorical_feature',
            'cat': 'cat_features',
            'xgb': None,
        }
        return model_class, evaluator, cat_keys[algorithm_type]

    # Anything that is not a string must be an estimator class.
    assert issubclass(algorithm_type, BaseEstimator), "algorithm_type should be str or subclass of BaseEstimator"
    return algorithm_type, _dispatch_eval_func(target_type, custom_eval), None
示例5
def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor must accept a base estimator without ``sample_weight``.

    The dummy estimator's ``fit`` takes only ``X`` and ``y``. According to the
    original note, the weighted sampling is done internally in
    AdaBoostRegressor's ``_boost`` method.
    """
    class DummyEstimator(BaseEstimator):
        def fit(self, X, y):
            return None

        def predict(self, X):
            n_samples = X.shape[0]
            return np.zeros(n_samples)

    regressor = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    regressor.fit(X, y_regr)

    n_weights = len(regressor.estimator_weights_)
    n_errors = len(regressor.estimator_errors_)
    assert_equal(n_weights, n_errors)
示例6
def test_calibration_accepts_ndarray(X):
    """Check that CalibratedClassifierCV can be fitted on n-dimensional input."""
    labels = [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0]

    class MockTensorClassifier(BaseEstimator):
        """A toy estimator that accepts tensor inputs"""

        def fit(self, X, y):
            self.classes_ = np.unique(y)
            return self

        def decision_function(self, X):
            # Only the output shape matters: flatten each sample, then sum it.
            flattened = X.reshape(X.shape[0], -1)
            return flattened.sum(axis=1)

    # Fitting must complete without raising.
    CalibratedClassifierCV(MockTensorClassifier()).fit(X, labels)
示例7
def persist_estimator(estimator: BaseEstimator) -> Path:
    """
    Save the given estimator to a gobbli-managed filepath, from which the
    SKLearnClassifier can load it. Useful when you want to use an estimator
    without handling the on-disk persistence yourself.

    Args:
      estimator: The estimator to save.

    Returns:
      The path where the estimator was saved.
    """
    user_estimators_root = SKLearnClassifier.model_class_dir() / "user_estimators"
    # Each persisted estimator gets its own freshly generated directory.
    target_dir = user_estimators_root / generate_uuid()
    target_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_path = target_dir / SKLearnClassifier._TRAIN_OUTPUT_CHECKPOINT
    SKLearnClassifier._dump_estimator(estimator, checkpoint_path)
    return checkpoint_path
示例8
def _validate_estimator(estimator: BaseEstimator):
    """
    Check that the given object is an estimator usable for our purposes.

    Raises:
      ValueError: if ``is_classifier(estimator)`` is false, or if the estimator
        has no ``predict_proba`` attribute.
    """
    # sklearn.utils.estimator_checks.check_estimator performs a far more
    # thorough interface check, but as of version 0.22 it does not suit this
    # use case: it doesn't properly detect Pipeline X_types from the first
    # pipeline component and won't test anything that doesn't accept a 2-D
    # numpy array. Lax checks are used until sklearn offers something better.
    problem = None
    if not is_classifier(estimator):
        problem = (
            "Estimator must be a classifier according to sklearn.base.is_classifier()"
        )
    elif not hasattr(estimator, "predict_proba"):
        problem = (
            "Estimator must support the predict_proba() method to fulfill gobbli's "
            "interface requirements for a prediction model."
        )

    if problem is not None:
        raise ValueError(problem)
示例9
def print_help(model, defaults=None):
    """Print help for the command line arguments of the given model.

    Parameters
    ----------
    model : sklearn.base.BaseEstimator
      The basic model, e.g. a ``NeuralNet`` or sklearn ``Pipeline``.

    defaults : dict or None (default=None)
      Optionally, change the default values to use custom
      defaults. Commandline arguments have precedence over defaults.

    """
    if not defaults:
        defaults = {}

    header = (
        "This is the help for the model-specific parameters.",
        "To invoke help for the remaining options, run:",
        "python {} -- --help".format(sys.argv[0]),
        "",
    )
    for line in header:
        print(line)

    # One iterable of help lines per estimator yielded for the model.
    per_estimator = [
        _get_help_for_estimator(prefix, estimator, defaults=defaults)
        for prefix, estimator in _yield_estimators(model)
    ]
    print('\n'.join(chain.from_iterable(per_estimator)))
示例10
def test_loader(sklearn_model, project_manager):
    """Store a model twice and check both load paths return a BaseEstimator."""
    first = SklearnModel(artifact=sklearn_model)
    first.store(name='clf')
    assert isinstance(first.load(name='clf'), BaseEstimator)

    second = SklearnModel(artifact=sklearn_model)
    second.store(name='clf')
    assert isinstance(first.load(run_number=1, name='clf'), BaseEstimator)

    # Clean up: empty the saved-models directory and recreate its .gitkeep.
    saved_models_dir = project_manager.CONFIG['saved-models']
    for current_dir, subdirs, filenames in os.walk(saved_models_dir):
        for filename in filenames:
            os.unlink(os.path.join(current_dir, filename))
        for subdir in subdirs:
            shutil.rmtree(os.path.join(current_dir, subdir))
    gitkeep_path = os.path.join(saved_models_dir, '.gitkeep')
    with open(gitkeep_path, 'w') as gitkeep:
        gitkeep.write('empty')
示例11
def test_trainable_model_from_file(sklearn_model, project_manager):
    """Store a model, then check TrainableModel.from_file wraps a BaseEstimator."""
    stored = SklearnModel(artifact=sklearn_model)
    stored.store(name='clf')

    restored = TrainableModel.from_file(run_number=1, name='clf', model_type='sklearn')
    assert isinstance(restored.model, BaseEstimator)

    # Clean up: empty the saved-models directory and recreate its .gitkeep.
    saved_models_dir = project_manager.CONFIG['saved-models']
    for current_dir, subdirs, filenames in os.walk(saved_models_dir):
        for filename in filenames:
            os.unlink(os.path.join(current_dir, filename))
        for subdir in subdirs:
            shutil.rmtree(os.path.join(current_dir, subdir))
    gitkeep_path = os.path.join(saved_models_dir, '.gitkeep')
    with open(gitkeep_path, 'w') as gitkeep:
        gitkeep.write('empty')
示例12
def __init__(self, classifier=None, predictors="all"):
    """Create an instance of the MissingnessClassifier.

    The MissingnessClassifier inherits from sklearn BaseEstimator and
    ClassifierMixin. This inheritance and this class' implementation
    ensure that the MissingnessClassifier is a valid classifier that will
    work in an sklearn pipeline.

    The constructor only stores both arguments unchanged; any defaulting
    described below is not applied here.

    Args:
        classifier (classifier, optional): valid classifier from sklearn.
            If None, the documented default is xgboost. The classifier
            must conform to sklearn style, i.e. it must implement the
            `predict_proba` method and act as a proper classifier.
        predictors (str, iter, dict, optional): defaults to all, i.e.
            use all predictors. If all, every column will be used for
            every class prediction. If a list, subset of columns used for
            all predictions. If a dict, specify which columns to use as
            predictors for each imputation. Columns not specified in dict
            will receive `all` by default.
    """
    self.predictors = predictors
    self.classifier = classifier
示例13
def make_query_strategy(utility_measure: Callable, selector: Callable) -> Callable:
    """
    Build a query strategy by composing a utility measure with a selector.

    Args:
        utility_measure: Utility measure, for instance :func:`~modAL.disagreement.vote_entropy`, or a custom
            function. It is called with a classifier and the unlabelled data and should return an array of
            utility scores.
        selector: Function choosing the instances to query. It is called with the array of utility scores and
            should return the indices of the queried items.

    Returns:
        A function that, given a classifier and an unlabelled pool, returns the selected indices together with
        the corresponding entries of the pool.
    """
    def query_strategy(classifier: BaseEstimator, X: modALinput) -> Tuple:
        # Score the pool, let the selector pick, then index the pool with it.
        chosen = selector(utility_measure(classifier, X))
        return chosen, X[chosen]

    return query_strategy
示例14
def check_class_labels(*args: BaseEstimator) -> bool:
    """
    Check whether all given classifiers know the same class labels.

    Args:
        *args: Classifier objects whose ``classes_`` attributes are compared.

    Returns:
        True if the class labels of every pair of neighbouring classifiers are equal (also when fewer than two
        classifiers are given), False otherwise.

    Raises:
        NotFittedError: if any classifier has no ``classes_`` attribute.
    """
    try:
        known_labels = [estimator.classes_ for estimator in args]
    except AttributeError:
        raise NotFittedError('Not all estimators are fitted. Fit all estimators before using this method.')

    # Compare each classifier's labels with those of the next one.
    return all(
        np.array_equal(current, following)
        for current, following in zip(known_labels, known_labels[1:])
    )
示例15
def __init__(self,
             estimator: BaseEstimator,
             query_strategy: Callable,
             X_training: Optional[modALinput] = None,
             y_training: Optional[modALinput] = None,
             bootstrap_init: bool = False,
             force_all_finite: bool = True,
             **fit_kwargs
             ) -> None:
    """
    Store the learner's configuration and optionally fit on initial data.

    Args:
        estimator: The estimator to be used by the learner.
        query_strategy: Callable used to select instances to query.
        X_training: Initial training samples, if any.
        y_training: Initial training labels, if any.
        bootstrap_init: Forwarded as ``bootstrap`` to ``_fit_to_known`` when initial training data is given.
        force_all_finite: Must be a bool; stored on the instance unchanged.
        **fit_kwargs: Forwarded to ``_fit_to_known`` when initial training data is given.

    Raises:
        AssertionError: if ``query_strategy`` is not callable or ``force_all_finite`` is not a bool.
    """
    # Validate every argument before any fitting happens, so an invalid
    # ``force_all_finite`` is reported before the initial fit is performed.
    assert callable(query_strategy), 'query_strategy must be callable'
    assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'

    self.estimator = estimator
    self.query_strategy = query_strategy
    # Set before the initial fit so the attribute already exists while
    # ``_fit_to_known`` runs.
    self.force_all_finite = force_all_finite

    self.X_training = X_training
    self.y_training = y_training
    if X_training is not None:
        self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
示例16
def classifier_uncertainty(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Classification uncertainty of the classifier for the provided samples.

    Args:
        classifier: The classifier for which the uncertainty is to be measured.
        X: The samples for which the uncertainty of classification is to be measured.
        **predict_proba_kwargs: Keyword arguments passed on to the classifier's :meth:`predict_proba`.

    Returns:
        One value per sample: 1 minus the largest class probability. If the classifier raises
        ``NotFittedError``, an array of ones with one entry per row of ``X``.
    """
    try:
        class_probabilities = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        # An unfitted classifier is treated as fully uncertain about every sample.
        return np.ones(shape=(X.shape[0], ))
    else:
        # The probability of the most likely class measures confidence;
        # its complement is the uncertainty.
        most_likely = np.max(class_probabilities, axis=1)
        return 1 - most_likely
示例17
def classifier_margin(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Classification margin of the classifier for the provided samples: the difference between the probabilities
    of the most likely and the second most likely prediction.

    Args:
        classifier: The classifier for which the prediction margin is to be measured.
        X: The samples for which the prediction margin of classification is to be measured.
        **predict_proba_kwargs: Keyword arguments passed on to the classifier's :meth:`predict_proba`.

    Returns:
        One margin per sample. Zeros are returned if the classifier raises ``NotFittedError`` or if
        ``predict_proba`` yields a single column.
    """
    try:
        class_probabilities = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    # With a single class there is no runner-up to compare against.
    if class_probabilities.shape[1] == 1:
        return np.zeros(shape=(class_probabilities.shape[0],))

    # Partitioning the negated probabilities places the two largest
    # probabilities (negated) in columns 0 and 1.
    negated = np.partition(-class_probabilities, 1, axis=1)
    top, runner_up = negated[:, 0], negated[:, 1]
    return runner_up - top
示例18
def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Entropy of the classifier's predictions for the provided samples.

    Args:
        classifier: The classifier for which the prediction entropy is to be measured.
        X: The samples for which the prediction entropy is to be measured.
        **predict_proba_kwargs: Keyword arguments passed on to the classifier's :meth:`predict_proba`.

    Returns:
        Entropy of the class probabilities, or zeros (one per row of ``X``) if the classifier raises
        ``NotFittedError``.
    """
    try:
        class_probabilities = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    # The probabilities are transposed before being handed to ``entropy``,
    # and the result is transposed back.
    classes_first = np.transpose(class_probabilities)
    per_sample = entropy(classes_first)
    return np.transpose(per_sample)
示例19
def tosklearn(self):
    """
    Wrap this step in an object exposing a scikit-learn style interface.

    Returns a ``BaseEstimator`` subclass instance that delegates
    ``set_params`` / ``get_params`` / ``fit`` / ``transform`` to this step.
    """
    class NeuraxleToSKLearnPipelineWrapper(BaseEstimator):
        def __init__(self, neuraxle_step):
            self.p: Union[BaseStep, TruncableSteps] = neuraxle_step

        def set_params(self, **params) -> BaseEstimator:
            # Hyperparameters are forwarded to the wrapped step.
            self.p.set_hyperparams(HyperparameterSpace(params))
            return self

        def get_params(self, deep=True):
            # ``deep`` is accepted for signature compatibility but not used.
            return HyperparameterSamples(self.p.get_hyperparams()).to_flat_as_dict_primitive()

        def get_params_space(self, deep=True):
            # ``deep`` is accepted for signature compatibility but not used.
            return HyperparameterSpace(self.p.get_hyperparams_space()).to_flat_as_dict_primitive()

        def fit(self, *args, **kwargs) -> BaseEstimator:
            # Positional arguments are accepted as well, so callers that use
            # ``fit(X, y)`` work; keyword-only callers are unaffected.
            self.p = self.p.fit(*args, **kwargs)
            # scikit-learn's convention is that ``fit`` returns the estimator
            # itself; the wrapper previously returned None.
            return self

        def transform(self, *args, **kwargs):
            # Returns whatever the wrapped step's ``transform`` returns.
            return self.p.transform(*args, **kwargs)

    return NeuraxleToSKLearnPipelineWrapper(self)
示例20
def make_pmml_pipeline(obj, active_fields = None, target_fields = None):
    """Translates a regular Scikit-Learn estimator or pipeline to a PMML pipeline.

    Parameters
    ----------
    obj: BaseEstimator
        The object.

    active_fields: list of strings, optional
        Feature names. If missing, "x1", "x2", .., "xn" are assumed.

    target_fields: list of strings, optional
        Label name(s). If missing, "y" is assumed.

    """
    extracted_steps = _get_steps(obj)
    pipeline = PMMLPipeline(_filter_steps(extracted_steps))

    # Only attributes whose value was supplied are set on the pipeline.
    provided = (
        ("active_fields", active_fields),
        ("target_fields", target_fields),
    )
    for attribute, fields in provided:
        if fields is not None:
            setattr(pipeline, attribute, numpy.asarray(fields))
    return pipeline
示例21
def normalize_estimator(est):
    """Normalize an estimator.

    Note: Since scikit-learn requires duck-typing, but not sub-typing from
    ``BaseEstimator``, we sometimes need to call this function directly.

    Returns a tuple of the estimator's class name, its normalized parameters
    and the values of its readable public attributes ending in ``_``.
    """
    token = [type(est).__name__, normalize_token(est.get_params())]

    # fitted attributes: https://github.com/dask/dask-ml/issues/658
    skipped = {"cv_results_", "model_history_", "history_", "refit_time_"}
    candidates = [
        name
        for name in dir(est)
        if name.endswith("_") and not name.startswith("_") and name not in skipped
    ]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        for name in candidates:
            try:
                value = getattr(est, name)
            except (sklearn.exceptions.NotFittedError, AttributeError):
                # Attributes that cannot be read are left out of the token.
                continue
            else:
                token.append(value)
    return tuple(token)
示例22
def test_class(self):
    """StandardScaler must be a subclass of sklearn's BaseEstimator."""
    from sklearn.base import BaseEstimator

    inherits_base = issubclass(StandardScaler, BaseEstimator)
    self.assertTrue(inherits_base)
示例23
def setclassifier(self, estimator=None):
    """Assign classifier for which decision boundary should be plotted.

    Parameters
    ----------
    estimator : BaseEstimator instance, optional (default=KNeighborsClassifier(n_neighbors=10)).
        Classifier for which the decision boundary should be plotted. Must have
        probability estimates enabled (i.e. estimator.predict_proba must work).
        Make sure it is possible for probability estimates to get close to 0.5
        (more specifically, as close as specified by acceptance_threshold).
        When omitted (or None), a new ``KNeighborsClassifier(n_neighbors=10)``
        is created for this call.
    """
    # A default instance in the signature would be created once and shared by
    # every call, so state stored on it would leak between objects. A fresh
    # instance is built per call instead.
    if estimator is None:
        estimator = KNeighborsClassifier(n_neighbors=10)
    self.classifier = estimator
示例24
def predict_regression(x_test, trained_estimator):
    """
    Return the regression prediction of a trained estimator for the given feature data.

    Args:
        x_test: Feature data passed to the estimator's ``predict`` method.
        trained_estimator (sklearn.base.BaseEstimator): a trained scikit-learn estimator

    Returns:
        The result of ``trained_estimator.predict(x_test)``.
    """
    # Reject anything that is not an estimator before predicting.
    validate_estimator(trained_estimator)
    return trained_estimator.predict(x_test)
示例25
def predict_classification(x_test, trained_estimator):
    """
    Return the classification prediction of a trained estimator for the given feature data.

    Args:
        x_test: Feature data passed to the estimator's ``predict_proba`` method.
        trained_estimator (sklearn.base.BaseEstimator): a trained scikit-learn estimator

    Returns:
        The second column (index 1) of the ``predict_proba`` output, squeezed.
    """
    # Reject anything that is not an estimator before predicting.
    validate_estimator(trained_estimator)
    probabilities = trained_estimator.predict_proba(x_test)
    second_column = probabilities[:, 1]
    return np.squeeze(second_column)
示例26
def validate_estimator(possible_estimator):
    """
    Raise an error unless the given object's type is a scikit-learn BaseEstimator subclass.

    Args:
        possible_estimator (object): Object of any type.

    Returns:
        True when the check passes; the return value is used only for testing.

    Raises:
        HealthcareAIError: if the object's type is not a BaseEstimator subclass.
    """
    estimator_type = type(possible_estimator)
    if issubclass(estimator_type, BaseEstimator):
        return True

    message = 'Predictions require an estimator. You passed in {}, which is of type: {}'.format(
        possible_estimator, estimator_type)
    raise HealthcareAIError(message)
示例27
def test_client_download_model(gordo_project, gordo_single_target, ml_server):
    """
    Test client's ability to download the model
    """
    downloaded = Client(project=gordo_project).download_model()
    assert isinstance(downloaded, dict)

    target_model = downloaded[gordo_single_target]
    assert isinstance(target_model, BaseEstimator)

    # Requesting a target that does not exist must raise NotFound.
    with pytest.raises(NotFound):
        Client(project=gordo_project).download_model(
            targets=["non-existent-target"]
        )
示例28
def test_client_cli_download_model(
    gordo_project, gordo_single_target, ml_server, tmpdir
):
    """
    Test proper execution of the client ``download-model`` sub-command
    """
    cli_args = [
        "client",
        "--project",
        gordo_project,
        "download-model",
        str(tmpdir),
        "--target",
        gordo_single_target,
    ]

    # The output directory starts out empty.
    assert len(os.listdir(tmpdir)) == 0

    result = CliRunner().invoke(cli.gordo, args=cli_args)
    assert (
        result.exit_code == 0
    ), f"Expected output code 0 got '{result.exit_code}', {result.output}"

    # After the download it must contain a directory named after the target.
    assert len(os.listdir(tmpdir)) > 0
    target_dir = os.path.join(tmpdir, gordo_single_target)
    assert os.path.isdir(target_dir)

    # That directory must deserialize into an estimator.
    assert isinstance(serializer.load(target_dir), BaseEstimator)
示例29
def test_determine_offset(model: BaseEstimator, expected_offset: int):
    """
    Fit ``model`` on random (100, 10) data and check the offset the builder
    determines for it against ``expected_offset``.
    """
    shape = (100, 10)
    features = np.random.random(shape)
    targets = np.random.random(shape)
    model.fit(features, targets)

    assert ModelBuilder._determine_offset(model, features) == expected_offset
示例30
def __init__(
    self,
    base_estimator: BaseEstimator = KerasAutoEncoder(kind="feedforward_hourglass"),
    scaler: TransformerMixin = RobustScaler(),
    require_thresholds: bool = True,
    window=None,
):
    """
    Classifier which wraps a ``base_estimator`` and provides a diff error
    based approach to anomaly detection.

    It trains a ``scaler`` to the target **after** training, purely for
    error calculations. The underlying ``base_estimator`` is trained
    with the original, unscaled, ``y``.

    The constructor itself only stores the four arguments unchanged.

    Parameters
    ----------
    base_estimator: sklearn.base.BaseEstimator
        The model whose normal ``.fit`` and ``.predict`` methods will be used.
        Defaults to py:class:`gordo.machine.model.models.KerasAutoEncoder` with
        ``kind='feedforward_hourglass'``
    scaler: sklearn.base.TransformerMixin
        Defaults to ``sklearn.preprocessing.RobustScaler``
        Used for transforming model output and the original ``y`` to calculate
        the difference/error in model output vs expected.
    require_thresholds: bool
        Requires calculating ``thresholds_`` via a call to :func:`~DiffBasedAnomalyDetector.cross_validate`.
        If this is set (default True), but :func:`~DiffBasedAnomalyDetector.cross_validate`
        was not called before calling :func:`~DiffBasedAnomalyDetector.anomaly` an ``AttributeError``
        will be raised.
    window: int
        Window size for smoothed thresholds
    """
    # NOTE(review): the default ``base_estimator`` and ``scaler`` objects are
    # created once, when this function is defined, so every instance built
    # with the defaults receives the same two objects. Confirm this sharing
    # is intended.
    self.window = window
    self.require_thresholds = require_thresholds
    self.scaler = scaler
    self.base_estimator = base_estimator