Python source code examples: sklearn.base.is_classifier()
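sklearn.base.is_classifier(estimator) returns True when the given estimator (or a Pipeline / meta-estimator whose final step is a classifier) is a classifier, and False otherwise. The examples below use it to validate inputs, branch between predict_proba() and predict(), and choose appropriate cross-validation splitters. A minimal usage sketch (LogisticRegression and LinearRegression here are only illustrative estimators, not taken from the examples below):

from sklearn.base import is_classifier
from sklearn.linear_model import LinearRegression, LogisticRegression

print(is_classifier(LogisticRegression()))  # True: LogisticRegression is a classifier
print(is_classifier(LinearRegression()))    # False: LinearRegression is a regressor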
Example 1
def _validate_estimator(estimator: BaseEstimator):
    """
    Run some checks on the given object to determine if it's an estimator which is
    valid for our purposes.
    """
    # sklearn has a function that does a lot more intensive checking regarding
    # the interface of a candidate Estimator
    # (sklearn.utils.estimator_checks.check_estimator), but the function
    # doesn't work well for our use case as of version 0.22. It doesn't properly
    # detect Pipeline X_types based on the first pipeline component and won't
    # test anything that doesn't accept a 2-D numpy array as input. We'll settle
    # for lax checks here until sklearn has something that works better for us.
    if not is_classifier(estimator):
        raise ValueError(
            "Estimator must be a classifier according to sklearn.base.is_classifier()"
        )
    if not hasattr(estimator, "predict_proba"):
        raise ValueError(
            "Estimator must support the predict_proba() method to fulfill gobbli's "
            "interface requirements for a prediction model."
        )
Example 2
def test_submission(self, estimator_fitted, X):
    """Predict using a fitted estimator.

    Parameters
    ----------
    estimator_fitted : estimator object
        A fitted scikit-learn estimator.
    X : {array-like, sparse matrix, dataframe} of shape \
            (n_samples, n_features)
        The test data set.

    Returns
    -------
    pred : ndarray of shape (n_samples, n_classes) or (n_samples)
    """
    if is_classifier(estimator_fitted):
        return estimator_fitted.predict_proba(X)
    return estimator_fitted.predict(X)
Example 3
def __init__(self, models):
    """Proxy class to build an ensemble of models with an API as one

    Parameters
    ----------
    models: array
        An array of models
    """
    self._models = models if len(models) else None
    if self._models is not None:
        if is_classifier(self._models[0]):
            check_type = is_classifier
            self._scoring_fun = accuracy_score
        elif is_regressor(self._models[0]):
            check_type = is_regressor
            self._scoring_fun = r2_score
        else:
            raise ValueError('Expected regressors or classifiers,'
                             ' got %s instead' % type(self._models[0]))
        for model in self._models:
            if not check_type(model):
                raise ValueError('Different types of models found, provide'
                                 ' either regressors or classifiers.')
Example 4
def __init__(self, X, y, criterion, min_samples_split, max_depth,
             n_val_sample, random_state):
    # make sure max_depth > 1
    if max_depth < 2:
        raise ValueError("max depth must be > 1")

    # check the input arrays, and if it's classification validate the
    # target values in y
    X, y = check_X_y(X, y, accept_sparse=False, dtype=None, copy=True)
    if is_classifier(self):
        check_classification_targets(y)

    # hyper parameters so we can later inspect attributes of the model
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth
    self.n_val_sample = n_val_sample
    self.random_state = random_state

    # create the splitting class
    random_state = check_random_state(random_state)
    self.splitter = RandomSplitter(random_state, criterion, n_val_sample)

    # grow the tree depth first
    self.tree = self._find_next_split(X, y, 0)
Example 5
def fit(self, X, y, sample_weight=None):
    """Fit a separate classifier for each output variable."""
    for _, clf in self.classifiers:
        if not hasattr(clf, 'fit'):
            raise ValueError('Every base classifier should implement a fit method.')

    X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

    if is_classifier(self):
        check_classification_targets(y)

    if y.ndim == 1:
        raise ValueError('Output y must have at least two dimensions for '
                         'multi-output classification but has only one.')

    if sample_weight is not None and any([not has_fit_parameter(clf, 'sample_weight')
                                          for _, clf in self.classifiers]):
        raise ValueError('One of base classifiers does not support sample weights.')

    self.classifiers_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(clf, X, y[:, i], sample_weight)
        for i, (_, clf) in zip(range(y.shape[1]), self.classifiers))

    return self
Example 6
def convert_sklearn_grid_search_cv(scope, operator, container):
    """
    Converter for scikit-learn's GridSearchCV.
    """
    opts = scope.get_options(operator.raw_operator)
    grid_search_op = operator.raw_operator
    best_estimator = grid_search_op.best_estimator_
    op_type = sklearn_operator_name_map[type(best_estimator)]
    grid_search_operator = scope.declare_local_operator(op_type)
    grid_search_operator.raw_operator = best_estimator
    container.add_options(id(best_estimator), opts)
    grid_search_operator.inputs = operator.inputs
    label_name = scope.declare_local_variable('label')
    grid_search_operator.outputs.append(label_name)
    if is_classifier(best_estimator):
        proba_name = scope.declare_local_variable('probability_tensor',
                                                  FloatTensorType())
        grid_search_operator.outputs.append(proba_name)
    apply_identity(scope, label_name.full_name,
                   operator.outputs[0].full_name, container)
    if is_classifier(best_estimator):
        apply_identity(scope, proba_name.full_name,
                       operator.outputs[1].full_name, container)
Example 7
def fit(self, X, y):
    y_labels = self._get_labels(y)
    cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
    self.estimators_ = []

    for train, _ in cv.split(X, y_labels):
        self.estimators_.append(
            clone(self.estimator).fit(X[train], y_labels[train])
        )
    return self
Example 8
def transform(self, X, y=None):
    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

    X_prob = np.zeros((X.shape[0], self.n_classes))
    X_pred = np.zeros(X.shape[0])

    for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
        X_prob[test] = estimator.predict_proba(X[test])
        X_pred[test] = estimator.predict(X[test])
    return np.hstack([X_prob, np.array([X_pred]).T])
Example 9
def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]))
Example 10
def test_late_onset_averaging_not_reached(klass):
    clf1 = klass(average=600)
    clf2 = klass()

    for _ in range(100):
        if is_classifier(clf1):
            clf1.partial_fit(X, Y, classes=np.unique(Y))
            clf2.partial_fit(X, Y, classes=np.unique(Y))
        else:
            clf1.partial_fit(X, Y)
            clf2.partial_fit(X, Y)

    assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16)
    assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16)
Example 11
def test_validation_set_not_used_for_training(klass):
    X, Y = iris.data, iris.target
    validation_fraction = 0.4
    seed = 42
    shuffle = False
    max_iter = 10

    clf1 = klass(early_stopping=True,
                 random_state=np.random.RandomState(seed),
                 validation_fraction=validation_fraction,
                 learning_rate='constant', eta0=0.01,
                 tol=None, max_iter=max_iter, shuffle=shuffle)
    clf1.fit(X, Y)
    assert clf1.n_iter_ == max_iter

    clf2 = klass(early_stopping=False,
                 random_state=np.random.RandomState(seed),
                 learning_rate='constant', eta0=0.01,
                 tol=None, max_iter=max_iter, shuffle=shuffle)

    if is_classifier(clf2):
        cv = StratifiedShuffleSplit(test_size=validation_fraction,
                                    random_state=seed)
    else:
        cv = ShuffleSplit(test_size=validation_fraction,
                          random_state=seed)
    idx_train, idx_val = next(cv.split(X, Y))
    idx_train = np.sort(idx_train)  # remove shuffling
    clf2.fit(X[idx_train], Y[idx_train])
    assert clf2.n_iter_ == max_iter

    assert_array_equal(clf1.coef_, clf2.coef_)
Example 12
def fit(self, X, y=None, groups=None, **fit_params):
    """
    Run fit method with all sets of parameters

    Args
    ----
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features
    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning
    groups : array-like, shape = [n_samples], optional
        Training vector groups for cross-validation
    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of the estimator
    """
    # check estimator and cv methods are valid
    self.cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

    # check for binary response
    if len(np.unique(y)) > 2:
        raise ValueError('Only a binary response vector is currently supported')

    # check that scoring metric has been specified
    if self.scoring is None:
        raise ValueError('No score function is defined')

    # perform cross validation prediction
    self.y_pred_ = cross_val_predict(
        estimator=self.estimator, X=X, y=y, groups=groups, cv=self.cv,
        method='predict_proba', n_jobs=self.n_jobs, **fit_params)
    self.y_true = y

    # add fold id to the predictions
    self.test_idx_ = [indexes[1] for indexes in self.cv.split(X, y, groups)]
Example 13
def cross_val_score(
    estimator,
    X,
    y=None,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=1,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train, test, verbose, None, fit_params
        )
        for train, test in splits
    )

    group_order = []
    if hasattr(cv, "groups"):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
Example 14
def test_is_classifier():
    svc = SVC()
    assert_true(is_classifier(svc))
    assert_true(is_classifier(GridSearchCV(svc, {'C': [0.1, 1]})))
    assert_true(is_classifier(Pipeline([('svc', svc)])))
    assert_true(is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))])))
Example 15
def test_precision():
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None, precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # will have the number of precision set in the export_graphviz
            # function. We will check the number of precision with a strict
            # equality. The value reported will have only 2 precision and
            # therefore, only a less equal comparison will be done.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert_less_equal(
                    len(search(r"\.\d+", finding.group()).group()),
                    precision + 1)

            # check impurity
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"
            for finding in finditer(pattern, dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)

            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)
Example 16
def yield_all_checks(name, estimator):
    tags = estimator._get_tags()
    if "2darray" not in tags["X_types"]:
        warnings.warn("Can't test estimator {} which requires input "
                      " of type {}".format(name, tags["X_types"]),
                      SkipTestWarning)
        return
    if tags["_skip_test"]:
        warnings.warn("Explicit SKIP via _skip_test tag for estimator "
                      "{}.".format(name),
                      SkipTestWarning)
        return

    yield from _yield_checks(name, estimator)
    if is_classifier(estimator):
        yield from _yield_classifier_checks(name, estimator)
    if is_regressor(estimator):
        yield from _yield_regressor_checks(name, estimator)
    if hasattr(estimator, 'transform'):
        if not tags["allow_variable_length"]:
            # Transformer tests ensure that shapes are the same at fit and
            # transform time, hence we need to skip them for estimators that
            # allow variable-length inputs
            yield from _yield_transformer_checks(name, estimator)
    if isinstance(estimator, ClusterMixin):
        yield from _yield_clustering_checks(name, estimator)
    if is_outlier_detector(estimator):
        yield from _yield_outliers_checks(name, estimator)

    # We are not strict on presence/absence of the 3rd dimension
    # yield check_fit2d_predict1d
    if not tags["non_deterministic"]:
        yield check_methods_subset_invariance
    yield check_fit2d_1sample
    yield check_fit2d_1feature
    yield check_fit1d
    yield check_get_params_invariance
    yield check_set_params
    yield check_dict_unchanged
    yield check_dont_overwrite_parameters
    yield check_fit_idempotent

    if (is_classifier(estimator) or
            is_regressor(estimator) or
            isinstance(estimator, ClusterMixin)):
        if tags["allow_variable_length"]:
            yield check_different_length_fit_predict_transform
Example 17
def permutation_test_score(
    estimator,
    X,
    y,
    groups=None,
    cv=None,
    n_permutations=100,
    n_jobs=1,
    random_state=0,
    verbose=0,
    scoring=None,
):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of the original sklearn permutation test score function:
    the p-value is evaluated outside this function, so that the scores can be
    reused elsewhere.

    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
       Performance. The Journal of Machine Learning Research (2010) vol. 11
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
Example 18
def _cross_validate_with_pruning(
    self,
    trial,      # type: trial_module.Trial
    estimator,  # type: BaseEstimator
):
    # type: (...) -> Dict[str, OneDimArrayLikeType]
    if is_classifier(estimator):
        partial_fit_params = self.fit_params.copy()
        classes = np.unique(self.y)
        partial_fit_params.setdefault("classes", classes)
    else:
        partial_fit_params = self.fit_params

    n_splits = self.cv.get_n_splits(self.X, self.y, groups=self.groups)
    estimators = [clone(estimator) for _ in range(n_splits)]
    scores = {
        "fit_time": np.zeros(n_splits),
        "score_time": np.zeros(n_splits),
        "test_score": np.empty(n_splits),
    }
    if self.return_train_score:
        scores["train_score"] = np.empty(n_splits)

    for step in range(self.max_iter):
        for i, (train, test) in enumerate(
                self.cv.split(self.X, self.y, groups=self.groups)):
            out = self._partial_fit_and_score(estimators[i], train, test,
                                              partial_fit_params)
            if self.return_train_score:
                scores["train_score"][i] = out.pop(0)
            scores["test_score"][i] = out[0]
            scores["fit_time"][i] += out[1]
            scores["score_time"][i] += out[2]

        intermediate_value = np.nanmean(scores["test_score"])
        trial.report(intermediate_value, step=step)

        if trial.should_prune():
            self._store_scores(trial, scores)
            raise TrialPruned("trial was pruned at iteration {}.".format(step))

    return scores
Example 19
def test_precision():
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None, precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # will have the number of precision set in the export_graphviz
            # function. We will check the number of precision with a strict
            # equality. The value reported will have only 2 precision and
            # therefore, only a less equal comparison will be done.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert_less_equal(
                    len(search(r"\.\d+", finding.group()).group()),
                    precision + 1)

            # check impurity
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"
            for finding in finditer(pattern, dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)

            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)