Python source code examples: sklearn.base.is_classifier()
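
The examples below show how sklearn.base.is_classifier() is used in practice. The function returns True when the given estimator is a classifier (historically, when its _estimator_type attribute is "classifier", as provided by ClassifierMixin) and False otherwise; meta-estimators such as Pipeline and GridSearchCV report the type of the estimator they wrap. A minimal sketch of the basic behaviour, assuming only standard scikit-learn estimators (this snippet is an illustration, not one of the collected examples):

from sklearn.base import is_classifier
from sklearn.linear_model import LinearRegression, LogisticRegression

print(is_classifier(LogisticRegression()))  # True: a classifier
print(is_classifier(LinearRegression()))    # False: a regressor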

Example 1
def _validate_estimator(estimator: BaseEstimator):
        """
        Run some checks on the given object to determine if it's an estimator which is
        valid for our purposes.
        """
        # sklearn has a function that does a lot more intensive checking regarding
        # the interface of a candidate Estimator
        # (sklearn.utils.estimator_checks.check_estimator), but the function
        # doesn't work well for our use case as of version 0.22.  It doesn't properly
        # detect Pipeline X_types based on the first pipeline component and won't
        # test anything that doesn't accept a 2-D numpy array as input.  We'll settle
        # for lax checks here until sklearn has something that works better for us.
        if not is_classifier(estimator):
            raise ValueError(
                "Estimator must be a classifier according to sklearn.base.is_classifier()"
            )

        if not hasattr(estimator, "predict_proba"):
            raise ValueError(
                "Estimator must support the predict_proba() method to fulfill gobbli's "
                "interface requirements for a prediction model."
            ) 
Example 2
def test_submission(self, estimator_fitted, X):
        """Predict using a fitted estimator.

        Parameters
        ----------
        estimator_fitted : estimator object
            A fitted scikit-learn estimator.
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            The test data set.

        Returns
        -------
        pred : ndarray of shape (n_samples, n_classes) or (n_samples)
        """
        if is_classifier(estimator_fitted):
            return estimator_fitted.predict_proba(X)
        return estimator_fitted.predict(X) 
Example 3
def __init__(self, models):
        """Proxy class to build an ensemble of models with an API as one

        Parameters
        ----------
        models: array
            An array of models
        """
        self._models = models if len(models) else None
        if self._models is not None:
            if is_classifier(self._models[0]):
                check_type = is_classifier
                self._scoring_fun = accuracy_score
            elif is_regressor(self._models[0]):
                check_type = is_regressor
                self._scoring_fun = r2_score
            else:
                raise ValueError('Expected regressors or classifiers,'
                                 ' got %s instead' % type(self._models[0]))
            for model in self._models:
                if not check_type(model):
                    raise ValueError('Different types of models found, provide'
                                     ' either regressors or classifiers.')
Example 4
def __init__(self, X, y, criterion, min_samples_split, max_depth,
                 n_val_sample, random_state):
        # make sure max_depth > 1
        if max_depth < 2:
            raise ValueError("max depth must be > 1")

        # check the input arrays, and if it's classification validate the
        # target values in y
        X, y = check_X_y(X, y, accept_sparse=False, dtype=None, copy=True)
        if is_classifier(self):
            check_classification_targets(y)

        # store the hyperparameters so we can later inspect attributes of the model
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_val_sample = n_val_sample
        self.random_state = random_state

        # create the splitting class
        random_state = check_random_state(random_state)
        self.splitter = RandomSplitter(random_state, criterion, n_val_sample)

        # grow the tree depth first
        self.tree = self._find_next_split(X, y, 0) 
Example 5
def fit(self, X, y, sample_weight=None):
        """Fit a separate classifier for each output variable."""

        for _, clf in self.classifiers:
            if not hasattr(clf, 'fit'):
                raise ValueError('Every base classifier should implement a fit method.')

        X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(y)

        if y.ndim == 1:
            raise ValueError('Output y must have at least two dimensions for multi-output classification but has only one.')

        if sample_weight is not None and any([not has_fit_parameter(clf, 'sample_weight') for _, clf in self.classifiers]):
            raise ValueError('One of the base classifiers does not support sample weights.')

        self.classifiers_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_estimator)(clf, X, y[:, i], sample_weight) 
                                                        for i, (_, clf) in zip(range(y.shape[1]), self.classifiers))
        
        return self 
Example 6
def convert_sklearn_grid_search_cv(scope, operator, container):
    """
    Converter for scikit-learn's GridSearchCV.
    """
    opts = scope.get_options(operator.raw_operator)
    grid_search_op = operator.raw_operator
    best_estimator = grid_search_op.best_estimator_
    op_type = sklearn_operator_name_map[type(best_estimator)]
    grid_search_operator = scope.declare_local_operator(op_type)
    grid_search_operator.raw_operator = best_estimator
    container.add_options(id(best_estimator), opts)
    grid_search_operator.inputs = operator.inputs
    label_name = scope.declare_local_variable('label')
    grid_search_operator.outputs.append(label_name)
    if is_classifier(best_estimator):
        proba_name = scope.declare_local_variable('probability_tensor',
                                                  FloatTensorType())
        grid_search_operator.outputs.append(proba_name)
    apply_identity(scope, label_name.full_name,
                   operator.outputs[0].full_name, container)
    if is_classifier(best_estimator):
        apply_identity(scope, proba_name.full_name,
                       operator.outputs[1].full_name, container) 
Example 7
def fit(self, X, y):
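        # Fit one clone of the base estimator on each cross-validation
        # training fold and collect the fitted clones in self.estimators_.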
        y_labels = self._get_labels(y)
        cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
        self.estimators_ = []
        
        for train, _ in cv.split(X, y_labels):
            self.estimators_.append(
                clone(self.estimator).fit(X[train], y_labels[train])
            )
        return self 
Example 8
def transform(self, X, y=None):
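        # Build out-of-fold features: for each CV test fold, the corresponding
        # fitted estimator supplies per-class probabilities and a predicted
        # label, which are stacked column-wise.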
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        
        X_prob = np.zeros((X.shape[0], self.n_classes))
        X_pred = np.zeros(X.shape[0])
        
        for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
            X_prob[test] = estimator.predict_proba(X[test])
            X_pred[test] = estimator.predict(X[test])
        return np.hstack([X_prob, np.array([X_pred]).T]) 
Example 9
def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))])) 
Example 10
def test_late_onset_averaging_not_reached(klass):
    clf1 = klass(average=600)
    clf2 = klass()
    for _ in range(100):
        if is_classifier(clf1):
            clf1.partial_fit(X, Y, classes=np.unique(Y))
            clf2.partial_fit(X, Y, classes=np.unique(Y))
        else:
            clf1.partial_fit(X, Y)
            clf2.partial_fit(X, Y)

    assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16)
    assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) 
Example 11
def test_validation_set_not_used_for_training(klass):
    X, Y = iris.data, iris.target
    validation_fraction = 0.4
    seed = 42
    shuffle = False
    max_iter = 10
    clf1 = klass(early_stopping=True,
                 random_state=np.random.RandomState(seed),
                 validation_fraction=validation_fraction,
                 learning_rate='constant', eta0=0.01,
                 tol=None, max_iter=max_iter, shuffle=shuffle)
    clf1.fit(X, Y)
    assert clf1.n_iter_ == max_iter

    clf2 = klass(early_stopping=False,
                 random_state=np.random.RandomState(seed),
                 learning_rate='constant', eta0=0.01,
                 tol=None, max_iter=max_iter, shuffle=shuffle)

    if is_classifier(clf2):
        cv = StratifiedShuffleSplit(test_size=validation_fraction,
                                    random_state=seed)
    else:
        cv = ShuffleSplit(test_size=validation_fraction,
                          random_state=seed)
    idx_train, idx_val = next(cv.split(X, Y))
    idx_train = np.sort(idx_train)  # remove shuffling
    clf2.fit(X[idx_train], Y[idx_train])
    assert clf2.n_iter_ == max_iter

    assert_array_equal(clf1.coef_, clf2.coef_) 
Example 12
def fit(self, X, y=None, groups=None, **fit_params):
        """
        Run fit method with all sets of parameters

        Args
        ----
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning

        groups : array-like, shape = [n_samples], optional
            Training vector groups for cross-validation

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """

        # check estimator and cv methods are valid
        self.cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        # check for binary response
        if len(np.unique(y)) > 2:
            raise ValueError('Only a binary response vector is currently supported')

        # check that scoring metric has been specified
        if self.scoring is None:
            raise ValueError('No score function is defined')

        # perform cross validation prediction
        self.y_pred_ = cross_val_predict(
            estimator=self.estimator, X=X, y=y, groups=groups, cv=self.cv,
            method='predict_proba', n_jobs=self.n_jobs, **fit_params)
        self.y_true = y

        # add fold id to the predictions
        self.test_idx_ = [indexes[1] for indexes in self.cv.split(X, y, groups)] 
Example 13
def cross_val_score(
    estimator,
    X,
    y=None,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=1,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train, test, verbose, None, fit_params
        )
        for train, test in splits
    )

    group_order = []
    if hasattr(cv, "groups"):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order 
Example 14
def test_is_classifier():
    svc = SVC()
    assert_true(is_classifier(svc))
    assert_true(is_classifier(GridSearchCV(svc, {'C': [0.1, 1]})))
    assert_true(is_classifier(Pipeline([('svc', svc)])))
    assert_true(is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]))) 
Example 15
def test_precision():

    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None, precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # use exactly the number of decimal places set by the precision
            # argument of export_graphviz, so we check them with strict
            # equality. The reported value has only 2 decimal places, so
            # only a less-than-or-equal comparison is done for it.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert_less_equal(
                    len(search(r"\.\d+", finding.group()).group()),
                    precision + 1)
            # check impurity
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"

            # check impurity
            for finding in finditer(pattern, dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)
            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1) 
Example 16
def yield_all_checks(name, estimator):
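    # Skip estimators whose tags mark them as untestable, then yield the
    # generic checks followed by the checks specific to the estimator type
    # (classifier, regressor, transformer, clusterer, outlier detector).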
    tags = estimator._get_tags()
    if "2darray" not in tags["X_types"]:
        warnings.warn("Can't test estimator {} which requires input "
                      " of type {}".format(name, tags["X_types"]),
                      SkipTestWarning)
        return
    if tags["_skip_test"]:
        warnings.warn("Explicit SKIP via _skip_test tag for estimator "
                      "{}.".format(name),
                      SkipTestWarning)
        return

    yield from _yield_checks(name, estimator)
    if is_classifier(estimator):
        yield from _yield_classifier_checks(name, estimator)
    if is_regressor(estimator):
        yield from _yield_regressor_checks(name, estimator)
    if hasattr(estimator, 'transform'):
        if not tags["allow_variable_length"]:
            # Transformer tests ensure that shapes are the same at fit and
            # transform time, hence we need to skip them for estimators that
            # allow variable-length inputs
            yield from _yield_transformer_checks(name, estimator)
    if isinstance(estimator, ClusterMixin):
        yield from _yield_clustering_checks(name, estimator)
    if is_outlier_detector(estimator):
        yield from _yield_outliers_checks(name, estimator)
    # We are not strict on presence/absence of the 3rd dimension
    # yield check_fit2d_predict1d

    if not tags["non_deterministic"]:
        yield check_methods_subset_invariance

    yield check_fit2d_1sample
    yield check_fit2d_1feature
    yield check_fit1d
    yield check_get_params_invariance
    yield check_set_params
    yield check_dict_unchanged
    yield check_dont_overwrite_parameters
    yield check_fit_idempotent

    if (is_classifier(estimator) or
            is_regressor(estimator) or
            isinstance(estimator, ClusterMixin)):
        if tags["allow_variable_length"]:
            yield check_different_length_fit_predict_transform 
Example 17
def permutation_test_score(
    estimator,
    X,
    y,
    groups=None,
    cv=None,
    n_permutations=100,
    n_jobs=1,
    random_state=0,
    verbose=0,
    scoring=None,
):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of scikit-learn's original permutation_test_score that
    leaves the p-value evaluation to the caller, so that the permutation
    scores can be reused outside this function.


    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    return permutation_scores 
Example 18
def _cross_validate_with_pruning(
        self,
        trial,  # type: trial_module.Trial
        estimator,  # type: BaseEstimator
    ):
        # type: (...) -> Dict[str, OneDimArrayLikeType]
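        # Cross-validate with repeated partial_fit calls so the Optuna trial
        # can report an intermediate score after each iteration and be pruned
        # early if the pruner decides to stop it.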

        if is_classifier(estimator):
            partial_fit_params = self.fit_params.copy()
            classes = np.unique(self.y)

            partial_fit_params.setdefault("classes", classes)

        else:
            partial_fit_params = self.fit_params

        n_splits = self.cv.get_n_splits(self.X, self.y, groups=self.groups)
        estimators = [clone(estimator) for _ in range(n_splits)]
        scores = {
            "fit_time": np.zeros(n_splits),
            "score_time": np.zeros(n_splits),
            "test_score": np.empty(n_splits),
        }

        if self.return_train_score:
            scores["train_score"] = np.empty(n_splits)

        for step in range(self.max_iter):
            for i, (train, test) in enumerate(self.cv.split(self.X, self.y, groups=self.groups)):
                out = self._partial_fit_and_score(estimators[i], train, test, partial_fit_params)

                if self.return_train_score:
                    scores["train_score"][i] = out.pop(0)

                scores["test_score"][i] = out[0]
                scores["fit_time"][i] += out[1]
                scores["score_time"][i] += out[2]

            intermediate_value = np.nanmean(scores["test_score"])

            trial.report(intermediate_value, step=step)

            if trial.should_prune():
                self._store_scores(trial, scores)

                raise TrialPruned("trial was pruned at iteration {}.".format(step))

        return scores 