Python源码示例:sklearn.base.clone()
示例1
def _check_behavior_2d(clf):
    """Check that predictions keep the shape of the training target.

    A fresh clone of ``clf`` is fitted and asked to predict twice: once
    with a 1d target and once with a 2d target. In both cases the shape of
    the prediction must equal the shape of ``y``.
    """
    # The original comment marks these features as "ignored".
    features = np.array([[0], [0], [0], [0]])
    targets = (
        np.array([1, 2, 1, 1]),        # 1d case
        np.array([[1, 0],
                  [2, 0],
                  [1, 0],
                  [1, 3]]),            # 2d case
    )
    for target in targets:
        fitted = clone(clf)
        fitted.fit(features, target)
        predicted = fitted.predict(features)
        assert_equal(target.shape, predicted.shape)
示例2
def _do_fit(n_jobs, verbose, pre_dispatch, base_estimator,
            X, y, scorer, parameter_iterable, fit_params,
            error_score, cv, **kwargs):
    """Fit and score a clone of ``base_estimator`` per (parameters, split).

    ``kwargs`` must contain ``'groups'``; it is popped and forwarded to
    ``cv.split``. Returns a list of 4-tuples built from the first three
    items of each ``_fit_and_score`` result with ``None`` inserted at
    index 2.
    """
    groups = kwargs.pop('groups')

    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)
    score_options = dict(fit_params=fit_params,
                         return_train_score=False,
                         return_n_test_samples=True,
                         return_times=False,
                         return_parameters=True,
                         error_score=error_score)
    jobs = (
        delayed(_fit_and_score)(clone(base_estimator), X, y, scorer,
                                train_idx, test_idx, verbose, candidate,
                                **score_options)
        for candidate in parameter_iterable
        for train_idx, test_idx in cv.split(X, y, groups)
    )

    # Each raw result is: test_score, n_samples, parameters
    raw_results = runner(jobs)

    # Reshape to: test_score, n_samples, _, parameters
    reshaped = []
    for result in raw_results:
        reshaped.append((result[0], result[1], None, result[2]))
    return reshaped
示例3
def _make_estimator(self, append=True, random_state=None):
    """Create and configure a clone of ``self.base_estimator_``.

    The clone receives ``self.estimator_params`` through ``set_params``.
    When ``random_state`` is not None it is applied with
    ``_set_random_states``. When ``append`` is true the clone is also
    appended to ``self.estimators_``.

    Warning: This method should be used to properly instantiate new
    sub-estimators.
    """
    # TODO: add a check for estimator_param
    sub_estimator = clone(self.base_estimator_)
    configured_params = self.estimator_params
    sub_estimator.set_params(**configured_params)

    if random_state is not None:
        _set_random_states(sub_estimator, random_state)

    if not append:
        return sub_estimator

    self.estimators_.append(sub_estimator)
    return sub_estimator
示例4
def check_cross_val_predict_binary(est, X, y, method):
    """Helper for tests of cross_val_predict with binary classification.

    Builds the expected output fold by fold with a 3-split, unshuffled
    KFold, then checks that ``cross_val_predict`` reproduces it for
    several representations of ``y``.
    """
    cv = KFold(n_splits=3, shuffle=False)

    # Work out the shape of the expected output.
    if y.ndim != 1:
        expected_shape = y.shape
    elif method == 'decision_function':
        expected_shape = (len(X),)
    else:
        expected_shape = (len(X), 2)

    expected = np.zeros(expected_shape)
    for train_idx, test_idx in cv.split(X, y):
        # ``est`` is rebound to the fitted clone on every fold.
        est = clone(est).fit(X[train_idx], y[train_idx])
        predict_fn = getattr(est, method)
        expected[test_idx] = predict_fn(X[test_idx])

    # Check actual outputs for several representations of y
    target_variants = [y, y + 1, y - 2, y.astype('str')]
    for target in target_variants:
        actual = cross_val_predict(est, X, target, method=method, cv=cv)
        assert_allclose(actual, expected)
示例5
def check_cross_val_predict_multiclass(est, X, y, method):
    """Helper for tests of cross_val_predict with multiclass classification.

    The expected array starts filled with a per-method default value and
    is overwritten, fold by fold, only in the columns of the classes that
    occur in that fold's training labels.
    """
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    lowest_float = np.finfo(np.float64).min
    fill_by_method = {'decision_function': lowest_float,
                      'predict_log_proba': lowest_float,
                      'predict_proba': 0}
    expected_shape = (len(X), len(set(y)))
    expected = np.full(expected_shape, fill_by_method[method],
                       dtype=np.float64)

    _, y_enc = np.unique(y, return_inverse=True)
    for train_idx, test_idx in cv.split(X, y_enc):
        # ``est`` is rebound to the fitted clone on every fold.
        est = clone(est).fit(X[train_idx], y_enc[train_idx])
        fold_output = getattr(est, method)(X[test_idx])
        classes_in_fold = np.unique(y_enc[train_idx])
        expected[np.ix_(test_idx, classes_in_fold)] = fold_output

    # Check actual outputs for several representations of y
    target_variants = [y, y + 1, y - 2, y.astype('str')]
    for target in target_variants:
        actual = cross_val_predict(est, X, target, method=method, cv=cv)
        assert_allclose(actual, expected)
示例6
def test_transform_target_regressor_2d_transformer_multioutput():
    """Check TransformedTargetRegressor with a 2D-only transformer and 2D y.

    Uses a StandardScaler transformer and a two-column target built from
    the module-level ``friedman`` data.
    """
    X = friedman[0]
    second_column = friedman[1] ** 2 + 1
    y = np.vstack((friedman[1], second_column)).T

    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape

    # consistency forward transform
    fitted_transformer = regr.transformer_
    y_tran = fitted_transformer.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape

    # consistency inverse transform
    y_roundtrip = regr.transformer_.inverse_transform(y_tran).squeeze()
    assert_allclose(y, y_roundtrip)

    # consistency of the regressor: a plain LinearRegression fitted on the
    # transformed target must give the same predictions and coefficients.
    reference = LinearRegression()
    reference_transformer = clone(transformer)
    reference.fit(X, reference_transformer.fit_transform(y))
    reference_pred = reference.predict(X)
    assert_allclose(y_pred,
                    reference_transformer.inverse_transform(reference_pred))
    assert_allclose(regr.regressor_.coef_, reference.coef_)
示例7
def test_fit_predict_on_pipeline():
    """Check that Pipeline.fit_predict matches running the steps by hand.

    The scaler + KMeans pipeline must yield the same labels as applying
    the transform and the clustering step separately.
    """
    iris = load_iris()

    standalone_scaler = StandardScaler()
    standalone_km = KMeans(random_state=0)

    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    pipeline_scaler = StandardScaler()
    pipeline_km = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    separate_pred = standalone_km.fit_predict(
        standalone_scaler.fit_transform(iris.data))

    # use a pipeline to do the transform and clustering in one step
    steps = [('scaler', pipeline_scaler), ('Kmeans', pipeline_km)]
    pipe = Pipeline(steps)
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
示例8
def test_base_chain_random_order():
    """Fit base chains with a random order and compare to a fixed order.

    For both a classifier chain and a regressor chain, a chain fitted with
    ``order='random'`` must match a chain fitted with that same order
    passed explicitly.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    chains = [ClassifierChain(LogisticRegression()),
              RegressorChain(Ridge())]
    for chain in chains:
        randomized = clone(chain)
        randomized = randomized.set_params(order='random', random_state=42)
        randomized.fit(X, Y)

        pinned = clone(chain)
        pinned = pinned.set_params(order=randomized.order_)
        pinned.fit(X, Y)

        assert_array_equal(pinned.order_, randomized.order_)
        # NOTE(review): this compares the `order` parameter (set to the
        # string 'random' above), not the fitted `order_` - confirm intent.
        assert_not_equal(list(randomized.order), list(range(4)))
        assert_equal(len(randomized.order_), 4)
        assert_equal(len(set(randomized.order_)), 4)

        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        paired = zip(randomized.estimators_, pinned.estimators_)
        for random_est, fixed_est in paired:
            assert_array_almost_equal(random_est.coef_, fixed_est.coef_)
示例9
def test_base_chain_crossval_fit_and_predict():
    """Fit chains with ``cv=3`` and verify the predictions.

    The cross-validated chain must predict an array of the same shape as
    the plain chain, must not be identical to it, and must pass a
    per-chain-type score threshold.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    chains = [ClassifierChain(LogisticRegression()),
              RegressorChain(Ridge())]
    for chain in chains:
        chain.fit(X, Y)
        chain_cv = clone(chain).set_params(cv=3)
        chain_cv.fit(X, Y)

        pred_cv = chain_cv.predict(X)
        pred_plain = chain.predict(X)

        assert pred_cv.shape == pred_plain.shape
        assert not np.all(pred_plain == pred_cv)

        is_classifier = isinstance(chain, ClassifierChain)
        if not is_classifier:
            assert mean_squared_error(Y, pred_cv) < .25
        else:
            assert jaccard_score(Y, pred_cv, average='samples') > .4
示例10
def test_classifier_results():
    """Test that classifier results match the target.

    Two identical SAG LogisticRegression models are fitted, one on a dense
    array and one on its CSR form; both must predict the training labels.
    """
    alpha = .1
    n_features = 20
    n_samples = 10
    tol = .01
    max_iter = 200

    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    weights = rng.normal(size=n_features)
    # Labels are the sign of a random linear projection of X.
    y = np.sign(np.dot(X, weights))

    dense_clf = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                                   max_iter=max_iter, tol=tol,
                                   random_state=77)
    sparse_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    sparse_clf.fit(sp.csr_matrix(X), y)

    dense_pred = dense_clf.predict(X)
    sparse_pred = sparse_clf.predict(X)
    assert_almost_equal(dense_pred, y, decimal=12)
    assert_almost_equal(sparse_pred, y, decimal=12)
示例11
def test_weighted_vs_repeated():
    """A sample weight of N should match an N-fold repetition of the sample.

    Relies on ``X``, ``n_samples``, ``n_clusters`` and ``centers`` from
    module scope.
    """
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)

    shared = dict(n_clusters=n_clusters, random_state=42)
    estimators = [
        KMeans(init="k-means++", **shared),
        KMeans(init="random", **shared),
        KMeans(init=centers.copy(), **shared),
        MiniBatchKMeans(batch_size=10, **shared),
    ]

    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)

        # Expand the weighted labels so they line up with the repeated rows.
        expanded_labels = np.repeat(est_weighted.labels_, sample_weight)
        agreement = v_measure_score(est_repeated.labels_, expanded_labels)
        assert_almost_equal(agreement, 1.0)

        # The cluster-centre comparison is skipped for MiniBatchKMeans.
        if isinstance(estimator, MiniBatchKMeans):
            continue
        assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                            _sort_centers(est_repeated.cluster_centers_))
示例12
def test_nmf_sparse_input():
    """Test that sparse matrices are accepted as input.

    For each solver, NMF fitted on a dense matrix and a clone fitted on
    its CSC form must produce almost-equal factors.
    """
    from scipy.sparse import csc_matrix

    rng = np.random.mtrand.RandomState(42)
    dense = np.abs(rng.randn(10, 10))
    # Zero out every even-indexed column.
    dense[:, 2 * np.arange(5)] = 0
    sparse = csc_matrix(dense)

    for solver in ('cd', 'mu'):
        dense_model = NMF(solver=solver, n_components=5, init='random',
                          random_state=0, tol=1e-2)
        sparse_model = clone(dense_model)

        W_dense = dense_model.fit_transform(dense)
        W_sparse = sparse_model.fit_transform(sparse)
        H_dense = dense_model.components_
        H_sparse = sparse_model.components_

        assert_array_almost_equal(W_dense, W_sparse)
        assert_array_almost_equal(H_dense, H_sparse)
示例13
def apply_gridsearch(self, model):
    """Run a grid search on ``model`` over the parameters being tuned.

    Updates ``best_model``/``best_score`` when the search improves on the
    current best (compared after multiplying by ``_score_mult``), copies
    the best parameters into ``_params``, records the search's best score
    in ``_temp_score`` and returns ``self``.
    """
    cv_settings = self.params_cv

    # check if a custom evaluation function is specified
    if not callable(cv_settings['scoring']):
        scoring = self.params_cv['scoring']
    else:
        scoring = make_scorer(self.params_cv['scoring'],
                              greater_is_better=self._greater_is_better)

    search = GridSearchCV(estimator=model,
                          param_grid=self.get_params_tune(),
                          scoring=scoring,
                          iid=self.params_cv['iid'],
                          cv=self.params_cv['cv_folds'],
                          n_jobs=self.params_cv['n_jobs'])
    search.fit(self.X, self.y)

    # update best model if best_score is improved
    candidate = search.best_score_ * self._score_mult
    incumbent = self.best_score * self._score_mult
    if candidate > incumbent:
        self.best_model = clone(search.best_estimator_)
        self.best_score = search.best_score_

    # update tuned parameters with optimal values
    for name, best_value in search.best_params_.items():
        self._params[name] = best_value
    self._temp_score = search.best_score_
    return self
示例14
def _clone_and_score_clusterer(clf, X, n_clusters):
    """Clone a clusterer, fit it with ``n_clusters`` and score it.

    Args:
        clf: Clusterer instance that implements ``fit``, ``fit_predict``,
            and ``score`` methods, and an ``n_clusters`` hyperparameter.
            e.g. :class:`sklearn.cluster.KMeans` instance
        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.
        n_clusters (int): Number of clusters

    Returns:
        score: Score of clusters
        time: Number of seconds elapsed; the timer starts before the
            clone is made and stops after scoring.
    """
    started_at = time.time()
    clusterer = clone(clf)
    clusterer.n_clusters = n_clusters
    cluster_score = clusterer.fit(X).score(X)
    elapsed = time.time() - started_at
    return cluster_score, elapsed
示例15
def fit(self, X, y=None, **fit_params):
    """Fit one clone of ``self.base_model`` per distinct value of ``X[self.by]``.

    ``X`` must be a pandas DataFrame, otherwise ValueError is raised. The
    fitted models are stored in ``self.models_`` keyed by group value.
    ``y`` is indexed with the per-group boolean mask. Returns ``self``.
    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError('X is not a pandas.DataFrame')

    self.models_ = {}
    fit_columns = self._get_fit_columns(X)

    for group_key in X[self.by].unique():
        # Every group gets its own unfitted copy of the base model.
        group_model = clone(self.base_model)

        # Boolean list marking the rows that belong to this group.
        in_group = (X[self.by] == group_key).tolist()
        group_index = X.index[in_group]

        group_model.fit(X.loc[group_index, fit_columns], y[in_group],
                        **fit_params)
        self.models_[group_key] = group_model

    return self
示例16
def net_fit(self, net_cls, module_cls, dummy_callback, data):
    """Build a brand-new net around ``module_cls`` and return ``net.fit(X, y)``.

    Careful, don't call additional fits or set_params on the result,
    since that would have side effects on other tests.
    """
    features, targets = data

    # A new instance is required instead of reusing the net fixture,
    # otherwise the fixture net and net_fit would refer to the same
    # object; clone(net) is not an option either because the
    # dummy_callback would then no longer be the mock.
    callbacks = [('dummy', dummy_callback)]
    fresh_net = net_cls(module_cls, callbacks=callbacks,
                        max_epochs=10, lr=0.1)
    return fresh_net.fit(features, targets)
示例17
def test_changing_model_reinitializes_optimizer(self, net, data):
    """Check that swapping the module's nonlinearity still trains its params.

    The idea is that the model is changed with `set_params` so that it
    gains parameters. Since the optimizer depends on the model parameters
    it needs to be reinitialized.
    """
    X, y = data

    net.set_params(module__nonlin=nn.ReLU())
    net.fit(X, y)

    net.set_params(module__nonlin=nn.PReLU())
    assert isinstance(net.module_.nonlin, nn.PReLU)
    weight_before = net.module_.nonlin.weight.data.clone().cpu().numpy()

    # make sure that we do not initialize again by making sure that
    # the network is initialized and by using partial_fit.
    assert net.initialized_
    net.partial_fit(X, y)
    weight_after = net.module_.nonlin.weight.data.clone().cpu().numpy()

    # all newly introduced parameters should have been trained (changed)
    # by the optimizer during partial_fit.
    difference = abs(weight_after - weight_before)
    assert (difference > 1e-05).all()
示例18
def fit(self, X, y):
    """Fit a clone of the base estimator on parameterized data.

    Parameters
    ----------
    * `X` [array-like, shape=(n_samples, n_features+len(params))]:
        The samples, concatenated with the corresponding parameter values.

    * `y` [array-like, shape=(n_samples,)]:
        The output values.

    Returns
    -------
    * `self` [object]:
        `self`.
    """
    self.stacker_ = ParameterStacker(self.params)

    # XXX: this assumes that X is extended with parameters
    n_columns = X.shape[1]
    self.n_features_ = n_columns - len(self.params)

    unfitted = clone(self.base_estimator)
    self.estimator_ = unfitted.fit(X, y)
    return self
示例19
def oob_dropcol_importances(rf, X_train, y_train):
    """
    Compute drop-column feature importances for scikit-learn.

    Given a RandomForestClassifier or RandomForestRegressor in rf
    and training X and y data, return a data frame indexed by Feature
    with an Importance column, sorted by descending importance.

    A clone of rf (with random_state forced to 999) is trained once to
    get the baseline out of bag (OOB) score and then again, once per
    feature with that column dropped; the importance of a feature is the
    baseline score minus the score without it.

    return: A data frame with Feature, Importance columns

    SAMPLE CODE

    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    X_train, y_train = ..., ...
    rf.fit(X_train, y_train)
    imp = oob_dropcol_importances(rf, X_train, y_train)
    """
    def _oob_score(features):
        # Train a fresh, identically seeded clone and report its OOB score.
        model = clone(rf)
        model.random_state = 999
        model.fit(features, y_train)
        return model.oob_score_

    baseline = _oob_score(X_train)
    score_drops = [baseline - _oob_score(X_train.drop(column, axis=1))
                   for column in X_train.columns]

    table = pd.DataFrame(data={'Feature': X_train.columns,
                               'Importance': np.array(score_drops)})
    table = table.set_index('Feature')
    return table.sort_values('Importance', ascending=False)
示例20
def oob_dropcol_importances(rf, X_train, y_train):
    """
    Compute drop-column feature importances for scikit-learn.

    Given a RandomForestClassifier or RandomForestRegressor in rf
    and training X and y data, return a data frame indexed by Feature
    with an Importance column, sorted by descending importance.

    A clone of rf (with random_state forced to 999) is trained once to
    get the baseline out of bag (OOB) score and then again, once per
    feature with that column dropped; the importance of a feature is the
    baseline score minus the score without it.

    return: A data frame with Feature, Importance columns

    SAMPLE CODE

    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    X_train, y_train = ..., ...
    rf.fit(X_train, y_train)
    imp = oob_dropcol_importances(rf, X_train, y_train)
    """
    def _fit_and_get_oob(frame):
        # Fresh clone per fit, always seeded the same way.
        forest = clone(rf)
        forest.random_state = 999
        forest.fit(frame, y_train)
        return forest.oob_score_

    baseline_score = _fit_and_get_oob(X_train)

    drops = []
    for feature in X_train.columns:
        reduced = X_train.drop(feature, axis=1)
        drops.append(baseline_score - _fit_and_get_oob(reduced))

    result = pd.DataFrame(data={'Feature': X_train.columns,
                                'Importance': np.array(drops)})
    result = result.set_index('Feature')
    result = result.sort_values('Importance', ascending=False)
    return result
示例21
def clone(self):
    """Return a new SklearnProbaAdapter wrapping ``clone(self._model)``."""
    # Inside a class body the inner `clone` resolves to the module-level
    # function (presumably sklearn.base.clone), not to this method.
    model_copy = clone(self._model)
    return SklearnProbaAdapter(model_copy)
示例22
def clone(self):
    """Build a fresh SklearnProbaAdapter around a clone of ``self._model``."""
    # The `clone` called here is the module-level function (presumably
    # sklearn.base.clone); as a method this name lives on the class only.
    cloned_model = clone(self._model)
    adapter = SklearnProbaAdapter(cloned_model)
    return adapter
示例23
def __init__(self, estimator, k_features,
             scoring=accuracy_score,
             test_size=0.25, random_state=1):
    """Store the settings and keep a clone of ``estimator``.

    The passed-in estimator itself is not retained; ``self.estimator`` is
    the result of ``clone(estimator)``.
    """
    self.scoring = scoring
    estimator_copy = clone(estimator)
    self.estimator = estimator_copy
    self.k_features = k_features
    self.test_size, self.random_state = test_size, random_state
示例24
def fit(self, X, y):
    """ Fit classifiers.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Matrix of training samples.

    y : array-like, shape = [n_samples]
        Vector of target class labels.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If ``self.vote`` is neither 'probability' nor 'classlabel', or if
        ``self.weights`` is truthy and its length differs from the number
        of classifiers.

    """
    if self.vote not in ('probability', 'classlabel'):
        # The message must format `self.vote`; a bare `vote` is undefined
        # in this scope and would raise NameError instead of ValueError.
        raise ValueError("vote must be 'probability' or 'classlabel'"
                         "; got (vote=%r)"
                         % self.vote)

    if self.weights and len(self.weights) != len(self.classifiers):
        raise ValueError('Number of classifiers and weights must be equal'
                         '; got %d weights, %d classifiers'
                         % (len(self.weights), len(self.classifiers)))

    # Use LabelEncoder to ensure class labels start with 0, which
    # is important for np.argmax call in self.predict
    self.lablenc_ = LabelEncoder()
    self.lablenc_.fit(y)
    self.classes_ = self.lablenc_.classes_

    # Each classifier is cloned before fitting, so the objects in
    # `self.classifiers` are not the ones being fitted.
    self.classifiers_ = []
    for clf in self.classifiers:
        fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
        self.classifiers_.append(fitted_clf)
    return self
示例25
def _do_fit(n_jobs, verbose, pre_dispatch, base_estimator,
            X, y, scorer, parameter_iterable, fit_params,
            error_score, cv, **kwargs):
    """Fit and score a clone of ``base_estimator`` per (parameters, split).

    ``cv`` is iterated directly as (train, test) pairs. Extra ``kwargs``
    are accepted but not used. Returns the raw results of the parallel
    ``_fit_and_score`` calls.
    """
    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)
    jobs = (
        delayed(_fit_and_score)(clone(base_estimator), X, y, scorer,
                                train_idx, test_idx, verbose, candidate,
                                fit_params, return_parameters=True,
                                error_score=error_score)
        for candidate in parameter_iterable
        for train_idx, test_idx in cv
    )
    # Per the original comment, each result is:
    # test_score, n_samples, score_time, parameters
    return runner(jobs)
示例26
def _clone_h2o_obj(estimator, ignore=False, **kwargs):
    """Clone ``estimator``, apply ``kwargs`` as attributes, and carry over
    the ``_parms`` of a trailing H2OEstimator pipeline step.

    NOTE(review): ``ignore`` is accepted but never read in this function.
    """
    # do initial clone
    cloned = clone(estimator)

    # set kwargs (a no-op when none were given)
    for attr_name, attr_value in six.iteritems(kwargs):
        setattr(cloned, attr_name, attr_value)

    # Only H2OPipeline instances need the extra parameter copy.
    if not isinstance(estimator, H2OPipeline):
        return cloned

    # the last step from the original estimator
    original_last = estimator.steps[-1][1]
    if isinstance(original_last, H2OEstimator):
        cloned_last = cloned.steps[-1][1]
        # Copy every parameter, normalised through _kv_str, onto the
        # matching step of the clone.
        for raw_key, raw_value in six.iteritems(original_last._parms):
            key, value = _kv_str(raw_key, raw_value)
            cloned_last._parms[key] = value
    # otherwise it's an BaseH2OFunctionWrapper

    return cloned
示例27
def _new_base_estimator(est, clonable_kwargs):
    """Instantiate a new base estimator from its short type name.

    When the grid searches are pickled, the estimator
    has to be dropped out. When we load it back in, we have
    to reinstate a new one, since the fit is predicated on
    being able to clone a base estimator, we've got to have
    an estimator to clone and fit.

    Parameters
    ----------
    est : str
        The type of model to build; one of 'dl', 'gbm', 'glm', 'nb', 'rf'.

    clonable_kwargs : dict
        Parameters written into the new estimator's ``_parms`` after each
        key/value pair is passed through ``_kv_str``.

    Returns
    -------
    estimator : H2OEstimator
        The new base estimator
    """
    estimator_classes = {
        'dl': H2ODeepLearningEstimator,
        'gbm': H2OGradientBoostingEstimator,
        'glm': H2OGeneralizedLinearEstimator,
        'nb': H2ONaiveBayesEstimator,
        'rf': H2ORandomForestEstimator
    }

    # An unknown name raises KeyError here.
    estimator_cls = estimator_classes[est]
    new_estimator = estimator_cls()

    for raw_key, raw_value in six.iteritems(clonable_kwargs):
        key, value = _kv_str(raw_key, raw_value)
        new_estimator._parms[key] = value

    return new_estimator
示例28
def test_clonable(est):
    """Check that a fitted estimator clones to a distinct same-class object."""
    # `y` is not defined in this function; it is read from module scope.
    est.fit(y)

    duplicate = clone(est)
    same_class = isinstance(duplicate, est.__class__)
    assert same_class
    assert est is not duplicate
示例29
def build_ensemble(cls, **kwargs):
    """Build ML-Ensemble.

    Instantiates ``cls(**kwargs)``, adds a layer made of clones of the
    selected entries of the module-level ``ESTIMATORS`` mapping, adds a
    RandomForestClassifier meta learner, and returns the ensemble.
    """
    ens = cls(**kwargs)

    use = ["ExtraTrees", "RandomForest",
           "LogisticRegression-SAG", "MLP-adam"]

    meta = RandomForestClassifier(n_estimators=100,
                                  random_state=0,
                                  n_jobs=-1)

    base_learners = list()
    for est_name, est in ESTIMATORS.items():
        # Filter first so that estimators which are not used are never
        # cloned just to be thrown away.
        if est_name not in use:
            continue

        e = clone(est)
        if est_name == "MLP-adam":
            e.verbose = False

        # Best effort: set_params raises ValueError for estimators that
        # have no `n_jobs` parameter, which is fine to ignore here.
        try:
            e.set_params(**{'n_jobs': 1})
        except ValueError:
            pass

        base_learners.append((est_name, e))

    ens.add(base_learners, proba=True, shuffle=True, random_state=1)
    ens.add_meta(meta, shuffle=True, random_state=2)
    return ens
示例30
def _pre_train(self, y):
    """Resolve the CV splitter and prepare one transformer clone per split.

    Sets ``self.cv`` (via ``check_cv``), ``self.n_splits`` and
    ``self.transformers``; the latter holds ``n_splits + 1`` clones of
    ``self.base_transformer``.
    """
    self.cv = check_cv(self.cv, y)
    self.n_splits = self.cv.get_n_splits()

    clones = []
    for _ in range(self.n_splits + 1):
        clones.append(clone(self.base_transformer))
    self.transformers = clones