Python source code examples: sklearn.grid_search.GridSearchCV()
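All of the examples below come from projects written against the legacy sklearn.grid_search module, which was deprecated in scikit-learn 0.18 and removed in 0.20; most also use Python 2 print statements. For orientation, here is a minimal sketch of the same call pattern against the modern sklearn.model_selection API. The SVC estimator, the iris data, and the parameter values are illustrative assumptions, not taken from the examples below. Note also that the old 'mean_squared_error' scoring string seen in several examples already returned negated errors (greater is better); its modern spelling is 'neg_mean_squared_error'.

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
# scoring replaces the legacy score_func argument
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_)
print(grid.cv_results_['mean_test_score'])  # cv_results_ replaces grid_scores_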
Example 1
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )
    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf
    return clf
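The vect__ and clf__ prefixes in Example 1 follow scikit-learn's Pipeline parameter convention, <step name>__<parameter>: each key addresses a parameter of the pipeline step with that name. The snippet never shows clf_factory, but it presumably builds a pipeline along these lines; the TfidfVectorizer/MultinomialNB pairing is an assumption for illustration.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def create_ngram_model():
    # the step names 'vect' and 'clf' are what the vect__*/clf__* keys
    # in param_grid refer to
    return Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 3))),
                     ('clf', MultinomialNB())])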
Example 2
def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )
    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf
    return clf
Example 3
def nestedCrossValidation(X, y, cvFolds, estimator):
    kf = KFold(len(X), n_folds=cvFolds, shuffle=True, random_state=30)
    cv_j = 0
    param_grid = {'alpha': [0.0000001, 0.000001, 0.00001, 0.0001, 0.001,
                            0.01, 0.1, 1, 10, 100, 1000, 10000, 100000,
                            1000000, 10000000, 1000000000]}
    r2 = np.zeros((cvFolds, 1))
    for train_index, test_index in kf:
        train_X = X[train_index, :]
        test_X = X[test_index, :]
        train_y = y[train_index]
        test_y = y[test_index]
        grid = GridSearchCV(estimator, param_grid=param_grid, verbose=0,
                            cv=cvFolds, scoring='mean_squared_error')
        grid.fit(train_X, train_y)
        y_true, y_pred = test_y, grid.best_estimator_.predict(test_X)
        r2[cv_j] = r2_score(y_true, y_pred)
        cv_j = cv_j + 1
    return r2

#%% main script
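Against the modern API the same nested cross-validation collapses to a few lines, because a GridSearchCV instance is itself an estimator that cross_val_score can wrap. A hedged sketch, assuming scikit-learn >= 0.18 and an illustrative Ridge regressor with an alpha grid similar to the one above:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

def nested_cv_r2(X, y, cv_folds=5):
    # inner loop tunes alpha; the outer loop yields unbiased R^2 estimates
    inner = GridSearchCV(Ridge(),
                         {'alpha': np.logspace(-7, 9, 17)},
                         cv=cv_folds,
                         scoring='neg_mean_squared_error')
    outer = KFold(n_splits=cv_folds, shuffle=True, random_state=30)
    return cross_val_score(inner, X, y, cv=outer, scoring='r2')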
Example 4
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_features': ['sqrt', 'log2', None],
                         'max_depth': range(2, 1000),
                         }]
    reg = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)
    print "Best parameters set found on development set:\n"
    print reg.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in reg.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print "MSE for test data set:\n"
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print mean_squared_error(y_true, y_pred)
Example 5
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'alpha': np.logspace(-5, 5)
                         }]
    reg = GridSearchCV(linear_model.Ridge(alpha=0.5), tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)
    print "Best parameters set found on development set:\n"
    print reg.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in reg.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print reg.scorer_
    print "MSE for test data set:"
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print mean_squared_error(y_true, y_pred)
Example 6
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(2, 100)
                         }]
    reg = GridSearchCV(neighbors.KNeighborsRegressor(), tuned_parameters, cv=5,
                       scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)
    print "Best parameters set found on development set:\n"
    print reg.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in reg.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print reg.scorer_
    print "MSE for test data set:"
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print mean_squared_error(y_true, y_pred)
Example 7
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20, 60),
                         'n_estimators': range(10, 40),
                         'max_features': ['sqrt', 'log2', None]
                         }]
    clf = GridSearchCV(RandomForestRegressor(n_estimators=30), tuned_parameters,
                       cv=5, scoring='mean_squared_error')
    clf.fit(self.X_train, self.y_train.ravel())
    print "Best parameters set found on development set:\n"
    print clf.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print "MSE for test data set:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print mean_squared_error(y_true, y_pred)
Example 8
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': np.logspace(-4, 3, 30),
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]},
                        {'kernel': ['poly'],
                         'degree': [1, 2, 3, 4],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                         'coef0': np.logspace(-4, 3, 30)},
                        {'kernel': ['linear'],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}]
    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,
                       scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())
    print "Best parameters set found on development set:\n"
    print clf.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
Example 9
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'penalty': ['l1'],
                         'C': np.logspace(-5, 5)},
                        {'penalty': ['l2'],
                         'C': np.logspace(-5, 5)}]
    clf = GridSearchCV(linear_model.LogisticRegression(tol=1e-6), tuned_parameters,
                       cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())
    print "Best parameters set found on development set:\n"
    print clf.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
Example 10
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(2, 60)
                         }]
    clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parameters,
                       cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())
    print "Best parameters set found on development set:\n"
    print clf.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
Example 11
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(2, 60),
                         'max_features': ['sqrt', 'log2', None]
                         }]
    clf = GridSearchCV(DecisionTreeClassifier(max_depth=5), tuned_parameters,
                       cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())
    print "Best parameters set found on development set:\n"
    print clf.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
Example 12
def test_same_result(self):
    X, y, Z = self.make_classification(2, 40000, nonnegative=True)
    parameters = {'alpha': [0.1, 1, 10]}
    fit_params = {'classes': np.unique(y)}
    local_estimator = MultinomialNB()
    local_grid = GridSearchCV(estimator=local_estimator,
                              param_grid=parameters)
    estimator = SparkMultinomialNB()
    grid = SparkGridSearchCV(estimator=estimator,
                             param_grid=parameters,
                             fit_params=fit_params)
    local_grid.fit(X, y)
    grid.fit(Z)
    locscores = [r.mean_validation_score for r in local_grid.grid_scores_]
    scores = [r.mean_validation_score for r in grid.grid_scores_]
    assert_array_almost_equal(locscores, scores, decimal=2)
Example 13
def compute_svm_score_nestedCV(K, y, n_folds,
                               scoring=balanced_accuracy_scoring,
                               random_state=None,
                               param_grid=[{'C': np.logspace(-5, 5, 25)}]):
    """Compute cross-validated score of SVM using precomputed kernel.
    """
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                         random_state=random_state)
    scores = np.zeros(n_folds)
    for i, (train, test) in enumerate(cv):
        cvclf = SVC(kernel='precomputed')
        y_train = y[train]
        cvcv = StratifiedKFold(y_train, n_folds=n_folds,
                               shuffle=True,
                               random_state=random_state)
        clf = GridSearchCV(cvclf, param_grid=param_grid, scoring=scoring,
                           cv=cvcv, n_jobs=1)
        clf.fit(K[train, :][:, train], y_train)
        # print clf.best_params_
        scores[i] = clf.score(K[test, :][:, train], y[test])
    return scores.mean()
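The K[train, :][:, train] / K[test, :][:, train] indexing above is the standard trick for precomputed kernels: rows index the samples being fitted or scored, while columns always index the training samples. A hedged sketch of the same pattern against the modern API, assuming an illustrative linear kernel on the iris data:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
K = linear_kernel(X)  # n_samples x n_samples Gram matrix
outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train, test in outer.split(X, y):
    grid = GridSearchCV(SVC(kernel='precomputed'),
                        {'C': np.logspace(-2, 2, 5)}, cv=3)
    # rows = samples to fit/score, columns = training samples
    grid.fit(K[np.ix_(train, train)], y[train])
    print(grid.score(K[np.ix_(test, train)], y[test]))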
Example 14
def test_cv_pipeline(self):
    pipeline = SKL_Pipeline([
        ('vect', SKL_HashingVectorizer(n_features=20)),
        ('tfidf', SKL_TfidfTransformer(use_idf=False)),
        ('lasso', SKL_Lasso())
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }
    # GridSearchCV here is the spark-sklearn variant, whose first
    # argument is a SparkContext
    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
Example 15
def fit(self, X, y, featurename=[]):
    self.dim_ = X.shape[1]
    self.setfeaturename(featurename)
    self.setdefaultpred(y)
    param_grid = {"max_depth": self.max_depth_,
                  "min_samples_leaf": self.min_samples_leaf_}
    if self.modeltype_ == 'regression':
        mdl = tree.DecisionTreeRegressor()
    elif self.modeltype_ == 'classification':
        mdl = tree.DecisionTreeClassifier()
    grid_search = GridSearchCV(mdl, param_grid=param_grid, cv=self.cv_)
    grid_search.fit(X, y)
    mdl = grid_search.best_estimator_
    self.__parseTree(mdl)
    self.weight_ = np.ones(len(self.rule_))
Example 16
def main():
    import sys
    import numpy as np
    from sklearn import cross_validation
    from sklearn import svm
    import cPickle

    data_dir = sys.argv[1]
    fet_list = load_list(osp.join(data_dir, 'c3d.list'))
    pos_list = load_list(osp.join(data_dir, 'pos.urls'))
    features = np.load(osp.join(data_dir, 'c3d.npy'))
    fet_set = set(fet_list)
    pos_idx = [fet_list.index(i) for i in pos_list if i in fet_set]
    y = np.zeros(features.shape[0])
    y[pos_idx] = 1
    print 'n_pos', np.sum(y), 'n_neg', np.sum(1 - y)
    params = {'n_estimators': [2, 4, 5, 6, 8, 10, 30]}
    #params = {'n_estimators': [50, 70, 100, 120, 150, 200]}
    clf = grid_search.GridSearchCV(
        RandomForestClassifier(n_estimators=2, n_jobs=4), params,
        scoring=metrics.make_scorer(lambda yt, yp: metrics.f1_score(yt, yp, pos_label=0)),
        cv=5)
    clf.fit(features, y)
    print clf.best_score_
    print clf.best_estimator_
    cPickle.dump(clf.best_estimator_, open(osp.join(data_dir, 'c3d-models-rfc.pkl'), 'w'))
Example 17
def testIrisDNN(self):
    if HAS_SKLEARN:
        random.seed(42)
        iris = datasets.load_iris()
        feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
        classifier = learn.DNNClassifier(
            feature_columns=feature_columns, hidden_units=[10, 20, 10],
            n_classes=3)
        grid_search = GridSearchCV(classifier,
                                   {'hidden_units': [[5, 5], [10, 10]]},
                                   scoring='accuracy',
                                   fit_params={'steps': [50]})
        grid_search.fit(iris.data, iris.target)
        score = accuracy_score(iris.target, grid_search.predict(iris.data))
        self.assertGreater(score, 0.5, 'Failed with score = {0}'.format(score))
Example 18
def plot_kernel_density(col, verbose=True):
    """Plots the kernel density function of a column.

    From:
    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing the plot
    """
    # TODO address passing an entire matrix
    # TODO respect missing_val
    # TODO what does n do?
    col = utils.check_col(col)
    x_grid = np.linspace(min(col), max(col), 1000)
    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': np.linspace(0.1, 1.0, 30)},
                        cv=20)  # 20-fold cross-validation
    grid.fit(col[:, None])
    kde = grid.best_estimator_
    pdf = np.exp(kde.score_samples(x_grid[:, None]))
    fig, ax = plt.subplots()
    ax.plot(x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
    ax.hist(col, 30, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
    ax.legend(loc='upper left')
    ax.set_xlim(min(col), max(col))
    if verbose:
        plt.show()
    return fig
Example 19
def runGridSearch(self, model):
    logging.debug("run grid search on model: {}".format(model.__class__.__name__))
    logging.debug("cross validation strategy: {}".format(model.holdout_split))
    logging.debug("used features: {}".format(model.usedFeatures))
    logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))

    features, labels, cv = model.getFeaturesLabel()
    # do grid search
    if self.do_random_gridsearch:
        estimator = RandomizedSearchCV(model.clf, model.getTunedParamterOptions(),
                                       cv=cv, n_jobs=self.n_jobs,
                                       scoring=mean_absolute_percentage_error_scoring,
                                       verbose=500, n_iter=self.n_iter_randomsearch)
    else:
        estimator = GridSearchCV(model.clf, model.getTunedParamterOptions(),
                                 cv=cv, n_jobs=self.n_jobs,
                                 fit_params=model.get_fit_params(),
                                 scoring=mean_absolute_percentage_error_scoring,
                                 verbose=500)
    estimator.fit(features, labels)
    model.clf = estimator.best_estimator_
    model.save_final_model = True
    model.save_model()
    # model.dispFeatureImportance()
    logging.debug('estimator parameters: {}'.format(estimator.get_params()))
    logging.debug('Best parameters: {}'.format(estimator.best_params_))
    logging.debug('Best Scores: {}'.format(-estimator.best_score_))
    logging.debug('Score grid: {}'.format(estimator.grid_scores_))
    for i in estimator.grid_scores_:
        logging.debug('parameters: {}'.format(i.parameters))
        logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
        logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores)))
    return
Example 20
def GBR_stacking(y_train, X_train, X_test):
    param_grid = {'learning_rate': [0.1, 0.05, 0.01],
                  'max_depth': [2, 3, 4, 5],  # [2, 3, 4, 6],
                  'min_samples_leaf': [1, 2, 3],  # , 5, 7],
                  'max_features': [1.0, 0.5, 0.3, 0.1]}
    est = en.GradientBoostingRegressor(loss='ls', n_estimators=100)
    clf = GridSearchCV(est, param_grid, n_jobs=3, verbose=1, cv=20,
                       scoring=spearman_scoring).fit(X_train, y_train.flatten())
    # clf.fit(X_train, y_train.flatten())
    return clf.predict(X_test)
Example 21
def SVM_stacking(y_train, X_train, X_test):
    parameters = {'kernel': ('linear', 'rbf'),
                  'C': np.linspace(1, 10, 10),
                  'gamma': np.linspace(1e-3, 1., 10)}
    svr = svm.SVR()
    clf = GridSearchCV(svr, parameters, n_jobs=3, verbose=1, cv=10,
                       scoring=spearman_scoring)
    clf.fit(X_train, y_train.flatten())
    return clf.predict(X_test)
Example 22
def shrunk_cov_score(X):
    shrinkages = np.logspace(-2, 0, 30)
    cv = GridSearchCV(ShrunkCovariance(), {'shrinkage': shrinkages})
    return np.mean(cross_val_score(cv.fit(X).best_estimator_, X))
Example 23
def create_classif_search(name_clf, clf_pipeline, nb_labels,
                          search_type='random', cross_val=10,
                          eval_metric='f1', nb_iter=250, nb_workers=5):
    """ create sklearn search depending on spec. random or grid

    :param str name_clf: name of classif.
    :param obj clf_pipeline: object
    :param int nb_labels: number of labels
    :param str search_type: hyper-params search type
    :param obj cross_val: obj specific CV for fixed train-test
    :param str eval_metric: evaluation metric
    :param int nb_iter: for random search, number of tries
    :param int nb_workers: number of jobs running in parallel
    :return:
    """
    score_weight = 'weighted' if nb_labels > 2 else 'binary'
    scoring = metrics.make_scorer(DICT_SCORING[eval_metric.lower()],
                                  average=score_weight)
    if search_type == 'grid':
        clf_parameters = create_clf_param_search_grid(name_clf)
        logging.info('init Grid search...')
        clf_search = GridSearchCV(
            clf_pipeline, clf_parameters, scoring=scoring, cv=cross_val,
            n_jobs=nb_workers, verbose=1, refit=True)
    else:
        clf_parameters = create_clf_param_search_distrib(name_clf)
        nb_iter = search_params_cut_down_max_nb_iter(clf_parameters, nb_iter)
        logging.info('init Randomized search...')
        clf_search = RandomizedSearchCV(
            clf_pipeline, clf_parameters, scoring=scoring, cv=cross_val,
            n_jobs=nb_workers, n_iter=nb_iter, verbose=1, refit=True)
    return clf_search
Example 24
def perform_class(X, y, iterations=1):
    scores = []
    for i in range(iterations):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42 + i)
        parameters = {'C': [0.01, 0.1, 1, 10, 100]}
        clf_acc = GridSearchCV(svm.LinearSVC(), parameters, n_jobs=3, cv=3,
                               refit=True, scoring='accuracy')
        clf_acc.fit(X_train, y_train)
        scores.append([metrics.accuracy_score(y_test, clf_acc.predict(X_test)),
                       metrics.f1_score(y_test, clf_acc.predict(X_test), average='micro')])
    acc = np.mean([x[0] for x in scores]), np.std([x[0] for x in scores])
    mif = np.mean([x[1] for x in scores]), np.std([x[1] for x in scores])
    return acc, mif
Example 25
def __init__(self, mode='adaboost'):
    if mode == 'adaboost':
        clf = GradientBoostingRegressor(
            learning_rate=1,
            n_estimators=1000,
            max_depth=3,
            random_state=0)
    elif mode == 'randomforest':
        clf = RandomForestRegressor(
            n_estimators=10,
            max_depth=None,
            n_jobs=-1)
    elif mode == 'SVM':
        clf = SVC(C=10.0,
                  kernel='linear')
    elif mode == 'vjcascade':
        clf = vjcascade(n_stage=30,
                        n_esti=1,
                        l_rate=1)
    elif mode == 'gridSearch':
        param_grid = [
            {'max_depth': [1, 2, 3], 'loss': ['ls', 'lad']},
        ]
        gbr = GradientBoostingRegressor()
        clf = grid_search.GridSearchCV(gbr, param_grid, n_jobs=-1)
    else:
        raise Exception('no mode named: ' + mode + ' found!')
    self.classifier = clf
    self.mode = mode
Example 26
def test_GridGlobalParams(self):
    clf = GridSearchCV(
        self.__estimator__(layers=[L(self.__output__)], n_iter=1),
        param_grid={'learning_rate': [0.01, 0.001]})
    clf.fit(self.a_in, self.a_out)
Example 27
def test_GridLayerParams(self):
    clf = GridSearchCV(
        self.__estimator__(layers=[L("Rectifier", units=12), L(self.__output__)],
                           n_iter=1),
        param_grid={'hidden0__units': [4, 8, 12]})
    clf.fit(self.a_in, self.a_out)
Example 28
def __init__(self, *models, **kwargs):
    """ Initializes the grid search

    :param list models: List of models to use. Each one should be a tuple
        with a model instance or class and a dictionary for the search space.
    :param kwargs: additional initialization arguments
        for `sklearn.grid_search.GridSearchCV`
    """
    self.models = filter(None, models)
    kwargs['refit'] = True
    self.kwargs = kwargs
Example 29
def fit(self, training_sets):
    """ Searches for the best estimator and its arguments, as well as the best
    training set, amongst those specified.

    :param generator training_sets: Training sets to use. Should be a sequence
        of tuples (x, y, metadata) where x is the training set, y is the
        correct answer for each chunk and metadata contains additional data
        that will be returned back
    :return: the metadata of the training set which yielded the best score,
        the best score obtained by the model, the parameters of the model,
        and the fitted model itself
    :rtype: tuple
    """
    best_training, best_score, best_params, best_model = None, None, None, None
    for i, (metadata, extractor) in enumerate(training_sets):
        for model, grid in self.models:
            assert isclass(model)
            x, y = extractor.get_features(refit=True)
            grid['model_cls'] = [model]
            grid['selector_column'] = [None, extractor.lu_column()]
            search = GridSearchCV(
                FeatureSelectedClassifier(model), param_grid=grid, **self.kwargs
            )
            search.fit(x, y)
            score, params, model = search.best_score_, search.best_params_, search.best_estimator_
            logger.debug('%s with parameters %s and training meta %s has score %s',
                         type(model), params, metadata, score)
            if best_score is None or score > best_score:
                best_training, best_score, best_params, best_model = (x, y, metadata), score, params, model
    return best_training, best_score, best_params, best_model
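A closing note: many of the examples above unpack (params, mean_score, scores) tuples from grid_scores_, which was removed together with the sklearn.grid_search module. A hedged sketch of the equivalent report against the modern cv_results_ dict, assuming grid is any fitted sklearn.model_selection.GridSearchCV:

results = grid.cv_results_
for mean, std, params in zip(results['mean_test_score'],
                             results['std_test_score'],
                             results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))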