Python source code examples: sklearn.naive_bayes.MultinomialNB()
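MultinomialNB is scikit-learn's naive Bayes classifier for non-negative, count-like features. As a quick orientation before the collected examples, here is a minimal sketch of the basic fit/predict workflow (the toy counts are invented for illustration):

# Minimal sketch of the basic MultinomialNB workflow; the toy counts
# below are invented for illustration only.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[2, 1, 0],
              [0, 1, 3],
              [1, 0, 2]])   # non-negative count features (e.g. word counts)
y = np.array([0, 1, 1])

clf = MultinomialNB(alpha=1.0)  # alpha: additive (Laplace/Lidstone) smoothing
clf.fit(X, y)
print(clf.predict([[0, 2, 4]]))        # predicted class label
print(clf.predict_proba([[0, 2, 4]]))  # per-class probabilities (rows sum to 1)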
Example 1
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # iteritems() is Python 2 only
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
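Because the pipeline steps are named ('all', 'ling', 'tfidf', 'clf'), a caller can override any nested parameter through scikit-learn's double-underscore naming. A hedged usage sketch (the parameter values below are illustrative, not tuned settings):

# Usage sketch for create_union_model; the values are illustrative only.
model = create_union_model({
    'all__tfidf__ngram_range': (1, 2),  # step 'all' -> sub-step 'tfidf'
    'clf__alpha': 0.5,                  # MultinomialNB smoothing strength
})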
Example 2
def test_mnb_prior_unobserved_targets():
    # test smoothing of prior for yet unobserved targets

    # Create toy training data
    X = np.array([[0, 1], [1, 0]])
    y = np.array([0, 1])

    clf = MultinomialNB()

    assert_no_warnings(
        clf.partial_fit, X, y, classes=[0, 1, 2]
    )

    assert clf.predict([[0, 1]]) == 0
    assert clf.predict([[1, 0]]) == 1
    assert clf.predict([[1, 1]]) == 0

    # add a training example with previously unobserved class
    assert_no_warnings(
        clf.partial_fit, [[1, 1]], [2]
    )

    assert clf.predict([[0, 1]]) == 0
    assert clf.predict([[1, 0]]) == 1
    assert clf.predict([[1, 1]]) == 2
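The first partial_fit call must declare every class that can ever occur, because later batches need not contain all of them; until real examples of a class arrive, predictions for it rest only on the smoothed prior. A minimal sketch of the incremental pattern (batch contents invented for illustration):

# Minimal partial_fit sketch; batch contents are invented for illustration.
clf = MultinomialNB()
clf.partial_fit([[1, 0], [0, 1]], [0, 1], classes=[0, 1, 2])  # declare all classes once
clf.partial_fit([[2, 2]], [2])  # later batches may omit `classes`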
Example 3
def test_ovr_fit_predict():
    # A classifier which implements decision_function.
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovr.estimators_), n_classes)

    clf = LinearSVC(random_state=0)
    pred2 = clf.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(np.mean(iris.target == pred), np.mean(iris.target == pred2))

    # A classifier which implements predict_proba.
    ovr = OneVsRestClassifier(MultinomialNB())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert_greater(np.mean(iris.target == pred), 0.65)
Example 4
def test_ovr_multiclass():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "ham", "eggs", "ham"]
    Y = np.array([[0, 0, 1],
                  [0, 1, 0],
                  [1, 0, 0],
                  [0, 0, 1],
                  [1, 0, 0]])

    classes = set("ham eggs spam".split())

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet()):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_array_equal(y_pred, ["eggs"])

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 0, 4]])[0]
        assert_array_equal(y_pred, [0, 0, 1])
Example 5
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = np.array([[0, 1, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [1, 0, 1],
                  [1, 0, 0]])

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet(), Lasso(alpha=0.5)):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert clf.multilabel_
Example 6
def test_ovr_single_label_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

    # Decision function only estimator.
    decision_only = OneVsRestClassifier(svm.SVR(gamma='scale')
                                        ).fit(X_train, Y_train)
    assert not hasattr(decision_only, 'predict_proba')

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)
    # predict assigns a label if the probability that the
    # sample has the label is greater than 0.5.
    pred = np.array([probs.argmax() for probs in Y_proba])
    assert not (pred - Y_pred).any()
Example 7
def trainNB(trainX, trainY, testX, testY, samples, limit):
    # time.clock() was removed in Python 3.8; use time.perf_counter() instead
    start = time.perf_counter()
    clf = MultinomialNB()
    clf.fit(trainX[:samples], trainY[:samples])
    print(time.perf_counter() - start)

    start = time.perf_counter()
    predicted = clf.predict(trainX[0:samples])
    print("percent Trained correct: ", percentCorrect(trainY[:samples], predicted))
    print("f-score: ", f1_score(trainY[:samples], predicted))
    metric = precision_recall_fscore_support(trainY[:samples], predicted)
    print("precision: ", metric[0])
    print("recall: ", metric[1])

    predicted = clf.predict(testX[0:limit])
    print("percent Test correct: ", percentCorrect(testY[:limit], predicted))
    print("f-score: ", f1_score(testY[:limit], predicted))
    metric = precision_recall_fscore_support(testY[:limit], predicted)
    print("precision: ", metric[0])
    print("recall: ", metric[1])
    print(time.perf_counter() - start)
    return clf
Example 8
def fit_naive_bayes(path, saveto=None, cv=12):
    model = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', TfidfVectorizer(tokenizer=identity, lowercase=False)),
        ('clf', MultinomialNB())
    ])

    if saveto is None:
        saveto = "naive_bayes_{}.pkl".format(time.time())

    scores, delta = train_model(path, model, saveto, cv)
    logger.info((
        "naive bayes training took {:0.2f} seconds "
        "with an average score of {:0.3f}"
    ).format(delta, scores.mean()))
Example 9
def __init__(self, df, weight=True, min_ct=0, total_iter=5):
    self.logger = logging.getLogger(__name__)
    super(MultinomialNaiveBayes, self).__init__(total_iterations=total_iter)  # call base constructor
    #self.set_min_count(min_ct)
    self.is_weighted_sample = weight

    # process data
    #df = self._filter_rows(df)  # filter out low count rows
    # row_sums = df.sum(axis=1).astype(float)
    # df = df.div(row_sums, axis=0)  # normalize each row
    # df = df.mul(100)
    # df.to_csv('tmp.nbclf.txt', sep='\t')
    df = df.fillna(df.mean())
    total = df['total']
    df = df[['recurrent missense', 'recurrent indel', 'frame shift',
             'nonsense', 'missense', 'synonymous', 'inframe indel', 'no protein',
             'lost stop', 'splicing mutation']]
    df = df.mul(total, axis=0).astype(int)  # get back counts instead of pct
    self.x, self.y = features.randomize(df)

    # setup classifier
    self.clf = MultinomialNB(alpha=1,         # Laplacian smoothing, i.e. pseudocounts
                             fit_prior=True)  # use data for prior class probs
Example 10
def __init__(self, distributions, weights=None, **kwargs):
    self.models = []
    for dist in distributions:
        dist = NaiveBayesianDistribution.from_string(dist)
        if dist is NaiveBayesianDistribution.GAUSSIAN:
            model = nb.GaussianNB(**kwargs)
        elif dist is NaiveBayesianDistribution.MULTINOMIAL:
            model = nb.MultinomialNB(**kwargs)
        elif dist is NaiveBayesianDistribution.BERNOULLI:
            model = nb.BernoulliNB(**kwargs)
        else:
            raise ValueError('Unknown distribution: {}.'.format(dist))
        kwargs['fit_prior'] = False  # Except the first model.
        self.models.append(model)
    self.weights = weights
Example 11
def test_model_selection_works(self):
    for x, y in self.get_multilabel_data_for_tests('dense'):
        parameters = {
            'classifier': [LabelPowerset(), BinaryRelevance()],
            'clusterer': [RandomLabelSpaceClusterer(None, None, False)],
            'clusterer__cluster_size': list(range(2, 3)),
            'clusterer__cluster_count': [3],
            'clusterer__allow_overlap': [False],
            'classifier__classifier': [MultinomialNB()],
            'classifier__classifier__alpha': [0.7, 1.0],
        }

        clf = GridSearchCV(LabelSpacePartitioningClassifier(), parameters,
                           scoring='f1_macro')
        clf.fit(x, y)

        for p in list(parameters.keys()):
            self.assertIn(p, clf.best_params_)

        self.assertIsNotNone(clf.best_score_)
Example 12
def test_model_calibrated_classifier_cv_float(self):
    data = load_iris()
    X, y = data.data, data.target
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn CalibratedClassifierCVMNB",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnCalibratedClassifierCVFloat",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 13
def test_model_calibrated_classifier_cv_float_nozipmap(self):
    data = load_iris()
    X, y = data.data, data.target
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model, "scikit-learn CalibratedClassifierCVMNB",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET,
        options={id(model): {'zipmap': False}})
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32), model, model_onnx,
        basename="SklearnCalibratedClassifierCVFloatNoZipMap",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')")
Example 14
def test_model_calibrated_classifier_cv_int(self):
    data = load_digits()
    X, y = data.data, data.target
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn CalibratedClassifierCVMNB",
        [("input", Int64TensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnCalibratedClassifierCVInt-Dec4",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 15
def test_model_calibrated_classifier_cv_binary(self):
    data = load_iris()
    X, y = data.data, data.target
    y[y > 1] = 1
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn CalibratedClassifierCV",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnCalibratedClassifierCVBinaryMNB",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 16
def test_same_prediction(self):
    X, y, Z = self.make_classification(4, 100000, nonnegative=True)

    local = MultinomialNB()
    dist = SparkMultinomialNB()

    y_local = local.fit(X, y).predict(X)
    y_dist = dist.fit(Z, classes=np.unique(y)).predict(Z[:, 'X'])
    y_converted = dist.to_scikit().predict(X)

    assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
    assert_array_almost_equal(y_local, y_dist.toarray())
    assert_array_almost_equal(y_local, y_converted)

    y_proba_local = local.fit(X, y).predict_proba(X)
    y_proba_dist = dist.fit(Z, classes=np.unique(y)).predict_proba(Z[:, 'X'])
    y_proba_converted = dist.to_scikit().predict_proba(X)

    # check the probability RDD (the original re-checked y_dist here)
    assert_true(check_rdd_dtype(y_proba_dist, (np.ndarray,)))
    assert_array_almost_equal(y_proba_local, y_proba_dist.toarray(), 5)
    assert_array_almost_equal(y_proba_local, y_proba_converted, 5)
Example 17
def test_same_result(self):
    X, y, Z = self.make_classification(2, 40000, nonnegative=True)

    parameters = {'alpha': [0.1, 1, 10]}
    fit_params = {'classes': np.unique(y)}

    local_estimator = MultinomialNB()
    local_grid = GridSearchCV(estimator=local_estimator,
                              param_grid=parameters)

    estimator = SparkMultinomialNB()
    grid = SparkGridSearchCV(estimator=estimator,
                             param_grid=parameters,
                             fit_params=fit_params)

    local_grid.fit(X, y)
    grid.fit(Z)

    locscores = [r.mean_validation_score for r in local_grid.grid_scores_]
    scores = [r.mean_validation_score for r in grid.grid_scores_]

    assert_array_almost_equal(locscores, scores, decimal=2)
Example 18
def _fit_meta_classifier(self, X_meta, y_meta):
    """Train the meta-classifier :math:`\\lambda`, using
    the meta-training dataset.

    Parameters
    ----------
    X_meta : array of shape = [n_meta_examples, n_meta_features]
        The meta-training examples.

    y_meta : array of shape = [n_meta_examples]
        Class labels of each example in X_meta: 1 if the base
        classifier made the correct prediction, 0 otherwise.
    """
    if isinstance(self.meta_classifier_, MultinomialNB):
        # Digitize the data (same implementation we have on PRTools)
        X_meta = np.digitize(X_meta, np.linspace(0.1, 1, 10))

    self.meta_classifier_.fit(X_meta, y_meta)
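MultinomialNB expects non-negative, count-like inputs, so the continuous meta-features in [0, 1] are first mapped to integer bin indices. A small sketch of what the np.digitize call above yields (the input probabilities are invented for illustration):

# Sketch of the binning step above; the input values are invented.
import numpy as np

X_meta = np.array([[0.05, 0.50, 0.95]])  # hypothetical meta-features
bins = np.linspace(0.1, 1, 10)           # bin edges 0.1, 0.2, ..., 1.0
print(np.digitize(X_meta, bins))         # [[0 5 9]] -- integer bin indices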
Example 19
def test_discretenb_pickle():
    # Test picklability of discrete naive Bayes classifiers

    for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
        clf = cls().fit(X2, y2)
        y_pred = clf.predict(X2)

        store = BytesIO()
        pickle.dump(clf, store)

        clf = pickle.load(BytesIO(store.getvalue()))
        assert_array_equal(y_pred, clf.predict(X2))

        if cls is not GaussianNB:
            # TODO re-enable me when partial_fit is implemented for GaussianNB

            # Test pickling of estimator trained with partial_fit
            clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2))
            clf2.partial_fit(X2[3:], y2[3:])
            store = BytesIO()
            pickle.dump(clf2, store)

            clf2 = pickle.load(BytesIO(store.getvalue()))
            assert_array_equal(y_pred, clf2.predict(X2))
Example 20
def test_discretenb_provide_prior_with_partial_fit():
    # Test whether discrete NB classes use provided prior
    # when using partial_fit

    iris = load_iris()
    iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=415)

    for cls in [BernoulliNB, MultinomialNB]:
        for prior in [None, [0.3, 0.3, 0.4]]:
            clf_full = cls(class_prior=prior)
            clf_full.fit(iris.data, iris.target)

            clf_partial = cls(class_prior=prior)
            clf_partial.partial_fit(iris_data1, iris_target1,
                                    classes=[0, 1, 2])
            clf_partial.partial_fit(iris_data2, iris_target2)

            assert_array_almost_equal(clf_full.class_log_prior_,
                                      clf_partial.class_log_prior_)
Example 21
def script_run():
    # generate keywords
    kw_list = build_key_word("train.txt")
    # save the keyword list
    fp = open("new_word.txt", encoding="utf-8", mode="w")
    for word in kw_list:
        fp.write(word + "\n")
    fp.close()
    # kw_list = load_key_words("word.txt")
    feature, label = get_feature("train.txt", kw_list)
    gnb = MultinomialNB()  # multinomial naive Bayes
    gnb = gnb.fit(feature, label)
    joblib.dump(gnb, 'model/gnb.model')
    print("training finished")
Example 22
def train_expert(history_context, history_action):
    n_round = len(history_context)
    history_context = np.array([history_context[t] for t in range(n_round)])
    history_action = np.array([history_action[t] for t in range(n_round)])
    logreg = OneVsRestClassifier(LogisticRegression())
    mnb = OneVsRestClassifier(MultinomialNB())
    logreg.fit(history_context, history_action)
    mnb.fit(history_context, history_action)
    return [logreg, mnb]
Example 23
def train_expert(action_context):
    logreg = OneVsRestClassifier(LogisticRegression())
    mnb = OneVsRestClassifier(MultinomialNB())
    logreg.fit(action_context.iloc[:, 2:], action_context.iloc[:, 1])
    mnb.fit(action_context.iloc[:, 2:], action_context.iloc[:, 1])
    return [logreg, mnb]
Example 24
def create_ngram_model(params=None):
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Example 25
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # iteritems() is Python 2 only
            tweet = re.sub(r, repl, tweet)
        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Example 26
def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline
Example 27
def test_discrete_prior():
    # Test whether class priors are properly set.
    for cls in [BernoulliNB, MultinomialNB]:
        clf = cls().fit(X2, y2)
        assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0),
                                  clf.class_log_prior_, 8)
Example 28
def test_discretenb_predict_proba():
    # Test discrete NB classes' probability scores

    # The 100s below distinguish Bernoulli from multinomial.
    # FIXME: write a test to show this.
    X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
    X_multinomial = [[0, 1], [1, 3], [4, 0]]

    # test binary case (1-d output)
    y = [0, 0, 2]  # 2 is regression test for binary case, 02e673
    for cls, X in zip([BernoulliNB, MultinomialNB],
                      [X_bernoulli, X_multinomial]):
        clf = cls().fit(X, y)
        assert_equal(clf.predict(X[-1:]), 2)
        assert_equal(clf.predict_proba([X[0]]).shape, (1, 2))
        assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1),
                                  np.array([1., 1.]), 6)

    # test multiclass case (2-d output, must sum to one)
    y = [0, 1, 2]
    for cls, X in zip([BernoulliNB, MultinomialNB],
                      [X_bernoulli, X_multinomial]):
        clf = cls().fit(X, y)
        assert_equal(clf.predict_proba(X[0:1]).shape, (1, 3))
        assert_equal(clf.predict_proba(X[:2]).shape, (2, 3))
        assert_almost_equal(np.sum(clf.predict_proba([X[1]])), 1)
        assert_almost_equal(np.sum(clf.predict_proba([X[-1]])), 1)
        assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1)
        assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
Example 29
def test_sample_weight_mnb():
    clf = MultinomialNB()
    clf.fit([[1, 2], [1, 2], [1, 0]],
            [0, 0, 1],
            sample_weight=[1, 1, 4])
    assert_array_equal(clf.predict([[1, 0]]), [1])
    positive_prior = np.exp(clf.intercept_[0])
    assert_array_almost_equal([1 - positive_prior, positive_prior],
                              [1 / 3., 2 / 3.])
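The expected prior in the final assertion follows directly from the sample weights: class 0 carries total weight 1 + 1 = 2 and class 1 carries weight 4, so P(y=1) = 4/6 = 2/3. A quick check of that arithmetic:

# Weighted class priors behind the assertion above.
w0 = 1 + 1  # total sample weight for class 0
w1 = 4      # total sample weight for class 1
print(w0 / (w0 + w1), w1 / (w0 + w1))  # 0.333..., 0.666...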
Example 30
def test_check_accuracy_on_digits():
    # Non-regression test to make sure that any further refactoring / optim
    # of the NB models does not harm the performance on a slightly
    # non-linearly separable dataset
    digits = load_digits()
    X, y = digits.data, digits.target
    binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
    assert_greater(scores.mean(), 0.86)

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.94)

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
    assert_greater(scores.mean(), 0.83)

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.92)

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
    assert_greater(scores.mean(), 0.77)

    scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10)
    assert_greater(scores.mean(), 0.89)

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.86)