Python源码示例:sklearn.neighbors.LocalOutlierFactor()
示例1
def test_lof():
    """Check LOF scores and labels on a toy set whose last two points are outliers."""
    # Six clustered points followed by the two outliers.
    samples = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
               [5, 3], [-4, 2]]
    expected_labels = 6 * [1] + 2 * [-1]

    # Fit with the default contamination and read back the training scores.
    lof = neighbors.LocalOutlierFactor(n_neighbors=5)
    lof.fit(samples)
    scores = lof.negative_outlier_factor_
    assert_array_equal(lof._fit_X, samples)

    # The lowest inlier score must still exceed the highest outlier score.
    lowest_inlier = np.min(scores[:-2])
    highest_outlier = np.max(scores[-2:])
    assert_greater(lowest_inlier, highest_outlier)

    # With contamination=0.25 the last two samples must be labelled -1,
    # both through the private _predict() and through fit_predict().
    lof = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5)
    lof = lof.fit(samples)
    assert_array_equal(lof._predict(), expected_labels)
    assert_array_equal(lof.fit_predict(samples), expected_labels)
示例2
def test_lof_values():
    """Compare LOF scores with hand-computed values on a three-point set."""
    train = [[1, 1], [1, 2], [2, 1]]

    # One detector with an explicit contamination, one with the default.
    explicit = neighbors.LocalOutlierFactor(n_neighbors=2,
                                            contamination=0.1,
                                            novelty=True)
    explicit = explicit.fit(train)
    default = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True)
    default = default.fit(train)
    detectors = (explicit, default)

    # Expected (positive) outlier factors.
    root2 = sqrt(2.)
    s_0 = 2. * root2 / (1. + root2)
    s_1 = (1. + root2) * (1. / (4. * root2) + 1. / (2. + 2. * root2))

    # Scores of the training samples.
    for det in detectors:
        assert_array_almost_equal(-det.negative_outlier_factor_,
                                  [s_0, s_1, s_1])

    # Score of one sample that is not in the training set.
    for det in detectors:
        assert_array_almost_equal(-det.score_samples([[2., 2.]]), [s_0])

    # Score of one sample that is already in the training set.
    for det in detectors:
        assert_array_almost_equal(-det.score_samples([[1., 1.]]), [s_1])
示例3
def test_lof_precomputed(random_state=42):
    """Check that a precomputed distance matrix gives the same predictions
    as the raw feature matrix (novelty mode)."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    train = rng.random_sample((10, 4))
    query = rng.random_sample((3, 4))
    dist_train = metrics.pairwise_distances(train, metric='euclidean')
    dist_query = metrics.pairwise_distances(query, train, metric='euclidean')

    # Detector fed with features (n_samples by n_features).
    by_features = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    by_features.fit(train)
    feat_on_train = by_features._predict()
    feat_on_query = by_features.predict(query)

    # Detector fed with a dense distance matrix (n_samples by n_samples).
    by_distances = neighbors.LocalOutlierFactor(n_neighbors=3,
                                                algorithm='brute',
                                                metric='precomputed',
                                                novelty=True)
    by_distances.fit(dist_train)
    dist_on_train = by_distances._predict()
    dist_on_query = by_distances.predict(dist_query)

    assert_array_almost_equal(feat_on_train, dist_on_train)
    assert_array_almost_equal(feat_on_query, dist_on_query)
示例4
def test_novelty_training_scores():
    """Training scores exposed via negative_outlier_factor_ must be the same
    whether or not novelty=True is requested."""
    data = iris.data

    # Default construction (no novelty argument).
    plain = neighbors.LocalOutlierFactor()
    plain.fit(data)
    plain_scores = plain.negative_outlier_factor_

    # Same data, novelty=True.
    novel = neighbors.LocalOutlierFactor(novelty=True)
    novel.fit(data)
    novel_scores = novel.negative_outlier_factor_

    assert_array_almost_equal(plain_scores, novel_scores)
示例5
def test_hasattr_prediction():
    """Check which prediction methods are exposed for each novelty setting."""
    samples = [[1, 1], [1, 2], [2, 1]]
    novelty_only = ('predict', 'decision_function', 'score_samples')

    # novelty=True: the three scoring methods exist, fit_predict does not.
    lof = neighbors.LocalOutlierFactor(novelty=True)
    lof.fit(samples)
    for method_name in novelty_only:
        assert hasattr(lof, method_name)
    assert not hasattr(lof, 'fit_predict')

    # novelty=False: only fit_predict exists.
    lof = neighbors.LocalOutlierFactor(novelty=False)
    lof.fit(samples)
    assert hasattr(lof, 'fit_predict')
    for method_name in novelty_only:
        assert not hasattr(lof, method_name)
示例6
def test_lof():
    """Check LOF scores and private-predict labels on a toy set whose last
    two points are outliers."""
    # Six clustered points followed by the two outliers.
    samples = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
               [5, 3], [-4, 2]]

    # Fit with the default contamination and read back the training scores.
    lof = neighbors.LocalOutlierFactor(n_neighbors=5)
    lof.fit(samples)
    scores = lof.negative_outlier_factor_
    assert_array_equal(lof._fit_X, samples)

    # The lowest inlier score must still exceed the highest outlier score.
    lowest_inlier = np.min(scores[:-2])
    highest_outlier = np.max(scores[-2:])
    assert_greater(lowest_inlier, highest_outlier)

    # With contamination=0.25 the last two samples must be labelled -1.
    lof = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5)
    lof = lof.fit(samples)
    expected_labels = 6 * [1] + 2 * [-1]
    assert_array_equal(lof._predict(), expected_labels)
示例7
def test_lof_performance():
    """LOF must separate uniform outliers from a Gaussian cloud (ROC AUC > .99)."""
    rng = check_random_state(2)

    # 120 Gaussian points: the first 100 train, the last 20 are normal tests.
    cloud = 0.3 * rng.randn(120, 2)
    train = cloud[:100]
    held_out = cloud[100:]

    # 20 abnormal observations drawn uniformly from [-4, 4).
    anomalies = rng.uniform(low=-4, high=4, size=(20, 2))
    test_points = np.r_[held_out, anomalies]
    labels = np.array([0] * 20 + [1] * 20)

    lof = neighbors.LocalOutlierFactor()
    lof = lof.fit(train)

    # Negate so that a lower value means "more normal".
    anomaly_scores = -lof._decision_function(test_points)

    auc = roc_auc_score(labels, anomaly_scores)
    assert_greater(auc, .99)
示例8
def test_lof_precomputed(random_state=42):
    """Check that a precomputed distance matrix gives the same predictions
    as the raw feature matrix (private _predict API)."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    train = rng.random_sample((10, 4))
    query = rng.random_sample((3, 4))
    dist_train = metrics.pairwise_distances(train, metric='euclidean')
    dist_query = metrics.pairwise_distances(query, train, metric='euclidean')

    # Detector fed with features (n_samples by n_features).
    by_features = neighbors.LocalOutlierFactor(n_neighbors=3)
    by_features.fit(train)
    feat_on_train = by_features._predict()
    feat_on_query = by_features._predict(query)

    # Detector fed with a dense distance matrix (n_samples by n_samples).
    by_distances = neighbors.LocalOutlierFactor(n_neighbors=3,
                                                algorithm='brute',
                                                metric='precomputed')
    by_distances.fit(dist_train)
    dist_on_train = by_distances._predict()
    dist_on_query = by_distances._predict(dist_query)

    assert_array_almost_equal(feat_on_train, dist_on_train)
    assert_array_almost_equal(feat_on_query, dist_on_query)
示例9
def fit(self, X, y=None):
    """Fit the wrapped LocalOutlierFactor detector on X.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : Ignored
        Described as unused and present for API consistency; it is
        nevertheless forwarded to ``_set_n_classes`` and to the underlying
        detector's ``fit``.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    # Validate X, then let the base class record class information from y.
    X = check_array(X)
    self._set_n_classes(y)

    # Mirror this wrapper's hyper-parameters onto the underlying detector.
    lof_params = dict(
        n_neighbors=self.n_neighbors,
        algorithm=self.algorithm,
        leaf_size=self.leaf_size,
        metric=self.metric,
        p=self.p,
        metric_params=self.metric_params,
        contamination=self.contamination,
        n_jobs=self.n_jobs,
    )
    self.detector_ = LocalOutlierFactor(**lof_params)
    self.detector_.fit(X=X, y=y)

    # Invert the detector's scores so that outliers get the higher values.
    raw_scores = self.detector_.negative_outlier_factor_
    self.decision_scores_ = invert_order(raw_scores)
    self._process_decision_scores()
    return self
示例10
def test_lof_performance():
    """In novelty mode, LOF must separate uniform outliers from a Gaussian
    cloud (ROC AUC > .99)."""
    rng = check_random_state(2)

    # 120 Gaussian points: the first 100 train, the last 20 are normal tests.
    cloud = 0.3 * rng.randn(120, 2)
    train = cloud[:100]
    held_out = cloud[100:]

    # 20 abnormal novel observations drawn uniformly from [-4, 4).
    anomalies = rng.uniform(low=-4, high=4, size=(20, 2))
    test_points = np.r_[held_out, anomalies]
    labels = np.array([0] * 20 + [1] * 20)

    # Fit for novelty detection so decision_function() is available.
    lof = neighbors.LocalOutlierFactor(novelty=True)
    lof = lof.fit(train)

    # Negate so that a lower value means "more normal".
    anomaly_scores = -lof.decision_function(test_points)

    auc = roc_auc_score(labels, anomaly_scores)
    assert_greater(auc, .99)
示例11
def test_n_neighbors_attribute():
    """An oversized n_neighbors must be reduced to n_samples - 1, with a warning."""
    data = iris.data
    requested = 500

    # The fitted attribute is capped at n_samples - 1.
    lof = neighbors.LocalOutlierFactor(n_neighbors=requested)
    lof = lof.fit(data)
    assert_equal(lof.n_neighbors_, data.shape[0] - 1)

    # Fitting a fresh estimator emits the matching UserWarning.
    lof = neighbors.LocalOutlierFactor(n_neighbors=requested)
    assert_warns_message(UserWarning,
                         "n_neighbors will be set to (n_samples - 1)",
                         lof.fit, data)
    assert_equal(lof.n_neighbors_, data.shape[0] - 1)
示例12
def test_contamination():
    """Fitting with contamination=0.6 must raise ValueError."""
    two_points = [[1, 1], [1, 0]]
    lof = neighbors.LocalOutlierFactor(contamination=0.6)
    assert_raises(ValueError, lof.fit, two_points)
示例13
def test_novelty_errors():
    """Accessing a method unavailable for the current novelty setting must
    raise AttributeError with an explanatory message."""
    data = iris.data

    # Default construction: predict, decision_function and score_samples
    # must each raise AttributeError on attribute access.
    lof = neighbors.LocalOutlierFactor()
    lof.fit(data)
    template = '{} is not available when novelty=False'
    for method_name in ['predict', 'decision_function', 'score_samples']:
        expected_msg = template.format(method_name)
        assert_raises_regex(AttributeError, expected_msg,
                            getattr, lof, method_name)

    # novelty=True: fit_predict must raise AttributeError on attribute access.
    lof = neighbors.LocalOutlierFactor(novelty=True)
    expected_msg = 'fit_predict is not available when novelty=True'
    assert_raises_regex(AttributeError, expected_msg,
                        getattr, lof, 'fit_predict')
示例14
def test_novelty_true_common_tests():
    """Run the common estimator checks on LOF configured with novelty=True."""
    # The common tests otherwise only cover the default LOF (novelty=False).
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    check_estimator(novelty_lof)
示例15
def test_predicted_outlier_number():
    """The number of predicted outliers should match the contamination-implied
    count unless the abnormality scores contain ties."""
    data = iris.data
    expected_outliers = 30
    contamination = float(expected_outliers) / data.shape[0]

    lof = neighbors.LocalOutlierFactor(contamination=contamination)
    labels = lof.fit_predict(data)

    # Every label other than 1 counts as a predicted outlier.
    flagged = np.sum(labels != 1)
    if flagged == expected_outliers:
        return

    # On a mismatch, delegate to the helper that inspects the training scores.
    check_outlier_corruption(flagged, expected_outliers,
                             lof.negative_outlier_factor_)
示例16
def _fit(self, X):
    """Build a LocalOutlierFactor from this object's hyper-parameters, fit it
    on X, store it as ``estimator_`` and return self."""
    lof_params = {
        'algorithm': self.algorithm,
        'contamination': self.contamination,
        'leaf_size': self.leaf_size,
        'metric': self.metric,
        'novelty': self.novelty,
        'n_jobs': self.n_jobs,
        'n_neighbors': self.n_neighbors,
        'p': self.p,
        'metric_params': self.metric_params,
    }
    detector = LocalOutlierFactor(**lof_params)
    # Store whatever fit() returns, exactly as a chained call would.
    self.estimator_ = detector.fit(X)
    return self
示例17
def _get_pipeline(self):
    """Return the (name, step) pairs: a standard scaler followed by a
    novelty-mode LocalOutlierFactor."""
    scaler = StandardScaler()
    model = LocalOutlierFactor(n_jobs=self.conf.n_jobs, novelty=True)
    return [('scaler', scaler), ('model', model)]
示例18
def test_lof_values():
    """Compare LOF scores with hand-computed values on a three-point set
    (private _decision_function API)."""
    train = [[1, 1], [1, 2], [2, 1]]
    lof = neighbors.LocalOutlierFactor(n_neighbors=2)
    lof = lof.fit(train)

    # Expected (positive) outlier factors.
    root2 = sqrt(2.)
    s_0 = 2. * root2 / (1. + root2)
    s_1 = (1. + root2) * (1. / (4. * root2) + 1. / (2. + 2. * root2))

    # Scores of the training samples.
    assert_array_almost_equal(-lof.negative_outlier_factor_, [s_0, s_1, s_1])

    # Score of one sample that is not in the training set.
    unseen = [[2., 2.]]
    assert_array_almost_equal(-lof._decision_function(unseen), [s_0])

    # Score of one sample that is already in the training set.
    seen = [[1., 1.]]
    assert_array_almost_equal(-lof._decision_function(seen), [s_1])
示例19
def test_n_neighbors_attribute():
    """n_neighbors larger than the data set must be reduced to n_samples - 1,
    and fitting must warn about it."""
    data = iris.data

    # First fit: only the resulting attribute is checked.
    first = neighbors.LocalOutlierFactor(n_neighbors=500)
    first = first.fit(data)
    assert_equal(first.n_neighbors_, data.shape[0] - 1)

    # Second fit on a fresh estimator: the UserWarning text is checked too.
    second = neighbors.LocalOutlierFactor(n_neighbors=500)
    warning_text = "n_neighbors will be set to (n_samples - 1)"
    assert_warns_message(UserWarning, warning_text, second.fit, data)
    assert_equal(second.n_neighbors_, data.shape[0] - 1)
示例20
def remove_outliers(object_points):
    """Drop points whose LOF outlier score is above the median in either view.

    For inputs with more than 100 entries, a LocalOutlierFactor
    (n_neighbors=20) is fitted separately on ``object_points[:, 0]`` and
    ``object_points[:, 1]``. A point is marked for removal when its rescaled
    score exceeds the median score in at least one of the two fits. The
    filtered array is returned only if more than 10 points survive.

    Parameters
    ----------
    object_points : array-like
        Points to filter. NOTE(review): the indexing assumes a NumPy array
        with at least three dimensions, e.g. (n_points, 2, n_coords), so that
        each ``[:, k]`` slice is a 2-D sample matrix -- confirm with callers.

    Returns
    -------
    The filtered points, or ``object_points`` unchanged when it has 100 or
    fewer entries or when 10 or fewer points would remain after filtering.
    """
    # Small inputs are returned untouched.
    if len(object_points) <= 100:
        return object_points

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the dtype.
    mask = np.zeros(len(object_points), dtype=bool)

    # Fit the model for outlier detection (default mode) on each view.
    for points in (object_points[:, 0], object_points[:, 1]):
        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit_predict(points)
        scores = clf.negative_outlier_factor_
        # Rescale so the largest negative_outlier_factor_ maps to 0 and the
        # smallest maps to 1, i.e. a higher value means more outlying.
        scores = (scores.max() - scores) / (scores.max() - scores.min())
        # Vectorized comparison replaces the per-index list comprehension.
        mask = np.logical_or(scores > np.median(scores), mask)

    kept = object_points[np.logical_not(mask)]
    # Only accept the filtered set when enough points survive.
    if len(kept) > 10:
        return kept
    return object_points