-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOutlier_Tests.py
92 lines (73 loc) · 2.89 KB
/
Outlier_Tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 10 15:53:06 2024
@author: jkosnoff
"""
import numpy as np
def IQR_Outlier_test(data):
    """
    A test for outliers based on the interquartile range (1.5 * IQR rule).
    See https://online.stat.psu.edu/stat200/lesson/3/3.2

    Parameters:
    - data: 1-D array-like of numbers, input data. NaN entries are ignored
      when the quartiles are computed and are never flagged as outliers.

    Returns:
    - outliers: list of booleans, one per element of data; True where the
      value lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
      An empty input yields an empty list.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    # NaN-aware percentiles: with np.percentile a single NaN would turn both
    # quartiles into NaN and silently disable the whole test.
    Q1 = np.nanpercentile(values, 25, method='midpoint')
    Q3 = np.nanpercentile(values, 75, method='midpoint')
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR

    # Comparisons against NaN are False, so NaN entries are reported as
    # "not an outlier" while keeping the output aligned with the input.
    outliers = (values > upper_fence) | (values < lower_fence)
    return list(outliers)
def double_mad(x, zero_mad_action="warn"):
    """
    Calculate the Double Median Absolute Deviation (DoubleMAD) for a given array x.
    Converted to Python from Peter Rosenmai's Eureka Statistics blog post
    https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/

    Parameters:
    - x: array-like, input data (NaN entries are dropped before any computation)
    - zero_mad_action: str, determines the action in the event of an MAD of zero.
      Possible values:
        "stop"        - raise ValueError("MAD is 0")
        "warn"        - print "MAD is 0" and replace each zero MAD by 1e-12
        "na"          - replace each zero MAD by NaN
        "warn and na" - print "MAD is 0" and replace each zero MAD by NaN
      Any other value leaves a zero MAD untouched.

    Returns:
    - left_mad: float, median absolute deviation of the values <= median
    - right_mad: float, median absolute deviation of the values >= median

    Raises:
    - ValueError: if either MAD is zero and zero_mad_action is "stop"
    """
    x = np.asarray(x, dtype=float)
    x = x[~np.isnan(x)]
    m = np.median(x)
    abs_dev = np.abs(x - m)
    left_mad = np.median(abs_dev[x <= m])
    right_mad = np.median(abs_dev[x >= m])

    # Record which sides are zero *before* touching the values, so the
    # epsilon nudge below cannot hide a zero from the NaN replacement.
    left_is_zero = left_mad == 0
    right_is_zero = right_mad == 0

    if left_is_zero or right_is_zero:
        if zero_mad_action == "stop":
            raise ValueError("MAD is 0")
        if zero_mad_action in ("warn", "warn and na"):
            print("MAD is 0")
        if zero_mad_action in ("na", "warn and na"):
            if left_is_zero:
                left_mad = np.nan
            if right_is_zero:
                right_mad = np.nan
        elif zero_mad_action == "warn":
            # Keep the result usable as a divisor: nudge only the side(s)
            # that are exactly zero by a tiny epsilon.
            if left_is_zero:
                left_mad = left_mad + 1e-12
            if right_is_zero:
                right_mad = right_mad + 1e-12

    return left_mad, right_mad
def double_mads_from_median(x, zero_mad_action="warn"):
    """
    Calculate the Double Median Absolute Deviation (DoubleMAD) distances from the median for a given array x.
    Converted to Python from Peter Rosenmai's Eureka Statistics blog post
    https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/

    Parameters:
    - x: array-like, input data. NaN entries are ignored when the median and
      the MADs are computed; their own distance is NaN.
    - zero_mad_action: str, determines the action in the event of an MAD of zero.
      Possible values: "stop", "warn", "na", and "warn and na".
      Passed through unchanged to double_mad.

    Returns:
    - mad_distance: array, same length as x, with |x - median| divided by the
      left MAD (values <= median) or the right MAD (values > median).
      Values equal to the median get a distance of exactly 0.
    """
    # Convert once so that comparisons and arithmetic below also work for
    # plain Python lists / tuples, not only for NumPy arrays.
    x = np.asarray(x, dtype=float)
    left_mad, right_mad = double_mad(x, zero_mad_action)

    # double_mad drops NaNs before taking the median; use the NaN-aware
    # median here as well so both functions agree on the centre.
    m = np.nanmedian(x)

    # Per-element scale: right MAD above the median, left MAD otherwise.
    x_mad = np.where(x > m, right_mad, left_mad)

    # The small epsilon guards against division by an exactly-zero MAD.
    mad_distance = np.abs(x - m) / (x_mad + 1e-12)
    mad_distance[x == m] = 0
    return mad_distance