remove some unused dependencies in metrics

xuhongzuo 2023-11-10 13:24:08 +08:00
parent 254b7f19b1
commit c2c7566d63
5 changed files with 2 additions and 561 deletions


@@ -9,11 +9,8 @@ module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
sys.path.append(module_path)
from deepod.metrics.vus.utils.slidingWindows import find_length
from deepod.metrics.vus.utils.metrics import metricor
from deepod.metrics.vus.models.distance import Fourier
from deepod.metrics.vus.models.feature import Window
def generate_new_label(label,lag):


@@ -1,188 +0,0 @@
import numpy as np
import math
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
sys.path.append(module_path)
from deepod.metrics.vus.utils.slidingWindows import find_length
from deepod.metrics.vus.utils.metrics import metricor
from deepod.metrics.vus.models.distance import Fourier
from deepod.metrics.vus.models.feature import Window
from deepod.metrics.vus.models.cnn import cnn
from deepod.metrics.vus.models.AE_mlp2 import AE_MLP2
from deepod.metrics.vus.models.lstm import lstm
from deepod.metrics.vus.models.ocsvm import OCSVM
from deepod.metrics.vus.models.poly import POLY
from deepod.metrics.vus.models.pca import PCA
from deepod.metrics.vus.models.norma import NORMA
from deepod.metrics.vus.models.matrix_profile import MatrixProfile
from deepod.metrics.vus.models.lof import LOF
from deepod.metrics.vus.models.iforest import IForest
def find_section_length(label,length):
best_i = None
best_sum = None
current_subseq = False
for i in range(len(label)):
changed = False
if label[i] == 1:
if current_subseq == False:
current_subseq = True
if best_i is None:
changed = True
best_i = i
best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)])
else:
if np.sum(label[max(0,i-200):min(len(label),i+9800)]) < best_sum:
changed = True
best_i = i
best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)])
else:
changed = False
if changed:
diff = i+9800 - len(label)
pos1 = max(0,i-200 - max(0,diff))
pos2 = min(i+9800,len(label))
else:
current_subseq = False
if best_i is not None:
return best_i-pos1,(pos1,pos2)
else:
return None,None
def generate_data(filepath,init_pos,max_length):
df = pd.read_csv(filepath, header=None).to_numpy()
name = filepath.split('/')[-1]
#max_length = 30000
data = df[init_pos:init_pos+max_length,0].astype(float)
label = df[init_pos:init_pos+max_length,1]
pos_first_anom,pos = find_section_length(label,max_length)
data = df[pos[0]:pos[1],0].astype(float)
label = df[pos[0]:pos[1],1]
slidingWindow = find_length(data)
#slidingWindow = 70
X_data = Window(window = slidingWindow).convert(data).to_numpy()
data_train = data[:int(0.1*len(data))]
data_test = data
X_train = Window(window = slidingWindow).convert(data_train).to_numpy()
X_test = Window(window = slidingWindow).convert(data_test).to_numpy()
return pos_first_anom,slidingWindow,data,X_data,data_train,data_test,X_train,X_test,label
def compute_score(methods,slidingWindow,data,X_data,data_train,data_test,X_train,X_test):
methods_scores = {}
for method in methods:
start_time = time.time()
if method == 'IForest':
clf = IForest(n_jobs=1)
x = X_data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'LOF':
clf = LOF(n_neighbors=20, n_jobs=1)
x = X_data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'MatrixProfile':
clf = MatrixProfile(window = slidingWindow)
x = data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'NormA':
clf = NORMA(pattern_length = slidingWindow, nm_size=3*slidingWindow)
x = data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*((slidingWindow-1)//2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'PCA':
clf = PCA()
x = X_data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'POLY':
clf = POLY(power=3, window = slidingWindow)
x = data
clf.fit(x)
measure = Fourier()
measure.detector = clf
measure.set_param()
clf.decision_function(measure=measure)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'OCSVM':
X_train_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_train.T).T
X_test_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_test.T).T
clf = OCSVM(nu=0.05)
clf.fit(X_train_, X_test_)
score = clf.decision_scores_
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'LSTM':
clf = lstm(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 50, patience = 5, verbose=0)
clf.fit(data_train, data_test)
measure = Fourier()
measure.detector = clf
measure.set_param()
clf.decision_function(measure=measure)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'AE':
clf = AE_MLP2(slidingWindow = slidingWindow, epochs=100, verbose=0)
clf.fit(data_train, data_test)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'CNN':
clf = cnn(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 100, patience = 5, verbose=0)
clf.fit(data_train, data_test)
measure = Fourier()
measure.detector = clf
measure.set_param()
clf.decision_function(measure=measure)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
#end_time = time.time()
#time_exec = end_time - start_time
#print(method,"\t time: {}".format(time_exec))
methods_scores[method] = score
return methods_scores
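
The same post-processing pattern repeats for every window-based detector above: the raw window scores are min-max scaled and then edge-padded back to the length of the original series. A minimal sketch of that padding (toy sizes assumed, not part of the original file):

import math
import numpy as np

n, w = 20, 5                                    # series length and sliding window
window_scores = np.random.rand(n - w + 1)       # e.g. clf.decision_scores_, one score per window
padded = np.array([window_scores[0]] * math.ceil((w - 1) / 2)
                  + list(window_scores)
                  + [window_scores[-1]] * ((w - 1) // 2))
assert len(padded) == n                         # scores are aligned with the raw series again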


@@ -3,18 +3,8 @@
"""
import numpy as np
# import matplotlib.pyplot as plt
# import random
from arch import arch_model
# import pandas as pd
import math
# import pmdarima as pm
# from pmdarima import model_selection
# import os
# import dis
# import statistics
# from sklearn import metrics
# import sklearn
class Euclidean:


@@ -1,359 +0,0 @@
# -*- coding: utf-8 -*-
"""Classes of feature mapping for model type B
"""
import numpy as np
# import matplotlib.pyplot as plt
# import random
# from arch import arch_model
import pandas as pd
import math
# import pmdarima as pm
# from pmdarima import model_selection
# import os
# import dis
# import statistics
# from sklearn import metrics
# import sklearn
from tsfresh import extract_features
from statsmodels.tsa.seasonal import seasonal_decompose
# import itertools
# import functools
import warnings
from builtins import range
# from collections import defaultdict
from numpy.linalg import LinAlgError
# from scipy.signal import cwt, find_peaks_cwt, ricker, welch
# from scipy.stats import linregress
# from statsmodels.tools.sm_exceptions import MissingDataError
with warnings.catch_warnings():
# Ignore warnings of the patsy package
warnings.simplefilter("ignore", DeprecationWarning)
from statsmodels.tsa.ar_model import AR
# from statsmodels.tsa.stattools import acf, adfuller, pacf
from hurst import compute_Hc
class Window:
""" The class for rolling window feature mapping.
The mapping converts the original timeseries X into a matrix.
The matrix consists of rows of sliding windows of original X.
"""
def __init__(self, window = 100):
self.window = window
self.detector = None
def convert(self, X):
n = self.window
X = pd.Series(X)
L = []
if n == 0:
df = X
else:
for i in range(n):
L.append(X.shift(i))
df = pd.concat(L, axis = 1)
df = df.iloc[n-1:]
return df
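A quick worked example of the rolling-window mapping above (editor sketch, toy series assumed): each output row holds the current value followed by the window-1 previous values.

import numpy as np
import pandas as pd

X, n = pd.Series([0., 1., 2., 3., 4.]), 3
print(pd.concat([X.shift(i) for i in range(n)], axis=1).iloc[n - 1:].to_numpy())
# [[2. 1. 0.]
#  [3. 2. 1.]
#  [4. 3. 2.]]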
class tf_Stat:
'''Statistical feature extraction using the tsfresh package.
It calculates 763 features in total, so it may be overly complicated for some models.
Recommended for methods like Isolation Forest, which randomly pick a feature
and then perform the classification. To use it with distance-based models like KNN,
LOF, CBLOF, etc., first pass a function that weights individual features so that
inconsequential features won't cloud the important ones (mean, variance, kurtosis, etc.).
'''
def __init__(self, window = 100, step = 25):
self.window = window
self.step = step
self.detector = None
def convert(self, X):
window = self.window
step = self.step
pos = math.ceil(window/2)
#step <= window
length = X.shape[0]
Xd = pd.DataFrame(X)
Xd.columns = pd.Index(['x'], dtype='object')
Xd['id'] = 1
Xd['time'] = Xd.index
test = np.array(extract_features(Xd.iloc[0+pos-math.ceil(window/2):0+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0))
M = np.zeros((length - window, test.shape[1]+1 ))
i = 0
while i + window <= M.shape[0]:
M[i:i+step, 0]= X[pos + i: pos + i + step]
vector = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):i+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0))
M[i:i+step, 1:] = vector
i+= step
num = M.shape[0]
if i < num:
M[i: num, 0]= X[pos + i: pos + num]
M[i: num, 1:] = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0))
return M
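A minimal sketch of the tsfresh call used above (editor example, toy dataframe assumed): extract_features expects a long-format frame with an id column and a sort column, and returns one row of features per id.

import pandas as pd
from tsfresh import extract_features

Xd = pd.DataFrame({"x": [0.1, 0.5, 0.2, 0.9, 0.4], "id": 1, "time": range(5)})
feats = extract_features(Xd, column_id="id", column_sort="time").fillna(0)
print(feats.shape)   # (1, number_of_extracted_features)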
class Stat:
'''Statistical feature extraction.
Features include [mean, variance, skewness, kurtosis, autocorrelation, maximum,
minimum, entropy, seasonality, Hurst exponent, AR coefficients]
'''
def __init__(self, window = 100, data_step = 10, param = [{"coeff": 0, "k": 5}], lag = 1, freq = 720):
self.window = window
self.data_step = data_step
self.detector = None
self.param = param
self.lag = lag
self.freq =freq
if data_step > int(window/2):
raise ValueError('data_step shouldn\'t be greater than half of the window')
def convert(self, X):
freq = self.freq
n = self.window
data_step = self.data_step
X = pd.Series(X)
L = []
if n == 0:
df = X
raise ValueError('window length is set to zero')
else:
for i in range(n):
L.append(X.shift(i))
df = pd.concat(L, axis = 1)
df = df.iloc[n:]
df2 = pd.concat(L[:data_step], axis = 1)
df = df.reset_index()
#value
x0 = df2[math.ceil(n/2) : - math.floor(n/2)].reset_index()
#mean
x1 = (df.mean(axis=1))
#variance
x2 = df.var(axis=1)
#AR-coef
self.ar_function = lambda x: self.ar_coefficient(x)
x3 = df.apply(self.ar_function, axis =1, result_type='expand' )
#autocorrelation
self.auto_function = lambda x: self.autocorrelation(x)
x4 = df.apply(self.auto_function, axis =1, result_type='expand' )
#kurtosis
x5 = (df.kurtosis(axis=1))
#skewness
x6 = (df.skew(axis=1))
#maximum
x7 = (df.max(axis=1))
#minimum
x8 = (df.min(axis=1))
#entropy
self.entropy_function = lambda x: self.sample_entropy(x)
x9 = df.apply(self.entropy_function, axis =1, result_type='expand')
#seasonality
result = seasonal_decompose(X, model='additive', freq = freq, extrapolate_trend='freq')
#seasonal
x10 = pd.Series(np.array(result.seasonal[math.ceil(n/2) : - math.floor(n/2)]))
#trend
x11 = pd.Series(np.array(result.trend[math.ceil(n/2) : - math.floor(n/2)]))
#resid
x12 = pd.Series(np.array(result.resid[math.ceil(n/2) : - math.floor(n/2)]))
#Hurst component
self.hurst_function = lambda x: self.hurst_f(x)
x13 = df.apply(self.hurst_function, axis =1, result_type='expand')
L = [x0, x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12, x13]
M = pd.concat(L, axis = 1)
M = M.drop(columns=['index'])
return M
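One hedged note on the seasonal features above: in recent statsmodels releases the freq argument of seasonal_decompose is named period. A minimal sketch with a toy seasonal series (assumed, not from the original file):

import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose

t = np.arange(2160, dtype=float)
series = np.sin(2 * np.pi * t / 720) + 0.001 * t          # seasonality plus a slow trend
result = seasonal_decompose(series, model='additive', period=720, extrapolate_trend='freq')
seasonal, trend, resid = result.seasonal, result.trend, result.resid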
def ar_coefficient(self, x):
"""
This feature calculator fits the unconditional maximum likelihood
of an autoregressive AR(k) process.
The k parameter is the maximum lag of the process
.. math::
X_{t}=\\varphi_0 +\\sum _{{i=1}}^{k}\\varphi_{i}X_{{t-i}}+\\varepsilon_{t}
For each configuration in param, which should contain the maximum lag "k", such an AR process is fitted. Then
the coefficients :math:`\\varphi_{i}` whose index :math:`i` is given by "coeff" are returned.
:param x: the time series to calculate the feature of
:type x: numpy.ndarray
:param param: contains dictionaries {"coeff": x, "k": y} with x,y int
:type param: list
:return x: the different feature values
:return type: pandas.Series
"""
calculated_ar_params = {}
param = self.param
x_as_list = list(x)
res = {}
for parameter_combination in param:
k = parameter_combination["k"]
p = parameter_combination["coeff"]
column_name = "coeff_{}__k_{}".format(p, k)
if k not in calculated_ar_params:
try:
calculated_AR = AR(x_as_list)
calculated_ar_params[k] = calculated_AR.fit(maxlag=k, solver="mle").params
except (LinAlgError, ValueError):
calculated_ar_params[k] = [np.NaN] * k
mod = calculated_ar_params[k]
if p <= k:
try:
res[column_name] = mod[p]
except IndexError:
res[column_name] = 0
else:
res[column_name] = np.NaN
L = [(key, value) for key, value in res.items()]
L0 = []
for item in L:
L0.append(item[1])
return L0
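A hedged note on the AR fit above: statsmodels.tsa.ar_model.AR with solver="mle" has been removed in newer statsmodels releases. A rough replacement using AutoReg (an assumption on my part; it fits by conditional least squares rather than unconditional MLE, so the coefficients can differ slightly):

import numpy as np
from statsmodels.tsa.ar_model import AutoReg

x = np.random.randn(200)
k = 5
params = AutoReg(x, lags=k).fit().params   # intercept followed by the k lag coefficients
coeff_0 = params[0]                        # analogue of {"coeff": 0, "k": 5} above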
def autocorrelation(self, x):
"""
Calculates the autocorrelation of the specified lag, according to the formula [1]
.. math::
\\frac{1}{(n-l)\\sigma^{2}} \\sum_{t=1}^{n-l}(X_{t}-\\mu )(X_{t+l}-\\mu)
where :math:`n` is the length of the time series :math:`X_i`, :math:`\\sigma^2` its variance and :math:`\\mu` its
mean. `l` denotes the lag.
.. rubric:: References
[1] https://en.wikipedia.org/wiki/Autocorrelation#Estimation
:param x: the time series to calculate the feature of
:type x: numpy.ndarray
:param lag: the lag
:type lag: int
:return: the value of this feature
:return type: float
"""
lag = self.lag
# This is important: If a series is passed, the product below is calculated
# based on the index, which corresponds to squaring the series.
if isinstance(x, pd.Series):
x = x.values
if len(x) < lag:
return np.nan
# Slice the relevant subseries based on the lag
y1 = x[:(len(x) - lag)]
y2 = x[lag:]
# Subtract the mean of the whole series x
x_mean = np.mean(x)
# The result is sometimes referred to as "covariation"
sum_product = np.sum((y1 - x_mean) * (y2 - x_mean))
# Return the normalized unbiased covariance
v = np.var(x)
if np.isclose(v, 0):
return np.NaN
else:
return sum_product / ((len(x) - lag) * v)
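A small worked instance of the estimator above (editor example, toy data assumed). It uses the global mean and variance of x, so the value can differ slightly from the Pearson correlation of the two lagged slices (e.g. pandas' Series.autocorr).

import numpy as np

x = np.array([1., 2., 3., 4., 5., 4., 3., 2.])
lag, mu, v, n = 1, x.mean(), x.var(), len(x)
r = np.sum((x[:n - lag] - mu) * (x[lag:] - mu)) / ((n - lag) * v)
print(round(r, 3))   # lag-1 autocorrelation of the toy series (about 0.571)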
def _into_subchunks(self, x, subchunk_length, every_n=1):
"""
Split the time series x into subwindows of length "subchunk_length", starting every "every_n".
For example, the input data [0, 1, 2, 3, 4, 5, 6] will be turned into a matrix
0 2 4
1 3 5
2 4 6
with the settings subchunk_length = 3 and every_n = 2
"""
len_x = len(x)
assert subchunk_length > 1
assert every_n > 0
# how often can we shift a window of size subchunk_length over the input?
num_shifts = (len_x - subchunk_length) // every_n + 1
shift_starts = every_n * np.arange(num_shifts)
indices = np.arange(subchunk_length)
indexer = np.expand_dims(indices, axis=0) + np.expand_dims(shift_starts, axis=1)
return np.asarray(x)[indexer]
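A worked check of the indexer trick above (editor example): with x = [0, 1, 2, 3, 4, 5, 6], subchunk_length = 3 and every_n = 2, one row is produced per window; the docstring shows the same windows written column-wise.

import numpy as np

x = np.asarray([0, 1, 2, 3, 4, 5, 6])
starts = 2 * np.arange((len(x) - 3) // 2 + 1)                      # [0 2 4]
indexer = np.expand_dims(np.arange(3), 0) + np.expand_dims(starts, 1)
print(x[indexer])
# [[0 1 2]
#  [2 3 4]
#  [4 5 6]]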
def sample_entropy(self, x):
"""
Calculate and return sample entropy of x.
.. rubric:: References
| [1] http://en.wikipedia.org/wiki/Sample_Entropy
| [2] https://www.ncbi.nlm.nih.gov/pubmed/10843903?dopt=Abstract
:param x: the time series to calculate the feature of
:type x: numpy.ndarray
:return: the value of this feature
:return type: float
"""
x = np.array(x)
# if one of the values is NaN, we can not compute anything meaningful
if np.isnan(x).any():
return np.nan
m = 2 # common value for m, according to wikipedia...
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia...
# Split time series and save all templates of length m
# Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4]
xm = self._into_subchunks(x, m)
# Now calculate the maximum distance between each of those pairs
# np.abs(xmi - xm).max(axis=1)
# and check how many are below the tolerance.
# For speed reasons, we are not doing this in a nested for loop,
# but with numpy magic.
# Example:
# if x = [1, 2, 3]
# then xm = [[1, 2], [2, 3]]
# so we will subtract xm from [1, 2] => [[0, 0], [-1, -1]]
# and from [2, 3] => [[1, 1], [0, 0]]
# taking the abs and max gives us:
# [0, 1] and [1, 0]
# as the diagonal elements are always 0, we subtract 1.
B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])
# Similar for computing A
xmp1 = self._into_subchunks(x, m + 1)
A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1])
# Return SampEn
return -np.log(A / B)
def hurst_f(self, x):
H,c, M = compute_Hc(x)
return [H, c]
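A minimal usage sketch for the last two features (editor example, toy series assumed; the hurst package needs a reasonably long input, and the window of 100 used above is roughly its minimum).

import numpy as np
from hurst import compute_Hc

rng = np.random.default_rng(0)
walk = np.cumsum(rng.normal(size=1000))    # random-walk-like toy series
H, c, _ = compute_Hc(walk)                 # H should land near 0.5 for a random walk
stat = Stat(window=100, data_step=10)
print(stat.sample_entropy(walk), H, c)     # SampEn uses the fixed m=2, r=0.2*std(x) above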


@@ -7,4 +7,5 @@ tqdm>=4.62.3
ray==2.6.1
pyarrow>=11.0.0
einops
statsmodels
statsmodels
arch