remove some unused dependencies in metrics

xuhongzuo 2023-11-10 13:24:08 +08:00
parent 254b7f19b1
commit c2c7566d63
5 changed files with 2 additions and 561 deletions


@@ -9,11 +9,8 @@ module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
sys.path.append(module_path)
from deepod.metrics.vus.utils.slidingWindows import find_length
from deepod.metrics.vus.utils.metrics import metricor
from deepod.metrics.vus.models.distance import Fourier
from deepod.metrics.vus.models.feature import Window
def generate_new_label(label,lag):


@@ -1,188 +0,0 @@
import numpy as np
import math
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
sys.path.append(module_path)
from deepod.metrics.vus.utils.slidingWindows import find_length
from deepod.metrics.vus.utils.metrics import metricor
from deepod.metrics.vus.models.distance import Fourier
from deepod.metrics.vus.models.feature import Window
from deepod.metrics.vus.models.cnn import cnn
from deepod.metrics.vus.models.AE_mlp2 import AE_MLP2
from deepod.metrics.vus.models.lstm import lstm
from deepod.metrics.vus.models.ocsvm import OCSVM
from deepod.metrics.vus.models.poly import POLY
from deepod.metrics.vus.models.pca import PCA
from deepod.metrics.vus.models.norma import NORMA
from deepod.metrics.vus.models.matrix_profile import MatrixProfile
from deepod.metrics.vus.models.lof import LOF
from deepod.metrics.vus.models.iforest import IForest
def find_section_length(label,length):
best_i = None
best_sum = None
current_subseq = False
for i in range(len(label)):
changed = False
if label[i] == 1:
if current_subseq == False:
current_subseq = True
if best_i is None:
changed = True
best_i = i
best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)])
else:
if np.sum(label[max(0,i-200):min(len(label),i+9800)]) < best_sum:
changed = True
best_i = i
best_sum = np.sum(label[max(0,i-200):min(len(label),i+9800)])
else:
changed = False
if changed:
diff = i+9800 - len(label)
pos1 = max(0,i-200 - max(0,diff))
pos2 = min(i+9800,len(label))
else:
current_subseq = False
if best_i is not None:
return best_i-pos1,(pos1,pos2)
else:
return None,None
def generate_data(filepath,init_pos,max_length):
df = pd.read_csv(filepath, header=None).to_numpy()
name = filepath.split('/')[-1]
#max_length = 30000
data = df[init_pos:init_pos+max_length,0].astype(float)
label = df[init_pos:init_pos+max_length,1]
pos_first_anom,pos = find_section_length(label,max_length)
data = df[pos[0]:pos[1],0].astype(float)
label = df[pos[0]:pos[1],1]
slidingWindow = find_length(data)
#slidingWindow = 70
X_data = Window(window = slidingWindow).convert(data).to_numpy()
data_train = data[:int(0.1*len(data))]
data_test = data
X_train = Window(window = slidingWindow).convert(data_train).to_numpy()
X_test = Window(window = slidingWindow).convert(data_test).to_numpy()
return pos_first_anom,slidingWindow,data,X_data,data_train,data_test,X_train,X_test,label
def compute_score(methods,slidingWindow,data,X_data,data_train,data_test,X_train,X_test):
methods_scores = {}
for method in methods:
start_time = time.time()
if method == 'IForest':
clf = IForest(n_jobs=1)
x = X_data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'LOF':
clf = LOF(n_neighbors=20, n_jobs=1)
x = X_data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'MatrixProfile':
clf = MatrixProfile(window = slidingWindow)
x = data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'NormA':
clf = NORMA(pattern_length = slidingWindow, nm_size=3*slidingWindow)
x = data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*((slidingWindow-1)//2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'PCA':
clf = PCA()
x = X_data
clf.fit(x)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
elif method == 'POLY':
clf = POLY(power=3, window = slidingWindow)
x = data
clf.fit(x)
measure = Fourier()
measure.detector = clf
measure.set_param()
clf.decision_function(measure=measure)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'OCSVM':
X_train_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_train.T).T
X_test_ = MinMaxScaler(feature_range=(0,1)).fit_transform(X_test.T).T
clf = OCSVM(nu=0.05)
clf.fit(X_train_, X_test_)
score = clf.decision_scores_
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'LSTM':
clf = lstm(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 50, patience = 5, verbose=0)
clf.fit(data_train, data_test)
measure = Fourier()
measure.detector = clf
measure.set_param()
clf.decision_function(measure=measure)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'AE':
clf = AE_MLP2(slidingWindow = slidingWindow, epochs=100, verbose=0)
clf.fit(data_train, data_test)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
elif method == 'CNN':
clf = cnn(slidingwindow = slidingWindow, predict_time_steps=1, epochs = 100, patience = 5, verbose=0)
clf.fit(data_train, data_test)
measure = Fourier()
measure.detector = clf
measure.set_param()
clf.decision_function(measure=measure)
score = clf.decision_scores_
score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
#end_time = time.time()
#time_exec = end_time - start_time
#print(method,"\t time: {}".format(time_exec))
methods_scores[method] = score
return methods_scores
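
The same post-processing pattern repeats for every window-based detector above: the raw window scores are min-max scaled and then edge-padded back to the length of the original series. A minimal sketch of that padding (toy sizes assumed, not part of the original file):

import math
import numpy as np

n, w = 20, 5                                    # series length and sliding window
window_scores = np.random.rand(n - w + 1)       # e.g. clf.decision_scores_, one score per window
padded = np.array([window_scores[0]] * math.ceil((w - 1) / 2)
                  + list(window_scores)
                  + [window_scores[-1]] * ((w - 1) // 2))
assert len(padded) == n                         # scores are aligned with the raw series again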


@@ -3,18 +3,8 @@
"""
import numpy as np
# import matplotlib.pyplot as plt
# import random
from arch import arch_model
# import pandas as pd
import math
# import pmdarima as pm
# from pmdarima import model_selection
# import os
# import dis
# import statistics
# from sklearn import metrics
# import sklearn
class Euclidean:


@@ -1,359 +0,0 @@
# -*- coding: utf-8 -*-
"""Classes of feature mapping for model type B
"""
import numpy as np
# import matplotlib.pyplot as plt
# import random
# from arch import arch_model
import pandas as pd
import math
# import pmdarima as pm
# from pmdarima import model_selection
# import os
# import dis
# import statistics
# from sklearn import metrics
# import sklearn
from tsfresh import extract_features
from statsmodels.tsa.seasonal import seasonal_decompose
# import itertools
# import functools
import warnings
from builtins import range
# from collections import defaultdict
from numpy.linalg import LinAlgError
# from scipy.signal import cwt, find_peaks_cwt, ricker, welch
# from scipy.stats import linregress
# from statsmodels.tools.sm_exceptions import MissingDataError
with warnings.catch_warnings():
# Ignore warnings of the patsy package
warnings.simplefilter("ignore", DeprecationWarning)
from statsmodels.tsa.ar_model import AR
# from statsmodels.tsa.stattools import acf, adfuller, pacf
from hurst import compute_Hc
class Window:
""" The class for rolling window feature mapping.
The mapping converts the original timeseries X into a matrix.
The matrix consists of rows of sliding windows of original X.
"""
def __init__(self, window = 100):
self.window = window
self.detector = None
def convert(self, X):
n = self.window
X = pd.Series(X)
L = []
if n == 0:
df = X
else:
for i in range(n):
L.append(X.shift(i))
df = pd.concat(L, axis = 1)
df = df.iloc[n-1:]
return df
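A quick worked example of the rolling-window mapping above (editor sketch, toy series assumed): each output row holds the current value followed by the window-1 previous values.

import numpy as np
import pandas as pd

X, n = pd.Series([0., 1., 2., 3., 4.]), 3
print(pd.concat([X.shift(i) for i in range(n)], axis=1).iloc[n - 1:].to_numpy())
# [[2. 1. 0.]
#  [3. 2. 1.]
#  [4. 3. 2.]]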
class tf_Stat:
'''Statistical feature extraction using the tsfresh package.
It calculates 763 features in total, so it may be overly complicated for some models.
Recommended for methods like Isolation Forest, which randomly pick a feature
and then perform the classification. To use it with distance-based models like KNN,
LOF, CBLOF, etc., first pass a function that weights individual features so that
inconsequential features won't cloud the important ones (mean, variance, kurtosis, etc.).
'''
def __init__(self, window = 100, step = 25):
self.window = window
self.step = step
self.detector = None
def convert(self, X):
window = self.window
step = self.step
pos = math.ceil(window/2)
#step <= window
length = X.shape[0]
Xd = pd.DataFrame(X)
Xd.columns = pd.Index(['x'], dtype='object')
Xd['id'] = 1
Xd['time'] = Xd.index
test = np.array(extract_features(Xd.iloc[0+pos-math.ceil(window/2):0+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0))
M = np.zeros((length - window, test.shape[1]+1 ))
i = 0
while i + window <= M.shape[0]:
M[i:i+step, 0]= X[pos + i: pos + i + step]
vector = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):i+pos + math.floor(window/2)], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0))
M[i:i+step, 1:] = vector
i+= step
num = M.shape[0]
if i < num:
M[i: num, 0]= X[pos + i: pos + num]
M[i: num, 1:] = np.array(extract_features(Xd.iloc[i+pos-math.ceil(window/2):], column_id="id", column_sort="time", column_kind=None, column_value=None).fillna(0))
return M
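A minimal sketch of the tsfresh call used above (editor example, toy dataframe assumed): extract_features expects a long-format frame with an id column and a sort column, and returns one row of features per id.

import pandas as pd
from tsfresh import extract_features

Xd = pd.DataFrame({"x": [0.1, 0.5, 0.2, 0.9, 0.4], "id": 1, "time": range(5)})
feats = extract_features(Xd, column_id="id", column_sort="time").fillna(0)
print(feats.shape)   # (1, number_of_extracted_features)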
class Stat:
'''Statistical feature extraction.
Features include [mean, variance, skewness, kurtosis, autocorrelation, maximum,
minimum, entropy, seasonality, Hurst exponent, AR coefficients]
'''
def __init__(self, window = 100, data_step = 10, param = [{"coeff": 0, "k": 5}], lag = 1, freq = 720):
self.window = window
self.data_step = data_step
self.detector = None
self.param = param
self.lag = lag
self.freq =freq
if data_step > int(window/2):
raise ValueError('data_step shouldn\'t be greater than half of the window')
def convert(self, X):
freq = self.freq
n = self.window
data_step = self.data_step
X = pd.Series(X)
L = []
if n == 0:
df = X
raise ValueError('window length is set to zero')
else:
for i in range(n):
L.append(X.shift(i))
df = pd.concat(L, axis = 1)
df = df.iloc[n:]
df2 = pd.concat(L[:data_step], axis = 1)
df = df.reset_index()
#value
x0 = df2[math.ceil(n/2) : - math.floor(n/2)].reset_index()
#mean
x1 = (df.mean(axis=1))
#variance
x2 = df.var(axis=1)
#AR-coef
self.ar_function = lambda x: self.ar_coefficient(x)
x3 = df.apply(self.ar_function, axis =1, result_type='expand' )
#autocorrelation
self.auto_function = lambda x: self.autocorrelation(x)
x4 = df.apply(self.auto_function, axis =1, result_type='expand' )
#kurtosis
x5 = (df.kurtosis(axis=1))
#skewness
x6 = (df.skew(axis=1))
#maximum
x7 = (df.max(axis=1))
#minimum
x8 = (df.min(axis=1))
#entropy
self.entropy_function = lambda x: self.sample_entropy(x)
x9 = df.apply(self.entropy_function, axis =1, result_type='expand')
#seasonality
result = seasonal_decompose(X, model='additive', freq = freq, extrapolate_trend='freq')
#seasonal
x10 = pd.Series(np.array(result.seasonal[math.ceil(n/2) : - math.floor(n/2)]))
#trend
x11 = pd.Series(np.array(result.trend[math.ceil(n/2) : - math.floor(n/2)]))
#resid
x12 = pd.Series(np.array(result.resid[math.ceil(n/2) : - math.floor(n/2)]))
#Hurst component
self.hurst_function = lambda x: self.hurst_f(x)
x13 = df.apply(self.hurst_function, axis =1, result_type='expand')
L = [x0, x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12, x13]
M = pd.concat(L, axis = 1)
M = M.drop(columns=['index'])
return M
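One hedged note on the seasonal features above: in recent statsmodels releases the freq argument of seasonal_decompose is named period. A minimal sketch with a toy seasonal series (assumed, not from the original file):

import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose

t = np.arange(2160, dtype=float)
series = np.sin(2 * np.pi * t / 720) + 0.001 * t          # seasonality plus a slow trend
result = seasonal_decompose(series, model='additive', period=720, extrapolate_trend='freq')
seasonal, trend, resid = result.seasonal, result.trend, result.resid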
def ar_coefficient(self, x):
"""
This feature calculator fits the unconditional maximum likelihood
of an autoregressive AR(k) process.
The k parameter is the maximum lag of the process
.. math::
X_{t}=\\varphi_0 +\\sum _{{i=1}}^{k}\\varphi_{i}X_{{t-i}}+\\varepsilon_{t}
For each configuration in param, which should contain the maximum lag "k", such an AR process is fitted. Then
the coefficients :math:`\\varphi_{i}` whose index :math:`i` is given by "coeff" are returned.
:param x: the time series to calculate the feature of
:type x: numpy.ndarray
:param param: contains dictionaries {"coeff": x, "k": y} with x,y int
:type param: list
:return x: the different feature values
:return type: pandas.Series
"""
calculated_ar_params = {}
param = self.param
x_as_list = list(x)
res = {}
for parameter_combination in param:
k = parameter_combination["k"]
p = parameter_combination["coeff"]
column_name = "coeff_{}__k_{}".format(p, k)
if k not in calculated_ar_params:
try:
calculated_AR = AR(x_as_list)
calculated_ar_params[k] = calculated_AR.fit(maxlag=k, solver="mle").params
except (LinAlgError, ValueError):
calculated_ar_params[k] = [np.NaN] * k
mod = calculated_ar_params[k]
if p <= k:
try:
res[column_name] = mod[p]
except IndexError:
res[column_name] = 0
else:
res[column_name] = np.NaN
L = [(key, value) for key, value in res.items()]
L0 = []
for item in L:
L0.append(item[1])
return L0
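A hedged note on the AR fit above: statsmodels.tsa.ar_model.AR with solver="mle" has been removed in newer statsmodels releases. A rough replacement using AutoReg (an assumption on my part; it fits by conditional least squares rather than unconditional MLE, so the coefficients can differ slightly):

import numpy as np
from statsmodels.tsa.ar_model import AutoReg

x = np.random.randn(200)
k = 5
params = AutoReg(x, lags=k).fit().params   # intercept followed by the k lag coefficients
coeff_0 = params[0]                        # analogue of {"coeff": 0, "k": 5} above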
def autocorrelation(self, x):
"""
Calculates the autocorrelation of the specified lag, according to the formula [1]
.. math::
\\frac{1}{(n-l)\\sigma^{2}} \\sum_{t=1}^{n-l}(X_{t}-\\mu )(X_{t+l}-\\mu)
where :math:`n` is the length of the time series :math:`X_i`, :math:`\\sigma^2` its variance and :math:`\\mu` its
mean. `l` denotes the lag.
.. rubric:: References
[1] https://en.wikipedia.org/wiki/Autocorrelation#Estimation
:param x: the time series to calculate the feature of
:type x: numpy.ndarray
:param lag: the lag
:type lag: int
:return: the value of this feature
:return type: float
"""
lag = self.lag
# This is important: If a series is passed, the product below is calculated
# based on the index, which corresponds to squaring the series.
if isinstance(x, pd.Series):
x = x.values
if len(x) < lag:
return np.nan
# Slice the relevant subseries based on the lag
y1 = x[:(len(x) - lag)]
y2 = x[lag:]
# Subtract the mean of the whole series x
x_mean = np.mean(x)
# The result is sometimes referred to as "covariation"
sum_product = np.sum((y1 - x_mean) * (y2 - x_mean))
# Return the normalized unbiased covariance
v = np.var(x)
if np.isclose(v, 0):
return np.NaN
else:
return sum_product / ((len(x) - lag) * v)
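A small worked instance of the estimator above (editor example, toy data assumed). It uses the global mean and variance of x, so the value can differ slightly from the Pearson correlation of the two lagged slices (e.g. pandas' Series.autocorr).

import numpy as np

x = np.array([1., 2., 3., 4., 5., 4., 3., 2.])
lag, mu, v, n = 1, x.mean(), x.var(), len(x)
r = np.sum((x[:n - lag] - mu) * (x[lag:] - mu)) / ((n - lag) * v)
print(round(r, 3))   # lag-1 autocorrelation of the toy series (about 0.571)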
def _into_subchunks(self, x, subchunk_length, every_n=1):
"""
Split the time series x into subwindows of length "subchunk_length", starting every "every_n".
For example, the input data [0, 1, 2, 3, 4, 5, 6] will be turned into a matrix
0 2 4
1 3 5
2 4 6
with the settings subchunk_length = 3 and every_n = 2
"""
len_x = len(x)
assert subchunk_length > 1
assert every_n > 0
# how often can we shift a window of size subchunk_length over the input?
num_shifts = (len_x - subchunk_length) // every_n + 1
shift_starts = every_n * np.arange(num_shifts)
indices = np.arange(subchunk_length)
indexer = np.expand_dims(indices, axis=0) + np.expand_dims(shift_starts, axis=1)
return np.asarray(x)[indexer]
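A worked check of the indexer trick above (editor example): with x = [0, 1, 2, 3, 4, 5, 6], subchunk_length = 3 and every_n = 2, one row is produced per window; the docstring shows the same windows written column-wise.

import numpy as np

x = np.asarray([0, 1, 2, 3, 4, 5, 6])
starts = 2 * np.arange((len(x) - 3) // 2 + 1)                      # [0 2 4]
indexer = np.expand_dims(np.arange(3), 0) + np.expand_dims(starts, 1)
print(x[indexer])
# [[0 1 2]
#  [2 3 4]
#  [4 5 6]]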
def sample_entropy(self, x):
"""
Calculate and return sample entropy of x.
.. rubric:: References
| [1] http://en.wikipedia.org/wiki/Sample_Entropy
| [2] https://www.ncbi.nlm.nih.gov/pubmed/10843903?dopt=Abstract
:param x: the time series to calculate the feature of
:type x: numpy.ndarray
:return: the value of this feature
:return type: float
"""
x = np.array(x)
# if one of the values is NaN, we can not compute anything meaningful
if np.isnan(x).any():
return np.nan
m = 2 # common value for m, according to wikipedia...
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia...
# Split time series and save all templates of length m
# Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4]
xm = self._into_subchunks(x, m)
# Now calculate the maximum distance between each of those pairs
# np.abs(xmi - xm).max(axis=1)
# and check how many are below the tolerance.
# For speed reasons, we are not doing this in a nested for loop,
# but with numpy magic.
# Example:
# if x = [1, 2, 3]
# then xm = [[1, 2], [2, 3]]
# so we will subtract xm from [1, 2] => [[0, 0], [-1, -1]]
# and from [2, 3] => [[1, 1], [0, 0]]
# taking the abs and max gives us:
# [0, 1] and [1, 0]
# as the diagonal elements are always 0, we subtract 1.
B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])
# Similar for computing A
xmp1 = self._into_subchunks(x, m + 1)
A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1])
# Return SampEn
return -np.log(A / B)
def hurst_f(self, x):
H,c, M = compute_Hc(x)
return [H, c]
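A minimal usage sketch for the last two features (editor example, toy series assumed; the hurst package needs a reasonably long input, and the window of 100 used above is roughly its minimum).

import numpy as np
from hurst import compute_Hc

rng = np.random.default_rng(0)
walk = np.cumsum(rng.normal(size=1000))    # random-walk-like toy series
H, c, _ = compute_Hc(walk)                 # H should land near 0.5 for a random walk
stat = Stat(window=100, data_step=10)
print(stat.sample_entropy(walk), H, c)     # SampEn uses the fixed m=2, r=0.2*std(x) above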


@@ -7,4 +7,5 @@ tqdm>=4.62.3
ray==2.6.1
pyarrow>=11.0.0
einops
statsmodels
statsmodels
arch