Source code for luminaire.exploration.data_exploration

from luminaire.model.model_utils import LADHolidays

class DataExplorationError(Exception):
    """
    Exception class for Luminaire Data Exploration.
    """

    def __init__(self, message):
        message = f'Data exploration failed. Error: {message}'
        super(DataExplorationError, self).__init__(message)
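# Illustrative note (not part of the module): the constructor above prefixes every message, so
# DataExplorationError("insufficient data") surfaces as "Data exploration failed. Error: insufficient data".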
class DataExploration(object):
    """
    This is a general class for time series data exploration and pre-processing.

    :param str freq: The frequency of the time-series. A `Pandas offset`_ such as 'D', 'H', or 'M'. Luminaire currently
        supports the following pandas frequency types: 'H', 'D', 'W', 'W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU',
        'W-FRI', 'W-SAT'.
    :param float sig_level: The significance level to use for any statistical test within the data profile. This
        should be a number between 0 and 1.
    :param float min_ts_mean: The minimum mean value of the time series required for the model to run. For data that
        originated as integers (such as counts), the ARIMA model can behave erratically when the numbers are small.
        When this parameter is set, any time series whose mean value is less than this will automatically result in a
        model failure, rather than a mostly bogus anomaly.
    :param float fill_rate: Minimum proportion of data availability in the recent data window. Should be a fraction
        between 0 and 1.
    :param int max_window_length: The maximum size of the sub windows for input data segmentation.
    :param int window_length: The size of the sub windows for input data segmentation.
    :param int min_ts_length: The minimum required length of the time series for training.
    :param int max_ts_length: The maximum required length of the time series for training.
    :param bool is_log_transformed: A flag to specify whether to take a log transform of the input data. If the data
        contain negatives, is_log_transformed is ignored even though it is set to True.
    :param bool data_shift_truncate: A flag to specify whether the left side of the most recent change point needs to
        be truncated from the training data.
    :param int min_changepoint_padding_length: A padding length between two change points. This parameter makes sure
        that two consecutive change points are not too close to each other.
    :param float change_point_threshold: Minimum threshold (a value > 0) to flag change points based on KL divergence.
        This parameter can be used to tune the sensitivity of the change point detection method.

    .. _Pandas offset: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
    """
    __version__ = "0.1"

    def __init__(self,
                 freq='D',
                 min_ts_mean=None,
                 fill_rate=None,
                 sig_level=0.05,
                 min_ts_length=None,
                 max_ts_length=None,
                 is_log_transformed=None,
                 data_shift_truncate=True,
                 min_changepoint_padding_length=None,
                 change_point_threshold=2,
                 window_length=None,
                 *args,
                 **kwargs):

        self._target_index = 'index'
        self._target_metric = 'raw'

        # Assigning a default min_ts_length if not provided by the user. Assigning two cycles.
        # Keys are same as pandas frequency.
        min_ts_length_dict = {
            'H': 144,
            'D': 21,
            'W': 16, 'W-SUN': 16, 'W-MON': 16, 'W-TUE': 16, 'W-WED': 16, 'W-THU': 16, 'W-FRI': 16, 'W-SAT': 16,
            'M': 24,
            'MS': 24,
        }
        self.min_ts_length = min_ts_length or min_ts_length_dict.get(freq)

        # Assigning a default max_ts_length if not provided by the user.
        # Keys are same as pandas frequency.
        max_ts_length_dict = {
            'H': 90 * 24,
            'D': 365 * 3,
            'W': 100, 'W-SUN': 100, 'W-MON': 100, 'W-TUE': 100, 'W-WED': 100, 'W-THU': 100, 'W-FRI': 100, 'W-SAT': 100,
            'M': 120,
            'MS': 120,
        }
        self.max_ts_length = max_ts_length or max_ts_length_dict.get(freq)

        # Pre-specification of the window lengths for different window frequencies with their min and max
        window_length_dict = {
            'S': 60 * 60,
            'T': 60 * 24,
            '15T': 4 * 24,
            'H': 24,
            'D': 28,
        }
        if freq in ['S', 'T', '15T', 'H', 'D']:
            window_length = window_length_dict.get(freq)
        min_num_train_windows = 6
        max_num_train_windows = 10000
        min_window_length = 10
        max_window_length = 100000

        self.freq = freq
        self.min_ts_mean = min_ts_mean
        self.fill_rate = fill_rate
        self.sig_level = sig_level
        self.is_log_transformed = is_log_transformed
        self.data_shift_truncate = data_shift_truncate
        self.change_point_threshold = change_point_threshold
        self.min_changepoint_padding_length = min_changepoint_padding_length
        self.min_window_length = min_window_length
        self.max_window_length = max_window_length
        self.min_num_train_windows = min_num_train_windows
        self.max_num_train_windows = max_num_train_windows
        self.window_length = window_length

        # Assigning different padding based on the time series frequency
        min_changepoint_padding_length_dict = {
            'H': 7 * 24,
            'D': 10,
            'W': 12, 'W-SUN': 12, 'W-MON': 12, 'W-TUE': 12, 'W-WED': 12, 'W-THU': 12, 'W-FRI': 12, 'W-SAT': 12,
            'M': 24,
            'MS': 24,
        }
        self.min_changepoint_padding_length = min_changepoint_padding_length or \
            min_changepoint_padding_length_dict.get(freq)

        tc_window_len_dict = {
            'H': 24,
            'D': 7,
            'W': 4,
        }
        self.tc_window_length = tc_window_len_dict.get(freq) if freq in ['H', 'D', 'W'] else None

        self.tc_max_window_length = 24
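    # A minimal usage sketch for the constructor above (illustrative only, not part of the module). With freq='D'
    # and no explicit lengths, the defaults resolve from the dictionaries defined in __init__
    # (min_ts_length=21, max_ts_length=365 * 3, window_length=28, tc_window_length=7):
    #
    # >>> de_obj = DataExploration(freq='D', fill_rate=0.9, sig_level=0.05)
    # >>> de_obj.min_ts_length, de_obj.max_ts_length, de_obj.window_length, de_obj.tc_window_length
    # (21, 1095, 28, 7)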
    def add_missing_index(self, df=None, freq=None):
        """
        This function reindexes a pandas dataframe with missing dates for a given time series frequency.

        Note: If duplicate dates are present in the dataframe, this function takes the average of the duplicate
        dates and merges them into a single data date.

        :param pandas.DataFrame df: Input pandas dataframe containing the time series
        :param str freq: The frequency of the time-series. A `Pandas offset`_ such as 'D', 'H', or 'M'
        :return: pandas dataframe after reindexing missing data dates
        :rtype: pandas.DataFrame
        """
        import pandas as pd

        # Adding a group by logic for duplicate index
        df = df.groupby(df.index).mean()

        # Create a new Pandas data frame based on the first valid index and
        # current date using the frequency defined by the user
        idx = pd.date_range(start=df.first_valid_index(), end=df.last_valid_index(), freq=freq)
        df_reindexed = df.reindex(idx)

        return df_reindexed
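    # A hedged example of add_missing_index (illustrative; the dates and values are hypothetical). The duplicated
    # 2020-01-02 rows are averaged to 13.0 and the missing 2020-01-03 is reinserted as NaN for downstream imputation:
    #
    # >>> import pandas as pd
    # >>> raw = pd.DataFrame({'raw': [10.0, 12.0, 14.0, 20.0]},
    # ...                    index=pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-02', '2020-01-04']))
    # >>> DataExploration().add_missing_index(df=raw, freq='D')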
    def _kalman_smoothing_imputation(self, df=None, target_metric=None, imputed_metric=None, impute_only=False):
        """
        This function performs missing data imputation using Kalman smoothing.

        :param pandas.DataFrame df: A pandas dataframe containing the time series
        :param str target_metric: The dataframe column name that contains the time series
        :param str imputed_metric: The dataframe column name that stores the imputed time series
        :return: A pandas dataframe containing the raw and the imputed time series

        .. Note: missing data are imputed using a moving average if the position of the missing data does not
            satisfy the minimum length requirement for Kalman smoothing
        """
        import numpy as np
        from pykalman import KalmanFilter

        time_series = np.array(df[target_metric], dtype=np.float64)
        missing_idx = np.where(np.isnan(time_series))[0]

        if len(missing_idx) > 0:
            if missing_idx[0] == 0:
                time_series[0] = np.nanmean(time_series)
                missing_idx = np.delete(missing_idx, 0)
            if missing_idx[-1] == len(time_series) - 1:
                time_series[-1] = np.nanmean(time_series[-min(10, len(time_series)):])
                if np.isnan(time_series[-1]):
                    raise ValueError("Interpolation failed due to too many recent missing data.")
                missing_idx = np.delete(missing_idx, -1)
            if len(missing_idx) > 0:
                masked_series = np.ma.array(np.copy(time_series))
                for i in missing_idx:
                    masked_series[i] = np.ma.masked
                kf = KalmanFilter()
                ts_smoothed, ts_cov = kf.em(masked_series).smooth(masked_series)
                for i in missing_idx:
                    time_series[i] = ts_smoothed[i]

        if impute_only:
            df[target_metric] = time_series
        else:
            df[imputed_metric] = time_series

        return df

    def _moving_average(self, series=None, window_length=None, train_subwindow_len=None):
        """
        This function calculates the moving average of a series based on a given window length.

        :param list series: The list containing the training data.
        :param int window_length: The length of the moving average window.
        :return: Smoothed input series using moving averages
        :rtype: list
        """
        import numpy as np

        moving_averages = []

        iter_length = len(series) - window_length
        for i in range(0, iter_length):
            ma_current = np.mean(series[i:i + window_length])
            moving_averages.append(ma_current)

        # The moving average shrinks w.r.t the actual series based on the moving average window. Hence, to keep the
        # length of the moving average equal to the series, we append proxy padding which are the moving averages
        # from the closest representative training sub-window.
        moving_averages_padded = moving_averages[(train_subwindow_len - (window_length // 2)):train_subwindow_len] \
            + moving_averages \
            + moving_averages[-train_subwindow_len:-(train_subwindow_len - (window_length // 2))]

        return moving_averages_padded

    @classmethod
    def _get_exog_data(cls, exog_start, exog_end, index):
        """
        This function gets the exogenous data for the specified index.
        :param pandas.Timestamp exog_start: Start date for the exogenous data
        :param pandas.Timestamp exog_end: End date for the exogenous data
        :param list[pandas.Timestamp] index: List of indices
        :return: Exogenous data for the given list of index
        :rtype: pandas.DataFrame
        """
        import pandas as pd

        holiday_calendar = LADHolidays()
        holiday_series = holiday_calendar.holidays(start=exog_start, end=exog_end, return_name=True)

        return (pd.DataFrame({'Holiday': holiday_series, 'Ones': 1})
                .pivot(columns='Holiday', values='Ones')
                .reindex(index)
                .fillna(0)
                )

    @classmethod
    def _stationarizer(cls, endog=None, diff_min=1, diff_max=2, significance_level=0.01, obs_incl=True):
        """
        This function tests the stationarity of the given time series and performs the required differencing.

        :param pandas.Series endog: The list containing the training data.
            Note: endog can include the raw actual for specific models (e.g. State Space)
        :param int diff_min: Minimum order of differencing for non-stationarity.
        :param int diff_max: Maximum order of differencing for non-stationarity.
        :param float significance_level: Significance level for the adfuller test for checking non-stationarity
        :param bool obs_incl: A flag indicating whether the raw actual is included in the endog.
        :return: Differenced time series, the order of differencing based on the stationarity test, and the raw
            actuals at every differencing step (for prediction adjustment on the actual scale)
        :rtype: tuple(numpy.array, int, list)
        """
        import numpy as np
        from statsmodels.tsa.stattools import adfuller

        endog_diff = np.array(endog)

        if diff_min == 0:
            # Performing the adfuller test over the raw data to test for stationarity
            adf_pvalue = adfuller(endog_diff)[1]
            take_diff_flag = adf_pvalue > significance_level
        else:
            take_diff_flag = True

        diff_order = 0
        actual_previous_per_diff = []

        # Take the difference until the differenced data is stationary based on the adfuller test
        while take_diff_flag and diff_order < diff_max:
            diff_order = diff_order + 1
            if obs_incl:
                actual_previous_per_diff.append(endog_diff[-2])
            else:
                actual_previous_per_diff.append(endog_diff[-1])
            endog_diff = np.diff(endog_diff)
            adf_pvalue = adfuller(endog_diff)[1]
            take_diff_flag = adf_pvalue > significance_level

        return endog_diff, diff_order, actual_previous_per_diff

    def _partition(self, training_data, window_length, value_column=None):
        """
        This function slices a list from the end of the list based on the size of the slice. Any remainder part of
        the list from the beginning is ignored.

        :param pandas.DataFrame/list training_data: Pandas dataframe or a list containing the training data.
        :param int window_length: The length of every slice of the list.
        :param str value_column: Value column in the training dataframe.
        :return: Sliced lists of input time series and aggregated timestamps
        :rtype: tuple
        """
        import collections
        import operator

        if not isinstance(training_data, list):
            lst = list(training_data[value_column])
            idx = training_data.index.normalize()
        else:
            lst = training_data

        n = int(len(lst) / float(window_length))

        # Performing the partition
        lst_sliced = [lst[::-1][int(round(window_length * i)): int(round(window_length * (i + 1)))][::-1]
                      for i in range(n)][::-1]

        if not isinstance(training_data, list):
            idx_truncated = idx[-(n * window_length):]
            aggregated_datetime = []
            for i in range(n):
                current_date_window = idx_truncated[(i * window_length): ((i + 1) * window_length)]
                dates_freq_dist = dict(collections.Counter(current_date_window))
                aggregated_datetime.append(max(dates_freq_dist.items(), key=operator.itemgetter(1))[0])
            return lst_sliced, aggregated_datetime
        else:
            return lst_sliced, None

    def _detrender(self, training_data_sliced=None, detrend_order_max=2, significance_level=0.01,
                   detrend_method=None, agg_datetime=None, past_model=None):
        """
        This function tests the stationarity of the given time series and performs the required detrending.

        :param list training_data_sliced: The list of lists containing the training data.
        :param int detrend_order_max: Maximum number of differencing operations for non-stationarity.
        :param float significance_level: Significance level for the adfuller test for checking non-stationarity.
        :param list agg_datetime: List of aggregated date times.
        :param luminaire.model.window_density.WindowDensityModel past_model: Past stored window density model.
        :return: Detrended time series and the order of detrending based on the stationarity test.
        :rtype: tuple(list, int)
        """
        import numpy as np
        import pandas as pd
        from itertools import chain
        from statsmodels.tsa.stattools import adfuller
        from luminaire.model.lad_structural import LADStructuralModel

        agg_data_model = None

        # Flattening the training data for the stationarity test
        training_data_flattened = list(chain.from_iterable(training_data_sliced))

        # Obtaining the aggregated series for modeling longer term patterns
        avg_series = np.median(np.array(training_data_sliced), axis=1).tolist()
        avg_series_df = pd.DataFrame({'index': agg_datetime, 'raw': avg_series}).set_index('index')

        if past_model:
            past_df = pd.DataFrame(past_model._params['AggregatedData'][
                                       list(past_model._params['AggregatedData'].keys())[0]],
                                   columns=[self._target_index, self._target_metric]).set_index(self._target_index)
            current_agg_data_dict = avg_series_df['raw'].to_dict()
            agg_data_dict = past_df['raw'].to_dict()
            agg_data_dict.update(current_agg_data_dict)
            avg_series_df = pd.DataFrame({'raw': agg_data_dict})

        # Performing the adfuller test over the raw and the aggregated data to test for stationarity
        adf_pvalue_raw_series = adfuller(training_data_flattened)[1]
        adf_pvalue_avg_series = adfuller(avg_series)[1]
        detrend_flag = adf_pvalue_raw_series > significance_level or adf_pvalue_avg_series > significance_level

        detrend_order = 0

        if detrend_method == 'diff':
            training_data_sliced_stationarized = training_data_sliced
        elif detrend_method == 'modeling':
            if not detrend_flag:
                return training_data_sliced, detrend_order, agg_data_model, agg_data_model

        # Take the difference until the differenced data is stationary based on the adfuller test
        while detrend_flag and detrend_order < detrend_order_max:
            detrend_order = detrend_order + 1
            if detrend_method == 'diff':
                for i in range(0, len(training_data_sliced)):
                    training_data_sliced_stationarized[i] = np.diff(training_data_sliced_stationarized[i])
                training_data_stationarized_flattened = list(chain.from_iterable(training_data_sliced_stationarized))
                avg_series = np.array(training_data_sliced_stationarized).mean(0).tolist()
                adf_pvalue_raw_series = adfuller(training_data_stationarized_flattened)[1]
                adf_pvalue_avg_series = adfuller(avg_series)[1]
                detrend_flag = (adf_pvalue_raw_series > significance_level
                                or adf_pvalue_avg_series > significance_level)
            elif detrend_method == 'modeling':
                training_data_sliced_stationarized = (np.array(training_data_sliced)
                                                      / (np.array(avg_series).reshape(-1, 1))).tolist()

                agg_struct_model_config = {"include_holidays_exog": 1, "is_log_transformed": 0, "max_ft_freq": 3,
                                           "p": 3, "q": 3}
                de_obj = DataExploration(freq='D', data_shift_truncate=False, is_log_transformed=False, fill_rate=0.9)
                avg_series_df = avg_series_df.groupby(level=0).max()
                agg_cleaned_data, pre_prc = de_obj.profile(avg_series_df)
                if pre_prc['success']:
                    lad_struct_obj = LADStructuralModel(hyper_params=agg_struct_model_config, freq='D')
                    success, model_date, agg_data_model = lad_struct_obj.train(data=agg_cleaned_data, **pre_prc)

                    if not success:
                        if 'AR coefficients' in agg_data_model._params['ErrorMessage']:
                            agg_struct_model_config = {"include_holidays_exog": 1, "is_log_transformed": 0,
                                                       "max_ft_freq": 3, "p": 0, "q": 1}
                            lad_struct_obj = LADStructuralModel(hyper_params=agg_struct_model_config, freq='D')
                            success, model_date, agg_data_model = lad_struct_obj.train(data=agg_cleaned_data,
                                                                                       **pre_prc)
                        elif 'MA coefficients' in agg_data_model._params['ErrorMessage']:
                            agg_struct_model_config = {"include_holidays_exog": 1, "is_log_transformed": 0,
                                                       "max_ft_freq": 3, "p": 1, "q": 0}
                            lad_struct_obj = LADStructuralModel(hyper_params=agg_struct_model_config, freq='D')
                            success, model_date, agg_data_model = lad_struct_obj.train(data=agg_cleaned_data,
                                                                                       **pre_prc)

                    agg_data_model = agg_data_model if success else None

                detrend_flag = False

        return training_data_sliced_stationarized, detrend_order, \
            agg_data_model, avg_series_df.reset_index().values.tolist()

    def _ma_detrender(self, series=None, padded_series=None, ma_window_length=None):
        """
        This function detrends the values from a target time window w.r.t a padding around the target window.
        Note: This function is only used for detrending the scoring window.

        :param list series: The series containing the values from the scoring window.
        :param list padded_series: The series containing the values for the padded scoring window
        :param int ma_window_length: Size of the padding
        :return: Returns 'series' after removing the trend
        :rtype: list
        """
        import numpy as np

        moving_averages = []

        iter_length = len(padded_series) - ma_window_length
        for i in range(0, iter_length):
            ma_current = np.mean(padded_series[i:i + ma_window_length])
            moving_averages.append(ma_current)

        stationarized_series = (np.array(series) / np.array(moving_averages)).tolist()

        return stationarized_series

    def _detect_window_size(self, series=None, streaming=False):
        """
        This function detects the ideal window size based on the seasonality pattern of the data.

        :param pandas.DataFrame series: The input sequence of data.
        :param bool streaming: Flag to update the logic for streaming anomaly detection models.
        :return: An int containing the optimal window size
        :rtype: int
        """
        import numpy as np

        n = len(series)
        if not streaming:
            series = np.diff(series, 1)

        # Generating the indices based on odd and even number of terms in the time series
        if int(n) % 2 != 0:
            all_idx = np.arange(1, n // 2 + 1)
        else:
            all_idx = np.arange(1, n // 2)

        # Performing Fourier transformation
        yf = np.real(np.fft.rfft(series))

        # Spectral density of the Fourier transformation (to identify the significant frequencies)
        psd = abs(yf[all_idx]) ** 2 + abs(yf[-all_idx]) ** 2
        sig_freq_idx = np.argsort(psd[: int(len(psd) / 2)])

        if streaming:
            return int(np.rint(float(n) / max(1, sig_freq_idx[-1] + 1)))

        return min(self.tc_max_window_length,
                   max(int(np.rint(float(n) / max(1, sig_freq_idx[-1] + 1))),
                       int(np.rint(float(n) / max(1, sig_freq_idx[-2] + 1)))))

    def _local_minima(self, input_dict=None, window_length=None):
        """
        This function finds the indices corresponding to the local minima for detected consecutive trend changes.

        :param dict input_dict: A dictionary containing the timestamps as keys for potential trend changes and the
            values being the corresponding p-values
        :param int window_length: The size of the sub windows for input data segmentation.
        :return: List of local minima
        :rtype: list
        """
        import numpy as np
        import collections

        ordered_dict = collections.OrderedDict(sorted(input_dict.items()))
        key_list = np.array(list(ordered_dict.keys()))
        value_list = np.array(list(ordered_dict.values()))

        diff2_series = np.diff(np.sign(np.diff(value_list)))
        local_min_loc = np.where(diff2_series > 0)[0]
        local_min_loc = local_min_loc + 1 if len(local_min_loc) > 0 else np.zeros(1)

        if len(local_min_loc) > 1:
            min_keys = [key_list[int(loc)] for loc in local_min_loc]
        elif len(local_min_loc) <= 1:
            min_keys = [key_list[0], key_list[-1]] if len(input_dict) > window_length \
                else [key_list[int(loc)] for loc in local_min_loc]

        return min_keys

    def _shift_intensity(self, change_points=None, df=None, metric=None):
        """
        This function computes the Kullback-Leibler divergence of the time series around a changepoint detected by
        the _pelt_change_point_detection() function. This assumes a Gaussian distribution for the underlying data.

        :param list change_points: A list storing indices of the potential change points
        :param pandas.DataFrame df: A pandas dataframe containing the time series, ignoring the top 5% volatility
        :param str metric: A string in the dataframe column names that contains the time series
        :return: A list containing the magnitude of change for every corresponding change point
        :rtype: list
        """
        import numpy as np

        min_changepoint_padding_length = self.min_changepoint_padding_length
        mag_change = []
        float_min = 1e-10
        c_count = 0

        # This loop iterates over all the change points obtained through PELT
        for dates in change_points:
            # KL divergence is measured for the datapoints on the left side of the change point versus the right side
            # of the change point.
            if len(change_points) == 1:
                window_l = df[metric].iloc[:change_points[c_count]].values
                window_r = df[metric].iloc[change_points[c_count]:].values
            elif c_count == 0:
                window_l = df[metric].iloc[:change_points[c_count]].values
                window_r = df[metric].iloc[change_points[c_count]:change_points[c_count + 1]].values
            elif c_count < len(change_points) - 1:
                window_l = df[metric].iloc[change_points[c_count - 1]:change_points[c_count]].values
                window_r = df[metric].iloc[change_points[c_count]:change_points[c_count + 1]].values
            else:
                window_l = df[metric].iloc[change_points[c_count - 1]:change_points[c_count]].values
                window_r = df[metric].iloc[change_points[c_count]:].values

            if len(window_r) <= min_changepoint_padding_length:
                mag_change.append(0)
            else:
                window = np.concatenate((window_l, window_r), axis=0)

                # Mean and standard deviation of the data from the combined window
                w_mean = np.mean(window)
                w_std = np.std(window, ddof=1) if len(window) > 1 else float_min

                # Mean and standard deviation of the data from the window after the change point
                wr_mean = np.mean(window_r)
                wr_std = np.std(window_r, ddof=1) if len(window_r) > 1 else float_min

                # Kullback-Leibler divergence between two normal densities
                mag_change.append(
                    np.log(wr_std / w_std) + ((w_std ** 2 + (w_mean - wr_mean) ** 2) / (2 * (wr_std ** 2))) - 0.5)

            c_count = c_count + 1

        return mag_change

    def _pelt_change_point_detection(self, df=None, metric=None, min_ts_length=None, max_ts_length=None):
        """
        This function computes the significant change points based on PELT and the Kullback-Leibler divergence method.

        :param pandas.DataFrame df: A pandas dataframe containing the time series
        :param str metric: The metric in the dataframe that contains the time series
        :param int min_ts_length: Specifying the minimum required length of the time series for training
        :param int max_ts_length: Specifying the maximum required length of the time series for training. The
            training time series length truncates accordingly based on the minimum between max_ts_length and the
            length of the input time series.
        :return: A pandas dataframe containing the time series after the last changepoint
        :rtype: tuple

        >>> df
                          raw  interpolated
        2016-01-02  1753421.0     14.377080
        2016-01-03  1879108.0     14.446308
        2016-01-04  1462725.0     14.195812
        2016-01-05  1525162.0     14.237612
        2016-01-06  1424264.0     14.169166
        ...               ...           ...
        2018-10-14  2185230.0     14.597232
        2018-10-15  1825539.0     14.417386
        2018-10-16  1776778.0     14.390313
        2018-10-17  1792899.0     14.399345
        2018-10-18  1738657.0     14.368624

        >>> copy_df, change_point_list
        (                  raw  interpolated
        2016-01-02  1753421.0     14.377080
        2016-01-03  1879108.0     14.446308
        2016-01-04  1462725.0     14.195812
        2016-01-05  1525162.0     14.237612
        2016-01-06  1424264.0     14.169166
        ...               ...           ...
        2018-10-14  2185230.0     14.597232
        2018-10-15  1825539.0     14.417386
        2018-10-16  1776778.0     14.390313
        2018-10-17  1792899.0     14.399345
        2018-10-18  1738657.0     14.368624

        [1021 rows x 2 columns], ['2016-12-26 00:00:00', '2018-09-10 00:00:00'])
        """
        import numpy as np
        import pandas as pd
        from changepy import pelt
        from changepy.costs import normal_var

        change_point_threshold = self.change_point_threshold

        df_copy = pd.DataFrame(df[metric])

        counts = df_copy[metric].values
        mean = np.mean(counts)

        # Performing changepoint detection with respect to the data variability shift through PELT
        cdate = pelt(normal_var(counts, mean), len(counts))

        # If PELT detects the first datapoint to be a change point, then we ignore that change point
        if cdate:
            if cdate[0] == 0:
                cdate.remove(0)

        if len(cdate) > 0:
            # Finding the magnitude of divergence around every change point (detected by PELT) by comparing the
            # distributions of the data points on the left and the right side of the change point
            entrp = self._shift_intensity(change_points=cdate, df=df_copy, metric=metric)
            df_change_points = pd.DataFrame({'c_point': cdate, 'entropy': entrp})

            # Narrowing down to the change points which satisfy a required lower bound of divergence
            df_change_points = df_change_points[df_change_points['entropy'] > change_point_threshold]
            cdate = df_change_points['c_point'].values

            # Set the start date of the time series based on the min_ts_length, the max_ts_length and the change
            # points
            if len(cdate) > 0:
                df_subset = df_copy.iloc[cdate]
                change_point_list = [i.__str__() for i in df_subset.index]
                index = df_subset.index[-1]
                copy_df = df.loc[index: df.last_valid_index()] if self.data_shift_truncate else df
                if copy_df.shape[0] < min_ts_length:
                    # Return None if the time series after the change point contains fewer data points
                    # than the minimum required length
                    return None, change_point_list
                elif copy_df.shape[0] < max_ts_length:
                    # Return the time series after the change point if its length lies between the minimum and the
                    # maximum required length
                    pass
                elif copy_df.shape[0] > max_ts_length:
                    # Truncate the time series after the change point if it contains more data than required for
                    # training
                    copy_df = df.iloc[-max_ts_length:]
            else:
                change_point_list = None
                if df.shape[0] < min_ts_length:
                    return None, change_point_list
                else:
                    if df.shape[0] < max_ts_length:
                        copy_df = df
                    else:
                        copy_df = df.iloc[-max_ts_length:]
        else:
            change_point_list = None
            if df.shape[0] < min_ts_length:
                return None, change_point_list
            else:
                if df.shape[0] < max_ts_length:
                    copy_df = df
                else:
                    copy_df = df.iloc[-max_ts_length:]

        return copy_df, change_point_list

    def _trend_changes(self, input_df=None, value_column=None):
        """
        This function detects the trend changes of the input time series.

        :param pandas.DataFrame input_df: The input sequence of data.
        :param str value_column: A string containing the column name of the target series
        :return: list of strings with the potential trend changes.
        :rtype: list[str]

        >>> input_df
                          raw  interpolated
        2016-01-02  1753421.0     14.377080
        2016-01-03  1879108.0     14.446308
        2016-01-04  1462725.0     14.195812
        2016-01-05  1525162.0     14.237612
        2016-01-06  1424264.0     14.169166
        ...               ...           ...
        2018-10-14  2185230.0     14.597232
        2018-10-15  1825539.0     14.417386
        2018-10-16  1776778.0     14.390313
        2018-10-17  1792899.0     14.399345
        2018-10-18  1738657.0     14.368624

        >>> global_trend_changes
        ['2016-04-16 00:00:00', '2016-05-28 00:00:00', '2016-08-06 00:00:00', '2016-11-05 00:00:00',
        '2016-12-31 00:00:00', '2017-02-25 00:00:00', '2017-03-25 00:00:00', '2017-06-03 00:00:00',
        '2017-07-01 00:00:00', '2017-08-26 00:00:00', '2017-09-30 00:00:00', '2017-10-28 00:00:00',
        '2017-12-23 00:00:00', '2018-02-17 00:00:00', '2018-03-17 00:00:00', '2018-05-19 00:00:00',
        '2018-06-30 00:00:00', '2018-09-08 00:00:00']
        """
        import numpy as np
        from scipy import stats
        from statsmodels.tsa.stattools import acf

        min_float = 1e-10
        window_length = self.tc_window_length
        sig_level = self.sig_level

        series = input_df[value_column].tolist()
        timestamps = input_df.index.tolist()

        if not window_length:
            window_length = self._detect_window_size(series=series)

        # Creating a crude estimation of the required window size
        nwindows = int(window_length * 1.5)

        current_mid_point = window_length * nwindows
        past_trend_change = 0
        global_trend_changes = []
        local_trend_changes = {}
        past_p_value = -1

        # If the remaining part of the time series is less than (window_length * nwindows), we terminate the loop
        current_reminder = len(series) if (current_mid_point + (window_length * nwindows)) < len(series) else 0

        while current_reminder >= window_length:
            # Creating the left and the right window for slope detection
            l_window = series[current_mid_point - (window_length * nwindows): current_mid_point]
            r_window = series[current_mid_point: current_mid_point + (window_length * nwindows)]
            l_window_length = len(l_window)
            r_window_length = len(r_window)
            N = l_window_length + r_window_length

            # Finding the effective degrees of freedom
            auto_corr = acf(l_window + r_window, nlags=N)
            auto_corr[np.isnan(auto_corr)] = 1
            eff_df = 0
            for i in range(1, N):
                eff_df = eff_df + (((N - i) / float(N)) * auto_corr[i])
            eff_df = max(1, int(np.rint(1 / ((1 / float(N)) + ((2 / float(N)) * eff_df)))) - 4)

            # Creating the left and right indices for running the regression
            l_x, r_x = np.arange(l_window_length), np.arange(r_window_length)

            # Linear regression on the left and the right window
            l_slope, l_intercept, l_r_value, l_p_value, l_std_err = stats.linregress(l_x, l_window)
            r_slope, r_intercept, r_r_value, r_p_value, r_std_err = stats.linregress(r_x, r_window)

            # t-test for slope shift
            l_window_hat = (l_slope * l_x) + l_intercept
            r_window_hat = (r_slope * r_x) + r_intercept

            l_sse = np.sum((l_window - l_window_hat) ** 2)
            r_sse = np.sum((r_window - r_window_hat) ** 2)

            l_const = np.sum((np.arange(1, l_window_length + 1) - ((l_window_length + 1) / 2.0)) ** 2)
            r_const = np.sum((np.arange(1, r_window_length + 1) - ((r_window_length + 1) / 2.0)) ** 2)
            prop_const = (l_const * r_const) / (l_const + r_const)

            total_sse = l_sse + r_sse
            std_err = max(np.sqrt(total_sse / (prop_const * (l_window_length + r_window_length - 4))), min_float)
            t_stat = abs(l_slope - r_slope) / std_err
            p_value = (1 - stats.t.cdf(t_stat, df=eff_df)) * 2

            if p_value < sig_level:
                # Check if the same shift is detected multiple times
                if p_value == past_p_value:
                    local_trend_changes.pop(past_trend_change)
                if current_reminder - window_length < window_length:
                    if len(local_trend_changes) > 2:
                        # _local_minima function is called to detect the optimal trend change(s) among a group of
                        # local trend changes
                        current_trend_change = self._local_minima(input_dict=local_trend_changes,
                                                                  window_length=window_length)
                        for key in current_trend_change:
                            global_trend_changes.append(timestamps[key].__str__())
                else:
                    # Handling the trend changes at the tail of the time series
                    local_trend_changes[current_mid_point] = p_value
                past_trend_change = current_mid_point
                past_p_value = p_value
            else:
                # Handling the trend changes at the tail of the time series
                if (current_mid_point - past_trend_change) <= window_length and len(local_trend_changes) > 2:
                    current_trend_change = self._local_minima(input_dict=local_trend_changes,
                                                              window_length=window_length)
                    for key in current_trend_change:
                        global_trend_changes.append(timestamps[key].__str__())
                    local_trend_changes = {}

            current_mid_point = current_mid_point + window_length
            current_reminder = len(series) - current_mid_point

        return global_trend_changes
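    # A simplified sketch of the slope-shift test performed inside the loop above (illustrative only; it omits the
    # autocorrelation-based effective degrees of freedom and uses a crude pooled standard error on two hypothetical
    # windows with different slopes):
    #
    # >>> import numpy as np
    # >>> from scipy import stats
    # >>> l_window = np.arange(20) * 0.5 + np.random.normal(0, 0.1, 20)   # slope around 0.5
    # >>> r_window = np.arange(20) * 2.0 + np.random.normal(0, 0.1, 20)   # slope around 2.0
    # >>> l_fit = stats.linregress(np.arange(20), l_window)
    # >>> r_fit = stats.linregress(np.arange(20), r_window)
    # >>> t_stat = abs(l_fit.slope - r_fit.slope) / max(l_fit.stderr, r_fit.stderr)
    # >>> p_value = (1 - stats.t.cdf(t_stat, df=2 * 20 - 4)) * 2   # a small p-value flags a candidate trend change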
    def kf_naive_outlier_detection(self, input_series, idx_position):
        """
        This function detects an outlier for the specified index position of the series.

        :param numpy.array input_series: Input time series
        :param int idx_position: Target index position
        :return: Anomaly flag
        :rtype: bool

        >>> input_series = [110, 119, 316, 248, 451, 324, 241, 275, 381]
        >>> self.kf_naive_outlier_detection(input_series, 6)
        False
        """
        import numpy as np
        from pykalman import KalmanFilter

        kf = KalmanFilter()

        truncated_series = input_series[-(self.min_ts_length * 3):] \
            if idx_position == len(input_series) - 1 else input_series
        idx_position = len(truncated_series) - 1

        filtered_state_means, filtered_state_covariance = kf.em(truncated_series).filter(truncated_series)

        residuals = truncated_series - filtered_state_means[:, 0]

        # Catching marginal anomalies to avoid them during training
        is_anomaly = residuals[idx_position] < np.mean(residuals) \
            - (1 * np.sqrt(filtered_state_covariance)[idx_position][0][0]) \
            or residuals[idx_position] > np.mean(residuals) \
            + (1 * np.sqrt(filtered_state_covariance)[idx_position][0][0])

        return is_anomaly
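    # A hedged usage sketch for kf_naive_outlier_detection (illustrative; the series is hypothetical). The returned
    # flag indicates whether the Kalman filter residual at the target position falls outside a one-standard-deviation
    # band around the mean residual:
    #
    # >>> import numpy as np
    # >>> series = np.array([120, 124, 118, 122, 119, 121, 123, 380])
    # >>> de_obj = DataExploration(freq='D')
    # >>> de_obj.kf_naive_outlier_detection(input_series=series, idx_position=len(series) - 1)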
    def _truncate_by_data_gaps(self, df, target_metric):
        """
        This function truncates the time series after large data gaps.

        :param pandas.DataFrame df: Input time series in pandas data frame
        :param str target_metric: Target value column in the input data frame
        :return: Truncated pandas data frame
        :rtype: pandas.DataFrame
        """
        import numpy as np

        max_data_gap = abs(self.min_ts_length / 3.0)

        gap_len = 0
        last_avl_idx = None
        for row in df[::-1].iterrows():
            if np.isnan(row[1][target_metric]) or row[1][target_metric] is None:
                gap_len = gap_len + 1
            else:
                gap_len = 0
                last_avl_idx = row[0]
            if gap_len >= max_data_gap and last_avl_idx:
                truncated_df = df[last_avl_idx:]
                return truncated_df

        return df

    def _prepare(self, df, impute_only, streaming=False, **kwargs):
        """
        This function performs basic data preparation before performing a full profiling.

        :param pandas.DataFrame/list df: Input time series in pandas data frame or in list format
        :param bool impute_only: A flag to decide whether to return right after imputation
        :param streaming: A flag to identify the logic based on streaming vs non-streaming data
        :param kwargs: Other input parameters
        :return: Pandas dataframe with prepared data (with identified frequency for streaming)
        :rtype: tuple
        """
        import pandas as pd

        min_ts_length = self.min_ts_length
        max_ts_length = self.max_ts_length
        target_metric = 'raw'
        imputed_metric = 'interpolated'

        # If the input dimension/metric combination time series is null, skip modeling
        if len(df) == 0:
            raise ValueError("No model ran because dimension/metric combination is null")

        if not streaming and len(df) < min_ts_length:
            raise ValueError("Current time series length of {}{} is less than the minimum required "
                             "length {}{} for training".format(len(df), self.freq, min_ts_length, self.freq))

        if isinstance(df, list):
            df = (pd.DataFrame(df, columns=[self._target_index, self._target_metric]).set_index(self._target_index))

        if not self.freq:
            if streaming and len(df) > 1:
                freq = (pd.DatetimeIndex(df.index[1:]) - pd.DatetimeIndex(df.index[:-1])).value_counts().index[0]
        else:
            freq = self.freq

        freq_delta = pd.Timedelta("1" + freq) if not any(char.isdigit() for char in str(freq)) else pd.Timedelta(freq)
        df.index = pd.DatetimeIndex(df.index)
        df = self.add_missing_index(df=df, freq=freq_delta)

        if not streaming:
            df = df.iloc[-min(max_ts_length, len(df)):]

            df = self._truncate_by_data_gaps(df=df, target_metric=target_metric)

            if len(df) < min_ts_length:
                raise ValueError("Due to a recent data gap, training is waiting for more data to populate")

        if not streaming and len(df) < min_ts_length:
            raise ValueError("The training data contains continuous missing data near the end. More stable "
                             "data are required to train")

        if streaming and 'impute_zero' in kwargs and kwargs['impute_zero']:
            df = df.fillna(0)
        else:
            df = self._kalman_smoothing_imputation(df=df, target_metric=target_metric, imputed_metric=imputed_metric,
                                                   impute_only=impute_only)

        if streaming:
            return df, freq
        else:
            return df
    def profile(self, df, impute_only=False, **kwargs):
        """
        This function performs required data profiling and pre-processing before hyperparameter optimization or
        time series model training.

        :param list/pandas.DataFrame df: Input time series.
        :param bool impute_only: Flag to perform preprocessing until imputation OR full preprocessing.
        :return: Preprocessed dataframe with batch data summary.
        :rtype: tuple[pandas.DataFrame, dict]

        >>> de_obj = DataExploration(freq='D', data_shift_truncate=1, is_log_transformed=0, fill_rate=0.9)
        >>> data
                       raw
        index
        2020-01-01  1326.0
        2020-01-02  1552.0
        2020-01-03  1432.0
        2020-01-04  1470.0
        2020-01-05  1565.0
        ...            ...
        2020-06-03  1934.0
        2020-06-04  1873.0
        2020-06-05  1674.0
        2020-06-06  1747.0
        2020-06-07  1782.0

        >>> data, summary = de_obj.profile(data)
        >>> data, summary
        (               raw  interpolated
        2020-03-16  1371.0        1371.0
        2020-03-17  1325.0        1325.0
        2020-03-18  1318.0        1318.0
        2020-03-19  1270.0        1270.0
        2020-03-20  1116.0        1116.0
        ...            ...           ...
        2020-06-03  1934.0        1934.0
        2020-06-04  1873.0        1873.0
        2020-06-05  1674.0        1674.0
        2020-06-06  1747.0        1747.0
        2020-06-07  1782.0        1782.0

        [84 rows x 2 columns], {'success': True, 'trend_change_list': ['2020-04-01 00:00:00'], 'change_point_list':
        ['2020-03-16 00:00:00'], 'is_log_transformed': 0, 'min_ts_mean': None, 'ts_start': '2020-01-01 00:00:00',
        'ts_end': '2020-06-07 00:00:00'})
        """
        import numpy as np

        min_ts_length = self.min_ts_length
        max_ts_length = self.max_ts_length
        is_log_transformed = self.is_log_transformed
        target_metric = 'raw'
        imputed_metric = 'interpolated'

        try:
            processed_df = self._prepare(df, impute_only)

            if impute_only:
                summary = {'success': True}
                return processed_df, summary

            if self.freq == 'D':
                train_end_anomaly_flag = self.kf_naive_outlier_detection(
                    input_series=processed_df['interpolated'].values,
                    idx_position=len(processed_df['interpolated']) - 1)
                if train_end_anomaly_flag:
                    processed_df = processed_df.iloc[:-1]

            # We want to make sure the time series does not contain any negatives in case of log transformation
            if is_log_transformed and not processed_df[processed_df[target_metric] < 0].empty:
                is_log_transformed = False

            if is_log_transformed:
                min_ts_mean = np.log(max(self.min_ts_mean, 1)) if self.min_ts_mean is not None else None
            else:
                min_ts_mean = self.min_ts_mean

            if np.sum(np.isnan(processed_df[target_metric].values[-(2 * min_ts_length):])) / float(2 * min_ts_length) \
                    > 1 - self.fill_rate:
                if np.sum(np.isnan(processed_df[target_metric].values[-min_ts_length:])) > 0:
                    raise ValueError('Too few observed data near the prediction date')

            if 'ts_start' not in kwargs:
                ts_start = processed_df.index.min()
            else:
                ts_start = max(processed_df.index.min(), kwargs['ts_start'])

            if 'ts_end' not in kwargs:
                ts_end = processed_df.index.max()
            else:
                ts_end = min(processed_df.index.max(), kwargs['ts_end'])

            processed_df = processed_df[ts_start:]
            processed_df = processed_df.loc[:ts_end]

            if is_log_transformed:
                processed_df[imputed_metric] = np.log(processed_df[imputed_metric] + 1) \
                    if is_log_transformed else processed_df[imputed_metric]

            data, change_point_list = self._pelt_change_point_detection(df=processed_df,
                                                                        metric=imputed_metric,
                                                                        min_ts_length=min_ts_length,
                                                                        max_ts_length=max_ts_length)

            trend_change_list = self._trend_changes(input_df=processed_df, value_column=imputed_metric)

            summary = {'success': True,
                       'trend_change_list': trend_change_list if len(trend_change_list) > 0 else None,
                       'change_point_list': change_point_list,
                       'is_log_transformed': is_log_transformed,
                       'min_ts_mean': min_ts_mean,
                       'ts_start': str(ts_start),
                       'ts_end': str(ts_end)}

        except Exception as e:
            # Data exploration failed
            summary = {'success': False, 'ErrorMessage': str(e)}
            data = None

        return data, summary
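    # An end-to-end sketch of how the profile() output is typically consumed (illustrative; it mirrors the pattern
    # used inside _detrender above, and assumes `daily_df` is a hypothetical daily DataFrame with a 'raw' column):
    #
    # >>> from luminaire.model.lad_structural import LADStructuralModel
    # >>> de_obj = DataExploration(freq='D', data_shift_truncate=False, is_log_transformed=False, fill_rate=0.9)
    # >>> cleaned_data, pre_prc = de_obj.profile(daily_df)
    # >>> if pre_prc['success']:
    # ...     model_obj = LADStructuralModel(hyper_params={"include_holidays_exog": 1, "is_log_transformed": 0,
    # ...                                                  "max_ft_freq": 3, "p": 3, "q": 3}, freq='D')
    # ...     success, model_date, trained_model = model_obj.train(data=cleaned_data, **pre_prc)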
    def stream_profile(self, df, impute_only=False, **kwargs):
        """
        This function performs data preparation for streaming data.

        :param list/pandas.DataFrame df: Input time series.
        :param bool impute_only: Flag to perform preprocessing until imputation OR full preprocessing.
        :param kwargs: Other input parameters.
        :return: Prepared pandas dataframe with profile information.
        :rtype: tuple[pandas.DataFrame, dict]
        """
        from random import sample
        import datetime
        import numpy as np
        import pandas as pd
        from scipy import stats

        try:
            processed_df, freq = self._prepare(df, impute_only=impute_only, streaming=True, **kwargs)

            if impute_only:
                return processed_df, None

            training_end = processed_df.index[-1]
            training_end_time = training_end.time()

            train_start_search_flag = True
            idx_date_list = []
            for idx in processed_df.index:
                if idx.time() == training_end_time and train_start_search_flag:
                    delta = "1" + freq if not any(char.isdigit() for char in str(freq)) else freq
                    training_start = idx + pd.Timedelta(delta)
                    training_start_time = training_start.time()
                    train_start_search_flag = False
                if idx.date() not in idx_date_list:
                    idx_date_list.append(idx.date())

            trunc_df = processed_df[training_start: training_end]

            if not self.window_length:
                window_length_list = []
                # If the window size is not specified, the following logic makes several random segments of the
                # time series and obtains a list of optimal window sizes
                for i in range(100):
                    rand_date = sample(idx_date_list, 1)[0]
                    rand_start_idx = pd.Timestamp(datetime.datetime.combine(rand_date, training_start_time))
                    if rand_date in idx_date_list[:int(len(idx_date_list) / 2)]:
                        time_series_i = trunc_df.loc[rand_start_idx:]['interpolated'].values
                    else:
                        time_series_i = trunc_df.loc[:rand_start_idx]['interpolated'].values
                    window_length_i = self._detect_window_size(time_series_i, streaming=True) \
                        if not self.window_length else self.window_length
                    window_length_list.append(window_length_i)

                window_length_list = np.array(window_length_list)

                # From the list of optimal window sizes, if it is a list of constants, we take the constant as the
                # window size. Otherwise, we obtain the window size that is most frequently observed in the list.
                if np.all(window_length_list == min(window_length_list)):
                    window_length = window_length_list[0]
                else:
                    bin_count = max(1, int((max(window_length_list) - min(window_length_list)) / 12))
                    bins = np.linspace(min(window_length_list) - 1, max(window_length_list) + 1, bin_count)
                    if len(bins) == 1:
                        window_length = int(stats.mode(window_length_list).mode[0])
                    else:
                        digitized = np.digitize(window_length_list, bins)
                        arg_mode = np.argmax([len(window_length_list[digitized == i])
                                              for i in range(1, len(bins))]) + 1
                        window_length = int(stats.mode(window_length_list[digitized == arg_mode]).mode[0])

                if window_length < self.min_window_length:
                    raise ValueError('Training window too small')
                if window_length > self.max_window_length:
                    raise ValueError('Training window too large')

                n_windows = len(trunc_df) // window_length
                if n_windows < self.min_num_train_windows:
                    raise ValueError('Too few training windows')
                if n_windows > self.max_num_train_windows:
                    raise ValueError('Too many training windows')
            else:
                window_length = self.window_length

            summary = {'success': True,
                       'freq': str(freq),
                       'window_length': window_length,
                       'min_window_length': self.min_window_length,
                       'max_window_length': self.max_window_length}

        except Exception as e:
            # Streaming data exploration failed
            summary = {'success': False, 'ErrorMessage': str(e)}
            trunc_df = None

        return trunc_df, summary
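    # A minimal usage sketch for stream_profile (illustrative; `streaming_df` is a hypothetical minute-level
    # DataFrame with a 'raw' column). With freq='T' the sub-window length defaults from __init__, and
    # impute_zero=True fills missing values with zero instead of Kalman imputation:
    #
    # >>> de_obj = DataExploration(freq='T', fill_rate=0.9)
    # >>> trunc_df, summary = de_obj.stream_profile(streaming_df, impute_zero=True)
    # >>> summary['success'], summary['freq'], summary['window_length']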