# Source code for analysis_engine.ai.build_datasets_using_scalers

"""
Build scaler normalized train and test datasets
from a ``pandas.DataFrame`` (like a ``Trading History`` stored in s3)

.. note:: This function will create multiple copies of the data so
    this is a memory intensive call which may overflow the
    available memory on a machine if there are many rows
"""

import analysis_engine.consts as ae_consts
import analysis_engine.ai.build_scaler_dataset_from_df as scaler_utils
import spylunking.log.setup_logging as log_utils
import sklearn.model_selection as tt_split

# Module-level logger, named after this module, built by
# ``build_colorized_logger``.
log = log_utils.build_colorized_logger(name=__name__)


def build_datasets_using_scalers(
        train_features,
        test_feature,
        df,
        test_size,
        seed,
        min_feature=-1,
        max_feature=1):
    """build_datasets_using_scalers

    Build train and test datasets using a
    `MinMaxScaler <https://scikit-learn.org/stable/
    modules/generated/
    sklearn.preprocessing.MinMaxScaler.html>`__ for normalizing a dataset
    before training a deep neural network.

    The ``train_features`` columns and the ``test_feature`` column are
    each passed to ``build_scaler_dataset_from_df`` and the two scaled
    results are then split together with
    ``sklearn.model_selection.train_test_split``.

    Here's the returned dictionary:

    .. code-block:: python

        res = {
            'status': status,
            'scaled_train_df': scaled_train_df,
            'scaled_test_df': scaled_test_df,
            'scaler_train': scaler_train,
            'scaler_test': scaler_test,
            'x_train': x_train,
            'y_train': y_train,
            'x_test': x_test,
            'y_test': y_test,
        }

    Where:

    - ``scaled_train_df`` / ``scaler_train`` come from scaling the
      ``train_features`` columns
    - ``scaled_test_df`` / ``scaler_test`` come from scaling the single
      ``test_feature`` column (the value to predict)
    - ``x_train`` / ``x_test`` are the split of ``scaled_train_df``
    - ``y_train`` / ``y_test`` are the split of ``scaled_test_df``

    The ``status`` value is one of:

    - ``SUCCESS`` - all datasets were built
    - ``FAILED`` - a requested column is missing from the ``df`` or
      one of the scaler builds did not return ``SUCCESS``
    - ``ERR`` - an exception was caught while building the datasets

    On ``FAILED`` and ``ERR`` every key other than ``status``
    is ``None``.

    :param train_features: list of strings with all columns (features)
        to train
    :param test_feature: string name of the column to predict.
        This is a single column name in the ``df``
        (which is a ``pandas.DataFrame``).
    :param df: dataframe to build scaler test and train datasets
    :param test_size: passed as ``test_size`` to
        ``train_test_split`` (size of the test split)
    :param seed: passed as ``random_state`` to ``train_test_split``
        so the split is reproducible
    :param min_feature: min scaler range
        with default ``-1``
    :param max_feature: max scaler range
        with default ``1``
    """

    res = {
        'status': ae_consts.NOT_RUN,
        'scaled_train_df': None,
        'scaled_test_df': None,
        'scaler_train': None,
        'scaler_test': None,
        'x_train': None,
        'y_train': None,
        'x_test': None,
        'y_test': None,
    }

    try:
        log.info(
            f'building scalers '
            f'df.rows={len(df.index)} '
            f'columns={len(list(df.columns.values))} '
            f'train_features={len(train_features)} '
            f'test_feature={test_feature}')

        # validate every requested column exists before slicing the df
        if test_feature not in df:
            log.error(
                f'did not find test_feature={test_feature} in '
                f'df columns={df.columns.values}')
            res['status'] = ae_consts.FAILED
            return res
        for single_train_feature in train_features:
            if single_train_feature not in df:
                log.error(
                    f'did not find '
                    f'train_feature={single_train_feature} in '
                    f'df columns={df.columns.values}')
                res['status'] = ae_consts.FAILED
                return res

        train_df = df[train_features]
        # double brackets keep the predicted column as a one-column
        # DataFrame instead of a Series
        test_df = df[[test_feature]]

        # NOTE(review): both scalers are built from all rows of the df
        # before the train/test split below. Presumably
        # build_scaler_dataset_from_df fits the scaler on the data it is
        # given, in which case the held-out rows influence the scaling
        # range - confirm this is intended.
        log.info(
            'building scaled train df')
        scaled_train_res = scaler_utils.build_scaler_dataset_from_df(
            df=train_df,
            min_feature=min_feature,
            max_feature=max_feature)

        log.info(
            'building scaled test df')
        scaled_test_res = scaler_utils.build_scaler_dataset_from_df(
            df=test_df,
            min_feature=min_feature,
            max_feature=max_feature)

        train_ok = scaled_train_res['status'] == ae_consts.SUCCESS
        test_ok = scaled_test_res['status'] == ae_consts.SUCCESS
        log.info(
            f'scaled df transform '
            f'train_status={train_ok} '
            f'test_status={test_ok}')

        if not (train_ok and test_ok):
            log.error(
                f'failed df transform '
                f'train_status={scaled_train_res["status"]} '
                f'test_status={scaled_test_res["status"]}')
            res['status'] = ae_consts.FAILED
            return res
        # if either scaler build failed

        log.info(
            f'scaled train_rows={len(scaled_train_res["df"])} '
            f'test_rows={len(scaled_test_res["df"])}')

        scaled_train_df = scaled_train_res['df']
        scaled_test_df = scaled_test_res['df']
        # train_test_split returns the splits grouped per input:
        # (features_train, features_test, target_train, target_test)
        (x_train,
         x_test,
         y_train,
         y_test) = tt_split.train_test_split(
            scaled_train_df,
            scaled_test_df,
            test_size=test_size,
            random_state=seed)

        log.info(
            f'train_rows={len(train_df.index)} '
            f'test_rows={len(test_df.index)} '
            f'x_train={len(x_train)} '
            f'x_test={len(x_test)} '
            f'y_train={len(y_train)} '
            f'y_test={len(y_test)}')

        # only publish the datasets once everything above succeeded so
        # a failure never returns a partially-filled result
        res['scaled_train_df'] = scaled_train_df
        res['scaled_test_df'] = scaled_test_df
        res['scaler_train'] = scaled_train_res['scaler']
        res['scaler_test'] = scaled_test_res['scaler']
        res['x_train'] = x_train
        res['y_train'] = y_train
        res['x_test'] = x_test
        res['y_test'] = y_test
        res['status'] = ae_consts.SUCCESS

    except Exception as e:
        # The df may itself be the cause of the failure (e.g. ``None``
        # or not a DataFrame), so describe it defensively: this handler
        # must never raise, otherwise callers get an exception instead
        # of the documented ``ERR`` status.
        try:
            df_rows = len(df.index)
            df_columns = list(df.columns.values)
        except Exception:
            df_rows = None
            df_columns = None
        log.error(
            f'failed with ex={e} '
            f'building scalers '
            f'df.rows={df_rows} '
            f'columns={df_columns} '
            f'train_features={train_features} '
            f'test_feature={test_feature}')
        res['status'] = ae_consts.ERR
    # try/ex

    return res
# end of build_datasets_using_scalers