"""
Build scaler normalized train and test datasets
from a ``pandas.DataFrame`` (like a ``Trading History`` stored in s3)

.. note:: This function will create multiple copies of the data so
    this is a memory intensive call which may overflow the
    available memory on a machine if there are many rows
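
A minimal usage sketch (the ``high``, ``low`` and ``close`` column
names below are illustrative only and not required by this module):

.. code-block:: python

    import pandas as pd
    import analysis_engine.consts as ae_consts
    import analysis_engine.ai.build_datasets_using_scalers as build_utils

    df = pd.DataFrame({
        'high': [10.0, 11.0, 12.0, 13.0],
        'low': [9.0, 9.5, 10.5, 11.0],
        'close': [9.5, 10.5, 11.5, 12.5]})
    res = build_utils.build_datasets_using_scalers(
        train_features=['high', 'low'],
        test_feature='close',
        df=df,
        test_size=0.5,
        seed=42)
    if res['status'] == ae_consts.SUCCESS:
        print(res['x_train'])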
"""

import analysis_engine.consts as ae_consts
import analysis_engine.ai.build_scaler_dataset_from_df as scaler_utils
import spylunking.log.setup_logging as log_utils
import sklearn.model_selection as tt_split

log = log_utils.build_colorized_logger(
    name=__name__)


def build_datasets_using_scalers(
        train_features,
        test_feature,
        df,
        test_size,
        seed,
        min_feature=-1,
        max_feature=1):
    """build_datasets_using_scalers

    Build train and test datasets using a `MinMaxScaler
    <https://scikit-learn.org/stable/
    modules/generated/
    sklearn.preprocessing.MinMaxScaler.html>`__
    for normalizing a dataset before training a deep neural
    network.

    Here's the returned dictionary:

    .. code-block:: python

        res = {
            'status': status,
            'scaled_train_df': scaled_train_df,
            'scaled_test_df': scaled_test_df,
            'scaler_train': scaler_train,
            'scaler_test': scaler_test,
            'x_train': x_train,
            'y_train': y_train,
            'x_test': x_test,
            'y_test': y_test,
        }

    :param train_features: list of strings with all columns
        (features) to train
    :param test_feature: string name of the column to predict.
        This is a single column name in the ``df``
        (which is a ``pandas.DataFrame``).
    :param df: dataframe for building the scaler train and
        test datasets
    :param test_size: percent of the rows to use for the test
        dataset (passed to
        ``sklearn.model_selection.train_test_split``)
    :param seed: random seed for
        ``sklearn.model_selection.train_test_split`` so the
        split is reproducible
    :param min_feature: min scaler range with default ``-1``
    :param max_feature: max scaler range with default ``1``
    """
    status = ae_consts.NOT_RUN
    scaled_train_df = None
    scaled_test_df = None
    scaler_train = None
    scaler_test = None
    x_train = None
    y_train = None
    x_test = None
    y_test = None

    res = {
        'status': status,
        'scaled_train_df': scaled_train_df,
        'scaled_test_df': scaled_test_df,
        'scaler_train': scaler_train,
        'scaler_test': scaler_test,
        'x_train': x_train,
        'y_train': y_train,
        'x_test': x_test,
        'y_test': y_test,
    }

    try:
        log.info(
            f'building scalers '
            f'df.rows={len(df.index)} '
            f'columns={len(list(df.columns.values))} '
            f'train_features={len(train_features)} '
            f'test_feature={test_feature}')

        # make sure the prediction column exists before scaling
        if test_feature not in df:
            log.error(
                f'did not find test_feature={test_feature} in '
                f'df columns={df.columns.values}')
            status = ae_consts.FAILED
            res['status'] = status
            return res
        # make sure every training column exists before scaling
        for single_train_feature in train_features:
            if single_train_feature not in df:
                log.error(
                    f'did not find '
                    f'train_feature={single_train_feature} in '
                    f'df columns={df.columns.values}')
                status = ae_consts.FAILED
                res['status'] = status
                return res

        train_df = df[train_features]
        test_df = df[[test_feature]]

        log.info(
            'building scaled train df')
        scaled_train_res = scaler_utils.build_scaler_dataset_from_df(
            df=train_df,
            min_feature=min_feature,
            max_feature=max_feature)
        log.info(
            'building scaled test df')
        scaled_test_res = scaler_utils.build_scaler_dataset_from_df(
            df=test_df,
            min_feature=min_feature,
            max_feature=max_feature)

        log.info(
            f'scaled df transform '
            f'train_status={scaled_train_res["status"] == ae_consts.SUCCESS} '
            f'test_status={scaled_test_res["status"] == ae_consts.SUCCESS}')

        if scaled_train_res['status'] == ae_consts.SUCCESS \
                and scaled_test_res['status'] == ae_consts.SUCCESS:
            log.info(
                f'scaled train_rows={len(scaled_train_res["df"])} '
                f'test_rows={len(scaled_test_res["df"])}')
            scaler_train = scaled_train_res['scaler']
            scaler_test = scaled_test_res['scaler']
            scaled_train_df = scaled_train_res['df']
            scaled_test_df = scaled_test_res['df']
            # split the scaled features and labels into
            # train and test sets with a reproducible seed
            (x_train,
             x_test,
             y_train,
             y_test) = tt_split.train_test_split(
                scaled_train_df,
                scaled_test_df,
                test_size=test_size,
                random_state=seed)
        else:
            log.error(
                f'failed df transform '
                f'train_status={scaled_train_res["status"]} '
                f'test_status={scaled_test_res["status"]}')
            status = ae_consts.FAILED
            res['status'] = status
            return res
        # if built both train and test successfully

        log.info(
            f'train_rows={len(train_df.index)} '
            f'test_rows={len(test_df.index)} '
            f'x_train={len(x_train)} '
            f'x_test={len(x_test)} '
            f'y_train={len(y_train)} '
            f'y_test={len(y_test)}')

        res['scaled_train_df'] = scaled_train_df
        res['scaled_test_df'] = scaled_test_df
        res['scaler_train'] = scaler_train
        res['scaler_test'] = scaler_test
        res['x_train'] = x_train
        res['y_train'] = y_train
        res['x_test'] = x_test
        res['y_test'] = y_test

        status = ae_consts.SUCCESS
    except Exception as e:
        log.error(
            f'failed with ex={e} '
            f'building scalers '
            f'df.rows={len(df.index)} '
            f'columns={list(df.columns.values)} '
            f'train_features={train_features} '
            f'test_feature={test_feature}')
        status = ae_consts.ERR
    # end of try/ex

    res['status'] = status

    return res
# end of build_datasets_using_scalers
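

if __name__ == '__main__':
    # Minimal demo sketch: the synthetic values and column names below
    # are illustrative only and are not part of this module's API.
    import pandas as pd

    demo_df = pd.DataFrame({
        'high': [10.0, 11.0, 12.0, 13.0, 14.0, 15.0],
        'low': [9.0, 9.5, 10.5, 11.0, 12.0, 13.0],
        'close': [9.5, 10.5, 11.5, 12.5, 13.5, 14.5]})
    demo_res = build_datasets_using_scalers(
        train_features=['high', 'low'],
        test_feature='close',
        df=demo_df,
        test_size=0.5,
        seed=7)
    if demo_res['status'] == ae_consts.SUCCESS:
        # the returned scalers are the fitted scaler objects, so the
        # normalized values can be mapped back to the original range
        restored_close = demo_res['scaler_test'].inverse_transform(
            demo_res['scaled_test_df'])
        log.info(
            f'demo x_train rows={len(demo_res["x_train"])} '
            f'restored close values={restored_close[:2]}')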