Source code for analysis_engine.show_dataset

"""
Show an algorithm dataset from file, s3 or redis

Supported Datasets:

- ``SA_DATASET_TYPE_ALGO_READY`` - Algorithm-ready datasets

**Supported environment variables**

::

    # to show debug and trace logging, export ``SHARED_LOG_CFG``
    # set to a debug logger json file. To turn on debugging for this
    # library, you can export this variable to the repo's
    # included file with the command:
    export SHARED_LOG_CFG=/opt/sa/analysis_engine/log/debug-logging.json
"""

import analysis_engine.consts as ae_consts
import analysis_engine.load_dataset as load_dataset
import spylunking.log.setup_logging as log_utils

log = log_utils.build_colorized_logger(name=__name__)


def show_dataset(
        algo_dataset=None,
        dataset_type=ae_consts.SA_DATASET_TYPE_ALGO_READY,
        serialize_datasets=ae_consts.DEFAULT_SERIALIZED_DATASETS,
        path_to_file=None,
        compress=False,
        encoding='utf-8',
        redis_enabled=True,
        redis_key=None,
        redis_address=None,
        redis_db=None,
        redis_password=None,
        redis_expire=None,
        redis_serializer='json',
        redis_encoding='utf-8',
        s3_enabled=True,
        s3_key=None,
        s3_address=None,
        s3_bucket=None,
        s3_access_key=None,
        s3_secret_key=None,
        s3_region_name=None,
        s3_secure=False,
        slack_enabled=False,
        slack_code_block=False,
        slack_full_width=False,
        verbose=False):
    """show_dataset

    Show a supported dataset's internal structure and preview some of
    the values to debug mapping and serialization issues

    :param algo_dataset: optional - already loaded algorithm-ready
        dataset
    :param dataset_type: optional - dataset type
        (default is ``SA_DATASET_TYPE_ALGO_READY``)
    :param serialize_datasets: optional - list of dataset names to
        deserialize in the dataset
    :param path_to_file: optional - path to an algorithm-ready dataset
        in a file
    :param compress: optional - boolean flag for decompressing the
        contents of the ``path_to_file`` if necessary
        (default is ``False`` and algorithms use ``zlib`` for
        compression)
    :param encoding: optional - string for data encoding

    **(Optional) Redis connectivity arguments**

    :param redis_enabled: bool - toggle for auto-caching all datasets
        in Redis
        (default is ``True``)
    :param redis_key: string - key to save the data in redis
        (default is ``None``)
    :param redis_address: Redis connection string format: ``host:port``
        (default is ``localhost:6379``)
    :param redis_db: Redis db to use
        (default is ``0``)
    :param redis_password: optional - Redis password
        (default is ``None``)
    :param redis_expire: optional - Redis expire value
        (default is ``None``)
    :param redis_serializer: not used yet - support for future pickle
        objects in redis
    :param redis_encoding: format of the encoded key in redis

    **(Optional) Minio (S3) connectivity arguments**

    :param s3_enabled: bool - toggle for auto-archiving on Minio (S3)
        (default is ``True``)
    :param s3_key: string - key to save the data in s3
        (default is ``None``)
    :param s3_address: Minio S3 connection string format: ``host:port``
        (default is ``localhost:9000``)
    :param s3_bucket: S3 Bucket for storing the artifacts
        (default is ``dev``) which should be viewable on a browser:
        http://localhost:9000/minio/dev/
    :param s3_access_key: S3 Access key
        (default is ``trexaccesskey``)
    :param s3_secret_key: S3 Secret key
        (default is ``trex123321``)
    :param s3_region_name: S3 region name
        (default is ``us-east-1``)
    :param s3_secure: Transmit using tls encryption
        (default is ``False``)

    **(Optional) Slack arguments**

    :param slack_enabled: optional - boolean for publishing to slack
    :param slack_code_block: optional - boolean for publishing as a
        code block in slack
    :param slack_full_width: optional - boolean for publishing to slack
        using the full width allowed

    **Additional arguments**

    :param verbose: optional - bool for increasing logging
    """
    use_ds = algo_dataset
    if not use_ds:
        log.info(
            f'loading from file={path_to_file} s3={s3_key} '
            f'redis={redis_key}')
        use_ds = load_dataset.load_dataset(
            dataset_type=dataset_type,
            compress=compress,
            encoding=redis_encoding,
            path_to_file=path_to_file,
            s3_key=s3_key,
            s3_address=s3_address,
            s3_bucket=s3_bucket,
            s3_access_key=s3_access_key,
            s3_secret_key=s3_secret_key,
            s3_region_name=s3_region_name,
            s3_secure=s3_secure,
            redis_key=redis_key,
            redis_address=redis_address,
            redis_db=redis_db,
            redis_password=redis_password,
            redis_expire=redis_expire,
            redis_serializer=redis_serializer,
            serialize_datasets=serialize_datasets)
        if not use_ds:
            log.error(
                f'unable to load a dataset from file={path_to_file} '
                f's3={s3_key} redis={redis_key}')
            return None
    # load if not created

    if dataset_type == ae_consts.SA_DATASET_TYPE_ALGO_READY:
        print('-----------------------------------')
        for root_key in use_ds:
            print(root_key)

        all_dates = []
        all_ids = []
        first_node = None
        last_node = None
        end_nodes = []
        for root_key in use_ds:
            second_layer = use_ds[root_key]
            for ds in second_layer:
                if 'date' in ds:
                    if len(all_dates) == 0:
                        print('\ndates found in dataset')
                    cur_date = ds.get(
                        'date',
                        None)
                    if cur_date:
                        print(cur_date)
                        all_dates.append(cur_date)
                if not first_node:
                    first_node = ds
                end_nodes.append(ds)
                last_node = ds
                if 'id' in ds:
                    if len(all_ids) == 0:
                        print('\nids in the file')
                    cur_id = ds.get(
                        'id',
                        None)
                    if cur_id:
                        print(cur_id)
                        all_ids.append(cur_id)

        if first_node and last_node:
            show_first = {}
            for ds_key in first_node:
                if ds_key == 'data':
                    show_first[ds_key] = {}
                    for ds_name in first_node[ds_key]:
                        print(
                            f'first_node has dataset with name: '
                            f'{ds_name}')
                        show_first[ds_key][ds_name] = 'EMPTY_DF'
                        if hasattr(
                                first_node[ds_key][ds_name],
                                'index'):
                            show_first[ds_key][ds_name] = (
                                'pd.DataFrame() rows='
                                f'{len(first_node[ds_key][ds_name].index)}')
                else:
                    show_first[ds_key] = first_node[ds_key]
            print(f'\nfirst node:\n{ae_consts.ppj(show_first)}\n')

            num_records = len(all_ids)
            cur_cell = num_records - 4
            for cur_node in end_nodes[-5:]:
                show_node = {}
                for ds_key in cur_node:
                    if ds_key == 'data':
                        show_node[ds_key] = {}
                        for ds_name in cur_node[ds_key]:
                            show_node[ds_key][ds_name] = 'EMPTY_DF'
                            if hasattr(
                                    cur_node[ds_key][ds_name],
                                    'index'):
                                show_node[ds_key][ds_name] = (
                                    'pd.DataFrame() rows='
                                    f'{len(cur_node[ds_key][ds_name].index)}')
                    else:
                        show_node[ds_key] = cur_node[ds_key]
                # end of show cur_node
                print(
                    f'node={cur_cell}/{num_records} values:'
                    f'\n{ae_consts.ppj(show_node)}\n')
                cur_cell += 1
            # end of end_nodes
        else:
            if not first_node:
                print('missing first node in dataset')
            if not last_node:
                print('missing last node in dataset')

        if len(all_dates) > 0:
            print(
                f'root_keys={use_ds} from {all_dates[0]} '
                f'to {all_dates[-1]}')
        else:
            print(f'root_keys={use_ds} missing dates')
        print('-----------------------------------')

    return use_ds
# end of show_dataset
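

# A minimal usage sketch (not part of the library): it builds a tiny
# in-memory dataset shaped the way the traversal above expects an
# algorithm-ready dataset to look - a dict of root keys (ticker
# symbols) mapping to lists of nodes, each with an 'id', a 'date' and
# a 'data' dict of pandas DataFrames - then previews it with
# show_dataset. The 'SPY' ticker and every value below are
# hypothetical examples, not data shipped with the repo.
if __name__ == '__main__':
    import pandas as pd

    example_algo_ready_ds = {
        'SPY': [
            {
                'id': 'SPY_2019-01-02',
                'date': '2019-01-02',
                'data': {
                    'daily': pd.DataFrame([
                        {'close': 245.0, 'volume': 100000}]),
                    'minute': pd.DataFrame()
                }
            }
        ]
    }

    # passing algo_dataset skips the file/s3/redis load entirely
    show_dataset(
        algo_dataset=example_algo_ready_ds,
        dataset_type=ae_consts.SA_DATASET_TYPE_ALGO_READY,
        verbose=True)

    # alternatively, point at an on-disk algorithm-ready dataset
    # (the path below is a hypothetical placeholder):
    # show_dataset(
    #     path_to_file='/tmp/SPY-latest.json',
    #     compress=False)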