jeudi 25 avril 2019

How to apply the template method pattern in a Python data science process when the exact number of repeating steps is not known in advance

I would like to apply the template method pattern to a data science project in which I need to select or identify target subjects from a large pool of original subjects. I will create tags based on different characteristics of these subjects, e.g., age, sex, disease status, etc.

I would like this code to be reusable for future projects of a similar nature. But all projects are somewhat different, and the criteria for selecting subjects into the final filtered pool differ from one project to another. How do I structure subject_selection_steps in such a way that it is flexible and customizable based on project needs? Currently, I have only included three tags in my code, but I may need more or fewer in different projects.

import sys
from abc import ABC, abstractmethod
import pandas as pd
import datetime
import ctypes
import numpy as np
import random
import pysnooper
import var_creator.var_creator as vc
import feature_tagger.feature_tagger as ft
import data_descriptor.data_descriptor as dd
import data_transformer.data_transformer as dt
import helper_functions.helper_functions as hf
import sec1_data_preparation as data_prep
import sec2_prepped_data_import as prepped_data_import

class SubjectGrouping(ABC):
    '''Template-method base class for a subject-selection pipeline.

    ``subject_selection_steps`` runs a fixed sequence of hooks. In the
    middle of that sequence it runs ``n_tag_steps`` pairs of
    ``CREATE_TAG<i>`` / ``FILTER_SUBJECT_BY_TAG<i>`` hooks, for
    ``i = 1 .. n_tag_steps``.

    Subclasses must implement ``run_data_preparation`` and
    ``import_processed_main_data``. Every other hook defaults to a no-op
    and may be overridden as needed. To run more (or fewer) tag/filter
    pairs, set ``n_tag_steps`` on the subclass and define the matching
    ``CREATE_TAG<i>`` / ``FILTER_SUBJECT_BY_TAG<i>`` methods; for a fully
    custom sequence, override ``_tag_steps``.
    '''

    # Number of CREATE_TAG<i> / FILTER_SUBJECT_BY_TAG<i> pairs executed by
    # subject_selection_steps. The default of 3 matches the no-op hooks
    # defined on this base class.
    n_tag_steps = 3

    def __init__(self):
        pass

    def subject_selection_steps(self):
        '''Run the whole selection pipeline in its fixed order.

        Raises:
            AttributeError: if ``n_tag_steps`` asks for a tag step whose
                ``CREATE_TAG<i>`` or ``FILTER_SUBJECT_BY_TAG<i>`` method
                is not defined.
        '''
        self._pandas_output_setting()
        self.run_data_preparation()
        self.import_processed_main_data()
        self.inject_test_data()
        self.create_all_subject_list()
        # Each tag is created and then immediately used to filter, so the
        # next tag is built on the already-filtered pool.
        for create_tag, filter_by_tag in self._tag_steps():
            create_tag()
            filter_by_tag()
        self.finalize_data()

    def _tag_steps(self):
        '''Return the ordered ``(create, filter)`` bound-method pairs.

        The default implementation looks up ``CREATE_TAG<i>`` and
        ``FILTER_SUBJECT_BY_TAG<i>`` for ``i`` from 1 to ``n_tag_steps``.
        All pairs are resolved before any is run, so a missing method
        raises ``AttributeError`` before the first tag step executes.
        '''
        return [
            (
                getattr(self, 'CREATE_TAG{}'.format(i)),
                getattr(self, 'FILTER_SUBJECT_BY_TAG{}'.format(i)),
            )
            for i in range(1, self.n_tag_steps + 1)
        ]

    def _pandas_output_setting(self):
        '''Set pandas output display setting'''
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 180)

    @abstractmethod
    def run_data_preparation(self):
        '''Run data_preparation_steps from base class'''
        pass

    @abstractmethod
    def import_processed_main_data(self):
        '''Import processed main data'''
        pass

    def inject_test_data(self):
        '''Optional hook for unit tests: inject mock cases that are known to
        fulfill or fail the defined subject selection criteria.'''
        pass

    def create_all_subject_list(self):
        '''Gather all the unique subject ids from all datasets and create a full subject list'''
        pass

    # Default tag/filter hooks: no-ops so a project only overrides the
    # steps it actually needs.
    def CREATE_TAG1(self): pass
    def FILTER_SUBJECT_BY_TAG1(self): pass
    def CREATE_TAG2(self): pass
    def FILTER_SUBJECT_BY_TAG2(self): pass
    def CREATE_TAG3(self): pass
    def FILTER_SUBJECT_BY_TAG3(self): pass

    def finalize_data(self):
        '''Optional hook run last, after all tag/filter steps.'''
        pass

class SubjectGrouping_Project1(SubjectGrouping, data_prep.DataPreparation_Project1):
    '''Project 1 implementation of the subject-selection template.

    Supplies the two abstract hooks of ``SubjectGrouping`` and holds the
    run-time switches in ``control_panel``.
    '''

    def __init__(self):
        # NOTE(review): neither parent __init__ is called here; confirm that
        # DataPreparation_Project1 does not need its own initialisation.
        # Placeholders for the source dataframes.
        # NOTE(review): import_processed_main_data below fills df_d / df_p /
        # df_n / df_r, not these names -- confirm which set is intended.
        self.df_dad = self.df_pc = self.df_nacrs = None
        self.df_pin = self.df_reg = None
        # Placeholders for the final subject groups.
        self.df_final_subject_group1 = None
        self.df_final_subject_group2 = None
        self.df_final_subject_group3 = None
        self.control_panel = dict(
            save_file_switch=False,  # WARNING: Will overwrite existing files
            df_subsampling_switch=True,  # WARNING: Only switch to True when testing
            df_subsampling_n=8999,
            random_seed=888,
            df_remove_dup_switch=True,
            parse_date_switch=True,
            result_printout_switch=True,
            comp_loc='office',
            show_df_n_switch=False,  # To be implemented. Show df length before and after record removal
            done_switch=False,
        )

    def run_data_preparation(self):
        '''Delegate to ``data_preparation_steps``, which is not defined in
        this class or in SubjectGrouping (presumably inherited from
        DataPreparation_Project1).'''
        self.data_preparation_steps()

    def import_processed_main_data(self):
        '''Run the prepped-data importer and keep its dataframes on self.'''
        importer = prepped_data_import.PreppedDataImport_Project1()
        importer.data_preparation_steps()
        importer.prepped_data_import_steps()
        df_dict = importer.return_all_dfs()
        # All keys are looked up before anything is assigned, so a missing
        # key leaves self untouched.
        # NOTE(review): 'DF_P' / self.df_p appears twice; the second
        # assignment simply repeats the first -- verify the intended keys.
        frames = [
            df_dict[key] for key in ('DF_D', 'DF_P', 'DF_N', 'DF_P', 'DF_R')
        ]
        self.df_d, self.df_p, self.df_n, self.df_p, self.df_r = frames
        del importer

# Script entry point: build the Project 1 pipeline and run every step.
if __name__ == '__main__':
    grouping = SubjectGrouping_Project1()
    grouping.subject_selection_steps()

Aucun commentaire:

Enregistrer un commentaire