# Source code for machine_learning_tools.preprocessing_ml

'''



Functions used on data before
models analyze the data

Application: 
1) Lasso Linear Regression should have all columns on the same scale
so they get regularized the same amount


Useful link explaining the different sklearn scalers for preprocessing: 

http://benalexkeen.com/feature-scaling-with-scikit-learn/



'''
import pandas as pd
import sklearn.preprocessing as sklpre

# Short alias -> sklearn.preprocessing class used to rescale feature columns
available_scalars = {
    # standardizes each column to zero mean and unit variance
    # (most appropriate when the data is already roughly normal)
    "normal_dist": sklpre.StandardScaler,
    # subtracts the column min and divides by (max - min) [sensitive to outliers]
    "min_max": sklpre.MinMaxScaler,
    # centers each column on its median and scales by the
    # interquartile range (Q3 - Q1) [better for outliers]
    "min_max_q1_q3": sklpre.RobustScaler,
    # rescales each row (sample) to unit norm, so every sample
    # lies on the n dimensional unit sphere around the origin
    "within_unit_sphere": sklpre.Normalizer,
}


def get_scaler(scaler="normal_dist"):
    """
    Purpose: To return the appropriate scaler option

    Parameters
    ----------
    scaler : str, callable or scaler instance
        - str: first looked up as a key of ``available_scalars``;
          any other string is treated as the name of an attribute of
          ``sklearn.preprocessing`` (Ex: "RobustScaler")
        - class / callable: called with no arguments and the
          result is returned
        - anything else (Ex: an already constructed scaler):
          returned unchanged

    Returns
    -------
    The scaler object

    Raises
    ------
    AttributeError
        If a string is neither a key of ``available_scalars`` nor
        an attribute of ``sklearn.preprocessing``
    """
    if not isinstance(scaler, str):
        # A class or factory gets instantiated; an object that is not
        # callable (Ex: a scaler instance) is handed back as is
        # instead of failing with a TypeError
        return scaler() if callable(scaler) else scaler

    scaler_class = available_scalars.get(scaler)
    if scaler_class is None:
        # not one of the short aliases --> use the sklearn class name
        scaler_class = getattr(sklpre, scaler)
    return scaler_class()
def scale_df(
    df,
    scaler="StandardScaler",
    scaler_trained=None,
    target_name=None,
    verbose=False):
    """
    Purpose: To apply a preprocessing scaler to all
    of the feature columns of a df

    Pseudocode:
    1) Get the appropriate scaler (or reuse scaler_trained)
    2) Scale the feature columns
    3) Re-attach the target column (if one was requested)

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the feature columns (and optionally the target)
    scaler : str, class or scaler instance
        Passed to get_scaler to build the scaler that is fit on df.
        Only used when scaler_trained is None
    scaler_trained : fitted scaler, optional
        If given, its ``transform`` is applied to the feature columns
        and no new scaler is fit
    target_name : str, optional
        Column copied over unscaled into the output. Also forwarded to
        pdml.feature_names (presumably to exclude it from the feature
        columns -- confirm in pandas_ml)
    verbose : bool
        Whether to print the scaler being used

    Returns
    -------
    pd.DataFrame with the scaled feature columns, plus the untouched
    target column when target_name is given. It is built from the
    scaler output, so the index of df is not carried over when that
    output is a plain array

    Ex:
    from machine_learning_tools import preprocessing_ml as preml
    preml.scale_df(df,
        target_name=target_name,
        scaler = "RobustScaler",
        verbose = False)
    """
    use_trained = scaler_trained is not None
    scaler = scaler_trained if use_trained else get_scaler(scaler)

    if verbose:
        print(f"scaler = {scaler}")

    df_features = pdml.feature_names(df, target_name)

    if use_trained:
        # the scaler was fit elsewhere (Ex: on training data) so only transform
        scaled_values = scaler.transform(df[df_features])
    else:
        scaled_values = scaler.fit_transform(df[df_features])

    scaler_df = pd.DataFrame(scaled_values, columns=df_features)

    if target_name is not None:
        # Drop the original index so the target lines up row by row with the
        # freshly built scaled frame; resetting only this column avoids
        # clashes between the index name and existing columns
        scaler_df[target_name] = df[target_name].reset_index(drop=True)

    return scaler_df
def non_negative_df(df):
    """
    Purpose: To shift every column of a df so that
    its smallest value becomes 0 (by subtracting the
    column minimum from each column)
    """
    column_minimums = df.min()
    shifted_df = df - column_minimums
    return shifted_df
""" How to do encoding and one hot encoding encoding_col = ["Emergent Indication","Stenosis Group","Azizi Classification"] tranformers = dict() for k in encoding_col: le = preprocessing.LabelEncoder() le.fit(df_filt[k]) df_filt[k] = le.transform(df_filt[k]) tranformers[k] = copy(le) one_hot_columns = ["Ethnicity","Etiology of SVC Occlusion","Azizi Type "] new_dfs = [pd.get_dummies(df_filt[[k]],drop_first=False) for k in one_hot_columns] """ #--- from machine_learning_tools --- from . import pandas_ml as pdml from . import preprocessing_ml as preml