Data Science Asked on March 9, 2021
I have a machine learning model that predicts the outcome of each college basketball game.
The X values are:
Feature                 Range
Money Line              -100000 to +9000
Money Line %            0.01 to 0.99
Money Line $            0.01 to 0.99
Money Line Move         -75000 to +66622
Money Line Direction    0 or 1
Spread                  -45.5 to +45.5
The Y (target) variable is either 0 or 1.
My first question: should I normalize the X values? The classifier's target is 0 or 1, while the independent variables span much larger (or smaller) ranges.
My second question: when you normalize your X variables, should you normalize them during feature selection, during param tuning, and then again when fitting the model?
I’m a little unclear on the process of normalizing your data.
I’m using a LightGBM classifier, backward-elimination feature selection based on OLS p-values, and RandomizedSearchCV for param tuning.
Here is my feature selection code:
import pandas as pd
import statsmodels.api as sm

training_data = pd.read_csv(
    "/Users/aus10/Machine_Learning_Betting/Data/Betting_Data/CBB/Training_Data_Betting_CBB.csv",
    index_col=False)
df_model = training_data.copy()
df_model = df_model.dropna()

target = 'Cover'
X = df_model.iloc[:, 1:18]  # independent columns
y = df_model[target]        # target column

# Backward elimination: repeatedly refit OLS and drop the feature with the
# largest p-value until every remaining p-value is at or below 0.05.
cols = list(X.columns)
while len(cols) > 0:
    X_1 = sm.add_constant(X[cols])
    model = sm.OLS(y, X_1).fit()
    p = pd.Series(model.pvalues.values[1:], index=cols)  # skip the constant
    if p.max() > 0.05:
        cols.remove(p.idxmax())  # drop the least significant feature
    else:
        break

selected_features_BE = cols
print(selected_features_BE)
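Since the target Cover is binary, the OLS p-values above are only a rough screening tool. Here is a minimal sketch of the same backward elimination with sm.Logit, which models a 0/1 outcome directly (it assumes the X, y, and 0.05 cutoff from above):

cols = list(X.columns)
while len(cols) > 0:
    X_1 = sm.add_constant(X[cols])
    # disp=0 silences the per-fit optimizer output
    logit_model = sm.Logit(y, X_1).fit(disp=0)
    p = pd.Series(logit_model.pvalues.values[1:], index=cols)
    if p.max() > 0.05:
        cols.remove(p.idxmax())
    else:
        break
print(cols)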
Here is the code to tune the params:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

training_data = pd.read_csv(
    '/Users/aus10/Machine_Learning_Betting/Data/Betting_Data/CBB/Training_Data_Betting_CBB.csv')
df_model = training_data.copy()
df_model = df_model.dropna()

X = df_model.loc[:, ['Spread_Move']]  # independent columns
y = df_model['Cover']                 # target column

# Split off one stratified fold to use as the early-stopping validation set.
skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

fit_params = {
    'early_stopping_rounds': 30,
    'eval_metric': 'auc',
    'eval_set': [(X_test, y_test)],
    'eval_names': ['valid'],
    'verbose': 100,
    'categorical_feature': 'auto'
}

param_test = {
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    'n_estimators': [100, 300, 500, 800, 1200],
    'max_depth': [5, 8, 15, 25, 30]
}

n_HP_points_to_test = 100
clf = lgb.LGBMClassifier(random_state=314, silent=True,
                         metric='None', n_jobs=-1)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test,
    n_iter=n_HP_points_to_test, scoring='roc_auc',
    cv=3, refit=True, random_state=314, verbose=True)
gs.fit(X_train, y_train, **fit_params)

print('\nCBB Spread lgb Best score reached {} with params {}'.format(
    gs.best_score_, gs.best_params_))
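Once the search finishes, you can pass the winning configuration straight into the final model instead of hard-coding the numbers by hand. A minimal sketch, assuming gs is the fitted RandomizedSearchCV from above:

best_model = lgb.LGBMClassifier(**gs.best_params_, random_state=314)
best_model.fit(X_train, y_train)

Because refit=True, gs.best_estimator_ is an equivalent already-fitted copy.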
and finally, here is the model:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import json

training_data = pd.read_csv(
    '/Users/aus10/Machine_Learning_Betting/Data/Betting_Data/CBB/Training_Data_Betting_CBB.csv')
test_data = pd.read_csv(
    '/Users/aus10/Machine_Learning_Betting/Data/Betting_Data/CBB/Test_Data_Betting_CBB.csv')
df_model = training_data.copy().dropna()
df_test = test_data.copy().dropna()
# Drop rows where 'None' was stored as a string, then renumber the index
# so the positional .loc lookups below stay valid.
df_test = df_test.replace(to_replace='None', value=np.nan).dropna()
df_test = df_test.reset_index(drop=True)

X = df_model.loc[:, ['Spread_Move']]  # independent columns
y = df_model['Cover']                 # target column
results = []

model = lgb.LGBMClassifier(colsample_bytree=0.6433117836032942, max_depth=30,
                           min_child_samples=114, min_child_weight=100.0,
                           n_estimators=1200, num_leaves=7, reg_alpha=1,
                           reg_lambda=20, subsample=0.27963305897119684)

# Cross-validated evaluation: fit on the training fold only, then score on
# the held-out fold (fitting on all of X first would leak test data).
skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print()
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('\nCBB Spread LightGBM {}'.format(
        round(accuracy_score(y_test, y_pred), 2)))

# Refit on the full training set before predicting on the test file.
model.fit(X, y)

for index in range(len(df_test)):
    team = df_test.loc[index].at['Team']
    spread_move = df_test.loc[index].at['Spread_Move']
    Xnew = [[spread_move]]
    # predicted probability that the team covers the spread
    ynew = model.predict_proba(Xnew)
    results.append({
        'Team': team,
        'Cover': float(ynew[0][1])  # cast so json can serialize it
    })

with open('/Users/aus10/Machine_Learning_Betting/Data/ML_Results/CBB/Cover_Probability_LightGBM.json', 'w') as my_file:
    json.dump(results, my_file)
If you're using OLS for feature selection and a tree-based algorithm for your model, you don't need to worry about standardizing or scaling your data.
You want to scale your data when the algorithm you use for feature selection or modeling has a distance-based component, because then all X variables need to be on a similar scale.
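To your second question: when scaling is needed, do it inside cross-validation so the scaler is fit only on the training folds and nothing from the held-out fold leaks in. A minimal sketch with a scikit-learn Pipeline (the KNeighborsClassifier is just a stand-in for any distance-based model; X and y are assumed to be defined as in the question):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# The Pipeline refits StandardScaler on each training fold inside
# cross_val_score, so the held-out fold never affects the scaling.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', KNeighborsClassifier())
])
scores = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')
print(scores.mean())

The same pipeline object can be passed to RandomizedSearchCV, so feature selection, param tuning, and the final fit all apply one consistent transformation.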
Correct answer by jeffhale on March 9, 2021