1 ML approach to classify species based on prey weights

1.1 Imports

import pandas as pd
import numpy as np
import seaborn as sns
import shap
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt

%load_ext watermark
%watermark -n -u -v -iv -w -p sklearn
Last updated: Fri Jun 02 2023
Python implementation: CPython
Python version : 3.8.6
IPython version : 7.19.0
sklearn: 0.0
matplotlib: 3.3.3
numpy : 1.19.4
pandas : 1.1.5
shap : 0.39.0
seaborn : 0.11.0
Watermark: 2.1.0
1.1.1 Miscellaneous configurations

%config InlineBackend.figure_format = 'retina'
plt.rcParams['font.family'] = 'Serif'
plt.rcParams['font.size'] = 10

output_dir = 'output/RandomForest/'
1.2 Data
Reading the data and translating names from Spanish to English:
stomach = pd.read_csv("data/stomach_w.csv")

# Translate column names
group_names = ['Ind', 'Species', 'Season', 'Sex', 'Maturity']
feature_names = ['Stomatopods', 'Polychaetes', 'Bivalves', 'Amphipods',
                 'Crabs', 'Echinoderms', 'Shrimps', 'Crustaceans', 'Sipunculids', 'Fishes']
stomach.columns = group_names + feature_names

# Translate categories
stomach.Season = stomach.Season.map({'Fria': 'Cold', 'Calida': 'Warm'})
stomach.Sex = stomach.Sex.map({'Hembra': 'Female', 'Macho': 'Male'})
stomach.Maturity = stomach.Maturity.map({'Adulto': 'Adult', 'Juvenil': 'Juvenile'})
# Data preview
stomach.head()
 | Ind | Species | Season | Sex | Maturity | Stomatopods | Polychaetes | Bivalves | Amphipods | Crabs | Echinoderms | Shrimps | Crustaceans | Sipunculids | Fishes
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | H.dipterurus | Cold | Male | Adult | 8.50 | 3.22 | 5.23 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
1 | 2 | H.dipterurus | Cold | Female | Juvenile | 0.30 | 2.02 | 0.01 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
2 | 3 | H.dipterurus | Warm | Female | Juvenile | 0.50 | 0.40 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
3 | 4 | H.dipterurus | Warm | Male | Adult | 0.01 | 0.00 | 2.86 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
4 | 5 | H.dipterurus | Cold | Female | Juvenile | 4.54 | 2.60 | 0.11 | 0.03 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
Assigning the prey weights to one object (X) and the species labels to another (y):
# Just the grouping columns
stomach_group = stomach.iloc[:, 1:5]

# Weights of the different prey items
X = stomach.iloc[:, 5:]

# Target labels
y = pd.Categorical(stomach_group['Species']).codes
1.3 Model optimization
A Random Forest (RF) classifier was used to model the separation between both species and their categories, given by the weights of each prey group. This approach was preferred over traditional techniques (e.g., PERMANOVA, ANOSIM and/or SIMPER) because it is one of the more efficient machine learning methods and has the following desirable characteristics: a) it is robust to biased distributions, b) it does not require any transformation of the data, c) it is not based on distances or dissimilarities per se, and d) it is resistant to overfitting (Carvajal et al. 2018). The RF approach consists of building t independent decision trees and making an ensemble of them; i.e., the probability that an observation belongs to each class is based on the frequency of "votes" cast by the whole "forest". The classification is done via a recursive partitioning of the dataset, combining random subsets of both observations and variables (features) of the original data (i.e., bootstrapping and bagging) to "learn" the boundaries between the target classes for each feature.
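As a self-contained illustration of the ensemble "vote" (a minimal sketch on toy data, not part of the original analysis), the forest's class probabilities are simply the per-tree probabilities averaged over all trees:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_toy, y_toy = make_classification(random_state = 0)
forest = RandomForestClassifier(n_estimators = 10, random_state = 0).fit(X_toy, y_toy)
# Stack each tree's class-probability predictions and average them;
# this matches what RandomForestClassifier.predict_proba does internally
votes = np.stack([tree.predict_proba(X_toy) for tree in forest.estimators_])
print(np.allclose(votes.mean(axis = 0), forest.predict_proba(X_toy)))  # True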
Every ML model should be optimized; i.e., its hyper-parameters should be tuned to maximize its performance. Furthermore, that performance should always be evaluated with data unseen by the model to check for overfitting, regardless of the inherent resistance of the approach. To achieve both things, the original data set was divided into a train set and a test set. The train set was used to train the model and to tune three hyper-parameters via a grid search with 5-fold cross-validation: i) the number of trees in the ensemble; ii) the maximum number of features considered at each split; and iii) the maximum "depth" of the trees (i.e., the maximum number of partitions applied to the data). The performance metric used for optimization and evaluation was the Area Under the Curve of the Receiver Operating Characteristic (AUC-ROC), which yields a more reliable evaluation than other metrics, such as accuracy, since the ROC compares the True Positive Rate and the False Positive Rate at various thresholds (Meyer-Baese & Schmid 2014).
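To make the metric concrete, here is a toy example (dummy labels and scores, not from this dataset) of how the ROC curve traces the TPR against the FPR across thresholds and how its area is computed:

from sklearn.metrics import roc_curve, auc

y_true  = [0, 0, 1, 1]           # toy ground-truth labels
y_score = [0.1, 0.4, 0.35, 0.8]  # toy predicted probabilities
fpr, tpr, thresholds = roc_curve(y_true, y_score)
print(auc(fpr, tpr))             # 0.75 for this toy example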
# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
The error of a forest depends on the strength of the individual trees (i.e., how "good" each tree is) and the correlation between any pair of them (Oshiro et al. 2012). Increasing the correlation increases the forest error rate (i.e., decreases its performance). Our dataset is relatively small, in the sense that it has only 10 features (prey items) and a maximum of 392 instances (total stomachs analyzed from both species) before the train-test split. If we choose a high number of trees, the feature and instance subsets may be exhausted and continuously repeated during bagging, increasing the correlation between individual trees and, consequently, "hindering" the performance of the forest. In more precise terms, increasing the number of trees does not necessarily increase the forest's performance, and it has been proposed that there is a threshold (dataset dependent) beyond which there is no significant gain (Oshiro et al. 2012).
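One hedged way to probe that threshold (a sketch, not part of the original optimization) is to track the out-of-bag error as the number of trees grows, using the training split defined above:

# Sketch: OOB error as a function of n_estimators (may warn for small forests,
# where some training instances never fall out-of-bag)
for n in [10, 50, 100, 200, 500]:
    rf_n = RandomForestClassifier(n_estimators = n, oob_score = True,
                                  random_state = 0).fit(X_train, y_train)
    print(n, round(1 - rf_n.oob_score_, 3))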
# Flag to save outputs
classification = 'Global'

# Setting up the optimization via grid-search and cross-validation
rf_clf = RandomForestClassifier(random_state = 0)
grid_values = {'n_estimators': [10, 50, 100, 200, 500],
               'max_features': list(np.arange(1, 10)),
               'max_depth': list(np.arange(1, 50))}
grid_rf = GridSearchCV(rf_clf,
                       param_grid = grid_values,
                       scoring = 'roc_auc',
                       n_jobs = 8)
%%time
# Optimization of the model
grid_rf.fit(X_train, y_train)
CPU times: user 19.3 s, sys: 2.12 s, total: 21.4 s
Wall time: 6min 25s
GridSearchCV(estimator=RandomForestClassifier(random_state=0), n_jobs=8,
param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, ...],
'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'n_estimators': [10, 50, 100, 200, 500]},
scoring='roc_auc')
#with open(output_dir + f'models/{classification}', 'wb') as model_file:
# pickle.dump(grid_rf, model_file)
#with open(output_dir + f'models/{classification}', 'rb') as model_file:
# grid_rf = pickle.load(model_file)
The final forest consisted of 200 trees, with a maximum depth of 2 and a maximum of 3 features per split:
# Probability of belonging to class 1; i.e., N. entemedor
ypred = grid_rf.predict_proba(X_test)[:, 1]

# Hyper-parameters of the tuned model
best_params = grid_rf.best_params_
best_params
{'max_depth': 2, 'max_features': 3, 'n_estimators': 200}
The AUC of the test set indicates that the model did not overfit:
print(f'Training AUC: {round(grid_rf.best_score_, 2)}')
print(f'Test AUC: {round(roc_auc_score(y_test, ypred), 2)}' )
Training AUC: 0.98
Test AUC: 0.99
1.3.1 Model training and prey importances
# Fitting of the optimized model
rf = RandomForestClassifier(max_depth = best_params['max_depth'],
                            max_features = best_params['max_features'],
                            n_estimators = best_params['n_estimators'],
                            n_jobs = 6,
                            random_state = 0).fit(X_train, y_train)
1.3.2 SHAP explanations
Feature importances for a RF classifier are traditionally based on the Gini index. Simply put, they represent how much the error rate of the model would increase, in terms of the purity of the final nodes of the tree, if each feature were removed from the data. On the other hand, SHAP explanations are a more robust and informative approach. These explanations are based on the Shapley values used in coalitional game theory; that is, they connect optimal prediction allocation with local explanations based on Shapley values. Shapley values are a method for fairly distributing both gains and costs among the actors working in a coalition and, since each actor contributes differently, Shapley values make sure that each actor gets a fair share depending on their contribution (Lundberg & Lee 2017, Lundberg et al. 2020). In other words, the SHAP approach assigns a higher contribution to the variables that "worked more" for achieving the correct predictions. The main advantages of this approach are that it is model-agnostic (independent of the model) and that it allows both global and local interpretations.
# Function to create a beeswarm and waterfall plot
# Waterfall plot: barplot showing the relative contributions
# Beeswarm plot: shows the SHAP values for each observation,
# colored by the value of the feature
def make_shap_beefall_plot(shap_values, features, test, fnames):
    # Indices of columns
    column_list = features.columns
    # Relative contribution
    feature_ratio = (np.abs(shap_values).sum(0) / np.abs(shap_values).sum()) * 100
    # Ordering of contributions and indices
    column_list = column_list[np.argsort(feature_ratio)[::-1]]
    fratio = np.sort(feature_ratio)[::-1]

    # Beeswarm plot using SHAP's function
    shap.summary_plot(shap_values, test,
                      feature_names = fnames,
                      show = False,
                      color_bar_label = 'Weight')
    # Get current axes
    ax = plt.gca()
    # Generate a second X axis
    ax2 = ax.twiny()
    # Bar (waterfall) plot
    ax2.barh(column_list[::-1], fratio[::-1], alpha = 0.3, color = 'gray')
    # Set ticks
    ax2.set_xticks(np.linspace(0, round(fratio.max()), 3))
    # Set axis label
    ax2.set_xlabel('Relative contribution (%)')
    # Remove spines
    sns.despine(left = True, bottom = False, top = False, right = True)
# Generate a new "explainer" of the model
explainer = shap.TreeExplainer(rf)
# Get the SHAP values for the test data
shap_values = explainer.shap_values(X_test)
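As a quick sanity check (not in the original notebook), SHAP's local-accuracy property implies that the expected (base) value plus the sum of an observation's SHAP values recovers the model's predicted probability for that class:

# Hedged check, assuming `explainer`, `shap_values`, `rf` and `X_test` from above
i = 0  # first test observation, as an example
print(np.isclose(explainer.expected_value[1] + shap_values[1][i].sum(),
                 rf.predict_proba(X_test)[i, 1]))  # expected: True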
1.3.2.1 Global explanation
The following beeswarm plot not only shows the feature importance, but also displays the positive and negative relationships of the predictors with the target class (Narcine entemedor). The more positive the SHAP value, the larger the feature's directly proportional correlation, meaning that high weights of sipunculids correspond to N. entemedor, while low weights are more associated with H. dipterurus. Low weights of bivalves, on the other hand, are positively correlated with N. entemedor and negatively with H. dipterurus. The plot below shows that the differences between both species are mainly due to sipunculids, closely followed by bivalves, which together explain ~70% of the differences.
make_shap_beefall_plot(shap_values[1], # values for class 1; i.e., N. entemedor
                       features = X_train, # train set for contributions
                       test = X_test, # test features for beeswarm plot
                       fnames = feature_names)
#plt.gcf().set_size_inches(4*1.61, 4)
#plt.savefig(output_dir + f'figures/{classification}/beeswarm_plot.pdf',
#            bbox_inches = 'tight')
An easy way to corroborate those results is to show the total weights of sipunculids and bivalves for each species:
stomach.groupby('Species').agg({'Sipunculids': 'sum',
                                'Bivalves': 'sum'})

Species | Sipunculids | Bivalves
---|---|---
H.dipterurus | 200.16 | 340.873
N.entemedor | 1437.10 | 9.720
1.3.2.2 Local explanations
As for the local interpretations, we can view force plots to explain single observations in the dataset. The 64th individual of the test set had a 1% probability of being N. entemedor (f(x) = 0.01, meaning that it was classified as H. dipterurus), given by its "high" weight of bivalves (more correlated with H. dipterurus than with N. entemedor):
rename = lambda x: 'H. dipterurus' if x == 0 else 'N. entemedor'

#shap.initjs()
#random_row = np.random.randint(0, 98) # 64
random_row = 64
f = shap.force_plot(explainer.expected_value[1], shap_values[1][random_row],
                    X_test.iloc[random_row],
                    feature_names = feature_names,
                    matplotlib = True,
                    show = False)
plt.title(f'Test ind. # {random_row}; observed: {rename(y_test[random_row])}');
#shap.initjs()
#random_row = np.random.randint(0, 98) # 51
random_row = 51
f = shap.force_plot(explainer.expected_value[1], shap_values[1][random_row],
                    X_test.iloc[random_row],
                    feature_names = feature_names,
                    matplotlib = True,
                    show = False)
plt.title(f'Test ind. # {random_row}; observed class: {rename(y_test[random_row])}');
#shap.initjs()
#random_row = np.random.randint(0, 98) # 79
random_row = 79
f = shap.force_plot(explainer.expected_value[1], shap_values[1][random_row],
                    X_test.iloc[random_row],
                    feature_names = feature_names,
                    matplotlib = True,
                    show = False)
plt.title(f'Test ind. # {random_row}; observed class: {rename(y_test[random_row])}');
1.4 Intra-specific differences
Unlike the previous RF, in which we tried to discriminate between the two species, here we will try to discriminate between the two levels of each covariate within each species.
There are some dramatic cases of class imbalance. In such instances the results are strongly limited by the small sample sizes of the underrepresented groups. Balancing the dataset with subsampling and SMOTE helps only with the training and evaluation of the RFs, so the results should be taken with caution.
1.4.1 H. dipterurus
stomach_hd = stomach[stomach['Species'] == 'H.dipterurus'].iloc[:, 2:]
species = 'Hd'
The first step is to check for class imbalances. Only the seasons were balanced (~50% of observations in each class); hence, the other two covariates were pre-processed via undersampling of the majority class and oversampling of the minority class using the Synthetic Minority Oversampling TEchnique (SMOTE). In short, this technique selects examples that are close in the feature space, draws a line between them, and generates a new sample at a point along that line, allowing one to generate as many synthetic observations as required; however, random undersampling of the majority class is also recommended (Chawla et al. 2002).
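As a minimal, self-contained sketch of what SMOTE does to the class counts (toy data and hypothetical names, not this study's dataset):

from collections import Counter
from sklearn.datasets import make_classification

# Toy imbalanced dataset (~80/20 split)
X_toy, y_toy = make_classification(n_samples = 150, weights = [0.8, 0.2],
                                   random_state = 0)
print(Counter(y_toy))   # imbalanced, roughly 120 vs 30
# Oversample the minority class up to the majority size
X_res, y_res = SMOTE(random_state = 0).fit_resample(X_toy, y_toy)
print(Counter(y_res))   # balanced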
season_n = stomach_hd.groupby('Season').agg({'Season': 'count'}).rename(columns = {'Season': 'n'})
season_n
Season | n
---|---
Cold | 109
Warm | 96
sex_n = stomach_hd.groupby('Sex').agg({'Sex': 'count'}).rename(columns = {'Sex': 'n'})
sex_n
Sex | n
---|---
Female | 138
Male | 67
age_n = stomach_hd.groupby('Maturity').agg({'Maturity': 'count'}).rename(columns = {'Maturity': 'n'})
age_n
Maturity | n
---|---
Adult | 44
Juvenile | 161
1.4.1.1 Sexes
classification = 'Sex'
sex_n
Sex | n
---|---
Female | 138
Male | 67
The sexes were balanced to 100 individuals per class. First, a random undersampling:
# Random undersampling
under_female = stomach_hd[stomach_hd.Sex == 'Female'].sample(n = 100, random_state = 0)
under_sex = under_female.append(stomach_hd[stomach_hd.Sex == 'Male'])
under_sex.Sex = pd.Categorical(under_sex.Sex).codes
Then we apply SMOTE to finish balancing the dataset:
sex_balanced = SMOTE(random_state = 0).fit_resample(under_sex.iloc[:, 3:], under_sex.Sex)
Next, we generate the train-test split of the balanced dataset:
X_train, X_test, y_train, y_test = train_test_split(sex_balanced[0], sex_balanced[1], random_state = 0)
And optimize the Random Forest Classifier:
rfm = RandomForestClassifier(random_state = 0)
grid_rfm = GridSearchCV(rfm,
                        param_grid = grid_values,
                        scoring = 'roc_auc',
                        n_jobs = 8)
%%time
grid_rfm.fit(X_train, y_train)
CPU times: user 12.9 s, sys: 1.03 s, total: 13.9 s
Wall time: 6min 10s
GridSearchCV(estimator=RandomForestClassifier(random_state=0), n_jobs=8,
param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, ...],
'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'n_estimators': [10, 50, 100, 200, 500]},
scoring='roc_auc')
#with open(output_dir + f'models/{classification}', 'wb') as model_file:
# pickle.dump(grid_rfm, model_file)
#with open(output_dir + f'models/{classification}', 'rb') as model_file:
# grid_rfm = pickle.load(model_file)
The forest was optimized with 50 trees, a maximum depth of 28, and a maximum of one feature per split:
best_params = grid_rfm.best_params_
best_params, grid_rfm.best_score_
({'max_depth': 28, 'max_features': 1, 'n_estimators': 50}, 0.7091428571428571)
rf = RandomForestClassifier(max_depth = best_params['max_depth'],
                            max_features = best_params['max_features'],
                            n_estimators = best_params['n_estimators'],
                            n_jobs = 6,
                            random_state = 0).fit(X_train, y_train)

ypred = rf.predict_proba(X_test)[:, 1]
The AUC of the test set indicates that the model did overfit and performed only slightly better than random guessing (AUC = 0.5):
print(f'Training AUC: {round(grid_rfm.best_score_, 2)}')
print(f'Test AUC: {round(roc_auc_score(y_test, ypred), 2)}' )
Training AUC: 0.71
Test AUC: 0.6
1.4.1.1.1 SHAP explanations
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

make_shap_beefall_plot(shap_values[1],
                       features = X_train,
                       test = X_test,
                       fnames = feature_names)
plt.gcf().set_size_inches(4*1.61, 4)
plt.savefig(output_dir + f'figures/Hd/{classification}/beeswarm_plot.pdf',
            bbox_inches = 'tight')
1.4.1.2 Maturity stages
classification = 'Maturity'
age_n
Maturity | n
---|---
Adult | 44
Juvenile | 161
The maturity stages were also balanced to 100 individuals per class:
# Random undersampling
under_juv = stomach_hd[stomach_hd.Maturity == 'Juvenile'].sample(n = 100, random_state = 0)
under_age = under_juv.append(stomach_hd[stomach_hd.Maturity == 'Adult'])
under_age.Maturity = pd.Categorical(under_age.Maturity).codes

# SMOTE balancing
age_balanced = SMOTE(random_state = 0).fit_resample(under_age.iloc[:, 3:],
                                                    under_age.Maturity)

X_train, X_test, y_train, y_test = train_test_split(age_balanced[0],
                                                    age_balanced[1],
                                                    random_state = 0)
rfm = RandomForestClassifier(random_state = 0)
grid_rfm = GridSearchCV(rfm,
                        param_grid = grid_values,
                        scoring = 'roc_auc',
                        n_jobs = 6)
%%time
grid_rfm.fit(X_train, y_train)
CPU times: user 9.11 s, sys: 669 ms, total: 9.77 s
Wall time: 6min 55s
GridSearchCV(estimator=RandomForestClassifier(random_state=0), n_jobs=6,
param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, ...],
'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'n_estimators': [10, 50, 100, 200, 500]},
scoring='roc_auc')
#with open(output_dir + f'models/{classification}', 'wb') as model_file:
# pickle.dump(grid_rfm, model_file)
#with open(output_dir + f'models/{classification}', 'rb') as model_file:
# grid_rfm = pickle.load(model_file)
best_params = grid_rfm.best_params_
best_params, grid_rfm.best_score_
({'max_depth': 5, 'max_features': 4, 'n_estimators': 50}, 0.6613650793650794)
rf = RandomForestClassifier(max_depth = best_params['max_depth'],
                            max_features = best_params['max_features'],
                            n_estimators = best_params['n_estimators'],
                            n_jobs = 6,
                            random_state = 0).fit(X_train, y_train)

ypred = rf.predict_proba(X_test)[:, 1]
The AUC of the test set indicates that the model did not overfit:
print(f'Training AUC: {round(grid_rfm.best_score_, 2)}')
print(f'Test AUC: {round(roc_auc_score(y_test, ypred), 2)}' )
Training AUC: 0.66
Test AUC: 0.6
1.4.1.2.1 SHAP explanations
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

make_shap_beefall_plot(shap_values[1],
                       features = X_train,
                       test = X_test,
                       fnames = feature_names)
plt.gcf().set_size_inches(4*1.61, 4)
plt.savefig(output_dir + f'figures/Hd/{classification}/beeswarm_plot.pdf',
            bbox_inches = 'tight')
1.4.1.3 Season
classification = 'Season'
season_n
Season | n
---|---
Cold | 109
Warm | 96
X_train, X_test, y_train, y_test = train_test_split(stomach_hd.iloc[:, 3:],
                                                    stomach_hd.Season, random_state = 0)

rfm = RandomForestClassifier(random_state = 0)
grid_rfm = GridSearchCV(rfm,
                        param_grid = grid_values,
                        scoring = 'roc_auc',
                        n_jobs = 6)
%%time
grid_rfm.fit(X_train, y_train)
CPU times: user 9.15 s, sys: 673 ms, total: 9.82 s
Wall time: 6min 57s
GridSearchCV(estimator=RandomForestClassifier(random_state=0), n_jobs=6,
param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, ...],
'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'n_estimators': [10, 50, 100, 200, 500]},
scoring='roc_auc')
#with open(output_dir + f'models/{classification}', 'wb') as model_file:
# pickle.dump(grid_rfm, model_file)
#with open(output_dir + f'models/{classification}', 'rb') as model_file:
# grid_rfm = pickle.load(model_file)
best_params = grid_rfm.best_params_
best_params, grid_rfm.best_score_
({'max_depth': 2, 'max_features': 1, 'n_estimators': 100}, 0.6250892857142857)
rf = RandomForestClassifier(max_depth = best_params['max_depth'],
                            max_features = best_params['max_features'],
                            n_estimators = best_params['n_estimators'],
                            n_jobs = 6,
                            random_state = 0).fit(X_train, y_train)

ypred = rf.predict_proba(X_test)[:, 1]
print(f'Training AUC: {round(grid_rfm.best_score_, 2)}')
print(f'Test AUC: {round(roc_auc_score(y_test, ypred), 2)}' )
Training AUC: 0.63
Test AUC: 0.55
1.4.1.3.1 SHAP explanations
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

make_shap_beefall_plot(shap_values[1],
                       features = X_train,
                       test = X_test,
                       fnames = feature_names)
plt.gcf().set_size_inches(4*1.61, 4)
plt.savefig(output_dir + f'figures/Hd/{classification}/beeswarm_plot.pdf',
            bbox_inches = 'tight')
1.4.2 N. entemedor
species = 'Ne'
stomach_ne = stomach[stomach['Species'] == 'N.entemedor'].iloc[:, 2:]
The first step is to check for class imbalances. The season variable was the only balanced one (~50% of observations in each class); hence, the other two covariates were pre-processed via undersampling of the overrepresented class and oversampling of the underrepresented class.
1.4.2.1 Sex
classification = 'Sex'
sex_n = stomach_ne.groupby('Sex').agg({'Sex': 'count'}).rename(columns = {'Sex': 'n'})
sex_n
Sex | n
---|---
Female | 154
Male | 33
The sexes were balanced to 100 observations per class:
# Random undersampling
under_female = stomach_ne[stomach_ne.Sex == 'Female'].sample(n = 100, random_state = 0)
under_sex = under_female.append(stomach_ne[stomach_ne.Sex == 'Male'])
under_sex.Sex = pd.Categorical(under_sex.Sex).codes

# SMOTE balancing
sex_balanced = SMOTE(random_state = 0).fit_resample(under_sex.iloc[:, 3:], under_sex.Sex)

X_train, X_test, y_train, y_test = train_test_split(sex_balanced[0], sex_balanced[1], random_state = 0)
rfm = RandomForestClassifier(random_state = 0)
grid_rfm = GridSearchCV(rfm,
                        param_grid = grid_values,
                        scoring = 'roc_auc',
                        n_jobs = 6)
%%time
grid_rfm.fit(X_train, y_train)
CPU times: user 8.83 s, sys: 629 ms, total: 9.46 s
Wall time: 6min 41s
GridSearchCV(estimator=RandomForestClassifier(random_state=0), n_jobs=6,
param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, ...],
'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'n_estimators': [10, 50, 100, 200, 500]},
scoring='roc_auc')
#with open(output_dir + f'models/{classification}', 'wb') as model_file:
# pickle.dump(grid_rfm, model_file)
#with open(output_dir + f'models/{classification}', 'rb') as model_file:
# grid_rfm = pickle.load(model_file)
best_params = grid_rfm.best_params_
best_params, grid_rfm.best_score_
({'max_depth': 7, 'max_features': 7, 'n_estimators': 100}, 0.8695496031746032)
rf = RandomForestClassifier(max_depth = best_params['max_depth'],
                            max_features = best_params['max_features'],
                            n_estimators = best_params['n_estimators'],
                            n_jobs = 6,
                            random_state = 0).fit(X_train, y_train)

ypred = rf.predict_proba(X_test)[:, 1]
The AUC of the test set indicates that the model did not overfit:
print(f'Training AUC: {round(grid_rfm.best_score_, 2)}')
print(f'Test AUC: {round(roc_auc_score(y_test, ypred), 2)}')
Training AUC: 0.87
Test AUC: 0.87
1.4.2.1.1 SHAP explanations
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

make_shap_beefall_plot(shap_values[1],
                       features = X_train,
                       test = X_test,
                       fnames = feature_names)
#plt.gcf().set_size_inches(4*1.61, 4)
#plt.savefig(output_dir + f'figures/Ne/{classification}/beeswarm_plot.pdf',
#            bbox_inches = 'tight')
1.4.2.2 Maturity stages
classification = 'Maturity'
age_n = stomach_ne.groupby('Maturity').agg({'Maturity': 'count'}).rename(columns = {'Maturity': 'n'})
age_n
Maturity | n
---|---
Adult | 173
Juvenile | 14
The balancing was done to 50 observations per class:
# Random undersampling
under_juv = stomach_ne[stomach_ne.Maturity == 'Adult'].sample(n = 50, random_state = 0)
under_age = under_juv.append(stomach_ne[stomach_ne.Maturity == 'Juvenile'])
under_age.Maturity = pd.Categorical(under_age.Maturity).codes

# SMOTE balancing
age_balanced = SMOTE(random_state = 0).fit_resample(under_age.iloc[:, 3:], under_age.Maturity)

X_train, X_test, y_train, y_test = train_test_split(age_balanced[0], age_balanced[1], random_state = 0)
rfm = RandomForestClassifier(random_state = 0)
grid_rfm = GridSearchCV(rfm,
                        param_grid = grid_values,
                        scoring = 'roc_auc',
                        n_jobs = 6)
%%time
grid_rfm.fit(X_train, y_train)
CPU times: user 8.96 s, sys: 640 ms, total: 9.6 s
Wall time: 6min 37s
GridSearchCV(estimator=RandomForestClassifier(random_state=0), n_jobs=6,
param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, ...],
'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'n_estimators': [10, 50, 100, 200, 500]},
scoring='roc_auc')
#with open(output_dir + f'models/{classification}', 'wb') as model_file:
# pickle.dump(grid_rfm, model_file)
#with open(output_dir + f'models/{classification}', 'rb') as model_file:
# grid_rfm = pickle.load(model_file)
best_params = grid_rfm.best_params_
best_params, grid_rfm.best_score_
({'max_depth': 4, 'max_features': 8, 'n_estimators': 100}, 0.7642857142857143)
rf = RandomForestClassifier(max_depth = best_params['max_depth'],
                            max_features = best_params['max_features'],
                            n_estimators = best_params['n_estimators'],
                            n_jobs = 6,
                            random_state = 0).fit(X_train, y_train)

ypred = rf.predict_proba(X_test)[:, 1]
The AUC of the test set indicates that the model did not overfit:
print(f'Training AUC: {round(grid_rfm.best_score_, 2)}')
print(f'Test AUC: {round(roc_auc_score(y_test, ypred), 2)}' )
Training AUC: 0.76
Test AUC: 0.83
1.4.2.2.1 SHAP explanations
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

make_shap_beefall_plot(shap_values[1],
                       features = X_train,
                       test = X_test,
                       fnames = feature_names)
#plt.gcf().set_size_inches(4*1.61, 4)
#plt.savefig(output_dir + f'figures/Ne/{classification}/beeswarm_plot.pdf',
#            bbox_inches = 'tight')
1.4.2.3 Season
classification = 'Season'
season_n = stomach_ne.groupby('Season').agg({'Season': 'count'}).rename(columns = {'Season': 'n'})
season_n
Season | n
---|---
Cold | 99
Warm | 88
X_train, X_test, y_train, y_test = train_test_split(stomach_ne.iloc[:, 3:],
                                                    stomach_ne.Season, random_state = 0)

rfm = RandomForestClassifier(random_state = 0)
grid_rfm = GridSearchCV(rfm,
                        param_grid = grid_values,
                        scoring = 'roc_auc',
                        n_jobs = 6)
%%time
grid_rfm.fit(X_train, y_train)
CPU times: user 8.85 s, sys: 629 ms, total: 9.48 s
Wall time: 6min 44s
GridSearchCV(estimator=RandomForestClassifier(random_state=0), n_jobs=6,
param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, ...],
'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'n_estimators': [10, 50, 100, 200, 500]},
scoring='roc_auc')
#with open(output_dir + f'models/{classification}', 'wb') as model_file:
# pickle.dump(grid_rfm, model_file)
#with open(output_dir + f'models/{classification}', 'rb') as model_file:
# grid_rfm = pickle.load(model_file)
best_params = grid_rfm.best_params_
best_params, grid_rfm.best_score_
({'max_depth': 1, 'max_features': 9, 'n_estimators': 50}, 0.6866586538461539)
rf = RandomForestClassifier(max_depth = best_params['max_depth'],
                            max_features = best_params['max_features'],
                            n_estimators = best_params['n_estimators'],
                            n_jobs = 6,
                            random_state = 0).fit(X_train, y_train)

ypred = rf.predict_proba(X_test)[:, 1]
print(f'Training AUC: {round(grid_rfm.best_score_, 2)}')
print(f'Test AUC: {round(roc_auc_score(y_test, ypred), 2)}' )
Training AUC: 0.69
Test AUC: 0.6
1.4.2.3.1 SHAP explanations
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

make_shap_beefall_plot(shap_values[1],
                       features = X_train,
                       test = X_test,
                       fnames = feature_names)
#plt.gcf().set_size_inches(4*1.61, 4)
#plt.savefig(output_dir + f'figures/Ne/{classification}/beeswarm_plot.pdf',
#            bbox_inches = 'tight')
End of notebook