Calibrating a regressor to flag HCPh T1w images for exclusion
The interactive version of this notebook is available at https://github.com/TheAxonLab/hcph-sops/tree/mkdocs/docs/analysis/qc-regressor-hcph.ipynb
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from mriqc_learn.datasets import load_data
from mriqc_learn.models import preprocess as pp
from mriqc_learn.models.production import init_pipeline_xgboost, init_pipeline_naive
Load HCPh data¶
iqms_path = Path('/home/cprovins/data/hcph-derivatives-mriqc/group_T1w.tsv')
manual_ratings_path = Path('/home/cprovins/data/hcph-derivatives-mriqc/desc-ratings_T1w.tsv')
df_iqms = pd.read_csv(iqms_path, sep='\t')
df_manual_ratings = pd.read_csv(manual_ratings_path, sep='\t')
# Merge the two dataframes based on the filename
# Cannot handle two raters for now!
df = pd.merge(df_manual_ratings, df_iqms, left_on='subject', right_on='bids_name', how='inner')
assert all(df['subject'] == df['bids_name'])
# Move the column bids_name to the first column
df = df[['bids_name'] + [col for col in df.columns if col != 'bids_name']]
df.drop(columns=['subject'], inplace=True)
df
| | bids_name | rater_id | dataset | rating | artifacts | time_sec | confidence | comments | cjv | cnr | ... | summary_wm_mean | summary_wm_median | summary_wm_n | summary_wm_p05 | summary_wm_p95 | summary_wm_stdv | tpm_overlap_csf | tpm_overlap_gm | tpm_overlap_wm | wm2max |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | sub-001_ses-excl027_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.55 | ["eye-spillover","noise-local"] | 209.948 | 2.50 | local noise close to the ventricles | 0.604348 | 1.606833 | ... | 934.354938 | 955.617892 | 1.015999e+06 | 747.667651 | 1056.077599 | 94.405582 | 0.182043 | 0.522926 | 0.545003 | 0.477761 |
1 | sub-001_ses-excl027_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.60 | [] | 76.032 | 3.50 | NaN | 0.604348 | 1.606833 | ... | 934.354938 | 955.617892 | 1.015999e+06 | 747.667651 | 1056.077599 | 94.405582 | 0.182043 | 0.522926 | 0.545003 | 0.477761 |
2 | sub-001_ses-excl029_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.45 | ["eye-spillover","inu"] | 113.066 | 3.50 | light INU | 0.596304 | 1.650305 | ... | 937.143691 | 957.493211 | 1.013283e+06 | 754.115148 | 1055.926475 | 92.421504 | 0.181235 | 0.523413 | 0.544391 | 0.491119 |
3 | sub-001_ses-excl029_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.55 | ["eye-spillover"] | 60.220 | 3.50 | NaN | 0.596304 | 1.650305 | ... | 937.143691 | 957.493211 | 1.013283e+06 | 754.115148 | 1055.926475 | 92.421504 | 0.181235 | 0.523413 | 0.544391 | 0.491119 |
4 | sub-001_ses-pilot001_acq-morphobox_T1w | celine | mriqc-23.2.0-withoutdwi | 1.20 | ["ghost-aliasing"] | 84.348 | 4.00 | low resolution | 0.589266 | 1.757282 | ... | 928.132422 | 952.478942 | 4.765519e+05 | 736.106579 | 1048.748755 | 96.711931 | 0.207131 | 0.542773 | 0.559154 | 0.524738 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
79 | sub-001_ses-045_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 2.45 | ["head-motion","eye-spillover"] | 133.657 | 3.05 | HM ringing visible in T1w lower frontal lobe | 0.604583 | 1.588400 | ... | 931.165777 | 954.003248 | 1.029516e+06 | 739.441554 | 1053.858769 | 96.443148 | 0.183835 | 0.526239 | 0.543494 | 0.484465 |
80 | sub-001_ses-046_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.05 | ["head-motion","eye-spillover"] | 125.347 | 3.45 | subtle HM ringing | 0.613529 | 1.589260 | ... | 933.199076 | 954.786639 | 1.010457e+06 | 748.510799 | 1053.416242 | 93.550235 | 0.177092 | 0.517983 | 0.542767 | 0.493663 |
81 | sub-001_ses-047_acq-undistorted_run-1_T1w | celine | mriqc-23.2.0-withoutdwi | 1.65 | ["eye-spillover","uncategorized"] | 88.320 | 3.80 | low contrast | 0.650858 | 1.448968 | ... | 931.741325 | 950.959938 | 1.042026e+06 | 746.521985 | 1063.578192 | 96.576782 | 0.181927 | 0.520435 | 0.539815 | 0.295455 |
82 | sub-001_ses-047_acq-undistorted_run-2_T1w | celine | mriqc-23.2.0-withoutdwi | 1.60 | ["eye-spillover","inu","uncategorized"] | 58.408 | 3.65 | low contrast | 0.661949 | 1.289697 | ... | 930.195692 | 944.551342 | 1.042298e+06 | 749.759529 | 1067.060620 | 95.413866 | 0.188724 | 0.522034 | 0.537868 | 0.282123 |
83 | sub-001_ses-047_acq-undistorted_run-3_T1w | celine | mriqc-23.2.0-withoutdwi | 1.65 | ["eye-spillover","inu","uncategorized"] | 53.255 | 3.70 | low contrast | 0.659533 | 1.348779 | ... | 932.941535 | 947.395483 | 1.047439e+06 | 755.562688 | 1065.594837 | 93.783535 | 0.191076 | 0.525830 | 0.541656 | 0.281388 |
84 rows × 76 columns
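The merge above assumes a single rater per image, hence the "cannot handle two raters" caveat. If the ratings file ever contained several raters, one possible workaround, sketched below as an assumption rather than part of the current protocol, would be to average the ratings per image before merging:
# Hypothetical sketch: collapse multiple raters into a single rating per image
# by averaging; the aggregation rule is an assumption, not the protocol used above.
df_ratings_avg = df_manual_ratings.groupby('subject', as_index=False).agg(
    rating=('rating', 'mean'),
    n_raters=('rating', 'size'),
)
df_multi = pd.merge(df_ratings_avg, df_iqms, left_on='subject', right_on='bids_name', how='inner')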
# Load the dataframe into mriqc-learn's (features, targets) format;
# with split_strategy="none", all records are kept in the training set
(train_x, train_y), (_, _) = load_data(df, seed=2978, split_strategy="none")
train_y
| | bids_name | rater_id | dataset | rating | artifacts | time_sec | confidence | comments |
---|---|---|---|---|---|---|---|---|
0 | sub-001_ses-excl027_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.55 | ["eye-spillover","noise-local"] | 209.948 | 2.50 | local noise close to the ventricles |
1 | sub-001_ses-excl027_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.60 | [] | 76.032 | 3.50 | NaN |
2 | sub-001_ses-excl029_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.45 | ["eye-spillover","inu"] | 113.066 | 3.50 | light INU |
3 | sub-001_ses-excl029_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.55 | ["eye-spillover"] | 60.220 | 3.50 | NaN |
4 | sub-001_ses-pilot001_acq-morphobox_T1w | celine | mriqc-23.2.0-withoutdwi | 1.20 | ["ghost-aliasing"] | 84.348 | 4.00 | low resolution |
... | ... | ... | ... | ... | ... | ... | ... | ... |
79 | sub-001_ses-045_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 2.45 | ["head-motion","eye-spillover"] | 133.657 | 3.05 | HM ringing visible in T1w lower frontal lobe |
80 | sub-001_ses-046_acq-undistorted_T1w | celine | mriqc-23.2.0-withoutdwi | 3.05 | ["head-motion","eye-spillover"] | 125.347 | 3.45 | subtle HM ringing |
81 | sub-001_ses-047_acq-undistorted_run-1_T1w | celine | mriqc-23.2.0-withoutdwi | 1.65 | ["eye-spillover","uncategorized"] | 88.320 | 3.80 | low contrast |
82 | sub-001_ses-047_acq-undistorted_run-2_T1w | celine | mriqc-23.2.0-withoutdwi | 1.60 | ["eye-spillover","inu","uncategorized"] | 58.408 | 3.65 | low contrast |
83 | sub-001_ses-047_acq-undistorted_run-3_T1w | celine | mriqc-23.2.0-withoutdwi | 1.65 | ["eye-spillover","inu","uncategorized"] | 53.255 | 3.70 | low contrast |
84 rows × 8 columns
The MRIQC rating widget records several fields (rater, timing, confidence, free-text comments), but only the numerical rating is of interest as the regression target.
# Keep only the rating
train_y = np.array(train_y['rating']).reshape(-1, 1)
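To put the error metrics reported below into context, it helps to first look at the spread of the target ratings, for instance with a quick summary (a small addition to the original notebook):
# Summary statistics of the regression target
pd.DataFrame(train_y, columns=['rating']).describe()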
Let's also print out a view of the IQMs table that will serve as features:
train_x
| | cjv | cnr | efc | fber | fwhm_avg | fwhm_x | fwhm_y | fwhm_z | icvs_csf | icvs_gm | ... | summary_wm_mean | summary_wm_median | summary_wm_n | summary_wm_p05 | summary_wm_p95 | summary_wm_stdv | tpm_overlap_csf | tpm_overlap_gm | tpm_overlap_wm | wm2max |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.604348 | 1.606833 | 0.536305 | 10137.548416 | 4.688191 | 4.331934 | 5.241601 | 4.491037 | 0.275428 | 0.378461 | ... | 934.354938 | 955.617892 | 1.015999e+06 | 747.667651 | 1056.077599 | 94.405582 | 0.182043 | 0.522926 | 0.545003 | 0.477761 |
1 | 0.604348 | 1.606833 | 0.536305 | 10137.548416 | 4.688191 | 4.331934 | 5.241601 | 4.491037 | 0.275428 | 0.378461 | ... | 934.354938 | 955.617892 | 1.015999e+06 | 747.667651 | 1056.077599 | 94.405582 | 0.182043 | 0.522926 | 0.545003 | 0.477761 |
2 | 0.596304 | 1.650305 | 0.530764 | 14122.389990 | 4.688262 | 4.344506 | 5.236868 | 4.483412 | 0.275655 | 0.379307 | ... | 937.143691 | 957.493211 | 1.013283e+06 | 754.115148 | 1055.926475 | 92.421504 | 0.181235 | 0.523413 | 0.544391 | 0.491119 |
3 | 0.596304 | 1.650305 | 0.530764 | 14122.389990 | 4.688262 | 4.344506 | 5.236868 | 4.483412 | 0.275655 | 0.379307 | ... | 937.143691 | 957.493211 | 1.013283e+06 | 754.115148 | 1055.926475 | 92.421504 | 0.181235 | 0.523413 | 0.544391 | 0.491119 |
4 | 0.589266 | 1.757282 | 0.513590 | -1.000000 | 3.978445 | 3.840736 | 4.297390 | 3.797210 | 0.273247 | 0.380469 | ... | 928.132422 | 952.478942 | 4.765519e+05 | 736.106579 | 1048.748755 | 96.711931 | 0.207131 | 0.542773 | 0.559154 | 0.524738 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
79 | 0.604583 | 1.588400 | 0.522964 | 13707.122123 | 4.687097 | 4.325848 | 5.230143 | 4.505300 | 0.271614 | 0.379977 | ... | 931.165777 | 954.003248 | 1.029516e+06 | 739.441554 | 1053.858769 | 96.443148 | 0.183835 | 0.526239 | 0.543494 | 0.484465 |
80 | 0.613529 | 1.589260 | 0.521511 | 14608.441198 | 4.746154 | 4.386391 | 5.296872 | 4.555200 | 0.276620 | 0.377185 | ... | 933.199076 | 954.786639 | 1.010457e+06 | 748.510799 | 1053.416242 | 93.550235 | 0.177092 | 0.517983 | 0.542767 | 0.493663 |
81 | 0.650858 | 1.448968 | 0.526573 | 613.466801 | 5.362943 | 4.963693 | 5.851936 | 5.273200 | 0.270652 | 0.376713 | ... | 931.741325 | 950.959938 | 1.042026e+06 | 746.521985 | 1063.578192 | 96.576782 | 0.181927 | 0.520435 | 0.539815 | 0.295455 |
82 | 0.661949 | 1.289697 | 0.537092 | 482.152209 | 5.432877 | 5.020905 | 5.953775 | 5.323950 | 0.269681 | 0.376620 | ... | 930.195692 | 944.551342 | 1.042298e+06 | 749.759529 | 1067.060620 | 95.413866 | 0.188724 | 0.522034 | 0.537868 | 0.282123 |
83 | 0.659533 | 1.348779 | 0.545570 | 364.087655 | 5.360663 | 5.005466 | 5.866987 | 5.209537 | 0.268028 | 0.376936 | ... | 932.941535 | 947.395483 | 1.047439e+06 | 755.562688 | 1065.594837 | 93.783535 | 0.191076 | 0.525830 | 0.541656 | 0.281388 |
84 rows × 68 columns
Cross-validation of the default regressor¶
Let's cross-validate the performance of our regressor using a Repeated K-fold strategy.
# Define a splitting strategy
outer_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=2978)
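With 10 folds and 3 repeats, this yields 30 train/test splits; as a quick sanity check:
# 10 folds x 3 repeats = 30 splits
outer_cv.get_n_splits(train_x)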
We can now feed the model into the cross-validation loop:
cv_scores = cross_val_score(
init_pipeline_xgboost(),
X=train_x,
y=train_y,
cv=outer_cv,
scoring="neg_mean_absolute_error",
n_jobs=-1,
)
We quantify the regressor's performance with the mean absolute error (MAE). Since cross_val_score returns negated errors, we take the absolute value before summarizing.
cv_scores = np.absolute(cv_scores)
print('Mean MAE: %.3f (%.3f)' % (cv_scores.mean(), cv_scores.std()) )
Mean MAE: 0.410 (0.105)
A good mean absolute error (MAE) is relative to the dataset at hand. As such, we establish a baseline MAE for our dataset using naive predictive models that always predict the mean or the median of the target ratings.
cv_scores = cross_val_score(
init_pipeline_naive(strategy='mean'),
X=train_x,
y=train_y,
cv=outer_cv,
scoring="neg_mean_absolute_error",
n_jobs=-1,
)
cv_scores = np.absolute(cv_scores)
print('Mean baseline MAE: %.3f (%.3f)' % (cv_scores.mean(), cv_scores.std()) )
Mean baseline MAE: 0.741 (0.164)
cv_scores = cross_val_score(
init_pipeline_naive(strategy='median'),
X=train_x,
y=train_y,
cv=outer_cv,
scoring="neg_mean_absolute_error",
n_jobs=-1,
)
cv_scores = np.absolute(cv_scores)
print('Mean baseline MAE: %.3f (%.3f)' % (cv_scores.mean(), cv_scores.std()) )
Mean baseline MAE: 0.733 (0.165)
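Both naive baselines land around an MAE of 0.73-0.74, so the default XGBoost pipeline (MAE of 0.410) already reduces the error by roughly 45%, suggesting that the IQMs carry useful information about the manual ratings.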
Optimization of the hyperparameters¶
# Define the parameter grid for hyperparameter tuning
param_grid = {
'model__learning_rate': [0.1, 0.01, 0.001],
'model__max_depth': [3, 7],
'model__n_estimators': [50, 70, 90],
'model__eta': [0.1, 0.01],
'model__subsample': [0.7, 1.0],
}
# Initialize the XGBoost pipeline
pipeline = init_pipeline_xgboost()
# Initialize the GridSearchCV with the pipeline, parameter grid, and scoring metric
grid_search = GridSearchCV(
estimator=pipeline,
param_grid=param_grid,
scoring='neg_mean_absolute_error',
cv=outer_cv,
n_jobs=-1
)
grid_search.fit(train_x, y=train_y)
grid_search.best_params_
# best_params = grid_search.best_params_
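For reference, the grid above spans 3 × 2 × 3 × 2 × 2 = 72 hyperparameter combinations, which with the 30 repeated-CV splits amounts to 2,160 model fits. The count can be double-checked with scikit-learn's ParameterGrid:
from sklearn.model_selection import ParameterGrid
# 72 parameter combinations x 30 CV splits = 2160 fits
len(ParameterGrid(param_grid)) * outer_cv.get_n_splits()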
Cross-validation of the optimized regressor¶
# Define a splitting strategy
outer_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=2978)
We can now feed the optimized model into the cross-validation loop:
# Values are hard-coded because the grid search was run on a cluster rather than locally in this notebook
best_params = {'eta': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 90, 'subsample': 0.7}
cv_scores = cross_val_score(
estimator=init_pipeline_xgboost(**best_params),
X=train_x,
y=train_y,
cv=outer_cv,
scoring="neg_mean_absolute_error",
n_jobs=-1,
)
cv_scores = np.absolute(cv_scores)
print('Mean MAE: %.3f (%.3f)' % (cv_scores.mean(), cv_scores.std()) )
Mean MAE: 0.381 (0.098)
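With the tuned hyperparameters in hand, a natural next step is to refit the pipeline on all available data, predict quality ratings, and flag low-rated images for exclusion. The sketch below is illustrative only: the exclusion threshold of 2.0 and the output filename are assumptions, not part of the calibrated protocol.
import joblib
# Refit the tuned pipeline on the full dataset (illustrative sketch)
final_model = init_pipeline_xgboost(**best_params)
final_model.fit(train_x, train_y)
# Predicted ratings; images below an assumed cut-off of 2.0 are flagged
predicted = final_model.predict(train_x)
flagged = df.loc[predicted < 2.0, 'bids_name']
print(f'{len(flagged)} images flagged for exclusion')
# Persist the calibrated regressor for later use (hypothetical filename)
joblib.dump(final_model, 'hcph_t1w_qc_regressor.joblib')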