Does replaying the same movie clip reduce the variability of resting-state fMRI?¶
The interactive version of this notebook is available at https://github.com/TheAxonLab/hcph-sops/tree/mkdocs/docs/analysis.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import nibabel as nib
from pathlib import Path
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection
from confounds import get_confounds
from load_save import save_caption, load_matrices, init_save_layout
#Define filtering parameters
metric = "correlation"
atlas_dimension = 128
fd_threshold = 0.5
fdthresh_str = str(fd_threshold).replace(".", "")
# Define paths
code_path = Path("/data/code/hcph-sops/")
fc_path = Path(f"/data/derivatives/hcph-fc/")
dataset_path = Path("/data/datasets/hcph-dataset")
suppl_path = Path("/data/derivatives/hcph-suppl/")
confound_path = Path("/data/datasets/hcph-dataset/phenotype/mood_env_quest.tsv")
iqms_path = Path("/data/derivatives/hcph-mriqc/group_bold.tsv")
Load the functional connectivity matrices¶
# Set up entities_base for the matrix loading
entities_base = {
'subject': '001',
'task': 'rsmovie',
'measure': metric,
'scale': str(atlas_dimension),
'fd_threshold': fdthresh_str,
}
# Load the FC matrices and associated information using the helper function
matrices_dict = load_matrices(fc_path, entities_base, code_path)
fc_concat = matrices_dict["conn_concat"]
fc_matrix = matrices_dict["conn_matrix"]
files = matrices_dict["files"]
atlas_labels = matrices_dict["atlas_labels"]
region_labels = matrices_dict["region_labels"]
fc_size = matrices_dict["conn_size"]
ses_index = matrices_dict["ses_index"]
# Intialize the BIDSLayout to save the figures and their associated captions
save_layout = init_save_layout(code_path, suppl_path)
# Plot an example FC matrix (the last one loaded)
fc_matrix = np.zeros((fc_size, fc_size))
# reconstruct from upper triangle if needed (optional, for visualization)
fc_matrix[np.triu_indices(fc_size, k=0)] = fc_concat[-1].flatten()
fc_matrix += np.triu(fc_matrix, k=1).T
plt.figure(figsize=(8,6))
sns.heatmap(fc_matrix, cmap='viridis')
plt.title(f"Example FC matrix")
plt.xlabel('Brain Regions')
plt.ylabel('Brain Regions')
print(f"Number of brain regions (matrix size): {fc_size}")
print(f"Shape of concatenated upper triangle FC matrices (number of sessions, number of region pairs): {fc_concat.shape}")
# Test whether the shape is as expected
assert fc_concat.shape == (36, fc_size * (fc_size + 1) // 2)
[get_dataset_dir] Dataset found in /home/cprovins/nilearn_data/difumo_atlases Configuration 'hcph' already exists, skipping add_config_paths. Number of brain regions (matrix size): 119 Shape of concatenated upper triangle FC matrices (number of sessions, number of region pairs): (36, 7140)
One remark to note is the number of brain regions does not correspond to the atlas_dimension, because we removed from the DiFuMo atlas the components that are specific to the CSF, ventricles and sinuses.
# Add the information that apply too all figures generated below to the base entities
entities_base["experiment"] = "fmrivar"
entities_base["extension"] = ".svg"
entities_base["datatype"] = "figures"
entities_base["suffix"] = "bold"
# Entities specific to this plot
entities = {'visualization': 'similmat'}
entities.update(entities_base)
similarity_matrix = np.corrcoef(fc_concat.T, rowvar=False)
print(f"Shape of the similarity matrix (number of sessions x number of sessions): {similarity_matrix.shape}")
mean_corr = np.mean(similarity_matrix)
print(f"Overall mean correlation: {mean_corr}")
plt.figure(figsize=(10, 8))
sns.heatmap(similarity_matrix, cmap='viridis')
plt.xticks(ticks=np.arange(len(ses_index))+0.5, labels=ses_index, rotation=90)
plt.yticks(ticks=np.arange(len(ses_index))+0.5, labels=ses_index)
plt.xlabel('Session Index', fontsize=14)
plt.ylabel('Session Index', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Similarity matrix presenting the Pearson correlation between FC matrices for each session.**The overall mean correlation is {mean_corr:.2f}." \
"The fact that the FC matrices are highly correlated probably explains why only two components are enough to explain the vast majority of the variance." \
"It is also visible that session index 30 is less correlated with the other sessions compared to the other pair-wise comparisons."
save_caption(caption, save_layout, entities)
Shape of the similarity matrix (number of sessions x number of sessions): (36, 36) Overall mean correlation: 0.8581150373039423
Run PCA to estimate the residual variance for each rsfMRI run¶
Choose number of principal components¶
We are using the elbow plot to decide the number of principal components to keep.
from sklearn.decomposition import PCA
# Entities specific to this plot
entities = {'visualization': 'explainedvar'}
entities.update(entities_base)
# Run PCA
# Center the data by removing the mean of each connection
fc_concat = fc_concat - np.mean(fc_concat, axis=0)
# Standardize the data to have unit variance
#fc_concat = fc_concat / np.std(fc_concat, axis=0)
# Run PCA
pca = PCA()
pca.fit(fc_concat)
# Extract the explained variance
explained_variance = pca.explained_variance_ratio_
# Generate the elbow plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='-')
ax.set_xlabel('Number of Components', fontsize=14)
ax.set_ylabel('Explained Variance Ratio', fontsize=14)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
#Define the number of components to keep
n_comp = 6
# Highlight the points corresponding to number of components we keep in red
ax.plot(n_comp, explained_variance[n_comp-1], marker='o', markersize=8, color='red')
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = "**Elbow plot to choose the number of principal components.**The point in red indicates the number of components selected."
save_caption(caption, save_layout, entities)
Plot the principal components¶
# Function to reconstruct the full matrix from the upper triangle
def reconstruct_full_matrix(upper_triangle, size):
full_matrix = np.zeros((size, size))
upper_indices = np.triu_indices(size)
full_matrix[upper_indices] = upper_triangle
full_matrix = full_matrix + full_matrix.T - np.diag(np.diag(full_matrix))
return full_matrix
# Reorder regions by network
network_order = atlas_labels['yeo_networks7'].argsort()
reordered_regions_labels = atlas_labels.iloc[network_order]['difumo_names']
for i, pc in enumerate(pca.components_[:n_comp, :]):
# Reconstruct the full matrix from the upper triangle
full_matrix = reconstruct_full_matrix(pc, fc_size)
# Cluster regions by network
full_matrix = full_matrix[network_order, :][:, network_order]
# Plot the principal component as a heatmap
plt.figure(figsize=(20, 18))
sns.heatmap(full_matrix, cmap='viridis')
plt.title(f'Principal Component {i+1}')
plt.xlabel('Brain Regions')
plt.ylabel('Brain Regions')
# Adjust y-ticks to move them further away from the heatmap
plt.yticks(ticks=np.arange(len(reordered_regions_labels)) + 0.5, labels=reordered_regions_labels, va='center', rotation=0)
plt.gca().tick_params(axis='y', pad=15) # Move y-ticks further away
plt.gca().tick_params(axis='x', pad=15) # Move x-ticks further away
# Add a color bar on the left and bottom ledge indicating the network
network_colors = sns.color_palette("tab20", n_colors=7) # Define 7 colors for the networks
network_segments = atlas_labels.iloc[network_order]['yeo_networks7'].astype('category').cat.codes
# Add color bar on the left with increased thickness
for idx, color in enumerate(network_segments):
plt.gca().add_patch(plt.Rectangle((-1.5, idx), 1.5, 1, color=network_colors[color], transform=plt.gca().transData, clip_on=False))
# Add color bar on the bottom with increased thickness
for idx, color in enumerate(network_segments):
plt.gca().add_patch(plt.Rectangle((idx, full_matrix.shape[0]), 1, 1.5, color=network_colors[color], transform=plt.gca().transData, clip_on=False))
# Add legend for the networks
network_labels = atlas_labels['yeo_networks7'].astype('category').cat.categories
legend_patches = [plt.Line2D([0], [0], color=network_colors[i], lw=4, label=label) for i, label in enumerate(network_labels)]
plt.legend(handles=legend_patches, loc='upper right', bbox_to_anchor=(1.37, 1), title="Networks", fontsize=12, title_fontsize=14)
# Plot all principal components as heatmaps in a single figure with subplots
n_rows = int(np.ceil(n_comp / 3))
n_cols = 3
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4))
for i, pc in enumerate(pca.components_[:n_comp, :]):
full_matrix = reconstruct_full_matrix(pc, fc_size)
# Cluster regions by network
full_matrix = full_matrix[network_order, :][:, network_order]
ax = axes[i // n_cols, i % n_cols]
sns.heatmap(full_matrix, cmap='viridis', ax=ax, cbar_kws={'shrink': 0.5})
ax.set_title(f'Principal Component {i+1}')
ax.set_xlabel('Brain Regions', labelpad=20)
ax.set_ylabel('Brain Regions', labelpad=20)
ax.set_xticks([]) # Hide x-ticks
ax.set_yticks([]) # Hide y-ticks
# Add color bar on the left with increased thickness
for idx, color in enumerate(network_segments):
ax.add_patch(plt.Rectangle((-3, idx), 3, 1, color=network_colors[color], transform=ax.transData, clip_on=False))
# Add color bar on the bottom with increased thickness
for idx, color in enumerate(network_segments):
ax.add_patch(plt.Rectangle((idx, full_matrix.shape[0]), 1, 3, color=network_colors[color], transform=ax.transData, clip_on=False))
# Add a single legend for the networks on the right of the figure
fig.legend(
handles=legend_patches,
loc='center right',
bbox_to_anchor=(1.1, 0.5), # Position the legend outside the figure, vertically centered
title="Networks",
fontsize=10,
title_fontsize=12
)
# Hide any unused subplots
for j in range(i + 1, n_rows * n_cols):
axes[j // n_cols, j % n_cols].axis('off')
plt.tight_layout()
plt.show()
# Functions to compute mean correlation for both region(s) and network(s) with other regions/networks in a principal component
def mean_region_or_network_correlation(
query, pc_index, atlas_labels, pca, fc_size, reconstruct_full_matrix_func,
query_type="region", with_query=None
):
"""
Compute the mean correlation of region(s) or network(s) with other regions or networks
in a given principal component (PC).
Args:
query (str): String to search for in atlas_labels to identify region(s) or network(s) (case-insensitive).
pc_index (int): Which principal component to use (0-based index).
atlas_labels (pd.DataFrame): DataFrame with atlas meta-data, must have 'difumo_names' and 'yeo_networks7', etc.
pca (sklearn.decomposition.PCA): Fitted PCA object.
fc_size (int): Dimensionality of the FC matrix.
reconstruct_full_matrix_func (function): Function to reshape flattened PC vector into full symmetric matrix.
query_type (str): Either "region" (default) or "network".
with_query (str or None): If provided, compute mean correlation with regions/networks matching this string, instead of with all others.
Returns:
float or None: The mean correlation. None if no query match.
"""
# Select which column will be used for query
if query_type == "region":
query_col = 'difumo_names'
elif query_type == "network":
# You may want to use another label set for networks (e.g., 'yeo_networks7')
# Accept both numeric and string matches
query_col = 'yeo_networks7'
else:
raise ValueError(f"query_type must be 'region' or 'network', got {query_type}")
# Prepare list of indices for main query
mask_main = atlas_labels[query_col].astype(str).str.contains(query, case=False, na=False)
indices_main = np.where(mask_main)[0]
if len(indices_main) == 0:
print(f"No {query_type}(s) matching '{query}' found in atlas_labels!")
return None
# Prepare list of indices for other set (if with_query is given), else use all except main
if with_query is not None:
mask_with = atlas_labels[query_col].astype(str).str.contains(with_query, case=False, na=False)
indices_with = np.where(mask_with)[0]
if len(indices_with) == 0:
print(f"No {query_type}(s) matching 'with_query'='{with_query}' found in atlas_labels!")
return None
# Remove self-overlap
indices_with = np.setdiff1d(indices_with, indices_main)
else:
# Calculate correlation with all *other* regions/networks
indices_with = np.setdiff1d(np.arange(atlas_labels.shape[0]), indices_main)
# Get specified PC weights and reconstruct full matrix
pc_vec = pca.components_[pc_index, :]
pc_matrix = reconstruct_full_matrix_func(pc_vec, fc_size)
mean_corrs = []
for i in indices_main:
# Get correlations to all indices_with
corrs = pc_matrix[i, indices_with]
# If main and with_query overlap is possible, remove self (shouldn't be for networks)
corrs = corrs.flatten() # In case it's size 1
mean_corrs.append(np.nanmean(corrs))
mean_correlation = np.mean(mean_corrs)
msg_type = f"{query_type}(s)"
other_desc = f"with {query_type}(s) matching '{with_query}'" if with_query is not None else "with all other regions/networks"
print(f"Mean correlation of {msg_type} matching '{query}' {other_desc} in PC{pc_index+1}: {mean_correlation:.4f}")
return mean_correlation
# One region with all other regions
mean_region_or_network_correlation(
query='retrosplenial',
pc_index=0,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="region"
)
mean_region_or_network_correlation(
query='retrosplenial',
pc_index=1,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="region"
)
mean_region_or_network_correlation(
query='internal capsule',
pc_index=1,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="region"
)
mean_region_or_network_correlation(
query='cerebellum crus I posterior',
pc_index=1,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="region"
)
# Networks with networks
# PC3: DMN vs ContA, DorsAttnB, SalVentAttnA, visual networks
print("\n--- PC3: DMN vs ContA, DorsAttnB, SalVentAttnA, visual ---")
dmn_vs_ContA_pc3 = mean_region_or_network_correlation(
query='defaultB',
with_query='ContA',
pc_index=2,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
dmn_vs_dorsal_pc3 = mean_region_or_network_correlation(
query='defaultB',
with_query='DorsAttnB',
pc_index=2,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
dmn_vs_SalVentAttnA_pc3 = mean_region_or_network_correlation(
query='defaultB',
with_query='SalVentAttnA',
pc_index=2,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
dmn_vs_visual_pc3 = mean_region_or_network_correlation(
query='defaultB',
with_query='VisCent',
pc_index=2,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
# PC4: DMN vs SomMotA/SalVentAttnA
print("\n--- PC4: DMN vs SomMotA, DMN vs SalVentAttnA ---")
dmn_vs_SomMotA_pc4 = mean_region_or_network_correlation(
query='defaultB',
with_query='SomMotA',
pc_index=3,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
dmn_vs_SalVentAttnA_pc4 = mean_region_or_network_correlation(
query='defaultB',
with_query='SalVentAttnA',
pc_index=3,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
# PC5: DorsAttnB vs DMN, DorsAttnB vs ContA, ContA vs DMN, ContA vs SomMotA
print("\n--- PC5: DorsAttnB vs DMN/ContA, ContA vs DMN/SomMotA ---")
dorsal_vs_dmn_pc5 = mean_region_or_network_correlation(
query='DorsAttnB',
with_query='defaultB',
pc_index=4,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
dorsal_vs_ContA_pc5 = mean_region_or_network_correlation(
query='DorsAttnB',
with_query='ContA',
pc_index=4,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
ContA_vs_dmn_pc5 = mean_region_or_network_correlation(
query='ContA',
with_query='defaultB',
pc_index=4,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
ContA_vs_SomMotA_pc5 = mean_region_or_network_correlation(
query='ContA',
with_query='SomMotA',
pc_index=4,
atlas_labels=atlas_labels,
pca=pca,
fc_size=fc_size,
reconstruct_full_matrix_func=reconstruct_full_matrix,
query_type="network"
)
Mean correlation of region(s) matching 'retrosplenial' with all other regions/networks in PC1: 0.0285 Mean correlation of region(s) matching 'retrosplenial' with all other regions/networks in PC2: 0.0263 Mean correlation of region(s) matching 'internal capsule' with all other regions/networks in PC2: 0.0290 Mean correlation of region(s) matching 'cerebellum crus I posterior' with all other regions/networks in PC2: 0.0250 --- PC3: DMN vs ContA, DorsAttnB, SalVentAttnA, visual --- Mean correlation of network(s) matching 'defaultB' with network(s) matching 'ContA' in PC3: -0.0122 Mean correlation of network(s) matching 'defaultB' with network(s) matching 'DorsAttnB' in PC3: -0.0090 Mean correlation of network(s) matching 'defaultB' with network(s) matching 'SalVentAttnA' in PC3: -0.0043 Mean correlation of network(s) matching 'defaultB' with network(s) matching 'VisCent' in PC3: -0.0075 --- PC4: DMN vs SomMotA, DMN vs SalVentAttnA --- Mean correlation of network(s) matching 'defaultB' with network(s) matching 'SomMotA' in PC4: -0.0118 Mean correlation of network(s) matching 'defaultB' with network(s) matching 'SalVentAttnA' in PC4: -0.0124 --- PC5: DorsAttnB vs DMN/ContA, ContA vs DMN/SomMotA --- Mean correlation of network(s) matching 'DorsAttnB' with network(s) matching 'defaultB' in PC5: 0.0062 Mean correlation of network(s) matching 'DorsAttnB' with network(s) matching 'ContA' in PC5: 0.0069 Mean correlation of network(s) matching 'ContA' with network(s) matching 'defaultB' in PC5: -0.0075 Mean correlation of network(s) matching 'ContA' with network(s) matching 'SomMotA' in PC5: -0.0097
Reconstruct the FC with a limited number of components¶
After extracting the principal components, each FC matrix can be expressed as a linear combinations of the principal components plus a residual matrix.
print("Shape of PCA components matrix (n_components, n_features):", pca.components_.shape)
# Project PC components back into each FC subject space
fc_projected = pca.transform(fc_concat)[:, :n_comp]
print("Shape of projected FC onto the first n_comp PCs (n_samples, n_comp):", fc_projected.shape)
fc_reconstructed = np.dot(fc_projected, pca.components_[:n_comp, :]) + np.mean(fc_concat, axis=0)
print("Shape of reconstructed FC matrix (n_samples, n_features):", fc_reconstructed.shape)
Shape of PCA components matrix (n_components, n_features): (36, 7140) Shape of projected FC onto the first n_comp PCs (n_samples, n_comp): (36, 6) Shape of reconstructed FC matrix (n_samples, n_features): (36, 7140)
Evolution of the PC loading across movie repetition¶
# Create the figure and subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15,10), sharex=True)
axes = axes.flatten() # Flatten the axes array for easy indexing
# Ensure axes is iterable even if n_comp == 1
if n_comp == 1:
axes = [axes]
# Collect p-values for multiple comparisons
slopes = []
pvals = []
regress_results = []
for i in range(n_comp):
X = np.arange(fc_projected.shape[0])
y = fc_projected[:, i]
res = stats.linregress(X, y)
slopes.append(res.slope)
pvals.append(res.pvalue)
regress_results.append(res)
# Correct p-values for multiple comparisons using FDR
_, pvals_fdr = fdrcorrection(pvals, alpha=0.05, method='indep')
for i in range(n_comp):
ax = axes[i]
ax.plot(range(fc_projected.shape[0]), fc_projected[:, i], marker='o', label=f'PC{i+1}')
res = regress_results[i]
# Use original slope, draw regression line, and annotate with corrected p-value
ax.plot(
X,
res.intercept + res.slope * X,
linestyle='--',
label=f'Linear Fit\nβ={res.slope:.3f}, p={pvals_fdr[i]:.2g}'
)
ax.set_xticks(np.arange(len(ses_index)))
ax.set_xticklabels(ses_index, rotation=90)
ax.set_xlabel('Session Index', fontsize=12)
ax.set_ylabel('PC Weight', fontsize=12)
ax.set_title(f'PC{i+1}', fontsize=14)
ax.legend()
plt.tight_layout()
plt.show()
Most strikingly, the brain pattern captured by PC1 becomes less prevalent across sessions, whereas those captured by PC4 become more prevalent.
Compute the residual matrix¶
# Calculate the residuals as the difference between the full and reconstructed FC matrices
residuals = fc_concat - fc_reconstructed
print(f"Residuals matrix shape (n_sessions, n_features): {residuals.shape}")
Residuals matrix shape (n_sessions, n_features): (36, 7140)
import numpy as np
from sklearn.decomposition import PCA
# Project the data onto the residual components (from n_comp + 1 onwards)
residuals_frompc = pca.transform(fc_concat)[:, n_comp:]
# Reconstruct the data using the residual components
residuals_frompc = np.dot(residuals_frompc, pca.components_[n_comp:, :]) + np.mean(fc_concat, axis=0)
# Print the shape of the reconstructed data
print(f"Residuals from PC matrix shape (n_sessions, n_features): {residuals_frompc.shape}")
Residuals from PC matrix shape (n_sessions, n_features): (36, 7140)
Plot residual matrix¶
Sanity check the reconstruction function¶
# Test my reconstruction function
upper_triangle = fc_matrix[np.triu_indices_from(fc_matrix, k=0)]
full_matrix = reconstruct_full_matrix(upper_triangle, fc_size)
full_matrix.shape
assert np.array_equal(full_matrix, fc_matrix)
residual = reconstruct_full_matrix(residuals[0,:], fc_size)
# Plot the residual matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(residual, cmap='viridis')
plt.title('Residual Matrix Heatmap')
plt.show()
# Check whether reconstructing the residuals a different way leads to the same result
residual_frompc = reconstruct_full_matrix(residuals_frompc[0,:], fc_size)
# Plot the residual matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(residual_frompc, cmap='viridis')
plt.title('Residual Matrix Heatmap')
plt.show()
Plot the original FC, reconstructed FC and residual matrix for each session¶
# Iterate over each session
for s in range(fc_concat.shape[0]):
# Reconstruct the full matrices for original FC, reconstructed FC, and residuals
original_fc = reconstruct_full_matrix(fc_concat[s, :], fc_size)
reconstructed_fc = reconstruct_full_matrix(fc_reconstructed[s, :], fc_size)
residual_fc = reconstruct_full_matrix(residuals[s, :], fc_size)
# Create a 1x3 subplot
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
# Plot the original FC matrix
sns.heatmap(original_fc, cmap='viridis', ax=axes[0])
axes[0].set_title(f'Session {ses_index[s]}: Original FC', fontsize=14)
# Plot the reconstructed FC matrix
sns.heatmap(reconstructed_fc, cmap='viridis', ax=axes[1])
axes[1].set_title(f'Session {ses_index[s]}: Reconstructed FC', fontsize=14)
# Plot the residual matrix
sns.heatmap(residual_fc, cmap='viridis', ax=axes[2])
axes[2].set_title(f'Session {ses_index[s]}: Residual', fontsize=14)
# Adjust layout and display the plot
plt.tight_layout()
plt.show()
Plot the variance of the residuals over sessions¶
def plot_res_variance(residuals, groupedby=False, min_res_var=0, max_res_var=1000, ses_id=None):
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
# Compute the variance of the residuals for each session
residuals_var = np.var(residuals, axis=1)
min_res_var = min(min_res_var, min(residuals_var))
max_res_var = max(max_res_var, max(residuals_var))
# Plot the variance of the residuals as a function of the session index
plt.plot(range(len(residuals_var)), residuals_var, marker='o', linestyle='-', label='Variance of Residuals')
# Fit and plot a negative exponential curve
def neg_exp(x, a, b, c):
return a * np.exp(-b * x) + c
if residuals_var.shape[0] > 1:
# Fit and plot a linear curve
linear_model = np.poly1d(np.polyfit(range(len(residuals_var)), residuals_var, 1))
linear_r2 = r2_score(residuals_var, linear_model(range(len(residuals_var))))
plt.plot(range(len(residuals_var)), linear_model(range(len(residuals_var))), linestyle='--', label=f'Linear Fit (R$^2$={linear_r2:.2f})')
if residuals_var.shape[0] > 3:
#popt, _ = curve_fit(neg_exp, range(len(residuals_var)), residuals_var, p0=(1, 0.01, 1))
#neg_exp_r2 = r2_score(residuals_var, neg_exp(range(len(residuals_var)), *popt))
#plt.plot(range(len(residuals_var)), neg_exp(range(len(residuals_var)), *popt), linestyle='--', label=f'Negative Exponential Fit (R$^2$={neg_exp_r2:.2f})')
# Fit and plot a quadratic curve
quadratic_model = np.poly1d(np.polyfit(range(len(residuals_var)), residuals_var, 2))
quadratic_r2 = r2_score(residuals_var, quadratic_model(range(len(residuals_var))))
plt.plot(range(len(residuals_var)), quadratic_model(range(len(residuals_var))), linestyle='--', label=f'Quadratic Fit (R$^2$={quadratic_r2:.2f})')
if ses_id is not None:
plt.xticks(ticks=np.arange(len(ses_id)), labels=ses_id)
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.legend()
if groupedby:
plt.title(groupedby, fontsize=16)
return min_res_var, max_res_var
# Exclude 30th column
#residuals_wo_outlier = np.delete(residuals, 30, axis=1)
plt.figure(figsize=(10, 6))
# Plot with residuals
#plt.subplot(1, 2, 1)
plot_res_variance(residuals)
plt.xticks(ticks=np.arange(len(ses_index)), labels=ses_index, rotation=90)
plt.xlabel('Session Index', fontsize=16)
plt.ylabel('Variance of Residuals', fontsize=16)
#plt.title('All Sessions', fontsize=16)
#plt.ylim(0.06, 0.24)
"""
# Originally in the abstract I had investigated the effect of excluding the outlier session on the variance of the residuals.
# Plot with residuals_wo_outlier
plt.subplot(1, 2, 2)
plot_res_variance(residuals_wo_outlier)
#plot_res_variance(residuals)
plt.xticks(ticks=np.arange(len(ses_index)-1), labels=np.delete(ses_index, 30), rotation=90)
plt.xlabel('Session Index', fontsize=14)
plt.ylabel('Variance of Residuals', fontsize=14)
plt.title('Excluding the Outlier', fontsize=16)
#plt.ylim(0.06, 0.24)
"""
plt.tight_layout()
plt.show()
Plot edgewise residual over sessions¶
def plot_edgewise_residual(residuals, groupedby=False, ses_id=None):
# Calculate the interquartile range (IQR)
q1 = np.percentile(residuals, 25, axis=1)
q3 = np.percentile(residuals, 75, axis=1)
# Plot the evolution of the residual for each edge
for i in range(residuals.shape[1]):
plt.plot(range(residuals.shape[0]), residuals[:,i], color='blue', alpha=0.1)
# Plot the inter-quartile
plt.plot(range(residuals.shape[0]), q1, color='orange', label='25th percentile')
plt.plot(range(residuals.shape[0]), q3, color='orange', label='75th percentile')
if ses_id is not None:
plt.xticks(ticks=np.arange(len(ses_id)), labels=ses_id)
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.legend()
if groupedby:
plt.title(groupedby, fontsize=16)
plot_edgewise_residual(residuals)
plt.xlabel('Session Index', fontsize=14)
plt.ylabel('Residual Value', fontsize=14)
plt.show()
Retrieve confounds¶
Retrieve scan time and day of week from the scans.tsv in the dataset repo¶
# Retrieve the confounds from the scans.tsv
confounds_of_interest = [
'caffeine_2h',
'caffeine_24h'
]
scans_df = get_confounds(dataset_path, confound_path, confounds_of_interest, iqms_path=iqms_path)
# Keep only the magnitude part of functional scans and first echo
scans_df = scans_df[scans_df["modality"] == "func"]
# Create dataframe based on the list of files
files_df = pd.DataFrame(files, columns=['file_path'])
files_df = files_df.assign(
subject=files_df["file_path"].str.extract(r"sub-(\d+)/"),
session=files_df["file_path"].str.extract(r"ses-(\w+)/"),
task=files_df["file_path"].str.extract(r"task-(\w+)_"),
)
#Pair filenames with acquisition time
confounds_df = pd.merge(
scans_df, files_df, on=["subject", "session", "task"], how="inner"
)
confounds_df.drop(columns=["file_path"], inplace=True)
print(confounds_df.head())
Warning: Only the IQMs corresponding to the second echo are retained in the IQMs DataFrame.
filename \
0 sub-001/ses-001/func/sub-001_ses-001_task-rsmo...
1 sub-001/ses-003/func/sub-001_ses-003_task-rsmo...
2 sub-001/ses-004/func/sub-001_ses-004_task-rsmo...
3 sub-001/ses-005/func/sub-001_ses-005_task-rsmo...
4 sub-001/ses-006/func/sub-001_ses-006_task-rsmo...
acq_time subject session modality task pe_dir \
0 2023-10-20T19:50:01.187500 001 001 func rsmovie LR
1 2023-10-21T10:12:32.207500 001 003 func rsmovie LR
2 2023-10-21T12:22:08.222500 001 004 func rsmovie RL
3 2023-10-22T10:33:53.380000 001 005 func rsmovie PA
4 2023-10-22T12:18:06.200000 001 006 func rsmovie PA
day_of_week time_of_day fd_mean caffeine_2h caffeine_24h
0 Friday 20:00:00 0.119245 0.0 2.0
1 Saturday 10:00:00 0.096035 1.0 1.0
2 Saturday 12:00:00 0.146882 0.0 1.0
3 Sunday 11:00:00 0.112293 1.0 1.0
4 Sunday 12:00:00 0.088701 1.0 1.0
# Merge residuals with confounds_df to get the time_of_day for each session
residuals_df = pd.DataFrame(residuals)
print("Shape of residuals array (n_samples, n_features):", residuals.shape)
residuals_df = residuals_df.assign(
session = files_df['session'],
)
residuals_df = pd.merge(
residuals_df,
confounds_df[['session', 'day_of_week', 'time_of_day', 'pe_dir', 'caffeine_2h', 'caffeine_24h']],
on='session'
)
print("Shape of residuals_df after merging with confounds_df:", residuals_df.shape)
print("First five rows of residuals_df after merge:")
print(residuals_df.head())
Shape of residuals array (n_samples, n_features): (36, 7140)
Shape of residuals_df after merging with confounds_df: (36, 7146)
First five rows of residuals_df after merge:
0 1 2 3 4 5 6 \
0 -1.731915e-15 0.008568 -0.014353 -0.080859 -0.002130 -0.012745 -0.011353
1 -1.081627e-15 0.001091 0.020877 -0.020616 0.032127 0.067218 0.034650
2 -8.339862e-16 0.016150 -0.056996 -0.079078 -0.110628 -0.012115 0.010908
3 8.614996e-17 0.013174 -0.056856 -0.045357 0.031062 0.009405 -0.018774
4 -8.310795e-16 -0.030164 0.134168 0.060146 0.034667 0.028640 0.026211
7 8 9 ... 7136 7137 7138 7139 session \
0 0.002458 0.055776 0.125133 ... 0.026551 0.0 0.031856 0.0 001
1 0.029521 -0.033804 -0.107729 ... 0.031296 0.0 0.035877 0.0 003
2 0.011550 0.001944 0.015197 ... 0.000338 0.0 -0.024277 0.0 004
3 -0.024694 0.006120 -0.001600 ... -0.009069 0.0 0.026941 0.0 005
4 0.040799 -0.000657 -0.049267 ... -0.041234 0.0 -0.006108 0.0 006
day_of_week time_of_day pe_dir caffeine_2h caffeine_24h
0 Friday 20:00:00 LR 0.0 2.0
1 Saturday 10:00:00 LR 1.0 1.0
2 Saturday 12:00:00 RL 0.0 1.0
3 Sunday 11:00:00 PA 1.0 1.0
4 Sunday 12:00:00 PA 1.0 1.0
[5 rows x 7146 columns]
res = residuals_df.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir']).values.T
residuals_df = residuals_df.assign(
variance = np.var(res, axis=0)
)
# Cast strings to type compatible with statistical models
residuals_df["session"] = pd.to_numeric(residuals_df["session"])
residuals_df["caffeine_2h"] = pd.to_numeric(residuals_df["caffeine_2h"])
residuals_df["caffeine_24h"] = pd.to_numeric(residuals_df["caffeine_24h"])
residuals_df["day_of_week"] = residuals_df["day_of_week"].astype("category")
residuals_df["time_of_day"] = residuals_df["time_of_day"].astype("category")
residuals_df["pe_dir"] = residuals_df["pe_dir"].astype("category")
print(residuals_df.head())
print("Here are the characteristics of the session with the highest variance:")
max_variance_row = residuals_df.loc[residuals_df['variance'].idxmax()]
print(max_variance_row)
residuals_wo_outlier_df = residuals_df.drop(max_variance_row.name)
0 1 2 3 4 5 6 \
0 -1.731915e-15 0.008568 -0.014353 -0.080859 -0.002130 -0.012745 -0.011353
1 -1.081627e-15 0.001091 0.020877 -0.020616 0.032127 0.067218 0.034650
2 -8.339862e-16 0.016150 -0.056996 -0.079078 -0.110628 -0.012115 0.010908
3 8.614996e-17 0.013174 -0.056856 -0.045357 0.031062 0.009405 -0.018774
4 -8.310795e-16 -0.030164 0.134168 0.060146 0.034667 0.028640 0.026211
7 8 9 ... 7137 7138 7139 session \
0 0.002458 0.055776 0.125133 ... 0.0 0.031856 0.0 1
1 0.029521 -0.033804 -0.107729 ... 0.0 0.035877 0.0 3
2 0.011550 0.001944 0.015197 ... 0.0 -0.024277 0.0 4
3 -0.024694 0.006120 -0.001600 ... 0.0 0.026941 0.0 5
4 0.040799 -0.000657 -0.049267 ... 0.0 -0.006108 0.0 6
day_of_week time_of_day pe_dir caffeine_2h caffeine_24h variance
0 Friday 20:00:00 LR 0.0 2.0 0.002463
1 Saturday 10:00:00 LR 1.0 1.0 0.002472
2 Saturday 12:00:00 RL 0.0 1.0 0.002126
3 Sunday 11:00:00 PA 1.0 1.0 0.001782
4 Sunday 12:00:00 PA 1.0 1.0 0.002011
[5 rows x 7147 columns]
Here are the characteristics of the session with the highest variance:
0 0.0
1 -0.0818
2 0.034087
3 -0.179126
4 -0.102308
...
time_of_day 20:00:00
pe_dir RL
caffeine_2h 0.0
caffeine_24h 2.0
variance 0.004153
Name: 21, Length: 7147, dtype: object
Test habituation significance¶
# Fit the linear mixed effects model
X = sm.add_constant(residuals_df["session"]) # Adds the intercept term
y = residuals_df["variance"]
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: variance R-squared: 0.016
Model: OLS Adj. R-squared: -0.013
Method: Least Squares F-statistic: 0.5514
Date: Fri, 10 Apr 2026 Prob (F-statistic): 0.463
Time: 20:21:40 Log-Likelihood: 217.20
No. Observations: 36 AIC: -430.4
Df Residuals: 34 BIC: -427.2
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0027 0.000 14.080 0.000 0.002 0.003
session 4.997e-06 6.73e-06 0.743 0.463 -8.68e-06 1.87e-05
==============================================================================
Omnibus: 1.571 Durbin-Watson: 1.210
Prob(Omnibus): 0.456 Jarque-Bera (JB): 1.178
Skew: 0.441 Prob(JB): 0.555
Kurtosis: 2.906 Cond. No. 53.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# #Statistics excluding the outlier
# # Fit the linear mixed effects model
# X = sm.add_constant(residuals_wo_outlier_df["session"]) # Adds the intercept term
# y = residuals_wo_outlier_df["variance"]
# model_wo_outlier = sm.OLS(y, X).fit()
# print(model_wo_outlier.summary())
# from statsmodels.stats.multitest import multipletests
# # Assuming you have a list of p-values from multiple tests
# p_values = [model.pvalues["session"], model_wo_outlier.pvalues["session"]]
# # Apply Bonferroni correction
# corrected_p_values = multipletests(p_values, method='bonferroni')[1]
# print(f"Original p-values: {p_values}")
# print(f"Corrected p-values: {corrected_p_values}")
Test habituation significance when accounting for confounds of no interest¶
Based on the interaction plots below, we determined that three out of the five confounds of interest we investigated seem to have an influence on the residual evolution. As such, let us add day of week, coffee consumption in the 24h prior to acquisition and phase encoding direction as nuisance variable in the OLS.
# Add day_of_week as a nuisance variable (include it as a covariate, one-hot encoded)
X = pd.get_dummies(residuals_df[["session", "day_of_week"]], drop_first=True)
X = sm.add_constant(X)
# Convert all data to float64 to avoid dtype=object errors
X = X.astype('float64')
y = pd.to_numeric(residuals_df["variance"], errors='coerce')
y = y.astype('float64')
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: variance R-squared: 0.156
Model: OLS Adj. R-squared: -0.055
Method: Least Squares F-statistic: 0.7412
Date: Fri, 10 Apr 2026 Prob (F-statistic): 0.639
Time: 20:21:40 Log-Likelihood: 219.97
No. Observations: 36 AIC: -423.9
Df Residuals: 28 BIC: -411.3
Df Model: 7
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------
const 0.0029 0.000 9.187 0.000 0.002 0.004
session 4.848e-06 6.99e-06 0.694 0.494 -9.47e-06 1.92e-05
day_of_week_Monday -7.429e-05 0.000 -0.201 0.842 -0.001 0.001
day_of_week_Saturday -0.0005 0.000 -1.234 0.228 -0.001 0.000
day_of_week_Sunday -0.0005 0.000 -1.362 0.184 -0.001 0.000
day_of_week_Thursday -0.0003 0.000 -0.705 0.487 -0.001 0.001
day_of_week_Tuesday -0.0001 0.000 -0.371 0.713 -0.001 0.001
day_of_week_Wednesday 0.0002 0.000 0.415 0.682 -0.001 0.001
==============================================================================
Omnibus: 5.252 Durbin-Watson: 1.404
Prob(Omnibus): 0.072 Jarque-Bera (JB): 3.974
Skew: 0.784 Prob(JB): 0.137
Kurtosis: 3.440 Cond. No. 213.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Add phase encoding direction (pe_dir) as a nuisance variable (include it as a covariate, one-hot encoded)
X = pd.get_dummies(residuals_df[["session", "pe_dir"]], drop_first=True)
X = sm.add_constant(X)
# Convert all data to float64 to avoid dtype=object errors
X = X.astype('float64')
y = pd.to_numeric(residuals_df["variance"], errors='coerce')
y = y.astype('float64')
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: variance R-squared: 0.180
Model: OLS Adj. R-squared: 0.074
Method: Least Squares F-statistic: 1.701
Date: Fri, 10 Apr 2026 Prob (F-statistic): 0.175
Time: 20:21:40 Log-Likelihood: 220.48
No. Observations: 36 AIC: -431.0
Df Residuals: 31 BIC: -423.1
Df Model: 4
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0025 0.000 10.045 0.000 0.002 0.003
session 6.774e-06 6.53e-06 1.037 0.308 -6.55e-06 2.01e-05
pe_dir_LR 9.676e-05 0.000 0.360 0.722 -0.000 0.001
pe_dir_PA -0.0002 0.000 -0.737 0.467 -0.001 0.000
pe_dir_RL 0.0005 0.000 1.690 0.101 -9.49e-05 0.001
==============================================================================
Omnibus: 0.781 Durbin-Watson: 1.125
Prob(Omnibus): 0.677 Jarque-Bera (JB): 0.832
Skew: 0.211 Prob(JB): 0.660
Kurtosis: 2.387 Cond. No. 126.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Add caffeine_24h as a nuisance variable (include it as a covariate, one-hot encoded)
X = pd.get_dummies(residuals_df[["session", "caffeine_24h"]], drop_first=True)
X = sm.add_constant(X)
# Convert all data to float64 to avoid dtype=object errors
X = X.astype('float64')
y = pd.to_numeric(residuals_df["variance"], errors='coerce')
y = y.astype('float64')
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: variance R-squared: 0.370
Model: OLS Adj. R-squared: 0.332
Method: Least Squares F-statistic: 9.700
Date: Fri, 10 Apr 2026 Prob (F-statistic): 0.000486
Time: 20:21:40 Log-Likelihood: 225.24
No. Observations: 36 AIC: -444.5
Df Residuals: 33 BIC: -439.7
Df Model: 2
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------
const 0.0012 0.000 3.147 0.003 0.000 0.002
session 4.643e-06 5.47e-06 0.849 0.402 -6.48e-06 1.58e-05
caffeine_24h 0.0008 0.000 4.309 0.000 0.000 0.001
==============================================================================
Omnibus: 4.380 Durbin-Watson: 1.914
Prob(Omnibus): 0.112 Jarque-Bera (JB): 3.383
Skew: 0.744 Prob(JB): 0.184
Kurtosis: 3.200 Cond. No. 146.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Add caffeine_24h, pe_dir, and day_of_week as nuisance variables (all one-hot encoded)
nuisance_vars = ["caffeine_24h", "pe_dir", "day_of_week"]
X = pd.get_dummies(residuals_df[["session"] + nuisance_vars], drop_first=True)
X = sm.add_constant(X)
# Convert all data to float64 to avoid dtype=object errors
X = X.astype('float64')
y = pd.to_numeric(residuals_df["variance"], errors='coerce')
y = y.astype('float64')
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: variance R-squared: 0.553
Model: OLS Adj. R-squared: 0.348
Method: Least Squares F-statistic: 2.697
Date: Fri, 10 Apr 2026 Prob (F-statistic): 0.0204
Time: 20:21:40 Log-Likelihood: 231.40
No. Observations: 36 AIC: -438.8
Df Residuals: 24 BIC: -419.8
Df Model: 11
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------
const 0.0004 0.001 0.603 0.552 -0.001 0.002
session 8.417e-06 5.62e-06 1.497 0.147 -3.19e-06 2e-05
caffeine_24h 0.0011 0.000 3.844 0.001 0.001 0.002
pe_dir_LR 8.816e-05 0.000 0.355 0.726 -0.000 0.001
pe_dir_PA -0.0002 0.000 -0.769 0.449 -0.001 0.000
pe_dir_RL 0.0004 0.000 1.547 0.135 -0.000 0.001
day_of_week_Monday -0.0001 0.000 -0.479 0.636 -0.001 0.000
day_of_week_Saturday 0.0004 0.000 1.092 0.286 -0.000 0.001
day_of_week_Sunday 0.0004 0.000 1.046 0.306 -0.000 0.001
day_of_week_Thursday -0.0002 0.000 -0.640 0.528 -0.001 0.000
day_of_week_Tuesday -0.0001 0.000 -0.444 0.661 -0.001 0.000
day_of_week_Wednesday 0.0002 0.000 0.667 0.511 -0.001 0.001
==============================================================================
Omnibus: 0.477 Durbin-Watson: 1.731
Prob(Omnibus): 0.788 Jarque-Bera (JB): 0.403
Skew: 0.240 Prob(JB): 0.817
Kurtosis: 2.803 Cond. No. 297.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Accounting for confounds of no interest still indicates that the residual variance modestly and non-significantly increase over movie repetition. As such, it does not change our conclusion that we did not find conclusive evidence of an habituation effect.
Plot residuals grouped by bins of confound factor¶
Day of week¶
# Entities specific to this plot
entities = {'visualization': 'resvar', 'confound': 'day'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(3, 3, figsize=(20, 20))
min_res_var = 1000
max_res_var = 0
for i, (day_of_week, group) in enumerate(residuals_df.groupby('day_of_week', observed=False)):
plt.sca(axes[i // 3, i % 3])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
min_res_var, max_res_var = plot_res_variance(res, groupedby=day_of_week, min_res_var=min_res_var, max_res_var=max_res_var, ses_id=ses_id)
#Set same y-axis limits for all subplots
for ax in axes.flat:
ax.set_ylim(min_res_var-0.0005, max_res_var+0.0005)
# Hide the last two subplots (axes[2, 1] and axes[2, 2])
axes[2, 1].axis('off')
axes[2, 2].axis('off')
fig.text(0.5, 0.08, 'Session Index', ha='center', fontsize=14)
fig.text(0.08, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Residual variance evolution separated by day of week.**" \
"We fitted negative exponential and quadratic curves only if the plot had more than three points."
save_caption(caption, save_layout, entities)
Based on the interaction plot, day of week seems to influence the residuals evolution, as such we will add it the OLS as a nuisance regressor.
Time of day¶
# Entities specific to this plot
entities = {'visualization': 'resvar', 'confound': 'time'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(3, 4, figsize=(30, 20))
min_res_var = 1000
max_res_var = 0
for i, (time_of_day, group) in enumerate(residuals_df.groupby('time_of_day', observed=False)):
print("Time of day:", time_of_day)
plt.sca(axes[i // 4, i % 4])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
print("Shape of residuals:", res.shape)
if res.shape[1] > 1:
min_res_var, max_res_var = plot_res_variance(res, groupedby=time_of_day, min_res_var=min_res_var, max_res_var=max_res_var, ses_id=ses_id)
#Set same y-axis limits for all subplots
for ax in axes.flat:
ax.set_ylim(min_res_var-0.01, max_res_var+0.01)
# Hide the last axis
axes[2, 2].axis('off')
axes[2, 3].axis('off')
fig.text(0.5, 0.08, 'Session Index', ha='center', fontsize=14)
fig.text(0.08, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Residual variance evolution separated by time of day.**"
save_caption(caption, save_layout, entities)
Time of day: 09:00:00 Shape of residuals: (1, 7141) Time of day: 10:00:00 Shape of residuals: (2, 7141) Time of day: 11:00:00 Shape of residuals: (2, 7141) Time of day: 12:00:00 Shape of residuals: (3, 7141) Time of day: 18:00:00 Shape of residuals: (2, 7141) Time of day: 19:00:00 Shape of residuals: (2, 7141) Time of day: 20:00:00 Shape of residuals: (11, 7141) Time of day: 21:00:00 Shape of residuals: (5, 7141) Time of day: 22:00:00 Shape of residuals: (7, 7141) Time of day: 23:00:00 Shape of residuals: (1, 7141)
Based on the interaction plot, time of day does not have a substantial influence on the residuals evolution, as such it is not necessary to add it as a nuisance variable in the OLS.
Phase Encoding Direction¶
# Entities specific to this plot
entities = {'visualization': 'resvar', 'confound': 'pe_dir'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
min_res_var = 1000
max_res_var = 0
for i, (pe_dir, group) in enumerate(residuals_df.groupby('pe_dir', observed=False)):
plt.sca(axes[i // 2, i % 2])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
min_res_var, max_res_var = plot_res_variance(res, groupedby=pe_dir, min_res_var=min_res_var, max_res_var=max_res_var, ses_id=ses_id)
#Set same y-axis limits for all subplots
for ax in axes.flat:
ax.set_ylim(min_res_var-0.0005, max_res_var+0.0005)
fig.text(0.5, 0.05, 'Session Index', ha='center', fontsize=14)
fig.text(0.05, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Residual variance evolution separated by phase encoding direction.**"
save_caption(caption, save_layout, entities)
Based on the interaction plot, phase encoding direction seems to influence the residuals evolution, as such we will add it the OLS as a nuisance regressor.
# Entities specific to this plot
entities = {'visualization': 'edgewisevar', 'confound': 'pe_dir'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
min_res_var = 1000
max_res_var = 0
for i, (pe_dir, group) in enumerate(residuals_df.groupby('pe_dir', observed=False)):
plt.sca(axes[i // 2, i % 2])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
plot_edgewise_residual(res, groupedby=pe_dir, ses_id=ses_id)
fig.text(0.5, 0.05, 'Session Index', ha='center', fontsize=14)
fig.text(0.05, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
/tmp/ipykernel_709473/2248055642.py:18: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. plt.savefig(save_path) /home/cprovins/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. fig.canvas.print_figure(bytes_io, **kw)
Caffeine consumption 2h before¶
# Entities specific to this plot
entities = {'visualization': 'resvar', 'confound': 'caffeine2h'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(1, 2, figsize=(21, 6))
min_res_var = 1000
max_res_var = 0
for i, (caf, group) in enumerate(residuals_df.groupby('caffeine_2h')):
plt.sca(axes[i])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
min_res_var, max_res_var = plot_res_variance(res, groupedby=caf, min_res_var=min_res_var, max_res_var=max_res_var, ses_id=ses_id)
#Set same y-axis limits for all subplots
for ax in axes.flat:
ax.set_ylim(min_res_var-0.0001, max_res_var+0.0001)
# Set x-ticks with 90 degree rotation for the first subplot
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=90)
axes[0].set_title("0 cup of coffee")
axes[1].set_title("1 cup of coffee")
fig.text(0.5, 0.05, 'Session Index', ha='center', fontsize=14)
fig.text(0.05, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Residual variance evolution separated by caffeine consumption 2h before.**"
save_caption(caption, save_layout, entities)
# Entities specific to this plot
entities = {'visualization': 'edgewisevar', 'confound': 'caffeine2h'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(1, 2, figsize=(19, 6))
min_res_var = 1000
max_res_var = 0
for i, (caf, group) in enumerate(residuals_df.groupby('caffeine_2h')):
plt.sca(axes[i])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
plot_edgewise_residual(res, groupedby=caf, ses_id= ses_id)
# Set x-ticks with 90 degree rotation for the first subplot
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=90)
axes[0].set_title("0 cup of coffee")
axes[1].set_title("1 cup of coffee")
fig.text(0.5, 0.05, 'Session Index', ha='center', fontsize=14)
fig.text(0.05, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Edgewise variance evolution separated by caffeine consumption 2h before.**"
save_caption(caption, save_layout, entities)
Caffeine consumption 24h before¶
# Entities specific to this plot
entities = {'visualization': 'resvar', 'confound': 'caffeine24h'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(1, 2, figsize=(16, 7))
min_res_var = 1000
max_res_var = 0
for i, (caf, group) in enumerate(residuals_df.groupby('caffeine_24h')):
plt.sca(axes[i])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
min_res_var, max_res_var = plot_res_variance(res, groupedby=caf, min_res_var=min_res_var, max_res_var=max_res_var, ses_id=ses_id)
#Set same y-axis limits for all subplots
for ax in axes.flat:
ax.set_ylim(min_res_var-0.0005, max_res_var+0.0005)
# Set x-ticks with 90 degree rotation for the first subplot
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=90)
axes[0].set_title("1 cup of coffee")
axes[1].set_title("2 cups of coffee")
fig.text(0.5, 0.05, 'Session Index', ha='center', fontsize=14)
fig.text(0.05, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Residual variance evolution separated by caffeine consumption 24h before.**"
save_caption(caption, save_layout, entities)
Based on the interaction plot, coffee consumption seems to influence the residuals evolution, as such we will add it the OLS as a nuisance regressor.
# Entities specific to this plot
entities = {'visualization': 'edgewisevar', 'confound': 'caffeine24h'}
entities.update(entities_base)
# Create figure with subplots for variance and edgewise residuals
fig, axes = plt.subplots(1, 2, figsize=(16, 7))
min_res_var = 1000
max_res_var = 0
for i, (caf, group) in enumerate(residuals_df.groupby('caffeine_24h')):
plt.sca(axes[i])
ses_id = group['session'].apply(lambda x: str(x).zfill(3)).values
res = group.drop(columns=['session', 'day_of_week', 'time_of_day','pe_dir','caffeine_2h', 'caffeine_24h']).values
plot_edgewise_residual(res, groupedby=caf, ses_id=ses_id)
# Set x-ticks with 90 degree rotation for the first subplot
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=90)
axes[0].set_title("1 cup of coffee")
axes[1].set_title("2 cups of coffee")
fig.text(0.5, 0.05, 'Session Index', ha='center', fontsize=14)
fig.text(0.05, 0.5, 'Variance of Residuals', va='center', rotation='vertical', fontsize=14)
save_path = save_layout.build_path(entities, validate=False)
plt.savefig(save_path)
plt.show()
caption = f"**Edgewise variance evolution separated by caffeine consumption 24h before.**"
save_caption(caption, save_layout, entities)
Do some confound explain why ses-043 is seemingly less correlated to the other sessions? Can we find a reason that justifies its exclusion from the fMRI analysis?¶
confounds_path = '/data/datasets/hcph-dataset/phenotype/mood_env_quest.tsv'
confounds_df = pd.read_csv(confounds_path, sep='\t')
# Keep only the confounds from the reliability sessions
confounds_df = confounds_df[confounds_df['session_number'].astype(str).str.zfill(3).str.startswith('0')]
confounds_df['session_number'] = confounds_df['session_number'].astype(str).str.zfill(3)
confounds_df.head()
| participant_id | session_number | scanner | pe_dir | date | time | weather_temperature | weather_wind | weather_precipitation | weather_precipitation_type | ... | slept_qct | slept_rest | slept_bht | rumination | anxiety | physical_pain | time_slept_t1w | time_slept_rest | time_slept_diffusion | details_rumination_anxiety_pain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37 | 1 | 048 | Prisma | PA | 11-14-2023 | 20:45 | 11-14 | 10.4 | 43.0 | 4 Rain or drizzle | ... | No | No | No | 3.0 | 6.0 | 6.0 | NaN | NaN | NaN | NaN |
| 38 | 1 | 047 | Prisma | RL | 11-14-2023 | 18:50 | 11-14 | 10.4 | 43.0 | 4 Rain or drizzle | ... | No | No | No | 3.0 | 6.0 | 5.0 | NaN | NaN | NaN | NaN |
| 39 | 1 | 046 | Prisma | LR | 11-13-2023 | 20:15 | 10-16 | 8.7 | 7.0 | 2 Scattered precipitation (liquid) | ... | No | No | No | 2.0 | 6.0 | 6.0 | NaN | NaN | NaN | NaN |
| 40 | 1 | 045 | Prisma | LR | 11-13-2023 | 18:30 | 10-16 | 8.7 | 7.0 | 2 Scattered precipitation (liquid) | ... | No | No | No | 3.0 | 6.0 | 6.0 | NaN | NaN | NaN | NaN |
| 41 | 1 | 044 | Prisma | LR | 11-12-2023 | 11:10 | 5-10 | 7.0 | 0.1 | 2 Scattered precipitation (liquid) | ... | No | No | No | 3.0 | 6.0 | 6.0 | NaN | NaN | NaN | NaN |
5 rows × 78 columns
confound_to_plot = confounds_df.columns.drop(['participant_id','session_number', 'scanner', 'pe_dir', 'remove_jewelry', 'mr_room_helium', 'details_rumination_anxiety_pain', 'time_slept_t1w', 'time_slept_rest', 'time_slept_diffusion'])
confound_to_plot
Index(['date', 'time', 'weather_temperature', 'weather_wind',
'weather_precipitation', 'weather_precipitation_type',
'weather_pressure', 'weather_humidity', 'weather_daylight',
'blood_pressure', 'caffeine_24h', 'caffeine_2h', 'alcohol', 'triptans',
'nsaids', 'weight', 'general_health', 'general_stress',
'ease_concentration', 'beh_work', 'beh_freetime', 'beh_sport',
'beh_outdoors', 'beh_electronic_devices', 'beh_active_social',
'beh_passive_social', 'sleep_quality', 'remember_dreams', 'day_dreams',
'time_to_bed', 'time_in_bed', 'steps', 'distance_walked',
'stories_climbed', 'calories', 'interested', 'distressed', 'excited',
'upset', 'strong', 'guilty', 'scared', 'hostile', 'enthusiastic',
'proud', 'irritable', 'alert', 'ashamed', 'inspired', 'nervous',
'determined', 'attentive', 'jittery', 'active', 'afraid',
'filled_all_entries', 'mr_room_temperature', 'mr_room_humidity',
'mr_room_pressure', 'slept_session', 'slept_t1w', 'slept_diffusion',
'slept_qct', 'slept_rest', 'slept_bht', 'rumination', 'anxiety',
'physical_pain'],
dtype='object')
# Convert 'time' column to datetime, round up to the closest full hour
confounds_df['time'] = pd.to_datetime(confounds_df['time'], format='%H:%M', errors='coerce').dt.round('h').dt.strftime('%H').astype(float)
slept_cols = [col for col in confounds_df.columns if col.startswith('slept')]
confounds_df[slept_cols] = confounds_df[slept_cols].replace({'Yes': 1, 'No': 0})
Visualize the distribution of all confounds and highlight as a red dot the value corresponding to ses-043
# Entities specific to this plot
entities = {'visualization': 'confounddist'}
entities.update(entities_base)
# Filter sessions with '043' in session_number (as string, zero-padded if needed)
highlight_mask = confounds_df['session_number'].astype(str).str.zfill(3).str.contains('043')
n_cols = 5
n_plots = len(confound_to_plot)
n_rows = int(np.ceil(n_plots / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3+10))
axes = axes.flatten()
for idx, confound in enumerate(confound_to_plot):
ax = axes[idx]
y = confounds_df[confound]
x = np.arange(len(y))
ax.scatter(x, y, alpha=0.7)
# Highlight session 043
ax.scatter(x[highlight_mask], y[highlight_mask], color='red', label='Session 043')
ax.set_title(confound)
ax.grid(True)
# Hide unused subplots
for j in range(idx + 1, len(axes)):
axes[j].set_visible(False)
#plt.tight_layout()
plt.show()
caption = f"**Confounds distribution.** To try to understand whether some confound can explain that Session 043 is seemingly less correlated" \
"to the other sessions, we plotted the distribution of all confounds and highlighted as a red dot the value corresponding to Session 043." \
"Not that it justifies exclusion of ses-043, but it is interesting to see that ses-043 is an outlier in different confound dimensions." \
"Notably, it was one of the only two sessions where the participant reported to be socially active for extended duration, to be outdoor," \
"away from electronic devices, to have more freetime and to have walked more distance and more number of steps than usual." \
"This session is also associated with the only time the sleep quality was recorded at the minimum."
save_caption(caption, save_layout, entities)
Not that it justifies exclusion of ses-043, but it is interesting to see that ses-043 is an outlier in different confound dimensions. Notably, it was one of the only two sessions where the participant reported to be socially active for extended duration, to be outdoor, away from electronic devices, to have more freetime and to have walked more distance and more number of steps than usual. This session is also associated with the only time the sleep quality was recorded at the minimum.