Note

Click here to download the full example code

Figure_2_BCDE.ipynb

Automatically generated by Colaboratory.

Original file is located at: https://colab.research.google.com/github/HernandoMV/APE_paper/blob/main/docs/figures_notebooks/Figure_2_BCDE.ipynb

# This notebook performs a statistical analysis on the learning rates between two groups of mice.
# Performs model fitting of individual mice, comparison of parameter, and statistical analysis
# between the groups to conclude differences and points of divergence in learning

# Commented out IPython magic to ensure Python compatibility.
# run this on Colab
# !rm -rf APE_paper/
# !git clone https://github.com/HernandoMV/APE_paper.git
# %pip install mouse-behavior-analysis-tools
# %cd APE_paper/docs/figures_notebooks

Commented out IPython magic to ensure Python compatibility. %load_ext autoreload %autoreload 2

import matplotlib.pylab as plt
import numpy as np
import seaborn as sns
import pandas as pd
from scipy import stats
from IPython.display import clear_output
from itertools import chain
from os.path import exists
import urllib.request

from mouse_behavior_analysis_tools.utils import custom_functions as cuf
from mouse_behavior_analysis_tools.utils.misc_utils import update_progress
from mouse_behavior_analysis_tools.plot import make_figures
from mouse_behavior_analysis_tools.model import behavior_model as bm
from mouse_behavior_analysis_tools.utils import model_utils

clear_output()

# This dataset has been pre-processed, but conditions have not been selected
# This preprocessing includes removal of disengaged trials and
# removal of the first 5 trials of each session

# download data from server
dataset_name = '6OHDA_dataframe.csv'
url = "https://zenodo.org/record/7261639/files/" + dataset_name
dataset_path = '../data/' + dataset_name
# download if data is not there
if not exists(dataset_path):
    print('Downloading data...')
    urllib.request.urlretrieve(url, dataset_path)
else:
    print('Data already in directory')
# load
df_to_plot = pd.read_csv(dataset_path, index_col=0)

# load the image data
image_dataset_name = '6OHDA_image-data.csv'
url = "https://zenodo.org/record/7261639/files/" + image_dataset_name
image_dataset_path = '../data/' + image_dataset_name
# download if data is not there
if not exists(image_dataset_path):
    print('Downloading data...')
    urllib.request.urlretrieve(url, image_dataset_path)
else:
    print('Data already in directory')
# load
iar = pd.read_csv(image_dataset_path, index_col=0)

# parameters for the plotting
hue_order = ['CortexBuffer', '6OHDAtail']
color_palette = [(0.24715576, 0.49918708, 0.57655991),
             (0.72162039, 0.3669119 , 0.22526315)]

sns.set_palette(color_palette)

# maximum number of trials performed per mouse in the dataset:
df_to_plot.groupby(['AnimalID', 'ExperimentalGroup', 'Protocol']).max()['CumulativeTrialNumberByProtocol']

"""#### Performance by session and 200-sized bins """

# bin trials every 200
df_to_plot["TrialIndexBinned200"] = (df_to_plot.CumulativeTrialNumberByProtocol // 200) * 200 + 100

# Sanity check on the data to see that it looks good

fig = make_figures.make_figure_performance_trials_animals_bin(df_to_plot)
clear_output()
plt.show(fig)
# uncomment here to save the plot
# data_directory = ''
# plt.savefig(data_directory + 'Performance_by_session_individual_animals.pdf',
#             transparent=True, bbox_inches='tight')

"""Colored dots show the performance of the past 100 trials using an average running window. Each color represents a distinct session. The black line shows the performance value of the past 200 trials using trial bining.

#### Fit a sigmoid to every mouse to calculate and compare stages and learning rates

The sigmoid function is (perf_end - 0.5) / (1 + np.exp(-slope * (x - bias))) + 0.5

Data is scaled before the fitting and parameters are rescaled afterwards

The maximum performace possible is defined as the maximum of the median of the trials binned every 200
"""

#calculate the maximum performance for every mouse based on the trials binned every 200
df_bin200tr = df_to_plot.groupby(['AnimalID','ExperimentalGroup','TrialIndexBinned200','Protocol']).median().reset_index()
mouse_max_perf = df_bin200tr.groupby('AnimalID').max().reset_index()[['AnimalID', 'CurrentPastPerformance100']]

# fit the model
fit_df = bm.get_df_of_behavior_model_by_animal(df_to_plot, mouse_max_perf)

# find the steepest point of each slope (max of derivative)
der_max_dir = bm.get_steepest_point_of_slope(fit_df)

# plot the curves again pointing to the maximum
# sanity check to see that these scaled values recreate the curves
fig = make_figures.make_figure_performance_trials_animals_model(df_to_plot, fit_df, der_max_dir)
clear_output()
plt.show(fig)
# plt.savefig(data_directory + 'Sigmoid_fitting.pdf', transparent=True, bbox_inches='tight')

"""As a learning speed measure I use the point of the maximum learning rate, which is the maximum of the derivative of the fitting, which is also the middle point of the curve (in between 50% and maximum performance)
The bias indicates where this point lies in the x axis: at which trial number this point is reached
"""

# add the maximum value of the derivative to the fit_df dataframe
for key, value in der_max_dir.items():
    fit_df.loc[fit_df.index[fit_df['AnimalID'] == key].tolist()[0], 'max_of_der'] = value[1]

"""### Merge with image data"""

merged_df = pd.merge(left=fit_df, right=iar, how='left', left_on='AnimalID', right_on='mouse id')[
    ['AnimalID',
     'maximum_performance',
     'bias',
     'ExperimentalGroup',
     'max_of_der',
     'median_performance_reached_in_last_ntrials',
     'n trials to criterion',
     'max performance reached',
     'ratio posterior/anterior']
]

"""##### The following cell creates **Figure 2 E** from the paper"""

fig = make_figures.make_figure_6ohda_lesion_correlation(merged_df, color_palette)
plt.show(fig)
# plt.savefig(data_directory + '6ohda_lesion-correlation.pdf', transparent=True, bbox_inches='tight')

# There is one lesioned animal that is very close to the controls in terms of the lesioned area:
merged_df[['AnimalID', 'ExperimentalGroup', 'ratio posterior/anterior']]

# Remove this animal from the study
ans_to_remove = ['SomFlp08']
# ans_to_remove = []
df_to_plot = df_to_plot[~df_to_plot.AnimalID.isin(ans_to_remove)].copy()
fit_df = fit_df[~fit_df.AnimalID.isin(ans_to_remove)].copy()
print('Animals removed from the following analysis based on this:', ans_to_remove)

"""### Differences of parameters between groups"""

# calculate significance
parameters_to_show = ['maximum_performance', 'max_of_der']
# define three levels of significance for the plotting
sig_levels = [0.05, 0.01, 0.001]
pvals = []
print('Kluskal-Wallis tests on the parameters')
for var in parameters_to_show:
    kt = stats.kruskal(fit_df[fit_df.ExperimentalGroup==hue_order[0]][var].dropna(),
                       fit_df[fit_df.ExperimentalGroup==hue_order[1]][var].dropna())
    print( var + ':\t\tpvalue: ' + str(kt.pvalue) )
    pvals.append(kt.pvalue)

"""##### The following cell creates **Figures 2 C, D** from the paper"""

# compare the parameters between groups
titles = ['maximum\nperformance', 'maximum\nlearning rate']
ylabs = ['task performance (%)', '\u0394performance \u00D7 trials$\mathregular{^{-1}}$']
yvals = [70, 0.001] # for the statistics

fig = make_figures.make_figure_learning_parameters_between_groups(fit_df, parameters_to_show,
                                                                  titles, ylabs,
                                                                  pvals, sig_levels,
                                                                  color_palette, hue_order, yvals)

# plt.savefig(data_directory + 'Parameters_group_comparison.pdf', transparent=True, bbox_inches='tight')
plt.show(fig)

for i,s in enumerate(sig_levels):
    print(i+1, 'asterisks: pval <', s)



"""### calculate the statistical differences of performances between groups at different points in learning

##### The following cell creates the first part of **Figure 2 B** from the paper
"""

# Performance differences between groups across learning
col_to_plot = 'CurrentPastPerformance100'
fig1 = make_figures.make_figure_differences_performance_between_groups(df_to_plot, col_to_plot,
                                                                       hue_order, color_palette)

# plt.savefig(data_directory + 'Performance_between_groups.pdf', transparent=True, bbox_inches='tight')
plt.show(fig1)
print('Shaded area indicates std, and performance is calculated using', col_to_plot)

7. Separate performance on high and low tones¶

create a mask for the TrialHighPerc columns

high_mask = df_to_plot['TrialHighPerc'] == 98.0
low_mask = df_to_plot['TrialHighPerc'] == 2.0
df_high = df_to_plot[high_mask].copy()
df_low = df_to_plot[low_mask].copy()

# reset the index and the cumulative trial number by protocol
df_high.reset_index(inplace=True)
df_low.reset_index(inplace=True)
df_high.drop('index', axis=1, inplace=True)
df_low.drop('index', axis=1, inplace=True)

# Restart the count of CumulativeTrialNumber for every protocol
df_high['CumulativeTrialNumberByProtocol'] = np.nan
df_low['CumulativeTrialNumberByProtocol'] = np.nan

for Aid in pd.unique(df_high['AnimalID']):
    for Prot in pd.unique(df_high['Protocol']):
        conditions = np.logical_and(df_high['AnimalID'] == Aid, df_high['Protocol'] == Prot)
        df_high.CumulativeTrialNumberByProtocol.loc[df_high[conditions].index] = \
            np.arange(len(df_high[conditions])) + 1

for Aid in pd.unique(df_low['AnimalID']):
    for Prot in pd.unique(df_low['Protocol']):
        conditions = np.logical_and(df_low['AnimalID'] == Aid, df_low['Protocol'] == Prot)
        df_low.CumulativeTrialNumberByProtocol.loc[df_low[conditions].index] = \
            np.arange(len(df_low[conditions])) + 1


# calculate performance
PAST_WINDOW = 50
CumPerListHigh = []
CumPerListLow = []
for Sid in pd.unique(df_to_plot['SessionID']):
    CumPerListHigh.append(cuf.perf_window_calculator(df_high[df_high['SessionID']==Sid],
                                                     PAST_WINDOW))
    CumPerListLow.append(cuf.perf_window_calculator(df_low[df_low['SessionID']==Sid],
                                                     PAST_WINDOW))
# flatten the list of lists
df_high['CurrentPastPerformance100biasremoved'] = np.array(list(chain(*[x for x in CumPerListHigh])))
df_low['CurrentPastPerformance100biasremoved'] = np.array(list(chain(*[x for x in CumPerListLow])))

Plot performance on high and low tones

col_to_plot = 'CurrentPastPerformance100biasremoved'
figX = make_figures.make_figure_differences_performance_between_groups(df_high, col_to_plot,
                                                                       hue_order, color_palette)
data_directory = ''
plt.savefig(data_directory + '6OHDA_Performance_between_groups_high_tones.pdf', transparent=True, bbox_inches='tight')
figX.show()

"""#### Calculate the significance by resampling: suffle the group labels multiple times and calculate the likelihood of observing this data
The shuffling respects the proportion of mice in every group.

References:

Supplementary figure 4 in here: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2562676/

See also methods here: https://www.biorxiv.org/content/10.1101/716274v3.full
"""

# define a 100-trial window to bin the data
xbin = 100
df_to_plot["TrialIndexBinned"] = (df_to_plot.CumulativeTrialNumberByProtocol // xbin) * xbin + xbin / 2
print('Trials are binned in groups of', xbin)

# groupby so each animal has a mean of the performance in each bin
df_bintr = df_to_plot.groupby(['AnimalID','ExperimentalGroup','TrialIndexBinned','Protocol']).mean().reset_index()

# create a scaled version of the performance
df_bintr['Performance'] = df_bintr.FirstPokeCorrect * 100

# calculate the differences of the real means using the binned data
real_data_pd = df_bintr[df_bintr.ExperimentalGroup == hue_order[1]].groupby('TrialIndexBinned').mean()['Performance'] -\
               df_bintr[df_bintr.ExperimentalGroup == hue_order[0]].groupby('TrialIndexBinned').mean()['Performance']

# Select the amount of times to shuffle. Originally this is done 10,000 times
# use a smaller number to speed things up
nsh = 1000
print('Data is shuffled', nsh, 'times')

# select the important columns to get the shuffled data
df_colsel = df_bintr[['AnimalID', 'ExperimentalGroup', 'TrialIndexBinned', 'Performance']].copy()
# get the shuffled data
shrdf = cuf.get_shuffled_means_difference_df(df_colsel, hue_order, nsh)

# calculate the confidence intervals for the shuffled data
pos_ci = shrdf.groupby('TrialIndexBinned').quantile(.95)
neg_ci = shrdf.groupby('TrialIndexBinned').quantile(.05)

"""##### The following cell creates the second part of **Figure 2 B** from the paper

This shows how likely is that each time point crosses 'random' line (point-base significance).
"""

fig2 = make_figures.make_figure_differences_performance_significance(real_data_pd, pos_ci, neg_ci)
# plt.savefig(data_directory + 'Differences_of_means_significance_by_trial_bins.pdf',transparent=True, bbox_inches='tight')
plt.show(fig2)

Total running time of the script: ( 0 minutes 0.000 seconds)

Gallery generated by Sphinx-Gallery