.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/figure_2_bcde.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download_auto_examples_figure_2_bcde.py>` to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_figure_2_bcde.py:

Figure_2_BCDE.ipynb
===================

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/github/HernandoMV/APE_paper/blob/main/docs/figures_notebooks/Figure_2_BCDE.ipynb

.. GENERATED FROM PYTHON SOURCE LINES 9-21

.. code-block:: default

    # This notebook performs a statistical analysis of the learning rates of two
    # groups of mice. It fits a model to each mouse individually, compares the
    # fitted parameters, and runs statistical tests between the groups to
    # identify differences and points of divergence in learning.

    # Commented out IPython magic to ensure Python compatibility.
    # run this on Colab
    # !rm -rf APE_paper/
    # !git clone https://github.com/HernandoMV/APE_paper.git
    # %pip install mouse-behavior-analysis-tools
    # %cd APE_paper/docs/figures_notebooks

.. GENERATED FROM PYTHON SOURCE LINES 22-25

Commented out IPython magic to ensure Python compatibility.
%load_ext autoreload
%autoreload 2

.. GENERATED FROM PYTHON SOURCE LINES 25-216
.. code-block:: default

    import matplotlib.pylab as plt
    import numpy as np
    import seaborn as sns
    import pandas as pd
    from scipy import stats
    from IPython.display import clear_output
    from itertools import chain
    from os.path import exists
    import urllib.request

    from mouse_behavior_analysis_tools.utils import custom_functions as cuf
    from mouse_behavior_analysis_tools.utils.misc_utils import update_progress
    from mouse_behavior_analysis_tools.plot import make_figures
    from mouse_behavior_analysis_tools.model import behavior_model as bm
    from mouse_behavior_analysis_tools.utils import model_utils
    clear_output()

    # This dataset has been pre-processed, but conditions have not been selected.
    # The preprocessing includes the removal of disengaged trials and
    # of the first 5 trials of each session.

    # download data from server
    dataset_name = '6OHDA_dataframe.csv'
    url = "https://zenodo.org/record/7261639/files/" + dataset_name
    dataset_path = '../data/' + dataset_name
    # download only if the data is not there already
    if not exists(dataset_path):
        print('Downloading data...')
        urllib.request.urlretrieve(url, dataset_path)
    else:
        print('Data already in directory')
    # load
    df_to_plot = pd.read_csv(dataset_path, index_col=0)

    # load the image data
    image_dataset_name = '6OHDA_image-data.csv'
    url = "https://zenodo.org/record/7261639/files/" + image_dataset_name
    image_dataset_path = '../data/' + image_dataset_name
    # download only if the data is not there already
    if not exists(image_dataset_path):
        print('Downloading data...')
        urllib.request.urlretrieve(url, image_dataset_path)
    else:
        print('Data already in directory')
    # load
    iar = pd.read_csv(image_dataset_path, index_col=0)

    # parameters for the plotting
    hue_order = ['CortexBuffer', '6OHDAtail']
    color_palette = [(0.24715576, 0.49918708, 0.57655991),
                     (0.72162039, 0.3669119, 0.22526315)]
    sns.set_palette(color_palette)

    # maximum number of trials performed per mouse in the dataset:
    df_to_plot.groupby(['AnimalID', 'ExperimentalGroup',
                        'Protocol']).max()['CumulativeTrialNumberByProtocol']
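    # Aside (illustration only, not part of the original analysis): the binning
    # used below maps each trial index to the centre of its 200-trial bin via
    # floor division, e.g. trials 0-199 -> 100 and trials 200-399 -> 300:
    example_trials = np.array([0, 150, 199, 200, 399, 400])
    example_bins = (example_trials // 200) * 200 + 100
    # example_bins is [100, 100, 100, 300, 300, 500]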
    """#### Performance by session and 200-trial bins
    """

    # bin trials every 200
    df_to_plot["TrialIndexBinned200"] = \
        (df_to_plot.CumulativeTrialNumberByProtocol // 200) * 200 + 100

    # Sanity check on the data to see that it looks good
    fig = make_figures.make_figure_performance_trials_animals_bin(df_to_plot)
    clear_output()
    plt.show(fig)
    # uncomment here to save the plot
    # data_directory = ''
    # plt.savefig(data_directory + 'Performance_by_session_individual_animals.pdf',
    #             transparent=True, bbox_inches='tight')

    """Colored dots show the performance over the past 100 trials using a
    running-average window. Each color represents a distinct session. The black
    line shows the performance over the past 200 trials using trial binning.

    #### Fit a sigmoid to every mouse to calculate and compare stages and learning rates

    The sigmoid function is (perf_end - 0.5) / (1 + np.exp(-slope * (x - bias))) + 0.5

    Data is scaled before the fitting and the parameters are rescaled afterwards.
    The maximum performance possible is defined as the maximum of the median of
    the trials binned every 200.
    """

    # calculate the maximum performance for every mouse based on the trials binned every 200
    df_bin200tr = df_to_plot.groupby(['AnimalID', 'ExperimentalGroup',
                                      'TrialIndexBinned200', 'Protocol']).median().reset_index()
    mouse_max_perf = df_bin200tr.groupby('AnimalID').max().reset_index()[
        ['AnimalID', 'CurrentPastPerformance100']]

    # fit the model
    fit_df = bm.get_df_of_behavior_model_by_animal(df_to_plot, mouse_max_perf)

    # find the steepest point of each slope (the maximum of the derivative)
    der_max_dir = bm.get_steepest_point_of_slope(fit_df)

    # plot the curves again, pointing to the maximum
    # sanity check to see that these scaled values recreate the curves
    fig = make_figures.make_figure_performance_trials_animals_model(df_to_plot, fit_df, der_max_dir)
    clear_output()
    plt.show(fig)
    # plt.savefig(data_directory + 'Sigmoid_fitting.pdf', transparent=True, bbox_inches='tight')
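    # Aside (a numerical sketch with made-up parameter values, not data from
    # the paper): for the sigmoid above, the derivative peaks exactly at
    # x = bias, where the curve sits halfway between 50% and perf_end.
    def example_sigmoid(x, perf_end, slope, bias):
        return (perf_end - 0.5) / (1 + np.exp(-slope * (x - bias))) + 0.5

    example_x = np.linspace(0, 4000, 8001)
    example_y = example_sigmoid(example_x, perf_end=0.9, slope=0.005, bias=2000)
    example_steepest = example_x[np.argmax(np.gradient(example_y, example_x))]
    # example_steepest falls (numerically) at the bias, trial 2000, where the
    # performance is (0.9 - 0.5) / 2 + 0.5 = 0.7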
    """As a measure of learning speed I use the point of maximum learning rate,
    i.e. the maximum of the derivative of the fit, which is also the middle
    point of the curve (halfway between 50% and the maximum performance).
    The bias indicates where this point lies on the x axis: at which trial
    number it is reached.
    """

    # add the maximum value of the derivative to the fit_df dataframe
    for key, value in der_max_dir.items():
        fit_df.loc[fit_df.index[fit_df['AnimalID'] == key].tolist()[0], 'max_of_der'] = value[1]

    """### Merge with image data"""

    merged_df = pd.merge(left=fit_df, right=iar, how='left',
                         left_on='AnimalID', right_on='mouse id')[
        ['AnimalID', 'maximum_performance', 'bias', 'ExperimentalGroup', 'max_of_der',
         'median_performance_reached_in_last_ntrials', 'n trials to criterion',
         'max performance reached', 'ratio posterior/anterior']
    ]

    """##### The following cell creates **Figure 2 E** from the paper"""

    fig = make_figures.make_figure_6ohda_lesion_correlation(merged_df, color_palette)
    plt.show(fig)
    # plt.savefig(data_directory + '6ohda_lesion-correlation.pdf', transparent=True, bbox_inches='tight')

    # There is one lesioned animal that is very close to the controls in terms of the lesioned area:
    merged_df[['AnimalID', 'ExperimentalGroup', 'ratio posterior/anterior']]

    # Remove this animal from the study
    ans_to_remove = ['SomFlp08']
    # ans_to_remove = []
    df_to_plot = df_to_plot[~df_to_plot.AnimalID.isin(ans_to_remove)].copy()
    fit_df = fit_df[~fit_df.AnimalID.isin(ans_to_remove)].copy()
    print('Animals removed from the following analysis based on this:', ans_to_remove)

    """### Differences of parameters between groups"""

    # calculate significance
    parameters_to_show = ['maximum_performance', 'max_of_der']
    # define three levels of significance for the plotting
    sig_levels = [0.05, 0.01, 0.001]
    pvals = []
    print('Kruskal-Wallis tests on the parameters')
    for var in parameters_to_show:
        kt = stats.kruskal(fit_df[fit_df.ExperimentalGroup == hue_order[0]][var].dropna(),
                           fit_df[fit_df.ExperimentalGroup == hue_order[1]][var].dropna())
        print(var + ':\t\tpvalue: ' + str(kt.pvalue))
        pvals.append(kt.pvalue)
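    # Aside (toy numbers, not data from the paper): stats.kruskal is a
    # non-parametric rank-based test; a small p-value suggests the two samples
    # come from distributions with different medians.
    example_kt = stats.kruskal([0.70, 0.80, 0.75, 0.82], [0.55, 0.60, 0.58, 0.62])
    # example_kt.pvalue is ~0.02, below the 0.05 significance level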
    """##### The following cell creates **Figures 2 C, D** from the paper"""

    # compare the parameters between groups
    titles = ['maximum\nperformance', 'maximum\nlearning rate']
    ylabs = ['task performance (%)', '\u0394performance \u00D7 trials$\mathregular{^{-1}}$']
    yvals = [70, 0.001]  # for the statistics
    fig = make_figures.make_figure_learning_parameters_between_groups(
        fit_df, parameters_to_show, titles, ylabs, pvals,
        sig_levels, color_palette, hue_order, yvals)
    # plt.savefig(data_directory + 'Parameters_group_comparison.pdf', transparent=True, bbox_inches='tight')
    plt.show(fig)

    for i, s in enumerate(sig_levels):
        print(i + 1, 'asterisks: pval <', s)

    """### Calculate the statistical differences in performance between groups
    at different points in learning

    ##### The following cell creates the first part of **Figure 2 B** from the paper
    """

    # Performance differences between groups across learning
    col_to_plot = 'CurrentPastPerformance100'
    fig1 = make_figures.make_figure_differences_performance_between_groups(df_to_plot,
                                                                           col_to_plot,
                                                                           hue_order,
                                                                           color_palette)
    # plt.savefig(data_directory + 'Performance_between_groups.pdf', transparent=True, bbox_inches='tight')
    plt.show(fig1)

    print('Shaded area indicates std, and performance is calculated using', col_to_plot)

.. GENERATED FROM PYTHON SOURCE LINES 217-220

7. Separate performance on high and low tones
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Create a mask for the TrialHighPerc columns.

.. GENERATED FROM PYTHON SOURCE LINES 220-263
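As a side note, the per-animal, per-protocol renumbering done below with explicit loops can equivalently be sketched with a vectorized ``groupby(...).cumcount()``. A minimal example with made-up rows (column names follow this dataset):

.. code-block:: python

    import pandas as pd

    # toy frame with the columns used in this notebook
    df = pd.DataFrame({
        'AnimalID': ['m1', 'm1', 'm1', 'm2', 'm2'],
        'Protocol': ['Aud', 'Aud', 'Vis', 'Aud', 'Aud'],
    })
    # restart the trial count at 1 for every (animal, protocol) pair
    df['CumulativeTrialNumberByProtocol'] = (
        df.groupby(['AnimalID', 'Protocol']).cumcount() + 1
    )
    # -> m1/Aud: 1, 2; m1/Vis: 1; m2/Aud: 1, 2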
.. code-block:: default

    high_mask = df_to_plot['TrialHighPerc'] == 98.0
    low_mask = df_to_plot['TrialHighPerc'] == 2.0

    df_high = df_to_plot[high_mask].copy()
    df_low = df_to_plot[low_mask].copy()

    # reset the index and the cumulative trial number by protocol
    df_high.reset_index(inplace=True)
    df_low.reset_index(inplace=True)
    df_high.drop('index', axis=1, inplace=True)
    df_low.drop('index', axis=1, inplace=True)

    # Restart the count of CumulativeTrialNumber for every protocol
    df_high['CumulativeTrialNumberByProtocol'] = np.nan
    df_low['CumulativeTrialNumberByProtocol'] = np.nan
    for Aid in pd.unique(df_high['AnimalID']):
        for Prot in pd.unique(df_high['Protocol']):
            conditions = np.logical_and(df_high['AnimalID'] == Aid, df_high['Protocol'] == Prot)
            df_high.loc[df_high[conditions].index, 'CumulativeTrialNumberByProtocol'] = \
                np.arange(len(df_high[conditions])) + 1
    for Aid in pd.unique(df_low['AnimalID']):
        for Prot in pd.unique(df_low['Protocol']):
            conditions = np.logical_and(df_low['AnimalID'] == Aid, df_low['Protocol'] == Prot)
            df_low.loc[df_low[conditions].index, 'CumulativeTrialNumberByProtocol'] = \
                np.arange(len(df_low[conditions])) + 1

    # calculate performance
    PAST_WINDOW = 50
    CumPerListHigh = []
    CumPerListLow = []
    for Sid in pd.unique(df_to_plot['SessionID']):
        CumPerListHigh.append(cuf.perf_window_calculator(df_high[df_high['SessionID'] == Sid],
                                                         PAST_WINDOW))
        CumPerListLow.append(cuf.perf_window_calculator(df_low[df_low['SessionID'] == Sid],
                                                        PAST_WINDOW))
    # flatten the list of lists
    df_high['CurrentPastPerformance100biasremoved'] = np.array(list(chain(*CumPerListHigh)))
    df_low['CurrentPastPerformance100biasremoved'] = np.array(list(chain(*CumPerListLow)))

.. GENERATED FROM PYTHON SOURCE LINES 264-265

Plot performance on high and low tones.

.. GENERATED FROM PYTHON SOURCE LINES 265-281
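The running-window performance itself is computed by ``cuf.perf_window_calculator``, whose implementation is not shown here. A minimal sketch of the idea, assuming trial outcomes are coded 0/1 (made-up data, not the library's actual code):

.. code-block:: python

    import pandas as pd

    # 0/1 outcomes for ten trials
    correct = pd.Series([1, 0, 1, 1, 1, 0, 1, 1, 1, 1])
    # performance over the past 4 trials, as a percentage
    past_perf = correct.rolling(window=4, min_periods=1).mean() * 100
    # the last value is the mean of the final four trials: 100.0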
.. code-block:: default

    col_to_plot = 'CurrentPastPerformance100biasremoved'
    figX = make_figures.make_figure_differences_performance_between_groups(df_high,
                                                                           col_to_plot,
                                                                           hue_order,
                                                                           color_palette)
    data_directory = ''
    plt.savefig(data_directory + '6OHDA_Performance_between_groups_high_tones.pdf',
                transparent=True, bbox_inches='tight')
    figX.show()

.. GENERATED FROM PYTHON SOURCE LINES 282-330

.. code-block:: default

    """#### Calculate the significance by resampling: shuffle the group labels
    multiple times and calculate the likelihood of observing these data

    The shuffling respects the proportion of mice in every group.

    References:
    Supplementary figure 4 here: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2562676/
    See also the methods here: https://www.biorxiv.org/content/10.1101/716274v3.full
    """

    # define a 100-trial window to bin the data
    xbin = 100
    df_to_plot["TrialIndexBinned"] = \
        (df_to_plot.CumulativeTrialNumberByProtocol // xbin) * xbin + xbin / 2
    print('Trials are binned in groups of', xbin)

    # group so that each animal has a mean performance in each bin
    df_bintr = df_to_plot.groupby(['AnimalID', 'ExperimentalGroup',
                                   'TrialIndexBinned', 'Protocol']).mean().reset_index()
    # create a scaled version of the performance
    df_bintr['Performance'] = df_bintr.FirstPokeCorrect * 100

    # calculate the differences of the real means using the binned data
    real_data_pd = \
        df_bintr[df_bintr.ExperimentalGroup == hue_order[1]].groupby('TrialIndexBinned').mean()['Performance'] - \
        df_bintr[df_bintr.ExperimentalGroup == hue_order[0]].groupby('TrialIndexBinned').mean()['Performance']
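    # Aside (toy numbers; the actual implementation lives in
    # cuf.get_shuffled_means_difference_df): the resampling idea is to permute
    # the group labels, keeping the group sizes fixed, and recompute the
    # difference of group means each time to build a null distribution.
    example_rng = np.random.default_rng(0)
    example_perf = np.array([72., 80., 75., 78., 60., 65., 58., 63.])
    example_labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    example_null = []
    for _ in range(1000):
        shuffled = example_rng.permutation(example_labels)
        example_null.append(example_perf[shuffled == 1].mean()
                            - example_perf[shuffled == 0].mean())
    # the observed difference (-14.75) lies at the extreme of this null distribution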
    # Select the number of times to shuffle. Originally this is done 10,000 times;
    # use a smaller number to speed things up
    nsh = 1000
    print('Data is shuffled', nsh, 'times')

    # select the relevant columns to get the shuffled data
    df_colsel = df_bintr[['AnimalID', 'ExperimentalGroup', 'TrialIndexBinned', 'Performance']].copy()

    # get the shuffled data
    shrdf = cuf.get_shuffled_means_difference_df(df_colsel, hue_order, nsh)

    # calculate the confidence intervals for the shuffled data
    pos_ci = shrdf.groupby('TrialIndexBinned').quantile(.95)
    neg_ci = shrdf.groupby('TrialIndexBinned').quantile(.05)

    """##### The following cell creates the second part of **Figure 2 B** from the paper

    This shows how likely it is that each time point crosses the 'random' line
    (point-based significance).
    """

    fig2 = make_figures.make_figure_differences_performance_significance(real_data_pd, pos_ci, neg_ci)
    # plt.savefig(data_directory + 'Differences_of_means_significance_by_trial_bins.pdf',
    #             transparent=True, bbox_inches='tight')
    plt.show(fig2)

.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 0 minutes 0.000 seconds)

.. _sphx_glr_download_auto_examples_figure_2_bcde.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: figure_2_bcde.py <figure_2_bcde.py>`

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: figure_2_bcde.ipynb <figure_2_bcde.ipynb>`

.. only:: html

  .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_