Source code for thebeat.stats

# Copyright (C) 2022-2023  Jelle van der Werff
#
# This file is part of thebeat.
#
# thebeat is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# thebeat is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with thebeat.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations
from typing import Optional, Union
import scipy.stats
import scipy.fft
import numpy as np
import thebeat.core
from thebeat.helpers import sequence_to_binary
import matplotlib.pyplot as plt
import scipy.signal
import scipy.stats
from scipy.fft import rfft, rfftfreq
import pandas as pd
import thebeat.helpers
import Levenshtein
import warnings


[docs]def acf_df(sequence: thebeat.core.Sequence,
           resolution,
           smoothing_window: Optional[float] = None,
           smoothing_sd: Optional[float] = None) -> pd.DataFrame:
    """

    Perform autocorrelation analysis on a :py:class:`Sequence` object,
    and return a :class:`Pandas.DataFrame` object containing the results.

    Parameters
    ----------

    sequence
        A :py:class:`~thebeat.core.Sequence` object.
    resolution
        The temporal resolution. If the used Sequence is in seconds, you might want to use 0.001.
        If the Sequence is in milliseconds, try using 1. Incidentally, the number of lags
        for the autocorrelation function is calculated as
        ``n_lags = sequence_duration / resolution``.
    smoothing_window
        The window within which a normal probability density function is used for
        smoothing out the analysis.
    smoothing_sd
        The standard deviation of the normal probability density function used for smoothing out the analysis.

    Returns
    -------
    :class:`pandas.DataFrame`
        DataFrame containing two columns: the timestamps, and the autocorrelation factor.

    Notes
    -----
    This function is based on the procedure described in :cite:t:`ravignaniMeasuringRhythmicComplexity2017`.
    There, one can also find a more detailed description of the smoothing procedure.

    Examples
    --------
    >>> rng = np.random.default_rng(seed=123)  # for reproducability
    >>> seq = thebeat.core.Sequence.generate_random_uniform(n_events=10,a=400,b=600,rng=rng)
    >>> df = acf_df(seq, smoothing_window=50, smoothing_sd=20, resolution=10)
    >>> print(df.head(3))
       timestamp  correlation
    0          0     1.000000
    1         10     0.851373
    2         20     0.590761
    """

    correlations = acf_values(sequence=sequence, resolution=resolution, smoothing_window=smoothing_window,
                              smoothing_sd=smoothing_sd)
    correlations = correlations / np.max(correlations)
    timestamps = np.arange(start=0, stop=len(correlations) * resolution, step=resolution)

    df = pd.DataFrame(
        {
            "timestamp": timestamps,
            "correlation": correlations
        }
    )

    return df


[docs]def acf_plot(sequence: thebeat.core.Sequence,
             resolution,
             max_lag: Optional[float] = None,
             smoothing_window: Optional[float] = None,
             smoothing_sd: Optional[float] = None,
             style: str = 'seaborn-v0_8',
             title: str = 'Autocorrelation',
             x_axis_label: str = 'Lag',
             y_axis_label: str = 'Correlation',
             figsize: Optional[tuple] = None,
             dpi: int = 100,
             ax: Optional[plt.Axes] = None,
             suppress_display: bool = False) -> tuple[plt.Figure, plt.Axes]:
    """
    This function can be used for plotting an autocorrelation plot from a :class:`~thebeat.core.Sequence`.

    Parameters
    ----------
    sequence
        A :py:class:`~thebeat.core.Sequence` object.
    resolution
        The temporal resolution. If the used Sequence is in seconds, you might want to use 0.001.
        If the Sequence is in milliseconds, try using 1. Incidentally, the number of lags
        for the autocorrelation function is calculated as
        ``n_lags = sequence_duration_in_ms / resolution``.
    max_lag
        The maximum lag to be plotted. Defaults to the sequence duration.
    smoothing_window
        The window (in milliseconds) within which a normal probability density function is used for
        smoothing out the analysis.
    smoothing_sd
        The standard deviation of the normal probability density function used for smoothing out the analysis.
    style
        Style used by matplotlib. See `matplotlib style sheets reference
        <https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html>`_.
    title
        If desired, one can provide a title for the plot. This takes precedence over using the
        :class:`~thebeat.core.Sequence` or :class:`~thebeat.core.SoundSequence` ``name`` attribute as the title of the
        plot (if the object has one).
    x_axis_label
        A label for the x axis.
    y_axis_label
        A label for the y axis.
    figsize
        A tuple containing the desired output size of the plot in inches, e.g. ``(4, 1)``.
        This refers to the ``figsize`` parameter in :func:`matplotlib.pyplot.figure`.
    dpi
        The desired output resolution of the plot in dots per inch (DPI). This refers to the ``dpi`` parameter
        in :func:`matplotlib.pyplot.figure`.
    ax
        If desired, one can provide an existing :class:`matplotlib.axes.Axes` object to plot the autocorrelation
        plot on. This is for instance useful if you want to plot multiple autocorrelation plots on the same figure.
    suppress_display
        If ``True``, :func:`matplotlib.pyplot.show` is not run.

    Notes
    -----
    This function is based on the procedure described in :cite:t:`ravignaniMeasuringRhythmicComplexity2017`.
    There, one can also find a more detailed description of the smoothing procedure.

    """

    onsets = sequence.onsets

    correlation = acf_values(sequence=sequence, resolution=resolution, smoothing_window=smoothing_window,
                             smoothing_sd=smoothing_sd)

    x_step = resolution
    max_lag = int(max_lag // resolution) if max_lag else np.floor(np.max(onsets) / resolution).astype(int)

    # plot
    try:
        y = correlation[:max_lag]
        y = y / np.max(y)  # normalize
    except ValueError:
        raise ValueError("We end up with an empty y axis. Try changing the resolution.")

    # Make x axis
    x = np.arange(start=0, stop=len(y) * x_step, step=x_step)

    with plt.style.context(style):
        if ax is None:
            fig, ax = plt.subplots(figsize=figsize, tight_layout=True, dpi=dpi)
        else:
            fig = ax.get_figure()

        ax.set_xlabel(x_axis_label)
        ax.set_ylabel(y_axis_label)
        ax.set_title(title)
        ax.plot(x, y)

    if not suppress_display and ax is not None:
        plt.show()

    return fig, ax


[docs]def acf_values(sequence: thebeat.core.Sequence,
               resolution,
               smoothing_window: Optional[float] = None,
               smoothing_sd: Optional[float] = None) -> np.ndarray:
    """

    Perform autocorrelation. This function takes a :class:`~thebeat.core.Sequence` object, and returns an array with
    steps of ``resolution`` of unstandardized correlation factors.

    Parameters
    ----------
    sequence
        A :class:`~thebeat.core.Sequence` object.
    resolution
        The temporal resolution. If the used Sequence is in seconds, you might want to use 0.001.
        If the Sequence is in milliseconds, try using 1. Incidentally, the number of lags
        for the autocorrelation function is calculated as
        ``n_lags = sequence_duration / resolution``.
    smoothing_window
        The window within which a normal probability density function is used for
        smoothing out the analysis.
    smoothing_sd
        The standard deviation of the normal probability density function used for smoothing out the analysis.

    Notes
    -----
    This function is based on the procedure described in :cite:t:`ravignaniMeasuringRhythmicComplexity2017`. There,
    one can also find a more detailed description of the smoothing procedure.

    This function uses the :func:`numpy.correlate` to calculate the correlations.

    """

    signal = thebeat.helpers.sequence_to_binary(sequence, resolution)

    # npdf
    if smoothing_window and smoothing_sd:
        x = np.arange(start=-smoothing_window / 2, stop=smoothing_window / 2, step=resolution)
        npdf = scipy.stats.norm.pdf(x, 0, smoothing_sd)
        npdf = npdf / np.max(npdf)
        signal_convoluted = np.convolve(signal, npdf)
        signal = signal_convoluted[round(resolution * smoothing_window / 2):]

    try:
        correlation = np.correlate(signal, signal, 'full')
        correlation = correlation[round(len(correlation) / 2) - 1:]
    except ValueError as e:
        raise ValueError("Error! Hint: Most likely your resolution is too large for the chosen smoothing_window"
                         "and smoothing_sd. Try choosing a smaller resolution.") from e

    return correlation


[docs]def ccf_df(test_sequence: thebeat.core.Sequence,
           reference_sequence: thebeat.core.Sequence,
           resolution,
           smoothing_window: Optional[float] = None,
           smoothing_sd: Optional[float] = None) -> pd.DataFrame:
    """

    Perform autocorrelation analysis on a :py:class:`Sequence` object,
    and return a :class:`Pandas.DataFrame` object containing the results.

    Parameters
    ----------

    test_sequence
        The test sequence.
    reference_sequence
        The reference sequence.
    resolution
        The temporal resolution. If the used Sequence is in seconds, you might want to use 0.001.
        If the Sequence is in milliseconds, try using 1. Incidentally, the number of lags
        for the autocorrelation function is calculated as
        ``n_lags = sequence_duration / resolution``.
    smoothing_window
        The window within which a normal probability density function is used for
        smoothing out the analysis.
    smoothing_sd
        The standard deviation of the normal probability density function used for smoothing out the analysis.

    Returns
    -------
    :class:`pandas.DataFrame`
        DataFrame containing two columns: the timestamps, and the cross-correlation factor.

    Notes
    -----
    This function is based on the procedure described in :cite:t:`ravignaniMeasuringRhythmicComplexity2017`.
    There, one can also find a more detailed description of the smoothing procedure.


    """

    correlations = ccf_values(test_sequence=test_sequence, reference_sequence=reference_sequence, resolution=resolution,
                              smoothing_window=smoothing_window, smoothing_sd=smoothing_sd)
    # normalize
    correlations = correlations / np.max(correlations)
    timestamps = np.arange(start=0, stop=len(correlations) * resolution, step=resolution)

    df = pd.DataFrame(
        {
            "timestamp": timestamps,
            "correlation": correlations
        }
    )

    return df


[docs]def ccf_plot(test_sequence: thebeat.core.Sequence,
             reference_sequence: thebeat.core.Sequence,
             resolution,
             smoothing_window: Optional[float] = None,
             smoothing_sd: Optional[float] = None,
             style: str = 'seaborn-v0_8',
             title: str = 'Cross-correlation',
             x_axis_label: str = 'Lag',
             y_axis_label: str = 'Correlation',
             figsize: Optional[tuple] = None,
             dpi: int = 100,
             ax: Optional[plt.Axes] = None,
             suppress_display: bool = False) -> tuple[plt.Figure, plt.Axes]:
    """
    Calculate and plot the cross-correlation function (CCF) between two :class:`~thebeat.core.Sequence` objects.
    The test sequence is compared to the reference sequence.

    Parameters
    ----------
    test_sequence
        The test sequence.
    reference_sequence
        The reference sequence.
    resolution
        The temporal resolution. If the used Sequence is in milliseconds, you probably want 1. If the Sequence is in
        seconds, try using 0.001.
    smoothing_window
        The window within which a normal probability density function is used for
        smoothing out the analysis.
    smoothing_sd
        The standard deviation of the normal probability density function used for smoothing out the analysis.
    style
        The matplotlib style to use. See
        `matplotlib style reference <https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html>`_.
    title
        The title of the plot.
    x_axis_label
        The label of the x axis.
    y_axis_label
        The label of the y axis.
    figsize
        A tuple containing the desired output size of the plot in inches, e.g. ``(4, 1)``.
        This refers to the ``figsize`` parameter in :func:`matplotlib.pyplot.figure`.
    dpi
        The resolution of the plot in dots per inch. This refers to the ``dpi`` parameter in
        :func:`matplotlib.pyplot.figure`.
    ax
        A :class:`matplotlib.axes.Axes` object. If ``None``, a new Figure and Axes is created.
    suppress_display
        If ``True``, the plot is not displayed. This is useful e.g. if you only want to save the plot to a file

    Returns
    -------
    fig
        The :class:`matplotlib.figure.Figure` object.
    ax
        The :class:`matplotlib.axes.Axes` object.

    Notes
    -----
    This function is based on the procedure described in :cite:t:`ravignaniMeasuringRhythmicComplexity2017`. There,
    one can also find a more detailed description of the smoothing procedure.

    """

    # Get correlation factors
    correlation = ccf_values(test_sequence=test_sequence, reference_sequence=reference_sequence,
                             resolution=resolution, smoothing_window=smoothing_window,
                             smoothing_sd=smoothing_sd)

    # Make y axis
    x_step = resolution
    max_lag = np.floor(np.max(np.concatenate([test_sequence.onsets, reference_sequence.onsets])) / resolution).astype(
        int)
    try:
        y = correlation[:max_lag]
        y = y / np.max(y)  # normalize
    except ValueError:
        raise ValueError("We end up with an empty y axis. Try changing the resolution.")

    # Make x axis
    x = np.arange(start=0, stop=len(y) * x_step, step=x_step)

    # Plot
    with plt.style.context(style):
        if ax is None:
            fig, ax = plt.subplots(figsize=figsize, tight_layout=True, dpi=dpi)
        else:
            fig = ax.get_figure()

        ax.set_xlabel(x_axis_label)
        ax.set_ylabel(y_axis_label)
        ax.set_title(title)
        ax.plot(x, y)

    if not suppress_display and ax is not None:
        plt.show()

    return fig, ax


[docs]def ccf_values(test_sequence: thebeat.core.Sequence,
               reference_sequence: thebeat.core.Sequence,
               resolution: float,
               smoothing_window: Optional[float] = None,
               smoothing_sd: Optional[float] = None) -> np.ndarray:
    """
    Returns the unstandardized cross-correlation function (CCF) for two :class:`~thebeat.core.Sequence` objects.
    The test sequence is compared to the reference sequence.

    Parameters
    ----------
    test_sequence
        The test sequence.
    reference_sequence
        The reference sequence.
    resolution
        The temporal resolution. If the used Sequence is in milliseconds, you probably want 1. If the Sequence is in
        seconds, try using 0.001.
    smoothing_window
        The window within which a normal probability density function is used for
        smoothing out the analysis.
    smoothing_sd
        The standard deviation of the normal probability density function used for smoothing out the analysis.

    Returns
    -------
    correlation
        The unstandardized cross-correlation function.

    """

    # Make into 0's and 1's
    test_signal = thebeat.helpers.sequence_to_binary(test_sequence, resolution)
    ref_signal = thebeat.helpers.sequence_to_binary(reference_sequence, resolution)

    # npdf
    if smoothing_window and smoothing_sd:
        x = np.arange(start=-smoothing_window / 2, stop=smoothing_window / 2, step=resolution)
        npdf = scipy.stats.norm.pdf(x, 0, smoothing_sd)
        npdf = npdf / np.max(npdf)
        test_signal_convoluted = np.convolve(test_signal, npdf)
        ref_signal_convoluted = np.convolve(ref_signal, npdf)
        test_signal = test_signal_convoluted[round(resolution * smoothing_window / 2):]
        ref_signal = ref_signal_convoluted[round(resolution * smoothing_window / 2):]

    # Make signals of equal length
    diff = len(ref_signal) - len(test_signal)
    if diff > 0:  # ref_signal is longer
        test_signal = np.concatenate((test_signal, np.zeros(diff)))
    elif diff < 0:  # test_signal is longer
        ref_signal = np.concatenate((ref_signal, np.zeros(-diff)))

    # Calculate cross-correlation
    try:
        correlation = np.correlate(test_signal, ref_signal, 'full')
        correlation = correlation[round(len(correlation) / 2) - 1:]
    except ValueError as e:
        raise ValueError("Error! Hint: Most likely your resolution is too large for the chosen smoothing_window"
                         "and smoothing_sd. Try choosing a smaller resolution.") from e

    return correlation


[docs]def edit_distance_rhythm(test_rhythm: thebeat.music.Rhythm,
                         reference_rhythm: thebeat.music.Rhythm,
                         smallest_note_value: int = 16) -> float:
    """
    Caculates edit/Levenshtein distance between two rhythms. The ``smallest_note_value`` determines
    the underlying grid that is used. If e.g. 16, the underlying grid is composed of 1/16th notes.

    Note
    ----
    Based on the procedure described in :cite:t:`postEditDistanceMeasure2011`.

    Parameters
    ----------
    test_rhythm
        The rhythm to be tested.
    reference_rhythm
        The rhythm to which ``test_rhythm`` will be compared.
    smallest_note_value
        The smallest note value that is used in the underlying grid. 16 means 1/16th notes, 4 means 1/4th notes, etc.

    Examples
    --------
    >>> from thebeat.music import Rhythm
    >>> test_rhythm = Rhythm.from_fractions([1/4, 1/4, 1/4, 1/4])
    >>> reference_rhythm = Rhythm.from_fractions([1/4, 1/8, 1/8, 1/4, 1/4])
    >>> print(edit_distance_rhythm(test_rhythm, reference_rhythm))
    1

    """
    if not isinstance(test_rhythm, thebeat.music.Rhythm) or not isinstance(reference_rhythm, thebeat.music.Rhythm):
        raise TypeError("test_rhythm and reference_rhythm must be of type Rhythm")

    test_string = thebeat.helpers.rhythm_to_binary(rhythm=test_rhythm,
                                                   smallest_note_value=smallest_note_value)
    reference_string = thebeat.helpers.rhythm_to_binary(rhythm=reference_rhythm,
                                                        smallest_note_value=smallest_note_value)

    # calculate edit distance
    edit_distance = Levenshtein.distance(test_string, reference_string)

    return edit_distance


[docs]def edit_distance_sequence(test_sequence: thebeat.core.Sequence,
                           reference_sequence: thebeat.core.Sequence,
                           resolution: int) -> float:
    """
    Calculates the edit/Levenshtein distance between two sequences.

    If Sequences are not quantized to ``resolution``, they will be quantized to that resolution first.

    Note
    ----
    The resolution also represents the underlying grid. If, for example, the resolution is 50, that means that
    a grid will be created with steps of 50. The onsets of the sequence are then placed on the grid for both
    sequences. The resulting sequences consist of ones and zeros, where ones represent the event onsets. This string
    for ``test_sequence`` is compared to the string of the ``reference_sequence``. Note that ``test_sequence`` and
    ``reference_sequence`` can be interchanged without an effect on the results.

    Parameters
    ----------
    test_sequence
        The sequence to be tested.
    reference_sequence
        The sequence to which ``test_sequence`` will be compared.
    resolution
        The resolution to which the sequences will be quantized.
        If the sequences are already quantized to this resolution,
        they will not be quantized again.

    """
    if not isinstance(test_sequence, thebeat.core.Sequence) or not isinstance(reference_sequence, thebeat.core.Sequence):
        raise TypeError("test_sequence and reference_sequence must be of type Sequence")

    # Check whether we need to quantize the sequences
    if np.any(test_sequence.onsets % resolution != 0):
        warnings.warn(f"Test sequence has been quantized to onsets that are multiples of {resolution}.")
        test_sequence.quantize(to=resolution)

    if np.any(reference_sequence.onsets % resolution != 0):
        warnings.warn(f"Reference sequence has been quantized to onsets that are multiples of {resolution}.")
        reference_sequence.quantize(to=resolution)


    test_string = thebeat.helpers.sequence_to_binary(sequence=test_sequence,
                                                     resolution=resolution)
    reference_string = thebeat.helpers.sequence_to_binary(sequence=reference_sequence,
                                                          resolution=resolution)

    # calculate edit distance
    edit_distance = Levenshtein.distance(test_string, reference_string)

    return edit_distance

[docs]def fft_plot(sequence: thebeat.core.Sequence,
             unit_size: float,
             x_min: float = 0,
             x_max: Optional[float] = None,
             style: str = 'seaborn-v0_8',
             title: str = 'Fourier transform',
             x_axis_label: str = 'Cycles per unit',
             y_axis_label: str = 'Absolute power',
             figsize: Optional[tuple] = None,
             dpi: int = 100,
             ax: Optional[plt.Axes] = None,
             suppress_display: bool = False) -> tuple[plt.Figure, plt.Axes]:
    """
    Plots the Fourier transform of a :class:`~thebeat.core.Sequence` object.
    The ``unit_size`` parameter is required, because Sequence objects are agnostic about the used time unit.
    You can use 1000 if the Sequence is in milliseconds, and 1 if the Sequence is in seconds.
    Note that the first frame is discarded since it will always have the highest power, yet is not informative.

    Parameters
    ----------
    sequence
        The sequence.
    unit_size
        The size of the unit in which the sequence is measured. If the sequence is in milliseconds,
        you probably want 1000. If the sequence is in seconds, you probably want 1.
    x_min
        The minimum number of cycles per unit to be plotted.
    x_max
        The maximum number of cycles per unit to be plotted.
    style
        The matplotlib style to use. See
        `matplotlib style reference <https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html>`_.
    title
        The title of the plot.
    x_axis_label
        The label of the x axis.
    y_axis_label
        The label of the y axis.
    figsize
        A tuple containing the desired output size of the plot in inches, e.g. ``(4, 1)``.
        This refers to the ``figsize`` parameter in :func:`matplotlib.pyplot.figure`.
    dpi
        The resolution of the plot in dots per inch.
    ax
        A matplotlib Axes object to plot on. If not provided, a new figure and axes will be created.
    suppress_display
        If True, the plot will not be displayed.

    Returns
    -------
    fig
        The matplotlib Figure object.
    ax
        The matplotlib Axes object.

    Examples
    --------
    >>> from thebeat import Sequence
    >>> from thebeat.stats import fft_plot
    >>> seq = Sequence.generate_random_normal(n_events=100, mu=500, sigma=25)  # milliseconds
    >>> fft_plot(seq, unit_size=1000)
    (<Figure size 800x550 with 1 Axes>, <AxesSubplot: title={'center': 'Fourier transform'}, xlabel='Cycles per unit', ylabel='Absolute power'>)

    >>> seq = Sequence.generate_random_normal(n_events=100, mu=0.5, sigma=0.025)  # seconds
    >>> fft_plot(seq, unit_size=1, x_max=5)
    (<Figure size 800x550 with 1 Axes>, <AxesSubplot: title={'center': 'Fourier transform'}, xlabel='Cycles per unit', ylabel='Absolute power'>)

    """

    # Calculate step size
    step_size = unit_size / 1000

    # Make a sequence of ones and zeroes
    timeseries = sequence_to_binary(sequence, resolution=step_size)
    duration = np.max(sequence.onsets)
    x_length = np.ceil(duration / step_size).astype(int)

    # Do the fft
    yf = rfft(timeseries)[1:]
    xf = rfftfreq(x_length, d=step_size)[1:] * (step_size / 0.001)

    # Calculate reasonable max_freq
    max_freq_index = np.min(np.where(xf > x_max)) if x_max else len(xf) / 10
    yf = yf[:int(max_freq_index)]
    xf = xf[:int(max_freq_index)]

    # Plot
    with plt.style.context(style):
        if ax is None:
            fig, ax = plt.subplots(figsize=figsize, tight_layout=True, dpi=dpi)
            ax_provided = False
        else:
            fig = ax.get_figure()
            ax_provided = True
        ax.plot(xf, np.abs(yf))
        ax.set_xlabel(x_axis_label)
        ax.set_ylabel(y_axis_label)
        ax.set_xlim(x_min, None)
        ax.set_title(title)

    if not suppress_display and ax_provided is False:
        fig.show()

    return fig, ax


[docs]def ks_test(sequence: thebeat.core.Sequence,
            reference_distribution: str = 'normal',
            alternative: str = 'two-sided'):
    """
    This function returns the `D` statistic and `p` value of a one-sample Kolmogorov-Smirnov test.
    It calculates how different the distribution of inter-onset intervals (IOIs) is compared to the provided reference
    distribution.

    If `p` is significant it means that the IOIs are not distributed according to the provided reference distribution.

    Parameters
    ----------
    sequence
        A :class:`~thebeat.core.Sequence` object.
    reference_distribution
        Either 'normal' or 'uniform'. The distribution against which the distribution of inter-onset intervals (IOIs)
        is compared.
    alternative
        Either ‘two-sided’, ‘less’ or ‘greater’. See :func:`scipy.stats.kstest` for more information.

    Returns
    -------
    :class:`scipy.stats._stats_py.KstestResult`
        A SciPy named tuple containing the `D` statistic and the `p` value.

    Notes
    -----
    This function uses :func:`scipy.stats.kstest`. For more information about the use of the
    Kolmogorov-Smirnov test in rhythm research, see :cite:t:`jadoulSeekingTemporalPredictability2016` and
    :cite:t:`ravignaniMeasuringRhythmicComplexity2017`.

    Examples
    --------
    >>> rng = np.random.default_rng(seed=123)
    >>> seq = thebeat.core.Sequence.generate_random_normal(n_events=100,mu=500,sigma=25,rng=rng)
    >>> print(ks_test(seq))
    KstestResult(statistic=0.07176677141846549, pvalue=0.6608009345687911)

    """

    sequence = sequence.iois

    if reference_distribution == 'normal':
        mean = np.mean(sequence)
        sd = np.std(sequence)
        dist = scipy.stats.norm(loc=mean, scale=sd).cdf
        return scipy.stats.kstest(sequence, dist, alternative=alternative)
    elif reference_distribution == 'uniform':
        a = min(sequence)
        b = max(sequence)
        scale = b - a
        dist = scipy.stats.uniform(loc=a, scale=scale).cdf
        return scipy.stats.kstest(sequence, dist, alternative=alternative)
    else:
        raise ValueError("Unknown distribution. Choose 'normal' or 'uniform'.")


[docs]def get_rhythmic_entropy(sequence: Union[thebeat.core.Sequence, thebeat.music.Rhythm],
                         bin_fraction: float = 0.03125):
    """
    Calculate Shannon entropy from bins. This is a measure of rhythmic complexity.
    If many different 'note durations' are present, entropy is high. If only a few are present, entropy is low.
    A sequence that is completely isochronous has a Shannon entropy of 0.

    The bin size is determined from the average inter-onset interval in the
    :py:class:`thebeat.core.Sequence` object (i.e. the tempo) and the ``bin_fraction``.
    The ``bin_fraction`` corresponds to temporal sensitivity. The default is 1/32th of the average IOI.
    This implies that the smallest note value that can be detected is a 1/32th note.

    Parameters
    ----------
    sequence
        The :py:class:`thebeat.core.Sequence` object for which Shannon entropy is calculated.
    bin_fraction
        The fraction of the average inter-onset interval (IOI) that determines the bin size.
        It is multiplied by the average IOI to get the bin size.

    Example
    -------
    A :py:class:`~thebeat.core.Sequence` has an average IOI of 500 ms. With a bin_fraction of 0.03125
    (corresponding to 1/32th note value) the bins will have a size of 15.625 ms.
    The entropy will be calculated from the number of IOIs in each bin.

    References
    ----------
    #todo add reference here for this type of entropy calculation.

    """
    bin_size = np.mean(sequence.iois) * bin_fraction
    bins = np.arange(0, np.max(sequence.iois) + 2 * bin_size, bin_size) - bin_size / 2  # shift bins to center
    bin_counts = np.histogram(sequence.iois, bins=bins)[0]

    return scipy.stats.entropy(bin_counts)


[docs]def get_cov(sequence: thebeat.core.Sequence) -> np.float64:
    """
    Calculate the coefficient of variantion of the inter-onset intervals (IOIS) in a
    :py:class:`thebeat.core.Sequence` object.

    Parameters
    ----------
    sequence
        A :py:class:`thebeat.core.Sequence` object.

    Returns
    -------
    float
        The covariance of the sequence.

    """
    return np.float64(np.std(sequence.iois) / np.mean(sequence.iois))


[docs]def get_npvi(sequence: thebeat.core.Sequence) -> np.float64:
    """

    This function calculates the normalized pairwise variability index (nPVI) for a provided :py:class:`Sequence` or
    :py:class:`SoundSequence` object, or for an interable of inter-onset intervals (IOIs).

    Parameters
    ----------
    sequence
        Either a :py:class:`Sequence` or :py:class:`SoundSequence` object, or an iterable containing inter-onset
        intervals (IOIs).

    Returns
    -------
    :class:`numpy.float64`
        The nPVI for the provided sequence.

    Notes
    -----
    The normalied pairwise variability index (nPVI) is a measure of the variability of adjacent temporal intervals.
    The nPVI is zero for sequences that are perfectly isochronous.
    See :cite:t:`jadoulSeekingTemporalPredictability2016` and :cite:t:`ravignaniMeasuringRhythmicComplexity2017`
    for more information on its use in rhythm research.

    Examples
    --------
    >>> seq = thebeat.core.Sequence.generate_isochronous(n_events=10, ioi=500)
    >>> print(get_npvi(seq))
    0.0

    >>> rng = np.random.default_rng(seed=123)
    >>> seq = thebeat.core.Sequence.generate_random_normal(n_events=10,mu=500,sigma=50,rng=rng)
    >>> print(get_npvi(seq))
    37.6263174529546
    """

    if isinstance(sequence, (thebeat.core.Sequence, thebeat.core.SoundSequence)):
        iois = sequence.iois
    else:
        iois = np.array(sequence)

    npvi_values = []

    for i in range(1, len(iois)):
        diff = iois[i] - iois[i - 1]
        mean = np.mean(iois[i] + iois[i - 1])
        npvi_values.append(np.abs(diff / mean))

    npvi = np.mean(npvi_values) * (100 * (len(iois) - 1))

    return np.float64(npvi)


[docs]def get_ugof_isochronous(test_sequence: thebeat.core.Sequence,
                         reference_ioi: float,
                         output_statistic: str = 'mean') -> np.float64:
    r"""

    This function calculates the universal goodness of fit (``ugof``) measure.
    The ``ugof`` statistic quantifies how well a theoretical sequence describes a sequence at hand
    (the ``test_sequence``). This function can only calculate ``ugof`` using a theoretical sequence that is isochronous.

    The ``reference_ioi`` is the IOI of an isochronous theoretical sequence.

    Parameters
    ----------
    test_sequence
        A :py:class:`~thebeat.core.Sequence` or :py:class:`~thebeat.core.SoundSequence` object that will be compared
        to ``reference_sequence``.
    reference_ioi
        A number (float or int) representing the IOI of an isochronous sequence.
    output_statistic
        Either 'mean' (the default) or 'median'. This determines whether for the individual ugof values we take the mean
        or the median as the output statistic.

    Returns
    -------
    :class:`numpy.float64`
        The ugof statistic.

    Notes
    -----
    This measure is described in :cite:t:`burchardtNovelIdeasFurther2021`.
    Please also refer to `this Github page <https://github.com/LSBurchardt/R_app_rhythm>`_ for an R implementation of
    the *ugof* measure.


    Examples
    --------
    >>> seq = thebeat.core.Sequence.generate_isochronous(n_events=10, ioi=1000)
    >>> ugof = get_ugof_isochronous(seq, reference_ioi=68.21)
    >>> print(ugof)
    0.41817915

    """

    # Input validation and getting onsets for test sequence
    if not isinstance(test_sequence, thebeat.core.sequence.Sequence):
        raise TypeError('test_sequence must be a Sequence object')
    test_onsets = test_sequence.onsets

    # Input validation and getting onsets for reference sequence
    if not isinstance(reference_ioi, (int, float)):
        raise TypeError('reference_sequence must be a number (int or float)')

    reference_onsets = np.arange(start=0,
                                 stop=np.max(test_onsets) + reference_ioi + 1,
                                 step=reference_ioi)

    # For each onset, get the closest theoretical beat and get the absolute difference
    minimal_deviations = np.min(np.abs(test_onsets[:, None] - reference_onsets), axis=1)
    maximal_deviation = reference_ioi / 2

    # calculate ugofs
    ugof_values = minimal_deviations / maximal_deviation

    if output_statistic == 'mean':
        return np.float32(np.mean(ugof_values))
    elif output_statistic == 'median':
        return np.float32(np.median(ugof_values))
    else:
        raise ValueError("The output statistic can only be 'median' or 'mean'.")