# Source code for medkit.text.metrics.irr_utils

"""Metrics to assess inter-annotator agreement"""

__all__ = ["krippendorff_alpha"]
from typing import List, Union
import numpy as np


def _get_values_by_unit_matrix(
    reliability_data: np.ndarray, labels_set: np.ndarray
) -> np.ndarray:
    """
    Return the label counts given the annotators_data.

    Parameters
    ----------
    reliability_data : ndarray, with shape (m_annotators, n_samples)
        numpy array with labels given to `n_samples` by `m_annotators`
        The missing labels are represented with `None`.

    labels_set : ndarray, with shape (n_labels,)
        Possible labels the item can take.

    Returns
    -------
    values_by_unit : ndarray, with shape (n_labels, n_samples)
        Number of annotators that assigned a certain label by annotation.
        Where `n_labels` is the number of possible labels and `n_samples`
        is the number of annotations.
    """
    ann_data_expanded = np.expand_dims(reliability_data, 2)
    return np.sum(ann_data_expanded == labels_set, axis=0).T


def _compute_observed_disagreement(values_by_unit_matrix: np.ndarray) -> float:
    """
    Return the observed disagreement given values-by-unit matrix.

    Parameters
    ----------
    values_by_unit_matrix : ndarray, with shape (n_labels, n_samples)
        Count of annotators that assigned a certain label by annotation.

    Returns
    -------
    do : float
        observed disagreement among labels assigned to annotations
    """
    # select only units with disagreement
    # units with more than two assigned values
    units_to_keep = np.count_nonzero(values_by_unit_matrix, 0) > 1
    matrix_disagreement = values_by_unit_matrix[:, units_to_keep]
    total_by_unit = matrix_disagreement.sum(0)

    do = 0
    for u, unit in enumerate(matrix_disagreement.T):
        unit = unit[unit > 0]
        for n in range(0, len(unit)):
            # only nominal weight is supported in this function
            # perfect agreement seen as 0 disagreement
            p_unit = np.dot(unit[n], unit[n + 1 :]) / (total_by_unit[u] - 1)
            do += np.sum(p_unit)
    return do


def _compute_expected_disagreement(values_by_unit_matrix: np.ndarray) -> float:
    """
    Return the expected disagreement given values-by-unit matrix.

    Parameters
    ----------
    values_by_unit_matrix : ndarray, with shape (n_labels, n_samples)
        Count of annotators that assigned a certain label by annotation.

    Returns
    -------
    de : float
        expected disagreement annotators will have by chance
    """
    # all units with at least 1 value
    paried_units = values_by_unit_matrix.sum(0) > 1
    total_by_value = values_by_unit_matrix[:, paried_units].sum(1)

    de = 0
    # only nominal weight is supported in this function
    for n_c in range(0, len(total_by_value) - 1):
        de += np.sum(np.dot(total_by_value[n_c], total_by_value[n_c + 1 :]))
    return de


def krippendorff_alpha(all_annotators_data: List[List[Union[None, str, int]]]) -> float:
    """
    Compute Krippendorff's alpha: a coefficient of agreement among many
    annotators.

    This coefficient is a generalization of several reliability indices.
    The general form is:

    .. math::
        \\alpha = 1 - \\frac{D_o}{D_e}

    where :math:`D_o` is the observed disagreement among labels assigned to
    units or annotations and :math:`D_e` is the disagreement between annotators
    attributable to chance. The arguments of the disagreement measures are values
    in coincidence matrices.

    This function implements the general computational form proposed in [1]_,
    but only supports binary or nominal labels.

    Parameters
    ----------
    all_annotators_data : array_like, (m_annotators,n_samples)
        Reliability_data, list or array of labels given to `n_samples` by `m_annotators`.
        Missing labels are represented with `None`

    Returns
    -------
    alpha : float
        The alpha coefficient. A value of 1 indicates perfect reliability
        and a value of 0 indicates the absence of reliability (agreement
        no better than chance). Negative values are possible and indicate
        that annotators disagree more than expected by chance.

    Raises
    ------
    AssertionError
        Raise if any list of labels within `all_annotators_data` differs in size,
        if there is only one label to be compared, or if the samples labelled
        by at least two annotators carry fewer than two distinct labels
        (the expected disagreement is then zero and alpha is undefined).

    References
    ----------
    .. [1] K. Krippendorff, “Computing Krippendorff's alpha-reliability,”
            ScholarlyCommons, 25-Jan-2011, pp. 8-10. [Online].
            Available: https://repository.upenn.edu/asc_papers/43/

    Examples
    --------
    Three annotators labelled six items. Some labels are missing.

    >>> annotator_A = ['yes','yes','no','no','yes',None]
    >>> annotator_B = [None,'yes','no','yes','yes','no']
    >>> annotator_C = ['yes','no','no','yes','yes',None]
    >>> krippendorff_alpha([annotator_A,annotator_B,annotator_C])
    0.42222222222222217
    """
    # Raise explicitly instead of using `assert`, so that the validation is
    # not stripped when Python runs with optimisations enabled (-O).
    if not all(len(d) == len(all_annotators_data[0]) for d in all_annotators_data):
        raise AssertionError("Number of labels should be the same for all annotators")

    all_annotators_data = np.asarray(all_annotators_data)
    distinct_labels = set(all_annotators_data.flatten()) - {None}
    if len(distinct_labels) < 2:
        raise AssertionError("There must be more than one label in annotators data")
    # Keep the labels as Python objects: letting numpy infer a dtype would
    # turn mixed str/int labels into strings, which then never compare equal
    # to the integer labels stored in the annotators data.
    labels_set = np.array(list(distinct_labels), dtype=object)

    values_count = _get_values_by_unit_matrix(all_annotators_data, labels_set)
    do = _compute_observed_disagreement(values_count)
    de = _compute_expected_disagreement(values_count)
    if de == 0:
        # Only samples labelled by at least two annotators are comparable;
        # with fewer than two distinct labels among them the ratio is 0/0.
        raise AssertionError(
            "There must be more than one label among the samples labelled"
            " by at least two annotators"
        )

    # Total number of values found in pairable units (units with >= 2 values).
    total_paired_values = np.sum(values_count[:, values_count.sum(0) > 1])

    alpha = 1 - (total_paired_values - 1) * (do / de)
    return float(alpha)