"""Metrics to assess inter-annotator agreement"""
__all__ = ["krippendorff_alpha"]
from typing import List, Union
import numpy as np
def _get_values_by_unit_matrix(
    reliability_data: np.ndarray, labels_set: np.ndarray
) -> np.ndarray:
    """
    Build the values-by-unit matrix: per-sample counts of each label.

    Parameters
    ----------
    reliability_data : ndarray, with shape (m_annotators, n_samples)
        Labels given to `n_samples` by `m_annotators`.
        Missing labels are represented with `None`.
    labels_set : ndarray, with shape (n_labels,)
        The labels an item can take.

    Returns
    -------
    values_by_unit : ndarray, with shape (n_labels, n_samples)
        Entry ``[c, u]`` is the number of annotators that assigned label
        ``labels_set[c]`` to sample ``u``.
    """
    # A trailing singleton axis lets every annotation broadcast against every
    # label, giving a boolean cube of shape (m_annotators, n_samples, n_labels).
    label_hits = reliability_data[:, :, np.newaxis] == labels_set
    # Collapse the annotator axis, then put labels on the rows.
    votes_per_sample = label_hits.sum(axis=0)
    return np.transpose(votes_per_sample)
def _compute_observed_disagreement(values_by_unit_matrix: np.ndarray) -> float:
    """
    Return the observed disagreement given values-by-unit matrix.

    Only the nominal metric is supported: two different labels always count
    as one full disagreement, identical labels as none.

    Parameters
    ----------
    values_by_unit_matrix : ndarray, with shape (n_labels, n_samples)
        Count of annotators that assigned a certain label by annotation.

    Returns
    -------
    do : float
        Observed disagreement among labels assigned to annotations: for each
        unit, the number of mismatching label pairs divided by the number of
        values in the unit minus one, summed over all units. ``0.0`` when no
        unit received two different labels.
    """
    counts = np.asarray(values_by_unit_matrix)
    # Only units that received at least two *distinct* labels can contribute;
    # this also guarantees every kept unit holds >= 2 values, so the division
    # below never hits zero.
    disagreeing = counts[:, np.count_nonzero(counts, axis=0) > 1]
    totals = disagreeing.sum(axis=0)
    # Number of mismatching pairs per unit, computed exactly on integers:
    #   sum_{c<k} n_c * n_k == (m**2 - sum_c n_c**2) / 2
    mismatched_pairs = (totals**2 - (disagreeing**2).sum(axis=0)) // 2
    # float(...) keeps the return type consistent even when nothing disagrees.
    return float(np.sum(mismatched_pairs / (totals - 1)))
def _compute_expected_disagreement(values_by_unit_matrix: np.ndarray) -> float:
    """
    Return the expected disagreement given values-by-unit matrix.

    Only the nominal metric is supported: the result is the sum, over every
    pair of distinct labels, of the product of their total counts.

    Parameters
    ----------
    values_by_unit_matrix : ndarray, with shape (n_labels, n_samples)
        Count of annotators that assigned a certain label by annotation.

    Returns
    -------
    de : float
        Expected disagreement annotators will have by chance.
    """
    # Keep only pairable units, i.e. units that received more than one value.
    is_paired = values_by_unit_matrix.sum(axis=0) > 1
    label_totals = values_by_unit_matrix[:, is_paired].sum(axis=1)

    expected = 0
    # Pair each label total with every total that comes after it, so that each
    # unordered pair of labels is counted exactly once.
    for position, count in enumerate(label_totals[:-1]):
        expected += np.sum(count * label_totals[position + 1 :])
    return expected
def krippendorff_alpha(all_annotators_data: List[List[Union[None, str, int]]]) -> float:
    """
    Compute Krippendorff's alpha: a coefficient of agreement among many
    annotators.

    This coefficient is a generalization of several reliability indices.
    The general form is:

    .. math::
        \\alpha = 1 - \\frac{D_o}{D_e}

    where :math:`D_o` is the observed disagreement among labels assigned to
    units or annotations and :math:`D_e` is the disagreement between annotators
    attributable to chance. The arguments of the disagreement measures are values
    in coincidence matrices.

    This function implements the general computational form proposed in [1]_,
    but only supports binary or nominal labels.

    Parameters
    ----------
    all_annotators_data : array_like, (m_annotators, n_samples)
        Reliability data, list or array of labels given to `n_samples` by
        `m_annotators`. Missing labels are represented with `None`.

    Returns
    -------
    alpha : float
        The alpha coefficient. A value of 1 indicates perfect reliability and
        a value of 0 indicates the absence of reliability (agreement no better
        than chance). Negative values are possible when annotators disagree
        more than chance would predict. ``nan`` is returned when the expected
        disagreement is zero, i.e. when the units labelled by more than one
        annotator contain at most one distinct label.

    Raises
    ------
    AssertionError
        Raise if any list of labels within `all_annotators_data` differs in size or
        if there is only one label to be compared.

    References
    ----------
    .. [1] K. Krippendorff, “Computing Krippendorff's alpha-reliability,”
        ScholarlyCommons, 25-Jan-2011, pp. 8-10. [Online].
        Available: https://repository.upenn.edu/asc_papers/43/

    Examples
    --------
    Three annotators labelled six items. Some labels are missing.

    >>> annotator_A = ['yes','yes','no','no','yes',None]
    >>> annotator_B = [None,'yes','no','yes','yes','no']
    >>> annotator_C = ['yes','no','no','yes','yes',None]
    >>> krippendorff_alpha([annotator_A,annotator_B,annotator_C])
    0.42222222222222217
    """
    # Raise explicitly instead of using `assert`, so the validation is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not all(len(d) == len(all_annotators_data[0]) for d in all_annotators_data):
        raise AssertionError("Number of labels should be the same for all annotators")

    # Object dtype keeps every label as the Python object the caller passed,
    # so mixed str/int labels are not silently coerced to strings.
    reliability_data = np.asarray(all_annotators_data, dtype=object)

    # Distinct labels in first-seen order (deterministic, unlike a set),
    # with the missing-value marker removed.
    labels = [
        label
        for label in dict.fromkeys(reliability_data.ravel().tolist())
        if label is not None
    ]
    if len(labels) <= 1:
        raise AssertionError("There must be more than one label in annotators data")
    labels_set = np.array(labels, dtype=object)

    values_count = _get_values_by_unit_matrix(reliability_data, labels_set)
    do = _compute_observed_disagreement(values_count)
    de = _compute_expected_disagreement(values_count)
    if de == 0:
        # No variation among pairable values: alpha is undefined.
        return float("nan")

    # Total number of values in units labelled by more than one annotator.
    total_paired_values = np.sum(values_count[:, values_count.sum(0) > 1])
    # Plain float keeps the documented return type (and the doctest output)
    # independent of NumPy's scalar repr.
    return float(1 - (total_paired_values - 1) * (do / de))