Source code for medkit.text.metrics.classification

"""
This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[metrics-text-classification]`.
"""

__all__ = ["TextClassificationEvaluator"]
import logging
from typing import Dict, List, Union
from typing_extensions import Literal

from sklearn.metrics import classification_report, cohen_kappa_score

from medkit.core.text import TextDocument
from medkit.text.metrics.irr_utils import krippendorff_alpha

logger = logging.getLogger(__name__)


class TextClassificationEvaluator:
    """An evaluator for attributes of TextDocuments

    Every document is represented by the value of its first attribute having
    the label `attr_label`. The lists of values extracted from several sets of
    documents are then compared position by position, which is why all
    `compute_*` methods expect their document lists to be sorted the same way.
    """

    def __init__(self, attr_label: str):
        """Initialize the text classification evaluator

        Parameters
        ----------
        attr_label:
            Label of the attribute to evaluate.
        """
        self.attr_label = attr_label

    def _extract_attr_values(
        self, docs: List[TextDocument]
    ) -> List[Union[str, int, bool]]:
        """Prepare docs attrs to compute the metric

        Parameters
        ----------
        docs : List[TextDocument]
            List of documents with attributes

        Returns
        -------
        attr_values : List[Union[str,int,bool]]
            List with the representation of the attribute by document
            (one value per document, in the same order as `docs`).

        Raises
        ------
        ValueError
            If a document has no attribute with label `attr_label`, or if the
            value of the attribute is not a str, an int or a bool.
        """
        attr_values = []
        for doc in docs:
            attrs = doc.attrs.get(label=self.attr_label)

            if not attrs:
                raise ValueError(
                    f"No attribute with label {self.attr_label} was found in the"
                    " document"
                )
            if len(attrs) > 1:
                # Only the first attribute is used as the document value
                logger.warning(
                    "Found several attributes with label '%s', ignoring"
                    " all but first",
                    self.attr_label,
                )

            attr_value = attrs[0].value
            if not isinstance(attr_value, (str, int, bool)):
                raise ValueError(
                    "The type of the attr value is not supported by this evaluator."
                    " Only str,int or bool are supported."
                )
            attr_values.append(attr_value)
        return attr_values

    @staticmethod
    def _check_same_length(
        first: list, second: list, first_name: str, second_name: str
    ) -> None:
        """Raise a ValueError if two aligned lists do not have the same length"""
        if len(first) != len(second):
            raise ValueError(
                f"'{first_name}' and '{second_name}' must contain the same number"
                f" of documents, got {len(first)} and {len(second)}"
            )

    def compute_classification_report(
        self,
        true_docs: List[TextDocument],
        predicted_docs: List[TextDocument],
        metrics_by_attr_value: bool = True,
        average: Literal["macro", "weighted"] = "macro",
    ) -> Dict[str, Union[float, int]]:
        """Compute classification metrics of document attributes giving annotated documents.
        This method uses `sklearn.metrics.classification_report` to compute
        precision, recall and F1-score for value of the attribute.

        .. warning::
            The set of true and predicted documents must be sorted to calculate the metric

        Parameters
        ----------
        true_docs:
            Text documents containing attributes of reference
        predicted_docs:
            Text documents containing predicted attributes
        metrics_by_attr_value:
            Whether return metrics by attribute value.
            If False, only global metrics are returned
        average:
            Type of average to be performed in metrics.
            - `macro`, unweighted mean (default)
            - `weighted`, weighted average by support (number of true instances by attr value)

        Returns
        -------
        Dict[str,Union[float,int]]:
            A dictionary with the computed metrics. Global metrics are stored
            under `{average}_{metric}`, `support` and `accuracy`; metrics by
            attribute value (if requested) under `{value}_{metric}`.

        Raises
        ------
        ValueError
            If `average` is not supported, if the attribute values cannot be
            extracted, or if both lists do not have the same number of documents.
        """
        # Fail early with a clear message rather than with a KeyError on the
        # report once all the metrics have been computed
        if average not in ("macro", "weighted"):
            raise ValueError(
                f"Unsupported average '{average}', expected 'macro' or 'weighted'"
            )

        true_tags = self._extract_attr_values(true_docs)
        pred_tags = self._extract_attr_values(predicted_docs)
        self._check_same_length(true_tags, pred_tags, "true_docs", "predicted_docs")

        report = classification_report(
            y_true=true_tags,
            y_pred=pred_tags,
            output_dict=True,
            zero_division=0,
        )

        # Global metrics: prefix with the average name, except for the support
        scores = {
            f"{average}_{key}": value
            for key, value in report[f"{average} avg"].items()
        }
        scores["support"] = scores.pop(f"{average}_support")
        scores["accuracy"] = report.pop("accuracy")

        if metrics_by_attr_value:
            # Skip only the averaged entries of the report; testing the
            # suffix "avg" would also drop attribute values ending with "avg"
            average_keys = {"micro avg", "macro avg", "weighted avg", "samples avg"}
            for value_key, value_metrics in report.items():
                if value_key in average_keys:
                    continue
                for metric_key, metric_value in value_metrics.items():
                    scores[f"{value_key}_{metric_key}"] = metric_value

        return scores

    def compute_cohen_kappa(
        self, docs_annotator_1: List[TextDocument], docs_annotator_2: List[TextDocument]
    ) -> Dict[str, Union[float, int]]:
        """Compute the cohen's kappa score, an inter-rater agreement score between two annotators.
        This method uses 'sklearn' as backend to compute the level of agreement.

        .. warning::
            The set of documents must be sorted to calculate the metric

        Parameters
        ----------
        docs_annotator_1:
            Text documents containing attributes annotated by the first annotator
        docs_annotator_2:
            Text documents to compare, these documents contain attributes
            annotated by the other annotator

        Returns
        -------
        Dict[str, Union[float, int]]:
            A dictionary with cohen's kappa score and support (number of annotated docs).
            The value is a number between -1 and 1, where 1 indicates perfect agreement; zero
            or lower indicates chance agreement.

        Raises
        ------
        ValueError
            If the attribute values cannot be extracted, or if both annotators
            did not annotate the same number of documents.
        """
        ann1_tags = self._extract_attr_values(docs_annotator_1)
        ann2_tags = self._extract_attr_values(docs_annotator_2)
        self._check_same_length(
            ann1_tags, ann2_tags, "docs_annotator_1", "docs_annotator_2"
        )

        scores = {
            "cohen_kappa": cohen_kappa_score(y1=ann1_tags, y2=ann2_tags),
            "support": len(ann1_tags),
        }
        return scores

    def compute_krippendorff_alpha(
        self, docs_annotators: List[List[TextDocument]]
    ) -> Dict[str, Union[float, int]]:
        """Compute the Krippendorff alpha score, an inter-rater agreement score between
        multiple annotators.

        .. warning::
            Documents must be sorted to calculate the metric.

        .. note::
            See :mod:`medkit.text.metrics.irr_utils.krippendorff_alpha` for more information about the score

        Parameters
        ----------
        docs_annotators:
            A list of list of Text documents containing attributes.
            The size of the list is the number of annotators to compare.

        Returns
        -------
        Dict[str, Union[float,int]]:
            A dictionary with the krippendorff alpha score, number of annotators and support (number of documents).
            A value of 1 indicates perfect reliability between annotators; zero or lower indicates
            absence of reliability.

        Raises
        ------
        ValueError
            If less than two lists of documents are provided, if the attribute
            values cannot be extracted, or if the annotators did not annotate
            the same number of documents.
        """
        if len(docs_annotators) < 2 or not isinstance(docs_annotators[0], list):
            raise ValueError(
                "'docs_annotators' should contain at least two list of TextDocuments to"
                " compare"
            )

        all_annotators_data = [
            self._extract_attr_values(docs) for docs in docs_annotators
        ]

        # The support is read from the first annotator only, so all annotators
        # must provide the same number of documents.
        # NOTE(review): krippendorff_alpha presumably expects one row of equal
        # length per annotator; confirm against irr_utils.
        if len({len(annotator_tags) for annotator_tags in all_annotators_data}) > 1:
            raise ValueError(
                "All the lists in 'docs_annotators' must contain the same number of"
                " documents"
            )

        scores = {
            "krippendorff_alpha": krippendorff_alpha(all_annotators_data),
            "nb_annotators": len(all_annotators_data),
            "support": len(all_annotators_data[0]),
        }
        return scores