# Source code for medkit.text.metrics.ner

"""
This package needs extra dependencies that are not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[metrics-ner]`.
"""

__all__ = ["SeqEvalEvaluator", "SeqEvalMetricsComputer"]

from typing import Any, Dict, List, Optional, Union
from typing_extensions import Literal

from seqeval.metrics import accuracy_score, classification_report
from seqeval.scheme import BILOU, IOB2

from medkit.core.text import TextDocument, Entity, span_utils
from medkit.text.ner import hf_tokenization_utils
from medkit.training.utils import BatchData


def _compute_seqeval_from_dict(
    y_true_all: List[List[str]],
    y_pred_all: List[List[str]],
    tagging_scheme: Literal["bilou", "iob2"],
    return_metrics_by_label: bool,
    average: Literal["macro", "weighted"],
) -> Dict[str, Union[float, int]]:
    """Compute seqeval metrics from already-tagged sequences.

    Parameters
    ----------
    y_true_all:
        Reference tags, one list of tags per sequence
    y_pred_all:
        Predicted tags, one list of tags per sequence
    tagging_scheme:
        Scheme of the tags. `bilou` selects the BILOU scheme, any other
        value selects IOB2
    return_metrics_by_label:
        If `True`, every entry of the seqeval report that is not an
        average is flattened into the output as `{entry}_{metric}`
    average:
        Name of the averaged entry of the report to expose
        (read from the `"{average} avg"` key)

    Returns
    -------
    Dict[str, Union[float, int]]:
        Flat dictionary with the averaged metrics prefixed by `{average}_`,
        plus `support` and `accuracy`, and optionally the metrics by label.
    """
    # internal configuration for seqeval
    # 'bilou' only works with 'strict' mode
    if tagging_scheme == "bilou":
        scheme, mode = BILOU, "strict"
    else:
        scheme, mode = IOB2, None

    # precision, recall, F1 score and support for each class and each average
    report = classification_report(
        y_true=y_true_all,
        y_pred=y_pred_all,
        scheme=scheme,
        output_dict=True,
        zero_division=0,
        mode=mode,
    )

    # averaged metrics come first, prefixed with the kind of average
    scores = {}
    for metric_name, metric_value in report[f"{average} avg"].items():
        scores[f"{average}_{metric_name}"] = metric_value
    # the support is exposed without the average prefix
    scores["support"] = scores.pop(f"{average}_support")
    scores["accuracy"] = accuracy_score(y_true=y_true_all, y_pred=y_pred_all)

    if not return_metrics_by_label:
        return scores

    for entry_name, entry_metrics in report.items():
        # entries ending with 'avg' are averages, not labels
        if entry_name.endswith("avg"):
            continue
        for metric_name, metric_value in entry_metrics.items():
            scores[f"{entry_name}_{metric_name}"] = metric_value

    return scores


class SeqEvalEvaluator:
    """Evaluator to compute the performance of labeling tasks such as
    named entity recognition. This evaluator compares TextDocuments of reference
    with their predicted annotations and returns a dictionary of metrics.

    The evaluator converts the set of entities and documents to tags before computing the metrics.
    It supports two schemes, IOB2 (a BIO scheme) and BILOU. The IOB2 scheme tags the Beginning,
    the Inside and the Outside text of an entity. The BILOU scheme tags the Beginning,
    the Inside and the Last tokens of multi-token entity as well as Unit-length entity.

    For more information about IOB schemes, refer to the `Wikipedia page <https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>`_

    .. hint::
        If **tokenizer** is not defined, the evaluator tokenizes the text by character.
        This may generate a lot of tokens with large documents and may affect execution time.
        You can use a fast tokenizer from HuggingFace, i.e. : bert tokenizer

        >>> from transformers import AutoTokenizer
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    """

    def __init__(
        self,
        tagging_scheme: Literal["bilou", "iob2"] = "bilou",
        return_metrics_by_label: bool = True,
        average: Literal["macro", "weighted"] = "macro",
        tokenizer: Optional[Any] = None,
        labels_remapping: Optional[Dict[str, str]] = None,
    ):
        """
        Parameters
        ----------
        tagging_scheme:
            Scheme for tagging the tokens, it can be `bilou` or `iob2`
        return_metrics_by_label:
            If `True`, return the metrics by label in the output dictionary.
            If `False`, only global metrics are returned
        average:
            Type of average to be performed in metrics.
            - `macro`, unweighted mean (default)
            - `weighted`, weighted average by support (number of true instances by label)
        tokenizer:
            Optional Fast Tokenizer to convert text into tokens.
            If not provided, the text is tokenized by character.
        labels_remapping:
            Optional remapping of labels, useful when there is a mismatch
            between the predicted labels and the reference labels to evaluate
            against. If a label (of a reference or predicted entity) is found in
            this dict, the corresponding value will be used as label instead.
        """
        self.tokenizer = tokenizer
        self.tagging_scheme = tagging_scheme
        self.return_metrics_by_label = return_metrics_by_label
        self.average = average
        self.labels_remapping = labels_remapping

    def compute(
        self, documents: List[TextDocument], predicted_entities: List[List[Entity]]
    ) -> Dict[str, float]:
        """Compute metrics of entity matching giving predictions.

        Parameters
        ----------
        documents:
            Text documents containing entities of reference
        predicted_entities:
            List of predicted entities by document

        Returns
        -------
        Dict[str, float]:
            A dictionary with average and per type metrics if required. The metrics included are:
            accuracy, precision, recall and F1 score.

        Raises
        ------
        ValueError
            If `documents` and `predicted_entities` do not have the same length
        """
        # zip() would silently drop the extra items and the metrics would be
        # computed on a subset of the data, so mismatches are rejected upfront
        if len(documents) != len(predicted_entities):
            raise ValueError(
                "documents and predicted_entities must have the same length, got"
                f" {len(documents)} and {len(predicted_entities)}"
            )

        true_tags_all, pred_tags_all = [], []

        for document, pred_entities in zip(documents, predicted_entities):
            text = document.text
            true_entities = document.anns.entities

            # reference and prediction are tagged over the same text so that
            # both tag sequences are aligned
            true_tags_all.append(
                self._tag_text_with_entities(text=text, entities=true_entities)
            )
            pred_tags_all.append(
                self._tag_text_with_entities(text=text, entities=pred_entities)
            )

        return _compute_seqeval_from_dict(
            y_true_all=true_tags_all,
            y_pred_all=pred_tags_all,
            tagging_scheme=self.tagging_scheme,
            return_metrics_by_label=self.return_metrics_by_label,
            average=self.average,
        )

    def _remap_label(self, label: str) -> str:
        """Return the remapped label, or `label` itself if it has no remapping"""
        if not self.labels_remapping:
            return label
        return self.labels_remapping.get(label, label)

    def _remap_tags(self, tags: List[str]) -> List[str]:
        """Apply `labels_remapping` to the label part of each tag"""
        if not self.labels_remapping:
            return tags

        remapped_tags = []
        for tag in tags:
            # assumes tags follow the `<prefix>-<label>` form that seqeval
            # parses (e.g. `B-person`); tags without `-` (e.g. `O`) carry no
            # label and are kept as they are
            prefix, sep, label = tag.partition("-")
            if sep:
                tag = f"{prefix}{sep}{self._remap_label(label)}"
            remapped_tags.append(tag)
        return remapped_tags

    def _tag_text_with_entities(self, text: str, entities: List[Entity]):
        """Return the list of tags of `text` given `entities`: one tag per
        token if a tokenizer was provided, one tag per character otherwise"""
        if self.tokenizer is not None:
            # tags tokenized text, creates one tag per token
            text_encoding = self.tokenizer(text).encodings[0]
            tags = hf_tokenization_utils.transform_entities_to_tags(
                text_encoding=text_encoding,
                entities=entities,
                tagging_scheme=self.tagging_scheme,
            )
            # the helper is not aware of labels_remapping, so the remapping is
            # applied on the generated tags to behave like the character branch
            return self._remap_tags(tags)

        # tags untokenized text, create one tag per character
        tags = ["O"] * len(text)
        for ent in entities:
            label = self._remap_label(ent.label)
            ent_spans = span_utils.normalize_spans(ent.spans)
            # skip if all spans were ModifiedSpans and we are
            # not able to refer back to text
            if not ent_spans:
                continue

            # the entity is tagged from the start of its first span
            # to the end of its last span
            start_char = ent_spans[0].start
            end_char = ent_spans[-1].end
            chars_entity = range(start_char, end_char)

            if not chars_entity:
                continue

            entity_tags = hf_tokenization_utils.create_entity_tags(
                nb_tags=len(chars_entity),
                label=label,
                tagging_scheme=self.tagging_scheme,
            )
            for char_idx, tag in zip(chars_entity, entity_tags):
                tags[char_idx] = tag

        return tags


class SeqEvalMetricsComputer:
    """An implementation of :class:`~medkit.training.MetricsComputer` using seqeval
    to compute metrics in the training of named-entity recognition components.

    The metrics computer can be used with a :class:`~medkit.training.Trainer`
    """

    def __init__(
        self,
        id_to_label: Dict[int, str],
        tagging_scheme: Literal["bilou", "iob2"] = "bilou",
        return_metrics_by_label: bool = True,
        average: Literal["macro", "weighted"] = "macro",
    ):
        """
        Parameters
        ----------
        id_to_label:
            Mapping integer value to label, it should be the same used in preprocess
        tagging_scheme:
            Scheme used for tagging the tokens, it can be `bilou` or `iob2`
        return_metrics_by_label:
            If `True`, return the metrics by label in the output dictionary.
            If `False`, only return average metrics
        average:
            Type of average to be performed in metrics.
            - `macro`, unweighted mean (default)
            - `weighted`, weighted average by support (number of true instances by label)
        """
        self.id_to_label = id_to_label
        self.tagging_scheme = tagging_scheme
        self.return_metrics_by_label = return_metrics_by_label
        self.average = average

    def prepare_batch(
        self, model_output: BatchData, input_batch: BatchData
    ) -> Dict[str, List[List[str]]]:
        """Prepare a batch of tensors to compute the metric

        Parameters
        ----------
        model_output:
            A batch data including the `logits` predicted by the model
        input_batch:
            A batch data including the `labels` of reference

        Returns
        -------
        Dict[str, List[List[str]]]
            A dictionary with the true and predicted tags representation of a batch data
        """
        # the predicted id is the index of the highest logit on the last dimension
        predictions_ids = (
            model_output["logits"].argmax(dim=-1).detach().to("cpu").numpy()
        )
        references_ids = input_batch["labels"].detach().to("cpu").numpy()
        # ignore special tokens: positions are selected from the reference ids
        # and the same positions are dropped from the predictions
        mask_special_tokens = references_ids != hf_tokenization_utils.SPECIAL_TAG_ID_HF

        def ids_to_tags(batch_ids):
            # one list of tags per item of the batch, restricted to the kept positions
            return [
                [self.id_to_label[tag_id] for tag_id in item_ids[item_mask]]
                for item_ids, item_mask in zip(batch_ids, mask_special_tokens)
            ]

        batch_true_tags = ids_to_tags(references_ids)
        batch_pred_tags = ids_to_tags(predictions_ids)

        return {"y_true": batch_true_tags, "y_pred": batch_pred_tags}

    def compute(self, all_data: Dict[str, List[Any]]) -> Dict[str, float]:
        """Compute metrics using the tag representation collected by batches
        during the training/evaluation loop.

        Parameters
        ----------
        all_data:
            A dictionary with the true and predicted tags collected by batches

        Returns
        -------
        Dict[str, float]:
            A dictionary with average and per label metrics if required. The metrics
            included are : accuracy, precision, recall and F1 score.

        """
        # missing keys are treated as empty collections of tags
        return _compute_seqeval_from_dict(
            y_true_all=all_data.get("y_true", []),
            y_pred_all=all_data.get("y_pred", []),
            tagging_scheme=self.tagging_scheme,
            return_metrics_by_label=self.return_metrics_by_label,
            average=self.average,
        )