# Source code for medkit.text.metrics.ner

"""
This package needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[metrics-ner]`.
"""

__all__ = ["SeqEvalEvaluator", "SeqEvalMetricsComputer"]

from typing import Any, Dict, List, Optional, Union
from typing_extensions import Literal

from seqeval.metrics import accuracy_score, classification_report
from seqeval.scheme import BILOU, IOB2

from medkit.core.text import TextDocument, Entity, span_utils
from medkit.text.ner import hf_tokenization_utils
from medkit.training.utils import BatchData


def _compute_seqeval_from_dict(
    y_true_all: List[List[str]],
    y_pred_all: List[List[str]],
    tagging_scheme: Literal["bilou", "iob2"],
    return_metrics_by_label: bool,
    average: Literal["macro", "weighted"],
) -> Dict[str, Union[float, int]]:
    """Build a flat dictionary of seqeval scores from already tagged sequences.

    Parameters
    ----------
    y_true_all:
        Reference tags, one list of tags per sequence
    y_pred_all:
        Predicted tags, one list of tags per sequence
    tagging_scheme:
        Scheme used by the tags. `bilou` is evaluated with the BILOU scheme
        in `strict` mode; any other value is evaluated with the IOB2 scheme
        and no explicit mode
    return_metrics_by_label:
        If `True`, the entries of the seqeval report that are not averages
        are added to the output as `{entry}_{metric}` keys
    average:
        Name of the average row of the seqeval report to expose
        (`{average} avg`)

    Returns
    -------
    Dict[str, Union[float, int]]:
        The metrics of the selected average prefixed with `{average}_`,
        plus `support` and `accuracy`, and optionally the metrics by label.
    """

    # seqeval configuration: 'bilou' only works with 'strict' mode
    if tagging_scheme == "bilou":
        scheme, mode = BILOU, "strict"
    else:
        scheme, mode = IOB2, None

    # per-class report (precision, recall, F1 score) as a nested dict
    report = classification_report(
        y_true=y_true_all,
        y_pred=y_pred_all,
        scheme=scheme,
        output_dict=True,
        zero_division=0,
        mode=mode,
    )

    # expose the requested average, prefixing every metric with its name
    prefix = f"{average}_"
    scores = {}
    for metric_name, metric_value in report[f"{average} avg"].items():
        scores[prefix + metric_name] = metric_value

    # 'support' is reported without the average prefix
    scores["support"] = scores.pop(prefix + "support")
    scores["accuracy"] = accuracy_score(y_true=y_true_all, y_pred=y_pred_all)

    if not return_metrics_by_label:
        return scores

    # every report entry that is not an average row is a label
    for entry_name, entry_metrics in report.items():
        if entry_name.endswith("avg"):
            continue
        scores.update(
            {
                f"{entry_name}_{metric_name}": metric_value
                for metric_name, metric_value in entry_metrics.items()
            }
        )

    return scores


class SeqEvalEvaluator:
    """Evaluator to compute the performance of labeling tasks such as named entity
    recognition. This evaluator compares TextDocuments of reference with their
    predicted annotations and returns a dictionary of metrics.

    The evaluator converts the set of entities and documents to tags before
    computing the metrics. It supports two schemes, IOB2 (a BIO scheme) and BILOU.
    The IOB2 scheme tags the Beginning, the Inside and the Outside text of an
    entity. The BILOU scheme tags the Beginning, the Inside and the Last tokens of
    a multi-token entity as well as Unit-length entities.

    For more information about IOB schemes, refer to the `Wikipedia page
    <https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>`_

    .. hint::
        If **tokenizer** is not defined, the evaluator tokenizes the text by
        character. This may generate a lot of tokens with large documents and may
        affect execution time. You can use a fast tokenizer from HuggingFace,
        i.e. : bert tokenizer

        >>> from transformers import AutoTokenizer
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    """

    def __init__(
        self,
        tagging_scheme: Literal["bilou", "iob2"] = "bilou",
        return_metrics_by_label: bool = True,
        average: Literal["macro", "weighted"] = "macro",
        tokenizer: Optional[Any] = None,
        labels_remapping: Optional[Dict[str, str]] = None,
    ):
        """
        Parameters
        ----------
        tagging_scheme:
            Scheme for tagging the tokens, it can be `bilou` or `iob2`
        return_metrics_by_label:
            If `True`, return the metrics by label in the output dictionary.
            If `False`, only global metrics are returned
        average:
            Type of average to be performed in metrics.
            - `macro`, unweighted mean (default)
            - `weighted`, weighted average by support (number of true instances by label)
        tokenizer:
            Optional Fast Tokenizer to convert text into tokens.
            If not provided, the text is tokenized by character.
        labels_remapping:
            Optional remapping of labels, useful when there is a mismatch
            between the predicted labels and the reference labels to evaluate
            against. If a label (of a reference or predicted entity) is found in
            this dict, the corresponding value will be used as label instead.
        """
        self.tokenizer = tokenizer
        self.tagging_scheme = tagging_scheme
        self.return_metrics_by_label = return_metrics_by_label
        self.average = average
        self.labels_remapping = labels_remapping

    def compute(
        self, documents: List[TextDocument], predicted_entities: List[List[Entity]]
    ) -> Dict[str, float]:
        """Compute metrics of entity matching giving predictions.

        Parameters
        ----------
        documents:
            Text documents containing entities of reference
        predicted_entities:
            List of predicted entities by document

        Returns
        -------
        Dict[str, float]:
            A dictionary with average and per type metrics if required. The metrics
            included are: accuracy, precision, recall and F1 score.
        """
        true_tags_all, pred_tags_all = [], []

        # reference and predicted entities are tagged against the same text so
        # that both tag sequences of a document have the same length
        for document, pred_entities in zip(documents, predicted_entities):
            text = document.text
            true_entities = document.anns.entities
            true_tags_all.append(
                self._tag_text_with_entities(text=text, entities=true_entities)
            )
            pred_tags_all.append(
                self._tag_text_with_entities(text=text, entities=pred_entities)
            )

        scores = _compute_seqeval_from_dict(
            y_true_all=true_tags_all,
            y_pred_all=pred_tags_all,
            tagging_scheme=self.tagging_scheme,
            return_metrics_by_label=self.return_metrics_by_label,
            average=self.average,
        )
        return scores

    def _tag_text_with_entities(self, text: str, entities: List[Entity]) -> List[str]:
        """Convert the entities of a text into a list of tags.

        One tag is created per token when a tokenizer is set, otherwise one
        tag per character. `labels_remapping`, if any, is applied in both cases.
        """
        if self.tokenizer is not None:
            # tags tokenized text, creates one tag per token
            text_encoding = self.tokenizer(text).encodings[0]
            tags = hf_tokenization_utils.transform_entities_to_tags(
                text_encoding=text_encoding,
                entities=entities,
                tagging_scheme=self.tagging_scheme,
            )
            # the helper tags with the raw entity labels, so the remapping
            # has to be applied afterwards to match the character-based path
            return self._remap_tags(tags)

        # tags untokenized text, create one tag per character
        tags = ["O"] * len(text)
        for ent in entities:
            label = ent.label
            if self.labels_remapping:
                label = self.labels_remapping.get(label, label)

            ent_spans = span_utils.normalize_spans(ent.spans)
            # skip if all spans were ModifiedSpans and we are
            # not able to refer back to text
            if not ent_spans:
                continue

            # the entity covers every character from the start of its first
            # normalized span to the end of its last one
            start_char = ent_spans[0].start
            end_char = ent_spans[-1].end
            chars_entity = range(start_char, end_char)
            if not chars_entity:
                continue

            entity_tags = hf_tokenization_utils.create_entity_tags(
                nb_tags=len(chars_entity),
                label=label,
                tagging_scheme=self.tagging_scheme,
            )
            for token_idx, tag in zip(chars_entity, entity_tags):
                tags[token_idx] = tag
        return tags

    def _remap_tags(self, tags: List[str]) -> List[str]:
        """Apply `labels_remapping` to the label part of each tag.

        Assumes tags follow the `<prefix>-<label>` form expected by seqeval
        (e.g. `B-person`); tags without a `-` separator, such as `O`, are
        returned unchanged.
        """
        if not self.labels_remapping:
            return tags

        remapped_tags = []
        for tag in tags:
            # split on the first '-' only, labels may themselves contain '-'
            prefix, sep, label = tag.partition("-")
            if sep:
                label = self.labels_remapping.get(label, label)
                tag = f"{prefix}{sep}{label}"
            remapped_tags.append(tag)
        return remapped_tags


class SeqEvalMetricsComputer:
    """An implementation of :class:`~medkit.training.MetricsComputer` using seqeval
    to compute metrics in the training of named-entity recognition components.

    The metrics computer can be used with a :class:`~medkit.training.Trainer`
    """

    def __init__(
        self,
        id_to_label: Dict[int, str],
        tagging_scheme: Literal["bilou", "iob2"] = "bilou",
        return_metrics_by_label: bool = True,
        average: Literal["macro", "weighted"] = "macro",
    ):
        """
        Parameters
        ----------
        id_to_label:
            Mapping integer value to label, it should be the same used in preprocess
        tagging_scheme:
            Scheme used for tagging the tokens, it can be `bilou` or `iob2`
        return_metrics_by_label:
            If `True`, return the metrics by label in the output dictionary.
            If `False`, only return average metrics
        average:
            Type of average to be performed in metrics.
            - `macro`, unweighted mean (default)
            - `weighted`, weighted average by support (number of true instances by attr value)
        """
        self.id_to_label = id_to_label
        self.tagging_scheme = tagging_scheme
        self.return_metrics_by_label = return_metrics_by_label
        self.average = average

    def prepare_batch(
        self, model_output: BatchData, input_batch: BatchData
    ) -> Dict[str, List[List[str]]]:
        """Prepare a batch of tensors to compute the metric

        Parameters
        ----------
        model_output:
            A batch data including the `logits` predicted by the model
        input_batch:
            A batch data including the `labels` of reference

        Returns
        -------
        Dict[str, List[List[str]]]
            A dictionary with the true and predicted tags representation of a batch data
        """
        # predicted tag id = index of the highest logit along the last dimension
        predictions_ids = (
            model_output["logits"].argmax(dim=-1).detach().to("cpu").numpy()
        )
        references_ids = input_batch["labels"].detach().to("cpu").numpy()

        # ignore special tokens: the mask is built from the references and
        # applied to both references and predictions to keep them aligned
        mask_special_tokens = (
            references_ids != hf_tokenization_utils.SPECIAL_TAG_ID_HF
        )

        batch_true_tags = [
            [self.id_to_label[tag] for tag in ref[mask]]
            for ref, mask in zip(references_ids, mask_special_tokens)
        ]
        batch_pred_tags = [
            [self.id_to_label[tag] for tag in pred[mask]]
            for pred, mask in zip(predictions_ids, mask_special_tokens)
        ]

        return {"y_true": batch_true_tags, "y_pred": batch_pred_tags}

    def compute(self, all_data: Dict[str, List[Any]]) -> Dict[str, float]:
        """Compute metrics using the tag representation collected by batches
        during the training/evaluation loop.

        Parameters
        ----------
        all_data:
            A dictionary with the true and predicted tags collected by batches

        Returns
        -------
        Dict[str, float]:
            A dictionary with average and per label metrics if required. The metrics
            included are : accuracy, precision, recall and F1 score.
        """
        # extract and format data from all_data
        y_true_all = all_data.get("y_true", [])
        y_pred_all = all_data.get("y_pred", [])

        scores = _compute_seqeval_from_dict(
            y_pred_all=y_pred_all,
            y_true_all=y_true_all,
            tagging_scheme=self.tagging_scheme,
            return_metrics_by_label=self.return_metrics_by_label,
            average=self.average,
        )
        return scores