# Source code for medkit.text.metrics.ner

__all__ = ["SeqEvalEvaluator", "SeqEvalMetricsComputer"]

from typing import Any, Dict, List, Optional
from typing_extensions import Literal

from seqeval.metrics import accuracy_score, classification_report
from seqeval.scheme import BILOU, IOB2

from medkit.core.text import TextDocument, Entity, span_utils
from medkit.text.ner import hf_tokenization_utils
from medkit.training.utils import BatchData


def _compute_seqeval_from_dict(
    y_true_all: List[List[str]],
    y_pred_all: List[List[str]],
    tagging_scheme: Literal["bilou", "iob2"],
    return_metrics_by_label: bool,
) -> Dict[str, float]:
    """Compute seqeval metrics using preprocessed data.

    Parameters
    ----------
    y_true_all:
        Reference tags, one list of tags per sequence
    y_pred_all:
        Predicted tags, one list of tags per sequence
    tagging_scheme:
        Scheme used to tag the sequences, it can be `bilou` or `iob2`
    return_metrics_by_label:
        If `True`, the metrics of each label are added to the output
        dictionary. If `False`, only overall metrics are returned

    Returns
    -------
    Dict[str, float]:
        The entries of the "micro avg" row of the seqeval report, prefixed
        with `overall_`, plus `overall_acc`. When `return_metrics_by_label`
        is true, also one `{label}_{metric}` entry per label and metric.
    """

    # internal configuration for seqeval
    # 'bilou' only works with 'strict' mode
    is_bilou = tagging_scheme == "bilou"
    scheme = BILOU if is_bilou else IOB2
    mode = "strict" if is_bilou else None

    # returns precision, recall, F1 score for each class.
    report = classification_report(
        y_true=y_true_all,
        y_pred=y_pred_all,
        scheme=scheme,
        output_dict=True,
        zero_division=0,
        mode=mode,
    )
    # add overall_metrics
    scores = {f"overall_{key}": value for key, value in report["micro avg"].items()}
    scores["overall_acc"] = accuracy_score(y_pred=y_pred_all, y_true=y_true_all)

    if return_metrics_by_label:
        # Rows of the report that hold averages instead of a label. They are
        # matched by exact name: filtering on the "avg" suffix would also
        # discard any entity label whose name happens to end with "avg".
        average_rows = {"micro avg", "macro avg", "weighted avg"}
        for ent_key, ent_metrics in report.items():
            if ent_key in average_rows:
                continue
            for metric_key, metric_value in ent_metrics.items():
                scores[f"{ent_key}_{metric_key}"] = metric_value

    return scores


class SeqEvalEvaluator:
    """Evaluator to compute the performance of labeling tasks such as
    named entity recognition. This evaluator compares TextDocuments of
    reference with their predicted annotations and returns a dictionary of
    metrics.

    The evaluator converts the set of entities and documents to tags before
    computing the metrics. It supports two schemes, IOB2 (a BIO scheme) and
    BILOU. The IOB2 scheme tags the Beginning, the Inside and the Outside text
    of an entity. The BILOU scheme tags the Beginning, the Inside and the Last
    tokens of a multi-token entity as well as Unit-length entities.

    For more information about IOB schemes, refer to the `Wikipedia page
    <https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>`_

    .. hint::
        If **tokenizer** is not defined, the evaluator tokenizes the text by
        character. This may generate a lot of tokens with large documents and
        may affect execution time. You can use a fast tokenizer from
        HuggingFace, i.e. : bert tokenizer

        >>> from transformers import AutoTokenizer
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    """

    def __init__(
        self,
        tagging_scheme: Literal["bilou", "iob2"] = "bilou",
        return_metrics_by_label: bool = True,
        tokenizer: Optional[Any] = None,
    ):
        """
        Parameters
        ----------
        tagging_scheme:
            Scheme for tagging the tokens, it can be `bilou` or `iob2`
        return_metrics_by_label:
            If `True`, return the metrics by label in the output dictionary.
            If `False`, only return overall metrics
        tokenizer:
            Optional Fast Tokenizer to convert text into tokens.
            If not provided, the text is tokenized by character.
        """
        self.tokenizer = tokenizer
        self.tagging_scheme = tagging_scheme
        self.return_metrics_by_label = return_metrics_by_label

    def compute(
        self, documents: List[TextDocument], predicted_entities: List[List[Entity]]
    ) -> Dict[str, float]:
        """Compute metrics of entity matching given predictions.

        Parameters
        ----------
        documents:
            Text documents containing entities of reference
        predicted_entities:
            List of predicted entities by document

        Returns
        -------
        Dict[str, float]:
            A dictionary with overall and per type metrics if required. The
            metrics included are: accuracy, precision, recall and F1 score.

        Raises
        ------
        ValueError
            If `documents` and `predicted_entities` do not have the same
            number of items.
        """
        # materialized so the lengths can be compared for any iterable
        documents = list(documents)
        predicted_entities = list(predicted_entities)
        # zip() stops at the shortest input: without this check a mismatch
        # would silently evaluate only a subset of the documents
        if len(documents) != len(predicted_entities):
            raise ValueError(
                "'documents' and 'predicted_entities' must have the same length,"
                f" got {len(documents)} and {len(predicted_entities)}"
            )

        true_tags_all, pred_tags_all = [], []
        for document, pred_entities in zip(documents, predicted_entities):
            text = document.text
            true_entities = document.anns.entities
            true_tags_all.append(
                self._tag_text_with_entities(text=text, entities=true_entities)
            )
            pred_tags_all.append(
                self._tag_text_with_entities(text=text, entities=pred_entities)
            )

        return _compute_seqeval_from_dict(
            y_true_all=true_tags_all,
            y_pred_all=pred_tags_all,
            tagging_scheme=self.tagging_scheme,
            return_metrics_by_label=self.return_metrics_by_label,
        )

    def _tag_text_with_entities(self, text: str, entities: List[Entity]) -> List[str]:
        """Return the tags of `text` given `entities`.

        One tag per token is created when a tokenizer was provided, otherwise
        one tag per character.
        """
        if self.tokenizer is not None:
            # tags tokenized text, creates one tag per token
            text_encoding = self.tokenizer(text).encodings[0]
            return hf_tokenization_utils.transform_entities_to_tags(
                text_encoding=text_encoding,
                entities=entities,
                tagging_scheme=self.tagging_scheme,
            )

        # tags untokenized text, create one tag per character
        tags = ["O"] * len(text)
        for ent in entities:
            ent_spans = span_utils.normalize_spans(ent.spans)
            # nothing to tag if normalization leaves no span; indexing the
            # empty list below would raise an IndexError
            if not ent_spans:
                continue

            # the entity is tagged from the start of its first span to the
            # end of its last span
            chars_entity = range(ent_spans[0].start, ent_spans[-1].end)
            if not chars_entity:
                continue

            entity_tags = hf_tokenization_utils.create_entity_tags(
                nb_tags=len(chars_entity),
                label=ent.label,
                tagging_scheme=self.tagging_scheme,
            )
            for char_idx, tag in zip(chars_entity, entity_tags):
                tags[char_idx] = tag
        return tags
class SeqEvalMetricsComputer:
    """An implementation of :class:`~medkit.training.MetricsComputer` using
    seqeval to compute metrics in the training of named-entity recognition
    components.

    The metrics computer can be used with a :class:`~medkit.training.Trainer`
    """

    def __init__(
        self,
        id_to_label: Dict[int, str],
        tagging_scheme: Literal["bilou", "iob2"] = "bilou",
        return_metrics_by_label: bool = True,
    ):
        """
        Parameters
        ----------
        id_to_label:
            Mapping integer value to label, it should be the same used in
            preprocess
        tagging_scheme:
            Scheme used for tagging the tokens, it can be `bilou` or `iob2`
        return_metrics_by_label:
            If `True`, return the metrics by label in the output dictionary.
            If `False`, only return overall metrics
        """
        self.id_to_label = id_to_label
        self.tagging_scheme = tagging_scheme
        self.return_metrics_by_label = return_metrics_by_label

    def prepare_batch(
        self, model_output: BatchData, input_batch: BatchData
    ) -> Dict[str, List[List[str]]]:
        """Convert a batch of tensors into tags usable to compute the metric.

        Parameters
        ----------
        model_output:
            A batch data including the `logits` predicted by the model
        input_batch:
            A batch data including the `labels` of reference

        Returns
        -------
        Dict[str, List[List[str]]]
            A dictionary with the true (`y_true`) and predicted (`y_pred`)
            tags representation of a batch data
        """
        # predicted id = position of the highest logit on the last dimension
        logits = model_output["logits"]
        pred_ids = logits.argmax(dim=-1).detach().to("cpu").numpy()
        gold_ids = input_batch["labels"].detach().to("cpu").numpy()

        # positions holding the special tag id in the reference are ignored,
        # for the reference as well as for the prediction
        keep = gold_ids != hf_tokenization_utils.SPECIAL_TAG_ID_HF

        def _ids_to_tags(ids_batch):
            tags_batch = []
            for ids, keep_row in zip(ids_batch, keep):
                tags_batch.append([self.id_to_label[tag_id] for tag_id in ids[keep_row]])
            return tags_batch

        true_tags = _ids_to_tags(gold_ids)
        pred_tags = _ids_to_tags(pred_ids)
        return {"y_true": true_tags, "y_pred": pred_tags}

    def compute(self, all_data: Dict[str, List[Any]]) -> Dict[str, float]:
        """Compute metrics from the tags collected batch after batch during
        the training/evaluation loop.

        Parameters
        ----------
        all_data:
            A dictionary with the true and predicted tags collected by
            batches, under the `y_true` and `y_pred` keys. A missing key is
            treated as an empty list.

        Returns
        -------
        Dict[str, float]:
            A dictionary with overall and per label metrics if required. The
            metrics included are : accuracy, precision, recall and F1 score.
        """
        return _compute_seqeval_from_dict(
            y_true_all=all_data.get("y_true", []),
            y_pred_all=all_data.get("y_pred", []),
            tagging_scheme=self.tagging_scheme,
            return_metrics_by_label=self.return_metrics_by_label,
        )