# Source code for medkit.text.ner.hf_tokenization_utils

# Names exported by `from medkit.text.ner.hf_tokenization_utils import *`.
# `create_entity_tags` is defined in this module but is not part of this list.
__all__ = [
    "transform_entities_to_tags",
    "align_and_map_tokens_with_tags",
    "convert_labels_to_tags",
]

from typing import Dict, List
from typing_extensions import Literal

from transformers.tokenization_utils_fast import EncodingFast

from medkit.core.text import Entity, span_utils

# Placeholder id that `align_and_map_tokens_with_tags` leaves on every position
# it does not map: special tokens, and the non-first sub-tokens of a word when
# `map_sub_tokens` is False.
# NOTE(review): -100 presumably matches the label id ignored by the loss in
# HuggingFace / PyTorch training loops — confirm against the trainer in use.
SPECIAL_TAG_ID_HF: int = -100


def convert_labels_to_tags(
    labels: List[str],
    tagging_scheme: Literal["bilou", "iob2"] = "bilou",
) -> Dict[str, int]:
    """Convert a list of labels in a mapping of NER tags

    The tag ``"O"`` always gets id 0. Every label then receives one tag per
    prefix of the scheme, numbered consecutively in the order the labels are
    given. A label that appears more than once is only taken into account the
    first time, so the returned ids are always contiguous
    (``0 .. len(mapping) - 1``).

    Parameters
    ----------
    labels:
        List of labels to convert
    tagging_scheme:
        Scheme to use in the conversion, "iob2" follows the BIO scheme.
        Any value other than "bilou" is handled as "iob2".

    Returns
    -------
    label_to_id: Dict[str, int]:
        Mapping with NER tags.

    Examples
    --------
    >>> convert_labels_to_tags(labels=["test","problem"],tagging_scheme="iob2")
    {'O': 0, 'B-test': 1, 'I-test': 2, 'B-problem': 3, 'I-problem': 4}
    """
    prefixes = ["B", "I", "L", "U"] if tagging_scheme == "bilou" else ["B", "I"]

    # A repeated label would overwrite its own tags with larger ids and leave
    # holes in the id range; keep only the first occurrence, in input order.
    unique_labels = list(dict.fromkeys(labels))

    label_to_id = {"O": 0}
    for label in unique_labels:
        for prefix in prefixes:
            # the next free id is the current size of the mapping
            label_to_id[f"{prefix}-{label}"] = len(label_to_id)
    return label_to_id
def create_entity_tags(
    nb_tags: int, label: str, tagging_scheme: Literal["bilou", "iob2"]
) -> List[str]:
    """Create a list of tags representing one entity

    Parameters
    ----------
    nb_tags:
        Total of tags to create. When it is lower than 1 there is nothing
        to tag and an empty list is returned.
    label:
        Entity label
    tagging_scheme:
        Scheme to use in the conversion, "iob2" follows the BIO scheme.
        Any value other than "bilou" is handled as "iob2".

    Returns
    -------
    tags: List[str]:
        Tags representing the entity

    Examples
    --------
    >>> create_entity_tags(nb_tags=3, label="corporation", tagging_scheme="bilou")
    ['B-corporation', 'I-corporation', 'L-corporation']
    """
    # Without this guard, indexing the empty list below raises IndexError.
    if nb_tags < 1:
        return []

    is_bilou = tagging_scheme == "bilou"

    if nb_tags == 1:
        # a single-token entity is a "unit" in BILOU, a lone "begin" in IOB2
        prefix = "U" if is_bilou else "B"
        return [f"{prefix}-{label}"]

    tags = [f"I-{label}"] * nb_tags
    tags[0] = f"B-{label}"
    # only BILOU marks the last token of the entity with a dedicated prefix
    last_prefix = "L" if is_bilou else "I"
    tags[-1] = f"{last_prefix}-{label}"
    return tags
def transform_entities_to_tags(
    text_encoding: EncodingFast,
    entities: List[Entity],
    tagging_scheme: Literal["bilou", "iob2"] = "bilou",
) -> List[str]:
    """
    Transform entities from an encoded document to a list of BILOU/IOB2 tags.

    Each entity is tagged over the character range going from the start of
    its first normalized span to the end of its last one, so tokens located
    in a gap between two spans of the same entity are tagged as well.
    Entities that do not map to any token are ignored. When two entities
    share a token, the entity that comes later in `entities` wins.

    Parameters
    ----------
    text_encoding:
        Encoding of the document of reference, this is created by a HuggingFace
        fast tokenizer. It contains a tokenized version of the document to tag.
    entities:
        The list of entities to transform
    tagging_scheme:
        Scheme to tag the tokens, it can be `bilou` or `iob2`

    Returns
    -------
    List[str]:
        A list describing the document with tags, one per token of
        `text_encoding`. By default the tags could be "B", "I", "L", "O", "U",
        if `tagging_scheme` is `iob2` the tags could be "B", "I", "O".

    Examples
    --------
    >>> # Define a fast tokenizer, i.e. : bert tokenizer
    >>> from transformers import AutoTokenizer
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

    >>> document = TextDocument(text="medkit")
    >>> entities = [Entity(label="corporation", spans=[Span(start=0, end=6)], text='medkit')]
    >>> # Get text encoding of the document using the tokenizer
    >>> text_encoding = tokenizer(document.text).encodings[0]
    >>> print(text_encoding.tokens)
    ['[CLS]', 'med', '##kit', '[SEP]']

    Transform to BILOU tags

    >>> tags = transform_entities_to_tags(text_encoding,entities)
    >>> assert tags == ['O', 'B-corporation', 'L-corporation', 'O']

    Transform to IOB2 tags

    >>> tags = transform_entities_to_tags(text_encoding,entities,"iob2")
    >>> assert tags == ['O', 'B-corporation', 'I-corporation', 'O']
    """
    doc_tags = ["O" for _ in range(len(text_encoding))]

    for entity in entities:
        entity_label = entity.label
        normalized = span_utils.normalize_spans(entity.spans)
        first_char, stop_char = normalized[0].start, normalized[-1].end

        # several characters map to the same token: collect them in a set
        token_positions = {
            text_encoding.char_to_token(char_idx)
            for char_idx in range(first_char, stop_char)
        }
        # characters without a token (e.g. whitespace) are reported as None
        token_positions.discard(None)

        if token_positions:
            ordered_positions = sorted(token_positions)
            new_tags = create_entity_tags(
                nb_tags=len(ordered_positions),
                label=entity_label,
                tagging_scheme=tagging_scheme,
            )
            for position, new_tag in zip(ordered_positions, new_tags):
                doc_tags[position] = new_tag

    return doc_tags
def align_and_map_tokens_with_tags(
    text_encoding: EncodingFast,
    tags: List[str],
    tag_to_id: Dict[str, int],
    map_sub_tokens: bool = True,
) -> List[int]:
    """
    Return a list of tags_ids aligned with the text encoding.
    Positions flagged as special tokens keep the `SPECIAL_TAG_ID_HF` value.

    Parameters
    ----------
    text_encoding:
        Text encoding after tokenization with a HuggingFace fast tokenizer
    tags:
        A list of tags i.e BILOU tags, one per token of `text_encoding`
    tag_to_id:
        Mapping tag to id
    map_sub_tokens:
        When a token is not in the vocabulary of the tokenizer, it could split
        the token into multiple subtokens.
        If `map_sub_tokens` is True, all tags inside a token will be converted.
        If `map_sub_tokens` is False, only the first subtoken of a split token
        will be converted, the other subtokens keep `SPECIAL_TAG_ID_HF`.

    Returns
    -------
    List[int]:
        A list of tags ids, with the same length as `tags`

    Examples
    --------
    >>> # Define a fast tokenizer, i.e. : bert tokenizer
    >>> from transformers import AutoTokenizer
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

    >>> # define data to map
    >>> text_encoding = tokenizer("medkit").encodings[0]
    >>> tags = ["O","B-corporation","I-corporation","O"]
    >>> tag_to_id = {"O":0, "B-corporation":1, "I-corporation":2}
    >>> print(text_encoding.tokens)
    ['[CLS]', 'med', '##kit', '[SEP]']

    Mapping all tags to tags_ids

    >>> tags_ids = align_and_map_tokens_with_tags(text_encoding, tags,tag_to_id)
    >>> assert tags_ids == [-100, 1, 2, -100]

    Mapping only first tag in tokens

    >>> tags_ids = align_and_map_tokens_with_tags(text_encoding, tags, tag_to_id,False)
    >>> assert tags_ids == [-100, 1, -100, -100]
    """
    is_special = text_encoding.special_tokens_mask
    aligned_ids = [SPECIAL_TAG_ID_HF] * len(tags)
    word_of_token = text_encoding.word_ids

    last_word = None
    for position, tag in enumerate(tags):
        if not is_special[position]:
            word = word_of_token[position]
            # a change of word id marks the first sub-token of a word
            starts_word = word != last_word
            if starts_word or map_sub_tokens:
                aligned_ids[position] = tag_to_id[tag]
            if starts_word:
                last_word = word

    return aligned_ids