Source code for medkit.text.ner.hf_tokenization_utils

# Public API of this module: the names exported by `from ... import *`.
# The helper `create_entity_tags` is defined below but is not exported.
__all__ = [
    "transform_entities_to_tags",
    "align_and_map_tokens_with_tags",
    "convert_labels_to_tags",
]

from typing import Dict, List
from typing_extensions import Literal

from transformers.tokenization_utils_fast import EncodingFast

from medkit.core.text import Entity, span_utils

# Tag id used by `align_and_map_tokens_with_tags` for positions that do not
# receive a real tag id (special tokens and, optionally, non-first sub-tokens).
# NOTE(review): presumably chosen to match the default `ignore_index` (-100) of
# the loss used when training HuggingFace models - confirm.
SPECIAL_TAG_ID_HF: int = -100


def convert_labels_to_tags(
    labels: List[str],
    tagging_scheme: Literal["bilou", "iob2"] = "bilou",
) -> Dict[str, int]:
    """Convert a list of labels into a mapping from NER tags to ids.

    The "O" tag always gets id 0. Every label then receives one tag per
    prefix of the tagging scheme, with consecutive ids assigned in the order
    the labels are given. Repeated labels are only considered once so that
    the ids stay contiguous.

    Parameters
    ----------
    labels:
        List of labels to convert
    tagging_scheme:
        Scheme to use in the conversion, "iob2" follows the BIO scheme.
        Any value other than "bilou" is handled as "iob2".

    Returns
    -------
    label_to_id: Dict[str, int]:
        Mapping with NER tags.

    Examples
    --------
    >>> convert_labels_to_tags(labels=["test","problem"],tagging_scheme="iob2")
    {'O': 0, 'B-test': 1, 'I-test': 2, 'B-problem': 3, 'I-problem': 4}

    """
    prefixes = ("B", "I", "L", "U") if tagging_scheme == "bilou" else ("B", "I")

    label_to_id = {"O": 0}
    # dict.fromkeys drops repeated labels while keeping first-seen order;
    # without it a repeated label would overwrite its earlier ids and leave
    # gaps in the id range.
    for label in dict.fromkeys(labels):
        for prefix in prefixes:
            # the next free id is the current size of the mapping
            label_to_id[f"{prefix}-{label}"] = len(label_to_id)
    return label_to_id


def create_entity_tags(
    nb_tags: int, label: str, tagging_scheme: Literal["bilou", "iob2"]
) -> List[str]:
    """Create a list of tags representing one entity

    Parameters
    ----------
    nb_tags:
        Total of tags to create. If it is zero or negative, an empty list
        is returned.
    label:
        Entity label
    tagging_scheme:
        Scheme to use in the conversion, "iob2" follows the BIO scheme.
        Any value other than "bilou" is handled as "iob2".

    Returns
    -------
    tags: List[str]:
        Tags representing the entity

    Examples
    --------
    >>> create_entity_tags(nb_tags=3, label="corporation", tagging_scheme="bilou")
    ['B-corporation', 'I-corporation', 'L-corporation']
    """
    # nothing to tag: avoid indexing into an empty list
    if nb_tags <= 0:
        return []

    is_bilou = tagging_scheme == "bilou"

    # an entity made of a single tag is "U" (unit) in BILOU, "B" in IOB2
    if nb_tags == 1:
        single_prefix = "U" if is_bilou else "B"
        return [f"{single_prefix}-{label}"]

    # the last tag is "L" (last) in BILOU; IOB2 has no dedicated closing tag
    last_prefix = "L" if is_bilou else "I"
    inside_tags = [f"I-{label}"] * (nb_tags - 2)
    return [f"B-{label}", *inside_tags, f"{last_prefix}-{label}"]


def transform_entities_to_tags(
    text_encoding: EncodingFast,
    entities: List[Entity],
    tagging_scheme: Literal["bilou", "iob2"] = "bilou",
) -> List[str]:
    """
    Transform entities from an encoded document to a list of BILOU/IOB2 tags.

    Every position of the encoding starts with the "O" tag. For each entity,
    the tokens covering the characters between the start of its first
    normalized span and the end of its last normalized span are tagged with
    the entity label. Entities are processed in the given order, so a later
    entity overwrites the tags of an earlier one on shared tokens.

    Parameters
    ----------
    text_encoding:
        Encoding of the document of reference, this is created by a HuggingFace fast tokenizer.
        It contains a tokenized version of the document to tag.
    entities:
        The list of entities to transform
    tagging_scheme:
        Scheme to tag the tokens, it can be `bilou` or `iob2`

    Returns
    -------
    List[str]:
        A list describing the document with tags. By default the tags
        could be "B", "I", "L", "O","U", if `tagging_scheme` is `iob2`
        the tags could be "B", "I","O".

    Examples
    --------
    >>> # Define a fast tokenizer, i.e. : bert tokenizer
    >>> from transformers import AutoTokenizer
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

    >>> document = TextDocument(text="medkit")
    >>> entities = [Entity(label="corporation", spans=[Span(start=0, end=6)], text='medkit')]
    >>> # Get text encoding of the document using the tokenizer
    >>> text_encoding = tokenizer(document.text).encodings[0]
    >>> print(text_encoding.tokens)
    ['[CLS]', 'med', '##kit', '[SEP]']

    Transform to BILOU tags

    >>> tags = transform_entities_to_tags(text_encoding,entities)
    >>> assert tags == ['O', 'B-corporation', 'L-corporation', 'O']

    Transform to IOB2 tags

    >>> tags = transform_entities_to_tags(text_encoding,entities,"iob2")
    >>> assert tags == ['O', 'B-corporation', 'I-corporation', 'O']


    """
    tags = ["O"] * len(text_encoding)

    for ent in entities:
        ent_spans = span_utils.normalize_spans(ent.spans)
        # NOTE(review): assumes normalization can yield no spans (e.g. an
        # entity without spans); such an entity cannot be located in the
        # text, so it is skipped instead of failing on `ent_spans[0]`.
        if not ent_spans:
            continue

        start_char = ent_spans[0].start
        end_char = ent_spans[-1].end

        # Tokens covering the entity characters. Characters for which
        # `char_to_token` returns None are ignored, and the set removes the
        # repetitions of a token spanning several characters.
        candidate_tokens = map(text_encoding.char_to_token, range(start_char, end_char))
        tokens_entity = sorted(
            {token_id for token_id in candidate_tokens if token_id is not None}
        )

        if not tokens_entity:
            continue

        entity_tags = create_entity_tags(
            nb_tags=len(tokens_entity),
            label=ent.label,
            tagging_scheme=tagging_scheme,
        )

        for token_idx, tag in zip(tokens_entity, entity_tags):
            tags[token_idx] = tag
    return tags


def align_and_map_tokens_with_tags(
    text_encoding: EncodingFast,
    tags: List[str],
    tag_to_id: Dict[str, int],
    map_sub_tokens: bool = True,
) -> List[int]:
    """
    Return a list of tags_ids aligned with the text encoding.
    Positions flagged as special tokens by the encoding keep the
    `SPECIAL_TAG_ID_HF` value.

    Parameters
    ----------
    text_encoding:
        Text encoding after tokenization with a HuggingFace fast tokenizer
    tags:
        A list of tags i.e BILOU tags. The tag at position `i` is expected
        to describe the token at position `i` of the encoding.
    tag_to_id:
        Mapping tag to id
    map_sub_tokens:
        When a token is not in the vocabulary of the tokenizer, it could split
        the token into multiple subtokens.
        If `map_sub_tokens` is True, all tags inside a token will be converted.
        If `map_sub_tokens` is False, only the first subtoken of a split token will be
        converted, the other subtokens keep the `SPECIAL_TAG_ID_HF` value.

    Returns
    -------
    List[int]:
        A list of tags ids, with the same length as `tags`

    Raises
    ------
    KeyError
        If a tag that has to be converted is not a key of `tag_to_id`

    Examples
    --------
    >>> # Define a fast tokenizer, i.e. : bert tokenizer
    >>> from transformers import AutoTokenizer
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

    >>> # define data to map
    >>> text_encoding = tokenizer("medkit").encodings[0]
    >>> tags = ["O","B-corporation","I-corporation","O"]
    >>> tag_to_id = {"O":0, "B-corporation":1, "I-corporation":2}
    >>> print(text_encoding.tokens)
    ['[CLS]', 'med', '##kit', '[SEP]']

    Mapping all tags to tags_ids

    >>> tags_ids = align_and_map_tokens_with_tags(text_encoding, tags,tag_to_id)
    >>> assert tags_ids == [-100, 1, 2, -100]

    Mapping only first tag in tokens

    >>> tags_ids = align_and_map_tokens_with_tags(text_encoding, tags, tag_to_id,False)
    >>> assert tags_ids == [-100, 1, -100, -100]
    """
    special_tokens_mask = text_encoding.special_tokens_mask
    word_ids = text_encoding.word_ids

    # every position starts as "not mapped"; only selected ones are overwritten
    tags_ids = [SPECIAL_TAG_ID_HF] * len(tags)

    previous_word = None
    for idx, tag in enumerate(tags):
        if special_tokens_mask[idx]:
            continue

        # a token starts a word when its word id differs from the one of the
        # last non-special token seen
        word = word_ids[idx]
        starts_word = word != previous_word
        previous_word = word

        # first sub-tokens are always converted, the following ones only on demand
        if starts_word or map_sub_tokens:
            tags_ids[idx] = tag_to_id[tag]
    return tags_ids