# Source code for medkit.text.segmentation.tokenizer_utils

# Public API: strip helpers that also report the adjusted start/end indexes.
__all__ = ["lstrip", "rstrip", "strip"]

from typing import Optional, Tuple


def lstrip(text: str, start: int = 0, chars: Optional[str] = None) -> Tuple[str, int]:
    """Return a copy of the string with leading characters removed,
    together with its corresponding new start index.

    Parameters
    ----------
    text
        The text to strip.
    start
        The start index of `text` in the original text, if any.
    chars
        The characters to strip, given as a string. When `None`,
        whitespace is stripped, as with `str.lstrip([chars])`.

    Returns
    -------
    Tuple[str, int]
        The stripped text and `start` advanced by the number of
        leading characters that were removed.
    """
    stripped = text.lstrip(chars)
    # Every removed leading character moves the start one position right.
    removed = len(text) - len(stripped)
    return stripped, start + removed


def rstrip(
    text: str, end: Optional[int] = None, chars: Optional[str] = None
) -> Tuple[str, int]:
    """Return a copy of the string with trailing characters removed,
    together with its corresponding new end index.

    Parameters
    ----------
    text
        The text to strip.
    end
        The end index of `text` in the original text, if any.
        When `None`, the length of `text` is used.
    chars
        The characters to strip, given as a string. When `None`,
        whitespace is stripped, as with `str.rstrip([chars])`.

    Returns
    -------
    Tuple[str, int]
        The stripped text and `end` moved back by the number of
        trailing characters that were removed.
    """
    # Compare against None explicitly: an end index of 0 is a valid value.
    if end is None:
        end = len(text)
    stripped = text.rstrip(chars)
    # Every removed trailing character moves the end one position left.
    removed = len(text) - len(stripped)
    return stripped, end - removed


def strip(text: str, start: int = 0, chars: Optional[str] = None) -> Tuple[str, int, int]:
    """Return a copy of the string with leading and trailing characters
    removed, together with its corresponding new start and end indexes.

    Parameters
    ----------
    text
        The text to strip.
    start
        The start index of `text` in the original text, if any.
    chars
        The characters to strip from both sides, given as a string.
        When `None`, whitespace is stripped, as with `str.strip([chars])`.

    Returns
    -------
    Tuple[str, int, int]
        The stripped text, `start` advanced by the number of leading
        characters removed, and the end index (new start plus the length
        of the stripped text).
    """
    # Strip the left side first so the start offset can be measured.
    left_stripped = text.lstrip(chars)
    new_start = start + (len(text) - len(left_stripped))
    # `chars` must be applied to the right side too; otherwise trailing
    # whitespace would be stripped instead of the requested characters.
    new_text = left_stripped.rstrip(chars)
    new_end = new_start + len(new_text)
    return new_text, new_start, new_end