# Source code for medkit.core.text.utils

from __future__ import annotations

# Names exported by a star-import of this module. The underscore-prefixed
# helpers and constants defined below are not listed here.
__all__ = [
    "replace_point_after_keywords",
    "replace_multiple_newline_after_sentence",
    "replace_newline_inside_sentence",
    "clean_newline_character",
    "clean_multiple_whitespaces_in_sentence",
    "clean_parentheses_eds",
    "replace_point_in_uppercase",
    "replace_point_in_numbers",
    "replace_point_before_keywords",
    "lstrip",
    "rstrip",
    "strip",
]


import re
from typing import TYPE_CHECKING

from medkit.core.text import span_utils

if TYPE_CHECKING:
    from medkit.core.text.span import AnySpan

# Some strings for character classification.
# Each value is a fragment that the functions of this module interpolate
# inside a regex character class (``[...]``); none is a full pattern on its own.
# ASCII digits.
_NUMERIC_CHARS = "0-9"
# ASCII uppercase letters plus accented capitals and the ligatures Æ / Œ.
_UPPERCASE_CHARS = "A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ"
# Escaped punctuation marks; note that the opening parenthesis is included.
_PUNCT_CHARS = r"\.,;\?\!\:\("
# ASCII lowercase letters plus accented lowercase letters.
# NOTE(review): unlike the uppercase set, no "æ" / "œ" ligature here — confirm
# whether this asymmetry is intended.
_LOWERCASE_CHARS = "a-zàâäçéèêëîïôöùûüÿ"



[docs]
def clean_newline_character(text: str, spans: list[AnySpan], keep_endlines: bool = False) -> tuple[str, list[AnySpan]]:
    """Clean the newline characters of `text` according to their position.

    Three passes are chained: :func:`replace_multiple_newline_after_sentence`,
    then :func:`replace_newline_inside_sentence`, and finally every remaining
    run of newlines is turned into an end-of-sentence marker.

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`
    keep_endlines : bool, default=False
        If True, the remaining runs of newlines become a point followed by
        a newline; otherwise they become a point followed by a space.

    Returns
    -------
    text : str
        The cleaned text
    spans : list of AnySpan
        The list of modified spans

    Examples
    --------
    >>> raw = "This is\\n\\na sentence\\nAnother\\nsentence\\n\\nhere"
    >>> spans = [Span(0, len(raw))]
    >>> text, _ = clean_newline_character(raw, spans, keep_endlines=False)
    >>> print(text)
    This is a sentence. Another sentence here

    >>> text, _ = clean_newline_character(raw, spans, keep_endlines=True)
    >>> print(text)
    This is a sentence.
    Another sentence here
    """
    endline_repl = ".\n" if keep_endlines else ". "

    # position-dependent passes first, in this exact order
    for cleaner in (replace_multiple_newline_after_sentence, replace_newline_inside_sentence):
        text, spans = cleaner(text, spans)

    # whatever newline is left marks the end of a sentence
    return _replace_text(text, spans, pattern="\n+", repl=endline_repl)




[docs]
def clean_parentheses_eds(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Rewrite the parenthesised parts of `text` depending on their content.

    The markers ``(-)`` and ``(+)`` are spelled out, then long parentheses are
    moved to the end of their sentence and short ones are surrounded by commas.
    The rules are adapted for French documents.

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`

    Returns
    -------
    text : str
        The cleaned text
    spans : list of AnySpan
        The list of modified spans

    Examples
    --------
    >>> text = \"\"\"
    ... Le test PCR est (-), pas de nouvelles.
    ... L'examen d'aujourd'hui est (+).
    ... Les bilans réalisés (biologique, métabolique en particulier à la recherche
    ... de GAMT et X fragile) sont revenus négatifs.
    ... Le patient a un traitement(debuté le 3/02).
    ... \"\"\"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = clean_parentheses_eds(text, spans)
    >>> print(text)
    Le test PCR est  negatif , pas de nouvelles.
    L'examen d'aujourd'hui est  positif .
    Les bilans réalisés sont revenus négatifs ; biologique, métabolique en particulier à la recherche
    de GAMT et X fragile.
    Le patient a un traitement,debuté le 3/02,.
    """
    # fixed markers first, so that they are not seen as small parentheses later
    for marker, meaning in ((r"\(-\)", " negatif "), (r"\(\+\)", " positif ")):
        text, spans = _replace_text(text, spans, marker, meaning, group=0)

    text, spans = _replace_big_parentheses(text, spans)
    return _replace_small_parentheses(text, spans)




[docs]
def clean_multiple_whitespaces_in_sentence(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Collapse every run of two or more spaces/tabs into a single space.

    Newlines are not touched: only the space and tab characters are considered.

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`

    Returns
    -------
    text : str
        The cleaned text
    spans : list of AnySpan
        The list of modified spans

    Examples
    --------
    >>> text = "A   phrase    with  multiple   spaces"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = clean_multiple_whitespaces_in_sentence(text, spans)
    >>> print(text)
    A phrase with multiple spaces
    """
    return _replace_text(text, spans, r"([ \t]{2,})", " ", group=0)




[docs]
def replace_point_after_keywords(
    text: str,
    spans: list[AnySpan],
    keywords: list[str],
    strict: bool = False,
    replace_by: str = " ",
) -> tuple[str, list[AnySpan]]:
    """Replace the character '.' after a keyword and update its span.
    Could be used to replace dots that indicate the title of a person (i.e. M. or Mrs.)
    or some dots that appear by mistake after `keywords`

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`
    keywords : list of str
        Word or pattern to match before a point. Each keyword is inserted
        as-is in a regular expression, so regex syntax is interpreted.
        Empty keywords are ignored; if no keyword is left, `text` and
        `spans` are returned unchanged.
    strict : bool, default=False
        If True, the keyword must be followed by a point.
        If False, the keyword could have zero or many whitespaces before a point
    replace_by : str, default=" "
        Replacement string

    Returns
    -------
    text : str
        The text with the replaced matches
    spans : list of AnySpan
        The list of modified spans

    Examples
    --------
    >>> text = "Le Dr. a un rdv. Mme. Bernand est venue à 14h"
    >>> spans = [Span(0, len(text))]
    >>> keywords = ["Dr", "Mme"]
    >>> text, spans = replace_point_after_keywords(text, spans, keywords, replace_by="")
    >>> print(text)
    Le Dr a un rdv. Mme Bernand est venue à 14h
    """
    # An empty alternation matches the empty string, which would make the
    # pattern hit every point of the text: ignore empty keywords and leave
    # the text untouched when nothing is left to look for.
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        return text, list(spans)

    # '\b' anchors each keyword at the beginning of a word
    keywords_regexp = "|".join(rf"\b{keyword}" for keyword in keywords)
    # in non-strict mode the whitespaces before the point are replaced as well
    blanks = "" if strict else r"\s*"
    pattern = rf"(?:{keywords_regexp})({blanks}\.)"

    # The first group has the span of interest
    return _replace_text(text, spans, pattern, repl=replace_by, group=1)




[docs]
def replace_multiple_newline_after_sentence(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Collapse the line breaks found before a capital letter or a number
    into a single newline character.

    Only the run of line breaks is replaced; the blank characters that may
    stand between that run and the capital letter or number are kept.

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`

    Returns
    -------
    text : str
        The cleaned text
    spans : list of AnySpan
        The list of modified spans
    """
    # first character of what is considered a new sentence
    sentence_start = f"[{_NUMERIC_CHARS}{_UPPERCASE_CHARS}]"
    pattern = r"(?P<blanks>\r?\n[\r\n]*)[\t\s]*" + sentence_start
    return _replace_text(text, spans, pattern, repl="\n", group="blanks")




[docs]
def replace_newline_inside_sentence(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Replace the line breaks found before a lowercase letter or a
    punctuation mark with a single space.

    Only the run of line breaks is replaced; the blank characters that may
    stand between that run and the following character are kept.

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`

    Returns
    -------
    text : str
        The cleaned text
    spans : list of AnySpan
        The list of modified spans
    """
    # first character of what is considered the continuation of a sentence
    sentence_continuation = f"[{_LOWERCASE_CHARS}{_PUNCT_CHARS}]"
    pattern = r"(?P<blanks>\r?\n[\r\n]*)[\t\s]*" + sentence_continuation
    return _replace_text(text, spans, pattern, repl=" ", group="blanks")



def _replace_big_parentheses(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Modify the sentence containing large parentheses.
    The new sentence contains the text after the parentheses followed by
    the text that was inside the parentheses.

    A parenthesis is "large" when it holds between 30 and 5000 characters,
    none of them being a parenthesis. The match runs up to the next point.

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`

    Returns
    -------
    text : str
        The rewritten text
    spans : list of AnySpan
        The list of modified spans
    """
    # capture multiple spaces to control the output format: the blanks around
    # the parentheses are part of the match but of no named group, so they are
    # not carried over to the new phrase
    pattern = re.compile(r"(\s*)\((?P<txt_inside>[^)(]{30,5000})\)(\s*)(?P<txt_after>[^.]*)\.")

    while True:
        # iteration over the new text until no matches are found; the search
        # restarts from the beginning because `text` is rebuilt at each pass
        match = pattern.search(text)
        if match is None:
            break

        # extract groups including their spans
        txt_in, span_in = span_utils.extract(text, spans, [match.span("txt_inside")])
        txt_af, span_af = span_utils.extract(text, spans, [match.span("txt_after")])

        # presumably `span_af` is empty when the `txt_after` group is empty —
        # depends on `span_utils.extract`, TODO confirm
        if span_af:
            # insert characters before and after each group:
            # the text inside the parentheses now ends the sentence
            txt_in, span_in = span_utils.insert(txt_in, span_in, [len(txt_in)], ["."])
            # insert a space by default (eq: ' {text_af} ; ')
            txt_af, span_af = span_utils.insert(txt_af, span_af, [0, len(txt_af)], [" ", " ; "])
            # create the new phrase: text after the parentheses first, then the text inside
            txt_new, span_new = span_utils.concatenate([txt_af, txt_in], [span_af, span_in])
        else:
            # there is no text after (), insert ';' before
            txt_new, span_new = span_utils.insert(txt_in, span_in, [0, len(txt_in)], [" ; ", "."])

        # add the new phrase into the text. Extract text_before and text_after
        # from this match and concatenate all to update text and spans.
        # The matched pair of parentheses is dropped and none is inserted.
        txt_before, span_before = span_utils.extract(text, spans, [(0, match.start(0))])
        txt_after, span_after = span_utils.extract(text, spans, [(match.end(0), len(text))])
        text, spans = span_utils.concatenate([txt_before, txt_new, txt_after], [span_before, span_new, span_after])
    return text, spans


def _replace_small_parentheses(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Replace both parentheses of every small parenthesised group with `,`.

    A group is "small" when it holds between 1 and 29 characters, none of
    them being a parenthesis. The text inside the group is left as it is.
    """
    paren_ranges = []
    for found in re.finditer(r"(\()(?:[^)(]{1,29})(\))", text):
        # opening then closing parenthesis: matches come from left to right
        # and do not overlap, so the ranges are already sorted by start
        paren_ranges.extend((found.span(1), found.span(2)))

    commas = [","] * len(paren_ranges)
    return span_utils.replace(text, spans, paren_ranges, commas)


def _replace_text(
    text: str,
    spans: list[AnySpan],
    pattern: str,
    repl: str,
    group: str | int = 0,
) -> tuple[str, list[AnySpan]]:
    """Replace the `group` of every match of `pattern` in `text` with `repl`.

    The character ranges to replace are collected with a regex search, then
    handed to `span_utils.replace`, which returns the new text and spans.
    """
    regex = re.compile(pattern)
    ranges = [found.span(group) for found in regex.finditer(text)]
    replacements = [repl] * len(ranges)
    return span_utils.replace(text, spans, ranges, replacements)



[docs]
def replace_point_in_uppercase(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Replace the character '.' between uppercase characters
    with a space and update its span.

    Every point standing between two uppercase characters is replaced,
    including chained ones ("S.N.C.F" gives "S N C F").

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`

    Returns
    -------
    text : str
        The text with the replaced matches
    spans : list of AnySpan
        The list of modified spans

    Examples
    --------
    >>> text = "Abréviation ING.DRT or RTT.J"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = replace_point_in_uppercase(text, spans)
    >>> print(text)
    Abréviation ING DRT or RTT J

    """
    # The neighbours are checked with lookbehind/lookahead so that they are not
    # consumed: an uppercase character closing one match can open the next one.
    pattern = rf"(?<=[{_UPPERCASE_CHARS}])(\.)(?=[{_UPPERCASE_CHARS}])"
    return _replace_text(text, spans, pattern, " ", group=1)




[docs]
def replace_point_in_numbers(text: str, spans: list[AnySpan]) -> tuple[str, list[AnySpan]]:
    """Replace the character '.' between numbers
    with the character ',' and update its span.

    Every point standing between two digits is replaced, including
    chained ones ("1.2.3" gives "1,2,3").

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`

    Returns
    -------
    text : str
        The text with the replaced matches
    spans : list of AnySpan
        The list of modified spans

    Examples
    --------
    >>> text = "La valeur est de 3.456."
    >>> spans = [Span(0, len(text))]
    >>> text, spans = replace_point_in_numbers(text, spans)
    >>> print(text)
    La valeur est de 3,456.
    """
    # The neighbours are checked with lookbehind/lookahead so that they are not
    # consumed: a digit closing one match can open the next one.
    pattern = rf"(?<=[{_NUMERIC_CHARS}])(\.)(?=[{_NUMERIC_CHARS}])"
    return _replace_text(text, spans, pattern, ",", group=1)




[docs]
def replace_point_before_keywords(text: str, spans: list[AnySpan], keywords: list[str]) -> tuple[str, list[AnySpan]]:
    """Replace the character '.' before a keyword
    with a space and update its span.

    The point must be preceded by a whitespace; that whitespace, the point
    and the whitespaces following it are replaced together by one space.

    Parameters
    ----------
    text : str
        The text to be modified
    spans : list of AnySpan
        Spans associated to the `text`
    keywords : list of str
        Word or pattern to match after a point. Each keyword is inserted
        as-is in a regular expression, so regex syntax is interpreted.
        Empty keywords are ignored; if no keyword is left, `text` and
        `spans` are returned unchanged.

    Returns
    -------
    text : str
        The text with the replaced matches
    spans : list of AnySpan
        The list of modified spans
    """
    # An empty alternation matches the empty string, which would make the
    # pattern hit every isolated point of the text: ignore empty keywords and
    # leave the text untouched when nothing is left to look for.
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        return text, list(spans)

    # '\b' anchors each keyword at the end of a word
    keywords_regexp = "|".join(rf"{keyword}\b" for keyword in keywords)
    pattern = rf"(\s\.\s*)(?:{keywords_regexp})"

    # The first group has the span of interest
    return _replace_text(text, spans, pattern, " ", group=1)




[docs]
def lstrip(text: str, start: int = 0, chars: str | None = None) -> tuple[str, int]:
    """Remove the leading characters of `text` and shift its start index accordingly.

    Parameters
    ----------
    text : str
        The text to strip.
    start : int, default=0
        Index of the first character of `text` in the original text, if any.
    chars : str, optional
        The set of characters to strip. When omitted, whitespaces are
        stripped, like `str.lstrip()` does.

    Returns
    -------
    new_text : str
        The text without its leading characters
    new_start : int
        `start` moved forward by the number of characters removed
    """
    stripped = text.lstrip(chars)
    nb_removed = len(text) - len(stripped)
    return stripped, start + nb_removed




[docs]
def rstrip(text: str, end: int | None = None, chars: str | None = None) -> tuple[str, int]:
    """Remove the trailing characters of `text` and shift its end index accordingly.

    Parameters
    ----------
    text : str
        The text to strip.
    end : int, optional
        Index of the end of `text` in the original text, if any.
        When omitted, the length of `text` is used.
    chars : str, optional
        The set of characters to strip. When omitted, whitespaces are
        stripped, like `str.rstrip()` does.

    Returns
    -------
    new_text : str
        The text without its trailing characters
    new_end : int
        `end` moved backward by the number of characters removed
    """
    stripped = text.rstrip(chars)
    nb_removed = len(text) - len(stripped)
    original_end = len(text) if end is None else end
    return stripped, original_end - nb_removed




[docs]
def strip(text: str, start: int = 0, chars: str | None = None) -> tuple[str, int, int]:
    """Returns a copy of the string with leading and trailing characters removed
    and its corresponding new start and end indexes.

    Parameters
    ----------
    text : str
        The text to strip.
    start : int, default=0
        The start index from the original text if any.
    chars : str, optional
        The set of characters to strip from both ends.
        Default behaviour is like `str.strip([chars])`.

    Returns
    -------
    new_text : str
        New text
    new_start : int
        New start index
    new_end : int
        New end index
    """
    # leading side: the start index moves forward by what was removed
    without_leading = text.lstrip(chars)
    new_start = start + (len(text) - len(without_leading))

    # trailing side: `chars` must be applied here as well, otherwise only
    # whitespaces would be removed from the end
    new_text = without_leading.rstrip(chars)
    new_end = new_start + len(new_text)
    return new_text, new_start, new_end