Source code for medkit.core.text.utils

# Names exported by this module on `from ... import *`.
# Helpers whose name starts with an underscore are deliberately not listed.
__all__ = [
    "replace_point_after_keywords",
    "replace_multiple_newline_after_sentence",
    "replace_newline_inside_sentence",
    "clean_newline_character",
    "clean_multiple_whitespaces_in_sentence",
    "clean_parentheses_eds",
    "replace_point_in_uppercase",
    "replace_point_in_numbers",
    "replace_point_before_keywords",
    "lstrip",
    "rstrip",
    "strip",
]


import re
from typing import List, Tuple, Union

from medkit.core.text.span import AnySpan
import medkit.core.text.span_utils as span_utils

# Some strings for character classification.
# Each constant is a regex character-class fragment: the patterns built in this
# module interpolate it between `[` and `]`, so `0-9` / `A-Z` are ranges and
# the escaped punctuation is read inside a character class.
# Digits
_NUMERIC_CHARS = "0-9"
# Uppercase ASCII letters plus accented capitals and the ligatures Æ, Œ
_UPPERCASE_CHARS = "A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ"
# Punctuation marks: . , ; ? ! : (
_PUNCT_CHARS = r"\.,;\?\!\:\("
# Lowercase ASCII letters plus accented lowercase letters
# NOTE(review): unlike _UPPERCASE_CHARS, the ligatures (æ, œ) are not listed
# here -- confirm whether this is intended.
_LOWERCASE_CHARS = "a-zàâäçéèêëîïôöùûüÿ"


def clean_newline_character(
    text: str, spans: List[AnySpan], keep_endlines: bool = False
) -> Tuple[str, List[AnySpan]]:
    """Normalize the newline characters of `text` according to their context.

    Three passes are chained, each one updating the spans:

    1. :func:`replace_multiple_newline_after_sentence`
    2. :func:`replace_newline_inside_sentence`
    3. every remaining run of newline characters is replaced by an end of
       sentence, either ``'.\\n'`` or ``'. '``.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`
    keep_endlines:
        If True, the remaining newlines become ``'.\\n'``; otherwise (default)
        they become ``'. '``

    Returns
    -------
    The cleaned text and the list of spans updated

    Examples
    --------
    >>> raw = "This is\\na sentence\\nAnother\\nsentence\\n\\nhere"
    >>> text, spans = clean_newline_character(raw, [Span(0, len(raw))])
    >>> print(text)
    This is a sentence. Another sentence here
    >>> text, spans = clean_newline_character(
    ...     raw, [Span(0, len(raw))], keep_endlines=True
    ... )
    >>> print(text)
    This is a sentence.
    Another sentence here
    """
    if keep_endlines:
        endline_repl = ".\n"
    else:
        endline_repl = ". "

    # context-dependent passes first, so that only "real" endlines remain
    context_passes = (
        replace_multiple_newline_after_sentence,
        replace_newline_inside_sentence,
    )
    for context_pass in context_passes:
        text, spans = context_pass(text, spans)

    return _replace_text(text, spans, pattern="\n+", repl=endline_repl)
def clean_parentheses_eds(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Rewrite the text around parentheses depending on what they contain.

    The rules are adapted for French documents and are applied in this order:

    * ``(-)`` is replaced by ``' negatif '`` and ``(+)`` by ``' positif '``;
    * parentheses holding 30 to 5000 characters are moved after the rest of
      their sentence, separated by ``' ; '``;
    * for parentheses holding 1 to 29 characters, each parenthesis is
      replaced by ``','``.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The cleaned text and the list of spans updated

    Examples
    --------
    >>> text = "Le patient a un traitement(debuté le 3/02)."
    >>> spans = [Span(0, len(text))]
    >>> text, spans = clean_parentheses_eds(text, spans)
    >>> print(text)
    Le patient a un traitement,debuté le 3/02,.
    """
    # (-) / (+) are shorthand for a negative / positive result
    sign_substitutions = (
        (r"\(-\)", " negatif "),
        (r"\(\+\)", " positif "),
    )
    for sign_pattern, sign_word in sign_substitutions:
        text, spans = _replace_text(text, spans, sign_pattern, sign_word, group=0)

    # large parentheses first: the small-parentheses rule only targets what is left
    text, spans = _replace_big_parentheses(text, spans)
    return _replace_small_parentheses(text, spans)
def clean_multiple_whitespaces_in_sentence(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Collapse each run of two or more spaces or tabs into a single space.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The cleaned text and the list of spans updated

    Example
    -------
    >>> text = "A phrase    with  multiple   spaces"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = clean_multiple_whitespaces_in_sentence(text, spans)
    >>> print(text)
    A phrase with multiple spaces
    """
    # newlines are not part of the character class, only spaces and tabs
    return _replace_text(text, spans, r"([ \t]{2,})", " ", group=0)
def replace_point_after_keywords(
    text: str,
    spans: List[AnySpan],
    keywords: List[str],
    strict: bool = False,
    replace_by: str = " ",
) -> Tuple[str, List[AnySpan]]:
    """Replace the character '.' after a keyword and update its span.

    Could be used to replace dots that indicate the title of a person
    (i.e. M. or Mrs.) or some dots that appear by mistake after `keywords`.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`
    keywords:
        Words or regex patterns to match before a point. They are inserted
        in the regular expression as-is (not escaped). If the list is empty,
        `text` and `spans` are returned unchanged.
    strict:
        If True, the keyword must be immediately followed by a point.
        If False, the keyword could have zero or many whitespaces before
        a point; these whitespaces are replaced together with the point.
    replace_by:
        Replacement string

    Returns
    -------
    The text with the replaced matches and the updated list of spans

    Examples
    --------
    >>> text = "Le Dr. a un rdv. Mme. Bernand est venue à 14h"
    >>> spans = [Span(0, len(text))]
    >>> keywords = ["Dr", "Mme"]
    >>> text, spans = replace_point_after_keywords(
    ...     text, spans, keywords, replace_by=""
    ... )
    >>> print(text)
    Le Dr a un rdv. Mme Bernand est venue à 14h
    """
    # Without keywords the alternation below would be empty and would match
    # at every position, i.e. every point of the text would be replaced.
    if not keywords:
        return text, spans

    # '\b' makes sure each keyword starts at a word boundary
    keywords_regexp = "|".join(rf"\b{keyword}" for keyword in keywords)
    if strict:
        pattern = rf"(?:{keywords_regexp})(\.)"  # point right after kw
    else:
        pattern = rf"(?:{keywords_regexp})(\s*\.)"  # zero or many whitespaces after kw

    # The first group has the span of interest
    return _replace_text(text, spans, pattern, repl=replace_by, group=1)
def replace_multiple_newline_after_sentence(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Replace a run of newline characters by a single ``\\n`` when the next
    visible character is an uppercase letter or a digit.

    Only the run of newlines is replaced; the whitespace that may follow it
    and the uppercase letter or digit are left in place.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The cleaned text and the list of spans updated
    """
    # `blanks` is the run of newlines; the trailing part of the pattern only
    # checks what comes after it
    newlines_before_sentence = (
        rf"(?P<blanks>\r?\n[\r\n]*)[\t\s]*[{_NUMERIC_CHARS}{_UPPERCASE_CHARS}]"
    )
    return _replace_text(
        text, spans, newlines_before_sentence, repl="\n", group="blanks"
    )
def replace_newline_inside_sentence(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Replace a run of newline characters by a space when the next visible
    character is a lowercase letter or a punctuation mark.

    Only the run of newlines is replaced; the whitespace that may follow it
    and the lowercase letter or punctuation mark are left in place.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The cleaned text and the list of spans updated
    """
    # `blanks` is the run of newlines; the trailing part of the pattern only
    # checks what comes after it
    newlines_inside_sentence = (
        rf"(?P<blanks>\r?\n[\r\n]*)[\t\s]*[{_LOWERCASE_CHARS}{_PUNCT_CHARS}]"
    )
    return _replace_text(
        text, spans, newlines_inside_sentence, repl=" ", group="blanks"
    )
def _replace_big_parentheses(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Rewrite the sentences that contain large parentheses.

    Parentheses are "large" when they hold 30 to 5000 characters. The new
    sentence is the text that followed the parentheses (up to the next
    point), then ``' ; '``, then the text that was inside the parentheses.
    """
    # the whitespace around the parentheses is part of the match, so it is
    # not carried over to the rebuilt sentence
    big_parentheses = re.compile(
        r"(\s*)\((?P<txt_inside>[^)(]{30,5000})\)(\s*)(?P<txt_after>[^.]*)\."
    )

    # every rewrite changes the text, so search again from the start until
    # no match is left
    found = big_parentheses.search(text)
    while found is not None:
        # extract both groups together with their spans
        inside_txt, inside_spans = span_utils.extract(
            text, spans, [found.span("txt_inside")]
        )
        rest_txt, rest_spans = span_utils.extract(
            text, spans, [found.span("txt_after")]
        )

        if not rest_spans:
            # nothing after the parentheses: ' ; {inside}.'
            sentence_txt, sentence_spans = span_utils.insert(
                inside_txt, inside_spans, [0, len(inside_txt)], [" ; ", "."]
            )
        else:
            # ' {rest} ; ' followed by '{inside}.'
            inside_txt, inside_spans = span_utils.insert(
                inside_txt, inside_spans, [len(inside_txt)], ["."]
            )
            rest_txt, rest_spans = span_utils.insert(
                rest_txt, rest_spans, [0, len(rest_txt)], [" ", " ; "]
            )
            sentence_txt, sentence_spans = span_utils.concatenate(
                [rest_txt, inside_txt], [rest_spans, inside_spans]
            )

        # put the new sentence back between what precedes and what follows
        # the whole match
        head_txt, head_spans = span_utils.extract(
            text, spans, [(0, found.start(0))]
        )
        tail_txt, tail_spans = span_utils.extract(
            text, spans, [(found.end(0), len(text))]
        )
        text, spans = span_utils.concatenate(
            [head_txt, sentence_txt, tail_txt],
            [head_spans, sentence_spans, tail_spans],
        )
        found = big_parentheses.search(text)

    return text, spans


def _replace_small_parentheses(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Rewrite the sentences that contain small parentheses.

    Parentheses are "small" when they hold 1 to 29 characters. Each of the
    two parentheses is replaced by ``,`` and the text inside is kept.
    """
    small_parentheses = r"(\()(?:[^)(]{1,29})(\))"
    # groups 1 and 2 are the opening and the closing parenthesis
    ranges = [
        found.span(group_idx)
        for found in re.finditer(small_parentheses, text)
        for group_idx in (1, 2)
    ]
    ranges.sort(key=lambda rng: rng[0])
    return span_utils.replace(text, spans, ranges, [","] * len(ranges))


def _replace_text(
    text: str,
    spans: List[AnySpan],
    pattern: str,
    repl: str,
    group: Union[str, int] = 0,
) -> Tuple[str, List[AnySpan]]:
    """Replace `group` of every match of `pattern` in `text` by `repl`
    and update the spans accordingly."""
    matched_ranges = [found.span(group) for found in re.finditer(pattern, text)]
    replacements = [repl] * len(matched_ranges)
    return span_utils.replace(text, spans, matched_ranges, replacements)
def replace_point_in_uppercase(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Replace the character '.' between uppercase characters
    with a space and update its span.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The text with the replaced matches and the updated list of spans

    Examples
    --------
    >>> text = "Abréviation ING.DRT or RTT.J"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = replace_point_in_uppercase(text, spans)
    >>> print(text)
    Abréviation ING DRT or RTT J

    Consecutive points separated by a single letter are all replaced:

    >>> text = "I.R.M"
    >>> text, spans = replace_point_in_uppercase(text, [Span(0, len(text))])
    >>> print(text)
    I R M
    """
    # The surrounding letters are checked with lookbehind/lookahead so that
    # they are not consumed by the match: matches do not overlap, and a
    # consumed letter could not serve as the left context of the next point
    # (the second point of "A.B.C" would be missed).
    pattern = rf"(?<=[{_UPPERCASE_CHARS}])(\.)(?=[{_UPPERCASE_CHARS}])"
    return _replace_text(text, spans, pattern, " ", group=1)
def replace_point_in_numbers(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Replace the character '.' between digits with the character ','
    and update its span.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The text with the replaced matches and the updated list of spans

    Example
    -------
    >>> text = "La valeur est de 3.456."
    >>> spans = [Span(0, len(text))]
    >>> text, spans = replace_point_in_numbers(text, spans)
    >>> print(text)
    La valeur est de 3,456.

    Consecutive points separated by a single digit are all replaced:

    >>> text = "1.2.3"
    >>> text, spans = replace_point_in_numbers(text, [Span(0, len(text))])
    >>> print(text)
    1,2,3
    """
    # The surrounding digits are checked with lookbehind/lookahead so that
    # they are not consumed by the match: matches do not overlap, and a
    # consumed digit could not serve as the left context of the next point
    # (the second point of "1.2.3" would be missed).
    pattern = rf"(?<=[{_NUMERIC_CHARS}])(\.)(?=[{_NUMERIC_CHARS}])"
    return _replace_text(text, spans, pattern, ",", group=1)
def replace_point_before_keywords(
    text: str, spans: List[AnySpan], keywords: List[str]
) -> Tuple[str, List[AnySpan]]:
    """Replace the character '.' before a keyword with a space
    and update its span.

    The point must be preceded by a whitespace character. That whitespace,
    the point and any whitespace between the point and the keyword are
    replaced together by a single space.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`
    keywords:
        Words or regex patterns to match after a point. They are inserted
        in the regular expression as-is (not escaped). If the list is empty,
        `text` and `spans` are returned unchanged.

    Returns
    -------
    The text with the replaced matches and the updated list of spans
    """
    # Without keywords the alternation below would be empty and would match
    # at every position, i.e. every " ." of the text would be replaced.
    if not keywords:
        return text, spans

    # '\b' makes sure each keyword ends at a word boundary
    keywords_regexp = "|".join(rf"{keyword}\b" for keyword in keywords)
    pattern = rf"(\s\.\s*)(?:{keywords_regexp})"
    # The first group has the span of interest
    return _replace_text(text, spans, pattern, " ", group=1)
def lstrip(text: str, start: int = 0, chars: str = None) -> Tuple[str, int]:
    """Return `text` without its leading characters, along with the start
    index shifted by the number of characters removed.

    Parameters
    ----------
    text
        The text to strip.
    start
        The start index from the original text if any.
    chars
        The list of characters to strip. Default behaviour is like
        `str.lstrip([chars])`.

    Returns
    -------
    The stripped text and its new start index
    """
    stripped = text.lstrip(chars)
    nb_removed = len(text) - len(stripped)
    return stripped, start + nb_removed
def rstrip(text: str, end: int = None, chars: str = None) -> Tuple[str, int]:
    """Return `text` without its trailing characters, along with the end
    index shifted back by the number of characters removed.

    Parameters
    ----------
    text
        The text to strip.
    end
        The end index from the original text if any. When it is None, the
        length of `text` is used.
    chars
        The list of characters to strip. Default behaviour is like
        `str.rstrip([chars])`.

    Returns
    -------
    The stripped text and its new end index
    """
    original_end = len(text) if end is None else end
    stripped = text.rstrip(chars)
    nb_removed = len(text) - len(stripped)
    return stripped, original_end - nb_removed
def strip(text: str, start: int = 0, chars: str = None) -> Tuple[str, int, int]:
    """Return `text` without its leading and trailing characters, along with
    its corresponding new start and end indexes.

    Parameters
    ----------
    text
        The text to strip.
    start
        The start index from the original text if any.
    chars
        The list of characters to strip, applied to both ends of `text`.
        Default behaviour is like `str.strip([chars])`.

    Returns
    -------
    The stripped text, its new start index and its new end index
    """
    # left side: the start index moves forward by the number of characters removed
    left_stripped = text.lstrip(chars)
    new_start = start + (len(text) - len(left_stripped))

    # right side: `chars` must be applied here too, otherwise custom
    # characters would only be removed at the beginning of the text
    new_text = left_stripped.rstrip(chars)
    new_end = new_start + len(new_text)
    return new_text, new_start, new_end