# Public API of this module: the names exported by a star-import.
__all__ = [
    "replace_point_after_keywords",
    "replace_multiple_newline_after_sentence",
    "replace_newline_inside_sentence",
    "clean_newline_character",
    "clean_multiple_whitespaces_in_sentence",
    "clean_parentheses_eds",
    "replace_point_in_uppercase",
    "replace_point_in_numbers",
    "replace_point_before_keywords",
    "lstrip",
    "rstrip",
    "strip",
]
import re
from typing import List, Tuple, Union
from medkit.core.text.span import AnySpan
import medkit.core.text.span_utils as span_utils
# Character-class fragments used for character classification. They are not
# complete patterns: the functions of this module interpolate them inside a
# `[...]` set when building their regular expressions.
_NUMERIC_CHARS = "0-9"  # ASCII digits
_UPPERCASE_CHARS = "A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ"  # ASCII capitals plus accented capitals/ligatures
_PUNCT_CHARS = r"\.,;\?\!\:\("  # . , ; ? ! : and the opening parenthesis
_LOWERCASE_CHARS = "a-zàâäçéèêëîïôöùûüÿ"  # ASCII lowercase plus accented lowercase letters
def clean_newline_character(
    text: str, spans: List[AnySpan], keep_endlines: bool = False
) -> Tuple[str, List[AnySpan]]:
    r"""Normalize the newline characters of `text` according to their context.

    Three passes are applied in order:

    1. :func:`replace_multiple_newline_after_sentence`
    2. :func:`replace_newline_inside_sentence`
    3. every run of newlines still present is rewritten as an end of
       sentence: ``".\n"`` when `keep_endlines` is True, ``". "`` otherwise.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`
    keep_endlines:
        Whether the remaining endlines become ``'.\n'`` (True) or ``'. '``
        (False)

    Returns
    -------
    The cleaned text and the updated list of spans

    Examples
    --------
    >>> raw = "This is\n\na sentence\nAnother\nsentence\n\nhere"
    >>> spans = [Span(0, len(raw))]
    >>> text, _ = clean_newline_character(raw, spans, keep_endlines=False)
    >>> print(text)
    This is a sentence. Another sentence here
    >>> text, _ = clean_newline_character(raw, spans, keep_endlines=True)
    >>> print(text)
    This is a sentence.
    Another sentence here
    """
    # what a leftover run of newlines is turned into
    sentence_end = ".\n" if keep_endlines else ". "

    text, spans = replace_multiple_newline_after_sentence(text, spans)
    text, spans = replace_newline_inside_sentence(text, spans)
    cleaned_text, cleaned_spans = _replace_text(
        text, spans, pattern="\n+", repl=sentence_end
    )
    return cleaned_text, cleaned_spans
def clean_parentheses_eds(text: str, spans: List[AnySpan]) -> Tuple[str, List[AnySpan]]:
    """Rewrite parenthesized content; the rules target French documents.

    The following rewrites are applied in order:

    * ``(-)`` is replaced by ``" negatif "`` and ``(+)`` by ``" positif "``;
    * long parentheses (30 to 5000 characters without nested parentheses)
      are handled by :func:`_replace_big_parentheses`, which moves their
      content after the rest of the sentence;
    * short parentheses (1 to 29 characters) are handled by
      :func:`_replace_small_parentheses`, which turns each parenthesis
      into a comma.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The modified text and the updated list of spans

    Examples
    --------
    >>> text = "Le patient a un traitement(debuté le 3/02)."
    >>> spans = [Span(0, len(text))]
    >>> text, spans = clean_parentheses_eds(text, spans)
    >>> print(text)
    Le patient a un traitement,debuté le 3/02,.
    """
    # literal result markers first, so they are not seen as small parentheses
    result_markers = (
        (r"\(-\)", " negatif "),
        (r"\(\+\)", " positif "),
    )
    for marker_pattern, replacement in result_markers:
        text, spans = _replace_text(text, spans, marker_pattern, replacement, group=0)

    text, spans = _replace_big_parentheses(text, spans)
    text, spans = _replace_small_parentheses(text, spans)
    return text, spans
def clean_multiple_whitespaces_in_sentence(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Collapse every run of two or more spaces/tabs into a single space.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The cleaned text and the updated list of spans

    Example
    -------
    >>> text = "A   phrase    with  multiple   spaces"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = clean_multiple_whitespaces_in_sentence(text, spans)
    >>> print(text)
    A phrase with multiple spaces
    """
    collapsed_text, collapsed_spans = _replace_text(
        text, spans, pattern=r"([ \t]{2,})", repl=" ", group=0
    )
    return collapsed_text, collapsed_spans
def replace_point_after_keywords(
    text: str,
    spans: List[AnySpan],
    keywords: List[str],
    strict: bool = False,
    replace_by: str = " ",
) -> Tuple[str, List[AnySpan]]:
    r"""Replace the character '.' after a keyword and update its span.

    Could be used to replace dots that indicate the title of a person
    (i.e. M. or Mrs.) or some dots that appear by mistake after `keywords`.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`
    keywords:
        Words or regular-expression patterns to match before a point. They
        are inserted as-is in the pattern (not escaped), each one prefixed
        with ``\b``. If empty, `text` and `spans` are returned unchanged.
    strict:
        If True, the keyword must be immediately followed by a point.
        If False, zero or more whitespaces are allowed between the keyword
        and the point; these whitespaces are replaced together with the point.
    replace_by:
        Replacement string

    Returns
    -------
    The text with the replaced matches and the updated list of spans

    Examples
    --------
    >>> text = "Le Dr. a un rdv. Mme. Bernand est venue à 14h"
    >>> spans = [Span(0, len(text))]
    >>> keywords = ["Dr","Mme"]
    >>> text, spans = replace_point_after_keywords(text, spans, keywords,replace_by="")
    >>> print(text)
    Le Dr a un rdv. Mme Bernand est venue à 14h
    """
    # Without keywords the alternation would be empty and the pattern would
    # match (and replace) every point of the text: nothing to do instead.
    if not keywords:
        return text, spans

    # '\b' makes sure each keyword starts on a word boundary
    keywords_regexp = "|".join(rf"\b{keyword}" for keyword in keywords)
    if strict:
        pattern = rf"(?:{keywords_regexp})(\.)"  # point right after the keyword
    else:
        pattern = rf"(?:{keywords_regexp})(\s*\.)"  # optional whitespaces before the point

    # Only the first group (whitespaces + point) is replaced, not the keyword
    text, spans = _replace_text(text, spans, pattern, repl=replace_by, group=1)
    return text, spans
def replace_multiple_newline_after_sentence(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    r"""Collapse newlines that precede a digit or an uppercase letter.

    A run of consecutive newline characters (``\n``, optionally mixed with
    ``\r``) that is followed, possibly after some whitespace, by a number or
    a capital letter is replaced with a single ``\n``. The whitespace between
    the newlines and that character is left untouched.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The cleaned text and the updated list of spans
    """
    sentence_start = "[" + _NUMERIC_CHARS + _UPPERCASE_CHARS + "]"
    newline_run = r"(?P<blanks>\r?\n[\r\n]*)[\t\s]*" + sentence_start
    # only the named group (the newlines) is replaced
    new_text, new_spans = _replace_text(
        text, spans, newline_run, repl="\n", group="blanks"
    )
    return new_text, new_spans
def replace_newline_inside_sentence(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    r"""Replace newlines that precede a lowercase letter or a punctuation mark.

    A run of consecutive newline characters (``\n``, optionally mixed with
    ``\r``) that is followed, possibly after some whitespace, by a lowercase
    letter or a punctuation mark is replaced with a single space. The
    whitespace between the newlines and that character is left untouched.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The cleaned text and the updated list of spans
    """
    sentence_continuation = "[" + _LOWERCASE_CHARS + _PUNCT_CHARS + "]"
    newline_run = r"(?P<blanks>\r?\n[\r\n]*)[\t\s]*" + sentence_continuation
    # only the named group (the newlines) is replaced
    new_text, new_spans = _replace_text(
        text, spans, newline_run, repl=" ", group="blanks"
    )
    return new_text, new_spans
def _replace_big_parentheses(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Rewrite the sentences containing large parentheses.

    A large parenthesis holds 30 to 5000 characters and no nested
    parenthesis. In the rewritten sentence, the text found between the
    closing parenthesis and the next point comes first, followed by ' ; ',
    the text that was inside the parentheses and a final point. When nothing
    follows the parentheses, ' ; ' is simply put before the inner text.
    """
    # surrounding whitespaces are part of the match to control the output format
    big_parentheses = re.compile(
        r"(\s*)\((?P<txt_inside>[^)(]{30,5000})\)(\s*)(?P<txt_after>[^.]*)\."
    )

    # the text changes at every rewrite, so search it again from scratch
    # until no large parenthesis is left
    match = big_parentheses.search(text)
    while match is not None:
        # both groups, along with their spans
        inner_text, inner_spans = span_utils.extract(
            text, spans, [match.span("txt_inside")]
        )
        tail_text, tail_spans = span_utils.extract(
            text, spans, [match.span("txt_after")]
        )

        if not tail_spans:
            # nothing after the parentheses: ' ; {inner}.'
            sentence_text, sentence_spans = span_utils.insert(
                inner_text, inner_spans, [0, len(inner_text)], [" ; ", "."]
            )
        else:
            # ' {tail} ; {inner}.'
            inner_text, inner_spans = span_utils.insert(
                inner_text, inner_spans, [len(inner_text)], ["."]
            )
            tail_text, tail_spans = span_utils.insert(
                tail_text, tail_spans, [0, len(tail_text)], [" ", " ; "]
            )
            sentence_text, sentence_spans = span_utils.concatenate(
                [tail_text, inner_text], [tail_spans, inner_spans]
            )

        # put the rewritten sentence back between what precedes and what
        # follows the whole match
        left_text, left_spans = span_utils.extract(
            text, spans, [(0, match.start(0))]
        )
        right_text, right_spans = span_utils.extract(
            text, spans, [(match.end(0), len(text))]
        )
        text, spans = span_utils.concatenate(
            [left_text, sentence_text, right_text],
            [left_spans, sentence_spans, right_spans],
        )
        match = big_parentheses.search(text)

    return text, spans
def _replace_small_parentheses(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Rewrite the sentences containing small parentheses.

    A small parenthesis holds 1 to 29 characters and no nested parenthesis.
    Both the opening and the closing parenthesis are replaced by `,`, so the
    inner text ends up surrounded by commas.
    """
    # collect the range of each parenthesis character (not of its content)
    paren_ranges = []
    for match in re.finditer(r"(\()(?:[^)(]{1,29})(\))", text):
        paren_ranges.append(match.span(1))
        paren_ranges.append(match.span(2))
    paren_ranges.sort(key=lambda rng: rng[0])

    commas = [","] * len(paren_ranges)
    new_text, new_spans = span_utils.replace(text, spans, paren_ranges, commas)
    return new_text, new_spans
def _replace_text(
    text: str,
    spans: List[AnySpan],
    pattern: str,
    repl: str,
    group: Union[str, int] = 0,
) -> Tuple[str, List[AnySpan]]:
    """Replace with `repl` the `group` of every match of `pattern` in `text`.

    Only the character range of the requested group (index or name) is
    replaced, not the whole match. The spans are updated accordingly by
    `span_utils.replace`.
    """
    matches = re.finditer(pattern, text)
    group_ranges = [found.span(group) for found in matches]
    replacements = [repl] * len(group_ranges)
    return span_utils.replace(text, spans, group_ranges, replacements)
def replace_point_in_uppercase(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Replace the character '.' between uppercase characters
    with a space and update its span.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The text with the replaced points and the updated list of spans

    Examples
    --------
    >>> text = "Abréviation ING.DRT or RTT.J"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = replace_point_in_uppercase(text, spans)
    >>> print(text)
    Abréviation ING DRT or RTT J

    Points separated by a single letter are all replaced:

    >>> text = "S.N.C.F"
    >>> spans = [Span(0, len(text))]
    >>> text, spans = replace_point_in_uppercase(text, spans)
    >>> print(text)
    S N C F
    """
    # The uppercase character after the point is checked with a lookahead so
    # that it is not consumed by the match. Matches found by `re.finditer` do
    # not overlap: if that character were consumed, it could not be used as
    # the left-hand side of the next point and "A.B.C" would become "A B.C".
    pattern = rf"[{_UPPERCASE_CHARS}](\.)(?=[{_UPPERCASE_CHARS}])"
    # Only the first group (the point) is replaced
    text, spans = _replace_text(text, spans, pattern, " ", group=1)
    return text, spans
def replace_point_in_numbers(
    text: str, spans: List[AnySpan]
) -> Tuple[str, List[AnySpan]]:
    """Replace the character '.' between numbers
    with the character ',' and update its span.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`

    Returns
    -------
    The text with the replaced points and the updated list of spans

    Example
    -------
    >>> text = "La valeur est de 3.456."
    >>> spans = [Span(0, len(text))]
    >>> text, spans = replace_point_in_numbers(text, spans)
    >>> print(text)
    La valeur est de 3,456.
    """
    # The digit after the point is checked with a lookahead so that it is not
    # consumed by the match. Matches found by `re.finditer` do not overlap:
    # if that digit were consumed, it could not be used as the left-hand side
    # of the next point and "1.2.3" would become "1,2.3".
    pattern = rf"[{_NUMERIC_CHARS}](\.)(?=[{_NUMERIC_CHARS}])"
    # Only the first group (the point) is replaced
    text, spans = _replace_text(text, spans, pattern, ",", group=1)
    return text, spans
def replace_point_before_keywords(
    text: str, spans: List[AnySpan], keywords: List[str]
) -> Tuple[str, List[AnySpan]]:
    r"""Replace the character '.' before a keyword
    with a space and update its span.

    The point must be preceded by a whitespace character and may be followed
    by whitespaces before the keyword. The whole "whitespace, point,
    whitespaces" sequence is replaced by a single space.

    Parameters
    ----------
    text:
        The text to be modified
    spans:
        Spans associated to the `text`
    keywords:
        Words or regular-expression patterns to match after a point. They
        are inserted as-is in the pattern (not escaped), each one followed
        by ``\b``. If empty, `text` and `spans` are returned unchanged.

    Returns
    -------
    The text with the replaced matches and the updated list of spans
    """
    # Without keywords the alternation would be empty and the pattern would
    # match (and replace) every " ." of the text: nothing to do instead.
    if not keywords:
        return text, spans

    # '\b' makes sure each keyword ends on a word boundary
    keywords_regexp = "|".join(rf"{keyword}\b" for keyword in keywords)
    pattern = rf"(\s\.\s*)(?:{keywords_regexp})"
    # Only the first group (whitespaces + point) is replaced, not the keyword
    text, spans = _replace_text(text, spans, pattern, " ", group=1)
    return text, spans
def lstrip(text: str, start: int = 0, chars: str = None) -> Tuple[str, int]:
    """Strip the leading characters of `text` and shift its start index.

    Parameters
    ----------
    text
        The text to strip.
    start
        The start index of `text` in the original text, if any.
    chars
        The set of characters to strip, with the same meaning as in
        `str.lstrip([chars])` (whitespace when None).

    Returns
    -------
    The stripped text and `start` moved forward by the number of removed
    characters
    """
    stripped = text.lstrip(chars)
    removed_count = len(text) - len(stripped)
    return stripped, start + removed_count
def rstrip(text: str, end: int = None, chars: str = None) -> Tuple[str, int]:
    """Strip the trailing characters of `text` and shift its end index.

    Parameters
    ----------
    text
        The text to strip.
    end
        The end index of `text` in the original text, if any. When None,
        the length of `text` is used.
    chars
        The set of characters to strip, with the same meaning as in
        `str.rstrip([chars])` (whitespace when None).

    Returns
    -------
    The stripped text and the end index moved backward by the number of
    removed characters
    """
    stripped = text.rstrip(chars)
    removed_count = len(text) - len(stripped)
    original_end = len(text) if end is None else end
    return stripped, original_end - removed_count
def strip(text: str, start: int = 0, chars: str = None) -> Tuple[str, int, int]:
    """Returns a copy of the string with leading and trailing characters
    removed and its corresponding new start and end indexes.

    Parameters
    ----------
    text
        The text to strip.
    start
        The start index from the original text if any.
    chars
        The list of characters to strip, applied to both sides. Default
        behaviour is like `str.strip([chars])`.

    Returns
    -------
    The stripped text, its new start index and its new end index
    """
    # left side: the start index moves forward by the number of removed chars
    left_stripped = text.lstrip(chars)
    new_start = start + (len(text) - len(left_stripped))
    # right side: `chars` must be used here as well, otherwise only
    # whitespaces would be removed at the end of the text
    new_text = left_stripped.rstrip(chars)
    new_end = new_start + len(new_text)
    return new_text, new_start, new_end