# Source code for medkit.text.preprocessing.eds_cleaner

__all__ = ["EDSCleaner"]

from typing import List

from medkit.core import Operation
from medkit.core.text import Segment, utils

# predefined configuration for french documents
_FR_CIVIL_TITLES = ["M", "Mme", "Mlle", "Mr", "Pr", "Dr", "Mde"]
_FR_PREPOSITIONS_AFTER = [
    "de",
    "par",
    "le",
    "du",
    "la",
    "les",
    "des",
    "un",
    "une",
    "ou",
    "pour",
    "avec",
]
_FR_KEYWORDS_BEFORE = ["pour", "avec", "et"]


[docs]class EDSCleaner(Operation): """ EDS pre-processing annotation module This module is a non-destructive module allowing to remove and clean selected points and newlines characters. It respects the span modification by creating a new text-bound annotation containing the span modification information from input text. """ _DEFAULT_LABEL = "clean_text" def __init__( self, output_label: str = _DEFAULT_LABEL, keep_endlines: bool = False, handle_parentheses_eds: bool = True, handle_points_eds: bool = True, uid: str = None, ): """ Instantiate the endlines handler. Parameters ---------- output_label The output label of the created annotations. keep_endlines: If True, modify multiple endlines using `.\\n` as a replacement. If False (default), modify multiple endlines using whitespaces (`.\\s`) as a replacement. handle_parentheses_eds: If True (default), modify the text near to parentheses or keywords according to predefined rules for french documents If False, the text near to parentheses or keywords is not modified handle_points_eds: Modify points near to predefined keywords for french documents If True (default), modify the points near to keywords If False, the points near to keywords is not modified uid Identifier of the pre-processing module """ # Pass all arguments to super (remove self) init_args = locals() init_args.pop("self") super().__init__(**init_args) self.output_label = output_label self.keep_endlines = keep_endlines self.handle_parentheses_eds = handle_parentheses_eds self.handle_points_eds = handle_points_eds
[docs] def run(self, segments: List[Segment]) -> List[Segment]: """ Run the module on a list of segments provided as input and returns a new list of segments. Parameters ---------- segments List of segments to normalize Returns ------- List[~medkit.core.text.Segment]: List of cleaned segments. """ return [ norm_segment for segment in segments for norm_segment in self._clean_segment_text(segment) ]
def _clean_segment_text(self, segment: Segment): """ Clean up a segment non-destructively, remove points between numbers and upper case letters. Then remove multiple whitespaces or newline characters. Finally, modify parentheses or point after keywords if necessary. """ text = segment.text spans = segment.spans # modify points characters text, spans = utils.replace_point_in_uppercase(text, spans) text, spans = utils.replace_point_in_numbers(text, spans) # modify newline character text, spans = utils.clean_newline_character( text=text, spans=spans, keep_endlines=self.keep_endlines ) # modify all whitespaces characters text, spans = utils.clean_multiple_whitespaces_in_sentence(text, spans) # modify parentheses using predefined rules for french documents if self.handle_parentheses_eds: text, spans = utils.clean_parentheses_eds(text, spans) if self.handle_points_eds: # replace the character `.` after and before certain keywords # after the title of a person (i.e. M. or Mrs.) text, spans = utils.replace_point_after_keywords( text=text, spans=spans, keywords=_FR_CIVIL_TITLES, strict=True, ) # after certain prepositions (`du` . patient) text, spans = utils.replace_point_after_keywords( text=text, spans=spans, keywords=_FR_PREPOSITIONS_AFTER, strict=False, ) # before certain prepositions (venue . `avec`) text, spans = utils.replace_point_before_keywords( text=text, spans=spans, keywords=_FR_KEYWORDS_BEFORE ) # create ann with the clean text clean_text = Segment(label=self.output_label, spans=spans, text=text) if self._prov_tracer is not None: self._prov_tracer.add_prov( clean_text, self.description, source_data_items=[segment] ) yield clean_text