# Source code for medkit.text.preprocessing.eds_cleaner

# Names exported by `from <this module> import *`: only the EDSCleaner operation.
__all__ = ["EDSCleaner"]

from typing import Iterator, List, Optional

from medkit.core import Operation
from medkit.core.text import Segment, utils

# Predefined keyword lists for French documents.
# They are only read by EDSCleaner when `handle_points_eds` is enabled.

# Civil titles: keywords for `utils.replace_point_after_keywords` (strict=True).
_FR_CIVIL_TITLES = [
    "M",
    "Mme",
    "Mlle",
    "Mr",
    "Pr",
    "Dr",
    "Mde",
]

# Prepositions: keywords for `utils.replace_point_after_keywords` (strict=False).
_FR_PREPOSITIONS_AFTER = [
    "de", "par", "le", "du", "la", "les",
    "des", "un", "une", "ou", "pour", "avec",
]  # fmt: skip

# Keywords for `utils.replace_point_before_keywords`.
_FR_KEYWORDS_BEFORE = [
    "pour",
    "avec",
    "et",
]


class EDSCleaner(Operation):
    """
    EDS pre-processing annotation module.

    This module is a non-destructive module allowing to remove and clean selected
    points and newline characters. For each input segment it creates a new segment
    holding the cleaned text, together with the spans returned by the cleaning
    helpers, so that the modifications stay linked to the input text.
    """

    _DEFAULT_LABEL = "clean_text"

    def __init__(
        self,
        output_label: str = _DEFAULT_LABEL,
        keep_endlines: bool = False,
        handle_parentheses_eds: bool = True,
        handle_points_eds: bool = True,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the EDS cleaner.

        Parameters
        ----------
        output_label
            The output label of the created annotations.
        keep_endlines
            If True, modify multiple endlines using `.\\n` as a replacement.
            If False (default), modify multiple endlines using whitespaces (`.\\s`)
            as a replacement.
        handle_parentheses_eds
            If True (default), modify the text near to parentheses or keywords
            according to predefined rules for french documents.
            If False, the text near to parentheses or keywords is not modified.
        handle_points_eds
            If True (default), modify the points near to predefined keywords
            for french documents.
            If False, the points near to keywords are not modified.
        uid
            Identifier of the pre-processing module.
        """
        # Forward every constructor argument to the parent class.
        # NOTE: `locals()` must stay the first statement of the body, before any
        # other local name is bound, otherwise extra names would be forwarded too.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.output_label = output_label
        self.keep_endlines = keep_endlines
        self.handle_parentheses_eds = handle_parentheses_eds
        self.handle_points_eds = handle_points_eds

    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Run the module on a list of segments provided as input
        and return a new list of segments.

        Parameters
        ----------
        segments
            List of segments to normalize

        Returns
        -------
        List[~medkit.core.text.Segment]:
            List of cleaned segments, in the same order as the input segments.
        """
        return [
            clean_segment
            for segment in segments
            for clean_segment in self._clean_segment_text(segment)
        ]

    def _clean_segment_text(self, segment: Segment) -> Iterator[Segment]:
        """
        Clean up a segment and yield the resulting new segment.

        The cleaning helpers of `utils` are applied in a fixed order, each one
        receiving the text and spans produced by the previous one:

        1. points in upper case sequences, then points in numbers;
        2. newline characters (according to `keep_endlines`);
        3. multiple whitespaces;
        4. parentheses, only if `handle_parentheses_eds` is set;
        5. points after / before predefined french keywords, only if
           `handle_points_eds` is set.

        Parameters
        ----------
        segment
            Segment whose `text` and `spans` are used as a starting point.

        Yields
        ------
        ~medkit.core.text.Segment
            A single segment labelled with `output_label`, holding the cleaned text
            and the corresponding spans.
        """
        text = segment.text
        spans = segment.spans

        # modify points characters
        text, spans = utils.replace_point_in_uppercase(text, spans)
        text, spans = utils.replace_point_in_numbers(text, spans)

        # modify newline character
        text, spans = utils.clean_newline_character(
            text=text, spans=spans, keep_endlines=self.keep_endlines
        )
        # modify all whitespaces characters
        text, spans = utils.clean_multiple_whitespaces_in_sentence(text, spans)

        # modify parentheses using predefined rules for french documents
        if self.handle_parentheses_eds:
            text, spans = utils.clean_parentheses_eds(text, spans)

        if self.handle_points_eds:
            # replace the character `.` after and before certain keywords
            # after the title of a person (i.e. M. or Mrs.)
            text, spans = utils.replace_point_after_keywords(
                text=text,
                spans=spans,
                keywords=_FR_CIVIL_TITLES,
                strict=True,
            )
            # after certain prepositions (`du` . patient)
            text, spans = utils.replace_point_after_keywords(
                text=text,
                spans=spans,
                keywords=_FR_PREPOSITIONS_AFTER,
                strict=False,
            )
            # before certain keywords (venue   . `avec`)
            text, spans = utils.replace_point_before_keywords(
                text=text, spans=spans, keywords=_FR_KEYWORDS_BEFORE
            )

        # create the new annotation holding the clean text
        clean_text = Segment(label=self.output_label, spans=spans, text=text)

        # record that the new segment derives from the input segment, when a
        # provenance tracer is set (`_prov_tracer` and `description` are not
        # defined in this class; presumably provided by `Operation`)
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(
                clean_text, self.description, source_data_items=[segment]
            )

        yield clean_text