Source code for medkit.text.preprocessing.char_replacer

from __future__ import annotations

__all__ = ["CharReplacer"]

from medkit.core.operation import Operation
from medkit.core.text import Segment, span_utils
from medkit.text.preprocessing.char_rules import ALL_CHAR_RULES



[docs]
class CharReplacer(Operation):
    """Generic character replacer to be used as pre-processing module

    This module is a non-destructive module allowing to replace selected 1-char
    strings with the wanted n-chars strings.
    It respects the span modification by creating a new text-bound annotation
    (a new segment) whose spans are computed from the spans of the input
    segment, so the span modification information is kept.
    """

    def __init__(
        self,
        output_label: str,
        rules: list[tuple[str, str]] | None = None,
        name: str | None = None,
        uid: str | None = None,
    ):
        """Parameters
        ----------
        output_label : str
            The output label of the created annotations
        rules : list of tuple, optional
            The list of replacement rules, as `(char, replacement)` pairs.
            Default: ALL_CHAR_RULES
        name : str, optional
            Name describing the pre-processing module (defaults to the class name)
        uid : str, optional
            Identifier of the pre-processing module

        Raises
        ------
        AssertionError
            If the key of a rule (the string to replace) is not exactly 1 char long
        """
        # Pass all arguments to super (remove self).
        # NOTE: `locals()` must remain the first statement of the body so that
        # no other local variable ends up in the forwarded arguments.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.output_label = output_label
        if rules is None:
            rules = ALL_CHAR_RULES
        # Stored as a dict for direct lookup by character; if the same character
        # appears in several rules, the last one wins.
        self.rules = dict(rules)

        # Raised explicitly instead of using an `assert` statement: asserts are
        # removed when Python runs with -O, which would silently disable this
        # validation. AssertionError is kept as the exception type for
        # backward compatibility.
        if any(len(key) != 1 for key in self.rules):
            raise AssertionError("CharReplacer can only contain rules that replace 1-char string.")

    def run(self, segments: list[Segment]) -> list[Segment]:
        """Run the module on a list of segments provided as input
        and returns a new list of segments

        Parameters
        ----------
        segments : list of Segment
            List of segments to process

        Returns
        -------
        list of Segment
            List of new segments
        """
        # Flatten the segments yielded for each input segment into a single list
        return [
            processed_segment
            for segment in segments
            for processed_segment in self._process_segment_text(segment)
        ]

    def _process_segment_text(self, segment: Segment):
        """Yield a new segment built from `segment` with the replacement rules applied

        Parameters
        ----------
        segment : Segment
            Segment whose text must be processed

        Yields
        ------
        Segment
            New segment labelled with `output_label`, holding the text and spans
            returned by `span_utils.replace`
        """
        ranges = []
        replacement_texts = []

        # Collect the 1-char range and the replacement text of each character
        # that has a rule
        for ind, c in enumerate(segment.text):
            nc = self.rules.get(c)
            if nc is not None:
                ranges.append((ind, ind + 1))
                replacement_texts.append(nc)

        new_text, new_spans = span_utils.replace(
            text=segment.text,
            spans=segment.spans,
            ranges=ranges,
            replacement_texts=replacement_texts,
        )

        processed_text = Segment(label=self.output_label, spans=new_spans, text=new_text)

        # Record the input segment as the source of the new one when a
        # provenance tracer is set
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(processed_text, self.description, source_data_items=[segment])

        yield processed_text