# Source code for medkit.text.preprocessing.char_replacer
from __future__ import annotations
__all__ = ["CharReplacer"]
from medkit.core.operation import Operation
from medkit.core.text import Segment, span_utils
from medkit.text.preprocessing.char_rules import ALL_CHAR_RULES
# [docs]
class CharReplacer(Operation):
    """Generic character replacer to be used as a pre-processing module.

    This module is non-destructive: it replaces selected 1-char strings with
    the wanted n-char strings without touching the input segments.
    It respects the span modification by creating a new text-bound annotation
    (a new segment) containing the span modification information from the
    input text.
    """

    def __init__(
        self,
        output_label: str,
        rules: list[tuple[str, str]] | None = None,
        name: str | None = None,
        uid: str | None = None,
    ):
        """Parameters
        ----------
        output_label : str
            The output label of the created annotations
        rules : list of tuple, optional
            The list of replacement rules, as ``(char, replacement)`` pairs.
            Default: ALL_CHAR_RULES
        name : str, optional
            Name describing the pre-processing module (defaults to the class name)
        uid : str, optional
            Identifier of the pre-processing module

        Raises
        ------
        AssertionError
            If the key of a rule is not exactly one character long.
        """
        # Pass all arguments to super (remove self).
        # NOTE: `locals()` has to stay the very first statement, before any
        # other local variable is bound, otherwise extra names would be
        # forwarded to the parent initializer.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.output_label = output_label

        if rules is None:
            rules = ALL_CHAR_RULES
        # Stored as a mapping so each character lookup is O(1); when the same
        # character appears in several rules, the last rule wins.
        self.rules = dict(rules)

        # Explicit check rather than an `assert` statement: asserts are removed
        # when Python runs with -O, which would let invalid rules through
        # silently (a key that is not 1-char long can never match a character).
        # AssertionError is kept so that existing callers see the same
        # exception type and message as before.
        if any(len(key) != 1 for key in self.rules):
            msg = "CharReplacer can only contain rules that replace 1-char string."
            raise AssertionError(msg)

    def run(self, segments: list[Segment]) -> list[Segment]:
        """Run the module on a list of segments provided as input
        and return a new list of segments.

        Parameters
        ----------
        segments : list of Segment
            List of segments to process

        Returns
        -------
        list of Segment
            List of new segments
        """
        return [
            processed_segment
            for segment in segments
            for processed_segment in self._process_segment_text(segment)
        ]

    def _process_segment_text(self, segment: Segment):
        """Apply the replacement rules to the text of one segment.

        Parameters
        ----------
        segment : Segment
            Segment whose text must be processed

        Yields
        ------
        Segment
            A single new segment labelled `output_label`, holding the text and
            spans returned by `span_utils.replace`
        """
        # Collect every 1-char range that has a rule, together with the text
        # that must replace it (both lists are kept aligned by position).
        ranges = []
        replacement_texts = []
        for ind, c in enumerate(segment.text):
            nc = self.rules.get(c)
            if nc is not None:
                ranges.append((ind, ind + 1))
                replacement_texts.append(nc)

        # All replacements are delegated to span_utils in a single call, which
        # returns the new text along with the corresponding spans.
        new_text, new_spans = span_utils.replace(
            text=segment.text,
            spans=segment.spans,
            ranges=ranges,
            replacement_texts=replacement_texts,
        )

        processed_text = Segment(label=self.output_label, spans=new_spans, text=new_text)

        # Record provenance only when a tracer is set
        # (presumably installed through the Operation base class - confirm).
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(processed_text, self.description, source_data_items=[segment])

        yield processed_text