from __future__ import annotations
__all__ = ["Normalizer", "NormalizerRule"]
import re
from typing import List, NamedTuple, Optional, Tuple
from medkit.core.operation import Operation
from medkit.core.text import Segment, span_utils
class NormalizerRule(NamedTuple):
    """
    Replacement rule used by the normalizer.

    Attributes
    ----------
    pattern_to_replace
        Regular expression pattern describing the text to replace
    new_text
        Text substituted for each match of the pattern
    """

    pattern_to_replace: str
    new_text: str
class Normalizer(Operation):
    """
    Generic normalizer to be used as pre-processing module

    This module is a non-destructive module allowing to replace selected characters
    with the wanted characters.
    It respects the span modification by creating a new text-bound annotation containing
    the span modification information from input text.
    """

    def __init__(
        self,
        output_label: str,
        rules: Optional[List[Tuple[str, str]]] = None,
        name: Optional[str] = None,
        uid: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        output_label
            The output label of the created annotations
        rules
            The list of replacement rules, each one a
            `(pattern_to_replace, new_text)` pair where `pattern_to_replace`
            is a regular expression. Rules are tried in list order and the
            first one matching at a given position wins. Defaults to no rule,
            in which case the text is left unchanged.
        name:
            Name describing the pre-processing module (defaults to the class name)
        uid
            Identifier of the pre-processing module

        Raises
        ------
        re.error
            If one of the rule patterns is not a valid regular expression
        """
        # Pass all arguments to super (remove self).
        # locals() must be captured first, before any other local is defined.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.output_label = output_label
        if rules is None:
            rules = []
        self.rules = [NormalizerRule(*rule) for rule in rules]

        # All rules are merged into one alternation so the text is scanned
        # only once; each rule is wrapped in its own capturing group.
        regex_rules = ["(" + rule.pattern_to_replace + ")" for rule in self.rules]
        regex_rule = r"|".join(regex_rules)
        self._pattern = re.compile(regex_rule)

        # A rule pattern may itself contain capturing groups, which shifts the
        # group numbers of the following rules in the combined pattern.
        # Record the index of the wrapping group of each rule so that the
        # matching rule can be identified reliably.
        self._rule_group_indices = []
        next_group_index = 1
        for rule in self.rules:
            self._rule_group_indices.append(next_group_index)
            inner_groups = re.compile(rule.pattern_to_replace).groups
            next_group_index += 1 + inner_groups

    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Run the module on a list of segments provided as input
        and returns a new list of segments

        Parameters
        ----------
        segments
            List of segments to normalize

        Returns
        -------
        List[~medkit.core.text.Segment]:
            List of normalized segments
        """
        return [
            norm_segment
            for segment in segments
            for norm_segment in self._normalize_segment_text(segment)
        ]

    def _normalize_segment_text(self, segment: Segment):
        """
        Apply the replacement rules to the text of one segment.

        Parameters
        ----------
        segment
            Segment whose text must be normalized

        Yields
        ------
        Segment
            A single new segment labelled with `output_label`, holding the
            text and spans returned by `span_utils.replace`
        """
        ranges = []
        replacement_texts = []

        # Without rules the combined pattern is empty and would match the
        # empty string at every position: skip matching altogether.
        if self.rules:
            for match in self._pattern.finditer(segment.text):
                for rule, group_index in zip(self.rules, self._rule_group_indices):
                    if match.group(group_index) is not None:
                        # Range and replacement are appended together so both
                        # lists always stay aligned.
                        ranges.append(match.span())
                        replacement_texts.append(rule.new_text)
                        break

        new_text, new_spans = span_utils.replace(
            text=segment.text,
            spans=segment.spans,
            ranges=ranges,
            replacement_texts=replacement_texts,
        )
        normalized_text = Segment(
            label=self.output_label, spans=new_spans, text=new_text
        )

        # _prov_tracer and description are not defined in this class;
        # presumably they are provided by the Operation base class.
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(
                normalized_text, self.description, source_data_items=[segment]
            )

        yield normalized_text