# Source code for medkit.text.segmentation.rush_sentence_tokenizer

"""
This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[rush-sentence-tokenizer]`.
"""

from __future__ import annotations

__all__ = ["RushSentenceTokenizer"]

from pathlib import Path
import re
from typing import Iterator, List, Optional, Union

from PyRuSH import RuSH

from medkit.core.text import Segment, SegmentationOperation, span_utils


# Default PyRuSH rules file, looked up in the same directory as this module.
_PATH_TO_DEFAULT_RULES = Path(__file__).with_name(
    "rush_sentence_tokenizer_default_rules.tsv"
)


class RushSentenceTokenizer(SegmentationOperation):
    """Sentence segmentation annotator based on PyRuSH."""

    _DEFAULT_LABEL = "sentence"

    def __init__(
        self,
        output_label: str = _DEFAULT_LABEL,
        path_to_rules: Optional[Union[str, Path]] = None,
        keep_newlines: bool = True,
        attrs_to_copy: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the RuSH tokenizer

        Parameters
        ----------
        output_label:
            Label given to the sentence segments that are created.
        path_to_rules:
            Path to the csv or tsv rules file handed to PyRuSH. When not
            provided, "rush_sentence_tokenizer_default_rules.tsv" is used
            (corresponds to "conf/rush_rules.tsv" in the PyRuSH repo).
        keep_newlines:
            With the default rules, newline chars are not used to split
            sentences, therefore a sentence may contain one or more newline
            chars. If `keep_newlines` is False, each newline is replaced by
            a space.
        attrs_to_copy:
            Labels of the attributes that should be copied from the input
            segment to the derived sentence segment. For example, useful for
            propagating a section name.
        uid:
            Identifier of the tokenizer.
        """
        # Forward the constructor arguments to the parent class. `locals()`
        # must be captured here, before any additional local variable is
        # created; `self` is dropped before forwarding.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.output_label = output_label
        self.path_to_rules = (
            _PATH_TO_DEFAULT_RULES if path_to_rules is None else path_to_rules
        )
        self.keep_newlines = keep_newlines
        self.attrs_to_copy = [] if attrs_to_copy is None else attrs_to_copy
        # PyRuSH is given the rules location as a plain string
        self._rush = RuSH(str(self.path_to_rules))

    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Return sentences detected in `segments`.

        Parameters
        ----------
        segments:
            List of segments in which to look for sentences

        Returns
        -------
        List[Segment]:
            Sentence segments found in `segments`, in input order
        """
        sentences = []
        for segment in segments:
            sentences.extend(self._find_sentences_in_segment(segment))
        return sentences

    def _find_sentences_in_segment(self, segment: Segment) -> Iterator[Segment]:
        """Yield one sentence segment per span PyRuSH reports for `segment`."""
        for sentence_span in self._rush.segToSentenceSpans(segment.text):
            # Cut the sentence out of the segment, keeping its spans in sync
            sentence_text, sentence_spans = span_utils.extract(
                text=segment.text,
                spans=segment.spans,
                ranges=[(sentence_span.begin, sentence_span.end)],
            )

            if not self.keep_newlines:
                # Swap every newline for a single space
                newline_ranges = [
                    match.span() for match in re.finditer(r"\n", sentence_text)
                ]
                sentence_text, sentence_spans = span_utils.replace(
                    sentence_text,
                    sentence_spans,
                    newline_ranges,
                    [" "] * len(newline_ranges),
                )

            sentence = Segment(
                label=self.output_label,
                spans=sentence_spans,
                text=sentence_text,
            )

            # Propagate the requested attributes from the source segment
            for label in self.attrs_to_copy:
                for attr in segment.attrs.get(label=label):
                    copied_attr = attr.copy()
                    sentence.attrs.add(copied_attr)
                    # Record that the copy derives from the original attribute
                    if self._prov_tracer is not None:
                        self._prov_tracer.add_prov(
                            copied_attr, self.description, [attr]
                        )

            # Record that the sentence derives from the source segment
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    sentence, self.description, source_data_items=[segment]
                )

            yield sentence