# Source code for medkit.text.segmentation.rush_sentence_tokenizer

"""
This module needs extra dependencies that are not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[rush-sentence-tokenizer]`.
"""

from __future__ import annotations

__all__ = ["RushSentenceTokenizer"]

import dataclasses
from pathlib import Path
import re
from typing import Iterator, List, Optional, Union

from PyRuSH import RuSH

from medkit.core.text import Segment, SegmentationOperation, span_utils


@dataclasses.dataclass(frozen=True)
class DefaultConfig:
    """Default values used for the init parameters of `RushSentenceTokenizer`.

    The attributes are annotated so that `dataclasses` registers them as
    actual fields (un-annotated class attributes are ignored by the
    decorator). They remain readable directly on the class, e.g.
    `DefaultConfig.output_label`.
    """

    # Label given to the sentence segments that are created
    output_label: str = "SENTENCE"
    # Path to the rules file; None means no custom rules file was provided
    path_to_rules: Optional[Union[str, Path]] = None
    # Whether newline chars found inside a sentence are kept as is
    keep_newlines: bool = True


# Rules file located in the same directory as this module; it is the fallback
# used by RushSentenceTokenizer when no `path_to_rules` is given.
_PATH_TO_DEFAULT_RULES = Path(__file__).parent.joinpath(
    "rush_sentence_tokenizer_default_rules.tsv"
)


class RushSentenceTokenizer(SegmentationOperation):
    """Sentence segmentation annotator based on PyRuSH."""

    def __init__(
        self,
        output_label: str = DefaultConfig.output_label,
        path_to_rules: Optional[Union[str, Path]] = DefaultConfig.path_to_rules,
        keep_newlines: bool = DefaultConfig.keep_newlines,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the RuSH tokenizer

        Parameters
        ----------
        output_label:
            The output label of the created annotations.
            Default: "SENTENCE" (cf. DefaultConfig)
        path_to_rules:
            Path to csv or tsv file to provide to PyRuSH. If none provided,
            "rush_sentence_tokenizer_default_rules.tsv" will be used
            (corresponds to the "conf/rush_rules.tsv" in the PyRush repo)
        keep_newlines:
            With the default rules, newline chars are not used to split
            sentences, therefore a sentence may contain one or more newline
            chars. If `keep_newlines` is False, newlines will be replaced by
            spaces.
        uid:
            Identifier of the tokenizer
        """
        # Pass all arguments to super (remove self).
        # This must run before any other local variable is bound, otherwise
        # that variable would be forwarded to the parent as well. Note that
        # `path_to_rules` is forwarded as received (possibly None), before the
        # default rules path is substituted below.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        if path_to_rules is None:
            path_to_rules = _PATH_TO_DEFAULT_RULES

        self.output_label = output_label
        self.path_to_rules = path_to_rules
        self.keep_newlines = keep_newlines
        # RuSH is given the path as a string, whatever type was received
        self._rush = RuSH(str(path_to_rules))

    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Return sentences detected in `segments`.

        Parameters
        ----------
        segments:
            List of segments into which to look for sentences

        Returns
        -------
        List[Segment]:
            Sentences segments found in `segments`
        """
        return [
            sentence
            for segment in segments
            for sentence in self._find_sentences_in_segment(segment)
        ]

    def _find_sentences_in_segment(self, segment: Segment) -> Iterator[Segment]:
        """
        Yield one sentence segment for each span returned by RuSH on the
        text of `segment`.

        Parameters
        ----------
        segment:
            Segment in which to look for sentences

        Yields
        ------
        Segment:
            Sentence segment labelled with `self.output_label`
        """
        rush_spans = self._rush.segToSentenceSpans(segment.text)
        for rush_span in rush_spans:
            # Extract the sentence text along with its corresponding spans
            text, spans = span_utils.extract(
                text=segment.text,
                spans=segment.spans,
                ranges=[(rush_span.begin, rush_span.end)],
            )

            if not self.keep_newlines:
                # Replace each newline char by a single space; one replacement
                # text is provided per range, as a list of strings
                ranges = [m.span() for m in re.finditer(r"\n", text)]
                replacements = [" "] * len(ranges)
                text, spans = span_utils.replace(text, spans, ranges, replacements)

            sentence = Segment(
                label=self.output_label,
                spans=spans,
                text=text,
            )

            # Record provenance only when a tracer has been set
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    sentence, self.description, source_data_items=[segment]
                )

            yield sentence