Source code for medkit.text.segmentation.rush_sentence_tokenizer

"""
This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[rush-sentence-tokenizer]`.
"""

from __future__ import annotations

__all__ = ["RushSentenceTokenizer"]

from pathlib import Path
import re
from typing import Iterator, List, Optional, Union

from PyRuSH import RuSH

from medkit.core.text import Segment, SegmentationOperation, span_utils


_PATH_TO_DEFAULT_RULES = (
    Path(__file__).parent / "rush_sentence_tokenizer_default_rules.tsv"
)


[docs]class RushSentenceTokenizer(SegmentationOperation):
    """Sentence segmentation annotator based on PyRuSH."""

    _DEFAULT_LABEL = "sentence"

    def __init__(
        self,
        output_label: str = _DEFAULT_LABEL,
        path_to_rules: Optional[Union[str, Path]] = None,
        keep_newlines: bool = True,
        attrs_to_copy: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the RuSH tokenizer

        Parameters
        ----------
        output_label:
            The output label of the created annotations.
        path_to_rules:
            Path to csv or tsv file to provide to PyRuSH. If none provided,
            "rush_tokenizer_default_rules.tsv" will be used
            (corresponds to the "conf/rush_rules.tsv" in the PyRush repo)
        keep_newlines:
            With the default rules, newline chars are not used to split
            sentences, therefore a sentence maybe contain one or more newline chars.
            If `keep_newlines` is False, newlines will be replaced by spaces.
        attrs_to_copy:
            Labels of the attributes that should be copied from the input segment
            to the derived segment. For example, useful for propagating section name.

        uid:
            Identifier of the tokenizer
        """
        # Pass all arguments to super (remove self)
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        if attrs_to_copy is None:
            attrs_to_copy = []

        if path_to_rules is None:
            path_to_rules = _PATH_TO_DEFAULT_RULES

        self.output_label = output_label
        self.path_to_rules = path_to_rules
        self.keep_newlines = keep_newlines
        self.attrs_to_copy = attrs_to_copy
        self._rush = RuSH(str(path_to_rules))

[docs]    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Return sentences detected in `segments`.

        Parameters
        ----------
        segments:
            List of segments into which to look for sentences

        Returns
        -------
        List[Segments]:
            Sentences segments found in `segments`
        """
        return [
            sentence
            for segment in segments
            for sentence in self._find_sentences_in_segment(segment)
        ]

    def _find_sentences_in_segment(self, segment: Segment) -> Iterator[Segment]:
        rush_spans = self._rush.segToSentenceSpans(segment.text)
        for rush_span in rush_spans:
            text, spans = span_utils.extract(
                text=segment.text,
                spans=segment.spans,
                ranges=[(rush_span.begin, rush_span.end)],
            )

            if not self.keep_newlines:
                ranges = [m.span() for m in re.finditer(r"\n", text)]
                replacements = [" " for _ in ranges]
                text, spans = span_utils.replace(text, spans, ranges, replacements)

            sentence = Segment(
                label=self.output_label,
                spans=spans,
                text=text,
            )

            # Copy inherited attributes
            for label in self.attrs_to_copy:
                for attr in segment.attrs.get(label=label):
                    copied_attr = attr.copy()
                    sentence.attrs.add(copied_attr)
                    # handle provenance
                    if self._prov_tracer is not None:
                        self._prov_tracer.add_prov(
                            copied_attr, self.description, [attr]
                        )

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    sentence, self.description, source_data_items=[segment]
                )

            yield sentence