# Source code for medkit.text.segmentation.rush_sentence_tokenizer

"""This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[rush-sentence-tokenizer]`.
"""

from __future__ import annotations

__all__ = ["RushSentenceTokenizer"]

import re
from pathlib import Path
from typing import Iterator

from PyRuSH import RuSH

from medkit.core.text import Segment, SegmentationOperation, span_utils

_PATH_TO_DEFAULT_RULES = Path(__file__).parent / "rush_sentence_tokenizer_default_rules.tsv"



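# [docs] -- presumably a documentation-link marker left over from the rendered source page; not part of the module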
class RushSentenceTokenizer(SegmentationOperation):
    """Sentence segmentation annotator based on PyRuSH.

    The text of each input segment is passed to a PyRuSH `RuSH` instance and
    one new segment (labelled `output_label`) is created for every sentence
    span it returns.
    """

    _DEFAULT_LABEL = "sentence"
    # Compiled once at class creation; used to locate newline chars in a
    # sentence when `keep_newlines` is False.
    _NEWLINE_PATTERN = re.compile(r"\n")

    def __init__(
        self,
        output_label: str = _DEFAULT_LABEL,
        path_to_rules: str | Path | None = None,
        keep_newlines: bool = True,
        attrs_to_copy: list[str] | None = None,
        uid: str | None = None,
    ):
        """Instantiate the RuSH tokenizer

        Parameters
        ----------
        output_label: str, optional
            The output label of the created annotations.
        path_to_rules: str or Path, optional
            Path to csv or tsv file to provide to PyRuSH. If none provided,
            the "rush_sentence_tokenizer_default_rules.tsv" file located next
            to this module will be used
            (corresponds to the "conf/rush_rules.tsv" in the PyRush repo)
        keep_newlines: bool, default=True
            With the default rules, newline chars are not used to split
            sentences, therefore a sentence may contain one or more newline
            chars. If `keep_newlines` is False, newlines will be replaced by
            spaces.
        attrs_to_copy: list of str, optional
            Labels of the attributes that should be copied from the input segment
            to the derived segment. For example, useful for propagating section name.
        uid: str, optional
            Identifier of the tokenizer
        """
        # Pass all arguments to super (remove self).
        # This must remain the first statement: `locals()` has to be captured
        # before any other local name is bound, and before the `None`
        # defaults are resolved below, so the parent receives the arguments
        # as they were given by the caller.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        if attrs_to_copy is None:
            attrs_to_copy = []

        if path_to_rules is None:
            path_to_rules = _PATH_TO_DEFAULT_RULES

        self.output_label = output_label
        self.path_to_rules = path_to_rules
        self.keep_newlines = keep_newlines
        self.attrs_to_copy = attrs_to_copy
        # RuSH is given a plain string, so Path objects are converted first
        self._rush = RuSH(str(path_to_rules))

    def run(self, segments: list[Segment]) -> list[Segment]:
        """Return sentences detected in `segments`.

        Parameters
        ----------
        segments: list of Segment
            List of segments into which to look for sentences

        Returns
        -------
        list of Segment:
            Sentences segments found in `segments`, in input order
            (all sentences of the first segment, then of the second, etc.)
        """
        return [sentence for segment in segments for sentence in self._find_sentences_in_segment(segment)]

    def _find_sentences_in_segment(self, segment: Segment) -> Iterator[Segment]:
        """Yield one sentence segment for each span PyRuSH finds in `segment`.

        Parameters
        ----------
        segment: Segment
            Segment whose text is split into sentences

        Yields
        ------
        Segment:
            A segment labelled `self.output_label` holding the sentence text
            and spans, with the attributes listed in `self.attrs_to_copy`
            copied from `segment`
        """
        rush_spans = self._rush.segToSentenceSpans(segment.text)
        for rush_span in rush_spans:
            # Get the sentence text together with the spans corresponding to
            # the range reported by PyRuSH
            text, spans = span_utils.extract(
                text=segment.text,
                spans=segment.spans,
                ranges=[(rush_span.begin, rush_span.end)],
            )

            if not self.keep_newlines:
                # Replace each newline char by a single space, going through
                # span_utils so that `spans` is updated along with `text`
                ranges = [m.span() for m in self._NEWLINE_PATTERN.finditer(text)]
                replacements = [" " for _ in ranges]
                text, spans = span_utils.replace(text, spans, ranges, replacements)

            sentence = Segment(
                label=self.output_label,
                spans=spans,
                text=text,
            )

            # Copy inherited attributes
            for label in self.attrs_to_copy:
                for attr in segment.attrs.get(label=label):
                    copied_attr = attr.copy()
                    sentence.attrs.add(copied_attr)
                    # handle provenance: the copy derives from the original attribute
                    if self._prov_tracer is not None:
                        self._prov_tracer.add_prov(copied_attr, self.description, [attr])

            # handle provenance: the sentence derives from the input segment
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(sentence, self.description, source_data_items=[segment])

            yield sentence