# Source code for medkit.text.segmentation.sentence_tokenizer

from __future__ import annotations

__all__ = ["SentenceTokenizer"]

import re
from typing import Iterator, List, Optional, Tuple

from medkit.core.text import Segment, SegmentationOperation, span_utils


class SentenceTokenizer(SegmentationOperation):
    """Sentence segmentation annotator based on end punctuation rules.

    Each input segment is optionally split on newlines first, and each
    resulting chunk is then split on runs of end punctuation characters.
    Chunks that do not contain any word character are discarded.
    """

    _DEFAULT_LABEL = "sentence"
    _DEFAULT_PUNCT_CHARS = (".", ";", "?", "!")
    # A chunk is only kept when it contains at least one "word" character
    # (``\w``: letter, digit or underscore). Compiled once for all calls.
    _WORD_CHAR_PATTERN = re.compile(r"\w")

    def __init__(
        self,
        output_label: str = _DEFAULT_LABEL,
        punct_chars: Tuple[str, ...] = _DEFAULT_PUNCT_CHARS,
        keep_punct: bool = False,
        split_on_newlines: bool = True,
        attrs_to_copy: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the sentence tokenizer

        Parameters
        ----------
        output_label: str, Optional
            The output label of the created annotations.
        punct_chars: Tuple[str, ...], Optional
            The set of characters corresponding to end punctuations.
            The strings are concatenated and every resulting character is
            treated as an end punctuation.
        keep_punct: bool, Optional
            If True, the end punctuations are kept in the detected sentence.
            If False, the sentence text does not include the end punctuations.
        split_on_newlines: bool, Optional
            Whether to consider that newlines characters are sentence boundaries or not.
        attrs_to_copy: List[str], Optional
            Labels of the attributes that should be copied from the input segment
            to the derived segment. For example, useful for propagating section name.
        uid: str, Optional
            Identifier of the tokenizer

        Raises
        ------
        ValueError
            If `punct_chars` does not contain any character.
        """
        # Pass all arguments to super (remove self).
        # NOTE: locals() must be read before any other local variable is
        # created, so that it only holds the constructor arguments.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        if attrs_to_copy is None:
            attrs_to_copy = []

        self.output_label = output_label
        self.punct_chars = punct_chars
        self.keep_punct = keep_punct
        self.split_on_newlines = split_on_newlines
        self.attrs_to_copy = attrs_to_copy

        # pre-compile patterns
        # Both patterns skip leading spaces, capture a "content" group and then
        # a "separator" group (a run of separator characters, or the end of
        # the text).
        # NOTE: the content group is greedy and also accepts spaces, so the
        # " *" that follows it never consumes anything; it is left untouched
        # to keep the produced offsets unchanged.
        self._newline_pattern = re.compile(
            r" *(?P<content>[^\n\r]+) *(?P<separator>[\n\r]+|$)"
        )
        punct_string = re.escape("".join(self.punct_chars))
        if not punct_string:
            # an empty character class would otherwise fail with an obscure
            # regular expression compilation error
            raise ValueError("punct_chars must contain at least one character")
        self._punct_pattern = re.compile(
            rf" *(?P<content>[^{punct_string}]+) *(?P<separator>[{punct_string}]+|$)"
        )

    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Return sentences detected in `segments`.

        Parameters
        ----------
        segments:
            List of segments into which to look for sentences

        Returns
        -------
        List[Segment]:
            Sentences segments found in `segments`
        """
        return [
            sentence
            for segment in segments
            for sentence in self._find_sentences_in_segment(segment)
        ]

    def _find_sentences_in_segment(self, segment: Segment) -> Iterator[Segment]:
        """
        Yield one sentence segment per sentence detected in `segment`.

        Parameters
        ----------
        segment:
            Segment whose text must be split into sentences

        Yields
        ------
        Segment:
            Sentence built from a character range of `segment.text`
        """
        text = segment.text

        # split directly on punct chars
        if not self.split_on_newlines:
            for start, end in self._split_text(
                text, self._punct_pattern, keep_separator=self.keep_punct
            ):
                yield self._build_sentence(segment, range=(start, end))
            return

        # or split on newlines (discarding newline chars) then split each line
        # on punct chars
        for line_start, line_end in self._split_text(
            text, self._newline_pattern, keep_separator=False
        ):
            line = text[line_start:line_end]
            for sub_start, sub_end in self._split_text(
                line, self._punct_pattern, keep_separator=self.keep_punct
            ):
                # offsets found in the line are relative to the line start,
                # shift them back to offsets in the segment text
                start = line_start + sub_start
                end = line_start + sub_end
                yield self._build_sentence(segment, range=(start, end))

    @staticmethod
    def _split_text(
        text: str, pattern: re.Pattern, keep_separator: bool
    ) -> Iterator[Tuple[int, int]]:
        """
        Yield the character ranges of the chunks of `text` matched by `pattern`.

        Parameters
        ----------
        text:
            Text to split
        pattern:
            Compiled pattern with a "content" and a "separator" named group
        keep_separator:
            If True, the range ends at the end of the "separator" group,
            otherwise it ends at the end of the "content" group

        Yields
        ------
        Tuple[int, int]:
            Start (inclusive) and end (exclusive) offsets in `text` of each
            non-empty chunk containing at least one word character
        """
        for match in pattern.finditer(text):
            start = match.start("content")
            end = match.end("separator") if keep_separator else match.end("content")
            # search only inside [start, end) without slicing the text; chunks
            # made only of spaces/punctuation are skipped
            if end > start and SentenceTokenizer._WORD_CHAR_PATTERN.search(
                text, start, end
            ):
                yield start, end

    def _build_sentence(
        self, source_segment: Segment, range: Tuple[int, int]
    ) -> Segment:
        """
        Build the sentence segment covering `range` in `source_segment`.

        Parameters
        ----------
        source_segment:
            Segment in which the sentence was found
        range:
            Start and end offsets of the sentence in `source_segment.text`

        Returns
        -------
        Segment:
            New segment labelled with `output_label`, holding copies of the
            attributes of `source_segment` whose label is in `attrs_to_copy`
        """
        # text and spans for the range, as computed by span_utils.extract()
        # (defined outside of this file)
        text, spans = span_utils.extract(
            text=source_segment.text,
            spans=source_segment.spans,
            ranges=[range],
        )

        sentence = Segment(
            label=self.output_label,
            spans=spans,
            text=text,
        )

        # Copy inherited attributes
        for label in self.attrs_to_copy:
            for attr in source_segment.attrs.get(label=label):
                copied_attr = attr.copy()
                sentence.attrs.add(copied_attr)
                # handle provenance
                if self._prov_tracer is not None:
                    self._prov_tracer.add_prov(copied_attr, self.description, [attr])

        # register the source segment as the origin of the sentence when a
        # provenance tracer is set
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(
                sentence, self.description, source_data_items=[source_segment]
            )

        return sentence