Source code for medkit.text.segmentation.sentence_tokenizer

from __future__ import annotations

__all__ = ["SentenceTokenizer"]

import re
from typing import Iterator, List, Optional, Tuple

from medkit.core.text import Segment, SegmentationOperation, span_utils


class SentenceTokenizer(SegmentationOperation):
    """Sentence segmentation annotator based on end punctuation rules.

    Each input segment is optionally split on newline characters first, then
    on runs of end punctuation characters. Candidate sentences that do not
    contain at least one word character are discarded.
    """

    _DEFAULT_LABEL = "sentence"
    _DEFAULT_PUNCT_CHARS = (".", ";", "?", "!")
    # Compiled once at class creation: used to reject candidate sentences
    # that contain no word character (letter, digit or underscore).
    _WORD_CHAR_PATTERN = re.compile(r"\w")

    def __init__(
        self,
        output_label: str = _DEFAULT_LABEL,
        punct_chars: Tuple[str, ...] = _DEFAULT_PUNCT_CHARS,
        keep_punct: bool = False,
        split_on_newlines: bool = True,
        attrs_to_copy: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the sentence tokenizer

        Parameters
        ----------
        output_label: str, Optional
            The output label of the created annotations.
        punct_chars: Tuple[str, ...], Optional
            The set of characters corresponding to end punctuations.
            Every character of every string in the tuple is treated as an
            end punctuation. If empty, no splitting on punctuation is done.
        keep_punct: bool, Optional
            If True, the end punctuations are kept in the detected sentence.
            If False, the sentence text does not include the end punctuations.
        split_on_newlines: bool, Optional
            Whether to consider that newlines characters are sentence
            boundaries or not.
        attrs_to_copy: List[str], Optional
            Labels of the attributes that should be copied from the input
            segment to the derived segment. For example, useful for
            propagating section name.
        uid: str, Optional
            Identifier of the tokenizer
        """
        # Pass all arguments to super (remove self).
        # This must remain the first statement: any local variable created
        # before the call to locals() would be forwarded to the parent too.
        # NOTE(review): since this method uses zero-argument super(), the
        # dict returned by locals() also holds the implicit `__class__`
        # entry, which is forwarded as well -- confirm the parent class
        # expects/tolerates it.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        if attrs_to_copy is None:
            attrs_to_copy = []

        self.output_label = output_label
        self.punct_chars = punct_chars
        self.keep_punct = keep_punct
        self.split_on_newlines = split_on_newlines
        self.attrs_to_copy = attrs_to_copy

        # pre-compile patterns
        # Both patterns expose a "content" group (the candidate sentence,
        # leading spaces skipped) and a "separator" group (what ended it).
        self._newline_pattern = re.compile(
            r" *(?P<content>[^\n\r]+) *(?P<separator>[\n\r]+|$)"
        )
        punct_string = re.escape("".join(self.punct_chars))
        if punct_string:
            self._punct_pattern = re.compile(
                rf" *(?P<content>[^{punct_string}]+) *(?P<separator>[{punct_string}]+|$)"
            )
        else:
            # Without any punctuation char the character classes above would
            # be empty, which is not a valid regexp. Use a pattern with the
            # same groups that takes the whole text as a single content.
            self._punct_pattern = re.compile(
                r" *(?P<content>[\s\S]+) *(?P<separator>$)"
            )

    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Return sentences detected in `segments`.

        Parameters
        ----------
        segments: List[Segment]
            List of segments into which to look for sentences

        Returns
        -------
        List[Segment]:
            Sentences segments found in `segments`
        """
        return [
            sentence
            for segment in segments
            for sentence in self._find_sentences_in_segment(segment)
        ]

    def _find_sentences_in_segment(self, segment: Segment) -> Iterator[Segment]:
        """Yield a sentence segment for each sentence found in `segment`."""
        # split on newlines (discarding newline chars) then split each line
        # on punct chars
        if self.split_on_newlines:
            for line_start, line_end in self._split_text(
                segment.text, self._newline_pattern, keep_separator=False
            ):
                sub_text = segment.text[line_start:line_end]
                for sub_start, sub_end in self._split_text(
                    sub_text, self._punct_pattern, keep_separator=self.keep_punct
                ):
                    # offsets in sub_text are relative to the start of the line
                    start = line_start + sub_start
                    end = line_start + sub_end
                    yield self._build_sentence(segment, range=(start, end))
        # or split directly on punct chars
        else:
            for start, end in self._split_text(
                segment.text, self._punct_pattern, keep_separator=self.keep_punct
            ):
                yield self._build_sentence(segment, range=(start, end))

    @staticmethod
    def _split_text(
        text: str, pattern: re.Pattern, keep_separator: bool
    ) -> Iterator[Tuple[int, int]]:
        """
        Yield the (start, end) character ranges of the pieces of `text`
        matched by `pattern`.

        Parameters
        ----------
        text: str
            Text to split
        pattern: re.Pattern
            Compiled pattern with a "content" and a "separator" named group
        keep_separator: bool
            If True, the range ends after the separator, otherwise it ends
            after the content

        Returns
        -------
        Iterator[Tuple[int, int]]:
            Ranges of the non-empty pieces containing at least one word
            character
        """
        find_word_char = SentenceTokenizer._WORD_CHAR_PATTERN.search
        for match in pattern.finditer(text):
            start = match.start("content")
            end = match.end("separator") if keep_separator else match.end("content")
            # skip empty pieces and pieces made only of non-word characters
            if end > start and find_word_char(text[start:end]):
                yield start, end

    def _build_sentence(
        self, source_segment: Segment, range: Tuple[int, int]
    ) -> Segment:
        """
        Build the sentence segment covering `range` in `source_segment`.

        Parameters
        ----------
        source_segment: Segment
            Segment in which the sentence was found
        range: Tuple[int, int]
            Start and end character offsets of the sentence in the text of
            `source_segment`

        Returns
        -------
        Segment:
            Sentence segment labelled with `output_label`, holding copies of
            the attributes of `source_segment` listed in `attrs_to_copy`
        """
        text, spans = span_utils.extract(
            text=source_segment.text,
            spans=source_segment.spans,
            ranges=[range],
        )

        sentence = Segment(
            label=self.output_label,
            spans=spans,
            text=text,
        )

        # Copy inherited attributes
        for label in self.attrs_to_copy:
            for attr in source_segment.attrs.get(label=label):
                copied_attr = attr.copy()
                sentence.attrs.add(copied_attr)
                # handle provenance
                if self._prov_tracer is not None:
                    self._prov_tracer.add_prov(copied_attr, self.description, [attr])

        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(
                sentence, self.description, source_data_items=[source_segment]
            )

        return sentence