# Source code for medkit.text.segmentation.section_tokenizer

from __future__ import annotations

# Names exported by this module
__all__ = ["SectionModificationRule", "SectionTokenizer"]

import dataclasses
import pathlib
from typing import Dict, Iterable, List, Optional, Tuple
from typing_extensions import Literal
import yaml

from flashtext import KeywordProcessor

from medkit.core import Attribute
from medkit.core.text import Segment, SegmentationOperation, span_utils
from medkit.core.text.utils import lstrip, rstrip


# Path to the YAML section definition file located in the same directory as
# this module; SectionTokenizer loads it when no `section_dict` is provided.
_PATH_TO_DEFAULT_RULES = (
    pathlib.Path(__file__).parent / "default_section_definition.yml"
)


@dataclasses.dataclass
class SectionModificationRule:
    """
    Rule renaming a section according to its position relative to other sections.

    Attributes
    ----------
    section_name
        Name of the section the rule applies to.
    new_section_name
        Name given to the section when the rule matches.
    other_sections
        Names of the sections that trigger the renaming.
    order
        "BEFORE" to rename the section when it occurs before one of
        `other_sections`, "AFTER" to rename it when it occurs after one of them.
    """

    section_name: str
    new_section_name: str
    other_sections: List[str]
    order: Literal["BEFORE", "AFTER"]


class SectionTokenizer(SegmentationOperation):
    """Section segmentation annotator based on keyword rules"""

    _DEFAULT_LABEL: str = "section"
    _DEFAULT_STRIP_CHARS: str = ".;,?! \n\r\t"

    def __init__(
        self,
        section_dict: Optional[Dict[str, List[str]]] = None,
        output_label: str = _DEFAULT_LABEL,
        section_rules: Iterable[SectionModificationRule] = (),
        strip_chars: str = _DEFAULT_STRIP_CHARS,
        uid: Optional[str] = None,
    ):
        """
        Initialize the Section Tokenizer

        Parameters
        ----------
        section_dict
            Dictionary containing the section name as key and the list of mappings as
            value. If None, the content of default_section_definition.yml will be used.
        output_label
            Segment label to use for annotation output.
        section_rules
            List of rules for modifying a section name according to its order
            relative to the other sections. If section_dict is None, the rules of
            default_section_definition.yml will be used instead.
        strip_chars
            The list of characters to strip at the beginning of the returned segment.
        uid: str, Optional
            Identifier of the tokenizer
        """
        # Pass all arguments to super (remove self).
        # NOTE: locals() must stay the first statement so that it only
        # captures the constructor arguments.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.output_label = output_label
        self.strip_chars = strip_chars

        if section_dict is None:
            # Fall back on the default definition (sections AND rules)
            section_dict, section_rules = self.load_section_definition(
                _PATH_TO_DEFAULT_RULES, encoding="utf-8"
            )

        self.section_dict = section_dict
        self.section_rules = tuple(section_rules)

        self.keyword_processor = KeywordProcessor(case_sensitive=True)
        self.keyword_processor.add_keywords_from_dict(section_dict)

    def run(self, segments: List[Segment]) -> List[Segment]:
        """
        Return sections detected in `segments`.
        Each section is a segment with an attached attribute
        (label: <same as self.output_label>, value: <the name of the section>).


        Parameters
        ----------
        segments:
            List of segments into which to look for sections

        Returns
        -------
        List[Segments]:
            Sections segments found in `segments`
        """
        return [
            section
            for segment in segments
            for section in self._find_sections_in_segment(segment)
        ]

    def _find_sections_in_segment(self, segment: Segment) -> Iterable[Segment]:
        """
        Yield one section segment per section detected in `segment`.

        The text located before the first detected section keyword is returned
        as a section named "head". Sections whose text is empty once stripped
        are not returned.
        """
        full_text = segment.text

        # Process mappings
        matches = self.keyword_processor.extract_keywords(full_text, span_info=True)

        # Sort according to the match start
        matches.sort(key=lambda m: m[1])
        if not matches or matches[0][1] != 0:
            # Add head before any detected sections
            matches.insert(0, ("head", 0, 0))

        # Get sections to rename according defined rules
        # e.g., set any 'traitement' section occurring before 'histoire' or 'evolution'
        # to 'traitement entree' (cf. example)
        new_names = self._get_sections_to_rename(matches)

        last_index = len(matches) - 1
        for index, match in enumerate(matches):
            name = new_names.get(index, match[0])

            # A section runs from its keyword up to the next keyword
            # (or up to the end of the text for the last section)
            start = match[1]
            end = matches[index + 1][1] if index != last_index else len(full_text)

            # Remove extra characters at beginning of the detected section
            # and white spaces at end of the text
            stripped, new_start = lstrip(
                full_text[start:end], start, self.strip_chars
            )
            stripped, new_end = rstrip(stripped, end)
            if not stripped:
                # Nothing left after stripping: do not emit a section with
                # an empty text and no spans
                continue

            # Extract medkit spans from relative spans (i.e., ranges)
            section_text, spans = span_utils.extract(
                text=full_text,
                spans=segment.spans,
                ranges=[(new_start, new_end)],
            )

            # add section name in metadata
            section = Segment(
                label=self.output_label,
                spans=spans,
                text=section_text,
                metadata=dict(name=name),
            )

            # add section name in section attribute
            attr = Attribute(label=self.output_label, value=name)
            section.attrs.add(attr)

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    section, self.description, source_data_items=[segment]
                )
                self._prov_tracer.add_prov(
                    attr, self.description, source_data_items=[segment]
                )

            yield section

    def _get_sections_to_rename(self, match: List[Tuple]) -> Dict[int, str]:
        """
        Apply the section modification rules to the detected sections.

        Parameters
        ----------
        match
            Detected sections sorted by start position; the first item of each
            tuple is the section name.

        Returns
        -------
        Dict[int, str]
            Mapping between the index (in `match`) of each section to rename
            and its new name.
        """
        section_names = [m[0] for m in match]
        map_index_new_name: Dict[int, str] = {}

        for rule in self.section_rules:
            indexed_names = list(enumerate(section_names))
            if rule.order == "BEFORE":
                # Change section name if section is before one of the listed
                # sections: walk the sections from first to last
                pass
            elif rule.order == "AFTER":
                # Change section name if the section is after one of the listed
                # sections: walk the sections from last to first
                indexed_names.reverse()
            else:
                # Unknown order: the rule has no effect
                continue

            # Sections named `rule.section_name` seen since the last trigger
            candidate_sections: List[int] = []
            for index, section_name in indexed_names:
                if section_name == rule.section_name:
                    candidate_sections.append(index)
                if section_name in rule.other_sections:
                    for candidate_index in candidate_sections:
                        map_index_new_name[candidate_index] = rule.new_section_name
                    candidate_sections.clear()

        return map_index_new_name

    @classmethod
    def get_example(cls):
        """Return a tokenizer built from the default section definition file."""
        section_dict, section_rules = cls.load_section_definition(
            _PATH_TO_DEFAULT_RULES, encoding="utf-8"
        )
        return cls(
            section_dict=section_dict,
            section_rules=section_rules,
        )

    @staticmethod
    def load_section_definition(
        filepath: pathlib.Path, encoding: Optional[str] = None
    ) -> Tuple[Dict[str, List[str]], Tuple[SectionModificationRule, ...]]:
        """
        Load the sections definition stored in a yml file

        Parameters
        ----------
        filepath:
            Path to a yml file containing the sections(name + mappings) and rules
        encoding:
            Encoding of the file to open

        Returns
        -------
        Tuple[Dict[str, List[str]], Tuple[SectionModificationRule, ...]]
            Tuple containing:
            - the dictionary where key is the section name and value is the list of all
            equivalent strings.
            - the list of section modification rules.
            These rules allow to rename some sections according their order.
            Empty if the file does not define any rule.
        """

        with open(filepath, mode="r", encoding=encoding) as f:
            config = yaml.safe_load(f)

        section_dict = config["sections"]
        # Rules are optional: tolerate a missing or empty "rules" entry
        section_rules = tuple(
            SectionModificationRule(**rule) for rule in config.get("rules") or ()
        )

        return section_dict, section_rules

    @staticmethod
    def save_section_definition(
        section_dict: Dict[str, List[str]],
        section_rules: Iterable[SectionModificationRule],
        filepath: pathlib.Path,
        encoding: Optional[str] = None,
    ):
        """
        Save section yaml definition file

        Parameters
        ----------
        section_dict
            Dictionary containing the section name as key and the list of mappings
            as value (cf. content of default_section_dict.yml as example)
        section_rules
            List of rules for modifying a section name according to its order
            relative to the other sections.
        filepath
            Path to the file to save
        encoding
            File encoding. Default: None
        """
        # Build the whole payload before opening the file so that a failure
        # while converting the rules does not leave a truncated file behind
        data = {
            "sections": section_dict,
            "rules": [dataclasses.asdict(rule) for rule in section_rules],
        }
        with open(filepath, mode="w", encoding=encoding) as f:
            yaml.safe_dump(data, f, allow_unicode=True, encoding=encoding)