from __future__ import annotations
__all__ = ["SectionModificationRule", "SectionTokenizer"]
import dataclasses
import pathlib
from typing import Dict, Iterable, List, Optional, Tuple
from typing_extensions import Literal
import yaml
from flashtext import KeywordProcessor
from medkit.core import Attribute
from medkit.core.text import Segment, SegmentationOperation, span_utils
from medkit.core.text.utils import lstrip, rstrip
_PATH_TO_DEFAULT_RULES = (
pathlib.Path(__file__).parent / "default_section_definition.yml"
)
[docs]@dataclasses.dataclass
class SectionModificationRule:
section_name: str
new_section_name: str
other_sections: List[str]
order: Literal["BEFORE", "AFTER"]
[docs]class SectionTokenizer(SegmentationOperation):
"""Section segmentation annotator based on keyword rules"""
_DEFAULT_LABEL: str = "section"
_DEFAULT_STRIP_CHARS: str = ".;,?! \n\r\t"
def __init__(
self,
section_dict: Dict[str, List[str]] = None,
output_label: str = _DEFAULT_LABEL,
section_rules: Iterable[SectionModificationRule] = (),
strip_chars: str = _DEFAULT_STRIP_CHARS,
uid: Optional[str] = None,
):
"""
Initialize the Section Tokenizer
Parameters
----------
section_dict
Dictionary containing the section name as key and the list of mappings as
value. If None, the content of default_section_definition.yml will be used.
output_label
Segment label to use for annotation output.
section_rules
List of rules for modifying a section name according its order to the other
sections. If section_dict is None, the content of
default_section_definition.yml will be used.
strip_chars
The list of characters to strip at the beginning of the returned segment.
uid: str, Optional
Identifier of the tokenizer
"""
# Pass all arguments to super (remove self)
init_args = locals()
init_args.pop("self")
super().__init__(**init_args)
self.output_label = output_label
self.strip_chars = strip_chars
if section_dict is None:
section_dict, section_rules = self.load_section_definition(
_PATH_TO_DEFAULT_RULES, encoding="utf-8"
)
self.section_dict = section_dict
self.section_rules = tuple(section_rules)
self.keyword_processor = KeywordProcessor(case_sensitive=True)
self.keyword_processor.add_keywords_from_dict(section_dict)
[docs] def run(self, segments: List[Segment]) -> List[Segment]:
"""
Return sections detected in `segments`.
Each section is a segment with an attached attribute
(label: <same as self.output_label>, value: <the name of the section>).
Parameters
----------
segments:
List of segments into which to look for sections
Returns
-------
List[Segments]:
Sections segments found in `segments`
"""
return [
section
for segment in segments
for section in self._find_sections_in_segment(segment)
]
def _find_sections_in_segment(self, segment: Segment):
# Process mappings
match = self.keyword_processor.extract_keywords(segment.text, span_info=True)
# Sort according to the match start
match.sort(key=lambda x: x[1])
if len(match) == 0 or match[0][1] != 0:
# Add head before any detected sections
match.insert(0, ("head", 0, 0))
# Get sections to rename according defined rules
# e.g., set any 'traitement' section occurring before 'histoire' or 'evolution'
# to 'traitement entree' (cf. example)
new_sections = self._get_sections_to_rename(match)
for index, section in enumerate(match):
name = new_sections.get(index, section[0])
if index != len(match) - 1:
ranges = [(section[1], match[index + 1][1])]
else:
ranges = [(section[1], len(segment.text))]
# Remove extra characters at beginning of the detected segments
# and white spaces at end of the text
strip_ranges = []
for start, end in ranges:
text, new_start = lstrip(
segment.text[start:end], start, self.strip_chars
)
text, new_end = rstrip(text, end)
if len(text) == 0: # empty segment
continue
strip_ranges.append((new_start, new_end))
# Extract medkit spans from relative spans (i.e., ranges)
text, spans = span_utils.extract(
text=segment.text,
spans=segment.spans,
ranges=strip_ranges,
)
# add section name in metadata
metadata = dict(name=name)
section = Segment(
label=self.output_label,
spans=spans,
text=text,
metadata=metadata,
)
# add section name in section attribute
attr = Attribute(label=self.output_label, value=name)
section.attrs.add(attr)
if self._prov_tracer is not None:
self._prov_tracer.add_prov(
section, self.description, source_data_items=[segment]
)
self._prov_tracer.add_prov(
attr, self.description, source_data_items=[segment]
)
yield section
def _get_sections_to_rename(self, match: List[Tuple]):
match_type = [m[0] for m in match]
map_index_new_name = {}
list_to_process = ()
for rule in self.section_rules:
if rule.order == "BEFORE":
# Change section name if section is before one of the listed sections
list_to_process = enumerate(match_type)
elif rule.order == "AFTER":
# Change section name if the section is after one of the listed sections
list_to_process = reversed(list(enumerate(match_type)))
# Navigate in list according to the order defined above
candidate_sections = []
for index, section_name in list_to_process:
if section_name == rule.section_name:
candidate_sections.append(index)
if section_name in rule.other_sections:
for candidate_index in candidate_sections:
map_index_new_name[candidate_index] = rule.new_section_name
candidate_sections.clear()
return map_index_new_name
@classmethod
def get_example(cls):
config_path = _PATH_TO_DEFAULT_RULES
section_dict, section_rules = cls.load_section_definition(
config_path, encoding="utf-8"
)
section_tokenizer = cls(
section_dict=section_dict,
section_rules=section_rules,
)
return section_tokenizer
[docs] @staticmethod
def load_section_definition(
filepath: pathlib.Path, encoding: Optional[str] = None
) -> Tuple[Dict[str, List[str]], Tuple[SectionModificationRule, ...]]:
"""
Load the sections definition stored in a yml file
Parameters
----------
filepath:
Path to a yml file containing the sections(name + mappings) and rules
encoding:
Encoding of the file to open
Returns
-------
Tuple[Dict[str, List[str]], Tuple[SectionModificationRule, ...]]
Tuple containing:
- the dictionary where key is the section name and value is the list of all
equivalent strings.
- the list of section modification rules.
These rules allow to rename some sections according their order
"""
with open(filepath, mode="r", encoding=encoding) as f:
config = yaml.safe_load(f)
section_dict = config["sections"]
section_rules = tuple(
SectionModificationRule(**rule) for rule in config["rules"]
)
return section_dict, section_rules
[docs] @staticmethod
def save_section_definition(
section_dict: Dict[str, List[str]],
section_rules: Iterable[SectionModificationRule],
filepath: pathlib.Path,
encoding: Optional[str] = None,
):
"""
Save section yaml definition file
Parameters
----------
section_dict
Dictionary containing the section name as key and the list of mappings
as value (cf. content of default_section_dict.yml as example)
section_rules
List of rules for modifying a section name according its order to the other
sections.
filepath
Path to the file to save
encoding
File encoding. Default: None
"""
with open(filepath, mode="w", encoding=encoding) as f:
data = {"sections": section_dict, "rules": []}
for rule in section_rules:
data["rules"].append(dataclasses.asdict(rule))
yaml.safe_dump(data, f, allow_unicode=True, encoding=encoding)