# Source code for medkit.text.ner.simstring_matcher

from __future__ import annotations

# Names exported by `from <this module> import *`.
__all__ = ["SimstringMatcher", "SimstringMatcherRule", "SimstringMatcherNormalization"]

import dataclasses
import tempfile
from pathlib import Path
from typing import Any

import yaml
from typing_extensions import Literal

from medkit.text.ner._base_simstring_matcher import (
    BaseSimstringMatcher,
    BaseSimstringMatcherNormalization,
    BaseSimstringMatcherRule,
    build_simstring_matcher_databases,
)

# File names of the two databases that SimstringMatcher.__init__ builds inside
# its temporary directory: the rules database and the simstring database.
_RULES_DB_FILENAME = "rules.db"
_SIMSTRING_DB_FILENAME = "simstring"



[docs]
@dataclasses.dataclass
class SimstringMatcherRule(BaseSimstringMatcherRule):
    """Matching rule consumed by :class:`~.SimstringMatcher`

    Attributes
    ----------
    term:
        Term that is searched for with similarity-based fuzzy matching
    label:
        Label given to the entities created when the rule matches
    case_sensitive:
        Whether case is taken into account when looking for matches.
    unicode_sensitive:
        Whether to use ASCII-only versions of the rule term and input texts when
        looking for matches (non-ASCII chars replaced by closest ASCII chars).
    normalizations:
        Optional list of normalization attributes to attach to the created
        entities
    """

    @staticmethod
    def from_dict(data: dict[str, Any]) -> SimstringMatcherRule:
        """Build a SimstringMatcherRule out of its dict representation.

        Every key ("term", "label", "case_sensitive", "unicode_sensitive" and
        "normalizations") is mandatory; a missing one raises `KeyError`.
        """
        # Plain fields are copied as-is, in declaration order.
        plain_keys = ("term", "label", "case_sensitive", "unicode_sensitive")
        init_kwargs = {key: data[key] for key in plain_keys}

        # Nested normalizations are dicts too and get their own conversion.
        init_kwargs["normalizations"] = list(
            map(SimstringMatcherNormalization.from_dict, data["normalizations"])
        )
        return SimstringMatcherRule(**init_kwargs)





[docs]
class SimstringMatcherNormalization(BaseSimstringMatcherNormalization):
    """Descriptor of normalization attributes to attach to entities
    created from a :class:`~.SimstringMatcherRule`

    Attributes
    ----------
    kb_name:
        The name of the knowledge base we are referencing. Ex: "umls"
    kb_version:
        The version of the knowledge base we are referencing. Ex: "202AB"
    kb_id:
        The id of the entity in the knowledge base, for instance a CUI
    term:
        Optional normalized version of the entity text in the knowledge base
    """

    @staticmethod
    def from_dict(data: dict[str, Any]) -> SimstringMatcherNormalization:
        """Creates a SimstringMatcherNormalization object from a dict

        Parameters
        ----------
        data:
            Mapping holding the keys "kb_name", "kb_version" and "term", plus
            the knowledge base id under either "kb_id" or the legacy "id" key.

        Returns
        -------
        SimstringMatcherNormalization
            The normalization descriptor built from `data`.

        Raises
        ------
        KeyError
            If a required key is missing (for the id: if neither "kb_id" nor
            "id" is present).
        """
        # "kb_id" is the constructor keyword, hence the key found in dicts that
        # mirror the object's fields (as in the rules yml files). The historical
        # "id" key is still accepted so existing callers keep working.
        kb_id = data["kb_id"] if "kb_id" in data else data["id"]
        return SimstringMatcherNormalization(
            kb_name=data["kb_name"],
            kb_version=data["kb_version"],
            kb_id=kb_id,
            term=data["term"],
        )





[docs]
class SimstringMatcher(BaseSimstringMatcher):
    """Entity matcher relying on string similarity

    Uses the `simstring` fuzzy matching algorithm
    (http://chokkan.org/software/simstring/).

    Note that providing a language code in `spacy_tokenization_language` might
    reduce the number of false positives. This requires the `spacy` optional
    dependency, which can be installed with `pip install medkit-lib[spacy]`.
    """

    def __init__(
        self,
        rules: list[SimstringMatcherRule],
        threshold: float = 0.9,
        min_length: int = 3,
        max_length: int = 50,
        similarity: Literal["cosine", "dice", "jaccard", "overlap"] = "jaccard",
        spacy_tokenization_language: str | None = None,
        blacklist: list[str] | None = None,
        same_beginning: bool = False,
        attrs_to_copy: list[str] | None = None,
        name: str | None = None,
        uid: str | None = None,
    ) -> None:
        """Parameters
        ----------
        rules: list of SimstringMatcherRule
            Rules to use for matching entities.
        threshold: float, default=0.9
            Minimum similarity (between 0.0 and 1.0) between a rule term
            and the text of an entity matched on that rule.
        min_length: int, default=3
            Minimum number of chars in matched entities.
        max_length: int, default=50
            Maximum number of chars in matched entities.
        similarity: str, default="jaccard"
            Similarity metric to use.
        spacy_tokenization_language: str, optional
            2-letter code (ex: "fr", "en", etc.) designating the language of the
            spacy model to use for tokenization. If provided, spacy will be used
            to tokenize input segments and filter out some tokens based on their
            part-of-speech tags, such as determinants, conjunctions and
            prepositions. If `None`, a simple regexp based tokenization will be
            used, which is faster but might give more false positives.
        blacklist: list of str, optional
            Optional list of exact terms to ignore.
        same_beginning: bool, default=False
            Ignore all matches that start with a different character than the
            term of the rule. This can be convenient to get rid of false
            positives on words that are very similar but have opposite meanings
            because of a preposition, for instance "activation" and
            "inactivation".
        attrs_to_copy: list of str, optional
            Labels of the attributes that should be copied from the source
            segment to the created entity. Useful for propagating context
            attributes (negation, antecedent, etc.).
        name: str, optional
            Name describing the matcher (defaults to the class name).
        uid: str, optional
            Identifier of the matcher.
        """
        # The databases are written to a temporary directory. The
        # TemporaryDirectory object is stored on the instance so that the
        # directory is not cleaned up while the matcher is still in use.
        self._temp_dir = tempfile.TemporaryDirectory()
        rules_db_file = Path(self._temp_dir.name) / _RULES_DB_FILENAME
        simstring_db_file = Path(self._temp_dir.name) / _SIMSTRING_DB_FILENAME

        # Both database files must exist before the base class is initialized
        # with their paths.
        build_simstring_matcher_databases(
            simstring_db_file,
            rules_db_file,
            rules,
        )

        super().__init__(
            simstring_db_file=simstring_db_file,
            rules_db_file=rules_db_file,
            threshold=threshold,
            min_length=min_length,
            max_length=max_length,
            similarity=similarity,
            spacy_tokenization_language=spacy_tokenization_language,
            blacklist=blacklist,
            same_beginning=same_beginning,
            attrs_to_copy=attrs_to_copy,
            name=name,
            uid=uid,
        )

    @staticmethod
    def load_rules(path_to_rules: Path, encoding: str | None = None) -> list[SimstringMatcherRule]:
        """Load all rules stored in a yml file

        Only load files from trusted sources: the file is parsed with
        `yaml.load` and a loader derived from the full `yaml.Loader`.

        Parameters
        ----------
        path_to_rules
            The path to a yml file containing a list of mappings with the same
            structure as :class:`~.SimstringMatcherRule`
        encoding: str, optional
            The encoding of the file to open

        Returns
        -------
        List[SimstringMatcherRule]
            List of all the rules in `path_to_rules`, can be used to init a
            :class:`~.SimstringMatcher`. An empty file gives an empty list.
        """

        # Dedicated loader subclass, so that registering the constructor below
        # does not modify yaml.Loader itself.
        class _Loader(yaml.Loader):
            pass

        def _construct_mapping(loader, node):
            # deep=True makes sure nested nodes (e.g. the list of
            # normalizations) are fully built before the object is
            # instantiated; a shallow construction hands over containers that
            # are still empty and only get filled in afterwards.
            data = loader.construct_mapping(node, deep=True)
            # Mappings with a "kb_name" key describe normalizations, all the
            # other mappings are rules.
            if "kb_name" in data:
                return SimstringMatcherNormalization(**data)
            return SimstringMatcherRule(**data)

        _Loader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _construct_mapping)

        # NOTE(security): yaml.Loader can instantiate arbitrary Python objects
        # through yaml tags. Kept for backward compatibility with existing
        # files; do not use on untrusted input.
        with Path(path_to_rules).open(encoding=encoding) as fp:
            rules = yaml.load(fp, Loader=_Loader)  # noqa: S506

        # yaml.load() returns None for an empty document.
        return [] if rules is None else rules

    @staticmethod
    def save_rules(rules: list[SimstringMatcherRule], path_to_rules: Path, encoding: str | None = None) -> None:
        """Store rules in a yml file

        Parameters
        ----------
        rules: list of SimstringMatcherRule
            The rules to save
        path_to_rules: Path
            The path to a yml file that will contain the rules
        encoding: str, optional
            The encoding of the yml file
        """
        # Convert the rules before opening the file: opening in "w" mode
        # truncates it, so a failing conversion must not happen after that.
        rules_data = [dataclasses.asdict(r) for r in rules]
        with Path(path_to_rules).open(mode="w", encoding=encoding) as fp:
            yaml.safe_dump(rules_data, fp)