# Source code for medkit.text.ner.quick_umls_matcher

"""This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[quick-umls-matcher]`.
"""
from __future__ import annotations

__all__ = ["QuickUMLSMatcher"]

from typing import TYPE_CHECKING, ClassVar, Iterator, NamedTuple

import quickumls.about
import quickumls.constants
from packaging.version import parse as parse_version
from quickumls import QuickUMLS
from typing_extensions import Literal

from medkit.core.text import (
    Entity,
    NEROperation,
    Segment,
    UMLSNormAttribute,
    span_utils,
)
from medkit.text.ner import umls_utils

if TYPE_CHECKING:
    from pathlib import Path

# workaround for https://github.com/Georgetown-IR-Lab/QuickUMLS/issues/68
_spacy_language_map_fixed = False


def _fix_spacy_language_map():
    """Patch the spaCy language map of QuickUMLS, at most once per process.

    Workaround for https://github.com/Georgetown-IR-Lab/QuickUMLS/issues/68.
    For QuickUMLS versions older than 1.4.1, each value of
    `quickumls.constants.SPACY_LANGUAGE_MAP` gets a suffix appended:
    "_core_web_sm" when the value is "en", "_core_news_sm" otherwise
    (presumably turning bare language codes into spaCy model names).
    Subsequent calls are no-ops thanks to a module-level flag.
    """
    global _spacy_language_map_fixed  # noqa: PLW0603
    if not _spacy_language_map_fixed:
        installed_version = parse_version(quickumls.about.__version__)
        if installed_version < parse_version("1.4.1"):
            language_map = quickumls.constants.SPACY_LANGUAGE_MAP
            # only values of existing keys are reassigned, so iterating
            # over the map while updating it is safe
            for lang_key, spacy_lang in language_map.items():
                suffix = "_core_web_sm" if spacy_lang == "en" else "_core_news_sm"
                language_map[lang_key] = spacy_lang + suffix

        # remember that the check was done, whether or not a patch was needed
        _spacy_language_map_fixed = True


class _QuickUMLSInstall(NamedTuple):
    """Settings identifying a QuickUMLS installation.

    Instances are hashable tuples used as keys to look up the path of a
    registered QuickUMLS install.
    """

    # version of the UMLS database, for instance "2021AB"
    version: str
    # language flag passed to the install command, for instance "ENG"
    language: str
    # whether the install was created with the --lowercase flag
    lowercase: bool
    # whether the install was created with the --normalize-unicode flag
    normalize_unicode: bool



# [docs] (link marker left over from the rendered source listing)
class QuickUMLSMatcher(NEROperation):
    """Entity annotator relying on QuickUMLS.

    This annotator requires a QuickUMLS installation performed
    with `python -m quickumls.install` with flags corresponding
    to the params `language`, `version`, `lowercase` and `normalize_unicode`
    passed at init. QuickUMLS installations must be registered with the
    `add_install` class method.

    For instance, if we want to use `QuickUMLSMatcher` with a french
    lowercase QuickUMLS install based on UMLS version 2021AB,
    we must first create this installation with:

    >>> python -m quickumls.install --language FRE --lowercase /path/to/umls/2021AB/data /path/to/quick/umls/install

    then register this install with:

    >>> QuickUMLSMatcher.add_install(
    >>>        "/path/to/quick/umls/install",
    >>>        version="2021AB",
    >>>        language="FRE",
    >>>        lowercase=True,
    >>> )

    and finally instantiate the matcher with:

    >>> matcher = QuickUMLSMatcher(
    >>>     version="2021AB",
    >>>     language="FRE",
    >>>     lowercase=True,
    >>> )

    This mechanism makes it possible to store in the OperationDescription
    how the used QuickUMLS was created, and to reinstantiate the same matcher
    on a different environment if a similar install is available.
    """

    _install_paths: ClassVar[dict[_QuickUMLSInstall, str]] = {}


    # [docs] (link marker left over from the rendered source listing)
    @classmethod
    def add_install(
        cls,
        path: str | Path,
        version: str,
        language: str,
        lowercase: bool = False,
        normalize_unicode: bool = False,
    ):
        """Register the location and settings of a QuickUMLS installation.

        The installation must have been created beforehand with
        `python -m quickumls.install`. Registering a second install with the
        same settings replaces the previously registered path.

        Parameters
        ----------
        path : str or Path
            Destination folder that was passed to the install command
        version : str
            Version of the UMLS database, for instance "2021AB"
        language : str
            Language flag that was passed to the install command, for instance "ENG"
        lowercase : bool, default=False
            Whether the install command was run with the --lowercase flag
            (concepts are lowercased to increase recall)
        normalize_unicode : bool, default=False
            Whether the install command was run with the --normalize-unicode flag
            (non-ASCII chars in concepts are converted to the closest ASCII chars)
        """
        install_key = _QuickUMLSInstall(
            version=version,
            language=language,
            lowercase=lowercase,
            normalize_unicode=normalize_unicode,
        )
        # paths are always stored as plain strings, even when given as Path
        cls._install_paths.update({install_key: str(path)})



    # [docs] (link marker left over from the rendered source listing)
    @classmethod
    def clear_installs(cls):
        """Forget every QuickUMLS installation registered with `add_install`."""
        registered_installs = cls._install_paths
        # emptied in place: the class-level dict object itself is kept
        registered_installs.clear()


    @classmethod
    def _get_path_to_install(
        cls,
        version: str,
        language: str,
        lowercase: bool = False,
        normalize_unicode: bool = False,
    ) -> str:
        """Find a QuickUMLS install with corresponding settings

        The QuickUMLS install must have been previously registered with `add_install`.

        Parameters
        ----------
        version : str
            UMLS version of the install to look for, for instance "2021AB"
        language : str
            Language flag of the install to look for, for instance "ENG"
        lowercase : bool, default=False
            Whether the install was created with the --lowercase flag
        normalize_unicode : bool, default=False
            Whether the install was created with the --normalize-unicode flag

        Returns
        -------
        str
            Path that was registered for the install with these exact settings

        Raises
        ------
        ValueError
            If no install was registered with these exact settings
        """
        install = _QuickUMLSInstall(version, language, lowercase, normalize_unicode)
        path = cls._install_paths.get(install)
        # compare to None rather than relying on truthiness, so that only a
        # genuinely missing entry is reported as "not found"
        if path is None:
            msg = (
                f"Couldn't find any QuickUMLS install for version={version},"
                f" language={language}, lowercase={lowercase},"
                f" normalize_unicode={normalize_unicode}.\nRegistered installs:"
                f" {cls._install_paths}"
            )
            raise ValueError(msg)
        return path

    def __init__(
        self,
        version: str,
        language: str,
        lowercase: bool = False,
        normalize_unicode: bool = False,
        overlapping: Literal["length", "score"] = "length",
        threshold: float = 0.9,
        window: int = 5,
        similarity: Literal["dice", "jaccard", "cosine", "overlap"] = "jaccard",
        accepted_semtypes: list[str] = quickumls.constants.ACCEPTED_SEMTYPES,
        attrs_to_copy: list[str] | None = None,
        output_label: str | dict[str, str] | None = None,
        name: str | None = None,
        uid: str | None = None,
    ):
        """Instantiate the QuickUMLS matcher

        Parameters
        ----------
        version : str
            UMLS version of the QuickUMLS install to use, for instance "2021AB".
            Will be used to decide which QuickUMLS install to use
        language : str
            Language flag of the QuickUMLS install to use, for instance "ENG".
            Will be used to decide which QuickUMLS install to use
        lowercase : bool, default=False
            Whether to use a QuickUMLS install with lowercased concepts.
            Will be used to decide which QuickUMLS install to use
        normalize_unicode : bool, default=False
            Whether to use a QuickUMLS install with non-ASCII chars concepts
            converted to the closest ASCII chars.
            Will be used to decide which QuickUMLS install to use
        overlapping : {"length", "score"}, default="length"
            Criteria for sorting multiple potential matches (cf QuickUMLS doc)
        threshold : float, default=0.9
            Minimum similarity (cf QuickUMLS doc)
        window : int, default=5
            Max number of tokens per match (cf QuickUMLS doc)
        similarity : {"dice", "jaccard", "cosine", "overlap"}, default="jaccard"
            Similarity measure to use (cf QuickUMLS doc)
        accepted_semtypes : list of str, optional
            UMLS semantic types that matched concepts should belong to (cf QuickUMLS doc).
            Defaults to `quickumls.constants.ACCEPTED_SEMTYPES`.
        attrs_to_copy : list of str, optional
            Labels of the attributes that should be copied from the source segment
            to the created entity. Useful for propagating context attributes
            (negation, antecedent, etc)
        output_label : str or dict of str to str, optional
            By default, `medkit.text.ner.umls_utils.SEMGROUP_LABELS` will be used as
            entity labels. Use this parameter to override them. Example:
            `{"DISO": "problem", "PROC": "test"}`. If `output_label`
            is a string, all entities will use this string as label instead.
        name : str, optional
            Name describing the matcher (defaults to the class name)
        uid : str, optional
            Identifier of the matcher

        Raises
        ------
        ValueError
            If no QuickUMLS install was registered with `add_install` for the
            given `version`, `language`, `lowercase` and `normalize_unicode`
        AssertionError
            If the flags of the QuickUMLS install found at the registered path
            do not match `language`, `lowercase` and `normalize_unicode`
        """
        _fix_spacy_language_map()

        # Pass all arguments to super (remove self)
        # NOTE: locals() must be captured before any other local variable is
        # defined, otherwise extra names would be forwarded to super().__init__()
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        if attrs_to_copy is None:
            attrs_to_copy = []

        self.language = language
        self.version = version
        self.lowercase = lowercase
        self.normalize_unicode = normalize_unicode
        self.overlapping = overlapping
        self.threshold = threshold
        self.similarity = similarity
        self.window = window
        self.accepted_semtypes = accepted_semtypes
        self.attrs_to_copy = attrs_to_copy

        path_to_install = self._get_path_to_install(version, language, lowercase, normalize_unicode)
        self._matcher = QuickUMLS(
            quickumls_fp=path_to_install,
            overlapping_criteria=overlapping,
            threshold=threshold,
            window=window,
            similarity_name=similarity,
            accepted_semtypes=accepted_semtypes,
        )

        # Make sure the install found on disk was really created with the
        # settings it was registered with. This is an explicit check rather
        # than an `assert` statement so that it is not stripped when python
        # runs with -O; AssertionError is kept for backward compatibility.
        flags_are_consistent = (
            self._matcher.language_flag == language
            and self._matcher.to_lowercase_flag == lowercase
            and self._matcher.normalize_unicode_flag == normalize_unicode
        )
        if not flags_are_consistent:
            msg = "Inconsistent QuickUMLS install flags"
            raise AssertionError(msg)

        self._semtype_to_semgroup = umls_utils.load_semgroups_by_semtype()
        self.label_mapping = self._get_label_mapping(output_label)

    @staticmethod
    def _get_label_mapping(output_label: None | str | dict[str, str]) -> dict[str, str]:
        """Return label mapping according to `output_label`

        Parameters
        ----------
        output_label : str or dict of str to str, optional
            If None, the labels of `umls_utils.SEMGROUP_LABELS` are used as is.
            If a string, every key of `umls_utils.SEMGROUP_LABELS` is mapped to
            that string. If a dict, its entries override (or extend) the ones of
            `umls_utils.SEMGROUP_LABELS`.

        Returns
        -------
        dict of str to str
            Mapping to entity labels, keyed like `umls_utils.SEMGROUP_LABELS`.
            Always a new dict, so modifying it never affects
            `umls_utils.SEMGROUP_LABELS`.

        Raises
        ------
        TypeError
            If `output_label` is neither None, a string nor a dict
        """
        if output_label is None:
            # return a copy so that the shared module-level mapping can't be
            # modified through the matcher (consistent with the dict case)
            return umls_utils.SEMGROUP_LABELS.copy()

        if isinstance(output_label, str):
            return {key: output_label for key in umls_utils.SEMGROUP_LABELS}

        if isinstance(output_label, dict):
            label_mapping = umls_utils.SEMGROUP_LABELS.copy()
            label_mapping.update(output_label)
            return label_mapping

        # fail early with an explicit error instead of returning None and
        # breaking later when the mapping is used to label entities
        msg = f"output_label must be None, a str or a dict, got {type(output_label).__name__}"
        raise TypeError(msg)


    # [docs] (link marker left over from the rendered source listing)
    def run(self, segments: list[Segment]) -> list[Entity]:
        """Find matches in `segments` and return them as entities

        Parameters
        ----------
        segments : list of Segment
            Segments in which to look for matches

        Returns
        -------
        list of Entity
            Entities found in `segments`, in segment order, each carrying a
            :class:`~UMLSNormAttribute` attribute.
        """
        entities = []
        for segment in segments:
            # exhaust the per-segment generator before moving to the next segment
            entities.extend(self._find_matches_in_segment(segment))
        return entities


    def _find_matches_in_segment(self, segment: Segment) -> Iterator[Entity]:
        """Yield an entity for each QuickUMLS match found in the text of `segment`

        Each entity is labelled through `self.label_mapping`, using the semantic
        group of the first semantic type of the match. It receives copies of the
        segment attributes whose label is in `self.attrs_to_copy`, as well as a
        `UMLSNormAttribute` describing the matched concept. Provenance is
        recorded when a provenance tracer is set.
        """
        for candidates in self._matcher.match(segment.text):
            # only the best matching CUI (1st match candidate) is returned
            # TODO should we create a normalization attributes for each CUI instead?
            best_match = candidates[0]

            match_range = (best_match["start"], best_match["end"])
            text, spans = span_utils.extract(segment.text, segment.spans, [match_range])

            # define label using the first semtype
            first_semtype = list(best_match["semtypes"])[0]
            semgroup = self._semtype_to_semgroup[first_semtype]
            entity = Entity(
                label=self.label_mapping[semgroup],
                text=text,
                spans=spans,
            )

            # propagate the requested attributes from the segment to the entity
            for attr_label in self.attrs_to_copy:
                for source_attr in segment.attrs.get(label=attr_label):
                    attr_copy = source_attr.copy()
                    entity.attrs.add(attr_copy)
                    if self._prov_tracer is None:
                        continue
                    # handle provenance
                    self._prov_tracer.add_prov(attr_copy, self.description, [source_attr])

            norm_attr = UMLSNormAttribute(
                cui=best_match["cui"],
                umls_version=self.version,
                term=best_match["term"],
                score=best_match["similarity"],
                sem_types=list(best_match["semtypes"]),
            )
            entity.attrs.add(norm_attr)

            # both the entity and its normalization attribute derive from the segment
            if self._prov_tracer is not None:
                for created_item in (entity, norm_attr):
                    self._prov_tracer.add_prov(created_item, self.description, source_data_items=[segment])

            yield entity