Source code for medkit.text.ner.quick_umls_matcher

"""This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[quick-umls-matcher]`.
"""
from __future__ import annotations

__all__ = ["QuickUMLSMatcher"]

from typing import TYPE_CHECKING, ClassVar, Iterator, NamedTuple

import quickumls.about
import quickumls.constants
from packaging.version import parse as parse_version
from quickumls import QuickUMLS
from typing_extensions import Literal

from medkit.core.text import (
    Entity,
    NEROperation,
    Segment,
    UMLSNormAttribute,
    span_utils,
)
from medkit.text.ner import umls_utils

if TYPE_CHECKING:
    from pathlib import Path

# Workaround for https://github.com/Georgetown-IR-Lab/QuickUMLS/issues/68
# Module-level flag remembering whether the patch below was already applied,
# so that the language map values are never suffixed twice.
_spacy_language_map_fixed = False


def _fix_spacy_language_map():
    """Patch `quickumls.constants.SPACY_LANGUAGE_MAP` once per process.

    For QuickUMLS versions older than 1.4.1, each value of the map gets a
    suffix appended: "_core_web_sm" when the value is "en", "_core_news_sm"
    otherwise. Newer versions are left untouched. In both cases the
    module-level flag is set so that subsequent calls are no-ops.
    """
    global _spacy_language_map_fixed  # noqa: PLW0603
    if not _spacy_language_map_fixed:
        installed_version = parse_version(quickumls.about.__version__)
        if installed_version < parse_version("1.4.1"):
            language_map = quickumls.constants.SPACY_LANGUAGE_MAP
            # iterate over a snapshot of the keys; only values are rewritten
            for lang_flag in list(language_map):
                short_code = language_map[lang_flag]
                suffix = "_core_news_sm" if short_code != "en" else "_core_web_sm"
                language_map[lang_flag] = short_code + suffix

        _spacy_language_map_fixed = True


class _QuickUMLSInstall(NamedTuple):
    """Settings identifying a QuickUMLS installation.

    Being a tuple, it is hashable and is used as the key of the registry
    mapping installs to their paths.
    """

    # version of the UMLS database, for instance "2021AB"
    version: str
    # language flag passed to the install command, for instance "ENG"
    language: str
    # whether the install was created with lowercased concepts
    lowercase: bool
    # whether the install was created with unicode normalization
    normalize_unicode: bool


class QuickUMLSMatcher(NEROperation):
    """Entity annotator relying on QuickUMLS.

    This annotator requires a QuickUMLS installation performed with
    `python -m quickumls.install` with flags corresponding to the params
    `language`, `version`, `lowercase` and `normalize_unicode` passed at init.
    QuickUMLS installations must be registered with the `add_install` class
    method.

    For instance, if we want to use `QuickUMLSMatcher` with a french lowercase
    QuickUMLS install based on UMLS version 2021AB, we must first create this
    installation with:

    >>> python -m quickumls.install --language FRE --lowercase /path/to/umls/2021AB/data /path/to/quick/umls/install

    then register this install with:

    >>> QuickUMLSMatcher.add_install(
    >>>     "/path/to/quick/umls/install",
    >>>     version="2021AB",
    >>>     language="FRE",
    >>>     lowercase=True,
    >>> )

    and finally instantiate the matcher with:

    >>> matcher = QuickUMLSMatcher(
    >>>     version="2021AB",
    >>>     language="FRE",
    >>>     lowercase=True,
    >>> )

    This mechanism makes it possible to store in the OperationDescription
    how the used QuickUMLS was created, and to reinstantiate the same matcher
    on a different environment if a similar install is available.
    """

    # Registry shared by all instances: settings of each registered QuickUMLS
    # install mapped to the path of that install
    _install_paths: ClassVar[dict[_QuickUMLSInstall, str]] = {}

    @classmethod
    def add_install(
        cls,
        path: str | Path,
        version: str,
        language: str,
        lowercase: bool = False,
        normalize_unicode: bool = False,
    ) -> None:
        """Register path and settings of a QuickUMLS installation performed
        with `python -m quickumls.install`

        Registering again with the same settings replaces the previous path.

        Parameters
        ----------
        path : str or Path
            The path to the destination folder passed to the install command
        version : str
            The version of the UMLS database, for instance "2021AB"
        language : str
            The language flag passed to the install command, for instance "ENG"
        lowercase : bool, default=False
            Whether the --lowercase flag was passed to the install command
            (concepts are lowercased to increase recall)
        normalize_unicode : bool, default=False
            Whether the --normalize-unicode flag was passed to the install command
            (non-ASCII chars in concepts are converted to the closest ASCII chars)
        """
        install = _QuickUMLSInstall(version, language, lowercase, normalize_unicode)
        cls._install_paths[install] = str(path)

    @classmethod
    def clear_installs(cls) -> None:
        """Remove all QuickUMLS installations registered with `add_install`"""
        cls._install_paths.clear()

    @classmethod
    def _get_path_to_install(
        cls,
        version: str,
        language: str,
        lowercase: bool = False,
        normalize_unicode: bool = False,
    ) -> str:
        """Find a QuickUMLS install with corresponding settings

        The QuickUMLS install must have been previously registered with
        `add_install`.

        Returns
        -------
        str
            Path of the registered install matching the settings

        Raises
        ------
        ValueError
            If no install was registered with these exact settings
        """
        install = _QuickUMLSInstall(version, language, lowercase, normalize_unicode)
        path = cls._install_paths.get(install)
        if not path:
            msg = (
                f"Couldn't find any QuickUMLS install for version={version},"
                f" language={language}, lowercase={lowercase},"
                f" normalize_unicode={normalize_unicode}.\nRegistered installs:"
                f" {cls._install_paths}"
            )
            raise ValueError(msg)
        return path

    def __init__(
        self,
        version: str,
        language: str,
        lowercase: bool = False,
        normalize_unicode: bool = False,
        overlapping: Literal["length", "score"] = "length",
        threshold: float = 0.9,
        window: int = 5,
        similarity: Literal["dice", "jaccard", "cosine", "overlap"] = "jaccard",
        accepted_semtypes: list[str] = quickumls.constants.ACCEPTED_SEMTYPES,
        attrs_to_copy: list[str] | None = None,
        output_label: str | dict[str, str] | None = None,
        name: str | None = None,
        uid: str | None = None,
    ):
        """Instantiate the QuickUMLS matcher

        Parameters
        ----------
        version : str
            UMLS version of the QuickUMLS install to use, for instance "2021AB".
            Will be used to decide which QuickUMLS install to use
        language : str
            Language flag of the QuickUMLS install to use, for instance "ENG".
            Will be used to decide which QuickUMLS install to use
        lowercase : bool, default=False
            Whether to use a QuickUMLS install with lowercased concepts.
            Will be used to decide which QuickUMLS install to use
        normalize_unicode : bool, default=False
            Whether to use a QuickUMLS install with non-ASCII chars concepts
            converted to the closest ASCII chars.
            Will be used to decide which QuickUMLS install to use
        overlapping : {"length", "score"}, default="length"
            Criteria for sorting multiple potential matches (cf QuickUMLS doc)
        threshold : float, default=0.9
            Minimum similarity (cf QuickUMLS doc)
        window : int, default=5
            Max number of tokens per match (cf QuickUMLS doc)
        similarity : {"dice", "jaccard", "cosine", "overlap"}, default="jaccard"
            Similarity measure to use (cf QuickUMLS doc)
        accepted_semtypes : list of str, optional
            UMLS semantic types that matched concepts should belong to
            (cf QuickUMLS doc).
        attrs_to_copy : list of str, optional
            Labels of the attributes that should be copied from the source
            segment to the created entity. Useful for propagating context
            attributes (negation, antecedent, etc)
        output_label : str or dict of str to str, optional
            By default, `medkit.text.ner.umls_utils.SEMGROUP_LABELS` will be
            used as entity labels. Use this parameter to override them.
            Example: `{"DISO": "problem", "PROC": "test"}`.
            If `output_label` is a string, all entities will use this string
            as label instead.
        name : str, optional
            Name describing the matcher (defaults to the class name)
        uid : str, optional
            Identifier of the matcher

        Raises
        ------
        ValueError
            If no QuickUMLS install was registered for the requested settings
        TypeError
            If `output_label` is neither None, a string nor a dict
        """
        _fix_spacy_language_map()

        # Pass all arguments to super (remove self).
        # This must stay ahead of any other local assignment, otherwise
        # extra locals would be forwarded too.
        # NOTE(review): locals() presumably also contains the implicit
        # `__class__` cell since zero-arg super() is used below; confirm the
        # base class tolerates it.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        if attrs_to_copy is None:
            attrs_to_copy = []

        self.language = language
        self.version = version
        self.lowercase = lowercase
        self.normalize_unicode = normalize_unicode
        self.overlapping = overlapping
        self.threshold = threshold
        self.similarity = similarity
        self.window = window
        self.accepted_semtypes = accepted_semtypes
        self.attrs_to_copy = attrs_to_copy

        path_to_install = self._get_path_to_install(version, language, lowercase, normalize_unicode)
        self._matcher = QuickUMLS(
            quickumls_fp=path_to_install,
            overlapping_criteria=overlapping,
            threshold=threshold,
            window=window,
            similarity_name=similarity,
            accepted_semtypes=accepted_semtypes,
        )
        # sanity check: the flags reported by the loaded install must agree
        # with the settings it was registered under
        assert (  # noqa: PT018
            self._matcher.language_flag == language
            and self._matcher.to_lowercase_flag == lowercase
            and self._matcher.normalize_unicode_flag == normalize_unicode
        ), "Inconsistent QuickUMLS install flags"

        self._semtype_to_semgroup = umls_utils.load_semgroups_by_semtype()
        self.label_mapping = self._get_label_mapping(output_label)

    @staticmethod
    def _get_label_mapping(output_label: None | str | dict[str, str]) -> dict[str, str]:
        """Return label mapping according to `output_label`

        Parameters
        ----------
        output_label : str or dict of str to str, optional
            None to use the default labels, a string to use the same label
            for every semgroup, or a dict overriding the default label of
            some semgroups

        Returns
        -------
        dict of str to str
            Entity label to use for each key of `umls_utils.SEMGROUP_LABELS`.
            Always a new dict, never the module-level constant itself.

        Raises
        ------
        TypeError
            If `output_label` is neither None, a string nor a dict
        """
        if output_label is None:
            # copy so that mutating the instance mapping can't alter the constant
            return umls_utils.SEMGROUP_LABELS.copy()

        if isinstance(output_label, str):
            return {key: output_label for key in umls_utils.SEMGROUP_LABELS}

        if isinstance(output_label, dict):
            label_mapping = umls_utils.SEMGROUP_LABELS.copy()
            label_mapping.update(output_label)
            return label_mapping

        # fail at init time rather than with an obscure error at run time
        msg = f"output_label must be None, a str or a dict, got {type(output_label).__name__}"
        raise TypeError(msg)

    def run(self, segments: list[Segment]) -> list[Entity]:
        """Return entities (with UMLS normalization attributes) for each match
        in `segments`

        Parameters
        ----------
        segments : list of Segment
            List of segments into which to look for matches

        Returns
        -------
        list of Entity
            Entities found in `segments`, with :class:`~UMLSNormAttribute`
            attributes.
        """
        return [entity for segment in segments for entity in self._find_matches_in_segment(segment)]

    def _find_matches_in_segment(self, segment: Segment) -> Iterator[Entity]:
        """Yield one entity per QuickUMLS match found in the text of `segment`

        Each entity receives copies of the segment attributes listed in
        `attrs_to_copy` and a `UMLSNormAttribute` built from the match.
        """
        matches = self._matcher.match(segment.text)
        for match_candidates in matches:
            # only the best matching CUI (1st match candidate) is returned
            # TODO should we create a normalization attributes for each CUI instead?
            match = match_candidates[0]

            text, spans = span_utils.extract(segment.text, segment.spans, [(match["start"], match["end"])])

            # Sort the semtypes so that the one used for the label (and the
            # order stored in the normalization attribute) is the same on
            # every run; match["semtypes"] is presumably an unordered
            # collection whose iteration order is not stable.
            semtypes = sorted(match["semtypes"])

            # define label using the first semtype
            semgroup = self._semtype_to_semgroup[semtypes[0]]
            label = self.label_mapping[semgroup]

            entity = Entity(
                label=label,
                text=text,
                spans=spans,
            )

            for attr_label in self.attrs_to_copy:
                for attr in segment.attrs.get(label=attr_label):
                    copied_attr = attr.copy()
                    entity.attrs.add(copied_attr)
                    # handle provenance
                    if self._prov_tracer is not None:
                        self._prov_tracer.add_prov(copied_attr, self.description, [attr])

            norm_attr = UMLSNormAttribute(
                cui=match["cui"],
                umls_version=self.version,
                term=match["term"],
                score=match["similarity"],
                sem_types=semtypes,
            )
            entity.attrs.add(norm_attr)

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(entity, self.description, source_data_items=[segment])
                self._prov_tracer.add_prov(norm_attr, self.description, source_data_items=[segment])

            yield entity