# Source code for medkit.text.ner.umls_utils

# Public names of this module (the ones exported by a star-import)
__all__ = [
    "UMLSEntry",
    "load_umls_entries",
    "preprocess_term_to_match",
    "preprocess_acronym",
    "guess_umls_version",
    "SEMGROUPS",
    "SEMGROUP_LABELS",
]


from collections import defaultdict
import dataclasses
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Union
from tqdm import tqdm
import re

import unidecode

SEMGROUP_LABELS = {
    "ACTI": "activity",
    "ANAT": "anatomy",
    "CHEM": "chemical",
    "CONC": "concept",
    "DEVI": "device",
    "DISO": "disorder",
    "GENE": "genes_sequence",
    "GEOG": "geographic_area",
    "LIVB": "living_being",
    "OBJC": "object",
    "OCCU": "occupation",
    "ORGA": "organization",
    "PHEN": "phenomenon",
    "PHYS": "physiology",
    "PROC": "procedure",
}
"""
Mapping from each UMLS semgroup code to its human-readable label
"""


# iterating a dict yields its keys, in insertion order
SEMGROUPS = list(SEMGROUP_LABELS)
"""
Codes of the valid UMLS semgroups (the keys of `SEMGROUP_LABELS`)
"""


@dataclasses.dataclass
class UMLSEntry:
    """Entry in MRCONSO.RRF file of a UMLS dictionary

    Attributes
    ----------
    cui:
        Unique identifier of the concept designated by the term
    term:
        Original version of the term
    semtypes:
        Semantic types of the concept (TUIs)
    semgroups:
        Semantic groups of the concept
    """

    cui: str
    term: str
    semtypes: Optional[List[str]] = None
    semgroups: Optional[List[str]] = None

    def to_dict(self) -> dict:
        """Return the entry as a plain dict with one key per attribute

        The `semtypes` and `semgroups` values are the same list objects as
        the ones held by the entry (no copy is made).
        """
        return {
            "cui": self.cui,
            "term": self.term,
            "semtypes": self.semtypes,
            "semgroups": self.semgroups,
        }


# based on https://github.com/GanjinZero/CODER/blob/master/coderpp/test/load_umls.py


def load_umls_entries(
    mrconso_file: Union[str, Path],
    mrsty_file: Optional[Union[str, Path]] = None,
    sources: Optional[List[str]] = None,
    languages: Optional[List[str]] = None,
    show_progress: bool = False,
) -> Iterator[UMLSEntry]:
    """Load all terms and associated CUIs found in a UMLS MRCONSO.RRF file

    Entries are yielded lazily while the file is read. Only the first row
    encountered for each LUI (among the rows that pass the `sources` and
    `languages` filters) is yielded.

    Parameters
    ----------
    mrconso_file:
        Path to the UMLS MRCONSO.RRF file
    mrsty_file:
        Path to the UMLS MRSTY.RRF file. If provided, semtypes and semgroups
        info will be included in the entries returned.
    sources:
        Sources to consider (ex: ICD10, CCS) If none provided, CUIs and terms
        of all sources will be taken into account.
    languages:
        Languages to consider. If none provided, CUIs and terms of all languages
        will be taken into account
    show_progress:
        Whether to show a progressbar

    Returns
    -------
    Iterator[UMLSEntry]
        Iterator over all term entries found in UMLS install
    """
    mrconso_file = Path(mrconso_file)
    if mrsty_file is not None:
        mrsty_file = Path(mrsty_file)

    file_size = mrconso_file.stat().st_size
    luis_seen = set()

    # load semtypes and semgroups if MRSTY is provided
    if mrsty_file is not None:
        semtypes_by_cui = load_semtypes_by_cui(mrsty_file)
        semgroups_by_semtype = load_semgroups_by_semtype()
    else:
        semtypes_by_cui = None
        semgroups_by_semtype = None

    progress_bar = None
    try:
        with open(mrconso_file, encoding="utf-8") as fp:
            if show_progress:
                # progress is tracked in bytes read rather than in lines,
                # because the number of lines is not known in advance
                progress_bar = tqdm(
                    total=file_size,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                )

            for line in fp:
                if progress_bar is not None:
                    progress_bar.update(len(line.encode("utf-8")))

                stripped = line.strip()
                if not stripped:
                    # tolerate blank lines (ex: trailing newline at end of file)
                    continue

                row = stripped.split("|")
                cui = row[0]
                language = row[1]
                lui = row[3]
                source = row[11]
                term = row[14]

                if sources is not None and source not in sources:
                    continue
                if languages is not None and language not in languages:
                    continue
                if lui in luis_seen:
                    continue

                if semtypes_by_cui is not None and cui in semtypes_by_cui:
                    semtypes = semtypes_by_cui[cui]
                    semgroups = [semgroups_by_semtype[semtype] for semtype in semtypes]
                else:
                    semtypes = None
                    semgroups = None

                luis_seen.add(lui)
                yield UMLSEntry(cui, term, semtypes, semgroups)
    finally:
        # also reached when the generator is closed before being exhausted or
        # when an error occurs, so the progress bar is never left open
        if progress_bar is not None:
            progress_bar.close()


def load_semtypes_by_cui(mrsty_file: Union[str, Path]) -> Dict[str, List[str]]:
    """
    Load the list of semtypes associated to each CUI found in a MRSTY.RRF file

    Parameters
    ----------
    mrsty_file:
        Path to the UMLS MRSTY.RRF file.

    Returns
    -------
    Dict[str, List[str]]
        Mapping between CUIs and associated semtypes
    """

    mrsty_file = Path(mrsty_file)
    semtypes_by_cui = defaultdict(list)

    # explicit encoding so that the result does not depend on the platform
    # default (same encoding as the one used to read MRCONSO.RRF)
    with open(mrsty_file, encoding="utf-8") as fp:
        for line in fp:
            stripped = line.strip()
            if not stripped:
                # tolerate blank lines (ex: trailing newline at end of file)
                continue
            row = stripped.split("|")
            # first field is the CUI, second field is the semtype
            semtypes_by_cui[row[0]].append(row[1])

    # return a regular dict so that looking up an unknown CUI does not
    # silently create an empty entry
    return dict(semtypes_by_cui)


# The semantic groups provide a partition of the UMLS Metathesaurus for 99.5%
# of the concepts. We use this file to obtain a semtype-to-semgroup mapping.
# Source: UMLS project
# https://lhncbc.nlm.nih.gov/semanticnetwork/download/sg_archive/SemGroups-v04.txt
_UMLS_SEMGROUPS_FILE = Path(__file__).parent / "umls_semgroups_v04.txt"
# Cache of the mapping, filled by the first successful call to
# load_semgroups_by_semtype()
_SEMGROUPS_BY_SEMTYPE = None


def load_semgroups_by_semtype() -> Dict[str, str]:
    """
    Load the semgroup associated to each semtype

    The mapping is read from the semgroups file the first time the function
    is called, then kept in a module-level cache. The same dict object is
    returned by every call.

    Returns
    -------
    Dict[str, str]
        Mapping between semtype TUIs and corresponding semgroup
    """

    global _SEMGROUPS_BY_SEMTYPE
    if _SEMGROUPS_BY_SEMTYPE is None:
        # build the mapping in a local variable and publish it only once it is
        # complete, so that a failure while reading the file does not leave an
        # empty or partial mapping in the cache
        semgroups_by_semtype = {}
        with open(_UMLS_SEMGROUPS_FILE, encoding="utf-8") as fp:
            for line in fp:
                if not line.strip():
                    # tolerate blank lines (ex: trailing newline at end of file)
                    continue
                # each line has 4 fields, the 1st being the semgroup and the
                # 3rd the semtype
                semgroup, _, semtype, _ = line.split("|")
                semgroups_by_semtype[semtype] = semgroup
        _SEMGROUPS_BY_SEMTYPE = semgroups_by_semtype
    return _SEMGROUPS_BY_SEMTYPE


# Shortest possible run of text enclosed in parentheses (including the
# parentheses themselves)
_BRACKET_PATTERN = re.compile(r"\(.*?\)")


def preprocess_term_to_match(
    term: str,
    lowercase: bool,
    normalize_unicode: bool,
    clean_nos: bool = True,
    clean_brackets: bool = False,
    clean_dashes: bool = False,
) -> str:
    """
    Preprocess a UMLS term for matching purposes

    Parameters
    ----------
    term: str
        Term to preprocess
    lowercase:
        Whether `term` should be lowercased
    normalize_unicode:
        Whether `term` should be ASCII-only (non-ASCII chars replaced by closest ASCII chars)
    clean_nos:
        Whether to remove "NOS" (only the whole words "NOS" and "nos",
        delimited by spaces or by the start/end of the term, are removed)
    clean_brackets:
        Whether to remove brackets (parentheses and the text they enclose)
    clean_dashes:
        Whether to remove dashes (each dash is replaced by a space)

    Returns
    -------
    str
        The preprocessed term, with runs of whitespace collapsed into single
        spaces and no leading nor trailing whitespace
    """
    if lowercase:
        term = term.lower()
    if normalize_unicode:
        term = unidecode.unidecode(term)

    # pad with spaces so that "NOS" at the very start or end of the term is
    # also surrounded by spaces and can be matched as a whole word
    term = " " + term + " "
    if clean_nos:
        term = term.replace(" NOS ", " ").replace(" nos ", " ")
    if clean_brackets:
        term = _BRACKET_PATTERN.sub("", term)
    if clean_dashes:
        term = term.replace("-", " ")
    # split() without argument never returns empty strings, so joining its
    # result both strips the term and collapses repeated whitespace
    return " ".join(term.split())


# A single token without spaces nor parentheses (the acronym), followed by
# some text between parentheses (the expanded form), and nothing else
_ACRONYM_PATTERN = re.compile(
    r"^ *(?P<acronym>[^ \(\)]+) *\( *(?P<expanded>[^\(\)]+) *\) *$"
)


def preprocess_acronym(term: str) -> Optional[str]:
    """
    Detect if a term contains an acronym with the expanded form between
    parentheses, and return the acronym if that is the case.

    This will work for terms such as: "ECG (ÉlectroCardioGramme)", where the
    acronym can be rebuilt by taking the ASCII version of each uppercase
    letter inside the parentheses.

    Parameters
    ----------
    term:
        Term that may contain an acronym. Ex: "ECG (ÉlectroCardioGramme)"

    Returns
    -------
    Optional[str]
        The acronym in the term if any, else `None`. Ex: "ECG"
    """

    match = _ACRONYM_PATTERN.match(term)
    if not match:
        return None

    # extract acronym (before the parentheses) and expanded form (between parentheses)
    acronym = match.group("acronym")
    expanded = match.group("expanded")

    # try to rebuild acronym from expanded form:
    # replace special characters with ASCII
    expanded = unidecode.unidecode(expanded)
    # keep only uppercase chars
    acronym_candidate = "".join(c for c in expanded if c.isupper())
    # if it doesn't match the part before the parentheses
    # we decide it is not an acronym
    if acronym != acronym_candidate:
        return None
    return acronym


def guess_umls_version(path: Union[str, Path]) -> str:
    """Try to infer UMLS version (ex: "2021AB") from any UMLS-related path

    Parameters
    ----------
    path:
        Path to the root directory of the UMLS install or any file inside that directory

    Returns
    -------
    str
        UMLS version, estimated by finding the leaf-most folder in `path` that is not
        "META", "NET" nor "LEX", nor a subfolder of these folders
    """
    path = Path(path).resolve()
    if path.is_file():
        # start from the directory containing the file
        path = path.parent
    # walk up until none of the components is one of the "META", "NET" or
    # "LEX" folders; the remaining leaf folder is taken as the version
    while any(dir_name in path.parts for dir_name in ("META", "NET", "LEX")):
        path = path.parent
    return path.name