# Source code for medkit.text.ner.umls_utils

# Names exported by `from medkit.text.ner.umls_utils import *`
__all__ = ["UMLSEntry", "load_umls", "preprocess_term_to_match", "guess_umls_version"]

import dataclasses
from pathlib import Path
from typing import Iterator, List, Optional, Union
from tqdm import tqdm
import re

import unidecode


# based on https://github.com/GanjinZero/CODER/blob/master/coderpp/test/load_umls.py


@dataclasses.dataclass
class UMLSEntry:
    """Entry in MRCONSO.RRF file of a UMLS dictionary

    Attributes
    ----------
    cui:
        Unique identifier of the concept designated by the term
    term:
        Original version of the term
    """

    cui: str
    term: str

    def to_dict(self) -> dict:
        """Return the entry as a plain dict with "cui" and "term" keys"""
        return dict(cui=self.cui, term=self.term)


def load_umls(
    mrconso_file: Union[str, Path],
    sources: Optional[List[str]] = None,
    languages: Optional[List[str]] = None,
    show_progress: bool = False,
) -> Iterator[UMLSEntry]:
    """Load all terms and associated CUIs found in a UMLS MRCONSO.RRF file

    The file is read lazily, line by line. Each line is split on "|" and the
    CUI, language, LUI, source and term are read from columns 0, 1, 3, 11
    and 14 respectively. Among the lines that pass the `sources` and
    `languages` filters, only the first one encountered for each LUI is
    yielded.

    Parameters
    ----------
    mrconso_file:
        Path to the UMLS MRCONSO.RRF file
    sources:
        Sources to consider (ex: ICD10, CCS) If none provided, CUIs and terms
        of all sources will be taken into account.
    languages:
        Languages to consider. If none provided, CUIs and terms of all languages
        will be taken into account
    show_progress:
        Whether to show a progressbar

    Returns
    -------
    Iterator[UMLSEntry]
        Iterator over all term entries found in UMLS install
    """
    mrconso_file = Path(mrconso_file)
    luis_seen = set()
    progress_bar = None

    # try/finally so that the progress bar is also closed when the caller
    # stops iterating early or when a line cannot be parsed
    try:
        if show_progress:
            # progress is tracked in bytes read rather than in lines, because
            # the file size is known upfront while the line count is not
            progress_bar = tqdm(
                total=mrconso_file.stat().st_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            )

        with open(mrconso_file, encoding="utf-8") as fp:
            for line in fp:
                if progress_bar is not None:
                    progress_bar.update(len(line.encode("utf-8")))

                row = line.strip().split("|")
                cui = row[0]
                language = row[1]
                lui = row[3]
                source = row[11]
                term = row[14]

                if sources is not None and source not in sources:
                    continue
                if languages is not None and language not in languages:
                    continue
                # keep only the first accepted entry for each LUI
                if lui in luis_seen:
                    continue

                luis_seen.add(lui)
                yield UMLSEntry(cui, term)
    finally:
        if progress_bar is not None:
            progress_bar.close()


# Non-greedy match of a parenthesized group, ex: "(finding)"
_BRACKET_PATTERN = re.compile(r"\(.*?\)")


def preprocess_term_to_match(
    term: str,
    lowercase: bool,
    normalize_unicode: bool,
    clean_nos: bool = True,
    clean_brackets: bool = True,
    clean_dashes: bool = True,
) -> str:
    """
    Preprocess a UMLS term for matching purposes

    Parameters
    ----------
    term:
        Term to preprocess
    lowercase:
        Whether `term` should be lowercased
    normalize_unicode:
        Whether `term` should be ASCII-only (non-ASCII chars replaced by closest ASCII chars)
    clean_nos:
        Whether to remove "NOS" (and "nos") when it appears as a standalone word
    clean_brackets:
        Whether to remove parenthesized parts, brackets included
    clean_dashes:
        Whether to replace dashes with spaces

    Returns
    -------
    str
        Preprocessed term, with whitespace runs collapsed to single spaces
        and no leading or trailing whitespace
    """
    if lowercase:
        term = term.lower()
    if normalize_unicode:
        term = unidecode.unidecode(term)

    # pad with spaces so that a leading or trailing "NOS" is also surrounded
    # by spaces and gets matched by the replacements below
    term = " " + term + " "
    if clean_nos:
        term = term.replace(" NOS ", " ").replace(" nos ", " ")
    if clean_brackets:
        term = _BRACKET_PATTERN.sub("", term)
    if clean_dashes:
        term = term.replace("-", " ")
    # split() without argument drops empty strings, which also removes the padding
    return " ".join(term.split())


def guess_umls_version(path: Union[str, Path]) -> str:
    """Try to infer UMLS version (ex: "2021AB") from any UMLS-related path

    Parameters
    ----------
    path:
        Path to the root directory of the UMLS install or any file inside that directory

    Returns
    -------
    str
        UMLS version, estimated by finding the leaf-most folder in `path` that is not
        "META", "NET" nor "LEX", nor a subfolder of these folders
    """
    path = Path(path).resolve()
    if path.is_file():
        path = path.parent
    # climb up until no component of the path is one of the UMLS subfolders
    while any(dir_name in path.parts for dir_name in ("META", "NET", "LEX")):
        path = path.parent
    return path.name