# Source code for medkit.text.ner.umls_utils

__all__ = [
    "UMLSEntry",
    "load_umls_entries",
    "preprocess_term_to_match",
    "preprocess_acronym",
    "guess_umls_version",
    "SEMGROUPS",
    "SEMGROUP_LABELS",
]


from collections import defaultdict
import dataclasses
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Union
from tqdm import tqdm
import re

import unidecode

SEMGROUP_LABELS = {
    "ACTI": "activity",
    "ANAT": "anatomy",
    "CHEM": "chemical",
    "CONC": "concept",
    "DEVI": "device",
    "DISO": "disorder",
    "GENE": "genes_sequence",
    "GEOG": "geographic_area",
    "LIVB": "living_being",
    "OBJC": "object",
    "OCCU": "occupation",
    "ORGA": "organization",
    "PHEN": "phenomenon",
    "PHYS": "physiology",
    "PROC": "procedure",
}
"""
Human-readable label for each UMLS semgroup, keyed by semgroup identifier
"""


# iterating a dict yields its keys in insertion order
SEMGROUPS = list(SEMGROUP_LABELS)
"""
Identifiers of the valid UMLS semgroups (the keys of `SEMGROUP_LABELS`)
"""


@dataclasses.dataclass
class UMLSEntry:
    """Entry in MRCONSO.RRF file of a UMLS dictionary

    Attributes
    ----------
    cui:
        Unique identifier of the concept designated by the term
    term:
        Original version of the term
    semtypes:
        Semantic types of the concept (TUIs), or `None` if not available
    semgroups:
        Semantic groups of the concept, or `None` if not available
    """

    cui: str
    term: str
    semtypes: Optional[List[str]] = None
    semgroups: Optional[List[str]] = None

    def to_dict(self) -> dict:
        """Return the entry as a plain dict with one key per attribute.

        The `semtypes` and `semgroups` lists are referenced, not copied.
        """
        return dict(
            cui=self.cui,
            term=self.term,
            semtypes=self.semtypes,
            semgroups=self.semgroups,
        )
# based on https://github.com/GanjinZero/CODER/blob/master/coderpp/test/load_umls.py
def load_umls_entries(
    mrconso_file: Union[str, Path],
    mrsty_file: Optional[Union[str, Path]] = None,
    sources: Optional[List[str]] = None,
    languages: Optional[List[str]] = None,
    show_progress: bool = False,
) -> Iterator[UMLSEntry]:
    """Load all terms and associated CUIs found in a UMLS MRCONSO.RRF file

    Parameters
    ----------
    mrconso_file:
        Path to the UMLS MRCONSO.RRF file
    mrsty_file:
        Path to the UMLS MRSTY.RRF file. If provided, semtypes and semgroups
        info will be included in the entries returned.
    sources:
        Sources to consider (ex: ICD10, CCS) If none provided, CUIs and terms
        of all sources will be taken into account.
    languages:
        Languages to consider. If none provided, CUIs and terms of all languages
        will be taken into account
    show_progress:
        Whether to show a progressbar

    Returns
    -------
    Iterator[UMLSEntry]
        Iterator over all term entries found in UMLS install. At most one
        entry is yielded per LUI (the first one passing the filters).

    Raises
    ------
    IndexError
        If a line of `mrconso_file` has fewer than 15 "|"-separated fields
    """
    mrconso_file = Path(mrconso_file)
    if mrsty_file is not None:
        mrsty_file = Path(mrsty_file)

    file_size = mrconso_file.stat().st_size
    luis_seen = set()

    # load semtypes and semgroups if MRSTY is provided
    if mrsty_file is not None:
        semtypes_by_cui = load_semtypes_by_cui(mrsty_file)
        semgroups_by_semtype = load_semgroups_by_semtype()
    else:
        semtypes_by_cui = None
        semgroups_by_semtype = None

    progress_bar = None
    if show_progress:
        progress_bar = tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        )

    # try/finally guarantees the progress bar is closed even when the caller
    # stops iterating early (generator closed at the yield) or a row is malformed
    try:
        with open(mrconso_file, encoding="utf-8") as fp:
            for line in fp:
                if progress_bar is not None:
                    # progress is tracked in bytes to match the file size total
                    progress_bar.update(len(line.encode("utf-8")))

                row = line.strip().split("|")
                cui = row[0]
                language = row[1]
                lui = row[3]
                source = row[11]
                term = row[14]

                if sources is not None and source not in sources:
                    continue
                if languages is not None and language not in languages:
                    continue
                # keep only the first accepted term for each LUI
                if lui in luis_seen:
                    continue

                if semtypes_by_cui is not None and cui in semtypes_by_cui:
                    semtypes = semtypes_by_cui[cui]
                    semgroups = [semgroups_by_semtype[semtype] for semtype in semtypes]
                else:
                    semtypes = None
                    semgroups = None

                luis_seen.add(lui)
                yield UMLSEntry(cui, term, semtypes, semgroups)
    finally:
        if progress_bar is not None:
            progress_bar.close()
def load_semtypes_by_cui(mrsty_file: Union[str, Path]) -> Dict[str, List[str]]:
    """
    Load the list of semtypes associated to each CUI found in a MRSTY.RRF file

    Parameters
    ----------
    mrsty_file:
        Path to the UMLS MRSTY.RRF file.

    Returns
    -------
    Dict[str, List[str]]
        Mapping between CUIs and associated semtypes
    """
    mrsty_file = Path(mrsty_file)
    semtypes_by_cui = defaultdict(list)

    # explicit UTF-8 (as for MRCONSO.RRF in this module) rather than the
    # platform-dependent default encoding
    with open(mrsty_file, encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. trailing newline at end of file)
                continue
            row = line.split("|")
            cui = row[0]
            semtypes_by_cui[cui].append(row[1])

    # return a plain dict so that lookups of unknown CUIs don't create entries
    return dict(semtypes_by_cui)


# The semantic groups provide a partition of the UMLS Metathesaurus for 99.5%
# of the concepts, we use this file to obtain a semtype-to-semgroup mapping.
# Source: UMLS project
# https://lhncbc.nlm.nih.gov/semanticnetwork/download/sg_archive/SemGroups-v04.txt
_UMLS_SEMGROUPS_FILE = Path(__file__).parent / "umls_semgroups_v04.txt"
# module-level cache filled on first call to load_semgroups_by_semtype()
_SEMGROUPS_BY_SEMTYPE = None


def load_semgroups_by_semtype() -> Dict[str, str]:
    """
    Load the semgroup associated to each semtype

    The mapping is read from the semgroups file shipped next to this module
    on first call, then cached at module level and reused by later calls.

    Returns
    -------
    Dict[str, str]
        Mapping between semtype TUIs and corresponding semgroup
    """
    global _SEMGROUPS_BY_SEMTYPE
    if _SEMGROUPS_BY_SEMTYPE is None:
        # build into a local dict and publish it only once fully loaded, so
        # that a failed read never leaves an empty/partial mapping cached
        semgroups_by_semtype = {}
        with open(_UMLS_SEMGROUPS_FILE, encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                semgroup, _, semtype, _ = line.split("|")
                semgroups_by_semtype[semtype] = semgroup
        _SEMGROUPS_BY_SEMTYPE = semgroups_by_semtype
    return _SEMGROUPS_BY_SEMTYPE


# shortest possible "(...)" span, parentheses included
_BRACKET_PATTERN = re.compile("\\(.*?\\)")
def preprocess_term_to_match(
    term: str,
    lowercase: bool,
    normalize_unicode: bool,
    clean_nos: bool = True,
    clean_brackets: bool = False,
    clean_dashes: bool = False,
):
    """
    Normalize a UMLS term so that it can be used for matching

    The enabled steps are applied in this order: lowercasing, ASCII
    normalization, "NOS" removal, bracket removal, dash replacement.
    Whitespace is always collapsed and trimmed at the end.

    Parameters
    ----------
    term: str
        Term to preprocess
    lowercase:
        Whether `term` should be lowercased
    normalize_unicode:
        Whether `term` should be made ASCII-only (non-ASCII chars replaced by
        closest ASCII chars)
    clean_nos:
        Whether to remove standalone "NOS" / "nos" words
    clean_brackets:
        Whether to remove parentheses and the text they enclose
    clean_dashes:
        Whether to replace dashes with spaces

    Returns
    -------
    str
        The preprocessed term
    """
    cleaned = term.lower() if lowercase else term

    if normalize_unicode:
        cleaned = unidecode.unidecode(cleaned)

    # pad with spaces so that a leading/trailing "NOS" is seen as a whole word
    cleaned = f" {cleaned} "

    if clean_nos:
        for marker in (" NOS ", " nos "):
            cleaned = cleaned.replace(marker, " ")

    if clean_brackets:
        cleaned = _BRACKET_PATTERN.sub("", cleaned)

    if clean_dashes:
        cleaned = cleaned.replace("-", " ")

    # split() never yields empty strings, so this also drops the padding
    return " ".join(cleaned.split())
# Matches a whole term shaped like "<acronym> (<expanded form>)":
# - "acronym" group: a run of characters containing no space or parenthesis
# - "expanded" group: the text between a single pair of parentheses
# Optional spaces are accepted before/after the acronym and around the parentheses.
_ACRONYM_PATTERN = re.compile( r"^ *(?P<acronym>[^ \(\)]+) *\( *(?P<expanded>[^\(\)]+) *\) *$" )
def preprocess_acronym(term: str) -> Optional[str]:
    """
    Extract the acronym from a term written as "ACRONYM (Expanded Form)"

    The term is considered to hold an acronym only when the part before the
    parenthesis can be rebuilt from the uppercase letters (after conversion to
    ASCII) of the part between parenthesis, as in "ECG (ÉlectroCardioGramme)".

    Parameters
    ----------
    term:
        Term that may contain an acronym. Ex: "ECG (ÉlectroCardioGramme)"

    Returns
    -------
    Optional[str]
        The acronym in the term if any, else `None`. Ex: "ECG"
    """
    parsed = _ACRONYM_PATTERN.match(term)
    if parsed is None:
        return None

    # part before the parenthesis
    short_form = parsed.group("acronym")
    # part between parenthesis, with special characters replaced by ASCII
    long_form = unidecode.unidecode(parsed.group("expanded"))

    # rebuild a candidate acronym from the uppercase letters of the long form
    rebuilt = "".join(filter(str.isupper, long_form))

    # a mismatch means the part before the parenthesis is not an acronym
    return short_form if short_form == rebuilt else None
def guess_umls_version(path: Union[str, Path]) -> str:
    """Try to infer UMLS version (ex: "2021AB") from any UMLS-related path

    Parameters
    ----------
    path:
        Path to the root directory of the UMLS install or any file inside that
        directory

    Returns
    -------
    str
        UMLS version, estimated by finding the leaf-most folder in `path` that
        is not "META", "NET" nor "LEX", nor a subfolder of these folders
    """
    reserved_dirs = {"META", "NET", "LEX"}

    candidate = Path(path).resolve()
    if candidate.is_file():
        candidate = candidate.parent

    # climb until no component of the path is one of the reserved folders
    while not reserved_dirs.isdisjoint(candidate.parts):
        candidate = candidate.parent

    return candidate.name