# Source code for medkit.text.spacy.edsnlp

"""
This package needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit[edsnlp]`.
"""

__all__ = [
    "EDSNLPPipeline",
    "EDSNLPDocPipeline",
    "build_date_attribute",
    "build_value_attribute",
    "build_score_attribute",
    "build_context_attribute",
    "build_history_attribute",
    "DEFAULT_ATTRIBUTE_FACTORIES",
]

from typing import Callable, Dict, Optional, List

from edsnlp.pipelines.ner.adicap.models import AdicapCode as EDSNLP_AdicapCode
from edsnlp.pipelines.ner.scores.tnm.models import TNM as EDSNLP_TNM
from edsnlp.pipelines.misc.dates.models import (
    AbsoluteDate as EDSNLP_AbsoluteDate,
    RelativeDate as EDSNLP_RelativeDate,
    Duration as EDSNLP_Duration,
    Direction as EDSNLP_Direction,
)
from edsnlp.pipelines.misc.measurements.measurements import (
    SimpleMeasurement as EDSNLP_SimpleMeasurement,
)
from spacy import Language
from spacy.tokens import Span as SpacySpan
from spacy.tokens.underscore import Underscore

from medkit.core import Attribute
from medkit.text.ner import (
    ADICAPNormAttribute,
    DateAttribute,
    RelativeDateAttribute,
    RelativeDateDirection,
    DurationAttribute,
)
from medkit.text.ner.tnm_attribute import TNMAttribute
from medkit.text.spacy import SpacyPipeline, SpacyDocPipeline


def build_date_attribute(spacy_span: SpacySpan, spacy_label: str) -> Attribute:
    """
    Convert an EDS-NLP date-like span extension into the matching medkit
    attribute.

    Parameters
    ----------
    spacy_span
        Spacy span having an EDS-NLP date attribute
    spacy_label
        Label of the date attribute on `spacy_span`.
        Ex: "date", "consultation_date"

    Returns
    -------
    Attribute
        :class:`~medkit.text.ner.DateAttribute`,
        :class:`~medkit.text.ner.RelativeDateAttribute` or
        :class:`~medkit.text.ner.DurationAttribute` instance, depending on
        the type of the EDS-NLP value

    Raises
    ------
    ValueError
        If the value is not an EDS-NLP absolute date, relative date or
        duration
    """
    edsnlp_date = spacy_span._.get(spacy_label)

    if isinstance(edsnlp_date, EDSNLP_AbsoluteDate):
        return DateAttribute(
            label=spacy_label,
            year=edsnlp_date.year,
            month=edsnlp_date.month,
            day=edsnlp_date.day,
            hour=edsnlp_date.hour,
            minute=edsnlp_date.minute,
            second=edsnlp_date.second,
        )

    if isinstance(edsnlp_date, EDSNLP_RelativeDate):
        # any direction other than PAST is mapped to FUTURE
        if edsnlp_date.direction is EDSNLP_Direction.PAST:
            medkit_direction = RelativeDateDirection.PAST
        else:
            medkit_direction = RelativeDateDirection.FUTURE
        return RelativeDateAttribute(
            label=spacy_label,
            direction=medkit_direction,
            years=edsnlp_date.year,
            months=edsnlp_date.month,
            weeks=edsnlp_date.week,
            days=edsnlp_date.day,
            hours=edsnlp_date.hour,
            minutes=edsnlp_date.minute,
            seconds=edsnlp_date.second,
        )

    if isinstance(edsnlp_date, EDSNLP_Duration):
        return DurationAttribute(
            label=spacy_label,
            years=edsnlp_date.year,
            months=edsnlp_date.month,
            weeks=edsnlp_date.week,
            days=edsnlp_date.day,
            hours=edsnlp_date.hour,
            minutes=edsnlp_date.minute,
            seconds=edsnlp_date.second,
        )

    raise ValueError(f"Unexpected value type: {type(edsnlp_date)}")
def build_value_attribute(spacy_span: SpacySpan, spacy_label: str) -> Attribute:
    """
    Convert an EDS-NLP "value" span extension holding a custom object into a
    medkit attribute:

    - an EDS-NLP `AdicapCode` object gives a
      :class:`~medkit.text.ner.ADICAPNormAttribute` instance;
    - an EDS-NLP `TNM` object gives a
      :class:`~medkit.text.ner.tnm_attribute.TNMAttribute` instance;
    - an EDS-NLP `SimpleMeasurement` object gives a
      :class:`~medkit.core.Attribute` instance, with the unit stored in its
      metadata.

    Any other type of value is rejected.

    Parameters
    ----------
    spacy_span
        Spacy span having an attribute with a custom object as value
    spacy_label
        Label of the attribute on `spacy_span`. Ex: "value"

    Returns
    -------
    Attribute
        Medkit attribute corresponding to the spacy attribute value

    Raises
    ------
    ValueError
        If the value is not one of the supported EDS-NLP types
    """
    edsnlp_value = spacy_span._.get(spacy_label)

    if isinstance(edsnlp_value, EDSNLP_AdicapCode):
        return ADICAPNormAttribute(
            code=edsnlp_value.code,
            sampling_mode=edsnlp_value.sampling_mode,
            technic=edsnlp_value.technic,
            organ=edsnlp_value.organ,
            pathology=edsnlp_value.pathology,
            pathology_type=edsnlp_value.pathology_type,
            behaviour_type=edsnlp_value.behaviour_type,
        )

    if isinstance(edsnlp_value, EDSNLP_TNM):
        return TNMAttribute(
            prefix=edsnlp_value.prefix,
            tumour=edsnlp_value.tumour,
            tumour_specification=edsnlp_value.tumour_specification,
            node=edsnlp_value.node,
            node_specification=edsnlp_value.node_specification,
            node_suffix=edsnlp_value.node_suffix,
            metastasis=edsnlp_value.metastasis,
            resection_completeness=edsnlp_value.resection_completeness,
            version=edsnlp_value.version,
            version_year=edsnlp_value.version_year,
        )

    if isinstance(edsnlp_value, EDSNLP_SimpleMeasurement):
        return Attribute(
            label=spacy_label,
            value=edsnlp_value.value,
            metadata={"unit": edsnlp_value.unit},
        )

    raise ValueError(f"Unexpected value type: {type(edsnlp_value)}")
def build_score_attribute(spacy_span: SpacySpan, spacy_label: str) -> Attribute:
    """
    Merge the EDS-NLP "score_name" and "score_value" span extensions into a
    single medkit attribute.

    Parameters
    ----------
    spacy_span
        Spacy span having "score_name" and "score_value" attributes
    spacy_label
        Must be "score_name"

    Returns
    -------
    Attribute
        Medkit attribute with the "score_name" value as label and the
        "score_value" value as value. When a "score_method" value is
        available, it is stored in the metadata under the "method" key.
    """
    assert spacy_label == "score_name"

    extensions = spacy_span._
    score_name = extensions.score_name
    score_value = extensions.score_value
    score_method = extensions.get("score_method")

    # only attach metadata when a method is actually known
    if score_method is None:
        metadata = None
    else:
        metadata = {"method": score_method}

    return Attribute(label=score_name, value=score_value, metadata=metadata)
def build_context_attribute(spacy_span: SpacySpan, spacy_label: str) -> Attribute:
    """
    Convert an EDS-NLP context/qualifying span extension into a medkit
    attribute, storing the text of the cues as metadata.

    Parameters
    ----------
    spacy_span
        Spacy span having a context/qualifying attribute
    spacy_label
        Label of the attribute on `spacy_span`.
        Ex: "negation", "hypothesis", etc

    Returns
    -------
    Attribute
        Medkit attribute corresponding to the spacy attribute. Its metadata
        holds the cue texts under the "cues" key, or is `None` when there
        are no cues.
    """
    extensions = spacy_span._
    qualifier_value = extensions.get(spacy_label)
    # cues live in a sibling extension named "<label>_cues"
    cue_spans = extensions.get(f"{spacy_label}_cues")

    metadata = None
    if cue_spans:
        metadata = {"cues": [cue.text for cue in cue_spans]}

    return Attribute(label=spacy_label, value=qualifier_value, metadata=metadata)
def build_history_attribute(spacy_span: SpacySpan, spacy_label: str) -> Attribute:
    """
    Convert the EDS-NLP "history" span extension into a medkit attribute,
    storing the text of the history and recent cues as metadata.

    Parameters
    ----------
    spacy_span
        Spacy span having a "history" attribute
    spacy_label
        Must be "history"

    Returns
    -------
    Attribute
        Medkit attribute labelled "history". Its metadata is a dict that may
        contain "history_cues" and "recent_cues" entries (lists of cue
        texts); an entry is omitted when the corresponding cues are `None`.
    """
    assert spacy_label == "history"

    extensions = spacy_span._
    history_value = extensions.history

    # fetch both kinds of cues first, then keep only those that are set
    cues_by_key = {
        key: extensions.get(key) for key in ("history_cues", "recent_cues")
    }
    metadata = {
        key: [cue.text for cue in cue_spans]
        for key, cue_spans in cues_by_key.items()
        if cue_spans is not None
    }

    return Attribute(label="history", value=history_value, metadata=metadata)
# Maps a spaCy span extension label to the function converting it into a
# medkit attribute. Each factory is called with the spaCy span and the label.
DEFAULT_ATTRIBUTE_FACTORIES = {
    # created by several components
    "value": build_value_attribute,
    # from eds.dates
    "date": build_date_attribute,
    # from eds.consultation_dates
    "consultation_date": build_date_attribute,
    # from eds.score and some subclasses
    "score_name": build_score_attribute,
    # from eds.family
    "family": build_context_attribute,
    # from eds.hypothesis
    "hypothesis": build_context_attribute,
    # from eds.negation
    "negation": build_context_attribute,
    # from eds.reported_speech
    "reported_speech": build_context_attribute,
    # from eds.history
    "history": build_history_attribute,
}
"""Pre-defined attribute factories to handle EDS-NLP attributes"""

# Span extension labels that the pipelines below skip when no explicit
# `spacy_attrs` list is provided.
# NOTE: every entry must end with a comma; adjacent string literals are
# silently concatenated by Python into a single (wrong) label.
_ATTR_LABELS_TO_IGNORE = {
    # text after spaCy pre-preprocessing
    "normalized_variant",
    # should be in metadata of entities matched by eds.contextual-matcher
    # but we don't support that
    "assigned",
    "source",
    # declared but unused attribute of eds.dates
    "datetime",
    # unsupported experimental feature of eds.dates
    "period",
    # ignored because each entity matched by eds.reason will also have its
    # own is_reason attribute
    "ents_reason",
    # redundant with value attr
    "adicap",
    # will be set as value of score_name attr
    "score_value",
    # added to metadata of score_name attr
    "score_method",
    # context/qualifying attributes with deprecated aliases and cues
    # included in metadata
    "family_",
    "family_cues",
    "history_",
    "history_cues",
    "recent_cues",
    "antecedents",
    "antecedents_",
    "antecedents_cues",
    "antecedent",
    "antecedent_",
    "antecedent_cues",
    "hypothesis_",
    "hypothesis_cues",
    "negation_",
    "negated",
    "polarity_",
    "negation_cues",
    "reported_speech_",
    "reported_speech_cues",
}
class EDSNLPPipeline(SpacyPipeline):
    """Segment annotator relying on an EDS-NLP pipeline"""

    def __init__(
        self,
        nlp: Language,
        spacy_entities: Optional[List[str]] = None,
        spacy_span_groups: Optional[List[str]] = None,
        spacy_attrs: Optional[List[str]] = None,
        medkit_attribute_factories: Optional[
            Dict[str, Callable[[SpacySpan, str], Attribute]]
        ] = None,
        name: Optional[str] = None,
        uid: Optional[str] = None,
    ):
        """Initialize the segment annotator

        Parameters
        ----------
        nlp:
            Language object with the loaded pipeline from Spacy
        spacy_entities:
            Labels of new spacy entities (`doc.ents`) to convert into medkit
            entities. If `None` (default) all the new spacy entities will be
            converted
        spacy_span_groups:
            Name of new spacy span groups (`doc.spans`) to convert into
            medkit segments. If `None` (default) all the new spacy span
            groups will be converted
        spacy_attrs:
            Name of span extensions to convert into medkit attributes.
            If `None`, every registered span extension that is not listed in
            `_ATTR_LABELS_TO_IGNORE` will be handled.
        medkit_attribute_factories:
            Mapping of factories in charge of converting spacy attributes to
            medkit attributes. Factories will receive a spacy span and an
            attribute label when called. The key in the mapping is the
            attribute label. The mapping is merged on top of the pre-defined
            factories listed in :const:`~DEFAULT_ATTRIBUTE_FACTORIES`, so
            user-provided entries take precedence.
        name:
            Name describing the pipeline (defaults to the class name).
        uid:
            Identifier of the pipeline
        """
        # user-provided factories override the pre-defined ones
        if medkit_attribute_factories is not None:
            factories = {
                **DEFAULT_ATTRIBUTE_FACTORIES,
                **medkit_attribute_factories,
            }
        else:
            factories = DEFAULT_ATTRIBUTE_FACTORIES

        attrs = spacy_attrs
        if attrs is None:
            # default to all span attributes except blacklisted ones
            attrs = []
            for extension_name in Underscore.span_extensions:
                if extension_name in _ATTR_LABELS_TO_IGNORE:
                    continue
                attrs.append(extension_name)

        super().__init__(
            nlp=nlp,
            spacy_entities=spacy_entities,
            spacy_span_groups=spacy_span_groups,
            spacy_attrs=attrs,
            medkit_attribute_factories=factories,
            name=name,
            uid=uid,
        )
class EDSNLPDocPipeline(SpacyDocPipeline):
    """
    DocPipeline to obtain annotations created using EDS-NLP
    """

    def __init__(
        self,
        nlp: Language,
        medkit_labels_anns: Optional[List[str]] = None,
        medkit_attrs: Optional[List[str]] = None,
        spacy_entities: Optional[List[str]] = None,
        spacy_span_groups: Optional[List[str]] = None,
        spacy_attrs: Optional[List[str]] = None,
        medkit_attribute_factories: Optional[
            Dict[str, Callable[[SpacySpan, str], Attribute]]
        ] = None,
        name: Optional[str] = None,
        uid: Optional[str] = None,
    ):
        """Initialize the pipeline

        Parameters
        ----------
        nlp:
            Language object with the loaded pipeline from Spacy
        medkit_labels_anns:
            Labels of medkit annotations to include in the spacy document.
            If `None` (default) all the annotations will be included.
        medkit_attrs:
            Labels of medkit attributes to add in the annotations that will
            be included. If `None` (default) all the attributes will be
            added as `custom attributes` in each annotation included.
        spacy_entities:
            Labels of new spacy entities (`doc.ents`) to convert into medkit
            entities. If `None` (default) all the new spacy entities will be
            converted and added into its origin medkit document.
        spacy_span_groups:
            Name of new spacy span groups (`doc.spans`) to convert into
            medkit segments. If `None` (default) all the new spacy span
            groups will be converted and added into its origin medkit
            document.
        spacy_attrs:
            Name of span extensions to convert into medkit attributes.
            If `None`, every registered span extension that is not listed in
            `_ATTR_LABELS_TO_IGNORE` will be handled.
        medkit_attribute_factories:
            Mapping of factories in charge of converting spacy attributes to
            medkit attributes. Factories will receive a spacy span and an
            attribute label when called. The key in the mapping is the
            attribute label. The mapping is merged on top of the pre-defined
            factories listed in :const:`~DEFAULT_ATTRIBUTE_FACTORIES`, so
            user-provided entries take precedence.
        name:
            Name describing the pipeline (defaults to the class name).
        uid:
            Identifier of the pipeline
        """
        # user-provided factories override the pre-defined ones
        if medkit_attribute_factories is not None:
            factories = {
                **DEFAULT_ATTRIBUTE_FACTORIES,
                **medkit_attribute_factories,
            }
        else:
            factories = DEFAULT_ATTRIBUTE_FACTORIES

        attrs = spacy_attrs
        if attrs is None:
            # default to all span attributes except blacklisted ones
            attrs = []
            for extension_name in Underscore.span_extensions:
                if extension_name in _ATTR_LABELS_TO_IGNORE:
                    continue
                attrs.append(extension_name)

        super().__init__(
            nlp=nlp,
            medkit_labels_anns=medkit_labels_anns,
            medkit_attrs=medkit_attrs,
            spacy_entities=spacy_entities,
            spacy_span_groups=spacy_span_groups,
            spacy_attrs=attrs,
            medkit_attribute_factories=factories,
            name=name,
            uid=uid,
        )