Source code for medkit.text.spacy.pipeline

from __future__ import annotations

__all__ = ["SpacyPipeline"]

from typing import TYPE_CHECKING, Callable

from medkit.core.operation import Operation
from medkit.text.spacy import spacy_utils

if TYPE_CHECKING:
    from spacy import Language
    from spacy.tokens import Doc
    from spacy.tokens import Span as SpacySpan

    from medkit.core import Attribute
    from medkit.core.text import Segment


class SpacyPipeline(Operation):
    """Segment annotator relying on a Spacy pipeline"""

    def __init__(
        self,
        nlp: Language,
        spacy_entities: list[str] | None = None,
        spacy_span_groups: list[str] | None = None,
        spacy_attrs: list[str] | None = None,
        medkit_attribute_factories: dict[str, Callable[[SpacySpan, str], Attribute]] | None = None,
        name: str | None = None,
        uid: str | None = None,
    ):
        """Set up the annotator around an already loaded spacy pipeline.

        Parameters
        ----------
        nlp : Language
            Spacy `Language` object holding the loaded pipeline.
        spacy_entities : list of str, optional
            Labels of the new spacy entities (`doc.ents`) to convert into
            medkit entities. If `None` (default), all the new spacy entities
            are converted.
        spacy_span_groups : list of str, optional
            Names of the new spacy span groups (`doc.spans`) to convert into
            medkit segments. If `None` (default), new spacy span groups are
            converted.
        spacy_attrs : list of str, optional
            Names of the span extensions to convert into medkit attributes.
            If `None` (default), all non-None extensions are added for each
            annotation with a medkit ID.
        medkit_attribute_factories : dict of str to Callable, optional
            Factories in charge of converting spacy attributes into medkit
            attributes, keyed by attribute label. A factory is called with a
            spacy span and an attribute label.
        name : str, optional
            Name describing the pipeline (defaults to the class name).
        uid : str, optional
            Identifier of the pipeline.
        """
        # Forward every constructor argument (minus self) to the parent class.
        # The snapshot must be taken before any other local name is bound,
        # otherwise that name would be forwarded as well.
        init_args = locals()
        del init_args["self"]
        super().__init__(**init_args)

        self.nlp = nlp
        self.spacy_entities = spacy_entities
        self.spacy_span_groups = spacy_span_groups
        self.spacy_attrs = spacy_attrs
        self.medkit_attribute_factories = medkit_attribute_factories

    def run(self, segments: list[Segment]) -> list[Segment]:
        """Apply the spacy pipeline to each input segment and collect the results.

        Every segment is turned into a spacy document (`Doc` object), the
        spacy pipeline is executed on it, and the new annotations and
        attributes found in the processed document are converted into medkit
        annotations.

        Parameters
        ----------
        segments : list of Segment
            Segments on which to run the spacy pipeline.

        Returns
        -------
        list of Segment
            The new annotations, in the order of their source segments.
        """
        collected = []
        for source_segment in segments:
            # TODO: transfer of annotations and attributes attached to
            # a segment are not currently supported, no anns are included
            doc = spacy_utils.build_spacy_doc_from_medkit_segment(
                nlp=self.nlp,
                segment=source_segment,
                annotations=[],
                attrs=[],
                include_medkit_info=True,
            )
            # run the spacy pipeline on the freshly built doc
            doc = self.nlp(doc)
            # extend() consumes the generator, so provenance is recorded here
            collected.extend(
                self._find_segments_in_spacy_doc(
                    spacy_doc=doc,
                    medkit_source_ann=source_segment,
                )
            )
        return collected

    def _find_segments_in_spacy_doc(self, spacy_doc: Doc, medkit_source_ann: Segment):
        """Yield the new medkit segments extracted from a processed spacy doc.

        This is a generator: attributes are attached and provenance is
        recorded lazily, as each segment is requested by the caller.

        Parameters
        ----------
        spacy_doc : Doc
            Spacy document on which the pipeline has been executed.
        medkit_source_ann : Segment
            Medkit segment the document was built from; it is registered as
            the provenance source of every new segment and attribute.

        Yields
        ------
        Segment
            A new segment, with its extracted attributes already added.
        """
        found_segments, attrs_by_ann_id = spacy_utils.extract_anns_and_attrs_from_spacy_doc(
            spacy_doc=spacy_doc,
            medkit_source_ann=medkit_source_ann,
            entities=self.spacy_entities,
            span_groups=self.spacy_span_groups,
            attrs=self.spacy_attrs,
            attribute_factories=self.medkit_attribute_factories,
            rebuild_medkit_anns_and_attrs=False,
        )

        for found in found_segments:
            tracer = self._prov_tracer
            tracing = tracer is not None

            # provenance of the segment itself
            if tracing:
                tracer.add_prov(
                    found,
                    self.description,
                    source_data_items=[medkit_source_ann],
                )

            # attach the attributes extracted for this segment, if any,
            # and record their provenance as well
            for attr in attrs_by_ann_id.get(found.uid, ()):
                found.attrs.add(attr)
                if tracing:
                    tracer.add_prov(
                        attr,
                        self.description,
                        source_data_items=[medkit_source_ann],
                    )

            yield found