Source code for medkit.text.spacy.pipeline

__all__ = ["SpacyPipeline"]
from typing import Dict, Callable, List, Optional

from spacy import Language
from spacy.tokens import Doc, Span as SpacySpan

from medkit.core import Attribute
from medkit.core.operation import Operation
from medkit.core.text import Segment
from medkit.text.spacy import spacy_utils


class SpacyPipeline(Operation):
    """Operation that annotates medkit segments by running a spacy pipeline.

    Each input segment is turned into a spacy document, processed by the
    spacy pipeline, and the resulting spacy annotations are converted back
    into medkit segments (with their attributes).
    """

    def __init__(
        self,
        nlp: Language,
        spacy_entities: Optional[List[str]] = None,
        spacy_span_groups: Optional[List[str]] = None,
        spacy_attrs: Optional[List[str]] = None,
        medkit_attribute_factories: Optional[
            Dict[str, Callable[[SpacySpan, str], Attribute]]
        ] = None,
        name: Optional[str] = None,
        uid: Optional[str] = None,
    ):
        """Set up the annotator around a loaded spacy pipeline.

        Parameters
        ----------
        nlp:
            Spacy `Language` object holding the loaded pipeline.
        spacy_entities:
            Labels of the new spacy entities (`doc.ents`) to convert into
            medkit entities. If `None` (default), all the new spacy
            entities are converted.
        spacy_span_groups:
            Names of the new spacy span groups (`doc.spans`) to convert
            into medkit segments. If `None` (default), the new spacy span
            groups are converted.
        spacy_attrs:
            Names of the span extensions to convert into medkit
            attributes. If `None` (default), all non-None extensions are
            added for each annotation with a medkit ID.
        medkit_attribute_factories:
            Mapping from attribute label to the factory in charge of
            converting that spacy attribute into a medkit attribute.
            A factory is called with a spacy span and an attribute label.
        name:
            Name describing the pipeline (defaults to the class name).
        uid:
            Identifier of the pipeline.
        """
        # Forward every constructor argument (except self) to the base class.
        # This has to stay the first statement: locals() captures whatever
        # local names exist at this point, so binding anything earlier would
        # forward it to the base class as well.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.nlp = nlp
        self.spacy_entities = spacy_entities
        self.spacy_span_groups = spacy_span_groups
        self.spacy_attrs = spacy_attrs
        self.medkit_attribute_factories = medkit_attribute_factories

    def run(self, segments: List[Segment]) -> List[Segment]:
        """Apply the spacy pipeline to each input segment.

        Every segment is converted to a spacy document (`Doc` object), the
        spacy pipeline is executed on it, and the new annotations and
        attributes found in the document are converted into medkit
        annotations.

        Parameters
        ----------
        segments:
            List of segments on which to run the spacy pipeline.

        Returns
        -------
        List[Segment]:
            List of the new annotations, in input-segment order.
        """
        results = []
        for source_segment in segments:
            # TODO: transfer of annotations and attributes attached to
            # a segment are not currently supported, no anns are included
            doc = spacy_utils.build_spacy_doc_from_medkit_segment(
                nlp=self.nlp,
                segment=source_segment,
                annotations=[],
                attrs=[],
                include_medkit_info=True,
            )
            # run the spacy pipeline on the freshly built document
            doc = self.nlp(doc)
            # the helper is a generator: extend() drains it right away
            results.extend(
                self._find_segments_in_spacy_doc(
                    spacy_doc=doc, medkit_source_ann=source_segment
                )
            )
        return results

    def _find_segments_in_spacy_doc(self, spacy_doc: Doc, medkit_source_ann: Segment):
        """Yield the medkit segments extracted from a processed spacy doc.

        The attributes extracted for a segment are attached to it before
        it is yielded. When a provenance tracer is set, provenance is
        recorded for each new segment and each of its attributes, with
        `medkit_source_ann` as the source data item.

        Parameters
        ----------
        spacy_doc:
            Spacy document from which annotations are extracted.
        medkit_source_ann:
            Medkit segment from which `spacy_doc` was built.

        Yields
        ------
        Each new segment, once its attributes have been added.
        """

        def record_provenance(data_item):
            # no-op unless a provenance tracer has been set
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    data_item,
                    self.description,
                    source_data_items=[medkit_source_ann],
                )

        # collect the new annotations and their attributes (keyed by ann uid)
        extracted, attrs_by_ann_id = spacy_utils.extract_anns_and_attrs_from_spacy_doc(
            spacy_doc=spacy_doc,
            medkit_source_ann=medkit_source_ann,
            entities=self.spacy_entities,
            span_groups=self.spacy_span_groups,
            attrs=self.spacy_attrs,
            attribute_factories=self.medkit_attribute_factories,
            rebuild_medkit_anns_and_attrs=False,
        )

        for found in extracted:
            record_provenance(found)

            # segments without an entry in the mapping get no attributes
            for attribute in attrs_by_ann_id.get(found.uid, ()):
                found.attrs.add(attribute)
                record_provenance(attribute)

            yield found