__all__ = ["SpacyPipeline"]
from typing import Dict, Callable, List, Optional
from spacy import Language
from spacy.tokens import Doc, Span as SpacySpan
from medkit.core import Attribute
from medkit.core.operation import Operation
from medkit.core.text import Segment
from medkit.text.spacy import spacy_utils
class SpacyPipeline(Operation):
    """Segment annotator relying on a Spacy pipeline"""

    def __init__(
        self,
        nlp: Language,
        spacy_entities: Optional[List[str]] = None,
        spacy_span_groups: Optional[List[str]] = None,
        spacy_attrs: Optional[List[str]] = None,
        medkit_attribute_factories: Optional[
            Dict[str, Callable[[SpacySpan, str], Attribute]]
        ] = None,
        name: Optional[str] = None,
        uid: Optional[str] = None,
    ):
        """Initialize the segment annotator

        Parameters
        ----------
        nlp:
            Language object with the loaded pipeline from Spacy
        spacy_entities:
            Labels of new spacy entities (`doc.ents`) to convert into medkit
            entities. If `None` (default) all the new spacy entities will be
            converted
        spacy_span_groups:
            Names of new spacy span groups (`doc.spans`) to convert into
            medkit segments. If `None` (default) all the new spacy span
            groups will be converted
        spacy_attrs:
            Names of span extensions to convert into medkit attributes.
            If `None` (default) all non-None extensions will be added for
            each annotation with a medkit ID.
        medkit_attribute_factories:
            Mapping of factories in charge of converting spacy attributes to
            medkit attributes. Factories will receive a spacy span and an
            attribute label when called. The key in the mapping is the
            attribute label.
        name:
            Name describing the pipeline (defaults to the class name).
        uid:
            Identifier of the pipeline
        """
        # Pass all arguments to super (remove self).
        # `locals()` has to be captured before any other local name is bound,
        # otherwise that name would be forwarded to the parent as well.
        # NOTE(review): since this method uses zero-argument `super()`,
        # `locals()` presumably also holds the implicit `__class__` cell, which
        # is then forwarded too -- confirm `Operation.__init__` tolerates it.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.nlp = nlp
        self.spacy_entities = spacy_entities
        self.spacy_span_groups = spacy_span_groups
        self.spacy_attrs = spacy_attrs
        self.medkit_attribute_factories = medkit_attribute_factories

    def run(self, segments: List[Segment]) -> List[Segment]:
        """Run a spacy pipeline on a list of segments provided as input
        and return a new list of segments.

        Each segment is converted to a spacy document (Doc object).
        Then, the spacy pipeline is executed and finally, the new
        annotations and attributes are converted into medkit annotations.

        Parameters
        ----------
        segments:
            List of segments on which to run the spacy pipeline

        Returns
        -------
        List[Segment]:
            List of new annotations, in the order of the input segments
        """
        output_segments = []
        for segment in segments:
            # build spacy doc
            # TODO: transfer of annotations and attributes attached to
            # a segment are not currently supported, no anns are included
            spacy_doc = spacy_utils.build_spacy_doc_from_medkit_segment(
                nlp=self.nlp,
                segment=segment,
                annotations=[],
                attrs=[],
                include_medkit_info=True,
            )
            # apply the spacy pipeline to the freshly built doc
            spacy_doc = self.nlp(spacy_doc)
            # the helper is a generator; `extend` consumes it entirely here
            output_segments.extend(
                self._find_segments_in_spacy_doc(
                    spacy_doc=spacy_doc, medkit_source_ann=segment
                )
            )
        return output_segments

    def _find_segments_in_spacy_doc(self, spacy_doc: Doc, medkit_source_ann: Segment):
        """Yield the medkit segments extracted from a processed spacy doc.

        This is a generator: nothing is extracted until it is iterated.
        Each yielded segment has received the attributes that the extraction
        step associated with its uid. When `self._prov_tracer` is not `None`,
        provenance is recorded for the segment and for each of its new
        attributes, with `medkit_source_ann` as the only source data item.

        Parameters
        ----------
        spacy_doc:
            Spacy document on which the pipeline has already been run
        medkit_source_ann:
            Medkit segment from which `spacy_doc` was built

        Yields
        ------
        Segment:
            Each new segment found in `spacy_doc`
        """
        # get new annotations and attributes
        segments, attrs_by_ann_id = spacy_utils.extract_anns_and_attrs_from_spacy_doc(
            spacy_doc=spacy_doc,
            medkit_source_ann=medkit_source_ann,
            entities=self.spacy_entities,
            span_groups=self.spacy_span_groups,
            attrs=self.spacy_attrs,
            attribute_factories=self.medkit_attribute_factories,
            rebuild_medkit_anns_and_attrs=False,
        )

        for new_segment in segments:
            # add provenance
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    new_segment,
                    self.description,
                    source_data_items=[medkit_source_ann],
                )

            # add attributes (segments without an entry simply get none)
            for attr in attrs_by_ann_id.get(new_segment.uid, ()):
                new_segment.attrs.add(attr)
                if self._prov_tracer is not None:
                    self._prov_tracer.add_prov(
                        attr,
                        self.description,
                        source_data_items=[medkit_source_ann],
                    )

            yield new_segment