Source code for medkit.io.spacy
"""
This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[spacy]`.
"""
__all__ = ["SpacyInputConverter", "SpacyOutputConverter"]
from typing import List, Optional
from spacy import Language
from spacy.tokens import Doc
from medkit.core import OperationDescription, ProvTracer, generate_id
from medkit.core.text import TextDocument
from medkit.text.spacy.spacy_utils import (
build_spacy_doc_from_medkit_doc,
extract_anns_and_attrs_from_spacy_doc,
)
class SpacyInputConverter:
    """Converter building medkit TextDocuments out of spacy Doc objects."""

    def __init__(
        self,
        entities: Optional[List[str]] = None,
        span_groups: Optional[List[str]] = None,
        attrs: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """Set up the spacy input converter.

        Parameters
        ----------
        entities:
            Labels of the spacy entities (`doc.ents`) to turn into medkit
            entities. When `None` (default), every spacy entity is converted
            and added to the resulting medkit document.
        span_groups:
            Names of the spacy span groups (`doc.spans`) to turn into medkit
            segments. When `None` (default), every span group is converted
            and added to the resulting medkit document.
        attrs:
            Names of the span extensions to turn into medkit attributes.
            When `None` (default), every non-None extension is added to
            each annotation.
        uid:
            Identifier of the converter. A new identifier is generated
            when omitted.
        """
        self.uid = generate_id() if uid is None else uid
        # no provenance is recorded until `set_prov_tracer()` is called
        self._prov_tracer: Optional[ProvTracer] = None

        self.entities = entities
        self.span_groups = span_groups
        self.attrs = attrs

    @property
    def description(self) -> OperationDescription:
        """Description of this converter: uid, class name and configuration."""
        converter_name = self.__class__.__name__
        settings = {
            "entities": self.entities,
            "span_groups": self.span_groups,
            "attrs": self.attrs,
        }
        return OperationDescription(
            uid=self.uid,
            name=converter_name,
            class_name=converter_name,
            config=settings,
        )

    def set_prov_tracer(self, prov_tracer: ProvTracer):
        """Register the tracer that will receive provenance information."""
        self._prov_tracer = prov_tracer

    def load(self, spacy_docs: List[Doc]) -> List[TextDocument]:
        """
        Build one TextDocument per spacy Doc object.

        Which annotations and attributes end up in the documents depends on
        the configuration of the converter.

        Parameters
        ----------
        spacy_docs:
            The spacy documents to convert

        Returns
        -------
        List[TextDocument]
            The converted documents, in the same order as `spacy_docs`
        """
        return [self._to_medkit_doc(source_doc) for source_doc in spacy_docs]

    def _to_medkit_doc(self, spacy_doc: Doc) -> TextDocument:
        """Create the TextDocument matching a single spacy document."""
        converted_doc = TextDocument(text=spacy_doc.text_with_ws)
        for annotation in self._load_anns(spacy_doc):
            converted_doc.anns.add(annotation)
        return converted_doc

    def _record_prov(self, data_item):
        """Declare `data_item` to the tracer, if one was set."""
        if self._prov_tracer is None:
            return
        # the input converter does not know the source data item
        self._prov_tracer.add_prov(data_item, self.description, source_data_items=[])

    def _load_anns(self, spacy_doc: Doc):
        """Extract the selected annotations, with their attributes attached."""
        extracted_anns, attrs_by_ann_uid = extract_anns_and_attrs_from_spacy_doc(
            spacy_doc=spacy_doc,
            medkit_source_ann=None,
            entities=self.entities,
            span_groups=self.span_groups,
            attrs=self.attrs,
            rebuild_medkit_anns_and_attrs=True,
        )

        for annotation in extracted_anns:
            self._record_prov(annotation)

            # annotations without extracted attributes need nothing more
            if annotation.uid not in attrs_by_ann_uid:
                continue

            for attribute in attrs_by_ann_uid[annotation.uid]:
                annotation.attrs.add(attribute)
                self._record_prov(attribute)

        return extracted_anns
class SpacyOutputConverter:
    """Class in charge of converting a list of TextDocuments into a
    list of spacy documents"""

    def __init__(
        self,
        nlp: Language,
        apply_nlp_spacy: bool = False,
        labels_anns: Optional[List[str]] = None,
        attrs: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """Initialize the spacy output converter

        Parameters
        ----------
        nlp:
            Language object with the loaded pipeline from Spacy
        apply_nlp_spacy:
            If True, each component of `nlp` pipeline is applied to the new spacy document.
            Some features, such as 'POS TAG', are added by a component of the pipeline, this
            parameter should be True, in order to add such attributes.
            If False, the `nlp` pipeline is not applied in the spacy document, so the document
            contains only the annotations and attributes transferred by medkit.
        labels_anns:
            Labels of medkit annotations to include in the spacy document.
            If `None` (default) all the annotations will be included.
        attrs:
            Labels of medkit attributes to add in the annotations that will be included.
            If `None` (default) all the attributes will be added as `custom attributes`
            in each annotation included.
        uid:
            Identifier of the converter. A new identifier is generated when omitted.
        """
        if uid is None:
            uid = generate_id()

        self.uid = uid
        self._prov_tracer: Optional[ProvTracer] = None

        self.nlp = nlp
        self.labels_anns = labels_anns
        self.attrs = attrs
        self.apply_nlp_spacy = apply_nlp_spacy

    @property
    def description(self) -> OperationDescription:
        """Description of this converter: uid, class name and configuration."""
        # medkit does not support serialisation of nlp objects,
        # however version information like model name, author etc. is stored
        config = dict(
            nlp_metadata=self.nlp.meta,
            labels_anns=self.labels_anns,
            attrs=self.attrs,
            apply_nlp_spacy=self.apply_nlp_spacy,
        )
        # `name` is passed explicitly, as SpacyInputConverter.description does,
        # so that both converters build their description the same way
        return OperationDescription(
            uid=self.uid,
            name=self.__class__.__name__,
            class_name=self.__class__.__name__,
            config=config,
        )

    def convert(self, medkit_docs: List[TextDocument]) -> List[Doc]:
        """
        Convert a list of TextDocuments into a list of spacy Doc objects.

        Depending on the configuration of the converter, the selected annotations
        and attributes are included in the documents.

        Parameters
        ----------
        medkit_docs:
            A list of TextDocuments to convert

        Returns
        -------
        List[Doc]
            A list of spacy Doc objects, in the same order as `medkit_docs`
        """
        spacy_docs = []
        for medkit_doc in medkit_docs:
            # create a spacy document from medkit with the selected annotations
            spacy_doc = build_spacy_doc_from_medkit_doc(
                nlp=self.nlp,
                medkit_doc=medkit_doc,
                labels_anns=self.labels_anns,
                attrs=self.attrs,
                include_medkit_info=False,
            )
            # each component of nlp spacy is applied
            if self.apply_nlp_spacy:
                spacy_doc = self.nlp(spacy_doc)
            spacy_docs.append(spacy_doc)
        return spacy_docs