# Source code for medkit.io.spacy

"""This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[spacy]`.
"""
from __future__ import annotations

__all__ = ["SpacyInputConverter", "SpacyOutputConverter"]


from typing import TYPE_CHECKING

from medkit.core import OperationDescription, ProvTracer, generate_id
from medkit.core.text import TextDocument
from medkit.text.spacy.spacy_utils import (
    build_spacy_doc_from_medkit_doc,
    extract_anns_and_attrs_from_spacy_doc,
)

if TYPE_CHECKING:
    from spacy import Language
    from spacy.tokens import Doc


class SpacyInputConverter:
    """Converter turning spacy documents into medkit TextDocuments."""

    def __init__(
        self,
        entities: list[str] | None = None,
        span_groups: list[str] | None = None,
        attrs: list[str] | None = None,
        uid: str | None = None,
    ):
        """Set up the spacy input converter.

        Parameters
        ----------
        entities : list of str, optional
            Labels of the spacy entities (`doc.ents`) to turn into medkit
            entities. With `None` (default), every spacy entity is converted
            and added to the resulting medkit document.
        span_groups : list of str, optional
            Names of the spacy span groups (`doc.spans`) to turn into medkit
            segments. With `None` (default), every span group is converted
            and added to the resulting medkit document.
        attrs : list of str, optional
            Names of the span extensions to turn into medkit attributes.
            With `None` (default), every non-None extension is added to each
            annotation.
        uid : str, optional
            Identifier of the converter. A new one is generated when omitted.
        """
        self.entities = entities
        self.span_groups = span_groups
        self.attrs = attrs
        self.uid = generate_id() if uid is None else uid
        # No provenance is recorded until `set_prov_tracer` is called.
        self._prov_tracer: ProvTracer | None = None

    @property
    def description(self) -> OperationDescription:
        """Describe this converter instance (uid, names and configuration)."""
        own_name = self.__class__.__name__
        return OperationDescription(
            uid=self.uid,
            name=own_name,
            class_name=own_name,
            config={
                "entities": self.entities,
                "span_groups": self.span_groups,
                "attrs": self.attrs,
            },
        )

    def set_prov_tracer(self, prov_tracer: ProvTracer):
        """Register the tracer used to record provenance of loaded items."""
        self._prov_tracer = prov_tracer

    def load(self, spacy_docs: list[Doc]) -> list[TextDocument]:
        """Build one TextDocument per spacy Doc.

        The annotations and attributes selected by the converter's
        configuration are attached to each created document.

        Parameters
        ----------
        spacy_docs : list of Doc
            The spacy documents to convert

        Returns
        -------
        list of TextDocument
            The created documents, in the same order as `spacy_docs`
        """
        converted = []
        for source_doc in spacy_docs:
            # the medkit document holds the spacy text, whitespace included
            target_doc = TextDocument(text=source_doc.text_with_ws)
            for annotation in self._load_anns(source_doc):
                target_doc.anns.add(annotation)
            converted.append(target_doc)
        return converted

    def _load_anns(self, spacy_doc: Doc):
        """Extract the medkit annotations of `spacy_doc`, attributes attached.

        When a provenance tracer is set, each annotation and each attribute
        is registered with it.
        """
        extracted, attrs_by_ann_uid = extract_anns_and_attrs_from_spacy_doc(
            spacy_doc=spacy_doc,
            medkit_source_ann=None,
            entities=self.entities,
            span_groups=self.span_groups,
            attrs=self.attrs,
            rebuild_medkit_anns_and_attrs=True,
        )

        tracer = self._prov_tracer
        for annotation in extracted:
            if tracer is not None:
                # the input converter does not know the source data item
                tracer.add_prov(annotation, self.description, source_data_items=[])

            if annotation.uid not in attrs_by_ann_uid:
                continue

            for attribute in attrs_by_ann_uid[annotation.uid]:
                annotation.attrs.add(attribute)
                if tracer is not None:
                    # the input converter does not know the source data item
                    tracer.add_prov(attribute, self.description, source_data_items=[])

        return extracted
class SpacyOutputConverter:
    """Class in charge of converting a list of TextDocuments into a list of spacy documents."""

    def __init__(
        self,
        nlp: Language,
        apply_nlp_spacy: bool = False,
        labels_anns: list[str] | None = None,
        attrs: list[str] | None = None,
        uid: str | None = None,
    ):
        """Initialize the spacy output converter

        Parameters
        ----------
        nlp : Language
            Language object with the loaded pipeline from Spacy
        apply_nlp_spacy : bool, default=False
            If True, each component of `nlp` pipeline is applied to the new
            spacy document. Some features, such as 'POS TAG', are added by a
            component of the pipeline, so this parameter should be True in
            order to add such attributes.
            If False, the `nlp` pipeline is not applied to the spacy document,
            so the document contains only the annotations and attributes
            transferred by medkit.
        labels_anns : list of str, optional
            Labels of medkit annotations to include in the spacy document.
            If `None` (default) all the annotations will be included.
        attrs : list of str, optional
            Labels of medkit attributes to add in the annotations that will be
            included. If `None` (default) all the attributes will be added as
            `custom attributes` in each annotation included.
        uid : str, optional
            Identifier of the converter. A new one is generated when omitted.
        """
        if uid is None:
            uid = generate_id()

        self.uid = uid
        self._prov_tracer: ProvTracer | None = None
        self.nlp = nlp
        self.labels_anns = labels_anns
        self.attrs = attrs
        self.apply_nlp_spacy = apply_nlp_spacy

    @property
    def description(self) -> OperationDescription:
        """Describe this converter instance (uid, names and configuration)."""
        # medkit does not support serialisation of nlp objects,
        # however version information like model name, author etc. is stored
        config = {
            "nlp_metadata": self.nlp.meta,
            "labels_anns": self.labels_anns,
            "attrs": self.attrs,
            "apply_nlp_spacy": self.apply_nlp_spacy,
        }
        class_name = self.__class__.__name__
        # `name` is passed explicitly, as SpacyInputConverter.description does;
        # it was previously omitted here.
        return OperationDescription(
            uid=self.uid,
            name=class_name,
            class_name=class_name,
            config=config,
        )

    def convert(self, medkit_docs: list[TextDocument]) -> list[Doc]:
        """Convert a list of TextDocuments into a list of spacy Doc objects.

        Depending on the configuration of the converter, the selected
        annotations and attributes are included in the documents.

        Parameters
        ----------
        medkit_docs : list of TextDocument
            A list of TextDocuments to convert

        Returns
        -------
        list of Doc
            A list of spacy Doc objects, in the same order as `medkit_docs`
        """
        spacy_docs = []
        for medkit_doc in medkit_docs:
            # create a spacy document from medkit with the selected annotations
            spacy_doc = build_spacy_doc_from_medkit_doc(
                nlp=self.nlp,
                medkit_doc=medkit_doc,
                labels_anns=self.labels_anns,
                attrs=self.attrs,
                include_medkit_info=False,
            )
            # each component of nlp spacy is applied
            if self.apply_nlp_spacy:
                spacy_doc = self.nlp(spacy_doc)
            spacy_docs.append(spacy_doc)

        return spacy_docs