# Source code for medkit.io.spacy

"""
This module needs extra-dependencies not installed as core dependencies of medkit.
To install them, use `pip install medkit-lib[spacy]`.
"""

# Public API of this module: only the two converter classes defined below
# are exported on a star-import.
__all__ = ["SpacyInputConverter", "SpacyOutputConverter"]

from typing import List, Optional

from spacy import Language
from spacy.tokens import Doc

from medkit.core import OperationDescription, ProvTracer, generate_id
from medkit.core.text import TextDocument
from medkit.text.spacy.spacy_utils import (
    build_spacy_doc_from_medkit_doc,
    extract_anns_and_attrs_from_spacy_doc,
)


class SpacyInputConverter:
    """Class in charge of converting spacy documents into a collection of TextDocuments."""

    def __init__(
        self,
        entities: Optional[List[str]] = None,
        span_groups: Optional[List[str]] = None,
        attrs: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """Initialize the spacy input converter

        Parameters
        ----------
        entities:
            Labels of spacy entities (`doc.ents`) to convert into medkit entities.
            If `None` (default) all spacy entities will be converted and added into
            the medkit document.
        span_groups:
            Name of groups of spacy spans (`doc.spans`) to convert into medkit segments.
            If `None` (default) all groups of spacy spans will be converted and added into
            the medkit document.
        attrs:
            Name of span extensions to convert into medkit attributes.
            If `None` (default) all non-None extensions will be added for each annotation
        uid:
            Identifier of the converter. If `None` (default) a new identifier
            is generated with `generate_id()`.
        """

        if uid is None:
            uid = generate_id()

        self.uid = uid
        # no provenance is recorded until `set_prov_tracer()` is called
        self._prov_tracer: Optional[ProvTracer] = None

        self.entities = entities
        self.span_groups = span_groups
        self.attrs = attrs

    @property
    def description(self) -> OperationDescription:
        """Description of this converter: its uid, its class name (used both
        as `name` and `class_name`) and its configuration (`entities`,
        `span_groups` and `attrs`)."""
        config = dict(
            entities=self.entities,
            span_groups=self.span_groups,
            attrs=self.attrs,
        )

        return OperationDescription(
            uid=self.uid,
            name=self.__class__.__name__,
            class_name=self.__class__.__name__,
            config=config,
        )

    def set_prov_tracer(self, prov_tracer: ProvTracer):
        """Set the tracer used to record provenance of the annotations and
        attributes created by this converter.

        Parameters
        ----------
        prov_tracer:
            The provenance tracer to use in subsequent calls to `load()`
        """
        self._prov_tracer = prov_tracer

    def load(self, spacy_docs: List[Doc]) -> List[TextDocument]:
        """
        Create a list of TextDocuments from a list of spacy Doc objects.
        Depending on the configuration of the converter, the selected annotations
        and attributes are included in the documents.

        Parameters
        ----------
        spacy_docs:
            A list of spacy documents to convert

        Returns
        -------
        List[TextDocument]
            A list of TextDocuments, one per spacy document and in the same order
        """
        medkit_docs = []
        for spacy_doc in spacy_docs:
            # create a new medkit document (TextDocument object)
            medkit_doc = TextDocument(text=spacy_doc.text_with_ws)
            for ann in self._load_anns(spacy_doc):
                medkit_doc.anns.add(ann)
            medkit_docs.append(medkit_doc)

        return medkit_docs

    def _load_anns(self, spacy_doc: Doc):
        """Extract the configured annotations from a spacy document, attach
        their attributes and record provenance if a tracer has been set.

        Parameters
        ----------
        spacy_doc:
            The spacy document to extract annotations and attributes from

        Returns
        -------
        The annotations returned by `extract_anns_and_attrs_from_spacy_doc`,
        each with its extracted attributes added.
        """
        annotations, attributes_by_ann = extract_anns_and_attrs_from_spacy_doc(
            spacy_doc=spacy_doc,
            medkit_source_ann=None,
            entities=self.entities,
            span_groups=self.span_groups,
            attrs=self.attrs,
            rebuild_medkit_anns_and_attrs=True,
        )

        # add annotations
        for ann in annotations:
            if self._prov_tracer is not None:
                # the input converter does not know the source data item
                self._prov_tracer.add_prov(ann, self.description, source_data_items=[])

            # attributes are grouped by the uid of the annotation they belong to;
            # an annotation without attributes has no entry
            if ann.uid in attributes_by_ann:
                for attr in attributes_by_ann[ann.uid]:
                    ann.attrs.add(attr)
                    if self._prov_tracer is not None:
                        # the input converter does not know the source data item
                        self._prov_tracer.add_prov(
                            attr, self.description, source_data_items=[]
                        )
        return annotations


class SpacyOutputConverter:
    """Class in charge of converting a list of TextDocuments into a
    list of spacy documents"""

    def __init__(
        self,
        nlp: Language,
        apply_nlp_spacy: bool = False,
        labels_anns: Optional[List[str]] = None,
        attrs: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """Initialize the spacy output converter

        Parameters
        ----------
        nlp:
            Language object with the loaded pipeline from Spacy
        apply_nlp_spacy:
            If True, each component of `nlp` pipeline is applied to the new spacy document.
            Some features, such as 'POS TAG', are added by a component of the pipeline, this
            parameter should be True, in order to add such attributes.
            If False, the `nlp` pipeline is not applied in the spacy document, so the document
            contains only the annotations and attributes transferred by medkit.
        labels_anns:
            Labels of medkit annotations to include in the spacy document.
            If `None` (default) all the annotations will be included.
        attrs:
            Labels of medkit attributes to add in the annotations that will be included.
            If `None` (default) all the attributes will be added as `custom attributes`
            in each annotation included.
        uid:
            Identifier of the converter. If `None` (default) a new identifier
            is generated with `generate_id()`.

        """
        if uid is None:
            uid = generate_id()

        self.uid = uid
        self._prov_tracer: Optional[ProvTracer] = None

        self.nlp = nlp
        self.labels_anns = labels_anns
        self.attrs = attrs
        self.apply_nlp_spacy = apply_nlp_spacy

    @property
    def description(self) -> OperationDescription:
        """Description of this converter: its uid, its class name (used both
        as `name` and `class_name`) and its configuration."""
        # medkit does not support serialisation of nlp objects,
        # however version information like model name, author etc. is stored
        config = dict(
            nlp_metadata=self.nlp.meta,
            labels_anns=self.labels_anns,
            attrs=self.attrs,
            apply_nlp_spacy=self.apply_nlp_spacy,
        )
        # `name` is passed explicitly, like SpacyInputConverter.description does
        return OperationDescription(
            uid=self.uid,
            name=self.__class__.__name__,
            class_name=self.__class__.__name__,
            config=config,
        )

    def convert(self, medkit_docs: List[TextDocument]) -> List[Doc]:
        """
        Convert a list of TextDocuments into a list of spacy Doc objects.
        Depending on the configuration of the converter, the selected annotations
        and attributes are included in the documents.

        Parameters
        ----------
        medkit_docs:
            A list of TextDocuments to convert

        Returns
        -------
        List[Doc]
            A list of spacy Doc objects, one per TextDocument and in the same order
        """

        spacy_docs = []
        for medkit_doc in medkit_docs:
            # create a spacy document from medkit with the selected annotations
            spacy_doc = build_spacy_doc_from_medkit_doc(
                nlp=self.nlp,
                medkit_doc=medkit_doc,
                labels_anns=self.labels_anns,
                attrs=self.attrs,
                include_medkit_info=False,
            )
            # each component of nlp spacy is applied
            if self.apply_nlp_spacy:
                spacy_doc = self.nlp(spacy_doc)

            spacy_docs.append(spacy_doc)

        return spacy_docs