:py:mod:`medkit.core.text`
==========================

.. py:module:: medkit.core.text


Submodules
----------
.. toctree::
   :titlesonly:
   :maxdepth: 1

   annotation/index.rst
   annotation_container/index.rst
   document/index.rst
   entity_attribute_container/index.rst
   entity_norm_attribute/index.rst
   operation/index.rst
   span/index.rst
   span_utils/index.rst
   umls_norm_attribute/index.rst
   utils/index.rst


Package Contents
----------------

Classes
~~~~~~~

.. autoapisummary::

   medkit.core.text.Entity
   medkit.core.text.Relation
   medkit.core.text.Segment
   medkit.core.text.TextAnnotation
   medkit.core.text.TextAnnotationContainer
   medkit.core.text.TextDocument
   medkit.core.text.EntityAttributeContainer
   medkit.core.text.EntityNormAttribute
   medkit.core.text.ContextOperation
   medkit.core.text.CustomTextOpType
   medkit.core.text.NEROperation
   medkit.core.text.SegmentationOperation
   medkit.core.text.AnySpan
   medkit.core.text.ModifiedSpan
   medkit.core.text.Span
   medkit.core.text.UMLSNormAttribute



Functions
~~~~~~~~~

.. autoapisummary::

   medkit.core.text.create_text_operation



.. py:class:: Entity(label: str, text: str, spans: list[medkit.core.text.span.AnySpan], attrs: list[medkit.core.attribute.Attribute] | None = None, metadata: dict[str, Any] | None = None, uid: str | None = None, store: medkit.core.store.Store | None = None, attr_container_class: type[medkit.core.text.entity_attribute_container.EntityAttributeContainer] = EntityAttributeContainer)


   Bases: :py:obj:`Segment`

   
   Text entity referencing part of an :class:`~medkit.core.text.TextDocument`.














   :Attributes:

       **uid** : str
           The entity identifier.

       **label** : str
           The label for this entity (e.g., DISEASE)

       **text** : str
           Text of the entity.

       **spans** : list of AnySpan
           List of spans indicating which parts of the entity text correspond to
           which part of the document's full text.

       **attrs** : EntityAttributeContainer
           Attributes of the entity. Stored in a
           :class:`~medkit.core.text.EntityAttributeContainer` but can be passed
           as a list at init.

       **metadata** : dict of str to Any
           The metadata of the entity

       **keys** : set of str
           Pipeline output keys to which the entity belongs.


   ..
       !! processed by numpydoc !!
   .. py:attribute:: attrs
      :type: medkit.core.text.entity_attribute_container.EntityAttributeContainer

      


.. py:class:: Relation(label: str, source_id: str, target_id: str, attrs: list[medkit.core.attribute.Attribute] | None = None, metadata: dict[str, Any] | None = None, uid: str | None = None, store: medkit.core.store.Store | None = None, attr_container_class: type[medkit.core.attribute_container.AttributeContainer] = AttributeContainer)


   Bases: :py:obj:`TextAnnotation`

   
   Relation between two text entities.














   :Attributes:

       **uid** : str
           The identifier of the relation

       **label** : str
           The relation label

       **source_id** : str
           The identifier of the entity from which the relation is defined

       **target_id** : str
           The identifier of the entity to which the relation is defined

       **attrs** : AttributeContainer
           The attributes of the relation

       **metadata** : dict of str to Any
           The metadata of the relation

       **keys** : set of str
           Pipeline output keys to which the relation belongs


   ..
       !! processed by numpydoc !!
   .. py:attribute:: source_id
      :type: str

      

   .. py:attribute:: target_id
      :type: str

      

   .. py:method:: to_dict() -> dict[str, Any]


   .. py:method:: from_dict(relation_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:

      
      Create a Relation from a dict.


      :Parameters:

          **relation_dict** : dict of str to Any
              A dictionary from a serialized relation as generated by to_dict()














      ..
          !! processed by numpydoc !!


.. py:class:: Segment(label: str, text: str, spans: list[medkit.core.text.span.AnySpan], attrs: list[medkit.core.attribute.Attribute] | None = None, metadata: dict[str, Any] | None = None, uid: str | None = None, store: medkit.core.store.Store | None = None, attr_container_class: type[medkit.core.attribute_container.AttributeContainer] = AttributeContainer)


   Bases: :py:obj:`TextAnnotation`

   
   Text segment referencing part of an :class:`~medkit.core.text.TextDocument`.














   :Attributes:

       **uid** : str
           The segment identifier.

       **label** : str
           The label for this segment (e.g., SENTENCE)

       **text** : str
           Text of the segment.

       **spans** : list of AnySpan
           List of spans indicating which parts of the segment text correspond to
           which part of the document's full text.

       **attrs** : AttributeContainer
           Attributes of the segment. Stored in a
           :class:`~medkit.core.AttributeContainer` but can be passed as a list at
           init.

       **metadata** : dict of str to Any
           The metadata of the segment

       **keys** : set of str
           Pipeline output keys to which the segment belongs.


   ..
       !! processed by numpydoc !!
   .. py:attribute:: spans
      :type: list[medkit.core.text.span.AnySpan]

      

   .. py:attribute:: text
      :type: str

      

   .. py:method:: to_dict() -> dict[str, Any]


   .. py:method:: from_dict(segment_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:

      
      Create a Segment from a dict.


      :Parameters:

          **segment_dict** : dict of str to Any
              A dictionary from a serialized segment as generated by to_dict()














      ..
          !! processed by numpydoc !!


.. py:class:: TextAnnotation(label: str, attrs: list[medkit.core.attribute.Attribute] | None = None, metadata: dict[str, Any] | None = None, uid: str | None = None, attr_container_class: type[medkit.core.attribute_container.AttributeContainer] = AttributeContainer)


   Bases: :py:obj:`abc.ABC`, :py:obj:`medkit.core.dict_conv.SubclassMapping`

   
   Base abstract class for all text annotations.














   :Attributes:

       **uid** : str
           Unique identifier of the annotation.

       **label** : str
           The label for this annotation (e.g., SENTENCE)

       **attrs** : AttributeContainer
           Attributes of the annotation. Stored in a
           :class:`~medkit.core.AttributeContainer` but can be passed as a list at
           init.

       **metadata** : dict of str to Any
           The metadata of the annotation

       **keys** : set of str
           Pipeline output keys to which the annotation belongs.


   ..
       !! processed by numpydoc !!
   .. py:attribute:: uid
      :type: str

      

   .. py:attribute:: label
      :type: str

      

   .. py:attribute:: attrs
      :type: medkit.core.attribute_container.AttributeContainer

      

   .. py:attribute:: metadata
      :type: dict[str, Any]

      

   .. py:attribute:: keys
      :type: set[str]

      

   .. py:method:: __init_subclass__()
      :classmethod:


   .. py:method:: from_dict(ann_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:


   .. py:method:: to_dict() -> dict[str, Any]
      :abstractmethod:



.. py:class:: TextAnnotationContainer(doc_id: str, raw_segment: medkit.core.text.annotation.Segment)


   Bases: :py:obj:`medkit.core.annotation_container.AnnotationContainer`\ [\ :py:obj:`medkit.core.text.annotation.TextAnnotation`\ ]

   
   Manage a list of text annotations belonging to a text document.

   This behaves more or less like a list: calling `len()` and iterating are
   supported. Additional filtering is available through the `get()` method.

   Also provides retrieval of entities, segments, relations, and handling of
   raw segment.















   ..
       !! processed by numpydoc !!
   .. py:property:: segments
      :type: list[medkit.core.text.annotation.Segment]

      
      Return the list of segments.
















      ..
          !! processed by numpydoc !!

   .. py:property:: entities
      :type: list[medkit.core.text.annotation.Entity]

      
      Return the list of entities.
















      ..
          !! processed by numpydoc !!

   .. py:property:: relations
      :type: list[medkit.core.text.annotation.Relation]

      
      Return the list of relations.
















      ..
          !! processed by numpydoc !!

   .. py:method:: add(ann: medkit.core.text.annotation.TextAnnotation)

      
      Attach an annotation to the document.


      :Parameters:

          **ann** : TextAnnotation
              Annotation to add.





      :Raises:

          ValueError
              If the annotation is already attached to the document
              (based on `annotation.uid`)









      ..
          !! processed by numpydoc !!

   .. py:method:: get(*, label: str | None = None, key: str | None = None) -> list[medkit.core.text.annotation.TextAnnotation]

      
      Return a list of the annotations of the document.


      :Parameters:

          **label** : str, optional
              Label to use to filter annotations.

          **key** : str, optional
              Key to use to filter annotations.














      ..
          !! processed by numpydoc !!

   .. py:method:: get_by_id(uid) -> medkit.core.text.annotation.TextAnnotation

      
      Return the annotation corresponding to a specific identifier.


      :Parameters:

          **uid** : str
              Identifier of the annotation to return.














      ..
          !! processed by numpydoc !!

   .. py:method:: get_segments(*, label: str | None = None, key: str | None = None) -> list[medkit.core.text.annotation.Segment]

      
      Return a list of the segments of the document (not including entities).


      :Parameters:

          **label** : str, optional
              Label to use to filter segments.

          **key** : str, optional
              Key to use to filter segments.














      ..
          !! processed by numpydoc !!

   .. py:method:: get_entities(*, label: str | None = None, key: str | None = None) -> list[medkit.core.text.annotation.Entity]

      
      Return a list of the entities of the document.


      :Parameters:

          **label** : str, optional
              Label to use to filter entities.

          **key** : str, optional
              Key to use to filter entities.














      ..
          !! processed by numpydoc !!

   .. py:method:: get_relations(*, label: str | None = None, key: str | None = None, source_id: str | None = None) -> list[medkit.core.text.annotation.Relation]

      
      Return a list of the relations of the document.


      :Parameters:

          **label** : str, optional
              Label to use to filter relations.

          **key** : str, optional
              Key to use to filter relations.

          **source_id** : str, optional
              Identifier of the source entity to use to filter relations.














      ..
          !! processed by numpydoc !!


.. py:class:: TextDocument(text: str, anns: Sequence[medkit.core.text.annotation.TextAnnotation] | None = None, attrs: Sequence[medkit.core.Attribute] | None = None, metadata: dict[str, Any] | None = None, uid: str | None = None)


   Bases: :py:obj:`medkit.core.dict_conv.SubclassMapping`

   
   Document holding text annotations.

   Annotations must be subclasses of `TextAnnotation`.












   .. rubric:: Examples

   >>> doc = TextDocument(text="hello")
   >>> raw_text = doc.anns.get(label=TextDocument.RAW_LABEL)[0]

   :Attributes:

       **uid** : str
           Unique identifier of the document.

       **text** : str
           Full document text.

       **anns** : TextAnnotationContainer
           Annotations of the document. Stored in a
           :class:`~.text.TextAnnotationContainer` but can be passed as a list at init.

       **attrs** : AttributeContainer
           Attributes of the document. Stored in an
           :class:`~.core.AttributeContainer` but can be passed as a list at init.

       **metadata** : dict of str to Any
           Document metadata.

       **raw_segment** : Segment
           Auto-generated segment containing the full unprocessed document text.
           It can be passed as an annotation to processing operations; see the
           example above for how to retrieve it.


   ..
       !! processed by numpydoc !!
   .. py:property:: text
      :type: str


   .. py:attribute:: RAW_LABEL
      :type: ClassVar[str]
      :value: 'RAW_TEXT'

      

   .. py:attribute:: uid
      :type: str

      

   .. py:attribute:: anns
      :type: medkit.core.text.annotation_container.TextAnnotationContainer

      

   .. py:attribute:: attrs
      :type: medkit.core.AttributeContainer

      

   .. py:attribute:: metadata
      :type: dict[str, Any]

      

   .. py:attribute:: raw_segment
      :type: medkit.core.text.annotation.Segment

      

   .. py:method:: _generate_raw_segment(text: str, doc_id: str) -> medkit.core.text.annotation.Segment
      :classmethod:


   .. py:method:: __init_subclass__()
      :classmethod:


   .. py:method:: to_dict(with_anns: bool = True) -> dict[str, Any]


   .. py:method:: from_dict(doc_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:

      
      Create a TextDocument from a dict.


      :Parameters:

          **doc_dict** : dict of str to Any
              A dictionary from a serialized TextDocument as generated by to_dict()














      ..
          !! processed by numpydoc !!

   .. py:method:: from_file(path: os.PathLike, encoding: str = 'utf-8') -> typing_extensions.Self
      :classmethod:

      
      Create a document from a text file.


      :Parameters:

          **path** : Path
              Path of the text file

          **encoding** : str, default="utf-8"
              Text encoding to use

      :Returns:

          TextDocument
              Text document with contents of `path` as text. The file path is
              included in the document metadata.













      ..
          !! processed by numpydoc !!

   .. py:method:: from_dir(path: os.PathLike, pattern: str = '*.txt', encoding: str = 'utf-8') -> list[typing_extensions.Self]
      :classmethod:

      
      Create documents from text files in a directory.


      :Parameters:

          **path** : Path
              Path of the directory containing text files

          **pattern** : str
              Glob pattern to match text files in `path`

          **encoding** : str
              Text encoding to use

      :Returns:

          list of TextDocument
              Text documents with contents of each file as text













      ..
          !! processed by numpydoc !!

   .. py:method:: get_snippet(segment: medkit.core.text.annotation.Segment, max_extend_length: int) -> str

      
      Return a portion of the original text containing the annotation.


      :Parameters:

          **segment** : Segment
              The annotation

          **max_extend_length** : int
              Maximum number of characters to use around the annotation

      :Returns:

          str
              A portion of the text around the annotation













      ..
          !! processed by numpydoc !!


.. py:class:: EntityAttributeContainer(owner_id: str)


   Bases: :py:obj:`medkit.core.attribute_container.AttributeContainer`

   
   Manage a list of attributes attached to a text entity.

   This behaves more or less like a list: calling `len()` and iterating are
   supported. Additional filtering is available through the `get()` method.

   Also provides retrieval of normalization attributes.















   ..
       !! processed by numpydoc !!
   .. py:property:: norms
      :type: list[medkit.core.text.entity_norm_attribute.EntityNormAttribute]

      
      Return the list of normalization attributes.
















      ..
          !! processed by numpydoc !!

   .. py:method:: add(attr: medkit.core.attribute.Attribute)

      
      Attach an attribute to the annotation.


      :Parameters:

          **attr** : Attribute
              Attribute to add.





      :Raises:

          ValueError
              If the attribute is already attached to the annotation (based on
              `attr.uid`).









      ..
          !! processed by numpydoc !!

   .. py:method:: get_norms() -> list[medkit.core.text.entity_norm_attribute.EntityNormAttribute]

      
      Return a list of the normalization attributes of the annotation.
















      ..
          !! processed by numpydoc !!


.. py:class:: EntityNormAttribute(kb_name: str | None, kb_id: Any | None, kb_version: str | None = None, term: str | None = None, score: float | None = None, metadata: dict[str, Any] | None = None, uid: str | None = None)


   Bases: :py:obj:`medkit.core.attribute.Attribute`

   
   Normalization attribute linking an entity to an ID in a knowledge base.














   :Attributes:

       **uid** : str
           Identifier of the attribute

       **label** : str
           The attribute label, always set to :attr:`EntityNormAttribute.LABEL
           <.core.text.EntityNormAttribute.LABEL>`

       **value** : Any
           String representation of the normalization, containing `kb_id`, along
           with `kb_name` if available (ex: "umls:C0011849"). For special cases
           where only `term` is available, it is used as value.

       **kb_name** : str, optional
           Name of the knowledge base (ex: "icd"). Should always be provided except
           in special cases when we just want to store a normalized term.

       **kb_id** : Any, optional
           ID in the knowledge base to which the annotation should be linked.
           Should always be provided except in special cases when we just want to
           store a normalized term.

       **kb_version** : str, optional
           Optional version of the knowledge base.

       **term** : str, optional
           Optional normalized version of the entity text.

       **score** : float, optional
           Optional score reflecting confidence of this link.

       **metadata** : dict of str to Any
           Metadata of the attribute


   ..
       !! processed by numpydoc !!
   .. py:attribute:: kb_name
      :type: str | None

      

   .. py:attribute:: kb_id
      :type: Any | None

      

   .. py:attribute:: kb_version
      :type: str | None

      

   .. py:attribute:: term
      :type: str | None

      

   .. py:attribute:: score
      :type: float | None

      

   .. py:attribute:: LABEL
      :type: ClassVar[str]
      :value: 'NORMALIZATION'

      
      Label used for all normalization attributes
















      ..
          !! processed by numpydoc !!

   .. py:method:: to_brat() -> str

      
      Return a value compatible with the brat format.
















      ..
          !! processed by numpydoc !!

   .. py:method:: to_spacy() -> str

      
      Return a value compatible with spaCy.
















      ..
          !! processed by numpydoc !!

   .. py:method:: to_dict() -> dict[str, Any]


   .. py:method:: from_dict(data_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:

      
      Create an Attribute from a dict.


      :Parameters:

          **data_dict** : dict of str to Any
              A dictionary from a serialized Attribute as generated by to_dict()














      ..
          !! processed by numpydoc !!


.. py:class:: ContextOperation(uid: str | None = None, name: str | None = None, **kwargs)


   Bases: :py:obj:`medkit.core.operation.Operation`

   
   Abstract operation for context detection.

   It uses a list of segments as input for running the operation and creates attributes
   that are directly appended to these segments.















   ..
       !! processed by numpydoc !!
   .. py:method:: run(segments: list[medkit.core.text.annotation.Segment]) -> None
      :abstractmethod:



.. py:class:: CustomTextOpType


   Bases: :py:obj:`enum.IntEnum`

   
   Supported function types for creating custom text operations.
















   ..
       !! processed by numpydoc !!
   .. py:attribute:: CREATE_ONE_TO_N
      :value: 1

      
      Take 1 data item, return N new data items.
















      ..
          !! processed by numpydoc !!

   .. py:attribute:: EXTRACT_ONE_TO_N
      :value: 2

      
      Take 1 data item, return N existing data items.
















      ..
          !! processed by numpydoc !!

   .. py:attribute:: FILTER
      :value: 3

      
      Take 1 data item, return True or False.
















      ..
          !! processed by numpydoc !!


.. py:class:: NEROperation(uid: str | None = None, name: str | None = None, **kwargs)


   Bases: :py:obj:`medkit.core.operation.Operation`

   
   Abstract operation for detecting entities.

   It uses a list of segments as input and produces a list of detected entities.















   ..
       !! processed by numpydoc !!
   .. py:method:: run(segments: list[medkit.core.text.annotation.Segment]) -> list[medkit.core.text.annotation.Entity]
      :abstractmethod:



.. py:class:: SegmentationOperation(uid: str | None = None, name: str | None = None, **kwargs)


   Bases: :py:obj:`medkit.core.operation.Operation`

   
   Abstract operation for segmenting text.

   It uses a list of segments as input and produces a list of new segments.















   ..
       !! processed by numpydoc !!
   .. py:method:: run(segments: list[medkit.core.text.annotation.Segment]) -> list[medkit.core.text.annotation.Segment]
      :abstractmethod:



.. py:function:: create_text_operation(function: Callable, function_type: CustomTextOpType, name: str | None = None, args: dict | None = None) -> _CustomTextOperation

   
   Instantiate a custom text operation from a user-defined function.


   :Parameters:

       **function** : Callable
           User-defined function

       **function_type** : CustomTextOpType
           Type of function.
           Supported values are defined in :class:`~medkit.core.text.CustomTextOpType`

       **name** : str, optional
           Name of the operation used for provenance info (default: function name)

       **args** : dict, optional
           Dictionary containing the arguments of the function if any.

   :Returns:

       _CustomTextOperation
           An instance of a custom text operation













   ..
       !! processed by numpydoc !!

.. py:class:: AnySpan


   Bases: :py:obj:`abc.ABC`, :py:obj:`medkit.core.dict_conv.SubclassMapping`

   
   Abstract base class for text spans (see :class:`Span` and
   :class:`ModifiedSpan`).
















   ..
       !! processed by numpydoc !!
   .. py:attribute:: length
      :type: int

      

   .. py:method:: __init_subclass__()
      :classmethod:


   .. py:method:: from_dict(ann_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:


   .. py:method:: to_dict() -> dict[str, Any]
      :abstractmethod:



.. py:class:: ModifiedSpan


   Bases: :py:obj:`AnySpan`

   
   Slice of text not present in the original text.


   :Parameters:

       **length** : int
           Number of characters

       **replaced_spans** : list of Span
           Slices of the original text that this span is replacing














   ..
       !! processed by numpydoc !!
   .. py:attribute:: length
      :type: int

      

   .. py:attribute:: replaced_spans
      :type: list[Span]

      

   .. py:method:: to_dict() -> dict[str, Any]


   .. py:method:: from_dict(modified_span_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:

      
      Create a ModifiedSpan from a dict.


      :Parameters:

          **modified_span_dict** : dict of str to Any
              A dictionary from a serialized ModifiedSpan as generated by to_dict()














      ..
          !! processed by numpydoc !!


.. py:class:: Span


   Bases: :py:obj:`AnySpan`

   
   Slice of text extracted from the original text.


   :Parameters:

       **start** : int
           Index of the first character in the original text

       **end** : int
           Index of the last character in the original text, plus one














   ..
       !! processed by numpydoc !!
   .. py:property:: length


   .. py:attribute:: start
      :type: int

      

   .. py:attribute:: end
      :type: int

      

   .. py:method:: to_dict() -> dict[str, Any]


   .. py:method:: overlaps(other: Span)

      
      Test if 2 spans reference at least one character in common.
















      ..
          !! processed by numpydoc !!

   .. py:method:: from_dict(span_dict: dict[str, Any]) -> typing_extensions.Self
      :classmethod:

      
      Create a Span from a dict.


      :Parameters:

          **span_dict** : dict of str to Any
              A dictionary from a serialized span as generated by to_dict()














      ..
          !! processed by numpydoc !!


.. py:class:: UMLSNormAttribute(cui: str, umls_version: str, term: str | None = None, score: float | None = None, sem_types: list[str] | None = None, metadata: dict[str, Any] | None = None, uid: str | None = None)


   Bases: :py:obj:`medkit.core.text.entity_norm_attribute.EntityNormAttribute`

   
   Normalization attribute linking an entity to a CUI in the UMLS knowledge base.














   :Attributes:

       **uid** : str
           Identifier of the attribute

       **label** : str
           The attribute label, always set to :attr:`EntityNormAttribute.LABEL
           <.core.text.EntityNormAttribute.LABEL>`

       **value** : Any
           CUI prefixed with "umls:" (ex: "umls:C0011849")

       **kb_name** : str, optional
           Name of the knowledge base. Always "umls"

       **kb_id** : Any, optional
           CUI (Concept Unique Identifier) to which the annotation should be linked

       **cui** : str
           Convenience alias of `kb_id`

       **kb_version** : str, optional
           Version of the UMLS database (ex: "2021AB")

       **umls_version** : str
           Convenience alias of `kb_version`

       **term** : str, optional
           Optional normalized version of the entity text

       **score** : float, optional
           Optional score reflecting confidence of this link

       **sem_types** : list of str, optional
           Optional IDs of semantic types of the CUI (ex: ["T047"])

       **metadata** : dict of str to Any
           Metadata of the attribute


   ..
       !! processed by numpydoc !!
   .. py:property:: cui


   .. py:property:: umls_version


   .. py:attribute:: sem_types
      :type: list[str] | None

      

   .. py:method:: to_dict() -> dict[str, Any]


   .. py:method:: from_dict(data: dict[str, Any]) -> typing_extensions.Self
      :classmethod:

      
      Create an Attribute from a dict.


      :Parameters:

          **data** : dict of str to Any
              A dictionary from a serialized Attribute as generated by to_dict()














      ..
          !! processed by numpydoc !!


