Source code for medkit.core.text.document

from __future__ import annotations

__all__ = ["TextDocument"]

import dataclasses
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar, Sequence

from typing_extensions import Self

from medkit.core import Attribute, AttributeContainer, dict_conv
from medkit.core.id import generate_deterministic_id, generate_id
from medkit.core.text import span_utils
from medkit.core.text.annotation import Segment, TextAnnotation
from medkit.core.text.annotation_container import TextAnnotationContainer
from medkit.core.text.span import Span

if TYPE_CHECKING:
    import os



[docs]
@dataclasses.dataclass(init=False)
class TextDocument(dict_conv.SubclassMapping):
    """Document holding text annotations

    Annotations must be subclasses of `TextAnnotation`.

    Attributes
    ----------
    uid : str
        Unique identifier of the document.
    text : str
        Full document text.
    anns : TextAnnotationContainer
        Annotations of the document. Stored in an
        :class:`~.text.TextAnnotationContainer` but can be passed as a list at init.
    attrs : AttributeContainer
        Attributes of the document. Stored in an
        :class:`~.core.AttributeContainer` but can be passed as a list at init
    metadata : dict of str to Any
        Document metadata.
    raw_segment : Segment
        Auto-generated segment containing the full unprocessed document text. To
        get the raw text as an annotation to pass to processing operations:

    Examples
    --------
    >>> doc = TextDocument(text="hello")
    >>> raw_text = doc.anns.get(label=TextDocument.RAW_LABEL)[0]
    """

    RAW_LABEL: ClassVar[str] = "RAW_TEXT"

    uid: str
    anns: TextAnnotationContainer
    attrs: AttributeContainer
    metadata: dict[str, Any]
    raw_segment: Segment

    def __init__(
        self,
        text: str,
        anns: Sequence[TextAnnotation] | None = None,
        attrs: Sequence[Attribute] | None = None,
        metadata: dict[str, Any] | None = None,
        uid: str | None = None,
    ):
        if anns is None:
            anns = []
        if attrs is None:
            attrs = []
        if metadata is None:
            metadata = {}
        if uid is None:
            uid = generate_id()

        self.uid = uid
        self.metadata = metadata

        # auto-generated raw segment to hold the text
        self.raw_segment = self._generate_raw_segment(text, uid)

        self.anns = TextAnnotationContainer(doc_id=self.uid, raw_segment=self.raw_segment)
        for ann in anns:
            self.anns.add(ann)

        self.attrs = AttributeContainer(
            owner_id=self.uid,
        )

        for attr in attrs:
            self.attrs.add(attr)

    @classmethod
    def _generate_raw_segment(cls, text: str, doc_id: str) -> Segment:
        uid = str(generate_deterministic_id(reference_id=doc_id))

        return Segment(
            label=cls.RAW_LABEL,
            spans=[Span(0, len(text))],
            text=text,
            uid=uid,
        )

    @property
    def text(self) -> str:
        return self.raw_segment.text

    def __init_subclass__(cls):
        TextDocument.register_subclass(cls)
        super().__init_subclass__()

    def to_dict(self, with_anns: bool = True) -> dict[str, Any]:
        doc_dict = {
            "uid": self.uid,
            "text": self.text,
            "metadata": self.metadata,
        }
        if with_anns:
            doc_dict["anns"] = [a.to_dict() for a in self.anns]

        if self.attrs:
            doc_dict["attrs"] = [a.to_dict() for a in self.attrs]

        dict_conv.add_class_name_to_data_dict(self, doc_dict)
        return doc_dict


[docs]
    @classmethod
    def from_dict(cls, doc_dict: dict[str, Any]) -> Self:
        """Creates a TextDocument from a dict

        Parameters
        ----------
        doc_dict : dict of str to Any
            A dictionary from a serialized TextDocument as generated by to_dict()
        """
        # if class method is not the same as the TextDocument one
        # (e.g., when subclassing with an overriding method)
        subclass = cls.get_subclass_for_data_dict(doc_dict)
        if subclass is not None:
            return subclass.from_dict(doc_dict)

        anns = [TextAnnotation.from_dict(a) for a in doc_dict.get("anns", [])]
        attrs = [Attribute.from_dict(a) for a in doc_dict.get("attrs", [])]
        return cls(
            uid=doc_dict["uid"],
            text=doc_dict["text"],
            anns=anns,
            attrs=attrs,
            metadata=doc_dict["metadata"],
        )



[docs]
    @classmethod
    def from_file(cls, path: os.PathLike, encoding: str = "utf-8") -> Self:
        """Create a document from a text file

        Parameters
        ----------
        path : Path
            Path of the text file
        encoding : str, default="utf-8"
            Text encoding to use

        Returns
        -------
        TextDocument
            Text document with contents of `path` as text. The file path is
            included in the document metadata.
        """
        path = Path(path)
        text = path.read_text(encoding=encoding)
        return cls(text=text, metadata={"path_to_text": str(path.absolute())})



[docs]
    @classmethod
    def from_dir(
        cls,
        path: os.PathLike,
        pattern: str = "*.txt",
        encoding: str = "utf-8",
    ) -> list[Self]:
        """Create documents from text files in a directory

        Parameters
        ----------
        path : Path
            Path of the directory containing text files
        pattern : str
            Glob pattern to match text files in `path`
        encoding : str
            Text encoding to use

        Returns
        -------
        list of TextDocument
            Text documents with contents of each file as text
        """
        path = Path(path)
        files = sorted(path.glob(pattern))
        return [cls.from_file(f, encoding) for f in files]



[docs]
    def get_snippet(self, segment: Segment, max_extend_length: int) -> str:
        """Return a portion of the original text containing the annotation

        Parameters
        ----------
        segment : Segment
            The annotation
        max_extend_length : int
            Maximum number of characters to use around the annotation

        Returns
        -------
        str
            A portion of the text around the annotation
        """
        spans_normalized = span_utils.normalize_spans(segment.spans)
        start = min(s.start for s in spans_normalized)
        end = max(s.end for s in spans_normalized)
        start_extended = max(start - max_extend_length // 2, 0)
        remaining_max_extend_length = max_extend_length - (start - start_extended)
        end_extended = min(end + remaining_max_extend_length, len(self.text))
        return self.text[start_extended:end_extended]