from __future__ import annotations
__all__ = ["TextDocument"]
import dataclasses
import os
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Optional, Sequence
from typing_extensions import Self
from medkit.core import dict_conv, Attribute, AttributeContainer
from medkit.core.id import generate_id, generate_deterministic_id
from medkit.core.text.annotation import TextAnnotation, Segment
from medkit.core.text.annotation_container import TextAnnotationContainer
from medkit.core.text.span import Span
from medkit.core.text import span_utils
@dataclasses.dataclass(init=False)
class TextDocument(dict_conv.SubclassMapping):
    """
    Document holding text annotations

    Annotations must be subclasses of `TextAnnotation`.

    Attributes
    ----------
    uid:
        Unique identifier of the document.
    text:
        Full document text.
    anns:
        Annotations of the document. Stored in an
        :class:`~.text.TextAnnotationContainer` but can be passed as a list at init.
    attrs:
        Attributes of the document. Stored in an
        :class:`~.core.AttributeContainer` but can be passed as a list at init.
    metadata:
        Document metadata.
    raw_segment:
        Auto-generated segment containing the full unprocessed document text. To
        get the raw text as an annotation to pass to processing operations:

        >>> doc = TextDocument(text="hello")
        >>> raw_text = doc.anns.get(label=TextDocument.RAW_LABEL)[0]
    """

    # Label given to the auto-generated segment that holds the full text.
    RAW_LABEL: ClassVar[str] = "RAW_TEXT"

    uid: str
    anns: TextAnnotationContainer
    attrs: AttributeContainer
    metadata: Dict[str, Any]
    raw_segment: Segment

    def __init__(
        self,
        text: str,
        anns: Optional[Sequence[TextAnnotation]] = None,
        attrs: Optional[Sequence[Attribute]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        uid: Optional[str] = None,
    ) -> None:
        """
        Build a document from its text and optional annotations/attributes

        Parameters
        ----------
        text:
            Full text of the document, stored in the raw segment.
        anns:
            Annotations to add to the document (none if not provided).
        attrs:
            Attributes to add to the document (none if not provided).
        metadata:
            Document metadata (empty dict if not provided). The dict is
            stored as is, not copied.
        uid:
            Identifier of the document. Generated with `generate_id()` if
            not provided.
        """
        if anns is None:
            anns = []
        if attrs is None:
            attrs = []
        if metadata is None:
            metadata = {}
        if uid is None:
            uid = generate_id()

        self.uid = uid
        self.metadata = metadata

        # auto-generated raw segment to hold the text; it must exist before
        # the annotation container, which receives it at construction
        self.raw_segment = self._generate_raw_segment(text, uid)

        self.anns = TextAnnotationContainer(
            doc_id=self.uid, raw_segment=self.raw_segment
        )
        for ann in anns:
            self.anns.add(ann)

        self.attrs = AttributeContainer(
            owner_id=self.uid,
        )
        for attr in attrs:
            self.attrs.add(attr)

    @classmethod
    def _generate_raw_segment(cls, text: str, doc_id: str) -> Segment:
        """
        Create the segment labelled `RAW_LABEL` spanning the whole `text`

        The segment uid is obtained from `generate_deterministic_id()` using
        the document id as reference.
        """
        uid = str(generate_deterministic_id(reference_id=doc_id))

        return Segment(
            label=cls.RAW_LABEL,
            spans=[Span(0, len(text))],
            text=text,
            uid=uid,
        )

    @property
    def text(self) -> str:
        """Full document text, as held by the raw segment."""
        return self.raw_segment.text

    def __init_subclass__(cls) -> None:
        """Register each subclass with `TextDocument` through `register_subclass()`."""
        TextDocument.register_subclass(cls)
        super().__init_subclass__()

    def to_dict(self, with_anns: bool = True) -> Dict[str, Any]:
        """
        Convert the document to a dict

        Parameters
        ----------
        with_anns:
            Whether to include the annotations under the "anns" key.

        Returns
        -------
        Dict[str, Any]:
            Dict with the "uid", "text" and "metadata" of the document, plus
            "anns" if requested and "attrs" if the document has attributes.
            The dict is also passed to
            `dict_conv.add_class_name_to_data_dict()` before being returned.
        """
        doc_dict = dict(
            uid=self.uid,
            text=self.text,
            metadata=self.metadata,
        )
        if with_anns:
            doc_dict["anns"] = [a.to_dict() for a in self.anns]

        # attributes are included independently of `with_anns`
        if self.attrs:
            doc_dict["attrs"] = [a.to_dict() for a in self.attrs]

        dict_conv.add_class_name_to_data_dict(self, doc_dict)
        return doc_dict

    @classmethod
    def from_dict(cls, doc_dict: Dict[str, Any]) -> Self:
        """
        Creates a TextDocument from a dict

        Parameters
        ----------
        doc_dict: dict
            A dictionary from a serialized TextDocument as generated by to_dict()

        Returns
        -------
        TextDocument:
            The document rebuilt from `doc_dict`. The "anns" and "attrs"
            entries are optional; "uid", "text" and "metadata" are required.
        """
        # if class method is not the same as the TextDocument one
        # (e.g., when subclassing with an overriding method)
        subclass = cls.get_subclass_for_data_dict(doc_dict)
        if subclass is not None:
            return subclass.from_dict(doc_dict)

        anns = [TextAnnotation.from_dict(a) for a in doc_dict.get("anns", [])]
        attrs = [Attribute.from_dict(a) for a in doc_dict.get("attrs", [])]
        return cls(
            uid=doc_dict["uid"],
            text=doc_dict["text"],
            anns=anns,
            attrs=attrs,
            metadata=doc_dict["metadata"],
        )

    @classmethod
    def from_file(cls, path: os.PathLike, encoding: Optional[str] = "utf-8") -> Self:
        """
        Create a document from a text file

        Parameters
        ----------
        path:
            Path of the text file
        encoding:
            Text encoding to use

        Returns
        -------
        TextDocument:
            Text document with contents of `path` as text. The file path is
            included in the document metadata.
        """
        path = Path(path)
        text = path.read_text(encoding=encoding)
        # the absolute path is stored as a string under "path_to_text"
        return cls(text=text, metadata={"path_to_text": str(path.absolute())})

    @classmethod
    def from_dir(
        cls,
        path: os.PathLike,
        pattern: str = "*.txt",
        encoding: Optional[str] = "utf-8",
    ) -> List[Self]:
        """
        Create documents from text files in a directory

        Parameters
        ----------
        path:
            Path of the directory containing text files
        pattern:
            Glob pattern to match text files in `path`
        encoding:
            Text encoding to use

        Returns
        -------
        List[TextDocument]:
            Text documents with contents of each file as text, in sorted
            order of the matched file paths
        """
        path = Path(path)
        # sort the matches so the order of the documents is reproducible
        files = sorted(path.glob(pattern))
        return [cls.from_file(f, encoding) for f in files]

    def get_snippet(self, segment: Segment, max_extend_length: int) -> str:
        """Return a portion of the original text containing the annotation

        Parameters
        ----------
        segment:
            The annotation
        max_extend_length:
            Maximum number of characters to use around the annotation. At
            most half of them are taken before the annotation; whatever is
            not used there is available after it.

        Returns
        -------
        str:
            A portion of the text around the annotation
        """
        spans_normalized = span_utils.normalize_spans(segment.spans)
        start = min(s.start for s in spans_normalized)
        end = max(s.end for s in spans_normalized)

        # extend to the left by up to half the budget, without going before
        # the beginning of the text
        start_extended = max(start - max_extend_length // 2, 0)
        # what was not consumed on the left can be spent on the right
        remaining_max_extend_length = max_extend_length - (start - start_extended)
        end_extended = min(end + remaining_max_extend_length, len(self.text))
        return self.text[start_extended:end_extended]