"""Source code for medkit.io.doccano: converters between doccano JSONL files and medkit text documents."""

# Public names of this module: the doccano task enum, the client
# configuration and the input/output converters.
__all__ = [
    "DoccanoTask",
    "DoccanoClientConfig",
    "DoccanoInputConverter",
    "DoccanoOutputConverter",
]

import dataclasses
import enum
import json
import logging
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from typing_extensions import Self
from zipfile import ZipFile

from medkit.core import Attribute, OperationDescription, ProvTracer
from medkit.core.id import generate_id, generate_deterministic_id
from medkit.core.text import Entity, Relation, Span, TextDocument, span_utils

from medkit.io._common import get_anns_by_type

# Module-level logger, named after this module; used by the converters for warnings.
logger = logging.getLogger(__name__)


class DoccanoTask(enum.Enum):
    """Supported doccano tasks. The task defines
    the type of document to convert.

    Attributes
    ----------
    TEXT_CLASSIFICATION
        Documents with a category
    RELATION_EXTRACTION
        Documents with entities and relations (including IDs)
    SEQUENCE_LABELING
        Documents with entities in tuples
    """

    # the string values are the ones stored in the `config` of the
    # converters' operation descriptions
    TEXT_CLASSIFICATION = "text_classification"
    RELATION_EXTRACTION = "relation_extraction"
    SEQUENCE_LABELING = "sequence_labeling"


@dataclasses.dataclass
class DoccanoClientConfig:
    """A class representing the configuration in the doccano client.
    The default values are the default values used by doccano.

    Attributes
    ----------
    column_text:
        Name or key representing the text
    column_label:
        Name or key representing the label
    """

    # key under which the text of a document is read in a doccano line
    column_text: str = "text"
    # key under which the label(s) of a document are read in a doccano line
    column_label: str = "label"


# FIXME: datamodels to factorize in _doccano_utils
@dataclasses.dataclass()
class _DoccanoEntity:
    """Entity of a doccano document, identified by an ID.

    Stores the character offsets of the entity in the text and its label.
    """

    id: int
    start_offset: int
    end_offset: int
    label: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the entity as a dictionary with one key per field."""
        return {
            "id": self.id,
            "start_offset": self.start_offset,
            "end_offset": self.end_offset,
            "label": self.label,
        }


@dataclasses.dataclass()
class _DoccanoEntityTuple:
    """Entity of a doccano document represented without an ID.

    Only the character offsets and the label are stored.
    """

    start_offset: int
    end_offset: int
    label: str

    def to_tuple(self) -> Tuple[int, int, str]:
        """Return the entity as a ``(start_offset, end_offset, label)`` tuple."""
        start, end = self.start_offset, self.end_offset
        return start, end, self.label


@dataclasses.dataclass()
class _DoccanoRelation:
    """Relation of a doccano document.

    Links the entity with ID `from_id` to the entity with ID `to_id`.
    """

    id: int
    from_id: int
    to_id: int
    type: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the relation as a dictionary with one key per field."""
        return {
            "id": self.id,
            "from_id": self.from_id,
            "to_id": self.to_id,
            "type": self.type,
        }


@dataclasses.dataclass()
class _DoccanoDocRelationExtraction:
    """Doccano document for the relation extraction task.

    Holds the text, its entities and relations, and every other key
    of the doccano line as metadata.
    """

    text: str
    entities: List[_DoccanoEntity]
    relations: List[_DoccanoRelation]
    metadata: Dict[str, Any]

    @classmethod
    def from_dict(
        cls, doc_line: Dict[str, Any], client_config: DoccanoClientConfig
    ) -> Self:
        """Build a document from a dictionary representing a doccano line.

        Parameters
        ----------
        doc_line:
            Dictionary with the text (under `client_config.column_text`),
            an "entities" list and a "relations" list. It is not modified.
        client_config:
            Configuration providing the name of the text key.

        Returns
        -------
        Self
            The document; the keys that are not consumed become its metadata.

        Raises
        ------
        KeyError
            If the text, "entities" or "relations" key is missing.
        TypeError
            If an entity or relation does not have the expected fields.
        """
        # work on a shallow copy so the caller's dictionary is left untouched
        remaining = dict(doc_line)
        text: str = remaining.pop(client_config.column_text)
        entities = [_DoccanoEntity(**ann) for ann in remaining.pop("entities")]
        relations = [_DoccanoRelation(**ann) for ann in remaining.pop("relations")]
        # in doccano, metadata is what remains after removing key fields
        return cls(
            text=text, entities=entities, relations=relations, metadata=remaining
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a dictionary ready to be dumped as a doccano line."""
        doc_dict = dict(text=self.text)
        doc_dict["entities"] = [ent.to_dict() for ent in self.entities]
        doc_dict["relations"] = [rel.to_dict() for rel in self.relations]
        # metadata keys are added at the same level as the key fields;
        # being applied last, a metadata key with the same name replaces them
        doc_dict.update(self.metadata)
        return doc_dict


@dataclasses.dataclass()
class _DoccanoDocSeqLabeling:
    """Doccano document for the sequence labeling task.

    Holds the text, its entities (without IDs) and every other key
    of the doccano line as metadata.
    """

    text: str
    entities: List[_DoccanoEntityTuple]
    metadata: Dict[str, Any]

    @classmethod
    def from_dict(
        cls, doc_line: Dict[str, Any], client_config: DoccanoClientConfig
    ) -> Self:
        """Build a document from a dictionary representing a doccano line.

        Parameters
        ----------
        doc_line:
            Dictionary with the text (under `client_config.column_text`) and
            the entities (under `client_config.column_label`), each entity
            being a sequence unpacked into `_DoccanoEntityTuple`.
            It is not modified.
        client_config:
            Configuration providing the names of the text and label keys.

        Returns
        -------
        Self
            The document; the keys that are not consumed become its metadata.

        Raises
        ------
        KeyError
            If the text or label key is missing.
        TypeError
            If an entity cannot be unpacked into `_DoccanoEntityTuple`.
        """
        # work on a shallow copy so the caller's dictionary is left untouched
        remaining = dict(doc_line)
        text = remaining.pop(client_config.column_text)
        entities = [
            _DoccanoEntityTuple(*ann)
            for ann in remaining.pop(client_config.column_label)
        ]
        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, entities=entities, metadata=remaining)

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a dictionary ready to be dumped as a doccano line."""
        doc_dict = dict(text=self.text)
        doc_dict["label"] = [ent.to_tuple() for ent in self.entities]
        # metadata keys are added at the same level as the key fields;
        # being applied last, a metadata key with the same name replaces them
        doc_dict.update(self.metadata)
        return doc_dict


@dataclasses.dataclass()
class _DoccanoDocTextClassification:
    """Doccano document for the text classification task.

    Holds the text, its category and every other key of the
    doccano line as metadata.
    """

    text: str
    label: str
    metadata: Dict[str, Any]

    @classmethod
    def from_dict(
        cls, doc_line: Dict[str, Any], client_config: DoccanoClientConfig
    ) -> Self:
        """Build a document from a dictionary representing a doccano line.

        Parameters
        ----------
        doc_line:
            Dictionary with the text (under `client_config.column_text`) and
            the labels (under `client_config.column_label`). Only the first
            label is kept. It is not modified.
        client_config:
            Configuration providing the names of the text and label keys.

        Returns
        -------
        Self
            The document; the keys that are not consumed become its metadata.

        Raises
        ------
        KeyError
            If the text or label key is missing.
        IndexError
            If the list of labels is empty.
        TypeError
            If the first label is not a string.
        """
        # work on a shallow copy so the caller's dictionary is left untouched
        remaining = dict(doc_line)
        text = remaining.pop(client_config.column_text)
        label = remaining.pop(client_config.column_label)[0]

        if not isinstance(label, str):
            raise TypeError(
                "The label must be a string. Please check if the document corresponds"
                " to a text classification task rather than sequence labeling"
            )
        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, label=label, metadata=remaining)

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a dictionary ready to be dumped as a doccano line."""
        # the category is exported as a list with a single string
        doc_dict = dict(text=self.text, label=[str(self.label)])
        # metadata keys are added at the same level as the key fields;
        # being applied last, a metadata key with the same name replaces them
        doc_dict.update(self.metadata)
        return doc_dict


class DoccanoInputConverter:
    """Convert doccano files (.JSONL) containing annotations for a given task.

    For each line, a :class:`~.core.text.TextDocument` will be created.
    The doccano files can be loaded from a directory with zip files or from a jsonl file.

    The converter supports custom configuration to define the parameters used by doccano
    when importing the data (c.f. :class:`~.io.doccano.DoccanoClientConfig`)

    .. warning::
        If the option *Count grapheme clusters as one character*  was selected
        when creating the doccano project, the converted documents are
        likely to have alignment problems; the converter does not support this option.
    """

    def __init__(
        self,
        task: DoccanoTask,
        client_config: Optional[DoccanoClientConfig] = None,
        attr_label: str = "doccano_category",
        uid: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        task:
            The doccano task for the input converter
        client_config:
            Optional client configuration to define default values in doccano interface.
            This config can change, for example, the name of the text field or labels.
        attr_label:
            The label to use for the medkit attribute that represents the doccano category.
            This is related to :class:`~.io.DoccanoTask.TEXT_CLASSIFICATION` projects.
        uid:
            Identifier of the converter.
        """
        if uid is None:
            uid = generate_id()

        if client_config is None:
            client_config = DoccanoClientConfig()

        self.uid = uid
        self.client_config = client_config
        self.task = task
        self.attr_label = attr_label
        self._prov_tracer: Optional[ProvTracer] = None

    def set_prov_tracer(self, prov_tracer: ProvTracer):
        """Enable provenance tracing.

        Parameters
        ----------
        prov_tracer:
            The provenance tracer used to trace the provenance.
        """
        self._prov_tracer = prov_tracer

    @property
    def description(self) -> OperationDescription:
        """Description of the converter: its uid, class name and task."""
        return OperationDescription(
            uid=self.uid,
            name=self.__class__.__name__,
            class_name=self.__class__.__name__,
            config=dict(task=self.task.value),
        )

    def load_from_directory_zip(self, dir_path: Union[str, Path]) -> List[TextDocument]:
        """Create a list of TextDocuments from zip files in a directory.
        The zip files should contain a JSONL file coming from doccano.

        Parameters
        ----------
        dir_path:
            The path to the directory containing zip files.

        Returns
        -------
        List[TextDocument]
            A list of TextDocuments
        """
        # sorted so that the order of the documents is reproducible
        zip_paths = sorted(Path(dir_path).glob("*.zip"))

        documents = []
        for path_zip in zip_paths:
            documents.extend(self.load_from_zip(path_zip))

        # only zip files are searched, so the warnings only mention them
        if not zip_paths:
            logger.warning(f"No .zip found in '{dir_path}'")
        elif not documents:
            logger.warning(f"No documents found in the .zip files of '{dir_path}'")

        return documents

    def load_from_zip(self, input_file: Union[str, Path]) -> List[TextDocument]:
        """
        Create a list of TextDocuments from a zip file containing a JSONL file
        coming from doccano.

        The first ``.jsonl`` file of the archive is converted; if the archive
        has no such file, its first file is used instead.

        Parameters
        ----------
        input_file:
            The path to the zip file containing a doccano JSONL file

        Returns
        -------
        List[TextDocument]
            A list of TextDocuments

        Raises
        ------
        ValueError
            If the archive does not contain any file.
        """

        with tempfile.TemporaryDirectory() as tmpdir:
            with ZipFile(input_file, mode="r") as zip_file:
                # directory entries end with "/" and can not be read as a file
                members = [
                    name for name in zip_file.namelist() if not name.endswith("/")
                ]
                if not members:
                    raise ValueError(
                        f"No file found in the zip archive '{input_file}'"
                    )
                jsonl_members = [
                    name for name in members if name.lower().endswith(".jsonl")
                ]
                filename = jsonl_members[0] if jsonl_members else members[0]
                # extract() returns the path it actually created, which may
                # differ from the member name once the name has been sanitized
                unzipped_file = zip_file.extract(filename, tmpdir)
            # the file must be read before the temporary directory is deleted
            return self.load_from_file(unzipped_file)

    def load_from_file(self, input_file: Union[str, Path]) -> List[TextDocument]:
        """Create a list of TextDocuments from a doccano JSONL file.

        Blank lines are ignored.

        Parameters
        ----------
        input_file:
            The path to the JSONL file containing doccano annotations

        Returns
        -------
        List[TextDocument]
            A list of TextDocuments
        """
        documents = []
        with open(Path(input_file), encoding="utf-8") as fp:
            for line in fp:
                # a blank line (e.g. an empty line at the end of the file)
                # is not a JSON document and would make json.loads fail
                if not line.strip():
                    continue
                documents.append(self._parse_doc_line(json.loads(line)))

        self._check_crlf_character(documents)
        return documents

    def _check_crlf_character(self, documents: List[TextDocument]):
        """Check if the list of converted documents contains the CRLF character.
        This character is the only indicator available to warn
        if there are alignment problems in the documents"""
        # offsets are only used by the tasks that create entities
        if self.task not in (
            DoccanoTask.RELATION_EXTRACTION,
            DoccanoTask.SEQUENCE_LABELING,
        ):
            return

        nb_docs_with_warning = sum("\r\n" in document.text for document in documents)

        if nb_docs_with_warning > 0:
            logger.warning(
                f"{nb_docs_with_warning}/{len(documents)} documents contain"
                " '\\r\\n' characters. If you have selected 'Count grapheme"
                " clusters as one character' when creating the doccano project,"
                " converted documents are likely to have alignment problems.\n"
                " Please ignore this message if you did not select this option when"
                " creating the project."
            )

    def _parse_doc_line(self, doc_line: Dict[str, Any]) -> TextDocument:
        """Parse a doc_line into a TextDocument depending on the task

        Parameters
        ----------
        doc_line:
            A dictionary representing an annotation from doccano

        Returns
        -------
        TextDocument
            A document with parsed annotations.

        Raises
        ------
        ValueError
            If the task of the converter is not a supported `DoccanoTask`.
        """
        if self.task == DoccanoTask.RELATION_EXTRACTION:
            return self._parse_doc_line_relation_extraction(doc_line=doc_line)
        if self.task == DoccanoTask.TEXT_CLASSIFICATION:
            return self._parse_doc_line_text_classification(doc_line=doc_line)
        if self.task == DoccanoTask.SEQUENCE_LABELING:
            return self._parse_doc_line_seq_labeling(doc_line=doc_line)
        # fail loudly instead of returning None in place of a document
        raise ValueError(f"Unsupported doccano task: {self.task!r}")

    def _load_doccano_doc(self, doc_class: type, doc_line: Dict[str, Any]) -> Any:
        """Build a doccano document of type `doc_class` from `doc_line`,
        raising an exception with a hint if the line can not be converted."""
        try:
            return doc_class.from_dict(doc_line, client_config=self.client_config)
        except Exception as err:
            raise Exception(
                "Impossible to convert the document. Please check the task"
                " or the client configuration of the converter"
            ) from err

    def _parse_doc_line_relation_extraction(
        self, doc_line: Dict[str, Any]
    ) -> TextDocument:
        """Parse a dictionary and return a TextDocument with entities and relations

        Parameters
        ----------
        doc_line:
            Dictionary with doccano annotation

        Returns
        -------
        TextDocument
            The document with annotations
        """
        doccano_doc = self._load_doccano_doc(_DoccanoDocRelationExtraction, doc_line)

        # entities are indexed by doccano ID to resolve the relations below
        ents_by_doccano_id = dict()
        relations = []
        for doccano_entity in doccano_doc.entities:
            text = doccano_doc.text[
                doccano_entity.start_offset : doccano_entity.end_offset
            ]
            entity = Entity(
                text=text,
                label=doccano_entity.label,
                spans=[Span(doccano_entity.start_offset, doccano_entity.end_offset)],
                metadata=dict(doccano_id=doccano_entity.id),
            )
            ents_by_doccano_id[doccano_entity.id] = entity

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    entity, self.description, source_data_items=[]
                )

        for doccano_relation in doccano_doc.relations:
            # a relation refers to its entities by their medkit uid
            relation = Relation(
                label=doccano_relation.type,
                source_id=ents_by_doccano_id[doccano_relation.from_id].uid,
                target_id=ents_by_doccano_id[doccano_relation.to_id].uid,
                metadata=dict(doccano_id=doccano_relation.id),
            )
            relations.append(relation)

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    relation, self.description, source_data_items=[]
                )

        anns = list(ents_by_doccano_id.values()) + relations
        doc = TextDocument(
            text=doccano_doc.text,
            anns=anns,
            metadata=doccano_doc.metadata,
        )

        return doc

    def _parse_doc_line_seq_labeling(self, doc_line: Dict[str, Any]) -> TextDocument:
        """Parse a dictionary and return a TextDocument with entities

        Parameters
        ----------
        doc_line:
            Dictionary with doccano annotation.

        Returns
        -------
        TextDocument
            The document with annotations
        """
        doccano_doc = self._load_doccano_doc(_DoccanoDocSeqLabeling, doc_line)

        entities = []
        for doccano_entity in doccano_doc.entities:
            text = doccano_doc.text[
                doccano_entity.start_offset : doccano_entity.end_offset
            ]
            entity = Entity(
                text=text,
                label=doccano_entity.label,
                spans=[Span(doccano_entity.start_offset, doccano_entity.end_offset)],
            )
            entities.append(entity)

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    entity, self.description, source_data_items=[]
                )

        doc = TextDocument(
            text=doccano_doc.text,
            anns=entities,
            metadata=doccano_doc.metadata,
        )
        return doc

    def _parse_doc_line_text_classification(
        self, doc_line: Dict[str, Any]
    ) -> TextDocument:
        """Parse a dictionary and return a TextDocument with an attribute.

        Parameters
        ----------
        doc_line:
            Dictionary with doccano annotation.

        Returns
        -------
        TextDocument
            The document with its category
        """
        doccano_doc = self._load_doccano_doc(_DoccanoDocTextClassification, doc_line)

        # the category is stored as an attribute of the document
        attr = Attribute(label=self.attr_label, value=doccano_doc.label)

        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(attr, self.description, source_data_items=[])

        doc = TextDocument(text=doccano_doc.text, metadata=doccano_doc.metadata)
        doc.attrs.add(attr)
        return doc


class DoccanoOutputConverter:
    """Convert medkit files to doccano files (.JSONL) for a given task.

    For each :class:`~medkit.core.text.TextDocument` a jsonline will be created.
    """

    def __init__(
        self,
        task: DoccanoTask,
        anns_labels: Optional[List[str]] = None,
        attr_label: Optional[str] = None,
        ignore_segments: bool = True,
        include_metadata: Optional[bool] = True,
        uid: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        task:
            The doccano task for the output converter
        anns_labels:
            Labels of medkit annotations to convert into doccano annotations.
            If `None` (default) all the entities or relations will be converted.
            Useful for :class:`~.io.DoccanoTask.SEQUENCE_LABELING` or
            :class:`~.io.DoccanoTask.RELATION_EXTRACTION` converters.
        attr_label:
            The label of the medkit attribute that represents the text category.
            Useful for :class:`~.io.DoccanoTask.TEXT_CLASSIFICATION` converters.
        ignore_segments:
            If `True` medkit segments will be ignored. Only entities will be
            converted to Doccano entities.  If `False` the medkit segments will
            be converted to Doccano entities as well.
            Useful for :class:`~.io.DoccanoTask.SEQUENCE_LABELING` or
            :class:`~.io.DoccanoTask.RELATION_EXTRACTION` converters.
        include_metadata:
            Whether include medkit metadata in the converted documents
        uid:
            Identifier of the converter.
        """
        if uid is None:
            uid = generate_id()

        self.uid = uid
        self.task = task
        self.anns_labels = anns_labels
        self.attr_label = attr_label
        self.ignore_segments = ignore_segments
        self.include_metadata = include_metadata

    @property
    def description(self) -> OperationDescription:
        """Description of the converter: its uid, class name and task."""
        return OperationDescription(
            uid=self.uid,
            name=self.__class__.__name__,
            class_name=self.__class__.__name__,
            config=dict(task=self.task.value),
        )

    def save(self, docs: List[TextDocument], output_file: Union[str, Path]):
        """Convert and save a list of TextDocuments into a doccano file (.JSONL)

        Parameters
        ----------
        docs:
            List of medkit doc objects to convert
        output_file:
            Path or string of the JSONL file where to save the converted documents
        """

        output_file = Path(output_file)

        with open(output_file, mode="w", encoding="utf-8") as fp:
            for medkit_doc in docs:
                doc_line = self._convert_doc_by_task(medkit_doc)
                # one JSON document per line; non-ASCII characters are kept as is
                fp.write(json.dumps(doc_line, ensure_ascii=False) + "\n")

    def _convert_doc_by_task(self, medkit_doc: TextDocument) -> Dict[str, Any]:
        """Convert a TextDocument into a dictionary depending on the task

        Parameters
        ----------
        medkit_doc:
            Document to convert

        Returns
        -------
        Dict[str,Any]
            Dictionary with doccano annotation

        Raises
        ------
        ValueError
            If the task of the converter is not a supported `DoccanoTask`.
        """
        if self.task == DoccanoTask.RELATION_EXTRACTION:
            return self._convert_doc_relation_extraction(medkit_doc=medkit_doc)
        if self.task == DoccanoTask.TEXT_CLASSIFICATION:
            return self._convert_doc_text_classification(medkit_doc=medkit_doc)
        if self.task == DoccanoTask.SEQUENCE_LABELING:
            return self._convert_doc_seq_labeling(medkit_doc=medkit_doc)
        # fail loudly instead of returning None, which would be saved as "null"
        raise ValueError(f"Unsupported doccano task: {self.task!r}")

    def _get_segments_to_convert(self, anns_by_type: Dict[str, List[Any]]) -> List[Any]:
        """Return the entities, followed by the segments unless they are ignored,
        as a new list (the lists of `anns_by_type` are left untouched)."""
        segments = list(anns_by_type["entities"])
        if not self.ignore_segments:
            segments.extend(anns_by_type["segments"])
        return segments

    def _convert_doc_relation_extraction(
        self, medkit_doc: TextDocument
    ) -> Dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible
        with the doccano relation extraction task

        Parameters
        ----------
        medkit_doc:
            Document to convert, it may contain entities and relations

        Returns
        -------
        Dict[str,Any]
            Dictionary with doccano annotation. It may contain
            text, entities and relations
        """
        # doccano entities are indexed by medkit uid to resolve the relations below
        doccano_ents_by_medkit_uid = dict()
        doccano_relations = []

        anns_by_type = get_anns_by_type(medkit_doc, self.anns_labels)

        for medkit_segment in self._get_segments_to_convert(anns_by_type):
            spans = span_utils.normalize_spans(medkit_segment.spans)
            ann_id = generate_deterministic_id(medkit_segment.uid)
            # a doccano entity has a single pair of offsets: it goes from
            # the start of the first span to the end of the last span
            entity = _DoccanoEntity(
                id=ann_id.int,
                start_offset=spans[0].start,
                end_offset=spans[-1].end,
                label=medkit_segment.label,
            )
            doccano_ents_by_medkit_uid[medkit_segment.uid] = entity

        for medkit_relation in anns_by_type["relations"]:
            subj = doccano_ents_by_medkit_uid.get(medkit_relation.source_id)
            obj = doccano_ents_by_medkit_uid.get(medkit_relation.target_id)

            # a relation can only be exported if both entities were converted
            if subj is None or obj is None:
                logger.warning(
                    f"Ignore relation {medkit_relation.uid}. Entity source/target was"
                    " no found"
                )
                continue

            ann_id = generate_deterministic_id(medkit_relation.uid)
            relation = _DoccanoRelation(
                id=ann_id.int,
                from_id=subj.id,
                to_id=obj.id,
                type=medkit_relation.label,
            )
            doccano_relations.append(relation)

        metadata = medkit_doc.metadata if self.include_metadata else {}

        doccano_doc = _DoccanoDocRelationExtraction(
            text=medkit_doc.text,
            entities=list(doccano_ents_by_medkit_uid.values()),
            relations=doccano_relations,
            metadata=metadata,
        )

        return doccano_doc.to_dict()

    def _convert_doc_seq_labeling(self, medkit_doc: TextDocument) -> Dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible
        with the doccano sequence labeling task

        Parameters
        ----------
        medkit_doc:
            Document to convert, it may contain entities

        Returns
        -------
        Dict[str,Any]
            Dictionary with doccano annotation. It may contain
            text and its label (a list of tuples representing entities)
        """
        anns_by_type = get_anns_by_type(medkit_doc, self.anns_labels)

        doccano_entities = []
        for medkit_segment in self._get_segments_to_convert(anns_by_type):
            spans = span_utils.normalize_spans(medkit_segment.spans)
            # a doccano entity has a single pair of offsets: it goes from
            # the start of the first span to the end of the last span
            entity = _DoccanoEntityTuple(
                start_offset=spans[0].start,
                end_offset=spans[-1].end,
                label=medkit_segment.label,
            )
            doccano_entities.append(entity)

        metadata = medkit_doc.metadata if self.include_metadata else {}
        doccano_doc = _DoccanoDocSeqLabeling(
            text=medkit_doc.text,
            entities=doccano_entities,
            metadata=metadata,
        )

        return doccano_doc.to_dict()

    def _convert_doc_text_classification(
        self, medkit_doc: TextDocument
    ) -> Dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible with
        the doccano text classification task.

        Parameters
        ----------
        medkit_doc:
            Document to convert, it must contain at least one attribute
            with the label `attr_label`.

        Returns
        -------
        Dict[str,Any]
            Dictionary with doccano annotation. It may contain
            text and its label (a category(str))

        Raises
        ------
        KeyError
            If the document has no attribute with the label `attr_label`.
        """
        attributes = medkit_doc.attrs.get(label=self.attr_label)

        if not attributes:
            raise KeyError(
                "The attribute with the corresponding text class was not found. Check"
                f" the 'attr_label' for this converter, {self.attr_label} was provided."
            )

        metadata = medkit_doc.metadata if self.include_metadata else {}
        # only the first attribute with this label is exported as the category
        doccano_doc = _DoccanoDocTextClassification(
            text=medkit_doc.text,
            label=attributes[0].value,
            metadata=metadata,
        )
        return doccano_doc.to_dict()