Source code for medkit.io.doccano

from __future__ import annotations

__all__ = [
    "DoccanoTask",
    "DoccanoClientConfig",
    "DoccanoInputConverter",
    "DoccanoOutputConverter",
]

import dataclasses
import enum
import json
import logging
import tempfile
from pathlib import Path
from typing import Any
from zipfile import ZipFile

from typing_extensions import Self

from medkit.core import Attribute, OperationDescription, ProvTracer
from medkit.core.id import generate_deterministic_id, generate_id
from medkit.core.text import Entity, Relation, Span, TextDocument, span_utils
from medkit.io._common import get_anns_by_type

logger = logging.getLogger(__name__)


class DoccanoTask(enum.Enum):
    """Kinds of doccano projects handled by the converters.

    The selected task determines the layout of the documents to convert.

    Attributes
    ----------
    TEXT_CLASSIFICATION
        Documents carrying a category
    RELATION_EXTRACTION
        Documents carrying entities and relations (both with IDs)
    SEQUENCE_LABELING
        Documents carrying entities expressed as tuples
    """

    TEXT_CLASSIFICATION = "text_classification"
    RELATION_EXTRACTION = "relation_extraction"
    SEQUENCE_LABELING = "sequence_labeling"
@dataclasses.dataclass
class DoccanoClientConfig:
    """Field names configured in the doccano client.

    The defaults declared here are the defaults used by doccano.

    Attributes
    ----------
    column_text : str, default="text"
        Name or key under which the text is stored
    column_label : str, default="label"
        Name or key under which the label is stored
    """

    column_text: str = "text"
    column_label: str = "label"
# FIXME: datamodels to factorize in _doccano_utils
@dataclasses.dataclass()
class _DoccanoEntity:
    """Entity of a relation extraction document, identified by an ID."""

    id: int
    start_offset: int
    end_offset: int
    label: str

    def to_dict(self) -> dict[str, Any]:
        """Return the entity as a dictionary keyed by its field names."""
        return {
            "id": self.id,
            "start_offset": self.start_offset,
            "end_offset": self.end_offset,
            "label": self.label,
        }


@dataclasses.dataclass()
class _DoccanoEntityTuple:
    """Entity of a sequence labeling document (offsets and label, no ID)."""

    start_offset: int
    end_offset: int
    label: str

    def to_tuple(self) -> tuple[int, int, str]:
        """Return the entity as a ``(start_offset, end_offset, label)`` tuple."""
        return self.start_offset, self.end_offset, self.label


@dataclasses.dataclass()
class _DoccanoRelation:
    """Relation between two entities referenced by their IDs."""

    id: int
    from_id: int
    to_id: int
    type: str

    def to_dict(self) -> dict[str, Any]:
        """Return the relation as a dictionary keyed by its field names."""
        return {
            "id": self.id,
            "from_id": self.from_id,
            "to_id": self.to_id,
            "type": self.type,
        }


@dataclasses.dataclass()
class _DoccanoDocRelationExtraction:
    """Document of a relation extraction task."""

    text: str
    entities: list[_DoccanoEntity]
    relations: list[_DoccanoRelation]
    metadata: dict[str, Any]

    @classmethod
    def from_dict(cls, doc_line: dict[str, Any], client_config: DoccanoClientConfig) -> Self:
        """Build a document from a parsed JSONL line.

        Parameters
        ----------
        doc_line : dict of str to Any
            Parsed line; it is not modified.
        client_config : DoccanoClientConfig
            Configuration providing the name of the text key.

        Returns
        -------
        Self
            The document; keys other than the text, "entities" and "relations"
            end up in its metadata.

        Raises
        ------
        KeyError
            If the text, "entities" or "relations" key is missing.
        TypeError
            If an entity or relation does not match the expected fields.
        """
        # Work on a shallow copy so the caller's dictionary is left untouched
        remaining = doc_line.copy()
        text: str = remaining.pop(client_config.column_text)
        entities = [_DoccanoEntity(**ann) for ann in remaining.pop("entities")]
        relations = [_DoccanoRelation(**ann) for ann in remaining.pop("relations")]
        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, entities=entities, relations=relations, metadata=remaining)

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a dictionary, metadata keys merged at top level."""
        doc_dict: dict[str, Any] = {"text": self.text}
        doc_dict["entities"] = [ent.to_dict() for ent in self.entities]
        doc_dict["relations"] = [rel.to_dict() for rel in self.relations]
        doc_dict.update(self.metadata)
        return doc_dict


@dataclasses.dataclass()
class _DoccanoDocSeqLabeling:
    """Document of a sequence labeling task."""

    text: str
    entities: list[_DoccanoEntityTuple]
    metadata: dict[str, Any]

    @classmethod
    def from_dict(cls, doc_line: dict[str, Any], client_config: DoccanoClientConfig) -> Self:
        """Build a document from a parsed JSONL line.

        Parameters
        ----------
        doc_line : dict of str to Any
            Parsed line; it is not modified.
        client_config : DoccanoClientConfig
            Configuration providing the names of the text and label keys.

        Returns
        -------
        Self
            The document; keys other than the text and label end up in its metadata.

        Raises
        ------
        KeyError
            If the text or label key is missing.
        TypeError
            If a label entry cannot be unpacked into an entity tuple.
        """
        # Work on a shallow copy so the caller's dictionary is left untouched
        remaining = doc_line.copy()
        text = remaining.pop(client_config.column_text)
        entities = [_DoccanoEntityTuple(*ann) for ann in remaining.pop(client_config.column_label)]
        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, entities=entities, metadata=remaining)

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a dictionary, metadata keys merged at top level."""
        doc_dict: dict[str, Any] = {"text": self.text}
        doc_dict["label"] = [ent.to_tuple() for ent in self.entities]
        doc_dict.update(self.metadata)
        return doc_dict


@dataclasses.dataclass()
class _DoccanoDocTextClassification:
    """Document of a text classification task."""

    text: str
    label: str
    metadata: dict[str, Any]

    @classmethod
    def from_dict(cls, doc_line: dict[str, Any], client_config: DoccanoClientConfig) -> Self:
        """Build a document from a parsed JSONL line.

        Only the first element of the label list is used as the category.

        Parameters
        ----------
        doc_line : dict of str to Any
            Parsed line; it is not modified.
        client_config : DoccanoClientConfig
            Configuration providing the names of the text and label keys.

        Returns
        -------
        Self
            The document; keys other than the text and label end up in its metadata.

        Raises
        ------
        KeyError
            If the text or label key is missing.
        IndexError
            If the label list is empty.
        TypeError
            If the first label is not a string.
        """
        # Work on a shallow copy so the caller's dictionary is left untouched
        remaining = doc_line.copy()
        text = remaining.pop(client_config.column_text)
        label = remaining.pop(client_config.column_label)[0]

        if not isinstance(label, str):
            msg = (
                "The label must be a string. Please check if the document corresponds"
                " to a text classification task rather than sequence labeling"
            )
            raise TypeError(msg)
        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, label=label, metadata=remaining)

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a dictionary, metadata keys merged at top level."""
        doc_dict: dict[str, Any] = {"text": self.text, "label": [str(self.label)]}
        doc_dict.update(self.metadata)
        return doc_dict
class DoccanoInputConverter:
    """Convert doccano files (.JSONL) containing annotations for a given task.

    For each line, a :class:`~.core.text.TextDocument` will be created.
    The doccano files can be loaded from a directory with zip files or from a jsonl file.

    The converter supports custom configuration to define the parameters used by doccano
    when importing the data (c.f. :class:`~.io.doccano.DoccanoClientConfig`)

    .. warning::
        If the option *Count grapheme clusters as one character* was selected
        when creating the doccano project, the converted documents are
        likely to have alignment problems; the converter does not support this option.
    """

    def __init__(
        self,
        task: DoccanoTask,
        client_config: DoccanoClientConfig | None = None,
        attr_label: str = "doccano_category",
        uid: str | None = None,
    ):
        """Parameters
        ----------
        task : DoccanoTask
            The doccano task for the input converter
        client_config : DoccanoClientConfig, optional
            Optional client configuration to define default values in doccano interface.
            This config can change, for example, the name of the text field or labels.
        attr_label : str, default="doccano_category"
            The label to use for the medkit attribute that represents the doccano category.
            This is related to :class:`~.io.DoccanoTask.TEXT_CLASSIFICATION` projects.
        uid : str, optional
            Identifier of the converter.
        """
        if uid is None:
            uid = generate_id()

        if client_config is None:
            client_config = DoccanoClientConfig()

        self.uid = uid
        self.client_config = client_config
        self.task = task
        self.attr_label = attr_label
        self._prov_tracer: ProvTracer | None = None

    def set_prov_tracer(self, prov_tracer: ProvTracer):
        """Enable provenance tracing.

        Parameters
        ----------
        prov_tracer : ProvTracer
            The provenance tracer used to trace the provenance.
        """
        self._prov_tracer = prov_tracer

    @property
    def description(self) -> OperationDescription:
        """Description of the converter: its uid, class name and task."""
        return OperationDescription(
            uid=self.uid,
            name=self.__class__.__name__,
            class_name=self.__class__.__name__,
            config={"task": self.task.value},
        )

    def load_from_directory_zip(self, dir_path: str | Path) -> list[TextDocument]:
        """Create a list of TextDocuments from zip files in a directory.

        The zip files should contain a JSONL file coming from doccano.
        Zip files are processed in sorted path order.

        Parameters
        ----------
        dir_path : str or Path
            The path to the directory containing zip files.

        Returns
        -------
        list of TextDocument
            A list of TextDocuments
        """
        documents = [doc for path in sorted(Path(dir_path).glob("*.zip")) for doc in self.load_from_zip(path)]

        if not documents:
            logger.warning("No .zip nor .jsonl found in '%s'", dir_path)

        return documents

    def load_from_zip(self, input_file: str | Path) -> list[TextDocument]:
        """Create a list of TextDocuments from a zip file containing a JSONL file coming from doccano.

        Only the first member listed in the archive is extracted and loaded.

        Parameters
        ----------
        input_file : str or Path
            The path to the zip file containing a doccano JSONL file

        Returns
        -------
        list of TextDocument
            A list of TextDocuments
        """
        with tempfile.TemporaryDirectory() as tmpdir, ZipFile(input_file, mode="r") as zip_file:
            filename = zip_file.namelist()[0]
            unzipped_file = Path(tmpdir) / filename
            zip_file.extract(filename, tmpdir)
            # Load before leaving the context: the temporary directory is deleted on exit
            return self.load_from_file(unzipped_file)

    def load_from_file(self, input_file: str | Path) -> list[TextDocument]:
        """Create a list of TextDocuments from a doccano JSONL file.

        Blank lines are skipped.

        Parameters
        ----------
        input_file : str or Path
            The path to the JSONL file containing doccano annotations

        Returns
        -------
        list of TextDocument
            A list of TextDocuments
        """
        documents = []

        with Path(input_file).open(encoding="utf-8") as fp:
            for line in fp:
                # A blank line (e.g. a trailing empty line) is not a JSON document
                if not line.strip():
                    continue
                documents.append(self._parse_doc_line(json.loads(line)))

        self._check_crlf_character(documents)
        return documents

    def _check_crlf_character(self, documents: list[TextDocument]):
        """Check if the list of converted documents contains the CRLF character.

        This character is the only indicator available to warn
        if there are alignment problems in the documents. The check only runs
        for the relation extraction and sequence labeling tasks.
        """
        if self.task not in (DoccanoTask.RELATION_EXTRACTION, DoccanoTask.SEQUENCE_LABELING):
            return

        nb_docs_with_warning = sum("\r\n" in document.text for document in documents)

        if nb_docs_with_warning > 0:
            logger.warning(
                "%s/%s documents contain"
                " '\\r\\n' characters. If you have selected 'Count grapheme"
                " clusters as one character' when creating the doccano project,"
                " converted documents are likely to have alignment problems.\n"
                " Please ignore this message if you did not select this option when"
                " creating the project.",
                nb_docs_with_warning,
                # Denominator of the ratio: total number of converted documents
                len(documents),
            )

    def _trace_prov(self, data_item: Any) -> None:
        """Register `data_item` in the provenance tracer, if one was set."""
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(data_item, self.description, source_data_items=[])

    def _parse_doc_line(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a doc_line into a TextDocument depending on the task

        Parameters
        ----------
        doc_line : dict of str to Any
            A dictionary representing an annotation from doccano

        Returns
        -------
        TextDocument
            A document with parsed annotations.
        """
        if self.task == DoccanoTask.RELATION_EXTRACTION:
            return self._parse_doc_line_relation_extraction(doc_line=doc_line)
        if self.task == DoccanoTask.TEXT_CLASSIFICATION:
            return self._parse_doc_line_text_classification(doc_line=doc_line)
        if self.task == DoccanoTask.SEQUENCE_LABELING:
            return self._parse_doc_line_seq_labeling(doc_line=doc_line)
        # Only reached when `task` is none of the members tested above
        return None

    def _parse_doc_line_relation_extraction(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a dictionary and return a TextDocument with entities and relations

        Parameters
        ----------
        doc_line : dict of str to Any
            Dictionary with doccano annotation

        Returns
        -------
        TextDocument
            The document with annotations

        Raises
        ------
        ValueError
            If the line does not match the relation extraction layout.
        """
        try:
            doccano_doc = _DoccanoDocRelationExtraction.from_dict(doc_line, client_config=self.client_config)
        except (KeyError, TypeError) as err:
            msg = (
                "Impossible to convert the document. Please check the task"
                " or the client configuration of the converter."
            )
            raise ValueError(msg) from err

        ents_by_doccano_id = {}
        for doccano_entity in doccano_doc.entities:
            start, end = doccano_entity.start_offset, doccano_entity.end_offset
            entity = Entity(
                text=doccano_doc.text[start:end],
                label=doccano_entity.label,
                spans=[Span(start, end)],
                metadata={"doccano_id": doccano_entity.id},
            )
            # Indexed by doccano ID so relations can be resolved to medkit uids below
            ents_by_doccano_id[doccano_entity.id] = entity
            self._trace_prov(entity)

        relations = []
        for doccano_relation in doccano_doc.relations:
            relation = Relation(
                label=doccano_relation.type,
                source_id=ents_by_doccano_id[doccano_relation.from_id].uid,
                target_id=ents_by_doccano_id[doccano_relation.to_id].uid,
                metadata={"doccano_id": doccano_relation.id},
            )
            relations.append(relation)
            self._trace_prov(relation)

        return TextDocument(
            text=doccano_doc.text,
            anns=[*ents_by_doccano_id.values(), *relations],
            metadata=doccano_doc.metadata,
        )

    def _parse_doc_line_seq_labeling(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a dictionary and return a TextDocument with entities

        Parameters
        ----------
        doc_line : dict of str to Any
            Dictionary with doccano annotation.

        Returns
        -------
        TextDocument
            The document with annotations

        Raises
        ------
        ValueError
            If the line does not match the sequence labeling layout.
        """
        try:
            doccano_doc = _DoccanoDocSeqLabeling.from_dict(doc_line, client_config=self.client_config)
        except (KeyError, TypeError) as err:
            msg = (
                "Impossible to convert the document. Please check the task"
                " or the client configuration of the converter."
            )
            raise ValueError(msg) from err

        entities = []
        for doccano_entity in doccano_doc.entities:
            start, end = doccano_entity.start_offset, doccano_entity.end_offset
            entity = Entity(
                text=doccano_doc.text[start:end],
                label=doccano_entity.label,
                spans=[Span(start, end)],
            )
            entities.append(entity)
            self._trace_prov(entity)

        return TextDocument(
            text=doccano_doc.text,
            anns=entities,
            metadata=doccano_doc.metadata,
        )

    def _parse_doc_line_text_classification(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a dictionary and return a TextDocument with an attribute.

        Parameters
        ----------
        doc_line : dict of str to Any
            Dictionary with doccano annotation.

        Returns
        -------
        TextDocument
            The document with its category

        Raises
        ------
        ValueError
            If the line does not match the text classification layout,
            including when its list of labels is empty.
        """
        try:
            doccano_doc = _DoccanoDocTextClassification.from_dict(doc_line, client_config=self.client_config)
        # IndexError: the label list is empty, so there is no category to read
        except (KeyError, TypeError, IndexError) as err:
            msg = (
                "Impossible to convert the document. Please check the task"
                " or the client configuration of the converter."
            )
            raise ValueError(msg) from err

        attr = Attribute(label=self.attr_label, value=doccano_doc.label)
        self._trace_prov(attr)

        doc = TextDocument(text=doccano_doc.text, metadata=doccano_doc.metadata)
        doc.attrs.add(attr)
        return doc
class DoccanoOutputConverter:
    """Convert medkit files to doccano files (.JSONL) for a given task.

    Each :class:`~medkit.core.text.TextDocument` produces one JSON line.
    """

    def __init__(
        self,
        task: DoccanoTask,
        anns_labels: list[str] | None = None,
        attr_label: str | None = None,
        ignore_segments: bool = True,
        include_metadata: bool | None = True,
        uid: str | None = None,
    ):
        """Parameters
        ----------
        task : DoccanoTask
            The doccano task for the output converter
        anns_labels : list of str, optional
            Labels of medkit annotations to convert into doccano annotations.
            If `None` (default) all the entities or relations will be converted.
            Useful for :class:`~.io.DoccanoTask.SEQUENCE_LABELING` or
            :class:`~.io.DoccanoTask.RELATION_EXTRACTION` converters.
        attr_label : str, optional
            The label of the medkit attribute that represents the text category.
            Useful for :class:`~.io.DoccanoTask.TEXT_CLASSIFICATION` converters.
        ignore_segments : bool, default=True
            If `True` medkit segments will be ignored. Only entities will be
            converted to Doccano entities. If `False` the medkit segments will
            be converted to Doccano entities as well.
            Useful for :class:`~.io.DoccanoTask.SEQUENCE_LABELING` or
            :class:`~.io.DoccanoTask.RELATION_EXTRACTION` converters.
        include_metadata : bool, default=True
            Whether to include medkit metadata in the converted documents
        uid : str, optional
            Identifier of the converter.
        """
        self.uid = generate_id() if uid is None else uid
        self.task = task
        self.anns_labels = anns_labels
        self.attr_label = attr_label
        self.ignore_segments = ignore_segments
        self.include_metadata = include_metadata

    @property
    def description(self) -> OperationDescription:
        """Description of the converter: its uid, class name and task."""
        class_name = self.__class__.__name__
        return OperationDescription(
            uid=self.uid,
            name=class_name,
            class_name=class_name,
            config={"task": self.task.value},
        )

    def save(self, docs: list[TextDocument], output_file: str | Path):
        """Convert and save a list of TextDocuments into a doccano file (.JSONL)

        Parameters
        ----------
        docs : list of TextDocument
            List of medkit doc objects to convert
        output_file : str or Path
            Path or string of the JSONL file where to save the converted documents
        """
        with Path(output_file).open(mode="w", encoding="utf-8") as fp:
            for medkit_doc in docs:
                serialized = json.dumps(self._convert_doc_by_task(medkit_doc), ensure_ascii=False)
                fp.write(serialized + "\n")

    def _convert_doc_by_task(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument into a dictionary depending on the task

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation
        """
        handlers = (
            (DoccanoTask.RELATION_EXTRACTION, self._convert_doc_relation_extraction),
            (DoccanoTask.TEXT_CLASSIFICATION, self._convert_doc_text_classification),
            (DoccanoTask.SEQUENCE_LABELING, self._convert_doc_seq_labeling),
        )
        for task, handler in handlers:
            if self.task == task:
                return handler(medkit_doc=medkit_doc)
        return None

    def _select_segments(self, anns_by_type: dict[str, Any]) -> list[Any]:
        """Return the entities, followed by the segments unless they are ignored."""
        selected = anns_by_type["entities"]
        if not self.ignore_segments:
            selected += anns_by_type["segments"]
        return selected

    def _convert_doc_relation_extraction(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible with the doccano relation extraction task

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert, it may contain entities and relations.

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation. It may contain
            text, entities and relations.
        """
        anns_by_type = get_anns_by_type(medkit_doc, self.anns_labels)

        # Doccano entities indexed by medkit uid, used to resolve relation ends
        entities_by_uid = {}
        for segment in self._select_segments(anns_by_type):
            normalized = span_utils.normalize_spans(segment.spans)
            entities_by_uid[segment.uid] = _DoccanoEntity(
                id=generate_deterministic_id(segment.uid).int,
                start_offset=normalized[0].start,
                end_offset=normalized[-1].end,
                label=segment.label,
            )

        converted_relations = []
        for medkit_relation in anns_by_type["relations"]:
            source = entities_by_uid.get(medkit_relation.source_id)
            target = entities_by_uid.get(medkit_relation.target_id)

            if source is None or target is None:
                logger.warning("Ignore relation %s. Entity source/target was no found", medkit_relation.uid)
                continue

            converted_relations.append(
                _DoccanoRelation(
                    id=generate_deterministic_id(medkit_relation.uid).int,
                    from_id=source.id,
                    to_id=target.id,
                    type=medkit_relation.label,
                )
            )

        return _DoccanoDocRelationExtraction(
            text=medkit_doc.text,
            entities=list(entities_by_uid.values()),
            relations=converted_relations,
            metadata=medkit_doc.metadata if self.include_metadata else {},
        ).to_dict()

    def _convert_doc_seq_labeling(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible with the doccano sequence labeling task

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert, it may contain entities.

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation. It may contain
            text and its label (a list of tuples representing entities).
        """
        anns_by_type = get_anns_by_type(medkit_doc, self.anns_labels)

        converted_entities = []
        for segment in self._select_segments(anns_by_type):
            normalized = span_utils.normalize_spans(segment.spans)
            converted_entities.append(
                _DoccanoEntityTuple(
                    start_offset=normalized[0].start,
                    end_offset=normalized[-1].end,
                    label=segment.label,
                )
            )

        return _DoccanoDocSeqLabeling(
            text=medkit_doc.text,
            entities=converted_entities,
            metadata=medkit_doc.metadata if self.include_metadata else {},
        ).to_dict()

    def _convert_doc_text_classification(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible with the doccano text classification task.

        The first attribute found with the configured label gives the category.

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert, it must contain at least one attribute to convert.

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation. It may contain
            text and its label (a category(str)).

        Raises
        ------
        KeyError
            If the document has no attribute with the configured label.
        """
        candidates = medkit_doc.attrs.get(label=self.attr_label)

        if not candidates:
            msg = (
                "The attribute with the corresponding text class was not found. Check"
                f" the 'attr_label' for this converter, {self.attr_label} was provided."
            )
            raise KeyError(msg)

        return _DoccanoDocTextClassification(
            text=medkit_doc.text,
            label=candidates[0].value,
            metadata=medkit_doc.metadata if self.include_metadata else {},
        ).to_dict()