Source code for medkit.io.doccano
from __future__ import annotations
__all__ = [
"DoccanoTask",
"DoccanoClientConfig",
"DoccanoInputConverter",
"DoccanoOutputConverter",
]
import dataclasses
import enum
import json
import logging
import tempfile
from pathlib import Path
from typing import Any
from zipfile import ZipFile
from typing_extensions import Self
from medkit.core import Attribute, OperationDescription, ProvTracer
from medkit.core.id import generate_deterministic_id, generate_id
from medkit.core.text import Entity, Relation, Span, TextDocument, span_utils
from medkit.io._common import get_anns_by_type
logger = logging.getLogger(__name__)
[docs]
class DoccanoTask(enum.Enum):
    """Kinds of doccano projects handled by the converters.

    The selected task determines how the annotations of a document
    are laid out, and therefore how a document is converted.

    Attributes
    ----------
    TEXT_CLASSIFICATION
        Documents with a category
    RELATION_EXTRACTION
        Documents with entities and relations (including IDs)
    SEQUENCE_LABELING
        Documents with entities in tuples
    """

    TEXT_CLASSIFICATION = "text_classification"
    RELATION_EXTRACTION = "relation_extraction"
    SEQUENCE_LABELING = "sequence_labeling"
[docs]
@dataclasses.dataclass
class DoccanoClientConfig:
    """Names of the fields used by the doccano client.

    Every field defaults to the name that doccano itself uses by default,
    so an instance only needs arguments when the project was customized.

    Attributes
    ----------
    column_text : str, default="text"
        Name or key representing the text
    column_label : str, default="label"
        Name or key representing the label
    """

    # key under which the raw text of a document is stored
    column_text: str = "text"
    # key under which the labels of a document are stored
    column_label: str = "label"
# FIXME: datamodels to factorize in _doccano_utils
@dataclasses.dataclass()
class _DoccanoEntity:
    """Entity as stored by doccano: an identifier, character offsets and a label."""

    id: int
    start_offset: int
    end_offset: int
    label: str

    def to_dict(self) -> dict[str, Any]:
        """Return the entity as a dict, with keys in field declaration order."""
        return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
@dataclasses.dataclass()
class _DoccanoEntityTuple:
    """Entity without identifier: character offsets and a label."""

    start_offset: int
    end_offset: int
    label: str

    def to_tuple(self) -> tuple[int, int, str]:
        """Return ``(start_offset, end_offset, label)``."""
        return tuple(getattr(self, field.name) for field in dataclasses.fields(self))
@dataclasses.dataclass()
class _DoccanoRelation:
    """Relation as stored by doccano: an identifier, the ids of both ends and a type."""

    id: int
    from_id: int
    to_id: int
    type: str

    def to_dict(self) -> dict[str, Any]:
        """Return the relation as a dict, with keys in field declaration order."""
        return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
@dataclasses.dataclass()
class _DoccanoDocRelationExtraction:
    """Document of a relation extraction project: text, entities, relations and metadata."""

    text: str
    entities: list[_DoccanoEntity]
    relations: list[_DoccanoRelation]
    metadata: dict[str, Any]

    @classmethod
    def from_dict(cls, doc_line: dict[str, Any], client_config: DoccanoClientConfig) -> Self:
        """Build a document from a decoded JSONL line.

        Parameters
        ----------
        doc_line : dict of str to Any
            Decoded line. It is left unmodified.
        client_config : DoccanoClientConfig
            Configuration providing the name of the text field.

        Returns
        -------
        Self
            The document; every key that is not a key field ends up in `metadata`.

        Raises
        ------
        KeyError
            If the text field, "entities" or "relations" is missing.
        TypeError
            If an entity or a relation has unexpected keys.
        """
        # pop from a shallow copy: the caller's dict must not end up
        # half-consumed, in particular when a missing key raises midway
        remaining = dict(doc_line)
        text: str = remaining.pop(client_config.column_text)
        entities = [_DoccanoEntity(**ann) for ann in remaining.pop("entities")]
        relations = [_DoccanoRelation(**ann) for ann in remaining.pop("relations")]
        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, entities=entities, relations=relations, metadata=remaining)

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a dict ready to be dumped as a JSONL line.

        The metadata items are merged last, at the top level of the dict.
        """
        doc_dict: dict[str, Any] = {
            "text": self.text,
            "entities": [ent.to_dict() for ent in self.entities],
            "relations": [rel.to_dict() for rel in self.relations],
        }
        doc_dict.update(self.metadata)
        return doc_dict
@dataclasses.dataclass()
class _DoccanoDocSeqLabeling:
    """Document of a sequence labeling project: text, entities and metadata."""

    text: str
    entities: list[_DoccanoEntityTuple]
    metadata: dict[str, Any]

    @classmethod
    def from_dict(cls, doc_line: dict[str, Any], client_config: DoccanoClientConfig) -> Self:
        """Build a document from a decoded JSONL line.

        Parameters
        ----------
        doc_line : dict of str to Any
            Decoded line. It is left unmodified.
        client_config : DoccanoClientConfig
            Configuration providing the names of the text and label fields.

        Returns
        -------
        Self
            The document; every key that is not a key field ends up in `metadata`.

        Raises
        ------
        KeyError
            If the text field or the label field is missing.
        TypeError
            If a label cannot be unpacked into an entity tuple.
        """
        # pop from a shallow copy: the caller's dict must not end up
        # half-consumed, in particular when a missing key raises midway
        remaining = dict(doc_line)
        text = remaining.pop(client_config.column_text)
        entities = [_DoccanoEntityTuple(*ann) for ann in remaining.pop(client_config.column_label)]
        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, entities=entities, metadata=remaining)

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a dict ready to be dumped as a JSONL line.

        The metadata items are merged last, at the top level of the dict.
        """
        doc_dict: dict[str, Any] = {
            "text": self.text,
            "label": [ent.to_tuple() for ent in self.entities],
        }
        doc_dict.update(self.metadata)
        return doc_dict
@dataclasses.dataclass()
class _DoccanoDocTextClassification:
    """Document of a text classification project: text, category and metadata."""

    text: str
    label: str
    metadata: dict[str, Any]

    @classmethod
    def from_dict(cls, doc_line: dict[str, Any], client_config: DoccanoClientConfig) -> Self:
        """Build a document from a decoded JSONL line.

        Only the first label of the label field is kept.

        Parameters
        ----------
        doc_line : dict of str to Any
            Decoded line. It is left unmodified.
        client_config : DoccanoClientConfig
            Configuration providing the names of the text and label fields.

        Returns
        -------
        Self
            The document; every key that is not a key field ends up in `metadata`.

        Raises
        ------
        KeyError
            If the text field or the label field is missing.
        TypeError
            If the label field is a bare string, or if its first item is not a string.
        IndexError
            If the label field is empty.
        """
        # pop from a shallow copy: the caller's dict must not end up
        # half-consumed, in particular when a missing key raises midway
        remaining = dict(doc_line)
        text = remaining.pop(client_config.column_text)
        labels = remaining.pop(client_config.column_label)
        if isinstance(labels, str):
            # indexing a bare string would silently keep only its first character
            msg = "The labels must be provided as a list of categories, a string was found instead"
            raise TypeError(msg)

        label = labels[0]
        if not isinstance(label, str):
            msg = (
                "The label must be a string. Please check if the document corresponds"
                " to a text classification task rather than sequence labeling"
            )
            raise TypeError(msg)

        # in doccano, metadata is what remains after removing key fields
        return cls(text=text, label=label, metadata=remaining)

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a dict ready to be dumped as a JSONL line.

        The category is written as a one-item list. The metadata items are
        merged last, at the top level of the dict.
        """
        doc_dict: dict[str, Any] = {"text": self.text, "label": [str(self.label)]}
        doc_dict.update(self.metadata)
        return doc_dict
[docs]
class DoccanoInputConverter:
    """Convert doccano files (.JSONL) containing annotations for a given task.

    For each line, a :class:`~.core.text.TextDocument` will be created.
    The doccano files can be loaded from a directory with zip files or from a jsonl file.

    The converter supports custom configuration to define the parameters used by doccano
    when importing the data (c.f. :class:`~.io.doccano.DoccanoClientConfig`)

    .. warning::
        If the option *Count grapheme clusters as one character* was selected
        when creating the doccano project, the converted documents are
        likely to have alignment problems; the converter does not support this option.
    """

    def __init__(
        self,
        task: DoccanoTask,
        client_config: DoccanoClientConfig | None = None,
        attr_label: str = "doccano_category",
        uid: str | None = None,
    ):
        """Parameters
        ----------
        task : DoccanoTask
            The doccano task for the input converter
        client_config : DoccanoClientConfig, optional
            Optional client configuration to define default values in doccano interface.
            This config can change, for example, the name of the text field or labels.
        attr_label : str, default="doccano_category"
            The label to use for the medkit attribute that represents the doccano category.
            This is related to :class:`~.io.DoccanoTask.TEXT_CLASSIFICATION` projects.
        uid : str, optional
            Identifier of the converter.
        """
        if uid is None:
            uid = generate_id()
        if client_config is None:
            client_config = DoccanoClientConfig()

        self.uid = uid
        self.client_config = client_config
        self.task = task
        self.attr_label = attr_label
        self._prov_tracer: ProvTracer | None = None

    def set_prov_tracer(self, prov_tracer: ProvTracer):
        """Enable provenance tracing.

        Parameters
        ----------
        prov_tracer : ProvTracer
            The provenance tracer used to trace the provenance.
        """
        self._prov_tracer = prov_tracer

    @property
    def description(self) -> OperationDescription:
        """Contains all the input converter init parameters."""
        return OperationDescription(
            uid=self.uid,
            name=self.__class__.__name__,
            class_name=self.__class__.__name__,
            config={"task": self.task.value},
        )

    def load_from_directory_zip(self, dir_path: str | Path) -> list[TextDocument]:
        """Create a list of TextDocuments from zip files in a directory.

        The zip files should contain a JSONL file coming from doccano.
        Zip files are processed in sorted path order.

        Parameters
        ----------
        dir_path : str or Path
            The path to the directory containing zip files.

        Returns
        -------
        list of TextDocument
            A list of TextDocuments
        """
        documents = [doc for path in sorted(Path(dir_path).glob("*.zip")) for doc in self.load_from_zip(path)]
        if not documents:
            logger.warning("No .zip nor .jsonl found in '%s'", dir_path)
        return documents

    def load_from_zip(self, input_file: str | Path) -> list[TextDocument]:
        """Create a list of TextDocuments from a zip file containing a JSONL file
        coming from doccano.

        Only the first member of the archive is read.

        Parameters
        ----------
        input_file : str or Path
            The path to the zip file containing a doccano JSONL file

        Returns
        -------
        list of TextDocument
            A list of TextDocuments
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            with ZipFile(input_file, mode="r") as zip_file:
                filename = zip_file.namelist()[0]
                # extract() returns the path it actually wrote; member names are
                # sanitized on extraction, so rebuilding the path from the raw
                # name could point to a file that does not exist
                unzipped_file = Path(zip_file.extract(filename, tmpdir))
            # the documents must be loaded before the temporary directory is removed
            return self.load_from_file(unzipped_file)

    def load_from_file(self, input_file: str | Path) -> list[TextDocument]:
        """Create a list of TextDocuments from a doccano JSONL file.

        Blank lines are ignored.

        Parameters
        ----------
        input_file : str or Path
            The path to the JSONL file containing doccano annotations

        Returns
        -------
        list of TextDocument
            A list of TextDocuments
        """
        documents = []
        with Path(input_file).open(encoding="utf-8") as fp:
            for line in fp:
                if not line.strip():
                    # a blank line is not a JSON document, json.loads would fail on it
                    continue
                doc_line = json.loads(line)
                documents.append(self._parse_doc_line(doc_line))

        self._check_crlf_character(documents)
        return documents

    def _check_crlf_character(self, documents: list[TextDocument]):
        """Check if the list of converted documents contains the CRLF character.

        This character is the only indicator available to warn
        if there are alignment problems in the documents.
        Nothing is checked for text classification, only tasks
        with offsets are concerned.
        """
        if self.task not in (DoccanoTask.RELATION_EXTRACTION, DoccanoTask.SEQUENCE_LABELING):
            return

        nb_docs_with_warning = sum("\r\n" in document.text for document in documents)
        if nb_docs_with_warning > 0:
            logger.warning(
                "%s/%s documents contain"
                " '\\r\\n' characters. If you have selected 'Count grapheme"
                " clusters as one character' when creating the doccano project,"
                " converted documents are likely to have alignment problems.\n"
                " Please ignore this message if you did not select this option when"
                " creating the project.",
                nb_docs_with_warning,
                # the denominator is the total number of documents, not the affected ones
                len(documents),
            )

    def _parse_doc_line(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a doc_line into a TextDocument depending on the task

        Parameters
        ----------
        doc_line : dict of str to Any
            A dictionary representing an annotation from doccano

        Returns
        -------
        TextDocument
            A document with parsed annotations. `None` is returned
            when the task of the converter is not a known task.
        """
        if self.task == DoccanoTask.RELATION_EXTRACTION:
            return self._parse_doc_line_relation_extraction(doc_line=doc_line)
        if self.task == DoccanoTask.TEXT_CLASSIFICATION:
            return self._parse_doc_line_text_classification(doc_line=doc_line)
        if self.task == DoccanoTask.SEQUENCE_LABELING:
            return self._parse_doc_line_seq_labeling(doc_line=doc_line)
        return None

    def _build_doccano_doc(self, doc_cls: type, doc_line: dict[str, Any]) -> Any:
        """Build the doccano data model of a doc_line, reporting any mismatch as a ValueError.

        Parameters
        ----------
        doc_cls : type
            Data model class exposing a `from_dict` class method.
        doc_line : dict of str to Any
            Dictionary with doccano annotation

        Returns
        -------
        Any
            Instance of `doc_cls` built from `doc_line`

        Raises
        ------
        ValueError
            If the doc_line does not have the structure expected for the task
        """
        try:
            return doc_cls.from_dict(doc_line, client_config=self.client_config)
        # IndexError: raised by an empty list of labels (document without category)
        except (KeyError, TypeError, IndexError) as err:
            msg = (
                "Impossible to convert the document. Please check the task"
                " or the client configuration of the converter."
            )
            raise ValueError(msg) from err

    def _parse_doc_line_relation_extraction(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a dictionary and return a TextDocument with entities and relations

        Parameters
        ----------
        doc_line : dict of str to Any
            Dictionary with doccano annotation

        Returns
        -------
        TextDocument
            The document with annotations

        Raises
        ------
        ValueError
            If the doc_line does not correspond to a relation extraction document
        """
        doccano_doc = self._build_doccano_doc(_DoccanoDocRelationExtraction, doc_line)

        # entities are indexed by their doccano id to resolve both ends of the relations
        ents_by_doccano_id = {}
        relations = []
        for doccano_entity in doccano_doc.entities:
            text = doccano_doc.text[doccano_entity.start_offset : doccano_entity.end_offset]
            entity = Entity(
                text=text,
                label=doccano_entity.label,
                spans=[Span(doccano_entity.start_offset, doccano_entity.end_offset)],
                metadata={"doccano_id": doccano_entity.id},
            )
            ents_by_doccano_id[doccano_entity.id] = entity

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(entity, self.description, source_data_items=[])

        for doccano_relation in doccano_doc.relations:
            relation = Relation(
                label=doccano_relation.type,
                source_id=ents_by_doccano_id[doccano_relation.from_id].uid,
                target_id=ents_by_doccano_id[doccano_relation.to_id].uid,
                metadata={"doccano_id": doccano_relation.id},
            )
            relations.append(relation)

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(relation, self.description, source_data_items=[])

        anns = list(ents_by_doccano_id.values()) + relations
        return TextDocument(
            text=doccano_doc.text,
            anns=anns,
            metadata=doccano_doc.metadata,
        )

    def _parse_doc_line_seq_labeling(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a dictionary and return a TextDocument with entities

        Parameters
        ----------
        doc_line : dict of str to Any
            Dictionary with doccano annotation.

        Returns
        -------
        TextDocument
            The document with annotations

        Raises
        ------
        ValueError
            If the doc_line does not correspond to a sequence labeling document
        """
        doccano_doc = self._build_doccano_doc(_DoccanoDocSeqLabeling, doc_line)

        entities = []
        for doccano_entity in doccano_doc.entities:
            text = doccano_doc.text[doccano_entity.start_offset : doccano_entity.end_offset]
            entity = Entity(
                text=text,
                label=doccano_entity.label,
                spans=[Span(doccano_entity.start_offset, doccano_entity.end_offset)],
            )
            entities.append(entity)

            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(entity, self.description, source_data_items=[])

        return TextDocument(
            text=doccano_doc.text,
            anns=entities,
            metadata=doccano_doc.metadata,
        )

    def _parse_doc_line_text_classification(self, doc_line: dict[str, Any]) -> TextDocument:
        """Parse a dictionary and return a TextDocument with an attribute.

        Parameters
        ----------
        doc_line : dict of str to Any
            Dictionary with doccano annotation.

        Returns
        -------
        TextDocument
            The document with its category

        Raises
        ------
        ValueError
            If the doc_line does not correspond to a text classification
            document, or if the document has no category
        """
        doccano_doc = self._build_doccano_doc(_DoccanoDocTextClassification, doc_line)

        attr = Attribute(label=self.attr_label, value=doccano_doc.label)
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(attr, self.description, source_data_items=[])

        doc = TextDocument(text=doccano_doc.text, metadata=doccano_doc.metadata)
        doc.attrs.add(attr)
        return doc
[docs]
class DoccanoOutputConverter:
    """Write medkit documents as doccano files (.JSONL) for a given task.

    Each :class:`~medkit.core.text.TextDocument` becomes one JSON line.
    """

    def __init__(
        self,
        task: DoccanoTask,
        anns_labels: list[str] | None = None,
        attr_label: str | None = None,
        ignore_segments: bool = True,
        include_metadata: bool | None = True,
        uid: str | None = None,
    ):
        """Parameters
        ----------
        task : DoccanoTask
            The doccano task for the output converter
        anns_labels : list of str, optional
            Labels of medkit annotations to convert into doccano annotations.
            If `None` (default) all the entities or relations will be converted.
            Useful for :class:`~.io.DoccanoTask.SEQUENCE_LABELING` or
            :class:`~.io.DoccanoTask.RELATION_EXTRACTION` converters.
        attr_label : str, optional
            The label of the medkit attribute that represents the text category.
            Useful for :class:`~.io.DoccanoTask.TEXT_CLASSIFICATION` converters.
        ignore_segments : bool, default=True
            If `True` medkit segments will be ignored. Only entities will be
            converted to Doccano entities. If `False` the medkit segments will
            be converted to Doccano entities as well.
            Useful for :class:`~.io.DoccanoTask.SEQUENCE_LABELING` or
            :class:`~.io.DoccanoTask.RELATION_EXTRACTION` converters.
        include_metadata : bool, default=True
            Whether to include medkit metadata in the converted documents
        uid : str, optional
            Identifier of the converter.
        """
        self.uid = generate_id() if uid is None else uid
        self.task = task
        self.anns_labels = anns_labels
        self.attr_label = attr_label
        self.ignore_segments = ignore_segments
        self.include_metadata = include_metadata

    @property
    def description(self) -> OperationDescription:
        """Description of the converter: its identifier, class name and task."""
        converter_name = self.__class__.__name__
        return OperationDescription(
            uid=self.uid,
            name=converter_name,
            class_name=converter_name,
            config={"task": self.task.value},
        )

    def save(self, docs: list[TextDocument], output_file: str | Path):
        """Convert and save a list of TextDocuments into a doccano file (.JSONL)

        Parameters
        ----------
        docs : list of TextDocument
            List of medkit doc objects to convert
        output_file : str or Path
            Path or string of the JSONL file where to save the converted documents
        """
        with Path(output_file).open(mode="w", encoding="utf-8") as fp:
            # the generator is consumed lazily: one document is converted per written line
            fp.writelines(
                json.dumps(self._convert_doc_by_task(medkit_doc), ensure_ascii=False) + "\n" for medkit_doc in docs
            )

    def _convert_doc_by_task(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument into a dictionary depending on the task

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation. `None` is returned
            when the task of the converter is not a known task.
        """
        converters_by_task = (
            (DoccanoTask.RELATION_EXTRACTION, self._convert_doc_relation_extraction),
            (DoccanoTask.TEXT_CLASSIFICATION, self._convert_doc_text_classification),
            (DoccanoTask.SEQUENCE_LABELING, self._convert_doc_seq_labeling),
        )
        for task, convert in converters_by_task:
            if self.task == task:
                return convert(medkit_doc=medkit_doc)
        return None

    def _segments_to_convert(self, anns_by_type: dict[str, Any]) -> list[Any]:
        """Return the entities, followed by the segments unless they are ignored."""
        selected = anns_by_type["entities"]
        if not self.ignore_segments:
            selected.extend(anns_by_type["segments"])
        return selected

    def _metadata_to_export(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Return the metadata of the document, or an empty dict when metadata is excluded."""
        if self.include_metadata:
            return medkit_doc.metadata
        return {}

    def _convert_doc_relation_extraction(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible
        with the doccano relation extraction task

        Relations whose source or target was not converted are skipped with a warning.

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert, it may contain entities and relations.

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation. It may contain text, entities and relations.
        """
        anns_by_type = get_anns_by_type(medkit_doc, self.anns_labels)

        # doccano entities indexed by medkit uid, used to resolve the ends of the relations
        entities_by_uid = {}
        for segment in self._segments_to_convert(anns_by_type):
            normalized = span_utils.normalize_spans(segment.spans)
            entity_id = generate_deterministic_id(segment.uid)
            entities_by_uid[segment.uid] = _DoccanoEntity(
                id=entity_id.int,
                start_offset=normalized[0].start,
                end_offset=normalized[-1].end,
                label=segment.label,
            )

        exported_relations = []
        for medkit_relation in anns_by_type["relations"]:
            source = entities_by_uid.get(medkit_relation.source_id)
            target = entities_by_uid.get(medkit_relation.target_id)
            if source is not None and target is not None:
                relation_id = generate_deterministic_id(medkit_relation.uid)
                exported_relations.append(
                    _DoccanoRelation(
                        id=relation_id.int,
                        from_id=source.id,
                        to_id=target.id,
                        type=medkit_relation.label,
                    )
                )
            else:
                logger.warning("Ignore relation %s. Entity source/target was no found", medkit_relation.uid)

        return _DoccanoDocRelationExtraction(
            text=medkit_doc.text,
            entities=list(entities_by_uid.values()),
            relations=exported_relations,
            metadata=self._metadata_to_export(medkit_doc),
        ).to_dict()

    def _convert_doc_seq_labeling(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible
        with the doccano sequence labeling task

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert, it may contain entities.

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation. It may contain
            text and its label (a list of tuples representing entities).
        """
        anns_by_type = get_anns_by_type(medkit_doc, self.anns_labels)

        exported_entities = []
        for segment in self._segments_to_convert(anns_by_type):
            normalized = span_utils.normalize_spans(segment.spans)
            exported_entities.append(
                _DoccanoEntityTuple(
                    start_offset=normalized[0].start,
                    end_offset=normalized[-1].end,
                    label=segment.label,
                )
            )

        return _DoccanoDocSeqLabeling(
            text=medkit_doc.text,
            entities=exported_entities,
            metadata=self._metadata_to_export(medkit_doc),
        ).to_dict()

    def _convert_doc_text_classification(self, medkit_doc: TextDocument) -> dict[str, Any]:
        """Convert a TextDocument to a doc_line compatible with
        the doccano text classification task.

        The value of the first attribute found with `attr_label` is used as category.

        Parameters
        ----------
        medkit_doc : TextDocument
            Document to convert, it must contain at least one attribute to convert.

        Returns
        -------
        dict of str to Any
            Dictionary with doccano annotation. It may contain
            text and its label (a category(str)).

        Raises
        ------
        KeyError
            If the document has no attribute with the label `attr_label`
        """
        category_attrs = medkit_doc.attrs.get(label=self.attr_label)
        if not category_attrs:
            msg = (
                "The attribute with the corresponding text class was not found. Check"
                f" the 'attr_label' for this converter, {self.attr_label} was provided."
            )
            raise KeyError(msg)

        return _DoccanoDocTextClassification(
            text=medkit_doc.text,
            label=category_attrs[0].value,
            metadata=self._metadata_to_export(medkit_doc),
        ).to_dict()