Source code for medkit.tools.mtsamples

"""This module provides facilities for accessing some example mtsamples files
available in this repository: https://github.com/neurazlab/mtsamplesFR

Refer to the repository for more information.

This repository contains:

* **a version of mtsamples.csv**
    Source: https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions
    license: CC0: Public Domain

* **a mtsamples_translated.json file which is a translation to French**

Date: 08/04/2022
"""

from __future__ import annotations

__all__ = ["load_mtsamples", "convert_mtsamples_to_medkit"]

import csv
import json
import urllib.request
from pathlib import Path

from medkit.core.text import TextDocument
from medkit.io.medkit_json import save_text_documents

# Base URL that the data file names below are appended to when downloading.
_REPO_URL: str = "https://raw.githubusercontent.com/aneuraz/mtsamplesFR/master/data/"
# Name of the CSV file holding the original (EN) transcriptions.
_MTSAMPLES_FILE: str = "mtsamples.csv"
# Name of the JSON file holding the translated (FR) transcriptions.
_MTSAMPLES_TRANSLATED_FILE: str = "mtsamples_translated.json"


def load_mtsamples(
    cache_dir: Path | str = ".cache",
    translated: bool = True,
    nb_max: int | None = None,
) -> list[TextDocument]:
    """Load mtsamples data into medkit text documents.

    The data file is downloaded into `cache_dir` the first time it is needed
    and read from there on subsequent calls.

    Parameters
    ----------
    cache_dir : str or Path, default=".cache"
        Directory where to store mtsamples file. Default: .cache
    translated : bool, default=True
        If True (default), `mtsamples_translated.json` file is used (FR).
        If False, `mtsamples.csv` is used (EN)
    nb_max : int, optional
        Maximum number of documents to load

    Returns
    -------
    list of TextDocument
        The medkit text documents corresponding to mtsamples data
    """
    # Everything that depends on the language variant is selected once here.
    if translated:
        filename = _MTSAMPLES_TRANSLATED_FILE
        text_key = "transcription_translated"
        id_key = "id"
    else:
        filename = _MTSAMPLES_FILE
        text_key = "transcription"
        # In the CSV variant the identifier is read from the column whose
        # header is the empty string.
        id_key = ""

    mtsamples_url = _REPO_URL + filename
    cache_file = Path(cache_dir) / filename

    if not cache_file.exists():
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        # Download next to the final location and rename only on success, so
        # an interrupted download never leaves a truncated file that later
        # calls would mistake for a valid cache entry.
        partial_file = cache_file.with_name(cache_file.name + ".part")
        try:
            urllib.request.urlretrieve(mtsamples_url, partial_file)  # noqa: S310
            partial_file.replace(cache_file)
        finally:
            # Only still present if the download or the rename failed.
            if partial_file.exists():
                partial_file.unlink()

    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # which may not be able to decode accented characters.
    with cache_file.open(encoding="utf-8") as fp:
        mtsamples = json.load(fp) if translated else list(csv.DictReader(fp))

    if nb_max is not None:
        mtsamples = mtsamples[:nb_max]

    return [
        TextDocument(
            text=sample[text_key],
            metadata={
                "id": sample[id_key],
                "description": sample["description"],
                "medical_specialty": sample["medical_specialty"],
                "sample_name": sample["sample_name"],
                "keywords": sample["keywords"],
            },
        )
        for sample in mtsamples
    ]
def convert_mtsamples_to_medkit(
    output_file: Path | str,
    encoding: str | None = "utf-8",
    cache_dir: Path | str = ".cache",
    translated: bool = True,
    nb_max: int | None = None,
) -> None:
    """Convert mtsamples data into a medkit file.

    The documents are obtained with `load_mtsamples` and then written with
    `save_text_documents`.

    Parameters
    ----------
    output_file : str or Path
        Path to the medkit jsonl file to generate
    encoding : str, default="utf-8"
        Encoding of the medkit file to generate
    cache_dir : str or Path, default=".cache"
        Directory where mtsamples file is cached. Default: .cache
    translated : bool, default=True
        If True (default), `mtsamples_translated.json` file is used (FR).
        If False, `mtsamples.csv` is used (EN)
    nb_max : int, optional
        Maximum number of documents to convert. If None (default), all
        documents are converted.
    """
    docs = load_mtsamples(cache_dir=cache_dir, translated=translated, nb_max=nb_max)
    save_text_documents(docs=docs, output_file=output_file, encoding=encoding)