# Source code for medkit.audio.transcription.doc_transcriber

from __future__ import annotations

# Names exported by `from ... import *`: the document-level transcriber and
# the protocol that transcription backends must satisfy.
__all__ = ["DocTranscriber", "TranscriptionOperation"]

from typing import List, Optional
from typing_extensions import Protocol

from medkit.audio.transcription.transcribed_text_document import TranscribedTextDocument
from medkit.core import Operation
from medkit.core.audio import AudioDocument, Segment as AudioSegment
from medkit.core.text import Segment as TextSegment, Span as TextSpan


class TranscriptionOperation(Protocol):
    """
    Protocol for operations in charge of the actual speech-to-text
    transcription to use with :class:`~.DocTranscriber`.

    Implementations work in place: :class:`~.DocTranscriber` calls
    :meth:`run` on a list of audio segments, ignores the return value, and
    then reads back from each segment the attribute whose label is
    :attr:`output_label`.
    """

    output_label: str
    """
    Label to use for generated transcription attributes
    """

    def run(self, segments: List[AudioSegment]):
        """
        Add a transcription attribute to each segment with a text value
        containing the transcribed text.

        Parameters
        ----------
        segments:
            List of segments to transcribe
        """
class DocTranscriber(Operation):
    """Speech-to-text transcriber generating text documents from audio documents.

    For each audio document, all audio segments with a specific label are
    converted into text segments and regrouped in a corresponding new text
    document. The text of each segment is concatenated to form the full raw
    text of the new document.

    Generated text documents are instances of
    :class:`~medkit.audio.transcription.transcribed_text_document.TranscribedTextDocument`
    (subclass of :class:`~medkit.core.text.document.TextDocument`) with
    additional info such as the identifier of the original audio document and
    a mapping between audio spans and text spans.

    Method :func:`augment_full_text_for_next_segment` can be overridden to
    customize how the text of the segments is concatenated to form the full
    text.

    The actual transcription task is delegated to a
    :class:`~.TranscriptionOperation` that must be provided, for instance
    :class:`~medkit.audio.transcription.hf_transcriber.HFTranscriber` or
    :class:`~medkit.audio.transcription.sb_transcriber.SBTranscriber`.
    """

    def __init__(
        self,
        input_label: str,
        output_label: str,
        transcription_operation: TranscriptionOperation,
        attrs_to_copy: Optional[List[str]] = None,
        uid: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        input_label:
            Label of audio segments that should be transcribed.
        output_label:
            Label of generated text segments.
        transcription_operation:
            Transcription operation in charge of actually transcribing each
            audio segment.
        attrs_to_copy:
            Labels of attributes that should be copied from the original
            audio segments to the transcribed text segments.
        uid:
            Identifier of the transcriber.
        """
        # Pass all arguments to super (remove self).
        # NOTE: locals() must be captured before any other local variable is
        # bound, so that only the constructor arguments are forwarded.
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        # Normalize after forwarding, so the parent receives the raw argument
        # (None) exactly as the caller passed it.
        if attrs_to_copy is None:
            attrs_to_copy = []

        self.input_label = input_label
        self.output_label = output_label
        self.transcription_operation = transcription_operation
        self.attrs_to_copy = attrs_to_copy

        # label of transcription attributes attached to audio segments
        self._attr_label = self.transcription_operation.output_label

    def run(self, audio_docs: List[AudioDocument]) -> List[TranscribedTextDocument]:
        """Return a transcribed text document for each document in `audio_docs`

        Parameters
        ----------
        audio_docs:
            Audio documents to transcribe

        Returns
        -------
        List[TranscribedTextDocument]:
            Transcribed text documents (one per document in `audio_docs`,
            in the same order)
        """
        return [self._transcribe_doc(d) for d in audio_docs]

    def _transcribe_doc(self, audio_doc: AudioDocument) -> TranscribedTextDocument:
        """Build the transcribed text document corresponding to `audio_doc`.

        The audio segments labelled `self.input_label` are handed to the
        transcription operation, then one text segment is created per audio
        segment and the segment texts are joined into the document text.
        """
        # get all audio segments with specified label
        audio_segs = audio_doc.anns.get(label=self.input_label)
        # transcribe them to text (attributes are attached in place)
        self.transcription_operation.run(audio_segs)

        # rebuild full text and segments from transcribed texts
        full_text = ""
        text_segs = []
        text_spans_to_audio_spans = {}

        for audio_seg in audio_segs:
            # retrieve transcription attribute; only the first attribute with
            # the expected label is used, and an IndexError is raised if the
            # transcription operation did not attach any
            transcription_attr = audio_seg.attrs.get(label=self._attr_label)[0]
            text = transcription_attr.value

            # handle joining between segments (the separator must be added
            # before the span start is computed, so it stays outside the span)
            full_text = self.augment_full_text_for_next_segment(
                full_text, text, audio_seg
            )

            # compute text span
            start = len(full_text)
            full_text += text
            end = len(full_text)
            span = TextSpan(start, end)

            # create TextSegment with proper span referencing full text
            text_seg = TextSegment(label=self.output_label, spans=[span], text=text)

            # copy attrs from audio segment
            for label in self.attrs_to_copy:
                for attr in audio_seg.attrs.get(label=label):
                    copied_attr = attr.copy()
                    text_seg.attrs.add(copied_attr)
                    # handle provenance (copy derived from original attribute)
                    if self._prov_tracer is not None:
                        self._prov_tracer.add_prov(
                            copied_attr, self.description, [attr]
                        )

            text_segs.append(text_seg)

            # store mapping between text and audio span
            text_spans_to_audio_spans[span] = audio_seg.span

            # handle provenance (text segment generated from transcription attribute)
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    text_seg, self.description, [transcription_attr]
                )

        text_doc = TranscribedTextDocument(
            text=full_text,
            audio_doc_id=audio_doc.uid,
            text_spans_to_audio_spans=text_spans_to_audio_spans,
        )
        for text_seg in text_segs:
            text_doc.anns.add(text_seg)

        # TODO should this be handled by provenance?
        # if self._prov_tracer is not None:
        #     self._prov_tracer.add_prov(
        #         text_doc, self, source_data_items=[audio_doc]
        #     )

        return text_doc

    def augment_full_text_for_next_segment(
        self, full_text: str, segment_text: str, audio_segment: AudioSegment
    ) -> str:
        """Append intermediate joining text to full text before the next
        segment is concatenated to it. Override for custom behavior.

        The default implementation separates consecutive segments with a
        newline and adds nothing before the first one. `segment_text` and
        `audio_segment` are not used by the default implementation; they are
        available to overriding implementations.

        Parameters
        ----------
        full_text:
            Text accumulated so far from the previous segments.
        segment_text:
            Transcribed text of the segment about to be appended.
        audio_segment:
            Audio segment from which `segment_text` was transcribed.

        Returns
        -------
        str:
            `full_text`, followed by the joining text if any.
        """
        if full_text:
            full_text += "\n"
        return full_text