Source code for medkit.text.postprocessing.overlapping
__all__ = ["filter_overlapping_entities"]
from typing import List
from medkit.core.text import Entity, span_utils
def filter_overlapping_entities(entities: List[Entity]) -> List[Entity]:
    """Filter a list of entities and remove overlaps.

    This may be useful when creating data for named entity recognition,
    where a part of text can only contain one entity per 'word'.

    When an overlap is detected, the longest entity is preferred. The
    extent of an entity runs from the start of its first normalized span
    to the end of its last normalized span, so the gaps of a discontinuous
    entity count as covered. Entities that merely touch (one ends exactly
    where the next one starts) do not overlap and are both kept.

    Parameters
    ----------
    entities:
        Entities to filter

    Returns
    -------
    List[Entity]
        Filtered entities, ordered by the start of their normalized spans
    """
    # Keep each entity next to the extent of its normalized spans so the
    # association survives sorting.
    extents = []
    for ent in entities:
        normalized = span_utils.normalize_spans(ent.spans)
        extents.append((normalized[0].start, normalized[-1].end, ent))

    # Sort by length then start, descending: the longest is preferred.
    extents.sort(key=lambda item: (item[1] - item[0], item[0]), reverse=True)

    seen_chars = set()
    kept = []
    for start, end, ent in extents:
        # `end` is exclusive, so the last covered character is `end - 1`.
        # Probing `end` itself would reject an entity that only touches the
        # next one. `max` keeps a zero-length extent probing its own position.
        last = max(start, end - 1)
        # Candidates arrive by non-increasing length, so a candidate can never
        # strictly contain an already kept entity: testing its two end
        # characters is enough to detect any overlap.
        if start in seen_chars or last in seen_chars:
            continue
        seen_chars.update(range(start, end))
        kept.append((start, ent))

    # Order by normalized start, i.e. the same coordinates used for the
    # overlap test. The raw `ent.spans[0]` need not be the earliest span.
    # NOTE(review): raw spans may also be of a type without `.start`
    # (presumably medkit's ModifiedSpan) - confirm.
    kept.sort(key=lambda item: item[0])
    return [ent for _, ent in kept]