"""
Model class that represents a detection standard. A detection standard works like an allow/block list
to filter out inputs from the topic list that are not proper acronyms (e.g. `BEFUND`, `III`). Such
inputs are then not considered for evaluation purposes.
It is designed as an append-only list (i.e., entries do not need to be updated with variable
inputs).
"""
import logging
from typing import Dict, Set, List
from acres.model import topic_list
from acres.util import acronym as acro_util
from acres.util.acronym import Acronym
logger = logging.getLogger(__name__)
def parse(filename: str) -> Dict[str, bool]:
    """
    Parses a .tsv-formatted detection standard into a dictionary.

    Every row holds the acronym in the first column and a validity flag in the second column.
    Only the literal `TRUE` (after stripping surrounding whitespace) marks an entry as valid;
    any other value is read as invalid. Rows consisting solely of whitespace (e.g. a trailing
    blank line) are skipped. If an acronym occurs more than once, the last row wins.

    :param filename: Path to the UTF-8 encoded .tsv file.
    :return: A dictionary mapping each acronym to its validity flag.
    :raises IndexError: If a non-blank row has no second column.
    """
    detection_standard = {}  # type: Dict[str, bool]

    # The context manager guarantees the file is closed even if a malformed row raises.
    with open(filename, "r", encoding="utf-8") as handle:
        for row in handle:
            fields = row.split("\t")

            # Blank rows carry no acronym at all; skip them instead of failing on fields[1].
            if len(fields) == 1 and not fields[0].strip():
                continue

            acronym = fields[0].strip()
            valid = fields[1].strip() == 'TRUE'
            # Remaining fields are ignored, because they map into `valid`.

            detection_standard[acronym] = valid

    return detection_standard
def filter_valid(standard: Dict[str, bool]) -> Set[str]:
    """
    Collect the usable acronyms of a standard.

    An entry is kept only if it is flagged as valid, passes `acro_util.is_acronym` and has not
    been collected before. Rejected entries that were flagged as valid are reported on the
    debug log.

    :param standard: Mapping of acronym to validity flag.
    :return: The set of acronyms that survived all checks.
    """
    accepted = set()  # type: Set[str]

    for candidate, is_valid in standard.items():
        if is_valid:
            # The acronym test is mandatory even for entries flagged as valid: some long and
            # invalid acronyms (e.g. "ACE-Hemmerunverträglichkeit") lead to performance issues.
            if not acro_util.is_acronym(candidate):
                logger.debug("{%s} does not pass acronym tests.", candidate)
            elif candidate in accepted:
                logger.debug("{%s} is repeated at least once.", candidate)
            else:
                accepted.add(candidate)

    return accepted
def parse_valid(filename: str) -> Set[str]:
    """
    Convenience wrapper that chains `parse` and `filter_valid`.

    :param filename: Path handed over to `parse`.
    :return: Whatever `filter_valid` returns for the parsed standard.
    """
    parsed_standard = parse(filename)
    return filter_valid(parsed_standard)
def update(previous: Dict[str, bool], acronyms: List[Acronym]) -> Dict[str, bool]:
    """
    Extend a previous detection standard with the acronyms of a topic list.

    Acronyms that are not yet part of the standard are added and flagged as valid; entries
    that already exist keep their current flag. Note that `previous` is modified in place and
    the very same dictionary object is returned.

    :param previous: The detection standard to extend.
    :param acronyms: Objects whose `acronym` attribute holds the acronym string.
    :return: The extended detection standard (same object as `previous`).
    """
    for entry in acronyms:
        # `setdefault` only inserts missing keys, so existing flags are never overwritten.
        previous.setdefault(entry.acronym, True)
    return previous
def write(filename: str, standard: Dict[str, bool]) -> None:
    """
    Write a detection standard into a file.

    The file is created or truncated and receives one row per entry, made of the acronym, a
    tab, the upper-cased validity flag (`TRUE` or `FALSE`) and a newline.

    :param filename: Path of the UTF-8 encoded .tsv file to write.
    :param standard: Mapping of acronym to validity flag.
    :return:
    """
    # The context manager flushes and closes the file even if a write fails midway.
    with open(filename, "w", encoding="utf-8") as handle:
        for acronym, valid in standard.items():
            handle.write(acronym)
            handle.write("\t")
            handle.write(str(valid).upper())
            handle.write("\n")
if __name__ == "__main__":
    # Refresh the stored detection standard with the acronyms of the current topic list.
    detection_standard = "resources/detection_standard.tsv"
    current_standard = parse(detection_standard)
    topic_acronyms = topic_list.parse("resources/topic_list.tsv")
    write(detection_standard, update(current_standard, topic_acronyms))