Source code for acres.stats.senses

"""
Module to estimate acronym ambiguity. It can be used to collect common acronym statistics, such as
senses/acronym.
"""
from typing import Dict, Set

from acres.model import expansion_standard


def bucketize(acronyms: Dict[str, Set[str]]) -> Dict[int, int]:
    """
    Reduce: count how many acronyms share each degree of ambiguity.

    :param acronyms: Mapping from an acronym to the collection of its senses.
    :return: Mapping from a number of senses to the number of acronyms that
        have exactly that many senses.
    """
    histogram = {}  # type: Dict[int, int]
    for sense_set in acronyms.values():
        degree = len(sense_set)
        histogram[degree] = histogram.get(degree, 0) + 1
    return histogram
def map_senses_acronym(standard: Dict[str, Dict[str, int]],
                       lenient: bool = False) -> Dict[str, Set[str]]:
    """
    Map: gather the accepted senses of every acronym.

    :param standard: Mapping from an acronym to its ``{expansion: relevance}`` entries.
    :param lenient: Whether to consider partial matches (1) as a valid sense.
    :return: Mapping from each acronym to the set of its accepted expansions;
        an acronym without any accepted expansion maps to an empty set.
    """
    # Relevance 2 is always accepted; relevance 1 only counts in lenient mode.
    accepted = (2, 1) if lenient else (2,)
    return {
        acronym: {
            expansion
            for expansion, relevance in expansions.items()
            if relevance in accepted
        }
        for acronym, expansions in standard.items()
    }
def get_sense_buckets(filename: str) -> Dict[str, Set[str]]:
    """
    Parse a reference standard and return its senses per acronym.

    :param filename: Forwarded unchanged to ``expansion_standard.parse``.
    :return: The result of ``map_senses_acronym`` on the parsed standard, using
        its default (non-lenient) mode.
    """
    return map_senses_acronym(expansion_standard.parse(filename))
# Script entry point: passes the reference-standard path to each print_* helper in turn.
# NOTE(review): print_senses, print_ambiguous and print_undefined are not defined in the
# visible part of this module -- confirm they exist, otherwise this block raises NameError.
if __name__ == "__main__":
    WORKBENCH = "resources/expansion_standard.tsv"  # relative path, resolved against the CWD
    print_senses(WORKBENCH)
    print_ambiguous(WORKBENCH)
    print_undefined(WORKBENCH)