Source code for acres.word2vec.test

"""
Module to apply/`test` a given word2vec model.
"""
import logging
from typing import Iterator, Tuple

from gensim.models import Word2Vec

from acres.preprocess import resource_factory

logger = logging.getLogger(__name__)


def find_candidates(acronym: str, left_context: str = "", right_context: str = "",
                    min_distance: float = 0.0, max_rank: int = 500) -> Iterator[str]:
    """
    Similar to robust_find_embeddings, this finds possible expansions of a given acronym.

    This is a generator: expansions are produced lazily, in the order in which
    `_most_similar` reports them, until one of the stop conditions below is hit.

    :param acronym: Token to look up in the vocabulary of the word2vec model.
    :param left_context: Currently unused (see the TODO about context below).
    :param right_context: Currently unused (see the TODO about context below).
    :param min_distance: Iteration stops at the first neighbour whose score is below this value.
    :param max_rank: Highest zero-based rank that is still yielded, i.e. at most
        ``max_rank + 1`` expansions are produced.
    :return: An iterator over expansions, with "_" replaced by " ". It is empty when the
        acronym is not part of the model vocabulary.
    """
    model = resource_factory.get_nn_model()

    # gensim >= 4.0 exposes the vocabulary as `key_to_index`; the older `vocab` attribute
    # raises AttributeError there, so it is only used as a fallback for gensim 3.x.
    keyed_vectors = model.wv
    vocabulary = getattr(keyed_vectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    # Check for out of vocabulary acronyms
    # TODO fallback to something, maybe clean the acronym?
    if acronym not in vocabulary:
        logger.warning("'%s' not found in the vocabulary!", acronym)
        # A bare return ends the generator; any returned value would be discarded anyway.
        return

    # TODO evaluate use of context
    # [('Kardiomyopathie', 0.772693395614624), ...]
    for rank, (expansion, distance) in enumerate(_most_similar(model, acronym)):
        if distance < min_distance or rank > max_rank:
            return

        # When using Phrases, common collocations (e.g. "koronaren_Herzerkrankung") are shown
        # with '_' as a delimiter.
        yield expansion.replace("_", " ")
def _most_similar(model: "Word2Vec", positive: str) -> Iterator[Tuple[str, float]]:
    """
    A generator version of gensim's `most_similar` method.

    The model is queried in growing batches: first with ``topn=100``, then with ``topn``
    multiplied by 10 on every further round. Each round re-runs the whole query and only
    the entries that were not yielded before are passed on, so consumers that stop early
    never pay for the larger queries.

    :param model: Model whose ``wv.most_similar`` is queried.
    :param positive: Key for which similar entries are requested.
    :return: An iterator over the ``(key, score)`` tuples reported by ``most_similar``.
    """
    ratio = 10
    start = 0
    stop = 100
    while True:
        expansions = model.wv.most_similar(positive=positive, topn=stop)

        # Only hand out the tail that earlier (smaller) queries did not cover yet.
        yield from expansions[start:stop]

        # Fewer results than requested means there is nothing left to fetch.
        if len(expansions) < stop:
            return

        start = stop
        stop *= ratio


if __name__ == "__main__":
    # find_candidates is a generator; materialise it so the candidates themselves are
    # printed instead of the generator object.
    print(list(find_candidates("CMP")))