Source code for acres.model.ngrams

"""
Module to handle n-gram lists.
"""
import logging
from typing import Generator, List, Iterator

from acres.preprocess import resource_factory
from acres.util import acronym

logger = logging.getLogger(__name__)


[docs]class FilteredNGramStat: """Filtered NGramStat generator This generator generates ngrams of a given size out of a ngramstat.txt file, while respecting each ngram frequency. @todo ngramstat itself should be a generator """ TOKEN_SEPARATOR = " " PRINT_INTERVAL = 1000000 def __init__(self, ngram_size: int) -> None: """ :param ngram_size: The exact size of ngrams to be considered. """ self.ngram_size = ngram_size self.ngramstat = resource_factory.get_ngramstat() def __iter__(self) -> Generator[List[str], None, None]: logger.debug("Iterating...") for identifier, freq_ngram in self.ngramstat.items(): (freq, ngram) = freq_ngram tokens = ngram.split(self.TOKEN_SEPARATOR) length_tokens = len(tokens) # Only consider ngrams of a given size, so that we work with a non-overlapping list # If window > len(tokens), word2vec still works if length_tokens == self.ngram_size: if identifier % self.PRINT_INTERVAL == 0: logger.debug("%s: %s -> %s", identifier, freq, ngram) # Repeat ngram freq times for _ in range(int(freq)): yield tokens
[docs]def filter_acronym_contexts(ngrams: Iterator[List[str]]) -> 'Iterator[acronym.Acronym]': """ Filter an iterable of tokens by the ones containing an acronym in the middle and convert them \ to Acronym tuples. :param ngrams: :return: """ for tokens in ngrams: ngram_size = len(tokens) middle = int(len(tokens) / 2) if acronym.is_acronym(tokens[middle]): yield acronym.Acronym(acronym=tokens[middle], left_context=' '.join(tokens[0:middle]), right_context=' '.join(tokens[middle + 1:ngram_size]))