# Source code for acres.fastngram.fastngram

"""
A faster version of n-gram matching that uses dictionaries for speed-up.
"""

import logging
import sys
from collections import OrderedDict
from typing import Dict, Set, Tuple, Iterator, List, Union

from acres.preprocess import resource_factory
from acres.util import functions
from acres.util.functions import import_conf
from acres.util.acronym import Acronym

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maximum difference in size between left and right context.
MAX_DIFF = 1

# Number of partitions: the value returned by `import_conf("FastNgramPartitions")`, converted to
# an integer. It is handed to `functions.partition` whenever a model has to be selected.
PARTITIONS = int(import_conf("FastNgramPartitions"))


class ContextMap:
    """
    A map of contexts to center words.

    Every ``(left_context, right_context)`` pair points to an ordered mapping from a frequency to
    the set of center n-grams stored under that frequency. Frequencies keep the order in which
    they were first added.
    """

    def __init__(self) -> None:
        self.map = {}  # type: Dict[Tuple[str, str], OrderedDict[int, Set[str]]]

    def add(self, center: str, left_context: str, right_context: str, freq: int) -> None:
        """
        Add a center n-gram with a context.

        :param center: The n-gram to be stored for the context.
        :param left_context: The left part of the context key.
        :param right_context: The right part of the context key.
        :param freq: The frequency under which the center is stored.
        :return:
        """
        # One chained lookup creates missing levels on the way down.
        by_frequency = self.map.setdefault((left_context, right_context), OrderedDict())
        by_frequency.setdefault(freq, set()).add(center)

    def centers(self, left_context: str, right_context: str) -> 'OrderedDict[int, Set[str]]':
        """
        Find center n-grams that happen on a given context.

        :param left_context: The left part of the context key.
        :param right_context: The right part of the context key.
        :return: The stored frequency-to-centers mapping, or a new empty mapping when the context
            is unknown. An unknown context is not inserted into the map.
        """
        found = self.map.get((left_context, right_context))
        return OrderedDict() if found is None else found


class CenterMap:
    """
    A map of center words to contexts.

    Every center n-gram points to an ordered mapping from a frequency to the set of
    ``(left_context, right_context)`` pairs stored under that frequency. Frequencies keep the
    order in which they were first added.
    """

    def __init__(self) -> None:
        self.map = {}  # type: Dict[str, OrderedDict[int, Set[Tuple[str, str]]]]

    def add(self, center: str, left_context: str, right_context: str, freq: int) -> None:
        """
        Add a center n-gram with a context.

        :param center: The n-gram used as the key.
        :param left_context: The left part of the context to be stored.
        :param right_context: The right part of the context to be stored.
        :param freq: The frequency under which the context is stored.
        :return:
        """
        # One chained lookup creates missing levels on the way down.
        by_frequency = self.map.setdefault(center, OrderedDict())
        by_frequency.setdefault(freq, set()).add((left_context, right_context))

    def contexts(self, center: str) -> 'OrderedDict[int, Set[Tuple[str, str]]]':
        """
        Find contexts for a given center word.

        :param center: The n-gram to look up.
        :return: The stored frequency-to-contexts mapping, or a new empty mapping when the center
            is unknown. An unknown center is not inserted into the map.
        """
        found = self.map.get(center)
        return OrderedDict() if found is None else found


def baseline(acronym: str, left_context: str = "", right_context: str = "") -> Iterator[str]:
    """
    A baseline method that expands an acronym without using any context.

    The call is forwarded to `fastngram` with empty left and right contexts, so the context
    parameters only exist to keep the signature parallel to the other expansion functions.

    :param acronym: The acronym to be expanded.
    :param left_context: Not used.
    :param right_context: Not used.
    :return: The iterator produced by `fastngram` for the acronym with empty contexts.
    """
    return fastngram(acronym, "", "")


def fastngram(acronym: str, left_context: str = "", right_context: str = "",
              min_freq: int = 2, max_rank: int = 100000) -> Iterator[str]:
    """
    Find expansion candidates for an acronym given its left and right context.
    Note that no filtering is done here, except from the acronym initial partitioning.

    The acronym is wrapped together with its contexts, a list of context variants is derived
    from it with `_generate_acronym_contexts`, and the candidates found for those variants are
    produced lazily by `_center_provider`.

    :param acronym: The acronym to be expanded.
    :param left_context: The text to the left of the acronym.
    :param right_context: The text to the right of the acronym.
    :param min_freq: Frequency cut-off, passed on to `_center_provider`.
    :param max_rank: Rank cut-off, passed on to `_center_provider`.
    :return: A generator over candidate n-grams.
    """
    contextualized_acronym = Acronym(acronym=acronym, left_context=left_context,
                                     right_context=right_context)
    contexts = _generate_acronym_contexts(contextualized_acronym)

    yield from _center_provider(contexts, min_freq, max_rank)


def fasttype(acronym: str, left_context: str = "", right_context: str = "",
             min_freq: int = 2, max_rank: int = 100000) -> Iterator[str]:
    """
    Find expansion candidates given the training contexts of the acronym.
    Note that no filtering is done here, except from the acronym initial partitioning.

    The contexts are not taken from the caller but looked up with `_find_contexts`; the
    candidates found for those contexts are produced lazily by `_center_provider`.

    :param acronym: The acronym to be expanded.
    :param left_context: Not used.
    :param right_context: Not used.
    :param min_freq: Frequency cut-off, used both for the context lookup and for the candidates.
    :param max_rank: Rank cut-off, passed on to `_center_provider`.
    :return: A generator over candidate n-grams.
    """
    contexts = _find_contexts(acronym, min_freq)

    yield from _center_provider(contexts, min_freq, max_rank)


def _find_contexts(acronym: str, min_freq: int) -> 'List[Acronym]':
    """
    Find contexts in the training data where this acronym appears.

    The center map of the acronym's partition is queried and every stored context is turned into
    a contextualized acronym, except for the empty context and for contexts stored under a
    frequency below `min_freq`.

    :param acronym: The acronym to look up as a center n-gram.
    :param min_freq: Contexts stored under a lower frequency are ignored.
    :return: One contextualized acronym per accepted context.
    """
    model = resource_factory.get_center_map(functions.partition(acronym, PARTITIONS))

    all_contexts = []  # type: List[Acronym]
    for out_freq, contexts in model.contexts(acronym).items():
        # The cut-off only depends on the bucket, so it is tested once per bucket instead of once
        # per context. `continue` keeps the result independent of the order of the buckets.
        if out_freq < min_freq:
            continue
        for left, right in contexts:
            # Do not allow empty contexts.
            if left == '' and right == '':
                continue
            all_contexts.append(Acronym(acronym=acronym, left_context=left,
                                        right_context=right))

    return all_contexts


def _center_provider(contexts: 'List[Acronym]', min_freq: int,
                     max_rank: int) -> Iterator[str]:
    """
    Provide center words for a given list of contexts.

    Contexts are processed in the given order. For each of them the context map of the acronym's
    partition is queried and the stored center n-grams are produced, each n-gram at most once
    across all contexts.

    :param contexts: The contextualized acronyms to look up.
    :param min_freq: Reading a context stops at the first frequency below this value.
    :param max_rank: Generation ends as soon as more than `max_rank` n-grams were produced, so at
        most `max_rank + 1` n-grams are yielded.
    :return: A generator over distinct center n-grams.
    """
    # Save previous expansions to avoid the same n-gram being retrieved from different contexts.
    previous_ngrams = set()  # type: Set[str]

    rank = 0
    for contextualized_acronym in contexts:
        partition = functions.partition(contextualized_acronym.acronym, PARTITIONS)
        model = resource_factory.get_context_map(partition)

        count_map = model.centers(contextualized_acronym.left_context,
                                  contextualized_acronym.right_context)
        for freq, center_ngrams in count_map.items():
            # Frequencies are expected in decreasing order (`create_map` sorts them that way), so
            # the remaining buckets of this context cannot qualify either.
            if freq < min_freq:
                break
            for ngram in center_ngrams:
                if rank > max_rank:
                    # A plain return ends the generator; there is no value to hand back.
                    return
                if ngram in previous_ngrams:
                    continue
                previous_ngrams.add(ngram)
                rank += 1
                yield ngram


def create_map(ngrams: Dict[str, int], model: Union[ContextMap, CenterMap],
               partition: int = 0) -> Union[ContextMap, CenterMap]:
    """
    Create a search-optimized representation of an ngram-list.

    Every n-gram is split into its (center, left context, right context) variants. A variant is
    added to the model only when its center falls into the requested partition. N-grams are
    processed by decreasing frequency, so that the frequencies inside the model are inserted in
    decreasing order as well.

    :param ngrams: A mapping of n-grams to their frequency.
    :param model: The map to be filled; it is modified in place.
    :param partition: The partition whose centers should be kept.
    :return: The same `model` object that was passed in.
    """
    logger.info("Creating model for fastngram with partition = %d...", partition)

    # Ensure ngrams are ordered by decreasing frequency.
    sorted_ngrams = sorted(ngrams.items(), key=lambda item: item[1], reverse=True)

    for ngram, freq in sorted_ngrams:
        for context in _generate_ngram_contexts(ngram):
            # Centers that belong to another partition are left to that partition's model.
            if functions.partition(context.acronym, PARTITIONS) != partition:
                continue
            model.add(context.acronym, context.left_context, context.right_context, freq)

    logger.info("Fastngram model created.")
    return model


def _generate_ngram_contexts(ngram: str) -> 'List[Acronym]':
    """
    Split an n-gram into (left context, center, right context) variants.

    The n-gram is tokenized on single spaces. The left context grows token by token while the
    center shrinks; for each left context, the right context may be up to `MAX_DIFF` tokens
    shorter or longer than the left one. Variants with an empty center are never generated.
    For a given left context, variants with a longer center come first.

    :param ngram: The n-gram to be split.
    :return: The list of variants, with all three strings interned.
    """
    words = ngram.split(" ")
    total = len(words)

    variants = []  # type: List[Acronym]
    # Walk only until half and `MAX_DIFF` more.
    for start in range((total + 1 + MAX_DIFF) // 2):
        # End of the center if both contexts had exactly the same size.
        balanced_end = total - start
        # Allow up to `MAX_DIFF` difference in size, longest center first.
        for end in range(balanced_end + MAX_DIFF, balanced_end - MAX_DIFF - 1, -1):
            # Do not allow an empty center; smaller ends would be empty as well.
            if end <= start:
                break
            # Do not walk past the n-gram.
            if end > total:
                continue
            left_part = sys.intern(" ".join(words[:start]))
            right_part = sys.intern(" ".join(words[end:]))
            center_part = sys.intern(" ".join(words[start:end]))
            variants.append(Acronym(acronym=center_part, left_context=left_part,
                                    right_context=right_part))
    return variants


def _generate_acronym_contexts(contextualized_acronym: 'Acronym') -> 'List[Acronym]':
    """
    Generate a list of contextualized acronyms with decreasing lateral context.

    Both contexts are tokenized on whitespace. The left context is shortened from its left end
    and the right context from its right end, so the tokens closest to the acronym are kept
    longest. The list ends with the variant in which both contexts are empty.

    Right context is deemed more important than left context, e.g. EF 00%, HF 000/min,
    so we generate first longer right n-grams, e.g. (left_bigram, right_trigram).

    @todo default parameter min_length = 0, so that we avoid empty contexts if we want.

    :param contextualized_acronym: The acronym with its full left and right context.
    :return: Variants of the same acronym, each with a reduced left and right context.
    """
    left_tokens = contextualized_acronym.left_context.split()
    right_tokens = contextualized_acronym.right_context.split()
    left_total = len(left_tokens)
    right_total = len(right_tokens)

    # We allow up to MAX_DIFF difference in context size iff the right context is larger than left.
    longest_right = min(left_total, right_total)
    if right_total > left_total:
        longest_right += min(MAX_DIFF, right_total - left_total)

    variants = []  # type: List[Acronym]
    for right_size in range(longest_right, -1, -1):
        # Guard against asking for more right tokens than there are.
        if right_size > right_total:
            continue
        right_text = " ".join(right_tokens[:right_size])
        # Left contexts from `MAX_DIFF` tokens longer to `MAX_DIFF` tokens shorter than the right.
        first_start = left_total - right_size - MAX_DIFF
        for left_start in range(first_start, first_start + 2 * MAX_DIFF + 1):
            # Prevents double empty context on last iteration
            if left_start > left_total:
                break
            # The left context is not long enough for this variant.
            if left_start < 0:
                continue
            left_text = " ".join(left_tokens[left_start:])
            variants.append(Acronym(acronym=contextualized_acronym.acronym,
                                    left_context=left_text,
                                    right_context=right_text))
    return variants