# Source code for acres.stats.stats

"""
Module for calculating corpus statistics. It is used to measure the training/test dataset according
to, e.g., number of tokens.
"""
from typing import List

from acres.util import acronym
from acres.util import functions


class Stats:
    """
    Class that generates and holds stats about a given text.

    All counters start at zero and are filled in by `calc_stats()`. Instances can be combined
    with `+`, `+=` and the built-in `sum()`; combining adds the counters field by field.
    """

    source_line_separator = "\n"

    # Names of the counter attributes that are added up when two instances are combined.
    _counters = ("chars", "types", "tokens", "acronym_types", "acronyms", "sentences")

    def __init__(self) -> None:
        self.chars = 0
        self.types = 0
        self.tokens = 0
        self.acronym_types = 0
        self.acronyms = 0
        self.sentences = 0

    def calc_stats(self, text: str) -> None:
        """
        Calculates statistics for a given text string and sets the results as variables.

        Any previously stored counters are overwritten.

        :param text: The text to be measured.
        :return: None
        """
        self.chars = Stats.count_chars(text)
        self.types = Stats.count_types(text)
        self.tokens = Stats.count_tokens(text)
        self.acronym_types = Stats.count_acronyms_types(text)
        self.acronyms = Stats.count_acronyms(text)
        self.sentences = Stats.count_sentences(text)

    @staticmethod
    def count_chars(text: str) -> int:
        """
        Count the number of non-whitespace chars in a string.

        :param text: The text to be measured.
        :return: The total length of all whitespace-separated tokens.
        """
        return sum(len(token) for token in text.split())

    @staticmethod
    def count_types(text: str) -> int:
        """
        Count the number of unique tokens (types) in a string.

        Tokens are obtained by splitting on whitespace and are compared case-sensitively.

        :param text: The text to be measured.
        :return: The number of distinct tokens.
        """
        return len(set(text.split()))

    @staticmethod
    def count_tokens(text: str) -> int:
        """
        Count the number of all tokens in a string.

        :param text: The text to be measured.
        :return: The number of whitespace-separated tokens.
        """
        return len(text.split())

    @staticmethod
    def count_acronyms(text: str) -> int:
        """
        Count the number of acronyms in a string.

        Acronyms are as defined by the `acronym.is_acronym()` function.

        :param text: The text to be measured.
        :return: The number of tokens that are acronyms, repetitions included.
        """
        return len(Stats._get_acronyms(text))

    @staticmethod
    def count_acronyms_types(text: str) -> int:
        """
        Count the number of unique acronyms in a string.

        Acronyms are as defined by the `acronym.is_acronym()` function.

        :param text: The text to be measured.
        :return: The number of distinct tokens that are acronyms.
        """
        return len(set(Stats._get_acronyms(text)))

    @staticmethod
    def count_sentences(text: str) -> int:
        """
        Count the number of sentences in a string.

        Sentences are any string separated by `source_line_separator`. This is a raw count: an
        empty text still counts as one sentence and a trailing separator adds an (empty) one.

        :param text: The text to be measured.
        :return: The number of pieces obtained by splitting on the separator.
        """
        return len(text.split(Stats.source_line_separator))

    @staticmethod
    def _get_acronyms(text: str) -> List[str]:
        """
        Collect every whitespace-separated token accepted by `acronym.is_acronym()`.

        :param text: The text to be searched.
        :return: The acronyms in order of appearance, repetitions included.
        """
        return [token for token in text.split() if acronym.is_acronym(token)]

    def __str__(self) -> str:
        rows = [
            ("Chars", self.chars),
            ("Types", self.types),
            ("Tokens", self.tokens),
            ("Acronym Types", self.acronym_types),
            ("Acronyms", self.acronyms),
            ("Sentences (raw)", self.sentences),
        ]
        return ''.join("{}: {}\n".format(label, value) for label, value in rows)

    def __add__(self, other: 'Stats') -> 'Stats':
        """
        Return a new object whose counters are the field-wise sums of both operands.

        Neither operand is modified. Note that the type counters are simply added up, so the
        result is the sum of the per-text unique counts.

        :param other: The statistics to be added.
        :return: A new `Stats` instance holding the sums.
        """
        if not isinstance(other, Stats):
            return NotImplemented
        total = Stats()
        for name in Stats._counters:
            setattr(total, name, getattr(self, name) + getattr(other, name))
        return total

    def __iadd__(self, other: 'Stats') -> 'Stats':
        """
        Add the counters of `other` to this object in place (``stats += other``).

        :param other: The statistics to be added.
        :return: This very object, after the update.
        """
        if not isinstance(other, Stats):
            return NotImplemented
        for name in Stats._counters:
            setattr(self, name, getattr(self, name) + getattr(other, name))
        return self

    def __radd__(self, other: object) -> 'Stats':
        """
        Reflected addition, which makes `sum()` work on an iterable of `Stats` objects.

        :param other: The left operand; either another `Stats` or the integer `0`.
        :return: A new `Stats` instance holding the sums.
        """
        if isinstance(other, Stats):
            return self.__add__(other)
        # `sum()` starts from the integer 0: treat it as the neutral element and hand back a
        # copy so that the following additions never touch the original objects.
        if isinstance(other, int) and other == 0:
            return Stats().__add__(self)
        return NotImplemented


def get_stats(corpus_path: str) -> List[Stats]:
    """
    Generates all statistics from a given corpus directory.

    :param corpus_path: Path of the directory holding the corpus files.
    :return: A list of statistics objects, one for each file found in the corpus dir, plus an
        extra one for the full corpus.
    """
    def _measure(document: str) -> Stats:
        # Build a fresh statistics object for a single text.
        measured = Stats()
        measured.calc_stats(document)
        return measured

    documents = functions.robust_text_import_from_dir(corpus_path)

    # The concatenation of all documents goes last, so that the final entry holds the global
    # statistics of the corpus.
    documents.append(Stats.source_line_separator.join(documents))

    return [_measure(document) for document in documents]


def print_stats() -> None:
    """
    Generates and print statistics from the default corpus set in config.

    Nothing is printed when the `CORPUS_PATH` setting is empty or missing.

    :return: None
    """
    corpus_path = functions.import_conf("CORPUS_PATH")
    if corpus_path:
        all_stats = get_stats(corpus_path)
        for doc in all_stats:
            print(doc)
        # The last entry describes the full corpus and is therefore not counted as a doc.
        print("Total docs:", len(all_stats) - 1)
    return None


# When executed as a script, print the statistics of the corpus set in the configuration.
if __name__ == "__main__":
    print_stats()