Source code for acres.stats.stats

"""
Module for calculating corpus statistics. It is used to measure the training/test dataset in
terms of, e.g., the number of tokens.
"""
from typing import List

from acres.util import acronym
from acres.util import functions


class Stats:
    """
    Class that generates and holds stats about a given text.
    """
    source_line_separator = "\n"

    def __init__(self) -> None:
        self.chars = 0
        self.types = 0
        self.tokens = 0
        self.acronym_types = 0
        self.acronyms = 0
        self.sentences = 0

    def calc_stats(self, text: str) -> None:
        """
        Calculates statistics for a given text string and sets the results as instance variables.

        :param text: The text to compute statistics for.
        :return: None
        """
        self.chars = Stats.count_chars(text)
        self.types = Stats.count_types(text)
        self.tokens = Stats.count_tokens(text)
        self.acronym_types = Stats.count_acronyms_types(text)
        self.acronyms = Stats.count_acronyms(text)
        self.sentences = Stats.count_sentences(text)

    @staticmethod
    def count_chars(text: str) -> int:
        """
        Count the number of non-whitespace chars in a string.

        :param text: The text to analyze.
        :return: The number of non-whitespace characters.
        """
        return len(''.join(text.split()))

    @staticmethod
    def count_types(text: str) -> int:
        """
        Count the number of unique tokens (types) in a string.

        :param text: The text to analyze.
        :return: The number of unique whitespace-separated tokens.
        """
        return len(set(text.split()))

    @staticmethod
    def count_tokens(text: str) -> int:
        """
        Count the number of all tokens in a string.

        :param text: The text to analyze.
        :return: The total number of whitespace-separated tokens.
        """
        return len(text.split())

    @staticmethod
    def count_acronyms(text: str) -> int:
        """
        Count the number of acronyms in a string. Acronyms are as defined by the
        `acronym.is_acronym()` function.

        :param text: The text to analyze.
        :return: The total number of acronym tokens.
        """
        return len(Stats._get_acronyms(text))

    @staticmethod
    def count_acronyms_types(text: str) -> int:
        """
        Count the number of unique acronyms in a string. Acronyms are as defined by the
        `acronym.is_acronym()` function.

        :param text: The text to analyze.
        :return: The number of unique acronym tokens.
        """
        return len(set(Stats._get_acronyms(text)))

    @staticmethod
    def count_sentences(text: str) -> int:
        """
        Count the number of sentences in a string. Sentences are any strings separated by
        `source_line_separator`.

        :param text: The text to analyze.
        :return: The number of separator-delimited segments.
        """
        return len(text.split(Stats.source_line_separator))

    @staticmethod
    def _get_acronyms(text: str) -> List[str]:
        """
        Collect every token that `acronym.is_acronym()` classifies as an acronym.
        """
        acronyms = []
        for token in text.split():
            if acronym.is_acronym(token):
                acronyms.append(token)
        return acronyms

    def __str__(self) -> str:
        ret = []
        ret.append("Chars: " + str(self.chars) + "\n")
        ret.append("Types: " + str(self.types) + "\n")
        ret.append("Tokens: " + str(self.tokens) + "\n")
        ret.append("Acronym Types: " + str(self.acronym_types) + "\n")
        ret.append("Acronyms: " + str(self.acronyms) + "\n")
        ret.append("Sentences (raw): " + str(self.sentences) + "\n")
        return ''.join(ret)

    def __add__(self, other: 'Stats') -> 'Stats':
        # Accumulates the counters of `other` into this instance and returns it.
        self.chars += other.chars
        self.types += other.types
        self.tokens += other.tokens
        self.acronym_types += other.acronym_types
        self.acronyms += other.acronyms
        self.sentences += other.sentences
        return self

    def __radd__(self, other: 'Stats') -> 'Stats':
        return self.__add__(other)
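
# Usage sketch (illustrative only; the sample strings below are made up and not taken from
# any acres corpus). Two Stats objects can be combined with "+": __add__ adds the counters
# of the right-hand operand into the left-hand one and returns it, so the left operand is
# modified in place.
#
#     first = Stats()
#     first.calc_stats("ECG normal\nBP 120/80")
#     second = Stats()
#     second.calc_stats("Patient with known CAD")
#     total = first + second   # `first` is mutated and returned
#     print(total)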

def get_stats(corpus_path: str) -> List[Stats]:
    """
    Generates all statistics from a given corpus directory.

    :param corpus_path: The path to the corpus directory.
    :return: A list of statistics objects, one for each file found in the corpus dir, plus an \
    extra one for the full corpus.
    """
    texts = functions.robust_text_import_from_dir(corpus_path)

    # Append the full corpus as a last doc so that we get global statistics.
    full_text = Stats.source_line_separator.join(texts)
    texts.append(full_text)

    ret = []
    for text in texts:
        stats = Stats()
        stats.calc_stats(text)
        ret.append(stats)
    return ret
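
# Illustrative sketch only: `print_stats` is called in the __main__ guard below but is not
# defined in this listing. Assuming the corpus directory is passed as the first command-line
# argument (an assumption, not part of the original module), a minimal version could look
# like this:
def print_stats() -> None:
    """
    Print the statistics of every corpus document, followed by the whole-corpus aggregate.
    """
    import sys  # local import to keep this sketch self-contained

    corpus_path = sys.argv[1] if len(sys.argv) > 1 else "."
    for stats in get_stats(corpus_path):
        print(stats)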

if __name__ == "__main__":
    print_stats()
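
# Usage sketch (illustrative; "corpus/" is a placeholder path, not part of the original
# module). get_stats() returns one Stats per corpus file plus a final element covering the
# concatenated corpus, i.e. the global statistics:
#
#     all_stats = get_stats("corpus/")
#     per_file, overall = all_stats[:-1], all_stats[-1]
#     print(overall)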