Source code for acres.word2vec.train

"""
Trainer for word2vec embeddings based on an idea originally proposed by Johannes Hellrich
(https://github.com/JULIELab/hellrich_dh2016).

.. codeauthor:: Michel Oleynik
"""

import logging

from gensim.models import Word2Vec, Phrases
# from gensim.models import FastText
from gensim.models.phrases import Phraser

from acres.model import ngrams

logger = logging.getLogger(__name__)

# Number of worker threads passed to gensim during training.
WORKERS = 4


def train(ngram_size: int = 6, min_count: int = 1, net_size: int = 100, alpha: float = 0.025,
          sg: int = 1, hs: int = 0, negative: int = 5) -> Word2Vec:
    """
    Train a word2vec model on the filtered n-gram corpus.

    :param ngram_size: Length of the n-grams streamed from the corpus; the context window is
        set to ngram_size - 1.
    :param min_count: Ignore all tokens with a total frequency lower than this.
    :param net_size: Dimensionality of the word vectors.
    :param alpha: Initial learning rate.
    :param sg: Training algorithm: 1 for skip-gram, 0 for CBOW.
    :param hs: If 1, use hierarchical softmax; if 0 and `negative` is non-zero, use negative
        sampling.
    :param negative: Number of noise words drawn for negative sampling (0 disables it).
    :return: A trained Word2Vec model.
    """
    sentences = ngrams.FilteredNGramStat(ngram_size)

    # Find common bigram collocations.
    # Trigrams led to a 2% drop in F1 caused by a 2% drop in recall (though precision increased 2%).
    # TODO debug why "Rechter_Ventrikel" is not generated
    phrases = Phrases(sentences)
    bigram_transformer = Phraser(phrases)
    collocations = bigram_transformer[sentences]

    # model = FastText(size=net_size, window=ngram_size - 1, min_count=min_count)
    # model.build_vocab(sentences=collocations)
    # model.train(sentences=collocations, total_examples=model.corpus_count, epochs=5)

    model = Word2Vec(size=net_size, alpha=alpha, window=ngram_size - 1, min_count=min_count,
                     workers=WORKERS, sg=sg, hs=hs, negative=negative)
    model.build_vocab(sentences=collocations)
    model.train(sentences=collocations, total_examples=model.corpus_count, epochs=5)
    return model
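

# A minimal, illustrative sketch of the collocation step used in train(). The toy
# sentences, counts, and thresholds below are assumptions for demonstration only;
# in this module the corpus is streamed by ngrams.FilteredNGramStat instead.
#
# toy_corpus = [["Rechter", "Ventrikel", "normal"],
#               ["Rechter", "Ventrikel", "dilatiert"],
#               ["Linker", "Ventrikel", "normal"]]
# toy_transformer = Phraser(Phrases(toy_corpus, min_count=1, threshold=1))
# toy_transformer[["Rechter", "Ventrikel", "normal"]]
# # expected with these toy thresholds: ['Rechter_Ventrikel', 'normal']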


# Hellrich
# model = gensim.models.Word2Vec(size=200, window=4, min_count=5, workers=8, alpha=0.01,
#                                sg=1, hs=0, negative=5, sample=1e-3)


if __name__ == "__main__":
    MODEL = train(min_count=5)
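    # A hedged usage sketch: the trained model could be persisted and queried
    # through gensim's standard API. The file name and the query token "Herz"
    # are placeholders, not part of acres.
    # MODEL.save("word2vec.model")
    # MODEL.wv.most_similar("Herz", topn=5)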