Source code for acres.word2vec.train

"""
Trainer for word2vec embeddings based on an idea originally proposed by Johannes Hellrich
(https://github.com/JULIELab/hellrich_dh2016).

.. codeauthor:: Michel Oleynik
"""

import logging

from gensim.models import Word2Vec, Phrases
# from gensim.models import FastText
from gensim.models.phrases import Phraser

from acres.model import ngrams

logger = logging.getLogger(__name__)

# Number of worker threads passed to gensim during training.
WORKERS = 4


def train(ngram_size: int = 6, min_count: int = 1, net_size: int = 100, alpha: float = 0.025,
          sg: int = 1, hs: int = 0, negative: int = 5) -> Word2Vec:
    """
    Train a word2vec model on the filtered n-gram corpus.

    :param ngram_size: Length of the n-grams streamed from the corpus; the context window is
        set to ngram_size - 1.
    :param min_count: Ignore all tokens with a total frequency lower than this.
    :param net_size: Dimensionality of the word vectors.
    :param alpha: Initial learning rate.
    :param sg: Training algorithm: 1 for skip-gram, 0 for CBOW.
    :param hs: If 1, use hierarchical softmax; if 0 and `negative` is non-zero, use negative
        sampling.
    :param negative: Number of noise words drawn for negative sampling (0 disables it).
    :return: A trained Word2Vec model.
    """
    sentences = ngrams.FilteredNGramStat(ngram_size)

    # Find common bigram collocations.
    # Trigrams led to a 2% drop in F1 caused by a 2% drop in recall (though precision increased 2%).
    # TODO debug why "Rechter_Ventrikel" is not generated
    phrases = Phrases(sentences)
    bigram_transformer = Phraser(phrases)
    collocations = bigram_transformer[sentences]

    # model = FastText(size=net_size, window=ngram_size - 1, min_count=min_count)
    # model.build_vocab(sentences=collocations)
    # model.train(sentences=collocations, total_examples=model.corpus_count, epochs=5)

    model = Word2Vec(size=net_size, alpha=alpha, window=ngram_size - 1, min_count=min_count,
                     workers=WORKERS, sg=sg, hs=hs, negative=negative)
    model.build_vocab(sentences=collocations)
    model.train(sentences=collocations, total_examples=model.corpus_count, epochs=5)
    return model
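

# A minimal, illustrative sketch of the collocation step used in train(). The toy
# sentences, counts, and thresholds below are assumptions for demonstration only;
# in this module the corpus is streamed by ngrams.FilteredNGramStat instead.
#
# toy_corpus = [["Rechter", "Ventrikel", "normal"],
#               ["Rechter", "Ventrikel", "dilatiert"],
#               ["Linker", "Ventrikel", "normal"]]
# toy_transformer = Phraser(Phrases(toy_corpus, min_count=1, threshold=1))
# toy_transformer[["Rechter", "Ventrikel", "normal"]]
# # expected with these toy thresholds: ['Rechter_Ventrikel', 'normal']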


# Hellrich
# model = gensim.models.Word2Vec(size=200, window=4, min_count=5, workers=8, alpha=0.01,
#                                sg=1, hs=0, negative=5, sample=1e-3)


if __name__ == "__main__":
    MODEL = train(min_count=5)
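    # A hedged usage sketch: the trained model could be persisted and queried
    # through gensim's standard API. The file name and the query token "Herz"
    # are placeholders, not part of acres.
    # MODEL.save("word2vec.model")
    # MODEL.wv.most_similar("Herz", topn=5)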