from gensim.models import word2vec, KeyedVectors
from gensim.test.utils import datapath
import os
import numpy as np

# Builds averaged word2vec features for the aclImdb sentiment reviews and
# saves them as .npy arrays next to the raw data.
#
# Each review becomes the mean of the vectors of its in-vocabulary words.
# Positive reviews are labelled 1 and negative reviews -1; positives come
# first in the saved arrays.

# Relative to the current working directory, like the np.save targets below.
DATA_ROOT = os.path.join('data', 'aclImdb')


def mean_embedding(words, model, dim):
    """Return the mean vector of the words in *words* known to *model*.

    Args:
        words: iterable of tokens.
        model: mapping-like object; ``model[word]`` returns a length-``dim``
            vector or raises ``KeyError`` for out-of-vocabulary words.
        dim: embedding dimensionality.

    Returns:
        A float array of shape ``(dim,)``. If no word is in the vocabulary
        (or *words* is empty) the zero vector is returned instead of the
        NaNs a 0/0 division would produce.
    """
    total = np.zeros(dim)
    known = 0
    for word in words:
        try:
            total += model[word]
        except KeyError:
            # Out-of-vocabulary words are skipped on purpose.
            continue
        known += 1
    if known == 0:
        return total
    return total / known


def embed_reviews(sentences, model, dim):
    """Embed every sentence in *sentences*; return an ``(n, dim)`` array."""
    vectors = [mean_embedding(sentence, model, dim) for sentence in sentences]
    # Explicit reshape keeps the (0, dim) shape when there are no sentences.
    return np.asarray(vectors, dtype=float).reshape(len(vectors), dim)


def stack_labeled(pos_vectors, neg_vectors):
    """Stack positive then negative features and build the label column.

    Returns:
        ``(X, Y)`` where ``X`` has one row per review (positives first) and
        ``Y`` has shape ``(n, 1)`` holding 1.0 for positive rows and -1.0 for
        negative rows.
    """
    features = np.concatenate([pos_vectors, neg_vectors])
    labels = np.concatenate([
        np.ones((len(pos_vectors), 1)),
        -np.ones((len(neg_vectors), 1)),
    ])
    return features, labels


def embed_split(model, split):
    """Embed the 'pos' and 'neg' reviews of one dataset split.

    Args:
        model: loaded gensim ``KeyedVectors``.
        split: sub-directory of ``DATA_ROOT`` to read, ``'train'`` or
            ``'test'``.

    Returns:
        ``(X, Y)`` as produced by :func:`stack_labeled`. The arrays are sized
        by the number of sentences actually read rather than a fixed 25000,
        so an unexpected corpus size neither overflows nor leaves unlabeled
        zero rows.
    """
    dim = model.vector_size

    # The directory is passed straight to PathLineSentences; the previous
    # datapath() wrapper is a gensim test-data helper and only worked because
    # it was handed an absolute path.
    print('embedding positive {} reviews'.format(split))
    pos_sentences = word2vec.PathLineSentences(os.path.join(DATA_ROOT, split, 'pos'))
    pos_vectors = embed_reviews(pos_sentences, model, dim)

    print('embedding negative {} reviews'.format(split))
    neg_sentences = word2vec.PathLineSentences(os.path.join(DATA_ROOT, split, 'neg'))
    neg_vectors = embed_reviews(neg_sentences, model, dim)

    return stack_labeled(pos_vectors, neg_vectors)


# Guarded so that importing this file does not load the multi-gigabyte model
# or write any files; running it as a script behaves as before.
if __name__ == '__main__':
    print('loading model')
    model = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)

    X_train, Y_train = embed_split(model, 'train')
    np.save('data/aclImdb/train/x_train_word2vec.npy', X_train)
    # NOTE: the file name used to contain a stray ')' ("y_train)_word2vec.npy");
    # update any consumer that still loads the old name.
    np.save('data/aclImdb/train/y_train_word2vec.npy', Y_train)

    # The test features are read from the test split (previously the train
    # directories were read again, so the "test" set duplicated the train set).
    X_test, Y_test = embed_split(model, 'test')
    np.save('data/aclImdb/test/x_test_word2vec.npy', X_test)
    # NOTE: same stray ')' removed here ("y_test)_word2vec.npy").
    np.save('data/aclImdb/test/y_test_word2vec.npy', Y_test)