import sys
from pathlib import Path

import numpy as np
from tqdm import tqdm

# Input word-vector file handed to get_words_and_vectors_from_word2vec_format()
# by the __main__ block.
PATH_WORD2VEC = Path("models/GoogleNews-vectors-negative300.bin")
# Output text file written by join_and_save_words_and_vectors() in the
# __main__ block.
OUT_PATH = Path("models/GoogleNews-vectors-negative300.txt")


def eprint(*args, **kwargs):
    """Print to standard error; accepts the same arguments as ``print``.

    ``file`` defaults to ``sys.stderr`` (looked up at call time, so stream
    redirection is honoured). An explicit ``file=`` keyword is respected
    instead of raising ``TypeError`` for a duplicated keyword argument.
    """
    kwargs.setdefault("file", sys.stderr)
    print(*args, **kwargs)


def get_words_and_vectors_from_word2vec_format(path_wordvectors):
    """Load a text-format word2vec file into a word list and a vector array.

    Adapted from https://stackoverflow.com/a/38230349

    The file must start with a ``"<n_words> <dim>"`` header line, followed by
    one ``"<word> <v1> ... <vN>"`` line per word with single-space separators.
    Trailing whitespace on a line is ignored and blank lines are skipped.

    Args:
        path_wordvectors: Path of the text-format word vector file (UTF-8).

    Returns:
        A tuple ``(n_words, dim, words, vectors)``:

        - n_words: number of words declared in the header (int).
        - dim: vector dimension declared in the header (int).
        - words: list of the words, in file order.
        - vectors: ``np.array`` of the parsed rows, i.e. a
          ``(number of words, dimension of each word vector)`` ndarray.

    Raises:
        ValueError: if the header or a vector component is not numeric.
    """
    eprint(f"Loading word2vec Model...: {path_wordvectors}")
    words = []
    vectors = []
    # The writer in this file emits UTF-8 bytes, so read with the same
    # encoding instead of whatever the platform locale happens to be.
    with open(path_wordvectors, "r", encoding="utf-8") as f:
        # split() (no argument) tolerates the newline / stray spaces in the header.
        n_words, dim = map(int, next(f).split())
        # The header gives the expected line count, so tqdm can show a real bar.
        for i, line in enumerate(tqdm(f, total=n_words), start=1):
            if i % 10000 == 0:
                eprint("load {} data".format(i))
            # Why rstrip(): every line ends with a newline and some writers
            # (e.g. the reference word2vec C tool's text output) also leave a
            # trailing space, which would otherwise yield an unparsable token.
            line = line.rstrip()
            if not line:
                continue
            word, *values = line.split(" ")
            words.append(word)
            vectors.append([float(val) for val in values])
    eprint("Done.", len(words), " words loaded!")
    # Cross-check the header against what was actually read; only warn so that
    # a slightly inconsistent file can still be used.
    if len(words) != n_words:
        eprint(
            f"Warning: header declares {n_words} words "
            f"but {len(words)} were loaded"
        )
    return n_words, dim, words, np.array(vectors)


def join_and_save_words_and_vectors(n_words, dim, words, vectors, out_path):
    """Write words and their vectors to ``out_path`` in word2vec text format.

    The output is a ``"<n_words> <dim>"`` header line followed by one
    ``"<word> <v1> ... <vN>"`` line per word. The whole file is written
    through a single binary handle, so encoding (UTF-8) and line endings
    (``"\\n"``) are the same for the header and the rows on every platform.

    Args:
        n_words: Word count to put in the header.
        dim: Vector dimension to put in the header.
        words: Iterable of words (``str``).
        vectors: Iterable of 1-D vectors, paired with ``words`` via ``zip``.
        out_path: Destination path; an existing file is overwritten.
    """
    eprint(f"save: {out_path}")
    row_format = "%.12g"  # up to 12 significant digits per component
    with open(out_path, "wb") as fo:
        fo.write(f"{n_words} {dim}\n".encode("utf-8"))
        for word, vector in zip(words, vectors):
            fo.write(word.encode("utf-8") + b" ")
            # [None] turns the 1-D vector into a single-row 2-D array so that
            # savetxt writes it on one line instead of one value per line:
            #     python - numpy.savetxt Problems with 1D array writing - Stack Overflow
            #     https://stackoverflow.com/questions/6268657/numpy-savetxt-problems-with-1d-array-writing
            np.savetxt(
                fo,
                np.asarray(vector)[None],
                delimiter=" ",
                newline="\n",
                fmt=row_format,
            )


def load_word2vec_txt(word2vec_txt_file):
    """Read text-format word vectors from an open file into a dict.

    Adapted from https://stackoverflow.com/a/38230349

    Args:
        word2vec_txt_file: Iterator over text lines (e.g. a file opened in
            text mode). The first line is treated as the header and skipped;
            each following line is ``"<word> <v1> ... <vN>"`` separated by
            single spaces. Trailing whitespace is ignored and blank lines are
            skipped.

    Returns:
        dict mapping each word to its vector as a 1-D numpy array. If a word
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a vector component is not numeric.
    """
    print("Loading word2vec Model...", file=sys.stderr)
    model = {}
    next(word2vec_txt_file)  # skip header
    for i, line in enumerate(word2vec_txt_file, start=1):
        if i % 100000 == 0:
            print("load {} data".format(i), file=sys.stderr)
        # Drop the newline and any trailing space; without this a line ending
        # in " \n" produces a final "\n" token that float() rejects.
        line = line.rstrip()
        if not line:
            continue
        word, *values = line.split(" ")
        model[word] = np.array([float(val) for val in values])
    print("Done.", len(model), " words loaded!", file=sys.stderr)
    # TODO: verify the number of loaded words against the header.
    return model


def load_word2vec_txt_separate(word2vec_txt_file):
    """Read text-format word vectors from an open file as words + matrix.

    Adapted from https://stackoverflow.com/a/38230349

    Args:
        word2vec_txt_file: Iterator over text lines (e.g. a file opened in
            text mode). The first line is the ``"<n_words> <dim>"`` header;
            each following line is ``"<word> <v1> ... <vN>"`` separated by
            single spaces. Trailing whitespace is ignored and blank lines are
            skipped.

    Returns:
        A tuple ``(n_words, dim, words, vectors)``:

        - n_words, dim: the two header fields.
        - words: list of the words, in file order.
        - vectors: ``np.array`` of the parsed rows, i.e. a
          ``(number of words, dimension of each word vector)`` ndarray.

        NOTE: ``n_words`` and ``dim`` are returned as the raw ``str`` tokens of
        the header line (``dim`` still carries the line terminator); this is
        kept for compatibility, so apply ``int()`` if numbers are needed.

    Raises:
        ValueError: if the header does not have exactly two space-separated
            fields, or a vector component is not numeric.
    """
    print("Loading word2vec Model...", file=sys.stderr)
    n_words, dim = next(word2vec_txt_file).split(" ")  # header
    words = []
    vectors = []
    for i, line in enumerate(word2vec_txt_file, start=1):
        if i % 100000 == 0:
            print("load {} data".format(i), file=sys.stderr)
        # Drop the newline and any trailing space; without this a line ending
        # in " \n" produces a final "\n" token that float() rejects.
        line = line.rstrip()
        if not line:
            continue
        word, *values = line.split(" ")
        words.append(word)
        vectors.append([float(val) for val in values])
    print("Done.", len(words), " words loaded!", file=sys.stderr)
    # TODO: verify the number of loaded words against the header.
    return n_words, dim, words, np.array(vectors)


def join_words_and_vectors(n_words, dim, words, vectors, out_path):
    """Write words and their vectors to ``out_path`` in word2vec text format.

    The output is a ``"<n_words> <dim>"`` header line followed by one
    ``"<word> <v1> ... <vN>"`` line per word, all written as UTF-8 with
    ``"\\n"`` line endings through a single binary handle.

    Args:
        n_words: Word count for the header (int or str).
        dim: Vector dimension for the header (int or str).
        words: Iterable of words (``str``).
        vectors: Iterable of 1-D vectors, paired with ``words`` via ``zip``.
        out_path: Destination path; an existing file is overwritten.
    """
    # The header fields may be ints or raw tokens split from a header line
    # (in which case dim can still carry a newline). Strip both and terminate
    # the header with exactly one newline so the first word never ends up
    # glued to the header.
    header = f"{str(n_words).strip()} {str(dim).strip()}\n"
    row_format = "%.12g"  # up to 12 significant digits per component
    with open(out_path, "wb") as fo:
        fo.write(header.encode("utf-8"))
        for word, vector in zip(words, vectors):
            fo.write(word.encode("utf-8") + b" ")
            # [None] turns the 1-D vector into a single-row 2-D array so that
            # savetxt writes it on one line instead of one value per line:
            #     python - numpy.savetxt Problems with 1D array writing - Stack Overflow
            #     https://stackoverflow.com/questions/6268657/numpy-savetxt-problems-with-1d-array-writing
            np.savetxt(
                fo,
                np.asarray(vector)[None],
                delimiter=" ",
                newline="\n",
                fmt=row_format,
            )


if __name__ == "__main__":
    # Load the vectors at PATH_WORD2VEC and write them back out as text to
    # OUT_PATH.
    # NOTE(review): PATH_WORD2VEC has a ".bin" suffix, yet the loader opens its
    # input in text mode and parses space-separated floats — confirm the input
    # really is a text-format word2vec file.
    loaded = get_words_and_vectors_from_word2vec_format(PATH_WORD2VEC)
    join_and_save_words_and_vectors(*loaded, OUT_PATH)

    # Disabled round-trip checks, kept for reference:
    # with open(OUT_PATH, "r") as f:
    #     model = load_word2vec_txt(f)
    # print(model["the"])
    # with open(OUT_PATH, "r") as f:
    #     n_words, dim, words, vectors = load_word2vec_txt_separate(f)
    # join_words_and_vectors(n_words, dim, words, vectors, OUT_PATH)
