import numpy as np
from keras.datasets import mnist
from timeit import default_timer as timer


def vectorize(x):
    """Flatten every sample of a batch into one feature vector.

    Args:
        x: array-like of shape (N, ...), e.g. an (N, 28, 28) image batch.

    Returns:
        ndarray of shape (N, D), where D is the product of the trailing
        dimensions (784 for 28x28 images). The dtype is preserved.

    Raises:
        ValueError: if ``x`` has no batch axis (0-dimensional input).
    """
    x = np.asarray(x)
    if x.ndim < 1:
        raise ValueError("vectorize expects an array with a leading batch axis")
    # Compute the feature count explicitly instead of passing -1 to reshape:
    # -1 cannot be inferred when the batch is empty (N == 0).
    n_features = int(np.prod(x.shape[1:], dtype=np.int64))
    return x.reshape(x.shape[0], n_features)

def normalize(x):
    """Min-max scale an array into the range [0, 1].

    The minimum and maximum are taken over the whole array (not per
    feature), so all-constant feature columns cannot cause a division by
    zero on their own.

    Args:
        x: array-like of numbers, any shape and numeric dtype.

    Returns:
        A new float64 ndarray of the same shape whose smallest value is 0
        and largest value is 1. An empty input is returned as an empty
        float array; an input whose values are all equal maps to zeros.
        The input is never modified.
    """
    # np.array always copies here, so the in-place arithmetic below is safe
    # and integer inputs (e.g. uint8 pixels) cannot overflow or truncate.
    scaled = np.array(x, dtype=np.float64)
    if scaled.size == 0:
        return scaled

    lowest = scaled.min()
    span = scaled.max() - lowest
    if span == 0:
        # Constant input: there is no range to stretch, avoid dividing by 0.
        scaled.fill(0.0)
        return scaled

    # In-place updates avoid allocating extra full-size temporaries.
    scaled -= lowest
    scaled /= span
    return scaled
    
def pad(x):
    """Append a constant column of ones to a feature matrix.

    The extra column lets a linear model learn its bias as an ordinary
    weight.

    Args:
        x: array-like of shape (N, D), e.g. (N, 784).

    Returns:
        New ndarray of shape (N, D + 1) whose last column is all ones and
        whose first D columns equal ``x``. The dtype of ``x`` is preserved.

    Raises:
        ValueError: (from NumPy) if ``x`` is not 2-dimensional.
    """
    x = np.asarray(x)
    # Build the column in x's own dtype so that e.g. uint8 image data is not
    # silently promoted to float64 (8x the memory) by the concatenation.
    ones_column = np.ones((x.shape[0], 1), dtype=x.dtype)
    return np.concatenate((x, ones_column), axis=1)

def get_loss(w, x, y, reg):
    """Return the L2-regularised hinge loss of a linear SVM.

    The objective implemented here is

        L(w) = (reg / 2) * ||w||^2 + (1 / N) * sum_i max(0, 1 - y_i * x_i.w)

    NOTE(review): the stub only said "output: L(w)"; the (reg / 2) scaling
    and the mean (rather than sum) over samples are a choice made here --
    confirm against the assignment's definition. ``get_loss_gradient`` is
    written to match this formula.

    Args:
        w: (D,) weight vector, e.g. (785,). All entries are regularised.
        x: (N, D) input features.
        y: (N,) labels, expected to be +1 / -1.
        reg: regularisation strength lambda.

    Returns:
        The scalar loss as a Python float. With no samples (N == 0) only
        the regularisation term is returned.
    """
    w = np.asarray(w, dtype=np.float64)
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    margins = y * (x @ w)
    hinge = np.maximum(0.0, 1.0 - margins)
    # Guard the mean so an empty batch yields 0 instead of NaN.
    data_term = hinge.mean() if hinge.size else 0.0
    return float(0.5 * reg * (w @ w) + data_term)

def get_loss_gradient(w, x, y, reg):
    """Return a sub-gradient of the L2-regularised hinge loss.

    For the objective

        L(w) = (reg / 2) * ||w||^2 + (1 / N) * sum_i max(0, 1 - y_i * x_i.w)

    the sub-gradient used is

        dL/dw = reg * w - (1 / N) * sum_{i : y_i * x_i.w < 1} y_i * x_i

    NOTE(review): the stub only said "output: gradient dL(w)/dw"; the
    objective above (reg / 2 scaling, mean over samples) is a choice made
    here -- confirm against the assignment's definition.

    Args:
        w: (D,) weight vector, e.g. (785,).
        x: (N, D) input features.
        y: (N,) labels, expected to be +1 / -1.
        reg: regularisation strength lambda.

    Returns:
        (D,) float64 ndarray. With no samples (N == 0) only the
        regularisation part ``reg * w`` is returned.
    """
    w = np.asarray(w, dtype=np.float64)
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    n_samples = x.shape[0]
    if n_samples == 0:
        return reg * w

    # Only samples inside the margin contribute; at the kink (margin == 1)
    # the zero sub-gradient is chosen.
    violating = (y * (x @ w)) < 1.0
    data_grad = -(x.T @ (y * violating)) / n_samples
    return reg * w + data_grad

def gradient_descent(w, x, y, reg, epochs, lr):
    """Minimise the loss with full-batch gradient descent.

    Performs ``epochs`` updates of the form
    ``w <- w - lr * get_loss_gradient(w, x, y, reg)``, each using the whole
    data set.

    Args:
        w: (D,) initial weight vector, e.g. (785,). Not modified.
        x: (N, D) input features.
        y: (N,) labels.
        reg: regularisation strength lambda.
        epochs: number of gradient updates.
        lr: learning rate.

    Returns:
        (D,) float64 ndarray with the weights after the last update.
    """
    # Work on a float copy: callers reuse the same initial vector for
    # several optimisers, so it must not be updated in place.
    weights = np.array(w, dtype=np.float64)
    for _ in range(epochs):
        weights = weights - lr * get_loss_gradient(weights, x, y, reg)
    return weights

def stochastic_gradient_descent(w, x, y, reg, epochs, lr, bs, eps):
    """Minimise the loss with mini-batch stochastic gradient descent.

    Each epoch visits the samples once in a fresh random order (drawn from
    NumPy's global random state), in batches of ``bs`` samples, and applies
    ``w <- w - lr * get_loss_gradient(w, x_batch, y_batch, reg)`` per batch.
    After every epoch the loss on the full data set is evaluated with
    ``get_loss``; training stops early once it changes by less than ``eps``
    compared with the previous epoch.

    NOTE(review): the stub only described ``eps`` as an "early stopping
    threshold"; comparing the per-epoch loss change against it is the
    interpretation chosen here -- confirm against the assignment.

    Args:
        w: (D,) initial weight vector, e.g. (785,). Not modified.
        x: (N, D) input features.
        y: (N,) labels.
        reg: regularisation strength lambda.
        epochs: maximum number of passes over the data.
        lr: learning rate.
        bs: batch size B (positive integer). The last batch of an epoch
            may be smaller when N is not a multiple of ``bs``.
        eps: early stopping threshold on the absolute loss change.

    Returns:
        (D,) float64 ndarray with the weights after the last update.

    Raises:
        ValueError: if ``bs`` is smaller than 1.
    """
    if bs < 1:
        raise ValueError("bs must be a positive integer")

    x = np.asarray(x)
    y = np.asarray(y)
    # Work on a float copy: callers reuse the same initial vector for
    # several optimisers, so it must not be updated in place.
    weights = np.array(w, dtype=np.float64)
    n_samples = x.shape[0]

    # Start from +inf so the first epoch can never trigger early stopping.
    previous_loss = np.inf
    for _ in range(epochs):
        order = np.random.permutation(n_samples)
        for start in range(0, n_samples, bs):
            batch = order[start:start + bs]
            grad = get_loss_gradient(weights, x[batch], y[batch], reg)
            weights = weights - lr * grad

        current_loss = get_loss(weights, x, y, reg)
        if abs(previous_loss - current_loss) < eps:
            break
        previous_loss = current_loss
    return weights

def predict(w, x):
    """Classify samples with a linear decision function.

    Args:
        w: (D,) weight vector, e.g. (785,).
        x: (N, D) input features.

    Returns:
        (N,) integer ndarray of labels: +1 where ``x @ w >= 0`` and -1
        otherwise. A score of exactly zero is assigned to the positive
        class so that every sample receives a valid label.
    """
    scores = np.asarray(x) @ np.asarray(w)
    return np.where(scores >= 0, 1, -1)

def get_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: (N,) true labels.
        y_pred: (N,) predicted labels.

    Returns:
        Accuracy as a Python float in [0, 1]. Empty input yields 0.0
        rather than NaN.

    Raises:
        ValueError: if the two label arrays have different shapes (an
            element-wise comparison would otherwise broadcast silently).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "shape mismatch: y_true {} vs y_pred {}".format(y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        return 0.0
    return float(np.mean(y_true == y_pred))

if __name__ == '__main__':

    # Some local tests to help debug your code. Feel free to
    # edit/add anything here, the autograder output will not
    # be affected.
    #
    # This section only runs when the file is executed as a script. It
    # loads MNIST, sanity-checks the preprocessing helpers, builds a
    # two-class (+1 / -1) problem from two digits, and then trains and
    # evaluates a model with each of the two optimisers defined above.

    ############################
    # Data preprocessing tests #
    ############################

    # The shape asserts below expect 60000 training and 10000 test images
    # of 28 x 28 pixels each.
    (x_train_o, y_train_o), (x_test_o, y_test_o) = mnist.load_data()
    print(x_train_o.shape, y_train_o.shape)
    print(x_test_o.shape, y_test_o.shape)

    # vectorize must flatten every image into a 784-element row.
    assert vectorize(x_train_o).shape == (60000, 28 * 28)
    assert vectorize(x_test_o).shape == (10000, 28 * 28)

    # normalize must map the data so that its maximum is 1 and minimum is 0.
    assert np.isclose(normalize(vectorize(x_train_o)).max(), 1)
    assert np.isclose(normalize(vectorize(x_test_o)).min(), 0)

    # pad must add exactly one trailing column, filled with ones.
    assert pad(vectorize(x_train_o)).shape == (60000, 28 * 28 + 1)
    assert pad(vectorize(x_test_o)).shape == (10000, 28 * 28 + 1)
    assert np.allclose(pad(vectorize(x_train_o))[:, -1], 1)
    assert np.allclose(pad(vectorize(x_test_o))[:, -1], 1)

    print('passed')

    ###################################
    # Generate training and test sets #
    ###################################

    y_t = 4  # true label
    y_f = 5  # false label

    # Indices of the training samples labelled y_t or y_f, shuffled in place
    # (uses NumPy's global random state, so the order differs between runs
    # unless a seed is set).
    i, = ((y_train_o == y_t) | (y_train_o == y_f)).nonzero()
    np.random.shuffle(i)

    # generate training set
    # NOTE(review): normalize is applied after pad here, so the constant
    # column added by pad is rescaled together with the pixel values unless
    # normalize leaves it unchanged -- confirm this order is intended.
    x_train = normalize(pad(vectorize(x_train_o)))[i]
    # Relabel: +1 for digit y_t, -1 for the other kept digit (y_f).
    y_train = np.where(y_train_o[i] == y_t, 1, -1)
    print(x_train.shape, y_train.shape)

    # Same selection and shuffling for the test split; `i` is reused.
    i, = ((y_test_o == y_t) | (y_test_o == y_f)).nonzero()
    np.random.shuffle(i)

    # generate testing set
    x_test = normalize(pad(vectorize(x_test_o)))[i]
    y_test = np.where(y_test_o[i] == y_t, 1, -1)
    print(x_test.shape, y_test.shape)

    ###############
    # Testing SVM #
    ###############

    # All-zero initial weights, one per feature column. The same vector is
    # passed to both optimisers below, so neither may modify it in place.
    w = np.zeros(x_train.shape[1])

    # Full-batch gradient descent: train, time the training call only, then
    # report accuracy on the held-out test split.
    print('gradient descent')
    start_time = timer()
    w_gd = gradient_descent(w, x_train, y_train, reg=.1, epochs=50, lr=.1)
    train_time = timer() - start_time
    acc = get_accuracy(y_test, predict(w_gd, x_test))
    print('accuracy: {:.4f}\ntraining time: {:.1f}'.format(acc, train_time))

    print()

    # Mini-batch SGD with the same regulariser, epoch budget and learning
    # rate, plus a batch size of 16 and an early-stopping threshold.
    print('stochastic gradient descent')
    start_time = timer()
    w_sgd = stochastic_gradient_descent(w, x_train, y_train, reg=.1, epochs=50, lr=.1, bs=16, eps=.001)
    train_time = timer() - start_time
    acc = get_accuracy(y_test, predict(w_sgd, x_test))
    print('accuracy: {:.4f}\ntraining time: {:.1f}'.format(acc, train_time))
