"""
15-110 Hw6 - Language Modeling Project
Name: 
AndrewID:
"""

# Name of this project; the starter code marks it as a value not to be changed.
project = "WordPredict" # don't edit this

#### CHECK-IN 1 ####

def loadBook(filename):
    """
    Read a pre-tokenized text file and return it as a corpus.

    Every non-blank line of the file becomes one sentence, and every
    whitespace-separated token on that line becomes one word of it.

    filename: path of the text file to read (for example "data/test1.txt").
    Returns: a 2D list of strings, one inner list per non-blank line.
    Raises: OSError if the file cannot be opened.
    """
    corpus = []
    # The context manager closes the file even if reading fails part-way.
    with open(filename, encoding="utf-8") as bookFile:
        for line in bookFile:
            words = line.split()
            if words:  # skip blank lines so no empty sentences are produced
                corpus.append(words)
    return corpus

def getCorpusLength(corpus):
    """
    Count every word (token) in the corpus, duplicates included.

    corpus: 2D list of strings, one inner list per sentence.
    Returns: the total number of words across all sentences (0 if empty).
    """
    return sum(len(sentence) for sentence in corpus)

def buildVocabulary(corpus):
    """
    Collect the distinct words of the corpus.

    corpus: 2D list of strings, one inner list per sentence.
    Returns: a list holding each word exactly once, in order of first
    appearance.
    """
    # dict keys are unique and keep insertion order, so this removes
    # duplicates without shuffling the words.
    return list(dict.fromkeys(word for sentence in corpus for word in sentence))

def countUnigrams(corpus):
    """
    Count how many times each word occurs in the corpus.

    corpus: 2D list of strings, one inner list per sentence.
    Returns: a dictionary mapping each word to its number of occurrences.
    """
    counts = {}
    for sentence in corpus:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    return counts

def getStartWords(corpus):
    """
    Collect the distinct words that begin a sentence.

    corpus: 2D list of strings, one inner list per sentence.
    Returns: a list holding each sentence-starting word exactly once, in
    order of first appearance. Empty sentences are skipped.
    """
    return list(dict.fromkeys(sentence[0] for sentence in corpus if sentence))

def countStartWords(corpus):
    """
    Count how many sentences begin with each word.

    corpus: 2D list of strings, one inner list per sentence.
    Returns: a dictionary mapping each sentence-starting word to the number
    of sentences it starts. Empty sentences are skipped.
    """
    counts = {}
    for sentence in corpus:
        if sentence:
            first = sentence[0]
            counts[first] = counts.get(first, 0) + 1
    return counts

def countBigrams(corpus):
    """
    Count how often each word is immediately followed by each other word.

    Pairs are only formed inside a sentence, never across sentence
    boundaries, so a word that only ever ends sentences gets no entry.

    corpus: 2D list of strings, one inner list per sentence.
    Returns: a nested dictionary {firstWord: {secondWord: count}}.
    """
    counts = {}
    for sentence in corpus:
        # zip against the sentence shifted by one yields adjacent pairs.
        for first, second in zip(sentence, sentence[1:]):
            followers = counts.setdefault(first, {})
            followers[second] = followers.get(second, 0) + 1
    return counts


def doWeek1():
    """
    Run each Check-in 1 function once on the full fairy-tale corpus.

    The results are not used; the calls simply exercise the functions on
    the real data file.
    """
    corpus = loadBook("data/fairytales_clean.txt")
    getCorpusLength(corpus)
    buildVocabulary(corpus)
    countUnigrams(corpus)
    countStartWords(corpus)
    countBigrams(corpus)

#### WEEK 1 TESTS ####

def testLoadBook():
    """Check loadBook() against the two sample files in the data folder."""
    print("Testing loadBook()...", end="")
    # We test with two short files, test1.txt and test2.txt.
    # Open them up to see the contents!
    expectedByFile = [
        ("data/test1.txt", [
            ["hello", "and", "welcome", "to", "15-110", "."],
            ["we're", "happy", "to", "have", "you", "."],
        ]),
        ("data/test2.txt", [
            ["this", "is", "the", "song", "that", "never", "ends"],
            ["yes", "it", "goes", "on", "and", "on", "my", "friends", "!"],
            ["some", "people", "started", "singing", "it", ",", "not",
             "knowing", "what", "it", "was", ","],
            ["and", "now", "they", "keep", "on", "singing", "it", "forever",
             "just", "because", ".", ".", "."],
        ]),
    ]
    for path, expectedCorpus in expectedByFile:
        assert loadBook(path) == expectedCorpus
    print("... done!")

def testGetCorpusLength():
    """Check getCorpusLength() on three small hand-written corpora."""
    print("Testing getCorpusLength()...", end="")
    helloCorpus = [
        ["hello", "world"],
        ["hello", "world", "again"],
    ]
    welcomeCorpus = [
        ["hello", "and", "welcome", "to", "15-110", "."],
        ["we're", "happy", "to", "have", "you", "."],
    ]
    songCorpus = [
        ["this", "is", "the", "song", "that", "never", "ends"],
        ["yes", "it", "goes", "on", "and", "on", "my", "friends", "!"],
        ["some", "people", "started", "singing", "it", ",", "not", "knowing",
         "what", "it", "was", ","],
        ["and", "now", "they", "keep", "on", "singing", "it", "forever",
         "just", "because", ".", ".", "."],
    ]
    assert getCorpusLength(helloCorpus) == 5
    assert getCorpusLength(welcomeCorpus) == 12
    assert getCorpusLength(songCorpus) == 41
    print("... done!")

def testBuildVocabulary():
    """
    Check buildVocabulary() on three small corpora.

    Both sides are sorted before comparing, so the order of the returned
    words does not matter.
    """
    print("Testing buildVocabulary()...", end="")
    helloCorpus = [
        ["hello", "world"],
        ["hello", "world", "again"],
    ]
    helloVocab = ["hello", "world", "again"]
    assert sorted(buildVocabulary(helloCorpus)) == sorted(helloVocab)

    welcomeCorpus = [
        ["hello", "and", "welcome", "to", "15-110", "."],
        ["we're", "happy", "to", "have", "you", "."],
    ]
    welcomeVocab = ["hello", "and", "welcome", "to", "15-110", ".", "we're",
                    "happy", "have", "you"]
    assert sorted(buildVocabulary(welcomeCorpus)) == sorted(welcomeVocab)

    songCorpus = [
        ["this", "is", "the", "song", "that", "never", "ends"],
        ["yes", "it", "goes", "on", "and", "on", "my", "friends", "!"],
        ["some", "people", "started", "singing", "it", ",", "not", "knowing",
         "what", "it", "was", ","],
        ["and", "now", "they", "keep", "on", "singing", "it", "forever",
         "just", "because", ".", ".", "."],
    ]
    songVocab = ["this", "is", "the", "song", "that", "never", "ends", "yes",
                 "it", "goes", "on", "and", "my", "friends", "!", "some",
                 "people", "started", "singing", ",", "not", "knowing",
                 "what", "was", "now", "they", "keep", "forever", "just",
                 "because", "."]
    assert sorted(buildVocabulary(songCorpus)) == sorted(songVocab)
    print("... done!")

def testCountUnigrams():
    """Check countUnigrams() against hand-counted totals for three corpora."""
    print("Testing countUnigrams()...", end="")
    helloCorpus = [
        ["hello", "world"],
        ["hello", "world", "again"],
    ]
    helloCounts = { "hello" : 2, "world" : 2, "again" : 1 }
    assert countUnigrams(helloCorpus) == helloCounts

    welcomeCorpus = [
        ["hello", "and", "welcome", "to", "15-110", "."],
        ["we're", "happy", "to", "have", "you", "."],
    ]
    welcomeCounts = {
        "hello" : 1, "and" : 1, "welcome" : 1, "to" : 2, "15-110" : 1,
        "." : 2, "we're" : 1, "happy" : 1, "have" : 1, "you" : 1,
    }
    assert countUnigrams(welcomeCorpus) == welcomeCounts

    songCorpus = [
        ["this", "is", "the", "song", "that", "never", "ends"],
        ["yes", "it", "goes", "on", "and", "on", "my", "friends", "!"],
        ["some", "people", "started", "singing", "it", ",", "not", "knowing",
         "what", "it", "was", ","],
        ["and", "now", "they", "keep", "on", "singing", "it", "forever",
         "just", "because", ".", ".", "."],
    ]
    songCounts = {
        "this" : 1, "is" : 1, "the" : 1, "song" : 1, "that" : 1,
        "never" : 1, "ends" : 1, "yes" : 1, "it" : 4, "goes" : 1, "on" : 3,
        "and" : 2, "my" : 1, "friends" : 1, "!" : 1, "some" : 1,
        "people" : 1, "started" : 1, "singing" : 2, "," : 2, "not" : 1,
        "knowing" : 1, "what" : 1, "was" : 1, "now" : 1, "they" : 1,
        "keep" : 1, "forever" : 1, "just" : 1, "because" : 1, "." : 3,
    }
    assert countUnigrams(songCorpus) == songCounts
    print("... done!")

def testGetStartWords():
    """
    Check getStartWords() on three small corpora.

    The first case compares the list directly; the other two sort both
    sides first, so the order of the returned words does not matter there.
    """
    print("Testing getStartWords()...", end="")
    helloCorpus = [
        ["hello", "world"],
        ["hello", "world", "again"],
    ]
    assert getStartWords(helloCorpus) == [ "hello" ]

    welcomeCorpus = [
        ["hello", "and", "welcome", "to", "15-110", "."],
        ["we're", "happy", "to", "have", "you", "."],
    ]
    welcomeStarts = ["hello", "we're"]
    assert sorted(getStartWords(welcomeCorpus)) == sorted(welcomeStarts)

    songCorpus = [
        ["this", "is", "the", "song", "that", "never", "ends"],
        ["yes", "it", "goes", "on", "and", "on", "my", "friends", "!"],
        ["some", "people", "started", "singing", "it", ",", "not", "knowing",
         "what", "it", "was", ","],
        ["and", "now", "they", "keep", "on", "singing", "it", "forever",
         "just", "because", ".", ".", "."],
    ]
    songStarts = ["this", "yes", "some", "and"]
    assert sorted(getStartWords(songCorpus)) == sorted(songStarts)
    print("... done!")

def testCountStartWords():
    """Check countStartWords() against hand-counted totals for three corpora."""
    print("Testing countStartWords()...", end="")
    helloCorpus = [
        ["hello", "world"],
        ["hello", "world", "again"],
    ]
    assert countStartWords(helloCorpus) == { "hello" : 2 }

    welcomeCorpus = [
        ["hello", "and", "welcome", "to", "15-110", "."],
        ["we're", "happy", "to", "have", "you", "."],
    ]
    assert countStartWords(welcomeCorpus) == { "hello" : 1, "we're" : 1 }

    songCorpus = [
        ["this", "is", "the", "song", "that", "never", "ends"],
        ["yes", "it", "goes", "on", "and", "on", "my", "friends", "!"],
        ["some", "people", "started", "singing", "it", ",", "not", "knowing",
         "what", "it", "was", ","],
        ["and", "now", "they", "keep", "on", "singing", "it", "forever",
         "just", "because", ".", ".", "."],
    ]
    songStartCounts = { "this" : 1, "yes" : 1, "some" : 1, "and" : 1 }
    assert countStartWords(songCorpus) == songStartCounts
    print("... done!")

def testCountBigrams():
    """Check countBigrams() against hand-counted word pairs for three corpora."""
    print("Testing countBigrams()...", end="")
    helloCorpus = [
        ["hello", "world"],
        ["hello", "world", "again"],
    ]
    helloPairs = {
        "hello" : { "world" : 2 },
        "world" : { "again" : 1 },
    }
    assert countBigrams(helloCorpus) == helloPairs

    welcomeCorpus = [
        ["hello", "and", "welcome", "to", "15-110", "."],
        ["we're", "happy", "to", "have", "you", "."],
    ]
    welcomePairs = {
        "hello" : { "and" : 1 },
        "and" : { "welcome" : 1 },
        "welcome" : { "to" : 1 },
        "to" : { "15-110" : 1, "have" : 1 },
        "15-110" : { "." : 1 },
        "we're" : { "happy" : 1 },
        "happy" : { "to" : 1 },
        "have" : { "you" : 1 },
        "you" : { "." : 1 },
    }
    assert countBigrams(welcomeCorpus) == welcomePairs

    songCorpus = [
        ["this", "is", "the", "song", "that", "never", "ends"],
        ["yes", "it", "goes", "on", "and", "on", "my", "friends", "!"],
        ["some", "people", "started", "singing", "it", ",", "not", "knowing",
         "what", "it", "was", ","],
        ["and", "now", "they", "keep", "on", "singing", "it", "forever",
         "just", "because", ".", ".", "."],
    ]
    songPairs = {
        "this" : { "is" : 1 },
        "is" : { "the" : 1 },
        "the" : { "song" : 1 },
        "song" : { "that" : 1 },
        "that" : { "never" : 1 },
        "never" : { "ends" : 1 },
        "yes" : { "it" : 1 },
        "it" : { "goes" : 1, "," : 1, "was" : 1, "forever" : 1 },
        "goes" : { "on" : 1 },
        "on" : { "and" : 1, "my" : 1, "singing" : 1 },
        "and" : { "on" : 1, "now" : 1 },
        "my" : { "friends" : 1 },
        "friends" : { "!" : 1 },
        "some" : { "people" : 1 },
        "people" : { "started" : 1 },
        "started" : { "singing" : 1 },
        "singing" : { "it" : 2 },
        "," : { "not" : 1 },
        "not" : { "knowing" : 1 },
        "knowing" : { "what" : 1 },
        "what" : { "it" : 1 },
        "was" : { "," : 1 },
        "now" : { "they" : 1 },
        "they" : { "keep" : 1 },
        "keep" : { "on" : 1 },
        "forever" : { "just" : 1 },
        "just" : { "because" : 1 },
        "because" : { "." : 1 },
        "." : { "." : 2 },
    }
    assert countBigrams(songCorpus) == songPairs
    print("... done!")

def testWeek1():
    """Run every Check-in 1 test in order; a failing assert stops the run."""
    weekOneTests = (
        testLoadBook,
        testGetCorpusLength,
        testBuildVocabulary,
        testCountUnigrams,
        testGetStartWords,
        testCountStartWords,
        testCountBigrams,
    )
    for runTest in weekOneTests:
        runTest()

# Run the Check-in 1 tests and then the Check-in 1 demo whenever this file is
# executed or imported (these calls are not behind a __main__ guard).
testWeek1()
doWeek1()

#### CHECK-IN 2 ####

def buildUniformProbs(unigrams):
    """
    Give every word the same probability.

    unigrams: list of the distinct words in the vocabulary.
    Returns: a list as long as unigrams in which every entry is
    1 / len(unigrams); an empty list if unigrams is empty.
    """
    if not unigrams:
        return []  # avoid dividing by zero for an empty vocabulary
    return [1 / len(unigrams)] * len(unigrams)

def buildUnigramProbs(unigrams, unigramCounts, totalCount):
    """
    Turn word counts into probabilities.

    unigrams: list of words to compute probabilities for.
    unigramCounts: dictionary mapping each word to its count.
    totalCount: the number every count is divided by.
    Returns: a list parallel to unigrams where entry i is
    unigramCounts[unigrams[i]] / totalCount. A word missing from
    unigramCounts gets probability 0.
    """
    return [unigramCounts.get(word, 0) / totalCount for word in unigrams]

def buildBigramProbs(unigramCounts, bigramCounts):
    """
    Turn bigram counts into next-word probabilities.

    unigramCounts: dictionary mapping each word to its total count.
    bigramCounts: nested dictionary {prevWord: {nextWord: count}}.
    Returns: a dictionary mapping each prevWord in bigramCounts to
    {"words": [...], "probs": [...]}, two parallel lists where each
    probability is the pair count divided by the total count of prevWord.
    Raises: KeyError if a prevWord is missing from unigramCounts.
    """
    bigramProbs = {}
    for prevWord, followers in bigramCounts.items():
        # Dividing by the unigram count (not the sum of follower counts)
        # means the probs add up to less than 1 when prevWord also ends
        # sentences.
        total = unigramCounts[prevWord]
        bigramProbs[prevWord] = {
            "words": list(followers),
            "probs": [count / total for count in followers.values()],
        }
    return bigramProbs

def getTopWords(count, words, probs, ignoreList):
    """
    Pick the highest-probability words, leaving out ignored ones.

    count: how many words to return at most.
    words: list of candidate words.
    probs: list parallel to words holding each word's probability.
    ignoreList: words that must never be returned.
    Returns: a dictionary mapping up to count words to their probabilities,
    ordered from most to least probable. Words with equal probability keep
    their relative order from words.
    """
    ignored = set(ignoreList)  # set membership is O(1) per lookup
    candidates = [(word, prob) for word, prob in zip(words, probs)
                  if word not in ignored]
    # sorted() is stable even with reverse=True, so ties stay in input order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:max(count, 0)])

from random import choices
def generateTextFromUnigrams(count, words, probs):
    """
    Generate random text by drawing each word independently.

    count: number of words to generate.
    words: list of words to draw from.
    probs: list parallel to words giving each word's relative weight.
    Returns: a string of count words separated by single spaces
    (the empty string when count is 0).
    """
    # choices() draws with replacement, weighting each word by its prob.
    return " ".join(choices(words, weights=probs, k=count))

def generateTextFromBigrams(count, startWords, startWordProbs, bigramProbs):
    """
    Generate random text where each word depends on the previous one.

    count: number of words to generate.
    startWords: list of words that may begin a sentence.
    startWordProbs: list parallel to startWords giving relative weights.
    bigramProbs: dictionary mapping a word to {"words": [...], "probs": [...]},
        the possible next words and their relative weights.
    Returns: a string of count words separated by single spaces.

    A new sentence (a word drawn from startWords) begins at the very start,
    after every ".", and after any word with no entry in bigramProbs, since
    there is nothing recorded that could follow it.
    """
    generated = []
    for _ in range(count):
        # Treat the start of the text like the position right after a ".".
        previous = generated[-1] if generated else "."
        if previous == "." or previous not in bigramProbs:
            nextWord = choices(startWords, weights=startWordProbs)[0]
        else:
            options = bigramProbs[previous]
            nextWord = choices(options["words"], weights=options["probs"])[0]
        generated.append(nextWord)
    return " ".join(generated)


def doWeek2():
    """
    Build the uniform, unigram and bigram models from the fairy-tale corpus
    and print sample output from each one.
    """
    divider = "\n-----\n"

    # Check-in 1 Functions
    corpus = loadBook("data/fairytales_clean.txt")
    totalWords = getCorpusLength(corpus)
    vocabulary = buildVocabulary(corpus)
    firstWords = getStartWords(corpus)
    wordCounts = countUnigrams(corpus)
    firstWordCounts = countStartWords(corpus)
    pairCounts = countBigrams(corpus)

    # Uniform Model
    uniformProbs = buildUniformProbs(vocabulary)
    print("\nText generated by the Uniform Model:\n")
    print(generateTextFromUnigrams(100, vocabulary, uniformProbs))
    print(divider)

    # Unigram Model
    wordProbs = buildUnigramProbs(vocabulary, wordCounts, totalWords)
    # Start-word probabilities are taken over the number of sentences.
    firstWordProbs = buildUnigramProbs(firstWords, firstWordCounts, len(corpus))
    print("Top 20 words in the Unigram Model:")
    print(getTopWords(20, vocabulary, wordProbs, []))
    print("\nTop 20 starting words in the Unigram Model:")
    print(getTopWords(20, firstWords, firstWordProbs, []))
    print("\nText generated by the Unigram Model:\n")
    print(generateTextFromUnigrams(100, vocabulary, wordProbs))
    print(divider)

    # Bigram Model
    pairProbs = buildBigramProbs(wordCounts, pairCounts)
    print("Text generated by the Bigram Model:\n")
    print(generateTextFromBigrams(100, firstWords, firstWordProbs, pairProbs))

#### WEEK 2 TESTS ####

def testBuildUniformProbs():
    """Check buildUniformProbs() on vocabularies of 3, 10 and 31 words."""
    print("Testing buildUniformProbs()...", end="")
    helloWords = [ "hello", "world", "again"]
    assert buildUniformProbs(helloWords) == [1/3] * 3

    welcomeWords = [ "hello", "and", "welcome", "to", "15-110", ".", "we're",
                     "happy", "have", "you"]
    assert buildUniformProbs(welcomeWords) == [0.1] * 10

    songWords = [ "this", "is", "the", "song", "that", "never", "ends", "yes",
                  "it", "goes", "on", "and", "my", "friends", "!", "some",
                  "people", "started", "singing", ",", "not", "knowing",
                  "what", "was", "now", "they", "keep", "forever", "just",
                  "because", "." ]
    assert buildUniformProbs(songWords) == [1/31] * 31
    print("... done!")
    
def testBuildUnigramProbs():
    """Check buildUnigramProbs() against hand-computed count/total ratios."""
    print("Testing buildUnigramProbs()...", end="")
    helloWords = [ "hello", "world", "again"]
    helloCounts = { "hello" : 2, "world" : 2, "again" : 1 }
    assert buildUnigramProbs(helloWords, helloCounts, 5) == [2/5, 2/5, 1/5]

    welcomeWords = [ "hello", "and", "welcome", "to", "15-110", ".", "we're",
                     "happy", "have", "you"]
    welcomeCounts = {
        "hello" : 1, "and" : 1, "welcome" : 1, "to" : 2, "15-110" : 1,
        "." : 2, "we're" : 1, "happy" : 1, "have" : 1, "you" : 1,
    }
    # Expected numerators over 12, in the same order as welcomeWords.
    welcomeExpected = [n/12 for n in (1, 1, 1, 2, 1, 2, 1, 1, 1, 1)]
    assert buildUnigramProbs(welcomeWords, welcomeCounts, 12) == welcomeExpected

    songWords = [ "this", "is", "the", "song", "that", "never", "ends", "yes",
                  "it", "goes", "on", "and", "my", "friends", "!", "some",
                  "people", "started", "singing", ",", "not", "knowing",
                  "what", "was", "now", "they", "keep", "forever", "just",
                  "because", "." ]
    songCounts = {
        "this" : 1, "is" : 1, "the" : 1, "song" : 1, "that" : 1,
        "never" : 1, "ends" : 1, "yes" : 1, "it" : 4, "goes" : 1, "on" : 3,
        "and" : 2, "my" : 1, "friends" : 1, "!" : 1, "some" : 1,
        "people" : 1, "started" : 1, "singing" : 2, "," : 2, "not" : 1,
        "knowing" : 1, "what" : 1, "was" : 1, "now" : 1, "they" : 1,
        "keep" : 1, "forever" : 1, "just" : 1, "because" : 1, "." : 3,
    }
    # Expected numerators over 41, in the same order as songWords.
    songExpected = [n/41 for n in (
        1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 3, 2,
        1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3)]
    assert buildUnigramProbs(songWords, songCounts, 41) == songExpected
    print("... done!")

def testBuildBigramProbs():
    """
    Check buildBigramProbs() on three sample corpora plus one case whose
    followers have different counts.
    """
    print("Testing buildBigramProbs()...", end="")

    def single(word, prob=1):
        # Expected entry for a word that has exactly one possible follower.
        return { "words" : [word], "probs" : [prob] }

    helloCounts = { "hello" : 2, "world" : 2, "again" : 1 }
    helloBigrams = { "hello" : { "world" : 2 }, "world" : { "again" : 1 } }
    helloExpected = {
        "hello" : single("world"),
        # 'world' appears twice, once at the end of a sentence
        "world" : single("again", 0.5),
    }
    assert buildBigramProbs(helloCounts, helloBigrams) == helloExpected

    welcomeCounts = {
        "hello" : 1, "and" : 1, "welcome" : 1, "to" : 2, "15-110" : 1,
        "." : 2, "we're" : 1, "happy" : 1, "have" : 1, "you" : 1,
    }
    welcomeBigrams = {
        "hello" : { "and" : 1 },
        "and" : { "welcome" : 1 },
        "welcome" : { "to" : 1 },
        "to" : { "15-110" : 1, "have" : 1 },
        "15-110" : { "." : 1 },
        "we're" : { "happy" : 1 },
        "happy" : { "to" : 1 },
        "have" : { "you" : 1 },
        "you" : { "." : 1 },
    }
    welcomeExpected = {
        "hello" : single("and"),
        "and" : single("welcome"),
        "welcome" : single("to"),
        "to" : { "words" : ["15-110", "have"], "probs" : [0.5, 0.5] },
        "15-110" : single("."),
        "we're" : single("happy"),
        "happy" : single("to"),
        "have" : single("you"),
        "you" : single("."),
    }
    assert buildBigramProbs(welcomeCounts, welcomeBigrams) == welcomeExpected

    songCounts = {
        "this" : 1, "is" : 1, "the" : 1, "song" : 1, "that" : 1,
        "never" : 1, "ends" : 1, "yes" : 1, "it" : 4, "goes" : 1, "on" : 3,
        "and" : 2, "my" : 1, "friends" : 1, "!" : 1, "some" : 1,
        "people" : 1, "started" : 1, "singing" : 2, "," : 2, "not" : 1,
        "knowing" : 1, "what" : 1, "was" : 1, "now" : 1, "they" : 1,
        "keep" : 1, "forever" : 1, "just" : 1, "because" : 1, "." : 3,
    }
    songBigrams = {
        "this" : { "is" : 1 },
        "is" : { "the" : 1 },
        "the" : { "song" : 1 },
        "song" : { "that" : 1 },
        "that" : { "never" : 1 },
        "never" : { "ends" : 1 },
        "yes" : { "it" : 1 },
        "it" : { "goes" : 1, "," : 1, "was" : 1, "forever" : 1 },
        "goes" : { "on" : 1 },
        "on" : { "and" : 1, "my" : 1, "singing" : 1 },
        "and" : { "on" : 1, "now" : 1 },
        "my" : { "friends" : 1 },
        "friends" : { "!" : 1 },
        "some" : { "people" : 1 },
        "people" : { "started" : 1 },
        "started" : { "singing" : 1 },
        "singing" : { "it" : 2 },
        "," : { "not" : 1 },
        "not" : { "knowing" : 1 },
        "knowing" : { "what" : 1 },
        "what" : { "it" : 1 },
        "was" : { "," : 1 },
        "now" : { "they" : 1 },
        "they" : { "keep" : 1 },
        "keep" : { "on" : 1 },
        "forever" : { "just" : 1 },
        "just" : { "because" : 1 },
        "because" : { "." : 1 },
        "." : { "." : 2 },
    }
    songExpected = {
        "this" : single("is"),
        "is" : single("the"),
        "the" : single("song"),
        "song" : single("that"),
        "that" : single("never"),
        "never" : single("ends"),
        "yes" : single("it"),
        "it" : { "words" : ["goes", ",", "was", "forever"],
                 "probs" : [0.25, 0.25, 0.25, 0.25] },
        "goes" : single("on"),
        "on" : { "words" : ["and", "my", "singing"],
                 "probs" : [1/3, 1/3, 1/3] },
        "and" : { "words" : ["on", "now"], "probs" : [0.5, 0.5] },
        "my" : single("friends"),
        "friends" : single("!"),
        "some" : single("people"),
        "people" : single("started"),
        "started" : single("singing"),
        "singing" : single("it"),
        # the total count of "," is 2, with one at the end of a sentence
        "," : single("not", 0.5),
        "not" : single("knowing"),
        "knowing" : single("what"),
        "what" : single("it"),
        "was" : single(","),
        "now" : single("they"),
        "they" : single("keep"),
        "keep" : single("on"),
        "forever" : single("just"),
        "just" : single("because"),
        "because" : single("."),
        # the total count of "." is 3
        "." : single(".", 2/3),
    }
    assert buildBigramProbs(songCounts, songBigrams) == songExpected

    # One final test to make sure probabilities aren't always the same
    unevenExpected = {
        "one" : { "words" : ["a", "b"], "probs" : [1/3, 2/3] },
    }
    assert buildBigramProbs({ "one" : 3 },
                            { "one" : { "a" : 1, "b" : 2 } }) == unevenExpected
    print("... done!")

def testGetTopWords():
    """
    Check getTopWords() with different counts and ignore lists.

    Results are compared as dictionaries, so the order of the returned
    words is not checked.
    """
    print("Testing getTopWords()...", end="")
    helloWords = [ "hello", "world", "again"]
    helloProbs = [2/5, 2/5, 1/5]
    welcomeWords = [ "hello", "and", "welcome", "to", "15-110", ".", "we're",
                     "happy", "have", "you"]
    welcomeProbs = [n/12 for n in (1, 1, 1, 2, 1, 2, 1, 1, 1, 1)]
    songWords = [ "this", "is", "the", "song", "that", "never", "ends", "yes",
                  "it", "goes", "on", "and", "my", "friends", "!", "some",
                  "people", "started", "singing", ",", "not", "knowing",
                  "what", "was", "now", "they", "keep", "forever", "just",
                  "because", "." ]
    songProbs = [n/41 for n in (
        1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 3, 2,
        1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3)]

    # Every call receives fresh copies of the fixture lists, so one call can
    # never influence the inputs of the next.
    assert getTopWords(2, list(helloWords), list(helloProbs), []) == {
        "hello" : 2/5, "world" : 2/5 }
    assert getTopWords(3, list(helloWords), list(helloProbs), []) == {
        "hello" : 2/5, "world" : 2/5, "again" : 1/5 }

    assert getTopWords(2, list(welcomeWords), list(welcomeProbs), []) == {
        "to" : 2/12, "." : 2/12 }
    welcomeIgnored = [".", "hello", "and", "15-110", "we're", "have", "you"]
    assert getTopWords(3, list(welcomeWords), list(welcomeProbs),
                       welcomeIgnored) == {
        "to" : 2/12, "welcome" : 1/12, "happy" : 1/12 }

    assert getTopWords(1, list(songWords), list(songProbs), []) == {
        "it" : 4/41 }
    assert getTopWords(3, list(songWords), list(songProbs), []) == {
        "it" : 4/41, "on" : 3/41, "." : 3/41 }
    assert getTopWords(6, list(songWords), list(songProbs), []) == {
        "it" : 4/41, "on" : 3/41, "." : 3/41, "and" : 2/41,
        "singing" : 2/41, "," : 2/41 }
    print("... done!")

def testGenerateTextFromUnigrams():
    """Check the length and word membership of generateTextFromUnigrams() output."""
    print("Testing generateTextFromUnigrams()...", end="")
    # Since this is random, we can only check that it's the right length
    # and that it only uses words in the provided list.
    helloWords = [ "hello", "world", "again"]
    helloProbs = [2/5, 2/5, 1/5]
    welcomeWords = [ "hello", "and", "welcome", "to", "15-110", ".", "we're",
                     "happy", "have", "you"]
    welcomeProbs = [n/12 for n in (1, 1, 1, 2, 1, 2, 1, 1, 1, 1)]
    songWords = [ "this", "is", "the", "song", "that", "never", "ends", "yes",
                  "it", "goes", "on", "and", "my", "friends", "!", "some",
                  "people", "started", "singing", ",", "not", "knowing",
                  "what", "was", "now", "they", "keep", "forever", "just",
                  "because", "." ]
    songProbs = [n/41 for n in (
        1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 3, 2,
        1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3)]

    cases = [
        (5, helloWords, helloProbs),
        (10, welcomeWords, welcomeProbs),
        (20, songWords, songProbs),
    ]
    for wordCount, words, probs in cases:
        sentence = generateTextFromUnigrams(wordCount, words, probs)
        tokens = sentence.strip().split(" ")
        assert len(tokens) == wordCount
        for word in tokens:
            assert word in words
    print("... done!")

def testGenerateTextFromBigrams():
    """
    Check that generateTextFromBigrams() output has the right length and
    that every word is a legal start word or a legal follower.
    """
    print("Testing generateTextFromBigrams()...", end="")
    # Since we assume . is used as a stopping point, include it specifically in
    # the test set.
    firstWords = [ "hi", "dear" ]
    firstProbs = [ 0.6, 0.4]
    followers = {
        "hi" : { "words" : [",", "how"], "probs" : [0.8, 0.2] },
        "dear" : { "words" : [ "sir", "madam" ], "probs" : [0.5, 0.5] },
        "," : { "words" : ["what's", "sup", "yeet"], "probs" : [0.3, 0.3, 0.4] },
        "how" : { "words" : ["are"], "probs" : [1] },
        "sir" : { "words" : [".", ","], "probs" : [0.8, 0.2] },
        "madam" : { "words" : [ ".", ","], "probs" : [0.8, 0.2] },
        "what's" : { "words" : [ "up" ], "probs" : [1] },
        "sup" : { "words" : [ "." ], "probs" : [1] },
        "yeet" : { "words" : [ "!" ], "probs" : [1] },
        "are" : { "words" : [ "you", "yeet" ], "probs" : [0.9, 0.1] },
        "up" : { "words" : [ ".", "," ], "probs" : [0.5, 0.5] },
        "!" : { "words" : ["!", "."], "probs" : [0.7, 0.3] },
        "you" : { "words" : ["."], "probs" : [1] },
    }
    sentence = generateTextFromBigrams(10, firstWords, firstProbs, followers)
    tokens = sentence.strip().split()
    assert len(tokens) == 10
    # Starting with "." makes the first word go through the start-word check.
    previous = "."
    for word in tokens:
        if previous == ".":
            assert word in firstWords
        else:
            assert word in followers[previous]["words"]
        previous = word
    print("... done!")

def testWeek2():
    """Run every Check-in 2 test in order; a failing assert stops the run."""
    weekTwoTests = (
        testBuildUniformProbs,
        testBuildUnigramProbs,
        testBuildBigramProbs,
        testGetTopWords,
        testGenerateTextFromUnigrams,
        testGenerateTextFromBigrams,
    )
    for runTest in weekTwoTests:
        runTest()

# Run the Check-in 2 tests and then the Check-in 2 demo whenever this file is
# executed or imported (these calls are not behind a __main__ guard).
testWeek2()
doWeek2()

#### FULL ASSIGNMENT ####

import matplotlib.pyplot as plt

# Punctuation marks and common short words, each listed exactly once.
# NOTE(review): presumably meant to be passed as the ignoreList argument of
# getTopWords when graphing -- confirm against the assignment writeup.
ignore = [ ",", ".", "?", "'", '"', "-", "!", ":", ";", "by", "around", "over", 
           "a", "on", "be", "in", "the", "is", "and", "to", "of", "it", 
           "as", "an", "but", "at", "if", "so", "was", "were", "for", "this", 
           "that", "onto", "from", "not", "into" ]

def graphTop50Words(corpus):
    """
    Placeholder: not implemented yet, so it draws nothing and returns None.

    corpus: the value doWeek3 passes in, which is the result of loadBook().
    TODO(review): the name suggests this should graph the 50 most probable
    words of the corpus -- confirm against the assignment writeup.
    """
    return None

def graphTopStartWords(corpus):
    """
    Placeholder: not implemented yet, so it draws nothing and returns None.

    corpus: the value doWeek3 passes in, which is the result of loadBook().
    TODO(review): the name suggests this should graph the most probable
    sentence-starting words -- confirm against the assignment writeup.
    """
    return None

def graphTopNextWords(corpus, word):
    """
    Placeholder: not implemented yet, so it draws nothing and returns None.

    corpus: the value doWeek3 passes in, which is the result of loadBook().
    word: a single word; doWeek3 calls this with "said" and "good".
    TODO(review): the name suggests this should graph the words most likely
    to follow `word` -- confirm against the assignment writeup.
    """
    return None

def graphTopWordsSideBySide(corpus1, name1, corpus2, name2, title):
    """
    Placeholder: not implemented yet, so it draws nothing and returns None.

    corpus1, corpus2: the two loadBook() results doWeek3 passes in.
    name1, name2: labels for the two corpora ("Grimm" and "Andersen" in
        doWeek3).
    title: title string for the graph.
    TODO(review): presumably meant to compare the top words of both corpora
    in one side-by-side bar chart -- confirm against the assignment writeup.
    """
    return None

def graphTopWordsInScatterplot(corpus1, corpus2, title):
    """
    Placeholder: not implemented yet, so it draws nothing and returns None.

    corpus1, corpus2: the two loadBook() results doWeek3 passes in.
    title: title string for the graph.
    TODO(review): presumably meant to plot each top word's probability in
    corpus1 against its probability in corpus2 -- confirm against the
    assignment writeup.
    """
    return None

def doWeek3():
    """
    Load the Grimm and Andersen corpora and call every graphing function
    on them, one graph after another.
    """
    grimm = loadBook("data/grimm_clean.txt")
    andersen = loadBook("data/andersen_clean.txt")
    comparisonTitle = "Top 50 Words in Grimm vs Andersen"

    graphTop50Words(grimm)
    graphTopStartWords(grimm)
    # For each word, graph the Grimm followers first and then the Andersen ones.
    for word in ("said", "good"):
        for book in (grimm, andersen):
            graphTopNextWords(book, word)
    graphTopWordsSideBySide(grimm, "Grimm", andersen, "Andersen", comparisonTitle)
    graphTopWordsInScatterplot(grimm, andersen, comparisonTitle)

#### WEEK 3 PROVIDED CODE ####

"""
Expects a dictionary of words as keys with probabilities as values, and a title
Plots the words on the x axis, probabilities as the y axis and puts a title on top.
"""
def barPlot(dict, title):
    # Keys become the bar labels on the x axis; values become the bar heights.
    labels = [*dict.keys()]
    heights = [*dict.values()]
    plt.bar(labels, heights)
    # Vertical tick labels keep long words from overlapping each other.
    plt.xticks(labels, rotation='vertical')
    plt.title(title)
    plt.show()

"""
Expects 3 lists - one of names, and two of values such that the index of a name
corresponds to a value at the same index in both lists. Category1 and Category2
are the labels for the different colors in the graph. For example, you may use
it to graph two categories of probabilities side by side to look at the differences.
"""
def sideBySideBarPlots(names, values1, values2, category1, category2, title):
    barWidth = 0.35  # the width of the bars
    tickPositions = list(range(len(names)))  # the label locations
    fig, ax = plt.subplots()
    # Shift each pair of bars half a width left and right of its tick so
    # the two categories sit next to each other.
    leftPositions = [tick - barWidth/2 for tick in tickPositions]
    rightPositions = [tick + barWidth/2 for tick in tickPositions]
    ax.bar(leftPositions, values1, barWidth, label=category1)
    ax.bar(rightPositions, values2, barWidth, label=category2)
    ax.set_xticks(tickPositions)
    ax.set_xticklabels(names)
    ax.legend()
    plt.title(title)
    plt.xticks(rotation="vertical")
    fig.tight_layout()
    plt.show()

"""
Expects two lists of probabilities and a list of labels (words) all the same length
and plots the probabilities of x and y, labels each point, and puts a title on top.
"""
def scatterPlot(xs, ys, labels, title):
    plt.scatter(xs, ys)

    # Write each label just above the point it belongs to.
    for index, text in enumerate(labels):
        point = (xs[index], ys[index])
        plt.annotate(text,
                     point,
                     textcoords="offset points",  # xytext is an offset from the point
                     xytext=(0, 10),  # 10 units above the point
                     ha='center')  # center the text horizontally
    plt.title(title)
    # Both axes are fixed to the range 0 to 0.02.
    plt.xlim(0, 0.02)
    plt.ylim(0, 0.02)
    plt.show()

#### WEEK 3 TESTS ####

# Instead of running individual tests, check the new graph generated by doWeek3
# after you finish each function.

# Generate all Week 3 graphs whenever this file is executed or imported
# (this call is not behind a __main__ guard).
doWeek3()