''' PPM-C ''' import math import sys import requests username = sys.argv[-1] k = 3 address ="http://multi6.aladdin.cs.cmu.edu:3001/algoreal" response = requests.get(address, params={"andrew": username}) str = response.text emissions = [] tables = [{} for i in range(0, k + 1)] tables[0] = {'': {}} def incr_count(c, context): ctxlength = len(context) if context not in tables[ctxlength]: tables[ctxlength][context] = {} if c in tables[ctxlength][context]: count = tables[ctxlength][context][c] + 1 else: count = 1 tables[ctxlength][context][c] = count def update_tables(c, context): for i in xrange(0, len(context) + 1): incr_count(c, context[i:]) def prob(c, table, omits): modtable = table.copy() for o in omits: modtable.pop(o, None) escapecount = len(modtable.keys()) fullcount = sum(modtable.values()) + escapecount if c != None: charcount = modtable[c] else: charcount = escapecount # Escape is when c=None if charcount == fullcount: return 1 # Implicit return float(charcount) / fullcount def emit(c, p): if c == None: c = "<$>" if p < 1: emissions.append((c, p)) def encode(c, fullcontext): omits = set([]) success = False for j in xrange(0, len(fullcontext) + 1): context = fullcontext[j:] ctxlength = len(context) if context in tables[ctxlength]: if c in tables[ctxlength][context]: emit(c, prob(c, tables[ctxlength][context], omits)) success = True break else: # escape emit(None, prob(None, tables[ctxlength][context], omits)) omits = omits.union(tables[ctxlength][context].keys()) # Include to omits else: # Context not found - so no need to emit anything continue if not success: # Previously unseen key emit(c, 1.0 / (256 - len(tables[0][''].keys()))) update_tables(c, fullcontext) for i in xrange(0, len(str)): encode(str[i], str[max(i - k, 0): i]) if k == 3: print "-----beginning-----" for e in emissions[:12]: print "%s,%f" % (e[0], e[1]) print "-----from the end----" for e in emissions[-12:]: print "%s,%f" % (e[0], e[1]) #print "k=%d: Total bits %f" % (k, sum(-math.log(e[1])/math.log(2) for e in emissions)) # With spaces removes for k in xrange(1, 6): emissions = [] tables = [{} for i in range(0, k + 1)] tables[0] = {'': {}} for i in xrange(0, len(str)): encode(str[i], str[max(i - k, 0): i]) print "k=%d Total bits %f" % (k, sum(-math.log(e[1])/math.log(2) for e in emissions))