import pickle
from scipy.stats import pearsonr
import numpy as np
import sys

# Command line: <script> GENUINE_PICKLE SIMULATED_PICKLE
# Fail early with a usage message instead of a bare IndexError when an
# argument is missing.
if len(sys.argv) < 3:
    sys.exit("usage: {} GENUINE_PICKLE SIMULATED_PICKLE".format(sys.argv[0]))

gfile = sys.argv[1]
sfile = sys.argv[2]

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only run this script on pickle files from a trusted source.
# Both objects are later passed to the helpers below as collections of
# domain architectures (sequences of domain IDs).
with open(gfile, 'rb') as f:
    genuine = pickle.load(f)

with open(sfile, 'rb') as f:
    simulated = pickle.load(f)

def countDomainTupleOccurrences(domainTuples):
    """
    Tally how many times each item appears in an iterable.

    domainTuples: iterable of hashable items (domain IDs or tuples of them)

    Return a dict mapping each distinct item to its number of occurrences,
    with keys in order of first appearance.
    """
    tally = {}
    for item in domainTuples:
        tally[item] = tally.get(item, 0) + 1
    return tally

def iterate_domains(domArchs):
    """
    Lazily yield every domain of every architecture, in order.

    domArchs: iterable of domain architectures (each an iterable of
    domain IDs)

    The result is a generator, so it can be consumed only once.
    """
    for architecture in domArchs:
        yield from architecture

def getDoubles(domArchs):
    """
    Extract every pair of adjacent domains from each architecture.

    domArchs: list of list of domain ID's (strings)

    Return a list of 2-tuples (of strings), in architecture order and then
    left to right. Architectures with fewer than two domains contribute
    nothing; repeated pairs are kept.
    """
    width = 2
    return [
        tuple(arch[start:start + width])
        for arch in domArchs
        for start in range(len(arch) - width + 1)
    ]

def getTriples(domArchs):
    """
    Extract every run of three adjacent domains from each architecture.

    domArchs: list of list of domain ID's (strings)

    Return a list of 3-tuples (of strings), in architecture order and then
    left to right. Architectures with fewer than three domains contribute
    nothing; repeated triples are kept.
    """
    width = 3
    return [
        tuple(arch[start:start + width])
        for arch in domArchs
        for start in range(len(arch) - width + 1)
    ]

# Singletons: correlate how often each individual domain occurs in the
# genuine set with how often it occurs in the simulated set.
gdomains = iterate_domains(genuine)
sdomains = iterate_domains(simulated)
gsingleton = countDomainTupleOccurrences(gdomains)
ssingleton = countDomainTupleOccurrences(sdomains)

# One data point per domain seen in the genuine set; a domain missing from
# the simulated set counts as 0.
# NOTE(review): domains that occur only in the simulated set are ignored
# here, whereas the pair/triple sections use the union of both sets --
# confirm this asymmetry is intended.
gsingledata = list(gsingleton.values())
ssingledata = [ssingleton.get(dom, 0) for dom in gsingleton]

r, p = pearsonr(gsingledata, ssingledata)
print(r)
print(p)

# Pairs: correlate the occurrence counts of adjacent domain pairs in the
# genuine set with their counts in the simulated set.
gdoubles = getDoubles(genuine)
sdoubles = getDoubles(simulated)
gdoubledict = countDomainTupleOccurrences(gdoubles)
sdoubledict = countDomainTupleOccurrences(sdoubles)

# Every pair seen in either set, built directly from the count dicts.
# A round trip through NumPy string arrays breaks when one set has no
# pairs at all (the arrays no longer have matching dimensions) and turns
# non-string IDs into strings that no longer match the dict keys.
# Sorting keeps the data-point order deterministic between runs.
alldoubles = sorted(set(gdoubledict) | set(sdoubledict))

# A pair absent from one set counts as 0 there.
gdoubledata = [gdoubledict.get(db, 0) for db in alldoubles]
sdoubledata = [sdoubledict.get(db, 0) for db in alldoubles]

r, p = pearsonr(gdoubledata, sdoubledata)
print(r)
print(p)

# Triples: correlate the occurrence counts of adjacent domain triples in
# the genuine set with their counts in the simulated set.
gtriples = getTriples(genuine)
striples = getTriples(simulated)
gtripledict = countDomainTupleOccurrences(gtriples)
stripledict = countDomainTupleOccurrences(striples)

# Every triple seen in either set, built directly from the count dicts.
# A round trip through NumPy string arrays breaks when one set has no
# triples at all (the arrays no longer have matching dimensions) and turns
# non-string IDs into strings that no longer match the dict keys.
# Sorting keeps the data-point order deterministic between runs.
alltriples = sorted(set(gtripledict) | set(stripledict))

# A triple absent from one set counts as 0 there.
gtripledata = [gtripledict.get(tp, 0) for tp in alltriples]
stripledata = [stripledict.get(tp, 0) for tp in alltriples]

r, p = pearsonr(gtripledata, stripledata)
print(r)
print(p)

# Pair co-occurrence: for every domain, the number of distinct domains it
# shares at least one architecture with, genuine versus simulated.
def _cooccurring_domains(domArchs):
    """
    Map each domain to the set of domains found alongside it.

    domArchs: list of list of domain ID's (strings)

    Return a dict of sets. For each position in an architecture, every
    domain at any *other* position of the same architecture is a partner,
    so a domain is its own partner only when it appears more than once in
    one architecture. Keys are in order of first appearance.
    """
    partners = {}
    for arch in domArchs:
        for pos, dom in enumerate(arch):
            others = partners.setdefault(dom, set())
            # Everything before and after this position, skipping only the
            # position itself.
            others.update(arch[:pos])
            others.update(arch[pos + 1:])
    return partners


g_cooc = _cooccurring_domains(genuine)
s_cooc = _cooccurring_domains(simulated)

# One data point per domain seen in the genuine set; a domain missing from
# the simulated set has 0 partners. Both series are filled in one pass so
# they stay aligned.
gcoocdata = []
scoocdata = []
for d in g_cooc:
    gcoocdata.append(len(g_cooc[d]))
    scoocdata.append(len(s_cooc.get(d, ())))

r, p = pearsonr(gcoocdata, scoocdata)
print(r)
print(p)

# Tandem array length: for every domain, the mean length of its tandem
# arrays (runs of two or more consecutive copies), genuine versus
# simulated.
def _tandem_run_lengths(domArchs):
    """
    Collect the lengths of all runs of consecutive identical domains.

    domArchs: list of list of domain ID's (strings)

    Return a dict mapping each domain ID to a list holding one length per
    run of that domain; an isolated copy is a run of length 1. Empty
    architectures contribute nothing.
    """
    run_lengths = {}
    for arch in domArchs:
        run_dom = None
        # 0 means "no run open yet"; this avoids a sentinel domain value
        # that could collide with a real ID or leak into the result.
        run_len = 0
        for dom in arch:
            if run_len and dom == run_dom:
                run_len += 1
            else:
                if run_len:
                    run_lengths.setdefault(run_dom, []).append(run_len)
                run_dom = dom
                run_len = 1
        # Close the run that reaches the end of the architecture.
        if run_len:
            run_lengths.setdefault(run_dom, []).append(run_len)
    return run_lengths


def _mean_tandem_length(runs, min_copies=2):
    """
    Return the mean length of the runs with at least min_copies copies,
    or 0.0 when there is no such run.
    """
    tandems = [length for length in runs if length >= min_copies]
    return np.mean(tandems) if tandems else 0.0


tandem_lens_g = _tandem_run_lengths(genuine)
tandem_lens_s = _tandem_run_lengths(simulated)

# Mean tandem length per domain seen in the genuine set; a domain missing
# from the simulated set gets 0.
two_lens_g = {
    d: _mean_tandem_length(runs) for d, runs in tandem_lens_g.items()
}
two_lens_s = {
    d: _mean_tandem_length(tandem_lens_s.get(d, ())) for d in tandem_lens_g
}

# NOTE(review): there is one data point per domain *occurrence* in the
# genuine set, so frequent domains are weighted by their copy number and
# the sample size behind the p-value is inflated. The other sections use
# one point per distinct domain/tuple -- confirm this weighting is intended.
tandem_data_g = [two_lens_g[d] for d in iterate_domains(genuine)]
tandem_data_s = [two_lens_s[d] for d in iterate_domains(genuine)]

r, p = pearsonr(tandem_data_g, tandem_data_s)
print(r)
print(p)
