import pickle
from scipy.stats import pearsonr
import numpy as np
import sys

# Two positional arguments are required: the paths of two pickle files.
# Presumably each holds a list of domain architectures (genuine first,
# simulated second) -- TODO confirm against whatever produces the files.
if len(sys.argv) < 3:
    # Fail with a readable usage message instead of a bare IndexError.
    sys.exit("usage: %s GENUINE_PICKLE SIMULATED_PICKLE" % sys.argv[0])

gfile = sys.argv[1]
sfile = sys.argv[2]

# NOTE(review): unpickling can execute arbitrary code, so only run this
# script on pickle files that come from a trusted source.
with open(gfile, 'rb') as f:
    genuine = pickle.load(f)

with open(sfile, 'rb') as f:
    simulated = pickle.load(f)

def countDomainTupleOccurrences(domainTuples):
    """
    Tally how often each domain tuple appears.

    domainTuples: iterable of hashable items (e.g. tuples of domain ID's)

    Return a dict mapping each distinct item to its number of occurrences,
    keyed in order of first appearance
    """
    tally = {}
    for item in domainTuples:
        tally[item] = tally.get(item, 0) + 1
    return tally

def iterate_domains(domArchs):
    """
    Generator that walks through a collection of domain architectures and
    yields their domains one at a time, architecture by architecture.
    """
    for architecture in domArchs:
        yield from architecture

def getDoubles(domArchs):
    """
    Collect every pair of adjacent domains from a list of domain
    architectures.

    domArchs: list of list of domain ID's (strings)

    Return a list of 2-tuples (of strings), in the order the pairs occur;
    architectures with fewer than two domains contribute nothing
    """
    return [
        pair
        for architecture in domArchs
        # Pairing the sequence with itself shifted by one gives the
        # consecutive (i, i+1) windows.
        for pair in zip(architecture, architecture[1:])
    ]

def getTriples(domArchs):
    """
    Collect every run of three consecutive domains from a list of domain
    architectures.

    domArchs: list of list of domain ID's (strings)

    Return a list of 3-tuples (of strings), in the order the triples occur;
    architectures with fewer than three domains contribute nothing
    """
    return [
        triple
        for architecture in domArchs
        # Three views of the sequence, offset by 0, 1 and 2, zipped together
        # give the consecutive (i, i+1, i+2) windows.
        for triple in zip(architecture, architecture[1:], architecture[2:])
    ]

def getDoublesCooccur(domArchs):
    """
    Get a list of co-occurring domain pairs from a list of domain
    architectures.

    Unlike getDoubles, the two domains do not have to be adjacent: every
    pair of positions (i, j) with i < j inside one architecture contributes
    one tuple. The two ID's in each tuple are sorted, so a pair is recorded
    the same way regardless of the order in which its domains appear.
    Pairs are not de-duplicated; a pair found several times is listed
    several times.

    domArchs: list of list of domain ID's (strings)

    Return a list of 2-tuples (of strings)
    """
    pairs = []
    for arch in domArchs:
        for i, first in enumerate(arch):
            for second in arch[i + 1:]:
                # The builtin sorted() keeps the ID's as ordinary Python
                # objects and avoids building a NumPy array for every pair.
                pairs.append(tuple(sorted((first, second))))
    return pairs

# Co-occurrence analysis: compare how often each domain pair co-occurs in
# the genuine data with how often it co-occurs in the simulated data.
gpairs = getDoublesCooccur(genuine)
spairs = getDoublesCooccur(simulated)

# Distinct pairs of each data set, as rows of an array.
g_unique_double = np.unique(gpairs, axis=0)
s_unique_double = np.unique(spairs, axis=0)

# Union of both sets of distinct pairs; the second np.unique removes the
# pairs that are present in both data sets.
combine_unique_double = np.concatenate((g_unique_double, s_unique_double))
all_unique_double = np.unique(combine_unique_double, axis=0)
alldoubles = [(first, second) for first, second in all_unique_double]

# Per-pair occurrence counts, aligned on the common pair list; a pair that
# is absent from a data set counts as 0 there.
gdoubledict = countDomainTupleOccurrences(gpairs)
sdoubledict = countDomainTupleOccurrences(spairs)
gdoubledata = [gdoubledict.get(db, 0) for db in alldoubles]
sdoubledata = [sdoubledict.get(db, 0) for db in alldoubles]


# Pearson correlation between the two count vectors: coefficient first,
# then the p-value.
r, p = pearsonr(gdoubledata, sdoubledata)
print(r)
print(p)
