import pickle
import sys
import numpy as np
import matplotlib.pyplot as plt
import numbers
import matplotlib.pylab as pylab
# One shared font size for axes titles, axis labels and legend entries.
tsize = 14.5
params = {
    'legend.fontsize': tsize,
    'axes.labelsize': tsize,
    'axes.titlesize': tsize,
}
pylab.rcParams.update(params)

# Apply the same sizes through the rc() interface as well, drop the legend
# frame, and render all text through LaTeX.
plt.rc('axes', titlesize=tsize, labelsize=tsize)
plt.rc('legend', fontsize=tsize, frameon=False)
plt.rc('text', usetex=True)
plt.rcParams["font.family"] = "Times New Roman"

def _load_pickle(path):
    """Unpickle and return the object stored in the file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inputs are read from the command line in this order:
#   argv[3] -> doubleCounts-domArchs.pkl
#   argv[4] -> alphabet.pkl
#   argv[5] -> doubleEndCount.pkl
#   argv[1] -> the "genuine" collection
#   argv[2] -> the second collection (`randa`)
# NOTE: unpickling can execute arbitrary code; only pass trusted files.
doublecount = _load_pickle(sys.argv[3])
alphabet = _load_pickle(sys.argv[4])
endc = _load_pickle(sys.argv[5])
genuine = _load_pickle(sys.argv[1])
randa = _load_pickle(sys.argv[2])

# Order both collections from shortest to longest entry.
genuine.sort(key=len)
randa.sort(key=len)

# Keep only the first 99% of entries; the cut-off index is derived from the
# size of `genuine` and applied to both collections.
nda = len(genuine)
thresh = int(0.99*nda)
genuine, randa = genuine[:thresh], randa[:thresh]


# Vocabulary size: every alphabet entry plus the '0000000' symbol.
vsize = len(alphabet) + 1

# Pseudocount added to every pair.
phi = 0.0009

# Re-key the end counts by the second element of each key and add `vsize`
# pseudocounts to each total; the '0000000' entry then gives one back.
endc2 = {}
for key in endc:
    endc2[key[1]] = endc[key] + vsize*phi
endc2['0000000'] -= phi

# NOTE(review): `start`, `totalstart`, `totalpair` and `allcount` are not
# referenced anywhere else in the visible part of this script.
with open('startDomain.pkl', 'rb') as start_file:
    start = pickle.load(start_file)

# Every domain receives phi as a potential start domain.
totalstart = len(genuine) + (vsize - 1)*phi

# All ordered pairs over the vocabulary, minus the ('0000000', '0000000') pair.
totalpair = vsize**2 - 1

allcount = np.sum(list(doublecount.values())) + phi*totalpair

allprobs = []

def start_prob(w):
    """Return the probability of `w` following the '0000000' symbol.

    This is simply `pair_prob` evaluated on the pair ('0000000', w).
    """
    leading_pair = ('0000000', w)
    return pair_prob(leading_pair)

def pair_prob(p):
    """Return the smoothed probability of the pair `p` given its first element.

    Parameters
    ----------
    p : tuple
        A (first, second) pair, used as a key into `doublecount`.

    Returns
    -------
    The pair's count plus the pseudocount `phi`, divided by the smoothed
    total stored in `endc2` for the first element. A pair missing from
    `doublecount` counts as `phi`; a first element missing from `endc2`
    uses a total of `vsize*phi`.
    """
    numerator = doublecount.get(p, 0) + phi
    # total pairs with 1st domain the same
    denominator = endc2.get(p[0], vsize*phi)
    return numerator/denominator


def _log_likelihood(da):
    """Return the log-probability of a single domain architecture `da`.

    The score is the sum of the logs of:
      * the start probability of the first domain,
      * the pair probability of every adjacent (previous, next) pair,
      * the pair probability of (last domain, '0000000').

    Logs are summed instead of multiplying the raw probabilities and taking
    the log at the end, so a long architecture cannot underflow to 0.0 and
    come out as -inf.

    Raises
    ------
    IndexError
        If `da` is empty (there is no first domain to start from).
    """
    logp = np.log(start_prob(da[0]))
    for pair in zip(da, da[1:]):
        logp += np.log(pair_prob(pair))
    # and last domain + null pair
    logp += np.log(pair_prob((da[-1], '0000000')))
    return logp


# Log-likelihood of every architecture, in the order of the input lists.
allprobs = np.array([_log_likelihood(da) for da in genuine], dtype=float)
allprobs2 = np.array([_log_likelihood(da) for da in randa], dtype=float)


# Single figure/axes pair that the rest of the script draws on.
fig, ax = plt.subplots()

# Ascending copies of the two log-likelihood samples.
gsort = np.sort(allprobs)
ssort = np.sort(allprobs2)

# Shared axis limits: the extreme values across both samples, padded by 5.
mmin = np.minimum(gsort[0], ssort[0]) - 5
mmax = np.maximum(gsort[-1], ssort[-1]) + 5
def _quantile_compat(sample, probs, how):
    """Evaluate ``np.quantile`` with estimation method `how` on any NumPy.

    NumPy 1.22 renamed the ``interpolation`` keyword to ``method`` and
    deprecated the old spelling, while older releases only accept
    ``interpolation``. Try the current keyword first and fall back.
    """
    try:
        return np.quantile(sample, probs, method=how)
    except TypeError:
        # Older NumPy rejects the unknown ``method`` keyword.
        return np.quantile(sample, probs, interpolation=how)


def qqplot(x, y, quantiles=None, interpolation='nearest', ax=None, rug=False,
           rug_length=0.05, rug_kwargs=None, **kwargs):
    """Draw a quantile-quantile plot for `x` versus `y`.

    Parameters
    ----------
    x, y : array-like
        One-dimensional numeric arrays.

    ax : matplotlib.axes.Axes, optional
        Axes on which to plot. If not provided, the current axes will be used.

    quantiles : int or array-like, optional
        Quantiles to include in the plot. This can be an array of quantiles, in
        which case only the specified quantiles of `x` and `y` will be plotted
        (the array is sorted before use). If this is an int `n`, then the
        quantiles will be `n` evenly spaced points between 0 and 1. If this is
        None, then `min(len(x), len(y))` evenly spaced quantiles between 0 and
        1 will be computed.

    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        Estimation method handed to numpy.quantile() for both samples. It is
        applied in every case, whether `quantiles` is an int, None or an
        explicit array. See the documentation for numpy.quantile().

    rug : bool, optional
        If True, draw a rug plot representing both samples on the horizontal and
        vertical axes. If False, no rug plot is drawn.

    rug_length : float in [0, 1], optional
        Specifies the length of the rug plot lines as a fraction of the total
        vertical or horizontal length.

    rug_kwargs : dict of keyword arguments
        Keyword arguments to pass to matplotlib.axes.Axes.axvline() and
        matplotlib.axes.Axes.axhline() when drawing rug plots.

    kwargs : dict of keyword arguments
        Keyword arguments to pass to matplotlib.axes.Axes.scatter() when drawing
        the q-q plot.

    Returns
    -------
    The axes object the plot was drawn on.
    """
    # Get current axes if none are provided
    if ax is None:
        ax = plt.gca()

    if quantiles is None:
        quantiles = min(len(x), len(y))

    # Turn an integer count into evenly spaced probabilities; otherwise use
    # the caller's probabilities in ascending order.
    if isinstance(quantiles, numbers.Integral):
        quantiles = np.linspace(start=0, stop=1, num=int(quantiles))
    else:
        quantiles = np.atleast_1d(np.sort(quantiles))

    # Compute quantiles of the two samples
    x_quantiles = _quantile_compat(x, quantiles, interpolation)
    y_quantiles = _quantile_compat(y, quantiles, interpolation)

    # Draw the rug plots if requested
    if rug:
        # Default rug plot settings
        rug_x_params = dict(ymin=0, ymax=rug_length, c='gray', alpha=0.5)
        rug_y_params = dict(xmin=0, xmax=rug_length, c='gray', alpha=0.5)

        # Override default setting by any user-specified settings
        if rug_kwargs is not None:
            rug_x_params.update(rug_kwargs)
            rug_y_params.update(rug_kwargs)

        # One tick per raw data point: x along the bottom, y along the left.
        for point in x:
            ax.axvline(point, **rug_x_params)
        for point in y:
            ax.axhline(point, **rug_y_params)

    # Draw the q-q plot
    ax.scatter(x_quantiles, y_quantiles, **kwargs)
    return ax


#sm.qqplot_2samples(gsort,ssort)
#ax = qqplot(gsort,ssort)


from scipy.stats import pearsonr

# Pearson correlation between the two sorted samples, printed to three
# decimals near the top-left corner of the axes.
r, p = pearsonr(gsort, ssort)
rtext = format(r, ".3f")
plt.text(
    0.1,
    0.9,
    "$r=" + rtext + "$",
    horizontalalignment='center',
    verticalalignment='center',
    transform=ax.transAxes,
)

# Sorted genuine scores against sorted simulated scores, on identical axis
# ranges, with a black corner-to-corner diagonal drawn in axes coordinates.
ax.scatter(gsort, ssort)
plt.xlim((mmin, mmax))
plt.ylim((mmin, mmax))
plt.plot([0, 1], [0, 1], transform=ax.transAxes, color='black')
plt.xlabel('genuine')
plt.ylabel('simulated')

# Output file name is built from the sixth command-line argument.
outspecies = sys.argv[6]
outname = '{}_qq.ps'.format(outspecies)
plt.savefig(outname, dpi=1200, bbox_inches="tight")
