from Parsing import NULLDOM

def __parseRawFields__(line, ID='sf'):
    """
    Parse the following fields from a single line of a Superfamily species text file:
        identifier (superfamily or family)
        region(s)
        sequence ID
        model ID
        species ID

    The line is split on tabs.  Columns 0-3 are read as species ID, sequence ID,
    model ID and region(s); the identifier is read from column 5 for 'sf' and
    from column 8 for 'fam'.  Multiple regions are comma-separated.

    line: string, one line from the text file.
    ID: 'sf' or 'fam', which Superfamily domain identifier to use.

    Return a list of tuples (identifier, region, sequence ID, model ID, species ID),
    where each tuple represents one domain (one region).
    Return None (after printing the error and the split fields) if the line
    has too few tab-separated columns.

    Raise ValueError if ID is neither 'sf' nor 'fam'.
    """
    if ID not in ('sf', 'fam'):
        raise ValueError("{} is not a valid identifier.  Must be 'sf' or 'fam'.".format(ID))

    # Column that holds the requested identifier
    idColumn = 5 if ID == 'sf' else 8

    # Split outside the try block so that the error handler below can always
    # print the fields; a non-string line fails here with its real error.
    fields = line.split('\t')
    try:
        speciesID = fields[0]
        seqID = fields[1]
        modelID = fields[2]
        regions = fields[3]
        identifier = fields[idColumn]
    except IndexError as e:
        # Malformed (too short) line: report it and signal failure to the caller
        print(e)
        print(fields)
        return None

    # One tuple per region of the domain assignment
    return [(identifier, region, seqID, modelID, speciesID)
            for region in regions.split(',')]

def parseRawTextFile(filename, argID='sf'):
    """
    Parse domains from a Superfamily species text file.

    The first four lines of the file are skipped (comments and header).
    Lines that __parseRawFields__ cannot parse (it returns None for them)
    are skipped instead of aborting the whole read.

    filename: string, the full path (relative or absolute) of the file to be read
    argID: 'sf' or 'fam', which Superfamily domain identifier to use.

    Returns a dictionary of domain sequences (key=sequence ID); each value is a
    list of tuples (identifier, region span, model ID, species ID).
    """
    sequences = {}
    with open(filename, 'r') as fileObj:
        # Read in first four lines to bypass comments and header
        for _ in range(4):
            fileObj.readline()
        # Read each remaining line and parse its domains into the dictionary
        for line in fileObj:
            lineContents = __parseRawFields__(line, argID)
            if lineContents is None:
                # Malformed or blank line; the parser has already reported it
                continue
            for domain in lineContents:
                # domain = (identifier, region, sequence ID, model ID, species ID)
                # setdefault inserts an empty list the first time a sequence ID is seen
                domList = sequences.setdefault(domain[2], [])
                domList.append((domain[0], domain[1], domain[3], domain[4]))

    return sequences

def sortDomains(sequences):
    """
    Order the domains of every sequence by where their region starts.

    The start is the integer before the first '-' of the region span
    (element 1 of each domain tuple).  Each list is sorted in place.

    sequences: dictionary of domain sequences

    Return the same dictionary object, now with every domain list sorted.
    """
    def regionStart(domain):
        # Region span is written as 'start-end'; keep only the start
        start, _, _ = domain[1].partition('-')
        return int(start)

    for domList in sequences.values():
        domList.sort(key=regionStart)
    return sequences

def insertEnds(sequences):
    """
    Put a null-domain marker at the front and at the back of every domain sequence.

    The lists are modified in place; the front marker has region '0-0' and the
    back marker has region '99999-99999'.

    sequences: dictionary of domain sequences

    Return the same dictionary of domain sequences, with the markers added.
    """
    # Marker layout matches a domain tuple:
    # (identifier, region span, model ID, species ID)
    leadingMarker = (NULLDOM, '0-0', '0', '--')
    trailingMarker = (NULLDOM, '99999-99999', '0', '--')

    for domains in sequences.values():
        domains.insert(0, leadingMarker)
        domains.append(trailingMarker)
    return sequences

def writeUniversalFile(filename, sequences):
    """
    Save a dictionary of domain sequences as a comma-separated universal file.

    A header row is written first, followed by one row per domain with the
    columns Species ID, Sequence ID, Identifier, Model ID, Region.

    filename: string, the full path (relative or absolute) of the file to be written
    sequences: dictionary of domain sequences (key=sequence ID)
    """
    def toRow(seqID, domain):
        # domain = (identifier, region span, model ID, species ID)
        columns = (domain[3], seqID, domain[0], domain[2], domain[1])
        return ','.join(str(column) for column in columns) + '\n'

    with open(filename, 'w') as fOut:
        # Header row
        fOut.write("Species ID,Sequence ID,Identifier,Model ID,Region" + '\n')
        # One comma-separated row per domain
        for seqID, domains in sequences.items():
            for domain in domains:
                fOut.write(toRow(seqID, domain))

def readUniversalFile(filename):
    """
    Read the universal file of domain sequences from a file (filename).

    The first line is treated as the header and skipped.  Blank lines are
    skipped as well, so that an empty line in the file does not abort the read
    with an IndexError from the row parser.

    filename: string, the full path (relative or absolute) of the file to be read

    Return dictionary of domain sequences (key=sequence ID); each value is the
    list of domain tuples produced by __parseUniversalFields__.
    """
    sequences = {}
    with open(filename, 'r') as fIn:
        fIn.readline()  # Read (and discard) header
        # Read csv rows
        for line in fIn:
            if not line.strip():
                # Blank line: nothing to parse
                continue
            key, value = __parseUniversalFields__(line)
            # setdefault inserts an empty list the first time a sequence ID is seen
            domList = sequences.setdefault(key, [])
            domList.append(value)

    return sequences

def __parseUniversalFields__(line):
    """
    Parse one line of the universal file of domain sequences.
    [internal convenience method]

    The trailing line ending is removed before splitting; otherwise it would
    stay attached to the last column (the region span).

    line: string, one comma-separated row of the universal file.

    Return the sequence ID, tuple of information for one domain (identifier, region span, model ID, species ID)
    """
    # columns -> Species ID,Sequence ID,Identifier,Model ID,Region
    s = line.rstrip('\r\n').split(',')
    # (sequence ID, (identifier, region span, model ID, species ID) )
    return s[1], (s[2], s[4], s[3], s[0])

def getDomainArchitectures(sequences):
    """
    Find the unique domain architectures from a dictionary of domain sequences (sequences).

    A domain architecture is the ordered list of identifiers (element 0 of each
    domain tuple) of one sequence, after sorting with sortDomains.

    sequences: dictionary of domain sequences
    (note: sequences is going to be sorted, but no set operation on sequences is performed.)

    Return a tuple (architectures, sequences): architectures is a list of the
    unique domain architectures (as lists, in no particular order) and
    sequences is the sorted dictionary that was passed in.
    """
    sequences = sortDomains(sequences)
    # Tuples are hashable, unlike lists, so they can be collected in a set
    # directly.  This keeps identifiers intact even if they contain spaces and
    # maps a sequence without domains to an empty architecture.
    domArchs = {tuple(domain[0] for domain in domList)
                for domList in sequences.values()}
    return [list(domArch) for domArch in domArchs], sequences

def getDomainArchitecturesAll(sequences):
    """
    Collect the domain architecture of every sequence in a dictionary of domain
    sequences (sequences), duplicates included.

    A domain architecture is the ordered list of identifiers (element 0 of each
    domain tuple) of one sequence, after sorting with sortDomains.

    sequences: dictionary of domain sequences
    (note: sequences is going to be sorted, but no set operation on sequences is performed.)

    Return a tuple (architectures, sequences): architectures is a list with one
    domain architecture (as a list) per sequence, in the iteration order of the
    dictionary, and sequences is the sorted dictionary that was passed in.
    """
    sequences = sortDomains(sequences)
    # Build the identifier lists directly; no hashable intermediate is needed
    # because duplicates are kept.
    domArchs = [[domain[0] for domain in domList]
                for domList in sequences.values()]
    return domArchs, sequences

