# CRISPR_Studio_1.0.py

#!/usr/bin/env python

# CRISPRStudio is a python software developed to generate CRISPR array figures


import sys, os, argparse, re, subprocess, random
from collections import OrderedDict, Counter
import pandas as pd


# The helper nonBlankLine yields the non-empty lines of a file, and the function
# spacerExtract extracts the spacer information contained in the gff3 file
# generated with CRISPRDetect

def nonBlankLine(f):
    '''
        Yield every line of the iterable f with its trailing whitespace
        removed, leaving out the lines that are empty once stripped.
    '''
    stripped = (raw.rstrip() for raw in f)
    for text in stripped:
        if not text:
            continue
        yield text

def spacerExtract(inFile):
    '''
        function:
            - read the gff file generated with CRISPRDetect and collect every
              spacer ('binding_site' or 'Spacer' feature) it describes
            - spacers that follow a repeat_region already seen earlier in the
              file (same isolate, start and end) are ignored
        input:
            - inFile is the gff file generated with CRISPRDetect
        output:
            - spacerDict -> dict structured (spacerDict[sId]=[sSeq, sLen, orientation])
            - isolateList -> list of all the isolates (no duplicates, file order)
            - averageLength -> the average length of the spacers
            - crArrayList -> one crId per spacer feature
              (isolate||spacer name||array name)
        raises:
            - ZeroDivisionError when the file contains no usable spacer
    '''

    spacerDict = OrderedDict()
    isolateList = []
    crArrayList = []
    spacerLength = []  # lengths of the kept spacers, used for the average
    seenRepeatRegions = set()
    newRepeatRegion = True

    # Plain 'r' is used because the 'U' flag was removed in Python 3.11;
    # every line is stripped below, so stray '\r' characters are harmless.
    with open(inFile, 'r') as fl:
        for rawLine in fl:
            line = rawLine.strip()
            # Skip blank lines and gff comment/header lines ('##gff-version 3', ...)
            if not line or line.startswith('#'):
                continue
            array = line.split('\t')
            isolate = array[0]
            featureType = array[2]

            if featureType == 'repeat_region':
                repeatRegion = isolate+'||'+array[3]+'_'+array[4]
                # A repeat region reported twice switches off the spacers that follow it
                newRepeatRegion = repeatRegion not in seenRepeatRegions
                seenRepeatRegions.add(repeatRegion)
                continue

            if not newRepeatRegion or featureType not in ('binding_site', 'Spacer'):
                continue

            if isolate not in isolateList:
                isolateList.append(isolate)
            orientation = array[6]
            sArray = array[8].split(';')
            spacerName = sArray[0][3:]  # first attribute without its 'ID=' prefix
            crId = '||'.join([isolate, spacerName])+'||'+sArray[0].split('_')[0][3:]
            crArrayList.append(crId)
            for it in sArray:
                if it.startswith('Note='):
                    sSeq = it[5:]
                    sLen = int(array[5])
                    sId = '||'.join([isolate, spacerName])+':'+str(sLen)+':'+orientation
                    # Store the information in a dictionary
                    spacerDict[sId] = [sSeq, sLen, orientation]
                    spacerLength.append(sLen)

    averageLength = sum(spacerLength)/float(len(spacerLength))

    return(spacerDict, isolateList, averageLength, crArrayList)

# The function checkSpacer checks for erroneous spacers identified by CRISPRDetect by searching
# for unusually short or long spacers compared to the average spacers length of the dataset.

def checkSpacer(spacerDict, averageLength, outFasta):
    '''
        definition:
            look for spacers whose length is more than 1.5 times the average
            length or less than the average length divided by 1.5

        input:
            spacerDict -> generated by spacerExtract()
            averageLength -> averageLength of the spacers
            outFasta -> fasta file named in the warning message

        output:
            nothing is returned; when at least one suspicious spacer is
            found a warning listing them is printed and the program is
            stopped through sys.exit()
    '''

    upperLimit = averageLength*float(1.5)
    lowerLimit = averageLength/float(1.5)
    suspects = []
    for sId, info in spacerDict.items():
        length = info[1]
        if length > upperLimit or length < lowerLimit:
            suspects.append((sId, length))

    # Nothing unusual: let the pipeline carry on
    if not suspects:
        return

    print('\n#####################\n\n\
        WARNING:\n\
        It was detected that the following spacer(s) is(are) at least 1.5 time longer or shorter than the average spacer size (%s): \
        \n\n\t\tSpacerId\t\t\t\t\t\tLength\n' % averageLength)
    for sId, length in suspects:
        print('\t\t%s\t%s'  % (sId, length))
    print('\n\tPlease verify that these spacers are correct in the fasta file %s\n\t\
and rerun the script by specifying the verified fasta file with the flag -f\n\n\
#####################' % outFasta)
    sys.exit()

# The function writeFasta prepares a fasta file with the extracted spacers by reformatting the information

def writeFasta(spacerDict, outFasta):
    '''
        Write the spacers of spacerDict to the file outFasta in fasta format:
        the spacer id is the header and the first element of its entry
        (the sequence) is the record body.
    '''
    with open(outFasta, 'w') as handle:
        handle.writelines(
            '>%s\n%s\n' % (sId, info[0]) for sId, info in spacerDict.items()
        )

def readCorrectedFasta(correctedFasta):
    '''
        function:
            - read back a (manually verified) spacer fasta file whose headers
              follow the pattern '>isolate||spacerName:length:orientation'
        input:
            - correctedFasta is the path of the fasta file
        output:
            - spacerDict -> dict structured (spacerDict[sId]=[sSeq, sLen, orientation])
            - isolateList -> list of all the isolates (no duplicates, file order)
            - averageLength -> the average of the lengths stated in the headers
            - crArrayList -> one 'isolate||arrayName' entry per header
        raises:
            - ValueError when a sequence line comes before any header
            - ZeroDivisionError when the file contains no header
    '''
    spacerDict = OrderedDict()
    isolateList = []
    crArrayList = []
    spacerLength = []  # one length per header, used for the average
    sId = None
    # Plain 'r' is used because the 'U' flag was removed in Python 3.11
    with open(correctedFasta, 'r') as fl:
        for rawLine in fl:
            line = rawLine.strip()
            # A blank line must not overwrite the sequence of the previous spacer
            if not line:
                continue
            if line.startswith('>'):
                array = line.split(':')
                sId = line[1:]
                orientation = str(array[2])
                sLen = int(array[1])
                isolate = sId.split('||')[0]
                crId = isolate+'||'+sId.split('||')[1].split('_')[0]
                crArrayList.append(crId)
                # Keep one entry per isolate, like spacerExtract() does
                if isolate not in isolateList:
                    isolateList.append(isolate)
                spacerLength.append(sLen)
            else:
                if sId is None:
                    raise ValueError('sequence found before any fasta header in %s' % correctedFasta)
                spacerDict[sId] = [line, sLen, orientation]
    averageLength = sum(spacerLength)/float(len(spacerLength))

    return spacerDict, isolateList, averageLength, crArrayList

# The function fastaAlign performs the local alignment using the fasta36 software.

def fastaAlign(outFasta):
    '''
        function:
            - align the spacers of outFasta against themselves with fasta36
              (tabular output, '-m 8') and store the result in
              outFasta + '_fasta36'
            - three executables are tried in turn: the copy shipped in
              tools/fasta-36.3.8g/bin next to the script, 'fasta36' from the
              PATH, then 'fasta36.exe' from the PATH
        input:
            - outFasta is the fasta file holding the spacers
        output:
            - nothing is returned; the program is stopped with exit status 1
              when none of the executables could be run successfully
    '''
    outfmt = str(8)  # output fasta alignment result in a tabular format
    outFile = outFasta+'_fasta36'
    scriptDir = os.path.dirname(os.path.realpath(sys.argv[0]))
    candidates = [
        os.path.join(scriptDir, 'tools/fasta-36.3.8g/bin/fasta36'),
        'fasta36',
        'fasta36.exe',
    ]

    print('Aligning the spacers with fasta36 aligner')
    for executable in candidates:
        if executable == 'fasta36.exe':
            print('fasta36 not found, trying with fasta36.exe')
            print("Aligning the spacers with fasta36.exe aligner")
        try:
            # The command is passed as a list (no shell) so that file names
            # holding spaces or shell metacharacters are handled safely.
            with open(outFile, 'w') as out:
                subprocess.check_call([executable, '-m', outfmt, outFasta, outFasta], stdout=out)
        except (OSError, subprocess.CalledProcessError):
            # Executable missing or alignment failed: try the next candidate
            continue
        return

    print("##### ERROR #####\nfasta36 alignment program was not found, please make sure that the latest version of fasta is installed on your system and that the folder with the executable was added in your path\nThe sofware can be downloaded at http://faculty.virginia.edu/wrpearson/fasta/CURRENT/")
    sys.exit(1)

# The function extractMatch finds identical spacers by filtering the alignment results
# and keeping only spacer pairs with a number of mismatches smaller than or equal to a cutoff

def extractMatch(alnFile, cutoff, spacerDict):
    '''
        function:
            - filter the tabular fasta36 alignment and write the matching
              spacer pairs (one 'query<TAB>subject' per line) to
              alnFile + '.spacermatch'
            - a pair is kept when
              queryLength - (alignmentLength - gaps - mismatches) <= cutoff
              and the query differs from the subject
            - every spacer is then written once against itself
        input:
            - alnFile is the tabular alignment file (columns 1, 2, 4, 5 and 6
              are read as query, subject, alignment length, mismatches, gaps)
            - cutoff is the highest accepted score
            - spacerDict -> dict structured (spacerDict[sId]=[sSeq, sLen, ...])
        output:
            - nothing is returned
        raises:
            - KeyError when a query of the alignment is missing from spacerDict
    '''
    # Plain 'r' is used because the 'U' flag was removed in Python 3.11.
    # Both files are context-managed so the result is flushed and closed.
    with open(alnFile, 'r') as bl, open(alnFile+'.spacermatch', 'w') as outFile:
        for line in bl:
            array = line.strip().split()
            # Skip blank lines and comment lines
            if not array or array[0].startswith('#'):
                continue
            query = array[0]
            subject = array[1]
            qLen = spacerDict[query][1]
            alnLen = int(array[3])
            mis = int(array[4])
            gap = int(array[5])
            score = qLen - (alnLen - gap - mis)  # formula to calculate the number of mismatches
            if score <= cutoff and query != subject:
                outFile.write('%s\t%s\n' % (query, subject))
        # fasta36 does not report the query vs itself, so the self vs self
        # pairs are added here for every spacer
        for spacer in spacerDict:
            outFile.write('%s\t%s\n' % (spacer, spacer))

# The function makeCluster group matching spacers to form clusters of spacers
# For more information on the clustering algorithm, please visit : https://micans.org/mcl/

def makeCluster(spacermatch):
    '''
        function:
            - group the spacer pairs listed in the file spacermatch into
              clusters and write one cluster per line (members separated by
              a space) to spacermatch + '.mcl'
            - for each pair: when the first spacer already belongs to one or
              more clusters the second one is added to them; otherwise, when
              the second spacer is already clustered the first one is added
              to its clusters; otherwise a new cluster is opened
        input:
            - spacermatch is the path of the file written by extractMatch()
        output:
            - nothing is returned

        NOTE(review): two clusters that end up sharing a spacer are not
        merged (this matches the previous behaviour); a spacer can therefore
        be listed in more than one cluster.
    '''
    print('Clustering the spacers')
    with open(spacermatch) as fl:
        pairs = [fields for fields in (line.split() for line in fl) if len(fields) >= 2]

    clusters = []    # member lists, in creation order
    membership = {}  # spacer -> indexes of the clusters that contain it

    def _attach(anchor, newcomer):
        # Add newcomer to every cluster of anchor that does not hold it yet
        newcomerClusters = membership.setdefault(newcomer, set())
        for idx in membership[anchor]:
            if idx not in newcomerClusters:
                clusters[idx].append(newcomer)
                newcomerClusters.add(idx)

    for fields in pairs:
        first, second = fields[0], fields[1]
        if first in membership:
            _attach(first, second)
        elif second in membership:
            _attach(second, first)
        else:
            # A self pair opens a cluster holding the spacer only once
            members = [first] if first == second else [first, second]
            clusters.append(members)
            for spacer in members:
                membership.setdefault(spacer, set()).add(len(clusters)-1)

    # The context manager makes sure the clusters are flushed to disk
    with open(spacermatch+'.mcl', 'w') as outFile:
        for members in clusters:
            outFile.write(' '.join(members)+'\n')
                        
# The function gen_hex_colour_code generates a random three-digit hexadecimal string which is used as a shorthand hexadecimal color

def gen_hex_colour_code():
    '''
        Return a string made of three characters, each drawn at random
        from the lowercase hexadecimal digits.
    '''
    hexDigits = '0123456789abcdef'
    drawn = []
    for _ in range(3):
        drawn.append(random.choice(hexDigits))
    return str(''.join(drawn))

# And the function attributeClsColor assigns two random colors (background and front) to each cluster

def attributeClsColor(mclFile, spacerDict):
    '''
        function:
            - give every cluster (one line of mclFile) a random background
              color and a random front color, and record them
            - each spacer entry of spacerDict is extended once with
              [background color, front color, cluster name]; a spacer listed
              in several clusters keeps the values of the first one
            - one 'background<TAB>front<TAB>members...' line per cluster is
              written to mclFile + '.col'
        input:
            - mclFile is the cluster file written by makeCluster()
            - spacerDict -> dict structured (spacerDict[sId]=[sSeq, sLen, orientation])
        output:
            - spacerDict (the same object, updated in place)
        raises:
            - KeyError when a clustered spacer is missing from spacerDict
    '''
    # '999' and 'fff' are the gray and white used for grayed-out spacers.
    # The comparison is made in lowercase because gen_hex_colour_code()
    # only produces lowercase digits ('FFF' could never match before).
    reserved = ('999', 'fff')

    def _freshColour():
        colour = gen_hex_colour_code()
        while colour.lower() in reserved:
            colour = gen_hex_colour_code()
        return colour

    assigned = set()  # spacers that already received their colors
    # Plain 'r' is used because the 'U' flag was removed in Python 3.11
    with open(mclFile, 'r') as fl, open(mclFile+'.col', 'w') as outFile:
        for cl, line in enumerate(fl, 1):
            clName = 'cls_%04d' % cl
            array = line.strip().split()
            colFr = _freshColour()  # front color of the spacer = diamond shape
            colBk = _freshColour()  # background color of the spacer = square
            for it in array:
                if it not in assigned:
                    spacerDict[it].extend([str(colBk), str(colFr), clName])
                    assigned.add(it)
            outFile.write('%s\t%s\t%s\n' % (colBk, colFr, '\t'.join(array)))
    return spacerDict

# The function attributeClsColorRerun is executed when the -r option is added to the command. It will keep the 
# colors initially attributed, to obtain reproducible results

def attributeClsColorRerun(mclFile, spacerDict, finalFile):
    '''
        function:
            - same job as attributeClsColor(), but a cluster holding a spacer
              already listed in finalFile reuses the colors recorded there,
              so that a rerun gives the same colors
            - clusters without any known spacer (or whose recorded colors are
              the white/gray pair FFF/999) receive new random colors that
              are not among the colors read from finalFile
            - each spacer entry of spacerDict is extended once with
              [background color, front color, cluster name]
            - one 'background<TAB>front<TAB>members...' line per cluster is
              written, in cluster order, to mclFile + '.col'
        input:
            - mclFile is the cluster file written by makeCluster()
            - spacerDict -> dict structured (spacerDict[sId]=[sSeq, sLen, orientation])
            - finalFile is the summary table of the previous run (columns 1,
              5 and 6 are read as spacer id, background and front colors)
        output:
            - spacerDict (the same object, updated in place)
        raises:
            - KeyError when a clustered spacer is missing from spacerDict
    '''
    # Compared in lowercase: gen_hex_colour_code() only yields lowercase digits
    usedFr = set(['999', 'fff'])
    usedBk = set(['999', 'fff'])
    colDict = {}
    # Plain 'r' is used because the 'U' flag was removed in Python 3.11
    with open(finalFile, 'r') as fl:
        for line in fl:
            array = line.strip().split()
            if not array:
                continue
            colBk = array[4]
            colFr = array[5]
            usedBk.add(colBk.lower())
            usedFr.add(colFr.lower())
            colDict[array[0]] = [colBk, colFr]

    def _freshColour(used):
        colour = gen_hex_colour_code()
        while colour.lower() in used:
            colour = gen_hex_colour_code()
        return colour

    assigned = set()  # spacers that already received their colors
    written = set()   # lines already sent to the .col file
    with open(mclFile, 'r') as fl, open(mclFile+'.col', 'w') as outFile:
        for cl, line in enumerate(fl, 1):
            clName = 'cls_%04d' % cl
            array = line.strip().split()
            # First member of the cluster that was colored in the previous run
            known = next((spacer for spacer in array if spacer in colDict), None)
            if known is not None:
                colBk, colFr = colDict[known]
                # A white/gray pair is not a real cluster color: draw a new one
                if colBk.upper() == 'FFF' and colFr == '999':
                    colFr = _freshColour(usedFr)
                    colBk = _freshColour(usedBk)
            else:
                colFr = _freshColour(usedFr)
                colBk = _freshColour(usedBk)

            for spacer in array:
                if spacer not in assigned:
                    spacerDict[spacer].extend([colBk, colFr, clName])
                    assigned.add(spacer)

            item = '%s\t%s\t%s\n' % (colBk, colFr, '\t'.join(array))
            # item already ends with a newline; identical lines are written once
            if item not in written:
                written.add(item)
                outFile.write(item)
    return spacerDict

# The function reformatData writes a summary table with the spacer id, sequence, length, orientation
# front and background colors and cluster it belongs to

def reformatData(spacerDict, outFile):
    '''
        function:
            - regroup the spacers per isolate and per CRISPR array and write
              the summary table (spacer id followed by the content of its
              spacerDict entry, tab separated) to outFile
        input:
            - spacerDict -> dict whose keys look like
              'isolate||arrayName_...' and whose values are lists; every
              element from the third one on must be a string
            - outFile is the path of the summary table to write
        output:
            - finalDict -> finalDict[isolate][arrayName] = list of
              [spacer id] + spacerDict entry, in spacerDict order
            - maxSpacerCount -> highest number of consecutive spacers found
              in one array
            - maxLociCount -> highest number of arrays found for one isolate
    '''
    finalDict = OrderedDict()
    curIsolate = ""
    curCRid = ''
    maxSpacerCount = 0
    maxLociCount = 0
    spacerCount = 0
    lociCount = 1
    for spacer in spacerDict:
        idParts = spacer.split('||')
        isolate = idParts[0]
        CRid = idParts[1].split('_')[0]
        apList = [spacer] + spacerDict[spacer]
        if isolate != curIsolate:
            # New isolate: close the counters of the previous one
            maxSpacerCount = max(maxSpacerCount, spacerCount)
            maxLociCount = max(maxLociCount, lociCount)
            spacerCount = 1
            lociCount = 1
            curIsolate = isolate
            curCRid = CRid
            finalDict[isolate] = OrderedDict()
            finalDict[isolate][CRid] = [apList]
        elif CRid == curCRid:
            spacerCount += 1
            finalDict[isolate][CRid].append(apList)
        else:
            # New array of the same isolate
            maxSpacerCount = max(maxSpacerCount, spacerCount)
            spacerCount = 1
            lociCount += 1
            curCRid = CRid
            finalDict[isolate][CRid] = [apList]
    # The last array and the last isolate are never followed by a change of
    # id, so their counters have to be compared here
    maxSpacerCount = max(maxSpacerCount, spacerCount)
    maxLociCount = max(maxLociCount, lociCount)

    with open(outFile, 'w') as out:
        for isolate in finalDict:
            for cr in finalDict[isolate]:
                for entry in finalDict[isolate][cr]:
                    out.write('%s\t%s\t%s\t%s\n' % (entry[0], entry[1], str(entry[2]), '\t'.join(entry[3:])))
    return finalDict, maxSpacerCount, maxLociCount
    
def classifyCluster(isolateList, spacermatch):
    '''
        function:
            - count, for every (query isolate, subject isolate) couple, the
              number of spacer pairs listed in the file spacermatch and
              write the counts to spacermatch + '.score'
            - turn the counts into a Bray-Curtis distance matrix between the
              subject isolates and cluster them (average linkage)
        input:
            - isolateList is not used by this function
            - spacermatch is the path of the tab separated spacer pair file
              written by extractMatch()
        output:
            - orderedIsolates -> the isolate names in the leaf order of the
              dendrogram (the 'ivl' entry returned by scipy's dendrogram)

        NOTE(review): the previous code also carried a per-isolate line
        counter meant to overwrite the isolate-vs-itself scores, but it
        compared its cursor with the literal string 'qmatch' and never
        updated it, so the overwrite never took place. It is left out here
        so that the scores, and therefore the isolate order, stay exactly
        as they were; confirm the intended self score before adding it back.
    '''
    pairscore = {}

    with open(spacermatch) as fl:
        for line in fl:
            array = line.strip().split('\t')
            qmatch = array[0].split('||')[0]
            smatch = array[1].split('||')[0]
            pair = qmatch+'||'+smatch
            pairscore[pair] = pairscore.get(pair, 0) + 1

    with open(spacermatch+'.score', 'w') as fl:
        for pair in pairscore:
            fl.write('%s\t%s\t%i\n' % (pair.split('||')[0], pair.split('||')[1], pairscore[pair]))

    # The score table is read back through a context manager so that the
    # handle is closed once pandas is done with it
    with open(spacermatch+'.score', 'r') as scoreFile:
        df = pd.read_table(scoreFile, sep='\t', names=['qmatch','smatch','score'])

    # Rows are the query isolates, columns the subject isolates;
    # couples without any match get a score of 0
    df_matrix = df.pivot(index='qmatch', columns='smatch', values='score')
    df_matrix_adjusted = df_matrix.fillna(0)

    from skbio.stats.distance import DistanceMatrix
    from numpy import zeros

    def bray_curtis_distance(table, sample1_id, sample2_id):
        # Sum of the absolute differences divided by the sum of all counts
        numerator = 0
        denominator = 0
        sample1_counts = table[sample1_id]
        sample2_counts = table[sample2_id]
        for sample1_count, sample2_count in zip(sample1_counts, sample2_counts):
            numerator += abs(sample1_count - sample2_count)
            denominator += sample1_count + sample2_count
        return numerator / denominator

    def table_to_distances(table, pairwise_distance_fn):
        # Fill a symmetric matrix with the distance of every couple of columns
        sample_ids = table.columns
        num_samples = len(sample_ids)
        data = zeros((num_samples, num_samples))
        for i, sample1_id in enumerate(sample_ids):
            for j, sample2_id in enumerate(sample_ids[:i]):
                data[i,j] = data[j,i] = pairwise_distance_fn(table, sample1_id, sample2_id)
        return DistanceMatrix(data, sample_ids)

    bc_dm = table_to_distances(df_matrix_adjusted, bray_curtis_distance)

    from scipy.cluster.hierarchy import average, dendrogram
    lm = average(bc_dm.condensed_form())
    d = dendrogram(lm, labels=bc_dm.ids, orientation='right', link_color_func=lambda x: 'black')
    orderedIsolates = d['ivl']
    return orderedIsolates

# The function gray is executed when the -g option is added to the command. It will gray unique spacers.

def gray(finalDict, isolateList):
    '''
        Return the list of the cluster names (seventh element of a spacer
        entry) that are met exactly once among the spacers of the isolates
        of isolateList, in order of first appearance.
    '''
    occurrences = Counter(
        entry[6]
        for isolate in isolateList
        for locus in finalDict[isolate]
        for entry in finalDict[isolate][locus]
    )
    return [cluster for cluster, seen in occurrences.items() if seen == 1]

# The function generateSVG writes an SVG file containing the necessary information to generate a vector image
# which can be visualized in graphics editor software.
            
def generateSVG(finalDict, outFile, maxSpacerCount, maxLociCount, isolateList, grayList):
    '''
        function:
            - write the SVG figure: one row per isolate (10 units apart), one
              block of columns per array CRISPR1..CRISPR<maxLociCount>, each
              spacer drawn as a square (background color) holding a diamond
              (front color)
            - spacers whose cluster is listed in grayList are drawn white
              (FFF) with a gray (999) diamond
            - the arrays are drawn from right to left; the stored order is
              used for the arrays in '-' orientation and the reverse order
              for the others
        input:
            - finalDict -> generated by reformatData(); it is only read
            - outFile is the path of the SVG file to write
            - maxSpacerCount is not used by this function
            - maxLociCount is the number of array columns to draw
            - isolateList gives the isolates and their order; isolates
              missing from finalDict only get their name written
            - grayList is the list of the cluster names to gray
        output:
            - nothing is returned; the program is stopped through sys.exit()
              when a spacer entry does not hold exactly 7 elements
    '''
    textTpl = '<text x="%s" y="%s" width="1.6" height="4" font-family="Verdana" font-size="7" fill="black">%s</text>\n'
    rectTpl = '<rect x="%s" y="%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s" />\n'
    polyTpl = '<polygon points="%s,%s %s,%s %s,%s %s,%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s"/>\n'

    graySet = set(grayList)
    lociList = ['CRISPR'+str(num) for num in range(1, maxLociCount+1)]
    cr_xaxis = 500
    with open(outFile, 'w') as svg:
        svg.write('<svg>\n')
        for i, crLocus in enumerate(lociList):
            count_yaxis = 0
            maxLen = 0
            for isolate in isolateList:
                if i == 0:
                    # write strain name next to colored squares
                    svg.write(textTpl % (cr_xaxis, count_yaxis + 6.5, isolate))
                if isolate in finalDict and crLocus in finalDict[isolate]:
                    spacers = finalDict[isolate][crLocus]
                    maxLen = max(maxLen, len(spacers))
                    # A new list is built for the drawing order so that the
                    # caller's finalDict is left untouched (it used to be
                    # reversed in place, which flipped the '-' arrays again
                    # on every further drawing of the same data)
                    if spacers[0][3] == '-':
                        drawOrder = spacers
                    else:
                        drawOrder = spacers[::-1]
                    count_xaxis = cr_xaxis - 7.5
                    for entry in drawOrder:
                        if len(entry) != 7:
                            print('#####################\n  ERROR:  This spacer was not attributed to a cluster\n    Please verify the gff file or the fasta file and rerun CRISPR_Studio\n    %s\n#####################' % (entry,))
                            sys.exit()
                        if entry[6] in graySet:
                            colBk, colFr = 'FFF', '999'
                        else:
                            colBk, colFr = entry[4], entry[5]
                        # square coordinates and color
                        svg.write(rectTpl % (count_xaxis, count_yaxis, colBk))
                        # diamond coordinates and color
                        svg.write(polyTpl % (count_xaxis + 1, count_yaxis + 3.75,
                                             count_xaxis + 3.75, count_yaxis + 1,
                                             count_xaxis + 6.5, count_yaxis + 3.75,
                                             count_xaxis + 3.75, count_yaxis + 6.5,
                                             colFr))
                        count_xaxis -= 7.5
                count_yaxis += 10
            # next array block starts left of the longest array of this one
            cr_xaxis += (maxLen * -7.5) -20
        svg.write('</svg>')

# The function appendSVGfile is executed when the -r option is added (in combination with the attributeClsColorRerun function)
# It will append a preexisting svg file with new sequences, by keeping the original svg the same and adding new squares at the bottom.

def appendSVGfile(finalDict, svgBkup, outFile, maxSpacerCount, maxLociCount, isolateList, grayList):
    '''
        function:
            - write the SVG figure used for a rerun: same drawing as
              generateSVG() (square + diamond per spacer, grayList clusters
              drawn white/gray) but with the first row placed at y = 10
        input:
            - finalDict -> generated by reformatData(); it is only read
            - svgBkup is the SVG file of the previous run; it has to be
              readable
            - outFile is the path of the SVG file to write
            - maxSpacerCount is not used by this function
            - maxLociCount is the number of array columns to draw
            - isolateList gives the isolates and their order; isolates
              missing from finalDict only get their name written
            - grayList is the list of the cluster names to gray
        output:
            - nothing is returned

        NOTE(review): the previous code scanned svgBkup for the isolate
        names of the <text> lines but never used them, and nothing of the
        backup is copied to outFile: every isolate of isolateList is drawn
        again. That behaviour is kept; confirm whether the old drawing
        was meant to be merged.
    '''
    textTpl = '<text x="%s" y="%s" width="1.6" height="4" font-family="Verdana" font-size="7" fill="black">%s</text>\n'
    rectTpl = '<rect x="%s" y="%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s" />\n'
    polyTpl = '<polygon points="%s,%s %s,%s %s,%s %s,%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s"/>\n'

    graySet = set(grayList)
    lociList = ['CRISPR'+str(num) for num in range(1, maxLociCount+1)]
    cr_xaxis = 500
    firstRowY = 10
    with open(outFile, 'w') as svg:
        svg.write('<svg>')
        # The backup is only opened to keep failing early when it is missing
        # (plain 'r': the 'U' flag was removed in Python 3.11)
        with open(svgBkup, 'r'):
            pass
        for i, crLocus in enumerate(lociList):
            count_yaxis = firstRowY
            maxLen = 0
            for isolate in isolateList:
                if i == 0:
                    # write strain name next to colored squares
                    svg.write(textTpl % (cr_xaxis, count_yaxis + 6.5, isolate))
                if isolate in finalDict and crLocus in finalDict[isolate]:
                    spacers = finalDict[isolate][crLocus]
                    maxLen = max(maxLen, len(spacers))
                    # A new list is built for the drawing order so that the
                    # caller's finalDict is left untouched (it used to be
                    # reversed in place)
                    if spacers[0][3] == '-':
                        drawOrder = spacers
                    else:
                        drawOrder = spacers[::-1]
                    count_xaxis = cr_xaxis - 7.5
                    for entry in drawOrder:
                        if entry[6] in graySet:
                            colBk, colFr = 'FFF', '999'
                        else:
                            colBk, colFr = entry[4], entry[5]
                        # square coordinates and color
                        svg.write(rectTpl % (count_xaxis, count_yaxis, colBk))
                        # diamond coordinates and color
                        svg.write(polyTpl % (count_xaxis + 1, count_yaxis + 3.75,
                                             count_xaxis + 3.75, count_yaxis + 1,
                                             count_xaxis + 6.5, count_yaxis + 3.75,
                                             count_xaxis + 3.75, count_yaxis + 6.5,
                                             colFr))
                        count_xaxis -= 7.5
                count_yaxis += 10
            # next array block starts left of the longest array of this one
            cr_xaxis += (maxLen * -7.5) -20
        svg.write('</svg>')

# The function generateSVGunique writes an SVG file in which only the spacers whose cluster is listed in grayList
# keep their colors; all the other spacers are drawn in white and gray.
            
def generateSVGunique(finalDict, outFile, maxSpacerCount, maxLociCount, isolateList, grayList):
    '''
        function:
            - write the SVG figure with the coloring of generateSVG()
              inverted: the spacers whose cluster is listed in grayList keep
              their background and front colors, all the others are drawn
              white (FFF) with a gray (999) diamond
            - the arrays are drawn from right to left; the stored order is
              used for the arrays in '-' orientation and the reverse order
              for the others
        input:
            - finalDict -> generated by reformatData(); it is only read
            - outFile is the path of the SVG file to write
            - maxSpacerCount is not used by this function
            - maxLociCount is the number of array columns to draw
            - isolateList gives the isolates and their order; isolates
              missing from finalDict only get their name written
            - grayList is the list of the cluster names that keep their colors
        output:
            - nothing is returned; the program is stopped through sys.exit()
              when a spacer entry does not hold exactly 7 elements
    '''
    textTpl = '<text x="%s" y="%s" width="1.6" height="4" font-family="Verdana" font-size="7" fill="black">%s</text>\n'
    # The colored and the white/gray elements do not end with the same
    # spacing; both forms are kept as they always were written
    colorRectTpl = '<rect x="%s" y="%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s" />\n'
    colorPolyTpl = '<polygon points="%s,%s %s,%s %s,%s %s,%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s"/>\n'
    plainRectTpl = '<rect x="%s" y="%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#FFF"  />\n'
    plainPolyTpl = '<polygon points="%s,%s %s,%s %s,%s %s,%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#999" />\n'

    graySet = set(grayList)
    lociList = ['CRISPR'+str(num) for num in range(1, maxLociCount+1)]
    cr_xaxis = 500
    with open(outFile, 'w') as svg:
        svg.write('<svg>\n')
        for i, crLocus in enumerate(lociList):
            count_yaxis = 0
            maxLen = 0
            for isolate in isolateList:
                if i == 0:
                    # write strain name next to colored squares
                    svg.write(textTpl % (cr_xaxis, count_yaxis + 6.5, isolate))
                if isolate in finalDict and crLocus in finalDict[isolate]:
                    spacers = finalDict[isolate][crLocus]
                    maxLen = max(maxLen, len(spacers))
                    # A new list is built for the drawing order so that the
                    # caller's finalDict is left untouched (it used to be
                    # reversed in place)
                    if spacers[0][3] == '-':
                        drawOrder = spacers
                    else:
                        drawOrder = spacers[::-1]
                    count_xaxis = cr_xaxis - 7.5
                    for entry in drawOrder:
                        if len(entry) != 7:
                            print('#####################\n  ERROR:  This spacer was not attributed to a cluster\n    Please verify the gff file or the fasta file and rerun CRISPR_Studio\n    %s\n#####################' % (entry,))
                            sys.exit()
                        diamond = (count_xaxis + 1, count_yaxis + 3.75,
                                   count_xaxis + 3.75, count_yaxis + 1,
                                   count_xaxis + 6.5, count_yaxis + 3.75,
                                   count_xaxis + 3.75, count_yaxis + 6.5)
                        if entry[6] in graySet:
                            # square then diamond, with the cluster colors
                            svg.write(colorRectTpl % (count_xaxis, count_yaxis, entry[4]))
                            svg.write(colorPolyTpl % (diamond + (entry[5],)))
                        else:
                            svg.write(plainRectTpl % (count_xaxis, count_yaxis))
                            svg.write(plainPolyTpl % diamond)
                        count_xaxis -= 7.5
                count_yaxis += 10
            # next array block starts left of the longest array of this one
            cr_xaxis += (maxLen * -7.5) -20
        svg.write('</svg>')


def generateSVGspacerSize(finalDict, outFile, maxSpacerCount, maxLociCount, isolateList, grayList):
    '''
        function:
            - write an SVG figure with the layout of generateSVG() in which
              every spacer is drawn as a faded white square and gray diamond
              (fill-opacity 0.2) with the third element of its entry (the
              spacer length) written on top
            - the arrays are drawn from right to left; the stored order is
              used for the arrays in '-' orientation and the reverse order
              for the others
        input:
            - finalDict -> generated by reformatData(); it is only read
            - outFile is the path of the SVG file to write
            - maxSpacerCount and grayList are not used by this function
            - maxLociCount is the number of array columns to draw
            - isolateList gives the isolates and their order; isolates
              missing from finalDict only get their name written
        output:
            - nothing is returned
    '''
    nameTpl = '<text x="%s" y="%s" width="1.6" height="4" font-family="Verdana" font-size="7" fill="black">%s</text>\n'
    rectTpl = '<rect x="%s" y="%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#FFF" fill-opacity="0.2" />\n'
    polyTpl = '<polygon points="%s,%s %s,%s %s,%s %s,%s" stroke-width="0.2" width="7.5" height="7.5" style="fill:#999" fill-opacity="0.2" />\n'
    sizeTpl = '<text x="%s" y="%s" width="1.2" height="3" font-family="Verdana" font-size="4.5" font-weight="bold" fill="black">%s</text>\n'

    lociList = ['CRISPR'+str(num) for num in range(1, maxLociCount+1)]
    cr_xaxis = 500
    with open(outFile, 'w') as svg:
        svg.write('<svg>\n')
        for i, crLocus in enumerate(lociList):
            count_yaxis = 0
            maxLen = 0
            for isolate in isolateList:
                if i == 0:
                    # write strain name next to the squares
                    svg.write(nameTpl % (cr_xaxis, count_yaxis + 6.5, isolate))
                if isolate in finalDict and crLocus in finalDict[isolate]:
                    spacers = finalDict[isolate][crLocus]
                    maxLen = max(maxLen, len(spacers))
                    # A new list is built for the drawing order so that the
                    # caller's finalDict is left untouched (it used to be
                    # reversed in place)
                    if spacers[0][3] == '-':
                        drawOrder = spacers
                    else:
                        drawOrder = spacers[::-1]
                    count_xaxis = cr_xaxis - 7.5
                    for entry in drawOrder:
                        svg.write(rectTpl % (count_xaxis, count_yaxis))
                        svg.write(polyTpl % (count_xaxis + 1, count_yaxis + 3.75,
                                             count_xaxis + 3.75, count_yaxis + 1,
                                             count_xaxis + 6.5, count_yaxis + 3.75,
                                             count_xaxis + 3.75, count_yaxis + 6.5))
                        # spacer length written inside the square
                        svg.write(sizeTpl % (count_xaxis + 1, count_yaxis + 5.5, entry[2]))
                        count_xaxis -= 7.5
                count_yaxis += 10
            # next array block starts left of the longest array of this one
            cr_xaxis += (maxLen * -7.5) -20
        svg.write('</svg>')

# The function appendSVGfile is executed when the -r option is added (in combination with the attributeClsColorRerun function)
# It will append a preexisting svg file with new sequences, by keeping the original svg the same and adding new squares at the bottom.

def appendSVGfileUnique(finalDict, svgBkup, outFile, maxSpacerCount, maxLociCount, isolateList, grayList):

    '''
        function:
            - write the svg figure used when the -r and -gS options are combined:
              one row per isolate, one column block per CRISPR locus, one boxed
              diamond per spacer
        input:
            - finalDict -> dict structured (finalDict[isolate][crLocus] = list of spacer records);
              a record is read at index 3 (array orientation, '-' = reverse),
              4 (box fill color), 5 (diamond fill color) and 6 (value looked up in grayList)
            - svgBkup -> path of the svg of the previous run (currently not read, see NOTE below)
            - outFile -> path of the svg file to write
            - maxSpacerCount -> not used by this function
            - maxLociCount -> number of loci to draw (CRISPR1 .. CRISPR<maxLociCount>)
            - isolateList -> isolates in the order they must appear (top to bottom)
            - grayList -> spacers whose record[6] is in this list keep their colors,
              all the others are drawn with a white box and a gray diamond
        output:
            - None, the figure is written in outFile
    '''

    textTag = '<text x="%s" y="%s" width="1.6" height="4" font-family="Verdana" font-size="7" fill="black">%s</text>\n'
    plainRectTag = '<rect x="%s" y="%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#FFF" />\n'
    plainDiamondTag = '<polygon points="%s,%s %s,%s %s,%s %s,%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#999" />\n'
    colorRectTag = '<rect x="%s" y="%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s" />\n'
    colorDiamondTag = '<polygon points="%s,%s %s,%s %s,%s %s,%s" stroke="black" stroke-width="0.2" width="7.5" height="7.5" style="fill:#%s"/>\n'

    # NOTE(review): the previous implementation opened svgBkup, but the code that
    # parsed the isolate names out of it was unreachable (it sat after a
    # `continue`), so no isolate was ever treated as already drawn and the new
    # rows always started at y = 10. That observable behavior is kept here;
    # confirm whether the content of svgBkup should be copied into outFile.
    alreadyDrawn = set()
    maxY = 0
    cr_xaxis = 500

    lociList = []
    n = 1
    while n <= maxLociCount:
        lociList.append('CRISPR' + str(n))
        n += 1

    # The context manager guarantees the figure is flushed and closed,
    # even if a malformed spacer record raises while drawing.
    with open(outFile, 'w') as svgOut:
        svgOut.write('<svg>')
        for i, crLocus in enumerate(lociList):
            count_yaxis = maxY + 10
            maxLen = 0
            # The duplicate filter is rebuilt for every locus: sharing a single
            # list across loci made every isolate look "already drawn" as soon
            # as the first locus was finished, so CRISPR2+ were never written.
            seen = set(alreadyDrawn)
            for isolate in isolateList:
                if isolate in seen:
                    continue
                seen.add(isolate)

                # The isolate label is only written once, next to the first locus.
                if i == 0:
                    svgOut.write(textTag % (cr_xaxis, str(count_yaxis + 6.5), isolate))

                if isolate in finalDict and crLocus in finalDict[isolate]:
                    spacers = finalDict[isolate][crLocus]
                    if spacers:
                        if len(spacers) > maxLen:
                            maxLen = len(spacers)
                        # Boxes are laid out from right to left. A '+' array is
                        # drawn starting with its last spacer, a '-' array
                        # starting with its first one. Working on a local view
                        # leaves the caller's finalDict untouched.
                        if spacers[0][3] == '-':
                            drawOrder = spacers
                        else:
                            drawOrder = spacers[::-1]

                        count_xaxis = cr_xaxis - 7.5
                        for spacer in drawOrder:
                            corner = (str(count_xaxis), str(count_yaxis))
                            diamond = (str(count_xaxis + 1), str(count_yaxis + 3.75),
                                       str(count_xaxis + 3.75), str(count_yaxis + 1),
                                       str(count_xaxis + 6.5), str(count_yaxis + 3.75),
                                       str(count_xaxis + 3.75), str(count_yaxis + 6.5))
                            if spacer[6] not in grayList:
                                svgOut.write(plainRectTag % corner)
                                svgOut.write(plainDiamondTag % diamond)
                            else:
                                svgOut.write(colorRectTag % (corner + (spacer[4],)))
                                svgOut.write(colorDiamondTag % (diamond + (spacer[5],)))
                            count_xaxis -= 7.5

                # Every listed isolate takes one row, with or without an array.
                count_yaxis += 10

            # The next locus starts to the left of the longest array of this one.
            cr_xaxis += (maxLen * -7.5) - 20
        svgOut.write("</svg>")
                    
def main():

    '''
        function:
            - command line entry point: parses the options, extracts (or reads) the
              spacers, aligns and clusters them, attributes the colors, orders the
              isolates and writes the svg figure
        input:
            - none, the options are read from sys.argv
        output:
            - None; the intermediate files and the figure are written next to the
              input file (all their names start with <inFile>.fasta)
        exit:
            - the script exits (help printed) when no input file is given or when
              -gU and -gS are combined, and exits with a usage error when the value
              given to -s is not recognised
    '''

    parser = argparse.ArgumentParser(description='Description: CRISPR Studio generates a figure comparing the different CRISPR arrays identified by CRISPRDetect. Required File: GFF3 file with the CRISPR arrays identified with CRISPRDetect.')
    parser.add_argument("-i", "--inFile", dest="inFile",
                        help="GFF3 file generated with CRISPRDetect (Local installation or web platform http://brownlabtools.otago.ac.nz/CRISPRDetect/predict_crispr_array.html).", metavar="FILE")
    parser.add_argument("-l", "--listOfIsolates", dest='isolateFile', metavar='FILE', 
                        help="CRISPR_Studio will generate the figure with a subset of isolates listed in the file  (Optional: By default, CRISPR_Studio will generate a figure with all the isolates).")
    parser.add_argument("-gU", '--grayOutUnique', dest='gray', help='The unique spacers will be grayed (Optional: by default, CRISPR_Studio attributes a unique color to each group of spacer and unique spacers.)', action='store_true')
    parser.add_argument("-gS", '--grayOutSimilar', dest='unique', help='The conserved spacers will be grayed (Optional: by default, CRISPR_Studio attributes a unique color to each group of spacer and unique spacers.)', action='store_true')
    parser.add_argument("-f", '--checkFasta', dest='checkFasta', help='Verification of the fasta file generated from the GFF file will be skipped if this flag is provided. The verification is mainly based on the length of the spacer sequences. If a spacer is 1.5 time shorter or longer than the average spacer size of the dataset, a warning is raised and the script stops (Verification is ran by default). Correction can be either made in the initial gff file or in the fasta file. If the later, provide the name of the fasta file after the flag (ex. -f corrected_sequences.fasta', action='store_true')
    parser.add_argument("-s", '--sort', dest='sort', help='The order of the isolates in the figure: Available option: CRISPRDetect, DistMatrix or File providing a list of the isolates in the desired order. CRISPRDetect = order in the gff file. DistMatrix = Order extracted from a distance matrix based on the similarity of the arrays in the isolates. File = Order privided in a single column file with the isolates names as the should appear in the figure. The option -l override this option (Default: DistMatrix)')
    parser.add_argument("-r", '--rerun', dest='rerun', help='Use this option to keep the same color attributed to the spacer during a previous analysis', action='store_true')
    parser.add_argument("-c", '--cutoff', dest='cutoff', help='Set to score cutoff for pairing of the spacers (default = 2)')
    parser.add_argument('-n', '--spacerSize', dest='size', help='Show the size of the spacers over the boxed diamonds. We recommend to use this option only for experimental analysis of the data', action='store_true')
    args = parser.parse_args()
    appendSVG = False

    # Without any option, or without an input file, only the help can be shown.
    if len(sys.argv) <= 1 or not args.inFile:
        parser.print_help()
        sys.exit()
    inFile = args.inFile

    # Incompatible flags are rejected before any file is read or written.
    if args.gray and args.unique:
        print('\n\n\n\
        #################  ERROR #################\n\n\
        The options -gU and -gS are incompatible and \n\
        can\'t be used together\n\n\
        ##########################################\n\n')
        parser.print_help()
        sys.exit()

    outFasta = inFile + '.fasta'
    alnFile = outFasta + '_fasta36'
    matchFile = alnFile + '.spacermatch'
    mclFile = matchFile + '.mcl'
    svgFile = mclFile + '.svg'
    svgBkup = mclFile + '.bkup.svg'

    # Spacer extraction, with or without the length verification step.
    if not args.checkFasta:
        spacerDict, isolateList, averageLength, crArrayList = spacerExtract(inFile)
        writeFasta(spacerDict, outFasta)
        checkSpacer(spacerDict, averageLength, outFasta)
    else:
        print('\n\tTHE FLAG -f (--checkFasta) WAS USED, NO VERIFICATION OF THE FASTA FILE WILL BE RUN')
        print('\tIf you want CRISPR_Studio to extract the spacers from the gff file')
        print('\twithout runing the verification step, just hit enter (leave the field empty)')
        correctedFasta = input('\n\tPlease enter the name of the corrected fasta file: ').strip()
        # Any non empty answer is a file name (one-character names included).
        if correctedFasta:
            spacerDict, isolateList, averageLength, crArrayList = readCorrectedFasta(correctedFasta)
        else:
            spacerDict, isolateList, averageLength, crArrayList = spacerExtract(inFile)
            writeFasta(spacerDict, outFasta)

    # Alignment of the spacers; an existing alignment is only redone on request.
    if os.path.isfile(alnFile):
        rerunAln = input('CRISPRStudio detected that the alignment file %s already exist. Do you want to re-align the spacer? (y/n) ' % (alnFile))
        while rerunAln.lower() not in ('y', 'yes', 'n', 'no'):
            rerunAln = input('Please enter y or n: ') 
        if rerunAln.lower() in ('y', 'yes'):
            fastaAlign(outFasta)
    else:
        fastaAlign(outFasta)

    cutoff = int(args.cutoff) if args.cutoff else 2
    extractMatch(alnFile, cutoff, spacerDict)

    makeCluster(matchFile)

    if args.rerun:
        # The previous figure is kept aside before a new one is written.
        os.rename(svgFile, svgBkup)
        print('\n\tTHE FLAG -r (--rerun) WAS USED. The color used to represent the spacer of the first figure will be preserved.')
        appendSVG = True
        clDict = attributeClsColorRerun(mclFile, spacerDict, mclFile + '.final')
    else:
        clDict = attributeClsColor(mclFile, spacerDict)

    finalDict, maxSpacerCount, maxLociCount = reformatData(clDict, mclFile + '.final')

    # Order of the isolates in the figure.
    if len(isolateList) == 1:
        orderedIsolates = isolateList
    elif args.isolateFile:
        # Plain 'r' mode: the 'U' flag is rejected by recent Python 3 releases.
        with open(args.isolateFile, 'r') as fl:
            orderedIsolates = [line.strip() for line in fl]
    else:
        # The keyword is case-insensitive, so the documented spelling
        # "CRISPRDetect" is accepted as well as "crisprdetect".
        sortMode = (args.sort or 'DistMatrix').lower()
        if sortMode == 'crisprdetect':
            orderedIsolates = isolateList
        elif sortMode == 'distmatrix':
            orderedIsolates = classifyCluster(finalDict, matchFile)
        elif sortMode == 'file':
            orderedFile = input("Please provide the name of the file with the name of the isoltes in the desired order :")
            with open(orderedFile, 'r') as fl:
                orderedIsolates = [line.strip() for line in fl]
        else:
            # An unknown keyword used to leave the order undefined and crash
            # later on; it is now reported as a usage error.
            parser.error("unknown value '%s' for -s/--sort (expected CRISPRDetect, DistMatrix or File)" % args.sort)

    if args.gray or args.unique:
        grayList = gray(finalDict, orderedIsolates)
    else:
        grayList = []

    # Drawing of the figure.
    if args.unique:
        if appendSVG:
            appendSVGfileUnique(finalDict, svgBkup, svgFile, maxSpacerCount, maxLociCount, orderedIsolates, grayList)
        else:
            generateSVGunique(finalDict, svgFile, maxSpacerCount, maxLociCount, orderedIsolates, grayList)
    elif args.size:
        generateSVGspacerSize(finalDict, svgFile, maxSpacerCount, maxLociCount, orderedIsolates, grayList)
    else:
        if appendSVG:
            appendSVGfile(finalDict, svgBkup, svgFile, maxSpacerCount, maxLociCount, orderedIsolates, grayList)
        else:
            generateSVG(finalDict, svgFile, maxSpacerCount, maxLociCount, orderedIsolates, grayList)
    
# Run the command line workflow only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()