Skip to content

Commit

Permalink
Updated pileup reader for spiled alignments,seqsize now works with gz…
Browse files Browse the repository at this point in the history
…ip files, created an inline plotsr variant, general purpose bezierpath function
  • Loading branch information
mnshgl0110 committed Dec 5, 2023
1 parent ab9a001 commit a9e33f0
Show file tree
Hide file tree
Showing 4 changed files with 331 additions and 26 deletions.
2 changes: 1 addition & 1 deletion hometools/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.3.0"
__version__ = "1.3.1"
16 changes: 12 additions & 4 deletions hometools/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,8 @@ def __init__(self, ls):
self.ref = ls[2]
self.rc = int(ls[3])
self.basestring = ls[4]
self.bases, self.indels = [[], []] if self.rc == 0 else self._getbases(ls[4])
if len(self.bases) != self.rc:
self.bases, self.indels, self.gapindex = [[], []] if self.rc == 0 else self._getbases(ls[4])
if len(self.bases) != self.rc - len(self.gapindex):
raise Exception('Number of identified bases if not equals to read count for {}:{}. ReadCount: {}, BaseCount: {}'.format(self.chr, self.pos, self.rc, len(self.bases)))
self.BQ = [ord(c) - 33 for c in ls[5]]
self.indelcount = len(self.indels)
Expand All @@ -306,6 +306,8 @@ def _getbases(self, l):
skip = 0
indel = False
ind = ''
gapindex = deque()
index = 0
for c in l:
if skip > 0 and indel == False:
skip -= 1
Expand All @@ -324,13 +326,17 @@ def _getbases(self, l):
if ind != '':
indellist.append(ind)
ind = ''
index += 1
if c == '*':
# self.indelcount += 1
# indelcount += 1
bases.append(c)
continue
if c == '$': continue
if c in ['<', '>']: sys.exit('spliced alignment found')
if c in ['<', '>']:
gapindex.append(index)
index += 1
continue
if c == '^':
skip = 1
continue
Expand All @@ -340,7 +346,8 @@ def _getbases(self, l):
indelcount += 1
continue
bases.append(c)
return [list(bases), list(indellist)]
index += 1
return list(bases), list(indellist), list(gapindex)

def forwardcnt(self):
try:
Expand Down Expand Up @@ -368,6 +375,7 @@ def getBQ(self, base):
return [self.BQ[i] for i in range(len(self.bases)) if self.bases[i] == base]

def getindelreads(self, reads):
# TODO: Check that it works properly with pileup from spliced reads
from collections import deque
self.indelreads = deque()
reads = reads.split(",")
Expand Down
42 changes: 21 additions & 21 deletions hometools/hometools.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,24 +468,24 @@ def order(*args):
# END


def p_adjust(*args):
def p_adjust(pvalues, method='bh'):
"""
Takes list of values and do multiple hypothesis adjustment
copied from
https://rosettacode.org/wiki/P-value_correction#Python
"""
method = "bh"
pvalues = args[0]
if len(args) > 1:
methods = {"bh", "fdr", "by", "holm", "hommel", "bonferroni", "hochberg"}
metharg = args[1].lower()
if metharg in methods:
method = metharg
# method = "bh"
# pvalues = args[0]
# if len(args) > 1:
methods = {"bh", "fdr", "by", "holm", "hommel", "bonferroni", "hochberg"}
# metharg = args[1].lower()
if method not in methods:
logger.error(f"Incorrect method provided. Available methods: {methods}")

lp = len(pvalues)
n = lp
qvalues = []

if method == 'hochberg':#already all lower case
o = order(pvalues, 'TRUE')
cummin_input = []
Expand All @@ -499,7 +499,7 @@ def p_adjust(*args):
o = order(pvalues, 'TRUE')
cummin_input = []
for index in range(n):
cummin_input.insert(index, (n/(n-index))* pvalues[o[index]])
cummin_input.insert(index, (n/(n-index)) * pvalues[o[index]])
ro = order(o)
cummin = cumminf(cummin_input)
pmin = pminf(cummin)
Expand Down Expand Up @@ -1021,6 +1021,7 @@ def extractseq(args):
else:
for c, s in out.items():
print(">{}\n{}".format(c, s))
return
# END


Expand Down Expand Up @@ -1319,8 +1320,11 @@ def getscaf(args):


def seqsize(args):
from gzip import open as gzopen
fin = args.fasta.name
out = [(chrom, len(seq)) for chrom, seq in readfasta(fin).items()]
f = gzopen(fin, 'r') if isgzip(fin) else open(fin, 'r')
# out = [(chrom, len(seq)) for chrom, seq in readfasta(fin).items()]
out = [(chrom, len(s)) for chrom, s in readfasta_iterator(f, isgzip(fin))]
#TODO: Add output file
for i in out:
print(i[0], i[1], sep="\t")
Expand Down Expand Up @@ -1360,7 +1364,7 @@ def basrat(args):

def getchr(args):
fin = args.fasta.name
if args.chrs is None and args.F is not None: chroms = [c.strip() for c in open(args.F.name, 'r').readlines()]
if args.chrs is None and args.F is not None: chroms = [c.strip().split(' ')[0] for c in open(args.F.name, 'r').readlines()]
elif args.chrs is not None and args.F is None: chroms = args.chrs
else: raise Exception('InvalidValue: Incorrect value for chrs provided')
out = args.o.name if args.o is not None else fin + ".filtered.fasta"
Expand All @@ -1370,10 +1374,6 @@ def getchr(args):
if (c not in chroms) != args.v:
fa.pop(c)
writefasta(fa, out)
# with open(out, 'w') as fout:
# for chrom, seq in .items():
# if (chrom in chroms) != args.v:
# fout.write(">" + chrom + "\n" + "\n".join([seq[i:i+60] for i in range(0, len(seq), 60)]) + '\n')
# END


Expand Down Expand Up @@ -1511,8 +1511,8 @@ def vcfdp(args):


def gffsort(args):
print('Required format of GFF file:\n')
print('Chr\tMethod\tType\tStart\tEnd\tScore\tStrand\tPhase\tAttribute(ID=uniq_id;other_info)\n')
logger.info('Required format of GFF file:\n')
logger.info('Chr\tMethod\tType\tStart\tEnd\tScore\tStrand\tPhase\tAttribute(ID=uniq_id;other_info)\n')
from collections import defaultdict
header = True
geneids = defaultdict(dict)
Expand Down Expand Up @@ -1546,7 +1546,7 @@ def gffsort(args):
gffdata[chr][id] = seq

for chr in sorted(list(geneids.keys())):
for gid in sorted(geneids[chr].keys(), key = lambda x: geneids[chr][x]):
for gid in sorted(geneids[chr].keys(), key=lambda x: geneids[chr][x]):
fout.write(gffdata[chr][gid])
# END

Expand Down Expand Up @@ -2853,8 +2853,8 @@ def main(cmd):
parser_exseq.set_defaults(func=extractseq)
parser_exseq.add_argument("fasta", help="fasta file", type=argparse.FileType('r'))
parser_exseq.add_argument("-c", "--chr", help="Chromosome ID", type=str)
parser_exseq.add_argument("-s", "--start", help="Start location (BED format, 0-base, end included)", type=int)
parser_exseq.add_argument("-e", "--end", help="End location (BED format, 0-base, end excluded)", type=int)
parser_exseq.add_argument("-s", "--start", help="Start location (BED format, 0-base, start included)", type=int)
parser_exseq.add_argument("-e", "--end", help="End location (BED format, 1-base, end excluded)", type=int)
parser_exseq.add_argument("--fin", help="File containing locations to extract (BED format)", type=argparse.FileType('r'))
parser_exseq.add_argument("-o", help="Output file name", type=argparse.FileType('w'), default=None)
parser_exseq.add_argument("-r", help="Reverse complement the sequence", default=False, action='store_true')
Expand Down
Loading

0 comments on commit a9e33f0

Please sign in to comment.