Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
mnshgl0110 committed Jan 31, 2024
2 parents d74d2cd + 935a84e commit 2e7608c
Showing 1 changed file with 59 additions and 6 deletions.
65 changes: 59 additions & 6 deletions hometools/hometools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1773,6 +1773,46 @@ def pbamrc(args):
# END


def bamrc2af(args):
"""
Reads the output of pbamrc and a corresponding VCF file and returns the allele frequencies of the alt alleles.
Currently, working for SNPs only
"""
from gzip import open as gzopen
logger = mylogger("bamrc2af")
rcfin = args.bamrc.name
vcffin = args.vcf.name
outfin = 'bamrc_af.txt' if args.out is None else args.out.name

logger.info('Reading VCF')
posdict = dict()
op = gzopen if isgzip(vcffin) else open
with op(vcffin, 'r') as vcf:
for line in vcf:
# break
if line[0] == 35: continue
line = line.decode()
line = line.strip().split()
if line[4].upper() not in 'ACGT' : continue
posdict[tuple(line[:2])] = line[3], line[4]

logger.info('Reading bamrc')
# Get AF from bam readcount
basedict = dict(zip('ACGT', range(4, 8)))
with open(rcfin, 'r') as rc, open(outfin, 'w') as out:
for line in rc:
line = line.strip().split()
try:
ref, alt = posdict[(line[0], line[1])]
except KeyError:
logger.warning(f'Position {line[0]}:{line[1]} not found in VCF. Skipping it.')
refi = basedict[ref]
alti = basedict[alt]
out.write(f'{line[0]}\t{line[1]}\t{ref}\t{alt}\t{round(int(line[refi])/int(line[3]) , 2)}\t{round(int(line[alti])/int(line[3]), 2)}\n')
logger.info('Finishe')
# END


def run_ppileup(locs, out, bam, pars):
from subprocess import Popen, PIPE
with open(out, 'w') as fout:
Expand Down Expand Up @@ -2569,22 +2609,29 @@ def main(cmd):
# <editor-fold desc="BAM Commands">
parser_bamcov = subparsers.add_parser("bamcov", help="BAM: Get mean read-depth for chromosomes from a BAM file", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_pbamrc = subparsers.add_parser("pbamrc", help="BAM: Run bam-readcount in a parallel manner by dividing the input bed file.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_bamrc2af = subparsers.add_parser("bamrc2af", help="BAM: Reads the output of pbamrc and a corresponding VCF file and saves the allele frequencies of the ref/alt alleles.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_splitbam = subparsers.add_parser("splitbam", help="BAM: Split a BAM files based on TAG value. BAM file must be sorted using the TAG.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_mapbp = subparsers.add_parser("mapbp", help="BAM: For a given reference coordinate get the corresponding base and position in the reads/segments mapping the reference position", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_bam2coords = subparsers.add_parser("bam2coords", help="BAM: Convert BAM/SAM file to alignment coords", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_ppileup = subparsers.add_parser("ppileup", help="BAM: Currently it is slower than just running mpileup on 1 CPU. Might be possible to optimize later. Run samtools mpileup in parallel when pileup is required for specific positions by dividing the input bed file.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# </editor-fold>

# <editor-fold desc="syri CLI">
parser_runsyri = subparsers.add_parser("runsyri", help=hyellow("syri: Parser to align and run syri on two genomes"),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_syriidx = subparsers.add_parser("syriidx", help=hyellow(
"syri: Generates index for syri.out. Filters non-SR annotations, then bgzip, then tabix index"),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_syri2bed = subparsers.add_parser("syri2bed", help=hyellow("syri: Converts syri output to bedpe format"),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# </editor-fold>

## syri
parser_runsyri = subparsers.add_parser("runsyri", help=hyellow("syri: Parser to align and run syri on two genomes"), formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_syriidx = subparsers.add_parser("syriidx", help=hyellow("syri: Generates index for syri.out. Filters non-SR annotations, then bgzip, then tabix index"), formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_syri2bed = subparsers.add_parser("syri2bed", help=hyellow("syri: Converts syri output to bedpe format"), formatter_class=argparse.ArgumentDefaultsHelpFormatter)

## Plotting
# <editor-fold desc="Plotting">
parser_plthist = subparsers.add_parser("plthist", help="Plot: Takes frequency output (like from uniq -c) and generates a histogram plot", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_plotal = subparsers.add_parser("plotal", help="Plot: Visualise pairwise-whole genome alignments between multiple genomes", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_plotbar = subparsers.add_parser("pltbar", help="Plot: Generate barplot. Input: a two column file with first column as features and second column as values", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# </editor-fold>


## Assembly graphs
parser_asmreads = subparsers.add_parser("asmreads", help=hyellow("GFA: For a given genomic region, get reads that constitute the corresponding assembly graph"), formatter_class=argparse.ArgumentDefaultsHelpFormatter)
Expand All @@ -2607,6 +2654,12 @@ def main(cmd):
parser.print_help()
sys.exit()

# bamrc2af
parser_bamrc2af.set_defaults(func=bamrc2af)
parser_bamrc2af.add_argument("bamrc", help="BAM readcount file generated using bamrc", type=argparse.FileType('r'))
parser_bamrc2af.add_argument("vcf", help="VCF file", type=argparse.FileType('r'))
parser_bamrc2af.add_argument("out", help="Output file", type=argparse.FileType('w'))

# xls2csv
parser_xls2tsv.set_defaults(func=xls2csv)
parser_xls2tsv.add_argument("xls", help="Input excel file", type=argparse.FileType('r'))
Expand Down

0 comments on commit 2e7608c

Please sign in to comment.