gtf2expr

#!/bin/bash
#PBS -l nodes=1:ppn=4

GENOME="mm9"
FEATURE="exon"
GROUP="gene_id"
PROCESSORS=1

#### usage ####
usage() {
	echo Program: "gtf2expr (compute read count corresponding to input GTF file)"
	echo Author: BRIC, University of Copenhagen, Denmark
	echo Version: 1.0
	echo Contact: pundhir@binf.ku.dk
	echo "Usage: gtf2expr -i <file> -j <file>"
	echo "Options:"
	echo " -i <file>   [input file containing genomic coordinate in GTF or BED format (can be stdin)]"
    echo " -j <file>   [input configuration file containing bam file information]"
    echo "             [<id> <bam file> (id should be tpm)]"
    echo "[OPTIONS]"
    echo " -s <float>  [normalize expression by input size factor]"
    echo "             [if multiple, seperate them by a comma]"
    echo " -m          [normalize expression by counts per million mapped reads]"
    echo " -d          [use SAM formatted file for the input BAM files]"
    echo "             [large BAM files sometime do not lead to correct results]"
    echo "             [details: https://github.com/chapmanb/bcbio-nextgen/issues/394]"
    echo " -e <int>    [extend 3' end of reads by input number of bases (useful for ChIP-seq data)]"
    echo " -g <string> [genome (default: mm9)]"
    echo "[OPTIONS: featureCounts]"
    echo " -t <string> [specify the feature type (default: exon)]"
    echo " -r <string> [Specify the attribute type used to group features, gene_id or gene_name (default: gene_id)]"
    echo " -M          [if specified, multi-mapping reads/fragments will be counted]"
    echo " -p <int>    [number of processors to use (default: 1)]"
    echo " -u          [if specified, reads that were marked as duplicates will be ignored]"
	echo " -h          [help]"
	echo
	exit 0
}

#### parse options ####
while getopts i:j:s:mde:g:t:r:Mp:uh ARG; do
	case "$ARG" in
		i) INPUTCOORFILE=$OPTARG;;
        j) INPUTCONFIGFILE=$OPTARG;;
        s) INPUTSIZEFACTORS=$OPTARG;; 
        m) CPM=1;;
        d) USE_SAM=1;;
        e) INPUTEXTENDS=$OPTARG;;
        g) GENOME=$OPTARG;;
        t) FEATURE=$OPTARG;;
        r) GROUP=$OPTARG;;
        M) MULTIMAPPING=1;;
        p) PROCESSORS=$OPTARG;;
        u) IGNOREDUPLICATES=1;;
		h) HELP=1;;
	esac
done

## usage, if necessary file and directories are given/exist
if [ -z "$INPUTCOORFILE" -o -z "$INPUTCONFIGFILE" -o "$HELP" ]; then
	usage
fi

## populating files based on input genome
GENOME_FILE=$(initialize_genome -i $FINDNFRPATH/data/annotations/GENOME_FILE -g $GENOME)
#GENOME_FILE=$FINDNFRPATH/data/annotations/$GENOME_FILE
if [ ! -f "$GENOME_FILE" ]; then
    >&2 echo
    >&2 echo "computation for $GENOME is not available yet"
    >&2 echo "please add the chromosome size file for $GENOME at $FINDNFRPATH/data/annotations"
    >&2 echo "also update the $FINDNFRPATH/data/annotations/GENOME_FILE"
    >&2 echo
    usage
fi
# echo $GENOME_FILE; exit

## create temporary GTF file if input is from stdin
if [ "$INPUTCOORFILE" == "stdin" ]; then
TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
while read LINE; do
echo "${LINE}"
done > $TMP
#| perl -an -F'/\t+/' -e '$line=""; foreach(@F) { $line.="$_\t"; } $line=~s/\t$//g; chomp($line); print "$line\n";' > $TMP
INPUTCOORFILE=$TMP
fi

## create temporary file to store raw read counts
TMP_FILE=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)

## check, if input genomic coordinates are in BED format, and convert to GTF
IS_BED=$(zless $INPUTCOORFILE | cut -f 2 | head -n 2 | tail -n 1 | perl -ane 'if($_=~/^[0-9]+$/) { print $_; }' | wc -l);

if [ "$IS_BED" -gt 0 ]; then
    zless $INPUTCOORFILE | perl -ane 'chomp($F[5]); if($F[0]=~/\_/) { next; } $count++; $F[3]=~s/^.*\///g; print "$F[0]\tregion\t'$FEATURE'\t$F[1]\t$F[2]\t.\t$F[5]\t.\tgene_id \"$F[3]\"\n";' > $TMP_FILE.gtf
    INPUTCOORFILE="$TMP_FILE.gtf"
fi

## read configuration file
INPUTBAMFILES=$(cat $INPUTCONFIGFILE | perl -ane '
    if($_=~/^tpm/) {
        $file.="$F[1],";
    } END {
        $file=~s/\,$//g;
        print "$file\n";
    }'
)

## parse input bam files in an array
oIFS=$IFS
IFS=","
BAMFILES=($INPUTBAMFILES)
BAMFILES_COUNT=${#BAMFILES[@]}
IFS=$oIFS

## initialize size factors, if both size factors and total reads not provided
if [ -z "$INPUTSIZEFACTORS" -a -z "$CPM" ]; then
    INPUTSIZEFACTORS=""
    for(( i=0; i<$BAMFILES_COUNT; i++ )); do
        INPUTSIZEFACTORS="$INPUTSIZEFACTORS,1"
    done
    INPUTSIZEFACTORS=`echo $INPUTSIZEFACTORS | perl -ane '$_=~s/^\,//g; print $_;'`;
fi

## parse bam files in an array
IFS=","
INPUTBAMFILES=$(echo $INPUTBAMFILES | sed 's/\,/ /g')
IFS=$oIFS

## parse featureCounts arguments
ARGS_FEATURECOUNTS=""
if [ ! -z "$INPUTEXTENDS" ]; then
    ARGS_FEATURECOUNTS="--readExtension3 $INPUTEXTENDS "
fi

if [ ! -z "$MULTIMAPPING" ]; then
    ARGS_FEATURECOUNTS="$ARGS_FEATURECOUNTS -M "
fi

if [ ! -z "$IGNOREDUPLICATES" ]; then
    ARGS_FEATURECOUNTS="$ARGS_FEATURECOUNTS -u "
fi

#echo "featureCounts -t $FEATURE -g $GROUP -O $ARGS_FEATURECOUNTS -T $PROCESSORS -a $INPUTCOORFILE -o $TMP_FILE $INPUTBAMFILES"; exit

if [ ! -z "$INPUTSIZEFACTORS" ]; then
    ## parse input size factors in an array
    IFS=","
    SIZEFACTORS=($INPUTSIZEFACTORS)
    SIZEFACTORS_COUNT=${#SIZEFACTORS[@]}
    IFS=$oIFS

    if [ "$BAMFILES_COUNT" -ne "$SIZEFACTORS_COUNT" ]; then
        echo -n "Please provide size factor and extend parameter for each input bam file";
        usage
    fi

    if [ ! -z "$USE_SAM" ]; then
        INPUTBAMFILES=$(echo $INPUTBAMFILES | sed 's/bam/sam/g')
        featureCounts -t $FEATURE -g $GROUP -O $ARGS_FEATURECOUNTS -T $PROCESSORS -a $INPUTCOORFILE -o $TMP_FILE $INPUTBAMFILES
    else
        featureCounts -t $FEATURE -g $GROUP -O $ARGS_FEATURECOUNTS -T $PROCESSORS -a $INPUTCOORFILE -o $TMP_FILE $INPUTBAMFILES
    fi

    grep -vE '^[#|GeneId]+' $TMP_FILE | perl -ane '@sizeFactors=split(/\,/, "'$INPUTSIZEFACTORS'"); $field_count=scalar(@F)-'$BAMFILES_COUNT'; foreach(@F[0..$field_count-1]) { print "$_\t"; } $i=0; foreach(@F[$field_count..scalar(@F)-1]) { printf("%0.5f\t", $_/$sizeFactors[$i]); $i++; } print "\n";'
else
    MAPPEDREADS=""
    for(( i=0; i<$BAMFILES_COUNT; i++ )); do
        ## create index of input BAM file, if does not exist
        if [ ! -f "${BAMFILES[$i]}.bai" ]; then
            samtools index ${BAMFILES[$i]}
        fi

        COUNT=$(samtools idxstats ${BAMFILES[$i]} | grep -wE "^[0-9a-zA-Z]+" | perl -ane '$sum+=$F[2]; END { print "$sum"; }');
        MAPPEDREADS="$MAPPEDREADS,$COUNT"
    done
    MAPPEDREADS=$(echo $MAPPEDREADS | perl -ane '$_=~s/^\,//g; print $_;')

    if [ ! -z "$USE_SAM" ]; then
        INPUTBAMFILES=$(echo $INPUTBAMFILES | sed 's/bam/sam/g')
        featureCounts -t $FEATURE -g $GROUP -O $ARGS_FEATURECOUNTS -T $PROCESSORS -a $INPUTCOORFILE -o $TMP_FILE $INPUTBAMFILES
    else
        featureCounts -t $FEATURE -g $GROUP -O $ARGS_FEATURECOUNTS -T $PROCESSORS -a $INPUTCOORFILE -o $TMP_FILE $INPUTBAMFILES
    fi

    grep -vE '^[#|GeneId]+' $TMP_FILE | perl -ane '@mappedReads=split(/\,/, "'$MAPPEDREADS'"); $field_count=scalar(@F)-'$BAMFILES_COUNT'; foreach(@F[0..$field_count-1]) { print "$_\t"; } $i=0; foreach(@F[$field_count..scalar(@F)-1]) { printf("%0.5f\t", ($_*1000000)/$mappedReads[$i]); $i++; } print "\n";'
fi
#| perl -ane '$chr=$F[1]; $chr=~s/\;.*//g; @t=sort { $a <=> $b } split(/\;/, $F[2]); $start=$t[0]; @t=reverse sort { $a <=> $b } split(/\;/, $F[3]); $end=$t[0]; $strand=$F[4]; $strand=~s/\;.*//g; print "$chr\t$start\t$end\t$F[0]\t1\t$strand\t$F[5]\t"; foreach(@F[6..scalar(@F)-1]) { print "$_\t"; } print "\n";' | sortBed -i stdin

rm $TMP_FILE
rm $TMP_FILE.summary

if [ -f "$TMP_FILE.gtf" ]; then
    rm $TMP_FILE.gtf
fi

if [ ! -z "$TMP" ]; then
    rm $TMP
fi

exit