Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clarify descriptions of --normalize-by-specimen flag functionality in scripts/classif_rect.py #312

Merged
merged 5 commits into from
Feb 3, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 66 additions & 6 deletions scripts/classif_rect.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@
import logging
import sqlite3
import csv
import warnings

# Make sure the deprecation notice is shown on every run, not just the first.
warnings.filterwarnings("always", category=DeprecationWarning)

deprecation_message = """
ATTENTION!!! This script (classif_rect.py) has been deprecated in favor of the streamlined classif_table.py,
and will be removed entirely in a future release. Please switch over when you are able.
"""
# stacklevel=2 points the warning at the importing/invoking frame.
warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)


log = logging.getLogger(__name__)

Expand All @@ -19,6 +28,7 @@ def cursor_to_csv(curs, outfile, description=None):
writer.writerows(curs)

def by_taxon(args):
"""This one just normalizes (for the normalized_tally) by taxon"""
# a bit ugly. maybe in the future we'll use sqlalchemy.
specimen_join = ''
if args.specimen_map:
Expand All @@ -44,17 +54,54 @@ def by_taxon(args):
with args.by_taxon:
cursor_to_csv(curs, args.by_taxon)

def by_taxon_normalized_by_specimen(args, by_specimen_results, specimens):
    """Write per-taxon ``normalized_tally`` output to ``args.by_taxon``.

    Counts are normalized first by specimen (using the per-specimen results
    already computed by ``by_specimen``), then averaged across specimens for
    each taxon.

    :param args: parsed CLI namespace; only ``args.by_taxon`` (a writable
        file object, closed on exit) is used here.
    :param by_specimen_results: dict mapping tax_id -> row dict as produced
        by ``by_specimen``; each row carries tax_id/tax_name/rank plus one
        entry per specimen that saw the taxon.
    :param specimens: collection of all specimen names.
    """
    taxa_cols = ['tax_id', 'tax_name', 'rank']
    # Cast to float so the average is exact even under Python 2 integer
    # division (consistent with classif_table.py's by_taxon_with_specimens).
    n_specimens = float(len(specimens))

    cols = taxa_cols + ['normalized_tally']
    with args.by_taxon:
        writer = csv.writer(args.by_taxon)
        writer.writerow(cols)
        for result in by_specimen_results.values():
            row = [result[c] for c in taxa_cols]
            # Specimens absent from the row had no mass for this taxon and
            # contribute 0 to the total.
            tax_total = sum(result[s] for s in specimens if s in result)
            row.append(tax_total / n_specimens)
            writer.writerow(row)


def by_specimen(args):
log.info('tabulating counts by specimen')
log.info('tabulating total per specimen mass sums')
curs = args.database.cursor()
curs.execute("""
CREATE TEMPORARY TABLE specimen_mass
(specimen, total_mass, PRIMARY KEY (specimen))""")

curs.execute("""
INSERT INTO specimen_mass
SELECT specimen, SUM(mass)
FROM specimens
JOIN multiclass_concat mc USING (name)
JOIN placement_names USING (name, placement_id)
WHERE want_rank = ?
GROUP BY specimen
""", (args.want_rank,))

log.info('tabulating counts by specimen')
curs.execute("""
SELECT specimen,
COALESCE(tax_name, "unclassified") tax_name,
COALESCE(t.tax_id, "none") tax_id,
COALESCE(t.rank, "root") rank,
SUM(mass) tally,
COUNT(DISTINCT placement_id) placements
COUNT(DISTINCT placement_id) placements,
SUM(mass) / sm.total_mass normalized_tally
FROM specimens
JOIN specimen_mass sm USING (specimen)
JOIN multiclass_concat mc USING (name)
JOIN placement_names USING (name, placement_id)
LEFT JOIN taxa t USING (tax_id)
Expand All @@ -72,11 +119,11 @@ def by_specimen(args):

results = {}
specimens = set()
for specimen, tax_name, tax_id, rank, tally, _ in rows:
for specimen, tax_name, tax_id, rank, tally, placements, normalized_tally in rows:
row = results.get(tax_id)
if row is None:
row = results[tax_id] = dict(tax_id=tax_id, tax_name=tax_name, rank=rank)
row[specimen] = tally
row[specimen] = normalized_tally if args.normalize_by_specimen else tally
specimens.add(specimen)

log.info('writing by_specimen')
Expand All @@ -95,6 +142,8 @@ def by_specimen(args):

writer.writerows(results.itervalues())

return (results, specimens)


def main():
logging.basicConfig(
Expand All @@ -115,10 +164,16 @@ def main():
help='input CSV map from sequences to specimens')
parser.add_argument('--metadata-map', type=argparse.FileType('r'), metavar='CSV',
help='input CSV map including a specimen column and other metadata')
parser.add_argument('--normalize-by-specimen', default=False, action='store_true',
help="""By taxon output has normalized_tally results be normalized by
specimen first, then by taxon. Additionally, the output of by-specimen
will use normalized_tally rather than tally.""")

args = parser.parse_args()
if args.by_specimen and not args.specimen_map:
parser.error('specimen map is required for by-specimen output')
if args.normalize_by_specimen and not args.by_specimen:
parser.error('must compute by-specimen in order to normalize by specimen')

if args.specimen_map:
log.info('populating specimens table from specimen map')
Expand All @@ -134,9 +189,14 @@ def main():
reader = csv.DictReader(args.metadata_map)
args.metadata = {data['specimen']: data for data in reader}

by_taxon(args)
if args.by_specimen:
by_specimen(args)
by_specimen_results, specimens = by_specimen(args)
if args.normalize_by_specimen:
by_taxon_normalized_by_specimen(args, by_specimen_results, specimens)
else:
by_taxon(args)


main()


248 changes: 248 additions & 0 deletions scripts/classif_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
#!/usr/bin/env python
"""
Produce csv tables representing various counts in a classification database.

Without a specimen map, only by_taxon output is available. If a specimen map is
provided, additional output options are available presenting results by specimen.
This map also adds columns to by_taxon output (notably the avg_freq column,
indicating average frequency of the corresponding taxon across specimens).
"""

import argparse
import logging
import sqlite3
import csv

log = logging.getLogger(__name__)


# Taxon-identifying columns shared by several of the tabulation routines
# below (by_taxon_with_specimens, by_specimen).
taxa_cols = ['tax_id', 'tax_name', 'rank']


def cursor_to_csv(curs, outfile, description=None):
    """Dump the rows of *curs* (a DB cursor or any row iterable) to *outfile* as CSV.

    The header row is taken from *description* when supplied (a sequence of
    column-description tuples, first element the column name), otherwise from
    ``curs.description``.
    """
    header = curs.description if description is None else description
    writer = csv.writer(outfile)
    writer.writerow([column[0] for column in header])
    writer.writerows(curs)


def by_taxon(args):
    """Write per-taxon tally and placement counts at the requested rank.

    Queries the classification database directly, so no specimen map is
    needed.  Output CSV goes to ``args.by_taxon`` (closed on exit).
    """
    cursor = args.database.cursor()
    log.info('tabulating by_taxon')
    cursor.execute("""
        SELECT COALESCE(tax_name, "unclassified") tax_name,
               COALESCE(t.tax_id, "none") tax_id,
               COALESCE(t.rank, "root") rank,
               CAST(SUM(mass) AS INT) tally,
               COUNT(DISTINCT placement_id) placements
        FROM multiclass_concat mc
        JOIN placement_names USING (name, placement_id)
        LEFT JOIN taxa t USING (tax_id)
        WHERE want_rank = ?
        GROUP BY t.tax_id
        ORDER BY tally DESC
        """, (args.rank,))

    with args.by_taxon:
        cursor_to_csv(cursor, args.by_taxon)


def by_taxon_with_specimens(args, by_specimen_results, specimens):
    """Write per-taxon totals plus per-specimen averages to ``args.by_taxon``.

    For each taxon, sums tally/placements/freq over the per-specimen results
    from ``by_specimen``, and also reports each sum divided by the number of
    specimens (avg_tally, avg_placements, avg_freq — the latter being the
    average frequency of the taxon across specimens).

    :param args: parsed CLI namespace; only ``args.by_taxon`` is used here.
    :param by_specimen_results: list of per-taxon dicts from ``by_specimen``.
    :param specimens: set of all specimen names.
    """
    log.info('tabulating by_taxon using by_specimen results')
    # Float so the averages are exact under Python 2 division as well.
    n_specimens = float(len(specimens))

    cols = taxa_cols + ['tally', 'placements', 'avg_tally', 'avg_placements', 'avg_freq']
    with args.by_taxon:
        writer = csv.writer(args.by_taxon)
        writer.writerow(cols)
        # Iterate over taxa in the by_specimen_results
        for result in by_specimen_results:
            row = [result[c] for c in taxa_cols]
            tally, placements, freqs = 0, 0, 0
            # For each specimen that saw this taxon, accumulate its counts.
            # (Membership test on the dict itself — no need for .keys().)
            for spec in specimens:
                if spec in result:
                    spec_result = result[spec]
                    tally += spec_result['tally']
                    placements += spec_result['placements']
                    freqs += spec_result['freq']
            # Totals followed by per-specimen averages, then write out.
            row += [tally, placements, tally/n_specimens, placements/n_specimens, freqs/n_specimens]
            writer.writerow(row)


def by_specimen(args):
    """Compute classification results broken down by specimen.

    If ``args.by_specimen`` is supplied, the tall per-specimen table is also
    written to that file.  The results of this function are consumed
    downstream by ``by_taxon_with_specimens`` and ``by_specimen_wide``.

    :returns: ``(results, specimens)`` where *results* is a list of dicts
        (one per taxon) carrying tax_id/tax_name/rank entries plus, for each
        specimen with classifications for that taxon, a nested dict of
        tally/placements/freq counts; *specimens* is the set of specimen
        names encountered.
    :raises ValueError: if any specimen name collides with a taxon column
        name (they share the key space of the per-taxon dicts).
    """
    log.info('tabulating total per specimen mass sums')
    curs = args.database.cursor()
    curs.execute("""
        CREATE TEMPORARY TABLE specimen_mass
        (specimen, total_mass, PRIMARY KEY (specimen))""")

    curs.execute("""
        INSERT INTO specimen_mass
        SELECT specimen, SUM(mass)
        FROM specimens
        JOIN multiclass_concat mc USING (name)
        JOIN placement_names USING (name, placement_id)
        WHERE want_rank = ?
        GROUP BY specimen
        """, (args.rank,))

    log.info('tabulating counts by specimen')
    curs.execute("""
        SELECT specimen,
               COALESCE(tax_name, "unclassified") tax_name,
               COALESCE(t.tax_id, "none") tax_id,
               COALESCE(t.rank, "root") rank,
               CAST(SUM(mass) AS INT) tally,
               COUNT(DISTINCT placement_id) placements,
               SUM(mass) / sm.total_mass freq
        FROM specimens
        JOIN specimen_mass sm USING (specimen)
        JOIN multiclass_concat mc USING (name)
        JOIN placement_names USING (name, placement_id)
        LEFT JOIN taxa t USING (tax_id)
        WHERE want_rank = ?
        GROUP BY specimen, t.tax_id
        ORDER BY freq DESC
        """, (args.rank,))

    desc = curs.description
    rows = curs.fetchall()
    # By specimen (tall)
    if args.by_specimen:
        log.info('writing by_specimen (tall)')
        with args.by_specimen:
            cursor_to_csv(rows, args.by_specimen, desc)

    # This will be a dictionary mapping each tax_id to a dictionary of data
    # for the corresponding taxon.
    results = {}
    specimens = set()
    for specimen, tax_name, tax_id, rank, tally, placements, freq in rows:
        row = results.get(tax_id)
        if row is None:
            # The taxon dict will include name and rank...
            row = results[tax_id] = dict(tax_id=tax_id, tax_name=tax_name, rank=rank)
        # and for each specimen with classifications for this taxon, a
        # dictionary of the count results.
        row[specimen] = dict(tally=tally, placements=placements, freq=freq)
        specimens.add(specimen)

    # Specimen names become dict keys (and later CSV columns) alongside the
    # taxa columns, so they must not collide with them.  Raise explicitly
    # rather than assert: asserts are stripped under ``python -O``, and the
    # original message reported taxa_cols instead of the offending names.
    bad_names = specimens.intersection(taxa_cols)
    if bad_names:
        raise ValueError("The following are invalid specimen names: %r" % sorted(bad_names))
    return (results.values(), specimens)


def by_specimen_wide(args, handle, by_specimen_results, specimens, count):
    """Write the specified count (tally/freq/placements) into a wide table
    where rows are taxa and columns are specimens.

    :param handle: writable file for the CSV output (closed on exit).
    :param by_specimen_results: list of per-taxon dicts from ``by_specimen``.
    :param specimens: set of specimen names; these become the CSV columns.
    :param count: which per-specimen figure to output; one of ``'tally'``,
        ``'freq'`` or ``'placements'``.
    :raises ValueError: if *count* is not one of the recognized count names.
    """
    if count not in ('tally', 'freq', 'placements'):
        raise ValueError("Invalid count specified for by_specimen_wide output: {0}".format(count))

    log.info('writing wide output with {0}'.format(count))
    cols = ['tax_name', 'tax_id', 'rank'] + list(specimens)
    with handle:
        writer = csv.DictWriter(handle, fieldnames=cols, restval=0)
        writer.writeheader()

        # Write in the metadata rows (one per metadata key) ahead of the
        # count rows, so they appear at the top of the output.
        if args.metadata_map:
            for key in {k for data in args.metadata.itervalues() for k in data}:
                d = {specimen: args.metadata.get(specimen, {}).get(key)
                     for specimen in specimens}
                d['tax_name'] = key
                d['tax_id'] = d['rank'] = ''
                writer.writerow(d)

        # Little helper for us to safely pass through the count and non-count
        # information from each result: per-specimen values are dicts keyed
        # by count name, while the taxa columns are plain strings — indexing
        # a string with a count name raises TypeError, so pass it through.
        def count_or_taxdata(v):
            try:
                return v[count]
            except TypeError:
                return v

        # Write in the specified counts
        for row in by_specimen_results:
            row_dict = dict((col, count_or_taxdata(v)) for col, v in row.iteritems())
            writer.writerow(row_dict)


def main():
    """Parse command-line arguments and dispatch to the tabulation routines."""
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")

    # INPUTS
    parser = argparse.ArgumentParser(description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('database', type=sqlite3.connect,
        help='sqlite database (output of `rppr prep_db` after `guppy classify`)')
    parser.add_argument('-m', '--specimen-map', type=argparse.FileType('r'), metavar='CSV',
        help='input CSV map from sequences to specimens (headerless; 1st col sequence id, 2nd specimen name')
    parser.add_argument('-M', '--metadata-map', type=argparse.FileType('r'), metavar='CSV',
        help="""input CSV map including a specimen column and other metadata; if specified gets merged in with
        whichever of freqs_wide and tallies_wide outputs are specified (must include header)""")
    parser.add_argument('-r', '--rank', default='species', metavar='RANK',
        help='rank at which results are to be tabulated (default: %(default)s)')

    # OUTPUTS
    parser.add_argument('by_taxon', type=argparse.FileType('w'),
        help='output CSV file with counts by taxon')
    parser.add_argument('-s', '--by-specimen', type=argparse.FileType('w'), nargs='?',
        help='optional output CSV file with counts by specimen and taxa (requires specimen map)')
    parser.add_argument('-f', '--freqs-wide', type=argparse.FileType('w'), nargs='?',
        help="""optional output CSV file where rows are taxa, columns are specimens, and entries are
        frequencies (requires specimen map)""")
    parser.add_argument('-t', '--tallies-wide', type=argparse.FileType('w'), nargs='?',
        help="""optional output CSV file where rows are taxa, columns are specimens, and entries are
        tallies (requires specimen map)""")

    args = parser.parse_args()
    # Every per-specimen output needs the sequence-to-specimen mapping.
    wants_specimen_output = args.by_specimen or args.freqs_wide or args.tallies_wide
    if wants_specimen_output and not args.specimen_map:
        parser.error('specimen map is required for by_specimen, freqs_wide and tallies_wide output')

    if args.metadata_map:
        log.info('reading metadata map')
        with args.metadata_map:
            args.metadata = {row['specimen']: row
                             for row in csv.DictReader(args.metadata_map)}

    if not args.specimen_map:
        # Without a specimen map only the plain by_taxon output is possible.
        by_taxon(args)
        return

    log.info('populating specimens table from specimen map')
    cursor = args.database.cursor()
    cursor.execute("CREATE TEMPORARY TABLE specimens (name, specimen, PRIMARY KEY (name, specimen))")
    with args.specimen_map:
        cursor.executemany("INSERT INTO specimens VALUES (?, ?)",
                           csv.reader(args.specimen_map))
    by_specimen_results, specimens = by_specimen(args)

    # By specimen wide output
    if args.freqs_wide:
        by_specimen_wide(args, args.freqs_wide, by_specimen_results, specimens, "freq")
    if args.tallies_wide:
        by_specimen_wide(args, args.tallies_wide, by_specimen_results, specimens, "tally")
    # By taxon computed from by_specimen results
    by_taxon_with_specimens(args, by_specimen_results, specimens)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()