Fix for annotations function, fix for x-axis labels on clusters plot
sandyjmacdonald committed Jul 17, 2015
1 parent 1d9e7de commit a0c3592
Showing 4 changed files with 68 additions and 41 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@
dots_backend/db
dots_backend/scratch.py
sample_data_two
build
dist
dots_for_microarrays.egg-info
20 changes: 19 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# Dots

[![Build Status](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays.svg?branch=develop)](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays)
[![Build Status](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays.svg?branch=master)](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays) [![Coverage Status](https://coveralls.io/repos/sandyjmacdonald/dots_for_microarrays/badge.svg?branch=master&service=github)](https://coveralls.io/github/sandyjmacdonald/dots_for_microarrays?branch=master)

Dots is a Python package for working with microarray data.
Its back-end is a standalone package for reading in, normalisation, statistical
@@ -206,6 +206,24 @@ experiment = experiment.remove_sample('treated_1')
This method will be of more use once the quality control features are added, allowing you to remove
samples that are of low quality before proceeding with the analysis and plotting.

### The read_annotations function

For array files that lack gene symbols or descriptions, you can pass in an annotations file that
you downloaded from the Agilent website or from the NCBI GEO database. These annotations will be
added to your Array and Experiment class instances.

The `read_annotations` function takes a filename as an argument and returns a dictionary of the
annotations, keyed by probe name, with each value being a dictionary of that probe's `GeneName`
and `Description`.
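
For example, here is a minimal sketch of looking up a probe in the returned dictionary (it assumes `read_annotations` is imported from `dots_backend.dots_arrays`; the probe name is just a made-up placeholder):

```python
from dots_backend.dots_arrays import read_annotations

## Read the annotations file into a dictionary keyed by probe name.
annotations = read_annotations('annotations.txt')

## Each value is a dictionary with 'GeneName' and 'Description' keys.
probe = 'A_23_P100001'  ## hypothetical probe name
print(annotations[probe]['GeneName'])
print(annotations[probe]['Description'])
```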

You can read them in as part of your `read_experiment` call as follows:

```python
experiment = read_experiment(array_filenames, baseline=True, annotations_file='annotations.txt')
```

Note that the `annotations_file` argument is optional; if your array files already contain gene
names and descriptions, you don't need to supply one.
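
If you are working with a single array rather than a whole experiment, the `read_array` function in `dots_backend/dots_arrays.py` takes the same keyword argument. A rough sketch (the file names, group name and replicate number here are placeholders):

```python
from dots_backend.dots_arrays import read_array

## Read one raw array file, attaching gene names and descriptions from the annotations file.
array = read_array('treated_1.txt', 'treated', 1, annotations_file='annotations.txt')
print(array.get_genenames()[:5])
```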

## The dots_analysis module

This is the meat of the dots_backend.
53 changes: 29 additions & 24 deletions dots_backend/dots_arrays.py
@@ -34,27 +34,27 @@ def __init__(self, df, filename, group, replicate):
self.group = group
self.replicate = int(replicate)
self.sampleid = group + '_' + str(replicate)

def get_probenames(self):
'''Return a list of the probe names from the array.'''

return self.df['ProbeName'].tolist()

def get_genenames(self):
'''Return a list of the gene names from the array.'''

return self.df['GeneName'].tolist()

def get_systematicnames(self):
'''Return a list of the systematic names from the array.'''

return self.df['SystematicName'].tolist()

def get_descriptions(self):
'''Return a list of the descriptions from the array.'''

return self.df['Description'].tolist()

def get_intensities(self):

'''Return a list of the intensity values from the array.'''
@@ -96,10 +96,10 @@ def __init__(self, arrays):

self.arrays = arrays
master_array = self.arrays[0].df[['FeatureNum', 'ProbeName', 'GeneName', 'SystematicName', 'Description', 'gProcessedSignal_log2_shifted']].copy()

## Rename the column containing the intensity values to the sample ID.
master_array.rename(columns={'gProcessedSignal_log2_shifted': self.arrays[0].sampleid}, inplace=True)

## Pull out the group names and set the groups attribute.
self.groups = list(set([array.group for array in self.arrays]))

@@ -176,21 +176,25 @@ def remove_sample(self, sampleid):

## Functions ##

def read_array(filename, group, replicate, annotations=None):
def read_array(filename, group, replicate, annotations_file=None):
'''Read in a raw array data file and return an Array instance.
Args:
filename (str): The filename of the raw array data file.
group (str): The name of the group to which this array belongs.
replicate (int): The replicate number within the group.
annotations_file (str): The filename of the annotations file.
'''

f = open(filename, 'r')
lines = f.readlines()
values = {}
headers = lines[9].rstrip().split('\t')


if annotations_file is not None:
annotations = read_annotations(annotations_file)

## Compile a list of indices for the various items that will be pulled out of the file.
for i, h in enumerate(headers):
if h == 'FeatureNum':
@@ -208,19 +212,19 @@ def read_array(filename, group, replicate, annotations=None):
elif h == 'ControlType':
control_type_ind = i

## Split the lines up and use the indices to pull out the values and add them
## Split the lines up and use the indices to pull out the values and add them
## to the values dictionary.
for l in lines[10:]:
l = l.rstrip().split('\t')
if l[control_type_ind] == '0':
if 'GeneName' in headers and 'Description' in headers:
if 'GeneName' in headers and 'Description' in headers:
values[l[feature_num_ind]] = { 'FeatureNum': l[feature_num_ind], \
'ProbeName': l[probe_name_ind], \
'GeneName': l[gene_name_ind], \
'SystematicName': l[sys_name_ind], \
'Description': l[desc_name_ind], \
'gProcessedSignal': l[signal_ind] }
elif annotations is not None:
elif annotations_file is not None:
values[l[feature_num_ind]] = { 'FeatureNum': l[feature_num_ind], \
'ProbeName': l[probe_name_ind], \
'GeneName': annotations[l[probe_name_ind]]['GeneName'], \
@@ -234,52 +238,53 @@ def read_array(filename, group, replicate, annotations=None):
'SystematicName': l[sys_name_ind], \
'Description': '', \
'gProcessedSignal': l[signal_ind] }

## Use the convenient Pandas function to create a data frame from a dictionary.
array_df = pd.DataFrame.from_dict(values, orient='index')

return Array(array_df, filename, group, replicate)

def read_experiment(array_filenames, baseline=False, annotations=None):
def read_experiment(array_filenames, baseline=False, annotations_file=None):
'''Read in a series of array data files and return an Experiment instance.
Args:
array_filenames (list): List of filenames of the raw array data files.
baseline (Boolean): Set the baseline to median?
annotations_file (str): The filename of the annotations file.
'''

arrays = []

## Loop through the array files, get the group and replicate, create a
## normalised Array instance, and then read all of the arrays into an
## Experiment instance with the baseline set to the median.
for a in array_filenames:
group, replicate = a.split('/')[-1].split('.')[0].split('_')
replicate = int(replicate)
norm_array = read_array(a, group, replicate, annotations).normalise()
norm_array = read_array(a, group, replicate, annotations_file).normalise()
arrays.append(norm_array)

experiment = Experiment(arrays)
if baseline is True:
experiment = experiment.baseline_to_median()

return experiment

def read_annotations(annotation_file):
def read_annotations(annotations_file):
'''Read in an Agilent annotation file and return a dictionary
of annotations.
Args:
annotation_file (str): The filename of the annotation file.
annotations_file (str): The filename of the annotations file.
'''

f = open(annotation_file, 'r')
f = open(annotations_file, 'r')
lines = [l for l in f.readlines() if not l.startswith('#') and not l.startswith('^')]
headers = lines[0].rstrip().split('\t')

## Compile a list of indices for the various items that will be pulled out of the file.
## Compile a list of indices for the various items that will be pulled out of the file.
for i, h in enumerate(headers):
if h == 'ProbeName' or h == 'NAME':
probe_name_ind = i
@@ -293,10 +298,10 @@ def read_annotations(annotation_file):
## Dictionary to which annotations are added.
annotations = {}

## Add the annotations to the dictionary.
## Add the annotations to the dictionary.
for l in lines[1:]:
l = l.rstrip().split('\t')
if l[control_type_ind] not in ['pos', 1, 'ignore', 'neg']:
annotations[l[probe_name_ind]] = {'GeneName': l[gene_name_ind], 'Description': l[desc_name_ind]}

return annotations
return annotations
33 changes: 17 additions & 16 deletions dots_backend/dots_plotting.py
@@ -147,7 +147,7 @@ def do_boxplot(experiment, show=False, image=False, html_file='boxplot.html'):
boxplot = create_standard_plot(h=600, w=900, x_range=group_names)
boxplot.xgrid.grid_line_color = None
boxplot.xaxis.major_label_orientation = np.pi/3

qmin = groups.quantile(q=0.00)
qmax = groups.quantile(q=1.00)

@@ -257,7 +257,7 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca
sig_cols = [x for x in stats.columns.values if 'significant' in x]
stats = stats[['FeatureNum', 'p_val', 'p_val_adj'] + sig_cols].copy()

## Merge the fold changes and stats data frames, create new columns for
## Merge the fold changes and stats data frames, create new columns for
## -log 10 adj. p value and spot colours.
merged_df = pd.merge(fcs, stats, on = 'FeatureNum')
fc_cols = [x for x in merged_df.columns.values if 'logFC' in x]
@@ -284,11 +284,11 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca
volcanoplot.xaxis.axis_label = 'log 2 fold change, ' + column
volcanoplot.yaxis.axis_label = '- log 10 p value'
volcanoplot.circle(merged_df[column], merged_df['neg_log_10_p_val'], color=merged_df['colour'], fill_alpha=0.4, line_alpha=0, size=5, source=source)

## Set up the hover tooltips.
hover = volcanoplot.select(dict(type=bm.HoverTool))
hover.tooltips = [('log 2 FC', '@logfc'), ('adj. p value', '@pval'), ('gene', '@gene')]

## Shows the plot.
if show == True:
bp.show(volcanoplot)
@@ -297,7 +297,7 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca

if image == True:
render_plot_to_png(html_file, height=600, width=600, crop='top')

return html_file

def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):
@@ -312,7 +312,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):
'''

## Creates a new data frame with normalised expression data and cluster
## Creates a new data frame with normalised expression data and cluster
## numbers from hierarchical clustering.
cluster_df = get_clusters(experiment, how='hierarchical')

@@ -324,7 +324,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):
colourmap = RdYlGn_11.hex_colors[::-1]
limit = np.abs(vals_only.values).max()
val_to_colour = lambda x: int((x + limit) * (len(colourmap)/(2*limit))) - 1

## Empty lists to which we can add the data for the heatmap.
vals = []
colour = []
@@ -380,7 +380,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):

if image == True:
render_plot_to_png(html_file, height=height, width=600, crop='top')

return html_file

def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot.html'):
@@ -395,7 +395,7 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot
'''

## Creates a new data frame with normalised expression data and cluster
## Creates a new data frame with normalised expression data and cluster
## numbers from k-means clustering.
cluster_df = get_clusters(experiment, how='kmeans')
samples = experiment.get_sampleids()
@@ -411,11 +411,12 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot
vals = cluster[samples]
xvals = vals.columns.values.tolist()
yvals = vals.values.tolist()
clusterplot = create_standard_plot(tools='save')
clusterplot = create_standard_plot(tools='save', x_range=xvals)
clusterplot.xaxis.major_label_orientation = np.pi/3
clusterplot.yaxis.axis_label = 'normalised expression'

for y in yvals:
clusterplot.line(range(len(xvals)), y, legend=False)
clusterplot.line(xvals, y, legend=False)

plots.append([clusterplot])

@@ -432,11 +433,11 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot

if image == True:
render_plot_to_png(html_file, height=height, width=600, crop='side')

return html_file

def render_plot_to_png(html_file, height=600, width=600, crop='top'):
'''Render a Bokeh plot to png file and crop out top toolbar,
'''Render a Bokeh plot to png file and crop out top toolbar,
using PhantomJS.
Args:
@@ -477,5 +478,5 @@ def render_plot_to_png(html_file, height=600, width=600, crop='top'):
## Clean up.
os.remove(png_file)
os.remove(js_file)
return out_file

return out_file
