Fix for annotations function, fix for x-axis labels on clusters plot
sandyjmacdonald committed Jul 17, 2015
1 parent 1d9e7de commit a0c3592
Showing 4 changed files with 68 additions and 41 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@
dots_backend/db
dots_backend/scratch.py
sample_data_two
build
dist
dots_for_microarrays.egg-info
20 changes: 19 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# Dots

[![Build Status](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays.svg?branch=develop)](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays)
[![Build Status](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays.svg?branch=master)](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays) [![Coverage Status](https://coveralls.io/repos/sandyjmacdonald/dots_for_microarrays/badge.svg?branch=master&service=github)](https://coveralls.io/github/sandyjmacdonald/dots_for_microarrays?branch=master)

Dots is a Python package for working with microarray data.
Its back-end is a standalone package for reading in, normalisation, statistical
@@ -206,6 +206,24 @@ experiment = experiment.remove_sample('treated_1')
This method will be of more use once the quality control features are added, allowing you to remove
samples that are of low quality before proceeding with the analysis and plotting.

### The read_annotations function

For array files that lack gene symbols or descriptions, you can pass in an annotations file that
you downloaded from the Agilent website or from the NCBI GEO database. These annotations will be
added to your Array and Experiment class instances.

The `read_annotations` function takes a filename as an argument and returns a dictionary of the
annotations, keyed by probe name, with each value being a dictionary of that probe's `GeneName`
and `Description`.
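
For example, here is a minimal sketch of looking up a probe in the returned dictionary (it assumes `read_annotations` is imported from `dots_backend.dots_arrays`; the probe name is just a made-up placeholder):

```python
from dots_backend.dots_arrays import read_annotations

## Read the annotations file into a dictionary keyed by probe name.
annotations = read_annotations('annotations.txt')

## Each value is a dictionary with 'GeneName' and 'Description' keys.
probe = 'A_23_P100001'  ## hypothetical probe name
print(annotations[probe]['GeneName'])
print(annotations[probe]['Description'])
```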

You can read them in as part of your `read_experiment` call as follows:

```python
experiment = read_experiment(array_filenames, baseline=True, annotations_file='annotations.txt')
```

Note that the `annotations_file` argument is optional; if your array files already contain gene
names and descriptions, you don't need to supply one.
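
If you are working with a single array rather than a whole experiment, the `read_array` function in `dots_backend/dots_arrays.py` takes the same keyword argument. A rough sketch (the file names, group name and replicate number here are placeholders):

```python
from dots_backend.dots_arrays import read_array

## Read one raw array file, attaching gene names and descriptions from the annotations file.
array = read_array('treated_1.txt', 'treated', 1, annotations_file='annotations.txt')
print(array.get_genenames()[:5])
```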

## The dots_analysis module

This is the meat of the dots_backend.
53 changes: 29 additions & 24 deletions dots_backend/dots_arrays.py
@@ -34,27 +34,27 @@ def __init__(self, df, filename, group, replicate):
self.group = group
self.replicate = int(replicate)
self.sampleid = group + '_' + str(replicate)

def get_probenames(self):
'''Return a list of the probe names from the array.'''

return self.df['ProbeName'].tolist()

def get_genenames(self):
'''Return a list of the gene names from the array.'''

return self.df['GeneName'].tolist()

def get_systematicnames(self):
'''Return a list of the systematic names from the array.'''

return self.df['SystematicName'].tolist()

def get_descriptions(self):
'''Return a list of the descriptions from the array.'''

return self.df['Description'].tolist()

def get_intensities(self):

'''Return a list of the intensity values from the array.'''
@@ -96,10 +96,10 @@ def __init__(self, arrays):

self.arrays = arrays
master_array = self.arrays[0].df[['FeatureNum', 'ProbeName', 'GeneName', 'SystematicName', 'Description', 'gProcessedSignal_log2_shifted']].copy()

## Rename the column containing the intensity values to the sample ID.
master_array.rename(columns={'gProcessedSignal_log2_shifted': self.arrays[0].sampleid}, inplace=True)

## Pull out the group names and set the groups attribute.
self.groups = list(set([array.group for array in self.arrays]))

@@ -176,21 +176,25 @@ def remove_sample(self, sampleid):

## Functions ##

def read_array(filename, group, replicate, annotations=None):
def read_array(filename, group, replicate, annotations_file=None):
'''Read in a raw array data file and return an Array instance.
Args:
filename (str): The filename of the raw array data file.
group (str): The name of the group to which this array belongs.
replicate (int): The replicate number within the group.
annotations_file (str): The filename of the annotations file.
'''

f = open(filename, 'r')
lines = f.readlines()
values = {}
headers = lines[9].rstrip().split('\t')


if annotations_file is not None:
annotations = read_annotations(annotations_file)

## Compile a list of indices for the various items that will be pulled out of the file.
for i, h in enumerate(headers):
if h == 'FeatureNum':
@@ -208,19 +212,19 @@ def read_array(filename, group, replicate, annotations=None):
elif h == 'ControlType':
control_type_ind = i

## Split the lines up and use the indices to pull out the values and add them
## Split the lines up and use the indices to pull out the values and add them
## to the values dictionary.
for l in lines[10:]:
l = l.rstrip().split('\t')
if l[control_type_ind] == '0':
if 'GeneName' in headers and 'Description' in headers:
if 'GeneName' in headers and 'Description' in headers:
values[l[feature_num_ind]] = { 'FeatureNum': l[feature_num_ind], \
'ProbeName': l[probe_name_ind], \
'GeneName': l[gene_name_ind], \
'SystematicName': l[sys_name_ind], \
'Description': l[desc_name_ind], \
'gProcessedSignal': l[signal_ind] }
elif annotations is not None:
elif annotations_file is not None:
values[l[feature_num_ind]] = { 'FeatureNum': l[feature_num_ind], \
'ProbeName': l[probe_name_ind], \
'GeneName': annotations[l[probe_name_ind]]['GeneName'], \
@@ -234,52 +238,53 @@ def read_array(filename, group, replicate, annotations=None):
'SystematicName': l[sys_name_ind], \
'Description': '', \
'gProcessedSignal': l[signal_ind] }

## Use the convenient Pandas function to create a data frame from a dictionary.
array_df = pd.DataFrame.from_dict(values, orient='index')

return Array(array_df, filename, group, replicate)

def read_experiment(array_filenames, baseline=False, annotations=None):
def read_experiment(array_filenames, baseline=False, annotations_file=None):
'''Read in a series of array data files and return an Experiment instance.
Args:
array_filenames (list): List of filenames of the raw array data files.
baseline (Boolean): Set the baseline to median?
annotations_file (str): The filename of the annotations file.
'''

arrays = []

## Loop through the array files, get the group and replicate, create a
## normalised Array instance, and then read all of the arrays into an
## Experiment instance with the baseline set to the median.
for a in array_filenames:
group, replicate = a.split('/')[-1].split('.')[0].split('_')
replicate = int(replicate)
norm_array = read_array(a, group, replicate, annotations).normalise()
norm_array = read_array(a, group, replicate, annotations_file).normalise()
arrays.append(norm_array)

experiment = Experiment(arrays)
if baseline is True:
experiment = experiment.baseline_to_median()

return experiment

def read_annotations(annotation_file):
def read_annotations(annotations_file):
'''Read in an Agilent annotation file and return a dictionary
of annotations.
Args:
annotation_file (str): The filename of the annotation file.
annotations_file (str): The filename of the annotations file.
'''

f = open(annotation_file, 'r')
f = open(annotations_file, 'r')
lines = [l for l in f.readlines() if not l.startswith('#') and not l.startswith('^')]
headers = lines[0].rstrip().split('\t')

## Compile a list of indices for the various items that will be pulled out of the file.
## Compile a list of indices for the various items that will be pulled out of the file.
for i, h in enumerate(headers):
if h == 'ProbeName' or h == 'NAME':
probe_name_ind = i
@@ -293,10 +298,10 @@ def read_annotations(annotation_file):
## Dictionary to which annotations are added.
annotations = {}

## Add the annotations to the dictionary.
## Add the annotations to the dictionary.
for l in lines[1:]:
l = l.rstrip().split('\t')
if l[control_type_ind] not in ['pos', 1, 'ignore', 'neg']:
annotations[l[probe_name_ind]] = {'GeneName': l[gene_name_ind], 'Description': l[desc_name_ind]}

return annotations
return annotations
33 changes: 17 additions & 16 deletions dots_backend/dots_plotting.py
@@ -147,7 +147,7 @@ def do_boxplot(experiment, show=False, image=False, html_file='boxplot.html'):
boxplot = create_standard_plot(h=600, w=900, x_range=group_names)
boxplot.xgrid.grid_line_color = None
boxplot.xaxis.major_label_orientation = np.pi/3

qmin = groups.quantile(q=0.00)
qmax = groups.quantile(q=1.00)

@@ -257,7 +257,7 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca
sig_cols = [x for x in stats.columns.values if 'significant' in x]
stats = stats[['FeatureNum', 'p_val', 'p_val_adj'] + sig_cols].copy()

## Merge the fold changes and stats data frames, create new columns for
## Merge the fold changes and stats data frames, create new columns for
## -log 10 adj. p value and spot colours.
merged_df = pd.merge(fcs, stats, on = 'FeatureNum')
fc_cols = [x for x in merged_df.columns.values if 'logFC' in x]
@@ -284,11 +284,11 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca
volcanoplot.xaxis.axis_label = 'log 2 fold change, ' + column
volcanoplot.yaxis.axis_label = '- log 10 p value'
volcanoplot.circle(merged_df[column], merged_df['neg_log_10_p_val'], color=merged_df['colour'], fill_alpha=0.4, line_alpha=0, size=5, source=source)

## Set up the hover tooltips.
hover = volcanoplot.select(dict(type=bm.HoverTool))
hover.tooltips = [('log 2 FC', '@logfc'), ('adj. p value', '@pval'), ('gene', '@gene')]

## Shows the plot.
if show == True:
bp.show(volcanoplot)
@@ -297,7 +297,7 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca

if image == True:
render_plot_to_png(html_file, height=600, width=600, crop='top')

return html_file

def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):
@@ -312,7 +312,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):
'''

## Creates a new data frame with normalised expression data and cluster
## Creates a new data frame with normalised expression data and cluster
## numbers from hierarchical clustering.
cluster_df = get_clusters(experiment, how='hierarchical')

@@ -324,7 +324,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):
colourmap = RdYlGn_11.hex_colors[::-1]
limit = np.abs(vals_only.values).max()
val_to_colour = lambda x: int((x + limit) * (len(colourmap)/(2*limit))) - 1

## Empty lists to which we can add the data for the heatmap.
vals = []
colour = []
@@ -380,7 +380,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'):

if image == True:
render_plot_to_png(html_file, height=height, width=600, crop='top')

return html_file

def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot.html'):
@@ -395,7 +395,7 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot
'''

## Creates a new data frame with normalised expression data and cluster
## Creates a new data frame with normalised expression data and cluster
## numbers from k-means clustering.
cluster_df = get_clusters(experiment, how='kmeans')
samples = experiment.get_sampleids()
@@ -411,11 +411,12 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot
vals = cluster[samples]
xvals = vals.columns.values.tolist()
yvals = vals.values.tolist()
clusterplot = create_standard_plot(tools='save')
clusterplot = create_standard_plot(tools='save', x_range=xvals)
clusterplot.xaxis.major_label_orientation = np.pi/3
clusterplot.yaxis.axis_label = 'normalised expression'

for y in yvals:
clusterplot.line(range(len(xvals)), y, legend=False)
clusterplot.line(xvals, y, legend=False)

plots.append([clusterplot])

@@ -432,11 +433,11 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot

if image == True:
render_plot_to_png(html_file, height=height, width=600, crop='side')

return html_file

def render_plot_to_png(html_file, height=600, width=600, crop='top'):
'''Render a Bokeh plot to png file and crop out top toolbar,
'''Render a Bokeh plot to png file and crop out top toolbar,
using PhantomJS.
Args:
@@ -477,5 +478,5 @@ def render_plot_to_png(html_file, height=600, width=600, crop='top'):
## Clean up.
os.remove(png_file)
os.remove(js_file)
return out_file

return out_file
