diff --git a/.gitignore b/.gitignore index a4b5daa..3a1656a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ dots_backend/db dots_backend/scratch.py sample_data_two +build +dist +dots_for_microarrays.egg-info diff --git a/README.md b/README.md index 3464af6..c3bc2ba 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Dots -[![Build Status](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays.svg?branch=develop)](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays) +[![Build Status](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays.svg?branch=master)](https://travis-ci.org/sandyjmacdonald/dots_for_microarrays) [![Coverage Status](https://coveralls.io/repos/sandyjmacdonald/dots_for_microarrays/badge.svg?branch=master&service=github)](https://coveralls.io/github/sandyjmacdonald/dots_for_microarrays?branch=master) Dots is a Python package for working with microarray data. Its back-end is a standalone package for reading in, normalisation, statistical @@ -206,6 +206,24 @@ experiment = experiment.remove_sample('treated_1') This method will be of more use once the quality control features are added, allowing you to remove samples that are of low quality before proceeding with the analysis and plotting. +### The read_annotations function + +For array files that lack gene symbols or descriptions, you can pass in an annotations file that +you downloaded from the Agilent website or from the NCBI GEO database. These annotations will be +added to your Array and Experiment class instances. + +The `read_annotations` function takes a filename as an argument and returns a dictionary of the +annotations with the key being the probename and the values being a dictionary of `GeneName` and +`Description` + +You can read them in as part of your `read_experiment` call as follows: + +```python +experiment = read_experiment(array_filenames, baseline=True, annotations_file='annotations.txt') +``` + +Note that the `annotations_file` is an + ## The dots_analysis module This is the meat of the dots_backend. diff --git a/dots_backend/dots_arrays.py b/dots_backend/dots_arrays.py index c5c360d..76567e4 100644 --- a/dots_backend/dots_arrays.py +++ b/dots_backend/dots_arrays.py @@ -34,27 +34,27 @@ def __init__(self, df, filename, group, replicate): self.group = group self.replicate = int(replicate) self.sampleid = group + '_' + str(replicate) - + def get_probenames(self): '''Return a list of the probe names from the array.''' return self.df['ProbeName'].tolist() - + def get_genenames(self): '''Return a list of the gene names from the array.''' return self.df['GeneName'].tolist() - + def get_systematicnames(self): '''Return a list of the systematics names from the array.''' return self.df['SystematicName'].tolist() - + def get_descriptions(self): '''Return a list of the descriptions from the array.''' return self.df['Description'].tolist() - + def get_intensities(self): '''Return a list of the intensity values from the array.''' @@ -96,10 +96,10 @@ def __init__(self, arrays): self.arrays = arrays master_array = self.arrays[0].df[['FeatureNum', 'ProbeName', 'GeneName', 'SystematicName', 'Description', 'gProcessedSignal_log2_shifted']].copy() - + ## Rename the column containing the intesity values with the sample ID. master_array.rename(columns={'gProcessedSignal_log2_shifted': self.arrays[0].sampleid}, inplace=True) - + ## Pull out the group names and set the groups attribute. self.groups = list(set([array.group for array in self.arrays])) @@ -176,13 +176,14 @@ def remove_sample(self, sampleid): ## Functions ## -def read_array(filename, group, replicate, annotations=None): +def read_array(filename, group, replicate, annotations_file=None): '''Read in a raw array data file and return an Array instance. Args: filename (str): The filename of the raw array data file. group (str): The name of the group to which this array belongs. replicate (int): The replicate number within the group. + annotations_file (str): The filename of the annotations file. ''' @@ -190,7 +191,10 @@ def read_array(filename, group, replicate, annotations=None): lines = f.readlines() values = {} headers = lines[9].rstrip().split('\t') - + + if annotations_file is not None: + annotations = read_annotations(annotations_file) + ## Compile a list of indices for the various items that will be pulled out of the file. for i, h in enumerate(headers): if h == 'FeatureNum': @@ -208,19 +212,19 @@ def read_array(filename, group, replicate, annotations=None): elif h == 'ControlType': control_type_ind = i - ## Split the lines up and use the indices to pull out the values and add them + ## Split the lines up and use the indices to pull out the values and add them ## to the values dictionary. for l in lines[10:]: l = l.rstrip().split('\t') if l[control_type_ind] == '0': - if 'GeneName' in headers and 'Description' in headers: + if 'GeneName' in headers and 'Description' in headers: values[l[feature_num_ind]] = { 'FeatureNum': l[feature_num_ind], \ 'ProbeName': l[probe_name_ind], \ 'GeneName': l[gene_name_ind], \ 'SystematicName': l[sys_name_ind], \ 'Description': l[desc_name_ind], \ 'gProcessedSignal': l[signal_ind] } - elif annotations is not None: + elif annotations_file is not None: values[l[feature_num_ind]] = { 'FeatureNum': l[feature_num_ind], \ 'ProbeName': l[probe_name_ind], \ 'GeneName': annotations[l[probe_name_ind]]['GeneName'], \ @@ -234,52 +238,53 @@ def read_array(filename, group, replicate, annotations=None): 'SystematicName': l[sys_name_ind], \ 'Description': '', \ 'gProcessedSignal': l[signal_ind] } - + ## Use the convenient Pandas function to create a data frame from a dictionary. array_df = pd.DataFrame.from_dict(values, orient='index') return Array(array_df, filename, group, replicate) -def read_experiment(array_filenames, baseline=False, annotations=None): +def read_experiment(array_filenames, baseline=False, annotations_file=None): '''Read in a series of array data files and return an Experiment instance. Args: array_filenames (list): List of filenames of the raw array data files. baseline (Boolean): Set the baseline to median? + annotations_file (str): The filename of the annotations file. ''' - + arrays = [] - + ## Loop through the array files, get the group and replicate, create a ## normalised Array instance, and then read all of the arrays into an ## Experiment instance with the baseline set to the median. for a in array_filenames: group, replicate = a.split('/')[-1].split('.')[0].split('_') replicate = int(replicate) - norm_array = read_array(a, group, replicate, annotations).normalise() + norm_array = read_array(a, group, replicate, annotations_file).normalise() arrays.append(norm_array) experiment = Experiment(arrays) if baseline is True: experiment = experiment.baseline_to_median() - + return experiment -def read_annotations(annotation_file): +def read_annotations(annotations_file): '''Read in an Agilent annotation file and return a dictionary of annotations. Args: - annotation_file (str): The filename of the annotation file. + annotations_file (str): The filename of the annotations file. ''' - f = open(annotation_file, 'r') + f = open(annotations_file, 'r') lines = [l for l in f.readlines() if not l.startswith('#') and not l.startswith('^')] headers = lines[0].rstrip().split('\t') - ## Compile a list of indices for the various items that will be pulled out of the file. + ## Compile a list of indices for the various items that will be pulled out of the file. for i, h in enumerate(headers): if h == 'ProbeName' or h == 'NAME': probe_name_ind = i @@ -293,10 +298,10 @@ def read_annotations(annotation_file): ## Dictionary to which annotations are added. annotations = {} - ## Add the annotations to the dictionary. + ## Add the annotations to the dictionary. for l in lines[1:]: l = l.rstrip().split('\t') if l[control_type_ind] not in ['pos', 1, 'ignore', 'neg']: annotations[l[probe_name_ind]] = {'GeneName': l[gene_name_ind], 'Description': l[desc_name_ind]} - return annotations \ No newline at end of file + return annotations diff --git a/dots_backend/dots_plotting.py b/dots_backend/dots_plotting.py index 376f9cf..43b55a2 100644 --- a/dots_backend/dots_plotting.py +++ b/dots_backend/dots_plotting.py @@ -147,7 +147,7 @@ def do_boxplot(experiment, show=False, image=False, html_file='boxplot.html'): boxplot = create_standard_plot(h=600, w=900, x_range=group_names) boxplot.xgrid.grid_line_color = None boxplot.xaxis.major_label_orientation = np.pi/3 - + qmin = groups.quantile(q=0.00) qmax = groups.quantile(q=1.00) @@ -257,7 +257,7 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca sig_cols = [x for x in stats.columns.values if 'significant' in x] stats = stats[['FeatureNum', 'p_val', 'p_val_adj'] + sig_cols].copy() - ## Merge the fold changes and stats data frames, create new columns for + ## Merge the fold changes and stats data frames, create new columns for ## -log 10 adj. p value and spot colours. merged_df = pd.merge(fcs, stats, on = 'FeatureNum') fc_cols = [x for x in merged_df.columns.values if 'logFC' in x] @@ -284,11 +284,11 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca volcanoplot.xaxis.axis_label = 'log 2 fold change, ' + column volcanoplot.yaxis.axis_label = '- log 10 p value' volcanoplot.circle(merged_df[column], merged_df['neg_log_10_p_val'], color=merged_df['colour'], fill_alpha=0.4, line_alpha=0, size=5, source=source) - + ## Set up the hover tooltips. hover = volcanoplot.select(dict(type=bm.HoverTool)) hover.tooltips = [('log 2 FC', '@logfc'), ('adj. p value', '@pval'), ('gene', '@gene')] - + ## Shows the plot. if show == True: bp.show(volcanoplot) @@ -297,7 +297,7 @@ def do_volcanoplot(experiment, groups, show=False, image=False, html_file='volca if image == True: render_plot_to_png(html_file, height=600, width=600, crop='top') - + return html_file def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'): @@ -312,7 +312,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'): ''' - ## Creates a new data frame with nornalised expression data and cluster + ## Creates a new data frame with nornalised expression data and cluster ## numbers from hierarchical clustering. cluster_df = get_clusters(experiment, how='hierarchical') @@ -324,7 +324,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'): colourmap = RdYlGn_11.hex_colors[::-1] limit = np.abs(vals_only.values).max() val_to_colour = lambda x: int((x + limit) * (len(colourmap)/(2*limit))) - 1 - + ## Empty lists to which we can add the data for the heatmap. vals = [] colour = [] @@ -380,7 +380,7 @@ def do_heatmap(experiment, show=False, image=False, html_file='heatmap.html'): if image == True: render_plot_to_png(html_file, height=height, width=600, crop='top') - + return html_file def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot.html'): @@ -395,7 +395,7 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot ''' - ## Creates a new data frame with nornalised expression data and cluster + ## Creates a new data frame with nornalised expression data and cluster ## numbers from k-means clustering. cluster_df = get_clusters(experiment, how='kmeans') samples = experiment.get_sampleids() @@ -411,11 +411,12 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot vals = cluster[samples] xvals = vals.columns.values.tolist() yvals = vals.values.tolist() - clusterplot = create_standard_plot(tools='save') + clusterplot = create_standard_plot(tools='save', x_range=xvals) + clusterplot.xaxis.major_label_orientation = np.pi/3 clusterplot.yaxis.axis_label = 'normalised expression' - + for y in yvals: - clusterplot.line(range(len(xvals)), y, legend=False) + clusterplot.line(xvals, y, legend=False) plots.append([clusterplot]) @@ -432,11 +433,11 @@ def do_clusters_plot(experiment, show=True, image=False, html_file='clustersplot if image == True: render_plot_to_png(html_file, height=height, width=600, crop='side') - + return html_file def render_plot_to_png(html_file, height=600, width=600, crop='top'): - '''Render a Bokeh plot to png file and crop out top toolbar, + '''Render a Bokeh plot to png file and crop out top toolbar, using PhantomJS. Args: @@ -477,5 +478,5 @@ def render_plot_to_png(html_file, height=600, width=600, crop='top'): ## Clean up. os.remove(png_file) os.remove(js_file) - - return out_file \ No newline at end of file + + return out_file