diff --git a/.gitignore b/.gitignore index d17f029..cdfb535 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ paper/*.out paper/*.pdf paper/*.run.xml paper/*.tex +paper/*.py diff --git a/mistree/__init__.py b/mistree/__init__.py index f6ab3ca..889372d 100644 --- a/mistree/__init__.py +++ b/mistree/__init__.py @@ -1,5 +1,4 @@ # coordinate utility functions - from .coordinates.coordinate_utility import spherical_2_cartesian from .coordinates.coordinate_utility import celestial_2_cartesian from .coordinates.coordinate_utility import spherical_2_unit_sphere @@ -7,13 +6,11 @@ from .coordinates.coordinate_utility import perpendicular_distance_2_angle # levy flight distributions - from .levy_flight.levy_flight import get_random_flight from .levy_flight.levy_flight import get_levy_flight from .levy_flight.levy_flight import get_adjusted_levy_flight # mst - branch functions - from .mst.branches import get_branch_index from .mst.branches import get_branch_index_sub_divide from .mst.branches import get_branch_end_index @@ -21,34 +18,38 @@ from .mst.branches import get_branch_shape # mst - construction functions - from .mst.construct import construct_mst # mst - density vs variable functions - from .mst.density import variable_vs_density # mst - graph functions - from .mst.graph import graph2data from .mst.graph import data2graph -# mst - scale cut functions +# mst - histogram functions +from .mst.hist_mst import bin_data +from .mst.hist_mst import HistMST + +# mst - plotting functions +from .mst.plot_mst import set_plot_default +from .mst.plot_mst import plot_histogram_line +from .mst.plot_mst import plot_histogram_confidence +from .mst.plot_mst import plot_histogram_error +from .mst.plot_mst import PlotHistMST +# mst - scale cut functions from .mst.scale_cut import graph_scale_cut from .mst.scale_cut import graph_scale_cut from .mst.scale_cut import k_nearest_neighbour_scale_cut # mst - single class functions - from .mst.get_mst_class import GetMST # mst - mst statistical functions - from .mst.stats import get_graph_degree from .mst.stats import get_mean_degree_for_edges from .mst.stats import get_degree_for_edges # mst - tomographic functions - from .mst.tomo import convert_tomo_knn_length2angle diff --git a/mistree/mst/__init__.py b/mistree/mst/__init__.py index 68e7b4e..6b9ca72 100644 --- a/mistree/mst/__init__.py +++ b/mistree/mst/__init__.py @@ -1,5 +1,4 @@ # branch functions - from .branches import get_branch_index from .branches import get_branch_index_sub_divide from .branches import get_branch_end_index @@ -7,34 +6,39 @@ from .branches import get_branch_shape # construction functions - from .construct import construct_mst # density vs variable functions - from .density import variable_vs_density # graph functions - from .graph import graph2data from .graph import data2graph -# scale cut functions +# Histogram MST functions & class +from .hist_mst import bin_data +from .hist_mst import HistMST + +# Plotting MST class +from .plot_mst import set_plot_default +from .plot_mst import plot_histogram_line +from .plot_mst import plot_histogram_confidence +from .plot_mst import plot_histogram_error +from .plot_mst import PlotHistMST + +# scale cut functions from .scale_cut import graph_scale_cut from .scale_cut import graph_scale_cut from .scale_cut import k_nearest_neighbour_scale_cut # single class functions - from .get_mst_class import GetMST # mst statistical functions - from .stats import get_graph_degree from .stats import get_mean_degree_for_edges from .stats import get_degree_for_edges # tomographic functions - from .tomo import convert_tomo_knn_length2angle diff --git a/mistree/mst/hist_mst.py b/mistree/mst/hist_mst.py new file mode 100644 index 0000000..0d37d1d --- /dev/null +++ b/mistree/mst/hist_mst.py @@ -0,0 +1,239 @@ +import numpy as np +import matplotlib.pylab as plt + + +def bin_data(data, minimum=None, maximum=None, bin_size=None, bin_number=100, normalised=True): + """Returns the (normalised) number count of a data set with values within defined bins. + + Parameters + ---------- + data : array_like + The data to be binned. + minimum : float + A given minimum for the bin edges. + maximum : float + A given maximum for the bin edges + bin_size : float + The size of the bins. + bin_number : int + The number of bins to be used. + normalised : bool + Tf true will return a normalised histogram, if false returns a number counts. + + Returns + ------- + bin_centres : array_like + The central value of each bin. + binned_data : array_like + The binned number count (or normalised histogram) of the data set. + """ + if minimum is None: + minimum = np.min(data) + if maximum is None: + maximum = np.max(data) + if bin_size is None: + _bin_edge = np.linspace(minimum, maximum, bin_number+1) + binned_data, _bin_edges = np.histogram(data, bins=_bin_edge, density=normalised) + bin_centres = 0.5 * (_bin_edge[1:] + _bin_edge[:-1]) + return bin_centres, binned_data + + +class HistMST: + + def __init__(self): + """Initialises MST histogram class. + """ + # bin edges information + self.uselog = None + self.use_s_sqrt = None + self.usenorm = None + self.d_min = None + self.d_max = None + self.num_d_bins = None + self.l_min = None + self.l_max = None + self.num_l_bins = None + self.b_min = None + self.b_max = None + self.num_b_bins = None + self.s_min = None + self.s_max = None + self.num_s_bins = None + self.logl_min = None + self.logl_max = None + self.logb_min = None + self.logb_max = None + self.group_mode = False + + def setup(self, uselog=False, use_sqrt_s=True, usenorm=True, d_min=0.5, d_max=6.5, num_d_bins=6, + l_min=0., l_max=None, num_l_bins=100, b_min=0., b_max=None, num_b_bins=100, + s_min=0., s_max=1., num_s_bins=50, logl_min=None, logl_max=None, logb_min=None, logb_max=None): + """Setups bin sizes for the MST statistics. + + Parameters + ---------- + uselog : bool + Determines whether to use log bins for l and b. + use_sqrt_s : bool + Determines whether to use the sqrt(1-s) projection of s or just s itself. + usenorm : bool + Determines whether to normalise the histograms. + d_min : float + Minimum for degree bins (use half integer values). + d_max : float + Maximum for degree bins (use half integer values). + num_d_bins : int + Number of bins for the distribution of degree, this should be equal to d_max - d_min. + l_min : float + Minimum for edge length bins. + l_max : float + Maximum for edge length bins. + num_l_bins : int + Number of bins for the distribution of edge lengths. + b_min : float + Minimum for branch length bins. + b_max : float + Maximum for branch length bins. + num_b_bins : int + Number of bins for the distribution of branch lengths. + s_min : float + Minimum for branch shape bins. + s_max : float + Maximum for branch shape bins. + num_s_bins : int + Number of bins for the distribution of branch shapes. + logl_min : float + Minimum of edge lengths in log to base 10. + logl_max : float + Maximum of edge lengths in log to base 10. + logb_min : float + Minimum of branch lengths in log to base 10. + logb_max : float + Maximum of branch lengths in log to base 10. + """ + self.uselog = uselog + self.use_sqrt_s= use_sqrt_s + self.usenorm = usenorm + self.d_min = d_min + self.d_max = d_max + self.num_d_bins = num_d_bins + self.l_min = l_min + self.l_max = l_max + self.num_l_bins = num_l_bins + self.b_min = b_min + self.b_max = b_max + self.num_b_bins = num_b_bins + self.s_min = s_min + self.s_max = s_max + self.num_s_bins = num_s_bins + self.logl_min = logl_min + self.logl_max = logl_max + self.logb_min = logb_min + self.logb_max = logb_max + + def get_hist(self, d, l, b, s): + """Bins the MST distribution which is returned as a dictionary. + + Parameters + ---------- + d : array_like + Distribution of degree. + l : array_like + Distribution of edge length. + b : array_like + Distribution of branch length. + s : array_like + Distribution of branch shape. + + Returns + ------- + mst_hist : dict + Dictionary of MST binned histograms. + """ + # find minimum and maximum + if self.l_max is None: + self.l_max = 1.05*l.max() + if self.b_max is None: + self.b_max = 1.05*b.max() + if self.logl_min is None: + self.logl_min = np.log10(0.95*l.min()) + if self.logl_max is None: + self.logl_max = np.log10(1.05*l.max()) + if self.logb_min is None: + self.logb_min = np.log10(0.95*b.min()) + if self.logb_max is None: + self.logb_max = np.log10(1.05*b.max()) + # bin mst statistics + x_d, y_d = bin_data(d, minimum=self.d_min, maximum=self.d_max, bin_number=self.num_d_bins, normalised=self.usenorm) + if self.uselog == False: + x_l, y_l = bin_data(l, minimum=self.l_min, maximum=self.l_max, bin_number=self.num_l_bins, normalised=self.usenorm) + x_b, y_b = bin_data(b, minimum=self.b_min, maximum=self.b_max, bin_number=self.num_b_bins, normalised=self.usenorm) + else: + x_logl, y_l = bin_data(np.log10(l), minimum=self.logl_min, maximum=self.logl_max, bin_number=self.num_l_bins, normalised=self.usenorm) + x_logb, y_b = bin_data(np.log10(b), minimum=self.logb_min, maximum=self.logb_max, bin_number=self.num_b_bins, normalised=self.usenorm) + x_l = 10.**x_logl + x_b = 10.**x_logb + if self.use_sqrt_s == False: + x_s, y_s = bin_data(s, minimum=self.s_min, maximum=self.s_max, bin_number=self.num_s_bins, normalised=self.usenorm) + else: + x_s, y_s = bin_data(np.sqrt(1.-s), minimum=self.s_min, maximum=self.s_max, bin_number=self.num_s_bins, normalised=self.usenorm) + mst_hist = { + "uselog" : self.uselog, "use_sqrt_s" : self.use_sqrt_s, "usenorm" : self.usenorm, "isgroup" : False, + "x_d" : x_d, "y_d" : y_d, "x_l" : x_l, "y_l" : y_l, "x_b" : x_b, "y_b" : y_b, "x_s" : x_s, "y_s" : y_s + } + if self.group_mode == True: + self.x_d = mst_hist['x_d'] + self.x_l = mst_hist['x_l'] + self.x_b = mst_hist['x_b'] + self.x_s = mst_hist['x_s'] + self.group_y_d.append(mst_hist['y_d']) + self.group_y_l.append(mst_hist['y_l']) + self.group_y_b.append(mst_hist['y_b']) + self.group_y_s.append(mst_hist['y_s']) + return mst_hist + + def start_group(self): + """Begins group mode for calculating the mean and standard deviation of the MST + from different realisations of data points coming from the same model. + """ + self.group_mode = True + self.group_y_d = [] + self.group_y_l = [] + self.group_y_b = [] + self.group_y_s = [] + + def end_group(self): + """Ends group mode for calculating the mean and standard deviation of the MST. + + Returns + ------- + mst_hist : dict + Dictionary of the mean and standard deviation of the MST binned histograms. + """ + self.group_mode = False + self.group_y_d = np.array(self.group_y_d) + self.group_y_l = np.array(self.group_y_l) + self.group_y_b = np.array(self.group_y_b) + self.group_y_s = np.array(self.group_y_s) + y_d_mean = np.mean(self.group_y_d, axis=0) + y_l_mean = np.mean(self.group_y_l, axis=0) + y_b_mean = np.mean(self.group_y_b, axis=0) + y_s_mean = np.mean(self.group_y_s, axis=0) + y_d_std = np.std(self.group_y_d, axis=0) + y_l_std = np.std(self.group_y_l, axis=0) + y_b_std = np.std(self.group_y_b, axis=0) + y_s_std = np.std(self.group_y_s, axis=0) + mst_hist = { + "uselog" : self.uselog, "use_sqrt_s" : self.use_sqrt_s, + "usenorm" : self.usenorm, "isgroup" : True, + "x_d" : self.x_d, "y_d" : y_d_mean, "y_d_std" : y_d_std, + "x_l" : self.x_l, "y_l" : y_l_mean, "y_l_std" : y_l_std, + "x_b" : self.x_b, "y_b" : y_b_mean, "y_b_std" : y_b_std, + "x_s" : self.x_s, "y_s" : y_s_mean, "y_s_std" : y_s_std, + } + return mst_hist + + def clean(self): + """Resets HistMST variables. + """ + self.__init__() diff --git a/mistree/mst/plot_mst.py b/mistree/mst/plot_mst.py new file mode 100644 index 0000000..d14944f --- /dev/null +++ b/mistree/mst/plot_mst.py @@ -0,0 +1,623 @@ +import numpy as np +import matplotlib.pylab as plt + +from . import hist_mst + +def set_plot_default(use_bold=True): + """Sets the default fonts for the matplotlib plots. + + Parameters + ---------- + use_bold : bool + Determines whether the fonts used are in bold. + """ + plt.rc('text', usetex=True) + plt.rc('font', family='serif') + if use_bold == True: + plt.rcParams['text.latex.preamble'] = [r'\boldmath'] + + +def plot_histogram_line(x_mid, histogram, x_edges=None, ax=None, color='C0', + alpha=1., linewidth=1., linestyle='-'): + """Outputs the histogram line boxes without using the histogram function. + + Parameters + ---------- + x_mid : array_like + Midpoint of each histogram bar. + histogram : array_like + Histogram height. + x_edges : array_like + The edges of each histogram bar. Ideal for unequal spacings between histograms. + ax : class + The matplotlib plotting axis. + color : str + The color of the histogram error plot. + alpha : float + The transparency fraction of the histogram error plot. + linewidth : float + The linewidth of histogram line. + linestyle : str + The linestyle used. + """ + x_hist = [] + y_hist = [] + if x_edges is None: + _dx = x_mid[1]-x_mid[0] + for i in range(0,len(histogram)): + x_hist.append(x_mid[i] - 0.5*_dx) + x_hist.append(x_mid[i] + 0.5*_dx) + y_hist.append(histogram[i]) + y_hist.append(histogram[i]) + else: + for i in range(0, len(histogram)): + x_hist.append(x_edges[i]) + x_hist.append(x_edges[i+1]) + y_hist.append(histogram[i]) + y_hist.append(histogram[i]) + x_hist, y_hist = np.array(x_hist), np.array(y_hist) + if ax == None: + plt.plot(x_hist, y_hist, color=color, alpha=alpha, + linewidth=linewidth, linestyle=linestyle) + else: + ax.plot(x_hist, y_hist, color=color, alpha=alpha, + linewidth=linewidth, linestyle=linestyle) + + +def plot_histogram_confidence(x_mid, histogram_min, histogram_max, x_edges=None, + ax=None, color='C0', alpha=0.25): + """Plots the confidence envelopes for a histogram. + + Parameters + ---------- + x_mid : array_like + Midpoint of each histogram bar. + histogram_min : array_like + Histogram minimum envelope. + histogram_max : array_like + Histogram maximum envelope. + x_edges : array_like + The edges of each histogram bar. Ideal for unequal spacings between histograms. + ax : class + The matplotlib plotting axis. + color : str + The color of the histogram error plot. + alpha : float + The transparency fraction of the histogram error plot. + """ + if x_edges is None: + _dx = x_mid[1]-x_mid[0] + for i in range(0, len(histogram_min)): + if ax == None: + plt.fill_between(np.array([x_mid[i]-0.5*_dx, x_mid[i]+0.5*_dx]), + np.array([histogram_min[i], histogram_min[i]]), + np.array([histogram_max[i], histogram_max[i]]), + color=color,alpha=alpha) + else: + ax.fill_between(np.array([x_mid[i]-0.5*_dx, x_mid[i]+0.5*_dx]), + np.array([histogram_min[i], histogram_min[i]]), + np.array([histogram_max[i], histogram_max[i]]), + color=color,alpha=alpha) + else: + for i in range(0, len(histogram_min)): + if ax == None: + plt.fill_between(np.array([x_edges[i], x_edges[i+1]]), + np.array([histogram_min[i], histogram_min[i]]), + np.array([histogram_max[i], histogram_max[i]]), + color=color,alpha=alpha) + else: + ax.fill_between(np.array([x_edges[i], x_edges[i+1]]), + np.array([histogram_min[i], histogram_min[i]]), + np.array([histogram_max[i], histogram_max[i]]), + color=color,alpha=alpha) + + +def plot_histogram_error(x_mid, histogram, histogram_error, x_edges=None, ax=None, + color='C0', alpha=0.25): + """Plots the errorbars envelopes for a histogram. + + Parameters + ---------- + x_mid : array_like + Midpoint of each histogram bar. + histogram : array_like + Histogram. + histogram_error : array_like + The error associated to each histogram bar. + x_edges : array_like + The edges of each histogram bar. Ideal for unequal spacings between histograms. + ax : class + The matplotlib plotting axis. + color : str + The color of the histogram error plot. + alpha : float + The transparency fraction of the histogram error plot. + """ + plot_histogram_confidence(x_mid, histogram-histogram_error, histogram+histogram_error, + x_edges=x_edges, ax=ax, color=color, alpha=alpha) + + +class PlotHistMST: + + def _get_rotate_colors(self): + """Cycles through the default matplotlib colours, i.e. C0 - C9. + """ + color = 'C'+str(self.rotate_colors) + if self.rotate_colors + 1 == 10: + self.rotate_colors = 0 + else: + self.rotate_colors += 1 + return color + + def _get_rotate_linestyles(self): + """Cycles through linestyles "-", "--", "-." and ":". + """ + linestyle = self.linestyle_all[self.rotate_linestyle] + if self.rotate_linestyle + 1 == 4: + self.rotate_linestyle = 0 + else: + self.rotate_linestyle += 1 + return linestyle + + def __init__(self): + """Initialises the plotting class. + """ + # tracking data + self.num_data = 0 + # line information + self.colors = [] + self.alphas = [] + self.alphas_envelope = [] + self.labels = [] + self.linestyles = [] + self.linewidths = [] + self.binned_data = [] + self.need_envelopes = [] + # colors and styles + self.rotate_colors = 0 + self.rotate_linestyle = 0 + self.linestyle_all = ['-', '--', '-.', ':'] + # Type of data used + self.use_sqrt_s = None + self.uselog = None + # Binning information + self.d_min = None + self.d_max = None + self.l_min = None + self.l_max = None + self.b_min = None + self.b_max = None + self.s_min = None + self.s_max = None + self.l_edges = None + self.b_edges = None + self.s_edges = None + self.usenorm = None + + def read_mst(self, mst_hist, color=None, linewidth=2., linestyle=None, alpha=0.8, + label=None, alpha_envelope=0.3): + """Input minimum spanning tree statistics. + + Parameters + ---------- + mst_hist : dict + Binned MST dictionary, given in the format outputted by HistMST. + color : str + Color of the MST histogram. + linewidth : float + Width of line used. + linestyle : str + Linestyle used. + alpha : float + The transparency of the line. + label : str + label used in the legend, this will only appear in the legend if this is given. + alpha_envelope : float + The transparency of envelopes (usually the standard deviation of the counts in each bin). + """ + if self.usenorm is None: + self.usenorm = mst_hist['usenorm'] + self.binned_data.append(mst_hist) + self.num_data += 1 + delta_d = mst_hist['x_d'][1]-mst_hist['x_d'][0] + self.d_min = mst_hist['x_d'][0] - delta_d/2. + self.d_max = mst_hist['x_d'][-1] + delta_d/2. + if mst_hist['uselog'] == False: + self.uselog = False + delta_l = mst_hist['x_l'][1]-mst_hist['x_l'][0] + self.l_min = mst_hist['x_l'][0] - delta_l/2. + self.l_max = mst_hist['x_l'][-1] + delta_l/2. + delta_b = mst_hist['x_b'][1]-mst_hist['x_l'][0] + self.b_min = mst_hist['x_b'][0] - delta_b/2. + self.b_max = mst_hist['x_b'][-1] + delta_b/2. + self.l_edges = np.linspace(self.l_min, self.l_max, len(mst_hist['x_l'])+1) + self.b_edges = np.linspace(self.b_min, self.b_max, len(mst_hist['x_b'])+1) + else: + self.uselog = True + logdelta_l = np.log10(mst_hist['x_l'][1])-np.log10(mst_hist['x_l'][0]) + self.l_min = 10.**(np.log10(mst_hist['x_l'][0]) - logdelta_l/2.) + self.l_max = 10.**(np.log10(mst_hist['x_l'][-1]) + logdelta_l/2.) + logdelta_b = np.log10(mst_hist['x_b'][1])-np.log10(mst_hist['x_b'][0]) + self.b_min = 10.**(np.log10(mst_hist['x_b'][0]) - logdelta_b/2.) + self.b_max = 10.**(np.log10(mst_hist['x_b'][-1]) + logdelta_b/2.) + logl_edges = np.linspace(np.log10(self.l_min), np.log10(self.l_max), len(mst_hist['x_l'])+1) + logb_edges = np.linspace(np.log10(self.b_min), np.log10(self.b_max), len(mst_hist['x_b'])+1) + self.l_edges = 10.**logl_edges + self.b_edges = 10.**logb_edges + delta_s = mst_hist['x_s'][1]-mst_hist['x_s'][0] + self.s_min = 0. + self.s_max = 1. + if mst_hist['use_sqrt_s'] == True: + self.use_sqrt_s = True + if color is None: + self.colors.append(self._get_rotate_colors()) + else: + self.colors.append(color) + self.alphas.append(alpha) + self.alphas_envelope.append(alpha_envelope) + self.linewidths.append(linewidth) + if linestyle is None: + self.linestyles.append(self._get_rotate_linestyles()) + else: + self.linestyles.append(linestyle) + self.labels.append(label) + self.need_envelopes.append(mst_hist['isgroup']) + + def plot(self, usebox=True, saveas=None, fontsize=16, figsize=(16, 4), units=None, + showenvelopes=True, usecomp=False, usemean=True, height_ratios=[2, 1], + usefraction=False, whichcomp=0, plotzeroline=True, legend=True, subplot_adjust_top=0.85, + legend_fontsize=14, legend_column=4, xlabels=[None, None, None, None]): + """Outputs the final plot of the MST statistics. + + Parameters + ---------- + usebox : bool + For l, b and s this sets whether to use boxes for the histogram or to simple plot as line + from the bin centre. + saveas : str + If not None, this will save the plot with the name provided. + fontsize : int + Fontsize of axis labels. + figsize : tuple + Dimensions of the figure. + units : str + Units of l and b MST statistics, if None is supplied then we assume it is unitless. + showenvelopes : bool + This determines whether to plot data with input standard deviation. + usecomp : bool + Determines whether to include comparison subplots. + usemean : bool + For comparison plots, the determines whether to use the mean of input distributions. + height_ratios : tuple + Height ratio between main plots and comparison plots. + usefraction : bool + If true comparison plots are given by the fractional difference to a reference data otherwise + they are given as absolute differences. + whichcomp : int + Determines which data should be the reference data that all other data is compared to. Only used + if usemean == False. + plotzeroline : bool + Plots horizontal zero line for subplot. + legend : bool + Determines whether to include a legend. + subplot_adjust_top : float + If legend == True then the figure's top is adjusted by the input value. + legend_fontsize : int + Size of legend text. + legend_column : int + Number of keys in each line of the legend. + xlabels : list + List of string labels to replace the default if not set to None. + """ + # setting up figure + if self.usenorm == True: + ylabel = '\\bar{N}' + else: + ylabel = 'N' + if usecomp == True and self.num_data > 1: + f, ((ax1, ax2, ax3, ax4), + (ax15, ax25, ax35, ax45)) = plt.subplots(2, 4, figsize=figsize, + gridspec_kw={'height_ratios': height_ratios}) + if xlabels[0] is None: + ax15.set_xlabel(r'$d$', fontsize=fontsize) + else: + ax15.set_xlabel(xlabels[0], fontsize=fontsize) + if units is None: + if xlabels[1] is None: + ax25.set_xlabel(r'$l$', fontsize=fontsize) + else: + ax25.set_xlabel(xlabels[1], fontsize=fontsize) + if xlabels[2] is None: + ax35.set_xlabel(r'$b$', fontsize=fontsize) + else: + ax35.set_xlabel(xlabels[2], fontsize=fontsize) + else: + if xlabels[1] is None: + ax25.set_xlabel(r'$l$ $[%s]$' %(units), fontsize=fontsize) + else: + ax25.set_xlabel(xlabels[1]+r' $[%s]$' %(units), fontsize=fontsize) + if xlabels[2] is None: + ax35.set_xlabel(r'$b$ $[%s]$' %(units), fontsize=fontsize) + else: + ax35.set_xlabel(xlabels[2]+r' $[%s]$' %(units), fontsize=fontsize) + if self.use_sqrt_s == True: + if xlabels[3] is None: + ax45.set_xlabel(r'$\sqrt{1-s}$', fontsize=fontsize) + else: + ax45.set_xlabel(xlabels[3], fontsize=fontsize) + else: + if xlabels[3] is None: + ax45.set_xlabel(r'$s$', fontsize=fontsize) + else: + ax45.set_xlabel(xlabels[3], fontsize=fontsize) + if usefraction == False: + ax15.set_ylabel(r'$\Delta %s$' %(ylabel), fontsize=fontsize) + ax25.set_ylabel(r'$\Delta %s$' %(ylabel), fontsize=fontsize) + ax35.set_ylabel(r'$\Delta %s$' %(ylabel), fontsize=fontsize) + ax45.set_ylabel(r'$\Delta %s$' %(ylabel), fontsize=fontsize) + else: + if usemean == False: + if self.labels[0] is None: + comp_label = str(0) + else: + comp_label = str(self.labels[0]) + else: + comp_label = '{\\rm Mean}' + ax15.set_ylabel(r'$\frac{%s}{%s_{%s}}-1$' %(ylabel, ylabel, comp_label), fontsize=fontsize) + ax25.set_ylabel(r'$\frac{%s}{%s_{%s}}-1$' %(ylabel, ylabel, comp_label), fontsize=fontsize) + ax35.set_ylabel(r'$\frac{%s}{%s_{%s}}-1$' %(ylabel, ylabel, comp_label), fontsize=fontsize) + ax45.set_ylabel(r'$\frac{%s}{%s_{%s}}-1$' %(ylabel, ylabel, comp_label), fontsize=fontsize) + else: + f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=figsize) + ax1.set_xlabel(r'$d$', fontsize=fontsize) + if units is None: + ax2.set_xlabel(r'$l$', fontsize=fontsize) + ax3.set_xlabel(r'$b$', fontsize=fontsize) + else: + ax2.set_xlabel(r'$l$ $[%s]$' %(units), fontsize=fontsize) + ax3.set_xlabel(r'$b$ $[%s]$' %(units), fontsize=fontsize) + if self.use_sqrt_s == True: + ax4.set_xlabel(r'$\sqrt{1-s}$', fontsize=fontsize) + else: + ax4.set_xlabel(r'$s$', fontsize=fontsize) + ax1.set_ylabel(r'$%s$' %(ylabel), fontsize=fontsize) + ax2.set_ylabel(r'$%s$' %(ylabel), fontsize=fontsize) + ax3.set_ylabel(r'$%s$' %(ylabel), fontsize=fontsize) + ax4.set_ylabel(r'$%s$' %(ylabel), fontsize=fontsize) + if self.uselog == True: + ax2.set_xscale('log') + ax3.set_xscale('log') + if usecomp == True and self.num_data > 1: + ax25.set_xscale('log') + ax35.set_xscale('log') + # Main Plots + if usecomp == True and self.num_data > 1: + if usemean == False: + y_d_comp = self.binned_data[whichcomp]['y_d'] + y_l_comp = self.binned_data[whichcomp]['y_l'] + y_b_comp = self.binned_data[whichcomp]['y_b'] + y_s_comp = self.binned_data[whichcomp]['y_s'] + else: + y_d_all = [] + y_l_all = [] + y_b_all = [] + y_s_all = [] + for i in range(0, len(self.binned_data)): + y_d_all.append(self.binned_data[i]['y_d']) + y_l_all.append(self.binned_data[i]['y_l']) + y_b_all.append(self.binned_data[i]['y_b']) + y_s_all.append(self.binned_data[i]['y_s']) + y_d_all = np.array(y_d_all) + y_l_all = np.array(y_l_all) + y_b_all = np.array(y_b_all) + y_s_all = np.array(y_s_all) + y_d_comp = np.mean(y_d_all, axis=0) + y_l_comp = np.mean(y_l_all, axis=0) + y_b_comp = np.mean(y_b_all, axis=0) + y_s_comp = np.mean(y_s_all, axis=0) + if usefraction == True: + c1 = np.where(y_d_comp != 0.)[0] + c2 = np.where(y_l_comp != 0.)[0] + c3 = np.where(y_b_comp != 0.)[0] + c4 = np.where(y_s_comp != 0.)[0] + for i in range(0, len(self.binned_data)): + plot_histogram_line(self.binned_data[i]['x_d'], self.binned_data[i]['y_d'], + ax=ax1, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + plot_histogram_error(self.binned_data[i]['x_d'], self.binned_data[i]['y_d'], self.binned_data[i]['y_d_std'], + ax=ax1, color=self.colors[i], alpha=self.alphas_envelope[i]) + if usebox == True: + plot_histogram_line(self.binned_data[i]['x_l'], self.binned_data[i]['y_l'], x_edges=self.l_edges, + ax=ax2, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + plot_histogram_line(self.binned_data[i]['x_b'], self.binned_data[i]['y_b'], x_edges=self.b_edges, + ax=ax3, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + plot_histogram_line(self.binned_data[i]['x_s'], self.binned_data[i]['y_s'], + ax=ax4, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + plot_histogram_error(self.binned_data[i]['x_l'], self.binned_data[i]['y_l'], self.binned_data[i]['y_l_std'], + x_edges=self.l_edges, ax=ax2, color=self.colors[i], alpha=self.alphas_envelope[i]) + plot_histogram_error(self.binned_data[i]['x_b'], self.binned_data[i]['y_b'], self.binned_data[i]['y_b_std'], + x_edges=self.b_edges, ax=ax3, color=self.colors[i], alpha=self.alphas_envelope[i]) + plot_histogram_error(self.binned_data[i]['x_s'], self.binned_data[i]['y_s'], self.binned_data[i]['y_s_std'], + ax=ax4, color=self.colors[i], alpha=self.alphas_envelope[i]) + else: + ax2.plot(self.binned_data[i]['x_l'], self.binned_data[i]['y_l'], + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax3.plot(self.binned_data[i]['x_b'], self.binned_data[i]['y_b'], + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax4.plot(self.binned_data[i]['x_s'], self.binned_data[i]['y_s'], + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + ax2.fill_between(self.binned_data[i]['x_l'], self.binned_data[i]['y_l']-self.binned_data[i]['y_l_std'], + self.binned_data[i]['y_l']+self.binned_data[i]['y_l_std'], color=self.colors[i], + alpha=self.alphas_envelope[i]) + ax3.fill_between(self.binned_data[i]['x_b'], self.binned_data[i]['y_b']-self.binned_data[i]['y_b_std'], + self.binned_data[i]['y_b']+self.binned_data[i]['y_b_std'], color=self.colors[i], + alpha=self.alphas_envelope[i]) + ax4.fill_between(self.binned_data[i]['x_s'], self.binned_data[i]['y_s']-self.binned_data[i]['y_s_std'], + self.binned_data[i]['y_s']+self.binned_data[i]['y_s_std'], color=self.colors[i], + alpha=self.alphas_envelope[i]) + if usecomp == True and self.num_data > 1: + if usefraction == False: + plot_histogram_line(self.binned_data[i]['x_d'], self.binned_data[i]['y_d']-y_d_comp, + ax=ax15, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + plot_histogram_error(self.binned_data[i]['x_d'], self.binned_data[i]['y_d']-y_d_comp, self.binned_data[i]['y_d_std'], + ax=ax15, color=self.colors[i], alpha=self.alphas_envelope[i]) + if usebox == True: + plot_histogram_line(self.binned_data[i]['x_l'], self.binned_data[i]['y_l']-y_l_comp, + x_edges=self.l_edges, ax=ax25, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + plot_histogram_line(self.binned_data[i]['x_b'], self.binned_data[i]['y_b']-y_b_comp, + x_edges=self.b_edges, ax=ax35, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + plot_histogram_line(self.binned_data[i]['x_s'], self.binned_data[i]['y_s']-y_s_comp, + ax=ax45, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + plot_histogram_error(self.binned_data[i]['x_l'], self.binned_data[i]['y_l']-y_l_comp, + self.binned_data[i]['y_l_std'], x_edges=self.l_edges, + ax=ax25, color=self.colors[i], alpha=self.alphas_envelope[i]) + plot_histogram_error(self.binned_data[i]['x_b'], self.binned_data[i]['y_b']-y_b_comp, + self.binned_data[i]['y_b_std'], x_edges=self.b_edges, + ax=ax35, color=self.colors[i], alpha=self.alphas_envelope[i]) + plot_histogram_error(self.binned_data[i]['x_s'], self.binned_data[i]['y_s']-y_s_comp, + self.binned_data[i]['y_s_std'], + ax=ax45, color=self.colors[i], alpha=self.alphas_envelope[i]) + else: + ax25.plot(self.binned_data[i]['x_l'], self.binned_data[i]['y_l']-y_l_comp, + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax35.plot(self.binned_data[i]['x_b'], self.binned_data[i]['y_b']-y_b_comp, + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax45.plot(self.binned_data[i]['x_s'], self.binned_data[i]['y_s']-y_s_comp, + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + ax25.fill_between(self.binned_data[i]['x_l'], self.binned_data[i]['y_l']-self.binned_data[i]['y_l_std']-y_l_comp, + self.binned_data[i]['y_l']+self.binned_data[i]['y_l_std']-y_l_comp, + color=self.colors[i], alpha=self.alphas_envelope[i]) + ax35.fill_between(self.binned_data[i]['x_b'], self.binned_data[i]['y_b']-self.binned_data[i]['y_b_std']-y_b_comp, + self.binned_data[i]['y_b']+self.binned_data[i]['y_b_std']-y_b_comp, + color=self.colors[i], alpha=self.alphas_envelope[i]) + ax45.fill_between(self.binned_data[i]['x_s'], self.binned_data[i]['y_s']-self.binned_data[i]['y_s_std']-y_s_comp, + self.binned_data[i]['y_s']+self.binned_data[i]['y_s_std']-y_s_comp, color=self.colors[i], + alpha=self.alphas_envelope[i]) + else: + plot_histogram_line(self.binned_data[i]['x_d'][c1], self.binned_data[i]['y_d'][c1]/y_d_comp[c1]-1., + ax=ax15, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + plot_histogram_confidence(self.binned_data[i]['x_d'][c1], + (self.binned_data[i]['y_d'][c1]-self.binned_data[i]['y_d_std'][c1])/y_d_comp[c1]-1, + (self.binned_data[i]['y_d'][c1]+self.binned_data[i]['y_d_std'][c1])/y_d_comp[c1]-1, + ax=ax15, color=self.colors[i], alpha=self.alphas_envelope[i]) + if usebox == True: + plot_histogram_line(self.binned_data[i]['x_l'][c2], self.binned_data[i]['y_l'][c2]/y_l_comp[c2]-1., + x_edges=self.l_edges, ax=ax25, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + plot_histogram_line(self.binned_data[i]['x_b'][c3], self.binned_data[i]['y_b'][c3]/y_b_comp[c3]-1., + x_edges=self.b_edges, ax=ax35, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + plot_histogram_line(self.binned_data[i]['x_s'][c4], self.binned_data[i]['y_s'][c4]/y_s_comp[c4]-1, + ax=ax45, color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + plot_histogram_confidence(self.binned_data[i]['x_l'][c2], + (self.binned_data[i]['y_l'][c2]-self.binned_data[i]['y_l_std'][c2])/y_l_comp[c2]-1, + (self.binned_data[i]['y_l'][c2]+self.binned_data[i]['y_l_std'][c2])/y_l_comp[c2]-1, + x_edges=self.l_edges, ax=ax25, color=self.colors[i], alpha=self.alphas_envelope[i]) + plot_histogram_confidence(self.binned_data[i]['x_b'][c3], + (self.binned_data[i]['y_b'][c3]-self.binned_data[i]['y_b_std'][c3])/y_b_comp[c3]-1, + (self.binned_data[i]['y_b'][c3]+self.binned_data[i]['y_b_std'][c3])/y_b_comp[c3]-1, + x_edges=self.b_edges, ax=ax35, color=self.colors[i], alpha=self.alphas_envelope[i]) + plot_histogram_confidence(self.binned_data[i]['x_s'][c4], + (self.binned_data[i]['y_s'][c4]-self.binned_data[i]['y_s_std'][c4])/y_s_comp[c4]-1, + (self.binned_data[i]['y_s'][c4]+self.binned_data[i]['y_s_std'][c4])/y_s_comp[c4]-1, + ax=ax45, color=self.colors[i], alpha=self.alphas_envelope[i]) + else: + ax25.plot(self.binned_data[i]['x_l'][c2], self.binned_data[i]['y_l'][c2]/y_l_comp[c2]-1., + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax35.plot(self.binned_data[i]['x_b'][c3], self.binned_data[i]['y_b'][c3]/y_b_comp[c3]-1., + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax45.plot(self.binned_data[i]['x_s'][c4], self.binned_data[i]['y_s'][c4]/y_s_comp[c4]-1., + color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if showenvelopes == True and self.need_envelopes[i] == True: + ax25.fill_between(self.binned_data[i]['x_l'][c2], + (self.binned_data[i]['y_l'][c2]-self.binned_data[i]['y_l_std'][c2])/y_l_comp[c2]-1., + (self.binned_data[i]['y_l'][c2]+self.binned_data[i]['y_l_std'][c2])/y_l_comp[c2]-1., + color=self.colors[i], alpha=self.alphas_envelope[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax35.fill_between(self.binned_data[i]['x_b'][c3], + (self.binned_data[i]['y_b'][c3]-self.binned_data[i]['y_b_std'][c3])/y_b_comp[c3]-1., + (self.binned_data[i]['y_b'][c3]+self.binned_data[i]['y_b_std'][c3])/y_b_comp[c3]-1., + color=self.colors[i], alpha=self.alphas_envelope[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + ax45.fill_between(self.binned_data[i]['x_s'][c4], + (self.binned_data[i]['y_s'][c4]-self.binned_data[i]['y_s_std'][c4])/y_s_comp[c4]-1., + (self.binned_data[i]['y_s'][c4]+self.binned_data[i]['y_s_std'][c4])/y_s_comp[c4]-1., + color=self.colors[i], alpha=self.alphas_envelope[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + if plotzeroline == True: + ax15.axhline(0., color='k', linestyle=':', linewidth=1.) + ax25.axhline(0., color='k', linestyle=':', linewidth=1.) + ax35.axhline(0., color='k', linestyle=':', linewidth=1.) + ax45.axhline(0., color='k', linestyle=':', linewidth=1.) + ax1.set_xlim(self.d_min, self.d_max) + ax2.set_xlim(self.l_min, self.l_max) + ax3.set_xlim(self.b_min, self.b_max) + ax4.set_xlim(self.s_min, self.s_max) + ax1.set_ylim(0., None) + ax2.set_ylim(0., None) + ax3.set_ylim(0., None) + ax4.set_ylim(0., None) + if usecomp == True and self.num_data > 1: + ax1.set_xticks([]) + ax2.set_xticks([]) + ax3.set_xticks([]) + ax4.set_xticks([]) + ax15.set_xlim(self.d_min, self.d_max) + ax25.set_xlim(self.l_min, self.l_max) + ax35.set_xlim(self.b_min, self.b_max) + ax45.set_xlim(self.s_min, self.s_max) + plt.tight_layout() + if legend == True: + lines = [] + labels = [] + for i in range(0, len(self.binned_data)): + if self.labels[i] is None: + pass + else: + if showenvelopes == True and self.need_envelopes[i] == True: + line = ax1.fill_between([], [], [], color=self.colors[i], alpha=self.alphas_envelope[i]) + lines.append(line) + labels.append(self.labels[i]) + else: + line, = ax1.plot([], [], color=self.colors[i], alpha=self.alphas[i], + linewidth=self.linewidths[i], linestyle=self.linestyles[i]) + lines.append(line) + labels.append(self.labels[i]) + f.subplots_adjust(top=subplot_adjust_top) + plt.figlegend(handles=lines, labels=labels, loc='upper center', fontsize=legend_fontsize, ncol=legend_column, frameon=False) + if saveas is None: + pass + else: + plt.savefig(saveas) + plt.show() + + def clean(self): + """Resets PlotHistMST variables. + """ + self.__init__() diff --git a/paper/paper.bbl b/paper/paper.bbl new file mode 100644 index 0000000..e69de29 diff --git a/paper/paper.bib b/paper/paper.bib index 2410978..c49326b 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,3 +1,82 @@ +@article{spada2004, + title={Use of the minimum spanning tree model for molecular epidemiological investigation of a nosocomial outbreak of hepatitis C virus infection}, + author={Spada, Enea and Sagliocca, Luciano and Sourdis, John and Garbuglia, Anna Rosa and Poggi, Vincenzo and De Fusco, Carmela and Mele, Alfonso}, + journal={Journal of clinical microbiology}, + volume={42}, + number={9}, + pages={4230--4236}, + year={2004}, + publisher={Am Soc Microbiol} +} + +@articles{prim, + author={R. C. {Prim}}, + journal={The Bell System Technical Journal}, + title={Shortest connection networks and some generalizations}, + year={1957}, + volume={36}, + number={6}, + pages={1389-1401}, + keywords={}, + doi={10.1002/j.1538-7305.1957.tb01515.x}, + ISSN={0005-8580}, + month={Nov},} + +@article{rainbolt:2016, + Archiveprefix = {arXiv}, + Author = {Rainbolt, Jessica Lovelace and Schmitt, Michael}, + Date-Added = {2017-02-17 18:52:54 +0000}, + Date-Modified = {2017-02-17 18:58:48 +0000}, + Doi = {10.1088/1748-0221/12/02/P02009}, + Eprint = {1608.04772}, + Journal = {JINST}, + Number = {02}, + Pages = {P02009}, + Primaryclass = {stat.AP}, + Reportnumber = {NUHEP-EX-16-04}, + Slaccitation = {%%CITATION = ARXIV:1608.04772;%%}, + Title = {{The Use of Minimal Spanning Trees in Particle Physics}}, + Volume = {12}, + Year = {2017}, + Bdsk-Url-1 = {http://dx.doi.org/10.1088/1748-0221/12/02/P02009}} + +@article{gama:2014, + Adsnote = {Provided by the SAO/NASA Astrophysics Data System}, + Adsurl = {http://adsabs.harvard.edu/abs/2014MNRAS.438..177A}, + Archiveprefix = {arXiv}, + Author = {{Alpaslan}, M. and {Robotham}, A.~S.~G. and {Driver}, S. and {Norberg}, P. and {Baldry}, I. and {Bauer}, A.~E. and {Bland-Hawthorn}, J. and {Brown}, M. and {Cluver}, M. and {Colless}, M. and {Foster}, C. and {Hopkins}, A. and {Van Kampen}, E. and {Kelvin}, L. and {Lara-Lopez}, M.~A. and {Liske}, J. and {Lopez-Sanchez}, A.~R. and {Loveday}, J. and {McNaught-Roberts}, T. and {Merson}, A. and {Pimbblet}, K.}, + Date-Added = {2017-02-17 19:00:00 +0000}, + Date-Modified = {2017-02-17 19:00:13 +0000}, + Doi = {10.1093/mnras/stt2136}, + Eprint = {1311.1211}, + Journal = {\mnras}, + Keywords = {methods: observational, surveys, large-scale structure of Universe}, + Month = feb, + Pages = {177-194}, + Primaryclass = {astro-ph.CO}, + Title = {{Galaxy And Mass Assembly (GAMA): the large-scale structure of galaxies and comparison to mock universes}}, + Volume = 438, + Year = 2014, + Bdsk-Url-1 = {http://dx.doi.org/10.1093/mnras/stt2136}} + +@article{allison:2009, + Adsnote = {Provided by the SAO/NASA Astrophysics Data System}, + Adsurl = {http://adsabs.harvard.edu/abs/2009MNRAS.395.1449A}, + Archiveprefix = {arXiv}, + Author = {{Allison}, R.~J. and {Goodwin}, S.~P. and {Parker}, R.~J. and {Portegies Zwart}, S.~F. and {de Grijs}, R. and {Kouwenhoven}, M.~B.~N.}, + Date-Added = {2017-03-07 15:48:21 +0000}, + Date-Modified = {2017-03-07 15:48:32 +0000}, + Doi = {10.1111/j.1365-2966.2009.14508.x}, + Eprint = {0901.2047}, + Journal = {\mnras}, + Keywords = {methods: data analysis}, + Month = may, + Pages = {1449-1454}, + Primaryclass = {astro-ph.GA}, + Title = {{Using the minimum spanning tree to trace mass segregation}}, + Volume = 395, + Year = 2009, + Bdsk-Url-1 = {http://dx.doi.org/10.1111/j.1365-2966.2009.14508.x}} @ARTICLE{naidoo:2019, author = {{Naidoo}, Krishna and {Whiteway}, Lorne and {Massara}, Elena and diff --git a/paper/paper.blg b/paper/paper.blg new file mode 100644 index 0000000..7f772c3 --- /dev/null +++ b/paper/paper.blg @@ -0,0 +1,48 @@ +This is BibTeX, Version 0.99d (TeX Live 2017) +Capacity: max_strings=100000, hash_size=100000, hash_prime=85009 +The top-level auxiliary file: paper.aux +I found no \citation commands---while reading file paper.aux +I found no \bibdata command---while reading file paper.aux +I found no \bibstyle command---while reading file paper.aux +You've used 0 entries, + 0 wiz_defined-function locations, + 83 strings with 484 characters, +and the built_in function-call counts, 0 in all, are: += -- 0 +> -- 0 +< -- 0 ++ -- 0 +- -- 0 +* -- 0 +:= -- 0 +add.period$ -- 0 +call.type$ -- 0 +change.case$ -- 0 +chr.to.int$ -- 0 +cite$ -- 0 +duplicate$ -- 0 +empty$ -- 0 +format.name$ -- 0 +if$ -- 0 +int.to.chr$ -- 0 +int.to.str$ -- 0 +missing$ -- 0 +newline$ -- 0 +num.names$ -- 0 +pop$ -- 0 +preamble$ -- 0 +purify$ -- 0 +quote$ -- 0 +skip$ -- 0 +stack$ -- 0 +substring$ -- 0 +swap$ -- 0 +text.length$ -- 0 +text.prefix$ -- 0 +top$ -- 0 +type$ -- 0 +warning$ -- 0 +while$ -- 0 +width$ -- 0 +write$ -- 0 +(There were 3 error messages) diff --git a/paper/paper.md b/paper/paper.md index 1e181c2..7a79122 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -19,12 +19,61 @@ bibliography: paper.bib # Summary -``MiSTree`` is a ``Python`` package for the construction and analysis of *minimum spanning trees* (MST) from points given in 2/3 dimensions and in tomographic (on a unit sphere) or spherical polar coordinates. The weights of the edges are assumed to be the distances between points; i.e. the Euclidean distance for 2/3 dimension and spherical polar coordinates, and angular distances for tomographic coordinates. The package was designed to be used in cosmology; to extract cosmic web information present in the large scale distribution of galaxies (shown in @naidoo:2019) but could in principle be constructed on any distribution of points with non-Gaussian features. +``MiSTree`` is a ``Python`` package for the fast construction of the *minimum spanning tree* (MST). ``MiSTree`` can take as input a distribution of points defined in different coordinate systems (2/3 dimensions, tomographic and spherical polar coordinates). A user can then use the constructed MST to look for patterns in the distribution or to measure and plot the MST statistics. -The MST is constructed by initially creating a k-nearest neighbour graph (using ``scikit-learn``'s ``kneighbours_graph`` function) and then runs the Kruskal algorithm [@kruskal1956shortest] (using ``scipy`` ``minimum_spanning_tree`` function). Statistics of the constructed MST can then be measured and are described in @naidoo:2019. +# Motivation -The package was designed to be easy to use, with the class function ``GetMST`` allowing for the construction and subsequent analysis of the MST to be carried out fairly quickly and on a variety of data sets. A user can then choose to either measure the default statistics, discussed in @naidoo:2019, or develop their own. +Studies of point distributions often measure their 2-point statistics (i.e. the distribution of distances between pairs of points) which are then compared to theoretical models. This is a powerful technique and has been used very successfully in the field of cosmology to study the early Universe and the large scale distribution of galaxies. Unfortunately this statistic can only fully describe a distribution that is Gaussian, if it is non-Gaussian then the 2-point is no longer sufficient. The conventional method to incorporate non-Gaussian information is to look at the distribution's $N$-point statistic (if $N\!=\!3$ we look at the distribution of triangles, if $N\!=\!4$ we look at the distribution of quadrilaterals and so on). This method is well motivated as in principle all the information that can describe a distribution of points is contained within its $N$-point statistics. However, calculating $N$-point statistics even for $N\!>\!3$ becomes quickly intractable for large data sets. -Dependencies include the ``Python`` modules ``numpy`` [@numpy], ``scipy`` [@scipy], ``scikit-learn`` [@scikit-learn] and ``f2py`` [@peterson2009f2py] (the latter of which is used to compile ``Fortran`` subroutines). The source code can be found on [github](https://github.com/knaidoo29/mistree) and documentation and tutorials are provided [here](https://knaidoo29.github.io/mistreedoc/). +The MST offers an alternative approach; the MST graph draws lines between pairs of points so that all points are linked in a single skeletal structure that contains no loops and has minimal total edge length. Unlike $N$-point statistics, that typically scale by $\mathcal{O}(n^{N})$ for $n$ points, the MST (computed using the Kruskal algorithm @kruskal1956shortest) can be constructed much faster (at best $\mathcal{O}(n\log n)$). While the MST does not contain all the information present in $N$-point statistics, it enables some of this information to be captured and allows the identification of skeletal patterns, as such it has found a broad range of applications in physics: such as finding filaments in the distribution of galaxies [@gama:2014], classifying particle physics collisions [@rainbolt:2016] and mass segregation in star clusters [@allison:2009]. The MST has also been used in a number of other scientific field such as computer science, sociology and epidemiology. + +While algorithms to construct the *minimum spanning tree* are well known (e.g. @prim and @kruskal1956shortest) implementations of these often require the input of a matrix of pairwise distances. For a large data set the creation of this matrix (with $n^{2}$ elements) can be a significant strain on memory while also making the construction of the MST slower ($\mathcal{O}(n^{2}\log n)$). + +# MiSTree + +``MiSTree`` tries to solve these issues by providing a public ``Python`` package for the construction and analysis of the MST. The package initially creates a $k$-nearest neighbour graph ($k$NN, a graph that links each point to the nearest $k$ neighbours, using ``scikit-learn``'s ``kneighbours_graph`` function) which improves speed by limiting the number of considered edges from $n^{2}$ to $kn$ (where $k\!\l\!l n$) and then runs the Kruskal algorithm [@kruskal1956shortest] (using ``scipy``s' ``minimum_spanning_tree`` function). It was designed to be easy to use, with the raw input data into ``scipy``s' ``minimum_spanning_tree`` function and output being handled by ```MiSTree``` itself. + +The MST can be constructed from data provided in 2/3 dimensions and in tomographic (on a unit sphere) or spherical polar coordinates. The weights of the edges are assumed to be the distances between points; i.e. the Euclidean distance for 2/3 dimension and spherical polar coordinates, and angular distances for tomographic coordinates. Furthermore, the package can very quickly measure the standard statistics of this graph: + +- degree ($d$) -- the number of edges attached to each node. +- edge length ($l$) -- the length of edges in the MST. + +While also being able to measure the statistics of branches, which are chains of edges connected with degree $=2$: + +- branch length ($b$) -- the sum of the length of member edges. +- branch shape ($s$) -- the straight line distance between the tips of branches divided by the branch length. + +The statistics calculated by ``MiSTree`` are extensively explored in @naidoo:2019 and found to significantly improve constraints on cosmological parameters when tested on simulations. + +# Basic Usage + +To construct the MST using ``MiSTree`` from a distribution of points in 2 dimensions you would use the following commands: + +``` +import mistree as mist + +# initialised MiSTree Minimum Spanning Tree class +mst = mist.GetMST(x=x, y=y) +mst.construct_mst() +``` +![An example of how ``MiSTree`` constructs the MST from a distribution of points (shown on the left). ``MiSTree`` first begins by constructing a $k$NN graph which links all points to their nearest $k$ neighbours (shown in the centre) and then runs the Kruskal algorithm to construct the MST (shown on the right).](mistree_in_action.png) + +The stages of the MST construction are shown in Figure 1. Once the MST is constructed it can either be used to look for features in the distribution or to measure statistics of the graph which in turn tell us about how points have been distributed. ``MiSTree`` can measure four statistics by default, which can be calculated directly after initialising the ```GetMST``` class (an example of the distribution of these statistics is shown in Figure 2): + +``` +d, l, b, s = mst.get_stats() +``` + +The source code can be found on [github](https://github.com/knaidoo29/mistree) while documentation and more complicated tutorials are provided [here](https://knaidoo29.github.io/mistreedoc/). + +![Histograms of the distribution of the MST statistics degree ($d$), edge length ($l$), branch length ($b$) and branch shape ($s$) for a Levy-Flight distribution of points [details of which are provided in @naidoo:2019] in 3 dimensions.](mst_levy_flight_example.png) + +# Dependencies + +Dependencies for ``MiSTree`` include the ``Python`` modules ``numpy`` [@numpy], ``matplotlib`` [@Hunter:2007], ``scipy`` [@scipy], ``scikit-learn`` [@scikit-learn] and ``f2py`` [@peterson2009f2py] (the latter of which is used to compile ``Fortran`` subroutines). + +# Acknowledgement + +I thank Ofer Lahav and Lorne Whiteway for their guidance and suggestions in developing this package and acknowledge support from the Science and Technology Fascilities Council grant ST/N50449X. # References