diff --git a/pubmedcentral/.gitignore b/pubmedcentral/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/pubmedcentral/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/pubmedcentral/README.md b/pubmedcentral/README.md new file mode 100644 index 0000000..54bb9ab --- /dev/null +++ b/pubmedcentral/README.md @@ -0,0 +1,52 @@ +# PubMed Central + +A collection of journal articles from [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/), "a free full-text archive of biomedical and life sciences journal literature at the U.S. National Institutes of Health's National Library of Medicine (NIH/NLM)." + +## Data Download and Processing + +Downloading and processing code is easy. Simply use `bash run.sh` (or `bash run.sh 1000` for debugging with 1000 samples) + +
+Under the hood of run.sh +Run.sh has 3 main steps: + +1. Download the list of all articles with `bash get-filelist.sh` +2. Download the data and convert from nxml to markdown with `bash download-and-convert-to-md.sh` +3. Convert the data to the Dolma format with `python to-dolma.py` +
+ +Files converted to markdown will live in `data/md`, author lists live in `data/authors`, and processed files will live in `data/pubmedcentral` + +## Data Stats + +| # Articles | # Tokens | +| ---------: | -------: | +| 3997890 | | + +## Example + +``` json +{ + "id": "PMC176545", + "text": "# Introduction {#s1}\n\nHuman malaria is caused by four species of the parasitic protozoan genus...", + "source": "PubMed Central", + "added": "2024-04-19T17:48:14.010842", + "created": "2003-8-18", + "metadata": + { + "license": "Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/", + "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC176545/", + "journal": "PLoS Biol. 2003 Oct 18; 1(1):e5", + "authors": [{"first": "Zbynek", "last": "Bozdech"}, {"first": "Manuel", "last": "Llin\u00e1s"}, {"first": "Brian Lee", "last": "Pulliam"}, {"first": "Edith D", "last": "Wong"}, {"first": "Jingchun", "last": "Zhu"}, {"first": "Joseph L", "last": "DeRisi"}], + } +} +``` + +## Notes +Converting documents from nxml to markdown requires the pandoc library, which can be installed following the instructions on the [pandoc website](https://pandoc.org/installing.html). + + +TODO: +- [ ] Confirm article and token #s, fill in example +- [ ] Handle references to figures and tables +- [ ] Handle citations diff --git a/pubmedcentral/download-convert-to-md.sh b/pubmedcentral/download-convert-to-md.sh new file mode 100644 index 0000000..282053a --- /dev/null +++ b/pubmedcentral/download-convert-to-md.sh @@ -0,0 +1,6 @@ +#!/usr/bash/env sh + +TOTAL_DOCS="${1:-0}" + +# downloads each file from the filelist and converts the .nxml file to .md +python3 download_and_convert_to_md.py --filelist data/permissive_filelist.txt --total_docs "${TOTAL_DOCS}" diff --git a/pubmedcentral/download_and_convert_to_md.py b/pubmedcentral/download_and_convert_to_md.py new file mode 100644 index 0000000..b759f3e --- /dev/null +++ b/pubmedcentral/download_and_convert_to_md.py @@ -0,0 +1,201 @@ +import argparse +import functools +import json +import multiprocessing as mp +import os +import re +import shutil +import subprocess +import tarfile +import traceback +import xml.etree.ElementTree as ET + +from tqdm import tqdm + +from licensed_pile import logs +from licensed_pile.scrape import get_page + +parser = argparse.ArgumentParser(description="Convert xml documents to markdown.") +parser.add_argument("--filelist", help="The path to the filelist.txt file.") +parser.add_argument( + "--output_dir", default="data/md/", help="Where the markdown files go." +) +parser.add_argument( + "--total_docs", + default=0, + type=int, + help="Total number of documents to convert, for debugging.", +) +parser.add_argument( + "--metadata_dir", default="data/metadata/", help="Where the metadata files go." +) +parser.add_argument( + "--processes", + default=mp.cpu_count(), + type=int, + help="Number of processes to use for conversion.", +) + + +def get_authors_and_date(nxml_file: str, pmcid: str): + # get authors from nxml file + authors = [] + date_created = None + + tree = ET.parse(nxml_file) + + # search for author tags + for author in tree.findall(".//contrib[@contrib-type='author']"): + surname = author.find("name/surname") + given_names = author.find("name/given-names") + if surname is not None and given_names is not None: + authors.append({"first": given_names.text, "last": surname.text}) + + # get date + # date can be found under "epub" or "pmc-release" tags + pub_types = ["epub", "pmc-release"] + for pub_type in pub_types: + date = tree.find(f".//pub-date[@pub-type='{pub_type}']") + if date is not None: + year = date.find("year").text + month = date.find("month").text + day = date.find("day").text + # convert to YYYY-MM-DD format + date_created = f"{year}-{month}-{day}" + break + + if date_created is None: + # haven't seen any examples without a date, but just in case + # not a fatal error, just log it + logger = logs.get_logger("pubmedcentral") + logger.error(f"Date not found for {pmcid}") + + return authors, date_created + + +def download(f_url: str, output_dir: str): + # download file from f_url to output_dir + try: + # get the tarball + r = get_page(f_url) + + # write tarball to disk + with open(os.path.join(output_dir, f_url.split("/")[-1]), "wb") as fh: + fh.write(r.content) + except: + logger = logs.get_logger("pubmedcentral") + logger.error(f"Error downloading {f_url}") + logger.error(traceback.print_exc()) + + +def extract_and_convert_tarball(t: str, output_dir: str): + if not os.path.exists(t): + return + try: + with tarfile.open(t) as tar: + nxml = [f for f in tar.getnames() if f.endswith(".nxml")] + + # make sure there's only one nxml file + if len(nxml) > 1: + # haven't seen an example with more than one nxml file, but just in case + error_message = f"More than one nxml file in {t}" + logger = logs.get_logger("pubmedcentral") + logger.error(error_message) + raise ValueError(error_message) + nxml = nxml[0] + + # extract nxml file + tar.extract(nxml) + + # get pmcid + pmcid = nxml.split("/")[0] + + # get metadata from nxml file + authors, date_created = get_authors_and_date(nxml, pmcid) + metadata = {"authors": authors, "created": date_created} + # write to file + with open( + f"{os.path.join(args.metadata_dir, pmcid)}.json", "w", encoding="utf-8" + ) as f: + json.dump(metadata, f, ensure_ascii=False) + + # convert nxml to markdown + # pandoc options: + # --quiet is to suppress messages + # --from jats specifies the input format as Journal Article Tag Suite (https://jats.nlm.nih.gov/) + # -o is the output file + # --wrap=none is to prevent pandoc from wrapping lines + options = [ + "pandoc", + "--quiet", + "--from", + "jats", + nxml, + "-o", + f"{pmcid}.md", + "--wrap=none", + ] + subprocess.run(options) + + # remove extracted files + os.rename(f"{pmcid}.md", f"{output_dir}/{pmcid}.md") + shutil.rmtree(nxml.split("/")[0], ignore_errors=True) + + except: + logger = logs.get_logger("pubmedcentral") + logger.error(f"Error extracting {t}") + logger.error(traceback.print_exc()) + + +def download_and_convert( + line: str, output_dir: str, base_url="https://ftp.ncbi.nlm.nih.gov/pub/pmc/" +): + # split line into parts + partial_path = line.split("\t")[0] + + # create paths for the url and the destination of the markdown file + f_url = os.path.join(base_url, partial_path) + f_dest = os.path.join(output_dir, partial_path.split("/")[-1]) + + try: + download(f_url, output_dir) + extract_and_convert_tarball(f_dest, output_dir) + + # delete the tarball + os.remove(f_dest) + + except Exception as e: + logger = logs.get_logger("pubmedcentral") + logger.error(e) + + +def main(args): + os.makedirs(args.output_dir, exist_ok=True) + os.makedirs(args.metadata_dir, exist_ok=True) + + with open(args.filelist) as fh: + files = fh.read().split("\n") + + # ignore the header + files = files[1:] + + if args.total_docs > 0: + files = files[: args.total_docs] + + with mp.Pool(args.processes) as p: + # use list to force the execution of the imap iterable within the context of the multiprocessing pool + _ = list( + tqdm( + p.imap( + functools.partial(download_and_convert, output_dir=args.output_dir), + files, + ), + total=len(files), + ) + ) + + +if __name__ == "__main__": + args = parser.parse_args() + logs.configure_logging("pubmedcentral") + main(args) diff --git a/pubmedcentral/example/PMC1149498.md b/pubmedcentral/example/PMC1149498.md new file mode 100644 index 0000000..5700b1e --- /dev/null +++ b/pubmedcentral/example/PMC1149498.md @@ -0,0 +1,37 @@ +Almost exactly five years ago, in early June 2000, BMC Bioinformatics received its first submission. Five years on, it has received over a thousand submissions, and the journal is continuing to grow rapidly (Figure [1](#F1){ref-type="fig"}). + +In the past few months, developments have included a refreshed international [editorial board](http://www.biomedcentral.com/bmcbioinformatics/edboard), which now consists of over 50 leaders in the field, and a [Bioinformatics and Genomics gateway](http://www.biomedcentral.com/gateways/bioinformaticsgenomics/) that brings together relevant content from across BioMed Central\'s 130+ Open Access journals. And by the time you read this, *BMC Bioinformatics* should have its first official ISI Impact Factor. Impact factors certainly have their problems -- a previous editorial in this journal\[[@B1]\] discussed the arbitrariness of the process by which ISI selects journals for tracking, and the resulting unnecessary time delay before Impact Factors become available. One thing is clear though -- with *BMC Bioinformatics* having an Impact Factor, there are more reasons than ever to make it the first choice for your research. + +# Five years in bioinformatics + +Looking back over the first 5 years of the journal, are any significant trends evident? One thing that is noticeable is the prevalence of the open-source model of software development. In fact more than 10% of all BMC Bioinformatics articles include the term \"open-source\". Hundreds of open-source bioinformatics projects are now hosted on sites such as [bioinformatics.org](http://bioinformatics.org) and [sourceforge.net](http://sourceforge.net/softwaremap/trove_list.php?form_cat=252). No doubt the similar philosophies of open-source software and Open Access publishing have been a factor in making *BMC Bioinformatics* one of BioMed Central\'s most successful journals. Two other emerging trends are, firstly, an increasing use of web service technology to connect disparate tools into analysis pipelines, and secondly, the development of systems to allow biological knowledge to be modelled and expressed in structured form. The linking factor between both these trends is that increasingly, as the data deluge continues, the \'users\' of bioinformatics tools and the \'readers\' of the biological literature, are likely to be computer systems rather than human beings. + +# Web services and data analysis pipelines + +As bioinformatics tools have proliferated, the complexity of data analysis has increased. Often, a sequence of analysis steps each using different tools must be carried out one after the other. This might be done manually or by using a monolithic system that is capable of carrying out multiple analyses, or, more flexibly, by writing special \'glue code\', often in Perl, to connect together multiple tools into a pipeline. The problem with the latter approach, though, is that in the absence of defined standards for the input and output of different tools, lots of glue code has to be written in order to create each new pipeline. Worse, systems built in this way tend to be fragile, since at any time one of the tools in the pipeline may change the format of its input or output (breaking the system), because there is no explicit \'contract\' between the various tools as to what input and output formats each will support. Web services \[[@B2]\], and more generally, \'Service Oriented Architectures\' \[[@B3]\] promise to provide a solution by providing a means for codifying standard interfaces that can be used to expose bioinformatics tools over the web. Projects such as MyGrid \[[@B4]\] have then built on these standards to provide biologists with graphical user interfaces that can be used to build new analysis pipelines interactively, without needing to write code. *BMC Bioinformatics* has published several articles on the use of Web Service technologies such as the Simple Object Access Protocol (SOAP) - if you are interested, try searching the journal for: [SOAP OR \"web services\"](http://www.biomedcentral.com/search/results.asp?drpField1=&txtSearch1=SOAP+OR+%22web+services%22&drpPhrase1=and&drpField2=%5BTI%5D&txtSearch2=&drpPhrase2=and&drpField3=%5BAU%5D&txtSearch3=&drpPhrase3=and&drpField4=%5BTIAB%5D&txtSearch4=&drpPhrase4) + +# Text mining and biological semantics + +Another growth area in bioinformatics has been the structured representation and modelling of biological knowledge. The Gene Ontology project \[[@B5]\] has provided an important foundation for much of this work, defining a set of controlled vocabularies that allow biological concepts and relationships to be expressed in a standard way. + +Much of the initial work on modelling biological knowledge has explored the use of text-mining techniques to automatically derive structured semantic information from the relatively unstructured text of scientific research articles. [BioMed Central\'s Open Access corpus](http://www.biomedcentral.com/info/about/datamining)\[[@B6]\] is now rapidly approaching 10,000 articles and provides ideal raw material for such research.. It is already being used by many researchers, both in industry and academia. + +*BMC Bioinformatics* publishes many papers on text-mining topics, including the recently published supplement \[[@B7]\], which consists of papers presented at last year\'s BioCreAtIvE text-mining workshop in Granada, Spain. Text mining has its limits, however. Imagine what could be achieved if articles, rather than consisting entirely of free-form natural language, contained explicit assertions about biological knowledge in unambiguous, machine-readable form. This is the oft-vaunted promise of the 'Semantic Web' \[[@B8]\], but it has proved to be very difficult to realize in practice. + +Some recent developments, however, suggest that progress is being made. For example, this editorial was created using [Publicon](http://www.biomedcentral.com/info/ifora/publicon)\[[@B9]\]- a new breed of scientific authoring tool developed by Wolfram Research with input from BioMed Central. Publicon is easy to use, but it is also a highly structured authoring environment. It can not only output BioMed Central\'s native article XML format, but also embed mathematical equations as \'islands\' of semantically-rich MathML \[[@B10]\]. This structured mathematical information is then preserved throughout the publication process, from the author\'s computer right through to the reader\'s desktop with no intermediate unstructured version along the way that might cause information to be lost. + +So, for example, if you are accessing this editorial online using a suitable browser, you should be able to cut and paste the equation below into any MathML-aware application, as a mathematically meaningful equation rather than an image. + +$${\left( {i~{\nabla{- m}}} \right)~{\Phi_{e^{2}}\left\lbrack {B,x} \right\rbrack}} = {{{B(x)}~{\Phi_{e^{2}}\left\lbrack {B,x} \right\rbrack}} + {i~e^{2}~\gamma_{\mu}~\left. \int{{\delta_{+}\left( s_{x~1}^{2} \right)}~\left( {{{{\delta\Phi}_{e^{2}}\left\lbrack {B,x} \right\rbrack}/\delta}~{B_{\mu}(1)}} \right)~{\mathbb{d}\tau_{1}}} \right.}}$$ + +In two accompanying Commentaries, the issues associated with capturing and representing biological knowledge are discussed further. Murray-Rust *et al.*\[[@B11]\] consider how chemical information can best be represented within scientific articles, and what bioinformaticists and chemists can learn from one another. Meanwhile, Mons \[[@B12]\] explores in more detail how smart authoring tools can enrich the scientific literature by allowing authors to express themselves unambiguously, avoiding the \'data burying\' that makes text mining necessary in the first place. + +::: {#refs} +::: + +## Figures and Tables + +
+

+
Number of submissions to BMC Bioinformatics. The figure for 2005 represents a conservative projection based on the rate of growth of submissions during the first half of the year.
+
diff --git a/pubmedcentral/example/PMC509239.md b/pubmedcentral/example/PMC509239.md new file mode 100644 index 0000000..75ef2a0 --- /dev/null +++ b/pubmedcentral/example/PMC509239.md @@ -0,0 +1,459 @@ +# Background + +The construction of genetic linkage maps based on molecular markers has become a routine tool for comparative studies of genome structure and organization and the identification of loci affecting complex traits in different organisms \[[@B1]\]. Statistical methods for linkage analysis and map construction have been well developed in inbred line crosses \[[@B2]\] and implemented in the computer packages MAPMAKER \[[@B3]\], CRI-MAP \[[@B4]\], JOINMAP \[[@B5]\] and MULTIMAP \[[@B6]\]. Increasing efforts have been made to develop robust tools for analyzing marker data in outcrossing organisms \[[@B7]-[@B12]\], in which inbred lines are not available due to the heterozygous nature of these organisms and/or long-generation intervals. + +Genetic analyses and statistical methods in outcrossing species are far more complicated than in species that can be selfed to produce inbred lines. There are two reasons for this. First, the number of marker alleles and the segregation pattern of marker genotypes may vary from locus to locus in outcrossing species, whereas an inbred line-initiated segregating population, such as an F~2~ or backcross, always has two alleles and a consistent segregation ratio across different markers. Second, linkage phases among different markers are not known *a priori* for outbred parents and, therefore, an algorithm should be developed to characterize a most likely linkage phase for linkage analysis. + +To overcome these problems of linkage analysis in outcrossoing species, Grattapaglia and Sederoff \[[@B13]\] proposed a two-way pseudo-testcross mapping stratety in which one parent is heterozygous whereas the other is null for all markers. Using this strategy, two parent-specific linkage maps will be constructed. The limitation of the pseudo-testcross strategy is that it can only make use of a portion of molecular markers. Ritter et al. \[[@B7]\] and Ritter and Salamini \[[@B9]\] proposed statistical methods for estimating the recombination fractions between different segregation types of markers. Using both analytical and simulation approaches, Maliepaard et al. \[[@B10]\] discussed the power and precision of the estimation of the pairwise recombination fractions between markers. Wu et al. \[[@B11]\] formulated a multilocus likelihood approach to simultaneously estimate the linkage and linkage phases of the crossed parents over multiple markers. Ling \[[@B14]\] proposed a three-step analytical procedure for linkage analysis in out-crossing populations, which includes (1) determining the parental haplotypes for all of the markers in a linkage group, (2) estimating the recombination fractions, and (3) choosing a most likely marker order based on optimization analysis. This procedure was used to analyze segregating data in an outcrossing forest tree \[[@B15]\]. Currently, none of these models for linkage analysis in outcrossing species can provide a one-step analysis for the linkage, parental linkage phase and marker order from segregating marker data. + +In this article, we construct a unifying likelihood analysis to simultaneously estimate linkage, linkage phases and gene order for a group of markers that display all possible segregation patterns in a full-sib family derived from two outbred parents (see Table [1](#T1){ref-type="table"} of Wu et al. \[[@B11]\]). Our idea here is to integrate all possible linkage phases between a pair of markers in the two parents, each specified by a phase probability, into the framework of a mixture statistical model. In characterizing a most likely linkage phase (or parental diplotype) based on the phase probabilities, the recombination fractions are also estimated using a likelihood approach. This integrative idea is extended to consider gene orders in a multilocus analysis, in which the probabilities of all possible gene orders are estimated and a most likely order is chosen, along with the estimation of the linkage and parental diplotype. We perform extensive simulation studies to investigate the robustness, power and precision of our statistical mapping method incorporating linkage, parental diplotype and gene orders. An example from the published literature is used to validate the application of our method to linkage analysis in outcrossing species. + +::: {#T1 .table-wrap} +::: caption +Estimation from two-point analysis of the recombination fraction ( + +![](1471-2156-5-20-i1.gif) + +± SD) and the parental diplotype probability of parent P ( + +![](1471-2156-5-20-i2.gif) + +) and Q ( + +![](1471-2156-5-20-i3.gif) + +) for five markers in a full-sib family of *n* = 100 +::: + + Parental diplotype *r* = 0.05 *r* = 0.20 + ---------------------------- -------------------- ----- --- -------- ----- ---------------------------- ---------------------------- ---------------------------- ---------------------------- ---------------------------- ---------------------------- + + Marker P^*a*^ × Q^*a*^ ![](1471-2156-5-20-i1.gif) ![](1471-2156-5-20-i2.gif) ![](1471-2156-5-20-i3.gif) ![](1471-2156-5-20-i1.gif) ![](1471-2156-5-20-i2.gif) ![](1471-2156-5-20-i3.gif) + \| \| \| \| + ![](1471-2156-5-20-i4.gif) *a* *b* *c* *d* + \| \| \| \| 0.530 ± 0.0183 0.2097 ± 0.0328 + ![](1471-2156-5-20-i5.gif) *a* *b* *a* *b* 0.9960 0.9972 0.9882 0.9878 + \| \| \| \| 0.0464 ± 0.0303 0.2103 ± 0.0848 + ![](1471-2156-5-20-i6.gif) *a* *o* × *o* *a* 1 (0^*b*^) 0(1^*b*^) 1 (0^*b*^) 0(1^*b*^) + \| \| \| \| 0.0463 ± 0.0371 0.1952 ± 0.0777 + ![](1471-2156-5-20-i7.gif) *a* *b* *b* *b* 1 1/0^*c*^ 1 1/0^*c*^ + \| \| \| \| 0.0503 ± 0.0231 0.2002 ± 0.0414 + ![](1471-2156-5-20-i8.gif) *a* *b* *c* *d* 1 1/0^*c*^ 1 1/0^*c*^ + +^*a*^Shown is the parental diplotype of each parent for the five markers hypothesized, where the vertical lines denote the two homologous chromosomes. ^*b*^The values in the parentheses present a second possible solution. For any two symmetrical markers (2 and 3), + +![](1471-2156-5-20-i2.gif) + += 1, + +![](1471-2156-5-20-i3.gif) + += 0 and + +![](1471-2156-5-20-i2.gif) + += 0, + +![](1471-2156-5-20-i3.gif) + += 1 give an identical likelihood ratio test statistic (Wu et al. 2002a). Thus, when the two parents have different diplotypes for symmetrical markers, their parental diplotypes cannot be correctly determined from two-point analysis. ^*c*^The parental diplotype of parent *P*~2~ cannot be estimated in these two cases because marker 4 is homozygous in this parent. The MLE of *r* is given between two markers under comparison, whereas the MLEs of *p* and *q* given at the second marker. +::: + +## Two-locus analysis + +### A general framework + +Suppose there is a full-sib family of size *n* derived from two outcrossed parents P and Q. Two sets of chromosomes are coded as **1** and **2** for parent P and **3** and **4** for parent Q. Consider two marker loci ![](1471-2156-5-20-i4.gif) and ![](1471-2156-5-20-i5.gif), whose genotypes are denoted as 12/12 and 34/34 for parent P and Q, respectively, where we use / to separate the two markers. When the two parents are crossed, we have four different progeny genotypes at each marker, i.e., 13, 14, 23 and 24, in the full-sib family. Let *r* be the recombination fraction between the two markers. + +In general, the genotypes of the two markers for the two parents can be observed in a molecular experiment, but the allelic arrangement of the two markers in the two homologous chromosomes of each parent (i.e., linkage phase) is not known. In the current genetic literatuire, a linear arrangement of nonalleles from different markers on the same chromosomal region is called the *haplotype*. The observable two-marker genotype of parent P is 12/12, but it may be derived from one of two possible combinations of maternally- and paternally-derived haplotypes, i.e., \[11\] \[22\] or \[12\] \[21\], where we use \[\] to define a haplotype. The combination of two haplotypes is called the *diplotype*. Diplotype \[11\] \[22\] (denoted by 1) is generated due to the combination of two-marker haplotypes \[11\] and \[22\], whereas diplotype \[12\] \[21\] (denoted by ![](1471-2156-5-20-i9.gif)) is generated due to the combination of two-marker haplotypes \[12\] and \[21\]. If the probability of forming diplotype \[11\] \[22\] is *p*, then the probability of forming diplotype \[12\] \[21\] is 1 - *p*. The genotype of parent Q and its possible diplotypes \[33\] \[44\] and \[34\] \[43\] can be defined analogously; the formation probabilities of the two diplotypes are *q* and 1 - *q*, respectively. + +The cross of the two parents should be one and only one of four possible parental diplotype combinations, i.e., \[11\] \[22\] × \[33\] \[44\]), \[11\] \[22\] × \[34\] \[43\], \[12\] \[21\] × \[33\] \[44\] and \[12\] \[21\] × \[34\] \[43\], expressed as 11, 1![](1471-2156-5-20-i9.gif), ![](1471-2156-5-20-i9.gif)1 and ![](1471-2156-5-20-i10.gif), with a probability of *pq*, *p*(1 - *q*), (1 - *p*)*q* and (1 - *p*) (1 - *q*), respectively. The estimation of the recombination fraction in the full-sib family should be based on a correct diplotype combination \[[@B10]\]. The four combinations each will generate 16 two-marker progeny genotypes, whose frequencies are expressed, in a 4 × 4 matrix, as + +![](1471-2156-5-20-i11.gif) + +for \[11\] \[22\] × \[33\] \[44\], + +![](1471-2156-5-20-i12.gif) + +for \[11\] \[22\] × \[34\] \[43\], + +![](1471-2156-5-20-i13.gif) + +for \[12\] \[21\] × \[33\] \[44\] and + +![](1471-2156-5-20-i14.gif) + +for \[12\] \[21\] × \[34\] \[43\]. Note that these matrices are expressed in terms of the combinations of the progeny genotypes for two markers ![](1471-2156-5-20-i4.gif) and ![](1471-2156-5-20-i5.gif), respectively. + +Let **n** = (*n*~*j*1*j*2~)~4\ ×\ 4~ denote the matrix for the observations of progeny where *j*~1~,*j*~2~ = 1 for 13, 2 for 14, 3 for 23, or 4 for 34 for the progeny genotypes at these two markers. Under each parental diplotype combination, *n*~*j*1*j*2~ follows a multinomial distribution. The likelihoods for the four diplotype combinations are expressed as + +![](1471-2156-5-20-i15.gif) + +where *N*~1~ = *n*~11~ + *n*~22~ + *n*~33~ + *n*~44~, *N*~2~ = *n*~14~ + *n*~23~ + *n*~32~ + *n*~41~, *N*~3~ = *n*~12~ + *n*~21~ + *n*~34~ + *n*~43~, and *N*~4~ = *n*~13~ + *n*~31~ + *n*~24~ + *n*~42~. It can be seen that the maximum likeihood estimate (MLE) of r (![](1471-2156-5-20-i1.gif)) under the first diplotype combination is equal to one minus ![](1471-2156-5-20-i1.gif) under the fourth combination, and the same relation holds between the second and third diplotype combinations. Although there are identical plug-in likelihood values between the first and fourth combinatins as well as between the second and third combinations, one can still choose an appropriate ![](1471-2156-5-20-i1.gif) from these two pairs because one of them leads to ![](1471-2156-5-20-i1.gif) greater than 0.5. Traditional approaches for estimating the linkage and parental diplotypes are to estimate the recombination fractions and likelihood values under each of the four combinations and choose one legitimate estimate of *r* with a higher likelihood. + +In this study, we incorporate the four parental diplotype combinations into the observed data likelihood, expressed as + +![](1471-2156-5-20-i16.gif) + +where **Θ** = (*r*, *p*, *q*) is an unknown parameter vector, which can be estimated by differentiating the likelihood with respect to each unknown parameter, setting the derivatives equal to zero and solving the likelihood equations. This estimation procedure can be implemented with the EM algorithm \[[@B2],[@B11],[@B16]\]. Let **H** be a mixture matrix of the genotype frequencies under the four parental diplotype combinations weighted by the occurring probabilities of the diplotype combinations, expressed as + +![](1471-2156-5-20-i17.gif) + +where + +![](1471-2156-5-20-i18.gif) + +Similar to the expression of the genotype frequencies as a mixture of the four diplotype combinations, the expected number of recombination events contained within each two-marker progeny genotype is the mixture of the four different diplotype combinations, i.e., + +![](1471-2156-5-20-i19.gif) + +where the expected number of recombination events for each combination are expressed as + +![](1471-2156-5-20-i20.gif) + +Define + +![](1471-2156-5-20-i21.gif) + +The general procedure underlying the {*τ* + 1}th EM step is given as follows: + +**E Step:** At step *τ*, using the matrix **H** based on the current estimate *r*^{*τ*}^, calculate the expected number of recombination events between two markers for each progeny genotype and ![](1471-2156-5-20-i22.gif), + +![](1471-2156-5-20-i23.gif) + +![](1471-2156-5-20-i24.gif) + +![](1471-2156-5-20-i25.gif) + +where *d*~*j*1*j*2~, *h*~*j*1*j*2~, *p*~*j*1*j*2~ and *q*~*j*1*j*2~ are the (j~1~*j*~2~)th element of matrix **D**, **H**, **P** and **Q**, respectively. + +**M Step:** Calculate *r*^{*τ*+1}^ using the equation, + +![](1471-2156-5-20-i26.gif) + +The E step and M step among Eqs. (4) -- (7) are repeated until *r* converges to a value with satisfied precision. The converged values are regarded as the MLEs of **Θ**. + +### Model for partially informative markers + +Unlike an inbred line cross, a full-sib family may have many different marker segregation types. We symbolize observed marker alleles in a full-sib family by *A*~1~, *A*~2~, *A*~3~ and *A*~4~, which are codominant to each other but dominant to the null allele, symbolized by *O*. Wu et al. \[[@B11]\] listed a total of 28 segregation types, which are classified into 7 groups based on the amount of information for linkage analysis: + +A. Loci that are heterozygous in both parents and segregate in a 1:1:1:1 ratio, involving either four alleles *A*~1~*A*~2~ × *A*~3~*A*~4~, three non-null alleles *A*~1~*A*~2~ × *A*~1~*A*~3~, three non-null alleles and a null allele *A*~1~*A*~2~ × *A*~3~*O*, or two null alleles and two non-null alleles *A*~1~*O* × *A*~2~*O*; + +B. Loci that are heterozygous in both parents and segregate in a 1:2:1 ratio, which include three groups: + +B~1~. One parent has two different dominant alleles and the other has one dominant allele and one null allele, e.g., *A*~1~*A*~2~ × *A*~1~*O*; + +B~2~. The reciprocal of B~1~; + +B~3~. Both parents have the same genotype of two codominant alleles, i.e., *A*~1~*A*~2~ × *A*~1~*A*~2~; + +C. Loci that are heterozygous in both parents and segregate in a 3:1 ratio, i.e., *A*~1~*O* × *A*~1~*O*; + +D. Loci that are in the testcross configuration between the parents and segregate in a 1:1 ratio, which include two groups: + +D~1~. Heterozygous in one parent and homozygous in the other, including three alleles *A*~1~*A*~2~ × *A*~3~*A*~3~, two alleles *A*~1~*A*~2~ × *A*~1~*A*~1~, *A*~1~*A*~2~ × *OO* and *A*~2~*O* × *A*~1~*A*~1~, and one allele (with three null alleles) *A*~1~*O* × *OO*; + +D~2~. The reciprocals of D~1~. + +The marker group A is regarded as containing *fully informative* markers because of the complete distinction of the four progeny genotypes. The other six groups all contain the *partially informative* markers since some progeny genotype cannot be phenotypically separated from other genotypes. This incomplete distinction leads to the segregation ratios 1:2:1 (B), 3:1 (C) and 1:1 (D). Note that marker group D can be viewed as fully informative if we are only interested in the heterozygous parent. + +In the preceding section, we defined a (4 × 4)-matrix **H** for joint *genotype* frequencies between two fully informative markers. But for partially informative markers, only the joint *phenotypes* can be observed and, thus, the joint genotype frequencies, as shown in **H**, will be collapsed according to the same phenotype. Wu et al. \[[@B11]\] designed specific incidence matrices (**I**) relating the genotype frequencies to the phenotype frequencies for different types of markers. Here, we use the notation ![](1471-2156-5-20-i27.gif) for a (*b*~1~ × *b*~2~) matrix of the phenotype frequencies between two partially informative markers, where *b*~1~ and *b*~2~ are the numbers of distinguishable phenotypes for markers ![](1471-2156-5-20-i4.gif) and ![](1471-2156-5-20-i5.gif), respectively. Correspondingly, we have ![](1471-2156-5-20-i28.gif). The EM algorithm can then be developed to estimate the recombination fraction between any two partial informative markers. + +**E Step:** At step *τ*, based on the matrix (**DH**)\' derived from the current estimate *r*^{*τ*}^, calculate the expected number of recombination events between the two markers for a given progeny genotype and ![](1471-2156-5-20-i22.gif): + +![](1471-2156-5-20-i29.gif) + +![](1471-2156-5-20-i30.gif) + +![](1471-2156-5-20-i31.gif) + +where ![](1471-2156-5-20-i32.gif), ![](1471-2156-5-20-i33.gif), ![](1471-2156-5-20-i34.gif) and ![](1471-2156-5-20-i35.gif) is the (*j*~1~*j*~2~)th element of matrices (**DH**)\', **H**\', **P**\' and **Q**\', respectively. + +**M Step**: Calculate *r*^{*τ*+1}^ using the equation, + +![](1471-2156-5-20-i36.gif) + +The E and M steps between Eqs. (8) -- (11) are repeated until the estimate converges to a stable value. + +## Three-locus analysis + +### A general framework + +Consider three markers in a linkage group that have three possible orders ![](1471-2156-5-20-i37.gif), ![](1471-2156-5-20-i38.gif) and ![](1471-2156-5-20-i39.gif). Let *o*~1~, *o*~2~ and *o*~3~ be the corresponding probabilities of occurrence of these orders in the parental genome. Without loss of generality, for a given order, the allelic arrangement of the first marker between the two homologous chromosomes can be fixed for a parent. Thus, the change of the allelic arrangements at the other two markers will lead to 2 × 2 = 4 parental diplotypes. The three-marker genotype of parent P (12/12/12) may have four possible diplotypes, \[111\] \[222\], \[112\] \[221\], \[121\] \[212\] and \[122\] \[211\]. Relative to the fixed allelic arrangement 1\|2\| of the first marker on the two homologous chromosomes **1** and **2**, the probabilities of allelic arragments 1\|2\| and 2\|1\| are denoted as *p*~1~ and 1 - *p*~1~ for the second marker and as *p*~2~ and 1 - *p*~2~ for the third marker, respectively. Assuming that allelic arrangements are independent between the second and third marker, the probabilities of these four three-marker diplotypes can be described by *p*~1~*p*~2~, *p*~1~(1 - *p*~2~), (1 - *p*~1~)*p*~2~ and (1 - *p*~1~) (1 - *p*~2~), respectively. The four diplotypes of parent Q can also be constructed, whose probabilities are defined as *q*~1~*q*~2~, *q*~1~(1 - *q*~2~), (1 - *q*~1~)*q*~2~ and (1 - *q*~1~) (1 - *q*~2~) respectively. Thus, there are 4 × 4 = 16 possible diplotype combinations (whose probabilities are the product of the corresponding diplotype probabilities) when parents P and Q are crossed. + +Let *r*~12~ denote the recombination fraction between markers ![](1471-2156-5-20-i4.gif) and ![](1471-2156-5-20-i5.gif), with *r*~23~ and *r*~13~defined similarly. These recombination fractions are associated with the probabilities with which a crossover occurs between markers ![](1471-2156-5-20-i4.gif) and ![](1471-2156-5-20-i5.gif) and between markers ![](1471-2156-5-20-i5.gif) and ![](1471-2156-5-20-i6.gif). The event that a crossover or no crossover occurs in each interval is denoted by *D*~11~ and *D*~00~, respectively, whereas the events that a crossover occurs only in the first interval or in the second interval is denoted by *D*~10~ and *D*~01~, respectively. The probabilities of these events are denoted by *d*~00~, *d*~01~, *d*~10~and *d*~11~, respectively, whose sum equals 1. According to the definition of recombination fraction as the probability of a crossover between a pair of loci, we have *r*~12~ = *d*~10~ + *d*~11~, *r*~23~ = *d*~01~ + *d*~11~ and *r*~13~ = *d*~01~ + *d*~10~. These relationships have been used by Haldane \[[@B17]\] to derive the map function that converts the recombination fraction to the corresponsding genetic distance. + +For a three-point analysis, there are a total of 16 (16 × 4)-matrices for genotype frequencies under a given marker order (![](1471-2156-5-20-i40.gif)), each corresponding to a diplotype combination, denoted by ![](1471-2156-5-20-i41.gif), where ![](1471-2156-5-20-i42.gif) for 1\|2\| or 2 for 2\|1\| denote the two alternative allelic arrangements of the second and third marker, respectively, for parent P, and ![](1471-2156-5-20-i43.gif) for 1\|2\| or 2 for 2\|1\| denote the two alternative allelic arrangements of the second and third marker, respectively, for parent Q. According to Ridout et al. \[[@B18]\] and Wu et al. \[[@B11]\], elements in ![](1471-2156-5-20-i41.gif) are expressed in terms of *d*~00~, *d*~01~, *d*~10~ and *d*~11~. + +Similarly, there are 16 (16 × 4)-matrices for the expected numbers of crossover that have occurred for *D*~00~, *D*~01~, *D*~10~ and *D*~11~ for a given marker order, denoted by ![](1471-2156-5-20-i44.gif), ![](1471-2156-5-20-i45.gif), ![](1471-2156-5-20-i46.gif) and ![](1471-2156-5-20-i47.gif) respectively. In their Table [2](#T2){ref-type="table"}, Wu et al. \[[@B11]\] gave the three-locus genotype frequencies and the number of crossovers on different marker intervals under marker order ![](1471-2156-5-20-i48.gif). + +::: {#T2 .table-wrap} +::: caption +Estimation from three-point analysis of the recombination fraction ( + +![](1471-2156-5-20-i1.gif) + +± SD) and the parental diplotype probabilities of parent P ( + +![](1471-2156-5-20-i2.gif) + +) and Q ( + +![](1471-2156-5-20-i3.gif) + +) for five markers in a full-sib family of *n* = 100 +::: + + Parental diplotype ![](1471-2156-5-20-i1.gif) ![](1471-2156-5-20-i1.gif) + ------------------------------- -------------------- ----- --- ----- ----- ---------------------------- ----------------- ---------------------------- ---------------------------- ---------------------------- ----------------- ---------------------------- ---------------------------- + + Marker P × Q Case 1 Case 2 ![](1471-2156-5-20-i2.gif) ![](1471-2156-5-20-i3.gif) Case 1 Case 2 ![](1471-2156-5-20-i2.gif) ![](1471-2156-5-20-i3.gif) + Recombination fraction = 0.05 + \| \| \| \| + ![](1471-2156-5-20-i4.gif) *a* *b* *c* *d* + \| \| \| \| 0.0511 ± 0.0175 + ![](1471-2156-5-20-i5.gif) *a* *b* *a* *b* 0.1008 ± 0.0298 0.9978 0.9986 + \| \| \| \| 0.0578 ± 0.0269 0.0557 ± 0.0312 + ![](1471-2156-5-20-i6.gif) *a* *o* × *o* *a* 0.9977 0 0.0988 ± 0.0277 1 0 + \| \| \| \| 0.0512 ± 0.0307 0.0476 ± 0.0280 1 1/0 + ![](1471-2156-5-20-i7.gif) *a* *b* *b* *b* 0.0932 ± 0.0301 1 1/0 1 1/0 + \| \| \| \| 0.0514 ± 0.0229 + ![](1471-2156-5-20-i8.gif) *a* *b* *c* *d* 1 1 + \| \| \| \| + + Recombination fraction = 0.20 + \| \| \| \| + ![](1471-2156-5-20-i4.gif) *a* *b* *c* *d* + \| \| \| \| 0.2026 ± 0.0348 + ![](1471-2156-5-20-i5.gif) *a* *b* *a* *b* 0.3282 ± 0.0482 0.9918 0.9916 + \| \| \| \| 0.2240 ± 0.0758 0.2408 ± 0.0939 + ![](1471-2156-5-20-i6.gif) *a* *o* × *o* *a* 0.9944 0 0.3241 ± 0.0488 1 0 + \| \| \| \| 0.1927 ± 0.0613 0.1824 ± 0.0614 + ![](1471-2156-5-20-i7.gif) *a* *b* *b* *b* 0.3161 ± 0.0502 1 1/0 1 1/0 + \| \| \| \| 0.2017 ± 0.0393 + ![](1471-2156-5-20-i8.gif) *a* *b* *c* *d* 1 1 + \| \| \| \| + +Case 1 denotes the recombination fraction between two adjacent markers, whereas case 2 denotes the recombination fraction between the two markers separated by a third marker. See Table 1 for other explanations. +::: + +The joint genotype frequencies of the three markers can be viewed as a mixture of 16 diplotype combinations and three orders, weighted by their occurring probabilities, and is expressed as + +![](1471-2156-5-20-i49.gif) + +Similarly, the expected number of recombination events contained within a progeny genotype is the mixture of the different diplotype and order combinations, expressed as: + +![](1471-2156-5-20-i50.gif) + +![](1471-2156-5-20-i51.gif) + +![](1471-2156-5-20-i52.gif) + +![](1471-2156-5-20-i53.gif) + +Also define + +![](1471-2156-5-20-i54.gif) + +The occurring probabilities of the three marker orders are the mixture of all diplotype combinations, expressed, in matrix notation, as + +![](1471-2156-5-20-i55.gif) + +We implement the EM algorithm to estimate the MLEs of the recombination fractions between the three markers. The general equations formulating the iteration of the {*τ* + 1}th EM step are given as follows: + +**E Step:** As step *τ*, calculate the expected number of recombination events associated with *D*~00~(*α*), *D*~01~ (*β*), *D*~10~(*γ*), *D*~11~(*δ*) for the (*j*~1~*j*~2~*j*~3~)th progeny genotype (where *j*~1~, *j*~2~ and *j*~3~ denote the progeny genotypes of the three individual markers, respectively): + +![](1471-2156-5-20-i56.gif) + +![](1471-2156-5-20-i57.gif) + +![](1471-2156-5-20-i58.gif) + +![](1471-2156-5-20-i59.gif) + +Calculate ![](1471-2156-5-20-i60.gif), ![](1471-2156-5-20-i61.gif), ![](1471-2156-5-20-i62.gif), ![](1471-2156-5-20-i63.gif) and ![](1471-2156-5-20-i64.gif), (*k* = 1,2,3) using + +![](1471-2156-5-20-i65.gif) + +![](1471-2156-5-20-i66.gif) + +![](1471-2156-5-20-i67.gif) + +![](1471-2156-5-20-i68.gif) + +![](1471-2156-5-20-i69.gif) + +where *n*~*j*1*j*2*j*3~ denote the number of progeny with a particular three-marker genotype, h~*j*1*j*2*j*3~, ![](1471-2156-5-20-i70.gif), ![](1471-2156-5-20-i71.gif), ![](1471-2156-5-20-i72.gif), ![](1471-2156-5-20-i73.gif), *p*~1(*j*1*j*2*j*3)~, *p*~2(*j*1*j*2*j*3)~, *q*~1(*j*1*j*2*j*3)~ and *q*~2(*j*1*j*2*j*3)~ are the (*j*~1~*j*~2~*j*~3~)th element of matrices **H**, **D**~00~, **D**~01~, **D**~10~, **D**~11~, **P**~1~, **P**~2~, **Q**~1~ and **Q**~2~, respectively. + +**M Step:** Calculate ![](1471-2156-5-20-i74.gif), ![](1471-2156-5-20-i75.gif), ![](1471-2156-5-20-i76.gif) and ![](1471-2156-5-20-i77.gif) using the equations, + +![](1471-2156-5-20-i78.gif) + +![](1471-2156-5-20-i79.gif) + +![](1471-2156-5-20-i80.gif) + +![](1471-2156-5-20-i81.gif) + +The E and M steps are repeated among Eqs. (19) -- (32) until *d*~00~, *d*~01~, *d*~10~ and *d*~11~ converge to values with satisfied precision. From the MLEs of the *g\'s*, the MLEs of recombination fractions *r*~12~, *r*~13~ and *r*~23~ can be obtained according to the invariance property of the MLEs. + +### Model for partial informative markers + +Consider three partially informative markers with the numbers of distinguishable pheno-types denoted by *b*~1~, *b*~2~ and *b*~3~, respectively. Define ![](1471-2156-5-20-i82.gif) is a (*b*~1~*b*~2~ × *b*~3~) matrix of genotype frequencies for three partially informative markers. Similarly, we define ![](1471-2156-5-20-i83.gif), and ![](1471-2156-5-20-i84.gif). + +Using the procedure described in Section (2.2), we implement the EM algorithm to estimate the MLEs of the recombination fractions among the three partially informative markers. + +### m-point analysis + +Three-point analysis considering the dependence of recombination events among different marker intervals can be extended to perform the linkage analysis of an arbitrary number of markers. Suppose there are *m* ordered markers on a linkage group. The joint genotype probabilities of the *m* markers form a (4^*m*-1^ × 4)-dimensional matrix. There are 2^*m*-1^ × 2^*m*-1^ such probability matrices each corresponding to a different parental diplotype combination. The reasonable estimates of the recombination fractions rely upon the characterization of a most likely parental diplotype combination based on the multilocus likelihood values calculated. + +The *m*-marker joint genotype probabilities can be expressed as a function of the probability of whether or not there is a crossover occurring between two adjacent markers, ![](1471-2156-5-20-i85.gif) where *l*~1~, *l*~2~, \..., *l*~*m*-1~ are the indicator variables denoting the crossover event between markers ![](1471-2156-5-20-i4.gif) and ![](1471-2156-5-20-i5.gif), markers ![](1471-2156-5-20-i5.gif) and ![](1471-2156-5-20-i6.gif), \..., and markers ![](1471-2156-5-20-i86.gif) and ![](1471-2156-5-20-i87.gif), respectively. An indicator is defined as 1 if there is a crossover and 0 otherwise. Because each indicator can be taken as one or zero, there are a total of 2^*m*-1^ D\'s. + +The occurring probability of interval-specific crossover ![](1471-2156-5-20-i88.gif) can be estimated using the EM algorithm. In the E step, the expected number of interval specific crossovers is calculated (see Eqs. (19) -- (22) for three-point analysis). In the M step, an explicit equation is used to estimate the probability ![](1471-2156-5-20-i88.gif). The MLEs of ![](1471-2156-5-20-i88.gif) are further used to estimate *m*(*m* - 1)/2 recombination fractions between all possible marker pairs. In *m*-point analysis, parental diplotypes and gene orders can be incorporated in the model. + +## Monte Carlo simulation + +Simulation studies are performed to investigate the statistical properties of our model for simultaneously estimating linkage, parental diplotype and gene order in a full-sib family derived from two outbred parents. Suppose there are five markers of a known order on a chromosome. These five markers are segregating differently in order, 1:1:1:1, 1:2:1, 3:1, 1:1 and 1:1:1:1. The diplotypes of the two parents for the five markers are given in Table [1](#T1){ref-type="table"} and using these two parents a segregating full-sib family is generated. In order to examine the effects of parameter space on the estimation of linkage, parental diplotype and gene order, the full-sib family is simulated with different degrees of linkage (*r* = 0.05 vs. 0.20) and different sample sizes (*n* = 100 vs. 200). + +As expected, the estimation precision of the recombination fraction depends on the marker type, the degree of linkage and sample size. More informative markers, more tightly linked markers and larger sample sizes display greater estimation precision of linkage than less informative markers, less tightly linked markers and smaller sample sizes (Tables [1](#T1){ref-type="table"} and [2](#T2){ref-type="table"}). To save space, we do not give the results about the effects of sample size in the tables. Our model can provide an excellent estimation of parental linkage phases, i.e., parental diplotype, in two-point analysis. For example, the MLE of the probability (*p* or *q*) of parental diplotype is close to 1 or 0 (Table [1](#T1){ref-type="table"}), suggesting that we can always accurately estimate parental diplotypes. But for two symmetrical markers (e.g., markers ![](1471-2156-5-20-i5.gif) and ![](1471-2156-5-20-i6.gif) in this example), two sets of MLEs, ![](1471-2156-5-20-i2.gif) = 1, ![](1471-2156-5-20-i3.gif) = 0 and ![](1471-2156-5-20-i2.gif) = 0, ![](1471-2156-5-20-i3.gif) = 1, give an identical likelihood ratio test statistic. Thus, two-point analysis cannot specify parental diplotypes for symmetrical markers even when the two parents have different diplotypes. + +The estimation precision of linkage can be increased when a three-point analysis is performed (Table [2](#T2){ref-type="table"}), but this depends on different marker types and different degrees of linkage. Advantage of three-point analysis over two-point analysis is more pronounced for partially than fully informative markers, and for less tightly than more tightly linked markers. For example, the sampling error of the MLE of the recombination fraction (assuming *r* = 0.20) between markers ![](1471-2156-5-20-i5.gif) and ![](1471-2156-5-20-i6.gif) from two-point analysis is 0.0848, whereas this value from a three-point analysis decreases to 0.0758 when combining fully informative marker ![](1471-2156-5-20-i4.gif) but increases to 0.0939 when combining partially informative marker ![](1471-2156-5-20-i7.gif). The three-point analysis can clearly determine the diplotypes of different parents as long as one of the three markers is asymmetrical. In our example, using either asymmetrical marker ![](1471-2156-5-20-i4.gif) or ![](1471-2156-5-20-i7.gif), the diplotypes of the two parents for two symmetrical markers (![](1471-2156-5-20-i5.gif) and ![](1471-2156-5-20-i6.gif)) can be determined. Our model for three-point analysis can determine a most likely gene order. In the three-point analyses combining markers ![](1471-2156-5-20-i89.gif), markers ![](1471-2156-5-20-i90.gif) and marker ![](1471-2156-5-20-i91.gif), the MLEs of the probabilities of gene order are all almost equal to 1, suggesting that the estimated gene order is consistent with the order hypothesized. + +To demonstrate how our linkage analysis model is more advantageous over the existing models for a full-sib family population, we carry out a simulation study for linked dominant markers. In two-point analysis, two different parental diplotype combinations are assumed: (1) \[*aa*\] \[*oo*\] × \[*aa*\] \[*oo*\] (*cis* × *cis*) and (2) \[*ao*\] \[*oa*\] × \[*ao*\] \[*oa*\] (*trans* × *trans*). The MLE of the linkage under combination (2), in which two dominant alleles are in a repulsion phase, is not as precise as that under combination (1), in which two dominant non-alleles are in a coupling phase \[[@B12]\]. For a given data set with unknown linkage phase, the traditional procedure for estimating the recombination fraction is to calculate the likelihood values under all possible linkage phase combinations (i.e., *cis* × *cis*, *cis* × *trans*, *trans* × *cis* and *trans* × *trans*). The combinations, *cis* × *cis* and *trans* × *trans*, have the same likelihood value, with the MLE of one combination being equal to the subtraction of the MLE of the second combination from 1. The same relationship is true for *cis* × *trans* and *trans* × *cis*. A most likely phase combination is chosen corresponding to the largest likelihood and a legitimate MLE of the recombination fraction (*r* ≤ 0.5) \[[@B10]\]. + +For our data set simulated from \[*aa*\] \[*oo*\] × \[*aa*\] \[*oo*\], one can easily select *cis* × *cis* as the best estimation of phase combination because it corresponds to a larger likelihood and a smaller ![](1471-2156-5-20-i1.gif) (Table [3](#T3){ref-type="table"}). Our model incorporating the parental diplotypes can provide comparable estimation precision of the linkage for the data from \[*aa*\] \[*oo*\] × \[*aa*\] \[*oo*\] and precisely determine the parental diplotypes (see the MLEs of *p* and *q*; Table [3](#T3){ref-type="table"}). Our model has great advantage over the traditional model for the data derived from \[*ao*\] \[*oa*\] × \[*ao*\] \[*oa*\]. For this data set, the same likelihood was obtained under all possible four diplotype combinations (Table [3](#T3){ref-type="table"}). In this case, one would select *cis* × *trans* or *trans* × *cis* because these two phase combinations are associated with a lower estimate of *r*. But this estimate of *r* (0.0393) is biased since it is far less than the value of 0.20 hypothesized. Our model gives the same estimation precision of the linkage for the data derived from \[*ao*\] \[*oa*\] × \[*ao*\] \[*oa*\] as obtained when the analysis is based on a correct diplotype combination (Table [3](#T3){ref-type="table"}). Also, our model can precisely determine the parental diplotypes (![](1471-2156-5-20-i2.gif) = ![](1471-2156-5-20-i3.gif) = 0 ). + +::: {#T3 .table-wrap} +::: caption +Comparison of the estimation of the linkage and parental diplotype between two dominant markers in a full-sib family of *n* = 100 from the traditional and our model +::: + ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| | Traditional model | | | | Our model | ++=======================================+===================+=================+=================+===================+=================+ +| | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| | *cis* × *cis* | *cis* × *trans* | *trans* × *cis* | *trans* × *trans* | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Data simulated from *cis* × *cis* | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Correct diplotype combination | Correct | Incorrect | Incorrect | Incorrect | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Log-likelihood^*a*^ | -46.2 | -92.3 | -92.3 | -46.2 | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| ![](1471-2156-5-20-i1.gif) | 0.1981 ± 0.0446 | 0.5000 ± 0.0000 | 0.5000 ± 0.0000 | 0.8018 ± 0.0446 | | +| | | | | | | +| under each diplotype combination | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Estimated diplotype combination | Selected | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| ![](1471-2156-5-20-i1.gif) | 0.1981 ± 0.0446 | | | | 0.1982 ± 0.0446 | +| | | | | | | +| under correct diplotype combination | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Diplotype probability for parent P ( | | | | | 1.0000 ± 0.0000 | +| | | | | | | +| ![](1471-2156-5-20-i2.gif) | | | | | | +| | | | | | | +| ) | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Diplotype probability for parent Q ( | | | | | 1.0000 ± 0.0000 | +| | | | | | | +| ![](1471-2156-5-20-i3.gif) | | | | | | +| | | | | | | +| ) | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Data simulated from *trans* × *trans* | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Correct diplotype combination | Incorrect | Incorrect | Incorrect | Correct | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Log-likelihood^*a*^ | -89.6 | -89.6 | -89.6 | -89.6 | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| ![](1471-2156-5-20-i1.gif) | 0.8573 ± 0.1253 | 0.0393 ± 0.0419 | 0.0393 ± 0.0419 | 0.1426 ± 0.1253 | | +| | | | | | | +| under each diplotype combination | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Estimated diplotype combination | | Selected | Selected | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| ![](1471-2156-5-20-i1.gif) | | | | 0.1426 ± 0.1253 | 0.1428 ± 0.1253 | +| | | | | | | +| under correct diplotype combination | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Diplotype probability for parent P ( | | | | | 0.0000 ± 0.0000 | +| | | | | | | +| ![](1471-2156-5-20-i2.gif) | | | | | | +| | | | | | | +| ) | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ +| Diplotype probability for parent Q ( | | | | | 0.0000 ± 0.0000 | +| | | | | | | +| ![](1471-2156-5-20-i3.gif) | | | | | | +| | | | | | | +| ) | | | | | | ++---------------------------------------+-------------------+-----------------+-----------------+-------------------+-----------------+ + +^*a*^The log-likelihood values given here are those from one random simulation for each diplotype combination by the traditional model. +::: + +In three-point analysis, we examine the advantage of implementing linkage analysis with gene orders. Three dominant markers are assumed to have two different parental diplotypes combinations: (1) \[*aaa*\] \[*ooo*\] × \[*aaa*\] \[*ooo*\] and (2) \[*aao*\] \[*ooa*\] × \[*aao*\] \[*ooa*\]. The traditional approach is to calculate the likelihood values under three possible gene orders and choose one of a maximum likelihood to estimate the linkage. Under combination (1), a most likely gene order can be well determined and, therefore, the recombination fractions between the three markers well estimated, because the likelihood value of the correct order is always larger than those of incorrect orders (Table [4](#T4){ref-type="table"}). However, under combination (2), the estimates of linkage are not always precise because with a frequency of 20% gene orders are incorrectly determined. The estimates of *r*\'s will largely deviate from their actual values based on a wrong gene order (Table [4](#T4){ref-type="table"}). Our model incorporating gene order can provide the better estimation of linkage than the traditional approach, especially between those markers with dominant alleles being in a repulsion phase. Furthermore, a most likely gene order can be determined from our model at the same time when the linkage is estimated. + +::: {#T4 .table-wrap} +::: caption +Comparison of the estimation of the linkage and gene order between three dominant markers in a full-sib family of *n* = 100 from the traditional and our model +::: + + MLE Traditional model Our model + ---------------------------------------------------------------- ----------------------------- ----------------------------- ----------------------------- ----------------- + + ![](1471-2156-5-20-i92.gif) ![](1471-2156-5-20-i93.gif) ![](1471-2156-5-20-i94.gif) + Data stimulated from \[*aaa*\] \[*ooo*\] × \[*aaa*\] \[*ooo*\] + Correct gene order Correct Incorrect Incorrect + Estimated best gene order (%^*a*^) 100 0 0 + ![](1471-2156-5-20-i95.gif) 0.2047 ± 0.0422 0.2048 ± 0.0422 + ![](1471-2156-5-20-i96.gif) 0.1980 ± 0.0436 0.1985 ± 0.0434 + ![](1471-2156-5-20-i97.gif) 0.3245 ± 0.0619 0.3235 ± 0.0618 + ![](1471-2156-5-20-i98.gif) 0.9860 ± 0.0105 + ![](1471-2156-5-20-i99.gif) 0.0060 ± 0.0071 + ![](1471-2156-5-20-i100.gif) 0.0080 ± 0.0079 + Data simulated from \[*aao*\] \[*ooa*\] × \[*aao*\] \[*ooa*\] + Correct gene order Correct Incorrect Incorrect + Estimated best gene order (%^*a*^) 80 11 9 + ![](1471-2156-5-20-i95.gif) 0.1991 ± 0.0456 0.8165 ± 0.1003 0.9284 ± 0.0724 0.2104 ± 0.0447 + ![](1471-2156-5-20-i96.gif) 0.1697 ± 0.0907 0.8220 ± 0.0338 0.1636 ± 0.0608 0.2073 ± 0.0754 + ![](1471-2156-5-20-i97.gif) 0.3218 ± 0.0755 0.2703 ± 0.0586 0.7821 ± 0.0459 0.2944 ± 0.0929 + ![](1471-2156-5-20-i98.gif) 0.9952 ± 0.0058 + ![](1471-2156-5-20-i99.gif) 0.0045 ± 0.0058 + ![](1471-2156-5-20-i100.gif) 0.0003 ± 0.0015 + +^*a*^The percents of a total of 200 simulations that have a largest likelihood for a given gene order estimated from the traditional approach. In this example used to examine the advantage of implementing gene orders, known linkage phases are assumed. +::: + +Our model is further used to perform joint analyses including more than three markers. When the number of markers increases, the number of parameters to be estimated will be exponentially increased. For four-point analysis, the speed of convergence was slow and the accuracy and precision of parameter estimation have been affected for a sample size of 200 (data not shown). According to our simulation experience, the improvement of more-than-three-point analysis can be made possible by increasing sample size or by using the estimates from two- or three-point analysis as initial values. + +## A worked example + +We use an example from published literature \[[@B18]\] to demonstrate our unifying model for simultaneous estimation of linkage, parental diplotype and gene order. A cross was made between two triple heterozygotes with genotype *AaVvXx* for markers ![](1471-2156-5-20-i101.gif), ![](1471-2156-5-20-i102.gif) and ![](1471-2156-5-20-i103.gif). Because these three markers are dominant, the cross generates 8 distinguishable genotypes, with observations of 28 for *A*~-~/*V*~-~/*X*~-~, 4 for *A*~-~/*V*~-~/*xx*, 12 for *A*~-~/*vv*/*X*~-~, 3 for *A*~-~/*vv*/*xx*, 1 for *aa*/*V*~-~/*X*~-~, 8 for *aa*/*V*~-~/*xx*, 2 for *aa*/*vv*/*X*~-~ and 2 for *aa*/*vv*/*xx*. We first use two-point analysis to estimate the recombination fractions and parental diplotypes between all possible pairs of the three markers. The recombination fraction between markers ![](1471-2156-5-20-i101.gif) and ![](1471-2156-5-20-i102.gif) is ![](1471-2156-5-20-i104.gif), whose the estimated parental diplotypes are \[*Av*\] \[*aV*\] × \[*AV*\] \[*av*\] or \[*AV*\] \[*av*\] × \[*Av*\] \[*aV*\]. The other two recombination fractions and the corresponding parental displotypes are estimated as ![](1471-2156-5-20-i105.gif), \[*Vx*\] \[*vX*\] × \[*VX*\] \[*vx*\] or \[*VX*\] \[*vx*\] × \[*Vx*\] \[*vX*\] and ![](1471-2156-5-20-i106.gif), \[*AX*\] \[*ax*\] × \[*AX*\] \[*ax*\], respectively. From the two-point analysis, one of the two parents have dominant alleles from markers ![](1471-2156-5-20-i101.gif) and ![](1471-2156-5-20-i103.gif) are repulsed with the dominant alleles from marker ![](1471-2156-5-20-i102.gif). + +Our subsequent three-point analysis combines parental diplotypes and gene orders to estimate the linkage between these three dominant markers. The estimated gene order is ![](1471-2156-5-20-i107.gif). The MLEs of the recombination fractions are ![](1471-2156-5-20-i108.gif), ![](1471-2156-5-20-i109.gif) and ![](1471-2156-5-20-i110.gif). The parental diplotype combination is \[*XAV*\] \[*xav*\] × \[*XAv*\] \[*xaV*\] or \[*XAv*\] \[*xaV*\] × \[*XAV*\] \[*xav*\]. The three-point analysis for these three markers by Ridout et al. \[[@B18]\] led to the estimates of the three recombination fractions all equal to 0.20. But their estimates may not be optimal because the effect of gene order on ![](1471-2156-5-20-i1.gif) was not considered. + +# Discussion + +Several statistical methods and software packages have been developed for linkage analysis and map construction in experimental crosses and well-structured pedigrees \[[@B2]-[@B6]\], but these methods need unambiguous linkage phases over a set of markers in a linkage group. For outcrossing species, such as forest trees, it is not possible to know exact linkage phases for any of two parents that are crossed to generate a full-sib family prior to linkage analysis. This uncertainty about linkage phases makes linkage mapping in outcrossing populations much more difficult than that in phase-known pedigrees \[[@B7],[@B9]\]. + +In this article we present a unifying model for simultaneously estimating the linkage, parental diplotype and gene order in a full-sib family derived from two outbred parents. As demonstrated by simulation studies, our model is robust to different parameter space. Compared to the traditional approaches that calculate the likelihood values separately under all possible linkage phases or orders \[[@B9],[@B10],[@B18]\], our approach is more advantageous in three aspects. First, it provides a one-step analysis of estimating the linkage, parental diplotype and gene order, thus facilitating the implementation of a general method for analyzing any segregating type of markers for outcrossing populations in a package of computer program. For some short-generation-interval outcrossing species, we can obtain marker information from grandparents, parents and progeny. The model presented here allow for the use of marker genotypes of the grandparents to derive the diplotype of the parents. Second, our model for the first time incorporates gene ordering into a unified linkage analysis framework, whereas most earlier studies only emphasized on the characterization of linkage phases through a multilocus likelihood analysis \[[@B11],[@B14],[@B15]\]. Instead of a comparative analysis of different orders, we proposed to determine a most likely gene order by estimating the order probabilities. + +Third, and most importantly, our unifying approach can significantly improve the estimation precision of the linkage for dominant markers whose alleles are in repulsion phase. Previous analyses have indicated that the estimate of the linkage between dominant markers in a repulsion phase is biased and imprecise, especially when the linkage is not strong and when sample size is small \[[@B12]\]. There are two reasons for this: (1) the linkage phase cannot be correctly determined, and/or (2) there is a fairly high possibility (20%) of detecting a wrong gene order. Our approach provides more precise estimates of the recombination fraction because correct parental diplotypes and a correct gene order can be determined. + +Our approach will be broadly useful in genetic mapping of outcrossing species. In practice, a two-point analysis can first be performed to obtain the pairwise estimates of the recombination fractions and using this pairwise information markers are grouped based on the criteria of a maximum recombination fraction and minimum likelihood ratio test statistic \[[@B2]\]. The parental diplotypes of markers in individual groups are constructed using a three-point analysis. With a limited sample size available in practice, we do not recommend more-than-three-point analysis because this would bring too many more unknown parameters to be precisely estimated. If such an analysis is desirable, however, one may use the results from these lower-point analyses as initial values to improve the convergence rate and possibly the precision of parameter estimation. + +In any case, our two- and three-point analysis has built a key stepping stone for map construction through two approaches. One is the least-squares method, as originally developed by Stam \[[@B5]\], that can integrate the pairwise recombination fractions into reconstruction of multilocus linkage map. The second is to use the hidden Markov chain (HMC) model, first proposed by Lander and Green \[[@B2]\], to construct genetic linkage maps by treating map construction as a combinatorial optimization problem. The simulated annealing algorithm \[[@B19]\] for searching for optima of the multilocus likelihood function need to be implemented for the HMC model. A user-friendly package of software that is being written by the senior author will implement two- and three-point analyses as well as the algorithm for map construction based on the estimates of pairwise recombination fractions. This software will be online available to the public. + +Our maximum likelihood-based approach is implemented with the EM algorithm. We also incorporate the Gibbs sampler \[[@B20]\] into the estimation procedure of the mixture model for the linkage characterizing different parental diplotypes and gene orders of different markers. The results from the Gibbs sampler are broadly consistent with those from the EM algorithm, but the Gibbs sampler is computationally more efficient for a complicated problem than the EM algorithm. Therefore, the Gibbs sampler may be particularly useful when our model is extended to consider multiple full-sib families in which the parents may be selected from a natural population. For such a multi-family design, some population genetic parameters describing the genetic structure of the original population, such as allele frequencies and linkage disequilibrium, should be incorporated and estimated in the model for linkage analysis. It can be anticipated that the Gibbs sampler will play an important role in estimating these parameters simultaneously along with the linkage, linkage phases, and gene order. + +# Authors\' contributions + +QL derived the genetic and statistical models and wrote computer programs. YHC participated in the derivations of models and statistical analyses. RLW conceived of ideas and algorithms, and wrote the draft. All authors read and approved the final manuscript. + +### Acknowledgements + +We thank two anonymous referees for their constructive comments on the manuscript. This work is partially supported by a University of Florida Research Opportunity Fund (02050259) and a University of South Florida Biodefense Grant (7222061-12) to R. W. The publication of this manuscript is approved as Journal Series No. R-10073 by the Florida Agricultural Experiment Station. diff --git a/pubmedcentral/example/PMC544568.md b/pubmedcentral/example/PMC544568.md new file mode 100644 index 0000000..27eebe3 --- /dev/null +++ b/pubmedcentral/example/PMC544568.md @@ -0,0 +1,164 @@ +# Background + +Geographic surveillance of chronic disease is central to understanding spatial or spatial-temporal patterns that may help to identify discrepancies in disease burden among different regions or communities. As part of ongoing efforts in New York State to understand spatial patterns of cancer and to help implement cancer prevention and control programs, small area maps of cancer relative risk, expressed as standardized incidence ratios (SIRs), have been produced and shared with the public \[[@B1]\] for the most common anatomical cancer sites. + +Prostate cancer, the focus of this paper, was included because it is the most common non-dermatologic malignancy diagnosed among men and the second leading cause of cancer deaths for men in the United States (US) \[[@B2]\]. Although mortality from this disease in the US has statistically significantly decreased at a rate of 2.6% per year from 1990 to 2000 \[[@B3]\], unexplained geographic discrepancies in mortality rates do exist \[[@B4]\]. Furthermore, several treatment options appear to be associated with excellent long-term disease-specific survival for otherwise healthy men with localized disease \[[@B5]\]. + +Results for prostate cancer (all stages combined) are reproduced in Figure [1](#F1){ref-type="fig"}, where ZIP code-level standardized incidence ratios (SIRs) are presented along with results from analyzing these data with the spatial scan statistic \[[@B6]\]. The circles in Figure [1](#F1){ref-type="fig"} represent statistically elevated regions based on Poisson likelihood ratios comparing rates inside the circle to those outside the circle. Details of how the scan statistic results were reduced to the circles presented in Figure [1](#F1){ref-type="fig"} are found in Boscoe et al \[[@B7]\]. + +
+

+
All stage prostate cancer Incidence by ZIP Code in New York State, 1994–1998. ZIP code-level ratios of observed incidence to age- and race-adjusted expected incidence, along with significant spatial scan statistic circles that are non-overlapping within specified ranges of standardized incidence ratios, based on reference [1]. Select cities and regions overlaid for reference.
+
+ +It is well recognized that the stability of population-based statistics like the SIRs in Figure [1](#F1){ref-type="fig"} can vary greatly among small geographic areas due to varying population size. Different methods of smoothing have been developed to address this issue, where all are based on the phenomenon that observations close together in space are more likely to share similar properties than those that are far apart \[[@B8]\]. While this positive spatial autocorrelation may be problematic for statistical methods that require independent observations, it can also be embraced to help smooth noisy maps by borrowing strength from neighbors for those mapping units with small populations. Of the different approaches to spatial smoothing, only a few appear to have gained acceptance in spatial epidemiology. Non-parametric approaches include spatial filtering \[[@B9],[@B10]\] and the head-banging algorithm \[[@B11]\], both of which are basically variations on a moving window kernel-type smoother. + +The parametric approach of generalized linear modelling \[[@B12]\] treats the observed response, *y*, as a random variable that has arisen from a probability distribution with expectation *θ*. This expectation is modeled, via an appropriate link *g*(·), as a linear function *g*(*θ*) = *α* + **x\'*β*** + *ε*, for a common value *α*, explanatory covariates **x\'*β*** and a random effect *ε* that captures unexplained variation. If the random effect is associated with exchangeable spatial heterogeneity, estimates are smoothed towards a global mean, whereas if the random effect is associated with local spatial autocorrelation, estimates are smoothed towards a local neighborhood mean, which is typically more meaningful in geographic epidemiology. There are different approaches to modelling local spatial dependence, and section 6.3 of Cressie \[[@B13]\] presents several arguments in favor of the conditional autoregressive (CAR) model originally conceived by Besag \[[@B14]\]. + +Estimation of model parameters can proceed by maximum likelihood \[[@B13]\]; however, the hierarchical nature of generalized linear models lends itself well to Bayesian analysis whereby linear terms in the model are assigned prior distributions that, in turn, have \"hyperprior\" parameters. Earlier applications employed empirical Bayes methods \[[@B15]\], where hyperparameters are estimated directly from the data. This approach is limited because it assigns a point estimate to the hyperparameter without allowing for variability that may be associated with it, and this variability can be large \[[@B16],[@B17]\]. + +Fully Bayesian modelling assigns hyperprior distributions to these hyperparameters, so that every parameter of the hierarchical model is allowed to vary over a prior distribution and no single point estimate is used to represent an unknown parameter value. Furthermore, the fully Bayesian approach allows the convolution model that incorporates both a heterogeneous and spatially structured random effect \[[@B18]\], thus allowing the most flexibility in model development. Several reviews consistently support the fully Bayesian approach over empirical Bayes modelling \[[@B16],[@B17],[@B19]\]. + +Since there are no closed form analytical solutions for parameter estimates of a fully Bayesian model, nor likelihood profiles to maximize, Markov Chain Monte Carlo (MCMC) methods are used to generate large samples from the posterior distributions of all stochastic nodes of the hierarchical model, given the likelihood describing the original data distribution and all appropriate prior and hyperprior distributions of the likelihood parameters \[[@B16]\]. Estimation and inference in the fully Bayesian paradigm are based upon these large sample approximations of the posterior distributions. + +In what follows, the fully Bayesian approach is applied to simulating large samples from the posterior distribution of prostate cancer relative risk in each of 1412 ZIP codes in New York State. Various aspects of these distributions are then mapped to reveal information on the geographic patterns of prostate cancer. + +# Results + +The model defined by equations 1--5 was applied to simulate a sample of 1000 independent observations from the stationary posterior distribution of standardized incidence ratios for each ZIP code. Summary statistics and graphical analysis of these empirical distributions indicated that they arose from generally symmetric posterior distributions. Since the sample mean, median and mode were very similar for each ZIP code, the median was chosen to represent central tendency and is mapped in Figure [2](#F2){ref-type="fig"}. This \"smoothed\" map of SIRs provides a picture of spatial pattern inherent in the raw data mapped in Figure [1](#F1){ref-type="fig"}. Uncertainty associated with these estimates of relative risk is mapped in Figure [3](#F3){ref-type="fig"} as the 95 percentile range (97.5^th^ -- 2.5^th^ percentile) of the 1000 values sampled from the posterior distribution of SIRs for each ZIP code. + +
+

+
Bayesian smoothed prostate cancer incidence. Median of the posterior distribution of ZIP code-level standardized incidence ratios. Thematic categories based on natural breaks method, with slight adjustment.
+
+ +
+

+
Uncertainty of Bayesian Smoothed prostate cancer incidence. ZIP code-level 95th percentile range of posterior distribution of standardized incidence ratios. Thematic categories based on Natural Breaks Method.
+
+ +The posterior distributions can also be used to identify ZIP codes where a specified mass of the distribution of relative risk is greater or less than the null value. For example, Figure [4](#F4){ref-type="fig"} shows ZIP codes where 95% of the simulated SIRs exceed the value of one. In the Bayesian paradigm, those ZIP codes highlighted in Figure [4](#F4){ref-type="fig"} have a 95% probability of higher than expected risk. Likewise, Figure [5](#F5){ref-type="fig"} highlights ZIP codes expressing a 95% probability of lower than expected risk. + +
+

+
ZIP codes with a 95% probability of relative risk exceeding 1. The lower 5th percentile of the posterior distribution of standardized incidence ratios exceeds or equals 1.
+
+ +
+

+
ZIP codes with a 95% probability of relative risk being less than 1. The upper 95th percentile of the posterior distribution of standardized incidence ratios is less than or equal to 1.
+
+ +# Discussion + +## Methodology + +The Poisson model applied in this paper is a particular application of hierarchical Bayes spatial generalized linear models for the exponential family of likelihoods \[[@B20]\]. Particular models are specified by the likelihood that is assumed to give rise to the observations, the structure of the prior, and the hyperprior distributions of variance components, which are typically vague to allow learning from the data. An aspect of these models that will influence outcomes is the neighborhood weights, as in Equation (4); however, defining these weights remains an open area of research. + +Most applications to date use the first order binary weighting scheme where *w*~*ij*~ = 1 if a mapping unit *j* shares a common border with unit *i*, and *w*~*ij*~ = 0 otherwise. This weighting scheme actually has its roots in image analysis, for which this type of modelling was developed \[[@B14]\], and it makes sense when the spatial units are equal size and shape pixels and the response variable has a constant variance for each pixel. However, this is not the case when smoothing disease maps where the mapping units are of irregular size and shape, and stability of the response variable estimates varies with changing population size. This problem is especially relevant for mapping units like ZIP codes. + +A proper approach may be to define weights as a decay function of geographic distance between population-weighted centroids of the mapping units. This function may be obtained by fitting a model correlogram to residuals that are obtained from a model that does not include a random effect to account for spatial autocorrelation. Cressie and Chan \[[@B21]\] used the empirical variogram of the response variable to determine the range of spatial autocorrelation. For neighborhood distances within this range, weights were defined as a function of Euclidean distance. Meanwhile, Griffith \[[@B22]\] provides some \"rules of thumb\" for defining geographic weights, but they are very general. Ferrandiz, et al \[[@B23]\] used a weight of *n*~*i*~*n*~*j*~/*d*~*ij*~ for neighboring mapping units separated by geographic distance *d*~*ij*~ and of population sizes *n*~*i*~ and *n*~*j*~. These authors applied such a weight to prostate cancer mortality mapping; however, this gravity-type weighting may be better suited for infectious disease, not chronic disease. + +If a decay function is fit from the data, the varying stability of disease rates among the mapping units presents a challenge. Since the Bayesian model is designed to adjust for varying stability, perhaps a hierarchical model like the one applied in this paper can be extended so that the weights in Equation (4) are defined as a decay function whose unknown parameters can be assigned \"hyperprior\" distributions. + +## Application to prostate cancer mapping + +Maps like in Figure [1](#F1){ref-type="fig"} present a compromise between the need to protect personal privacy and public demand for fine geographic resolution. Such small mapping units are necessary for discerning among communities that can vary drastically across a region with respect to possible risk factors and both population density and demographics. However, this comes with the cost of unstable risk estimates for many mapping units that have small populations. Smoothing is therefore applied to help visualize spatial pattern that is inherent in the data of Figure [1](#F1){ref-type="fig"}. It is demonstrated how hierarchical Bayes spatial modelling has the appealing feature of providing a whole distribution of possible outcomes that can be used for not only smoothing, but also to explore other aspects of spatial pattern. + +Viewing Figures [1](#F1){ref-type="fig"} through [3](#F3){ref-type="fig"} indicate that the Bayesian model is behaving as expected since the smoothed estimates are increasingly dependent on the prior model as uncertainty increases due to decreasing population, whereas for ZIP codes with large populations, like in New York City, the smoothed estimates are similar to the raw SIRs. + +We also see that many edge ZIP codes in less populated areas tend to have greater uncertainty relative to their non-edge neighbors because there are fewer neighbors to borrow strength from. For the heterogeneous Poisson model applied in this paper, Lawson et al \[[@B24]\] suggest treating edge mapping units as a guard and not as part of the actual study area. However, presenting the width of Bayesian posterior distributions provides a way to retain the edge units while also showing the relative uncertainty associated with their smoothed values. + +The smoothed pattern in Figure [2](#F2){ref-type="fig"} is highlighted for areas of high and low incidence in Figures [4](#F4){ref-type="fig"} and [5](#F5){ref-type="fig"}, respectively. Geographic patterns seen in these maps are potentially influenced by many factors, including differences between regions of the state in terms of racial, ethnic and socio-demographic composition. Yet it is well recognized that interpreting any possible relationships with risk factors is confounded by differences in screening and diagnostic practices across the state. Prostate-specific antigen (PSA) testing remains high among US males over 40 \[[@B25]\] and there is evidence of a steady increase in testing rates in New York State during the years corresponding to the data analyzed in this paper \[[@B26]\]. Along these lines, we note that a relatively large proportion of the four most populated counties of New York City reveal a high probability of less than expected incidence (see Figure [5](#F5){ref-type="fig"} inset). This may be partially explained by the large immigrant population in these four counties, as indicated by much higher proportions of people who are foreign-born and/or do not speak English at home \[[@B27]\], which may translate to lower screening rates. + +Although the patterns seen in Figures [2](#F2){ref-type="fig"}, [4](#F4){ref-type="fig"} and [5](#F5){ref-type="fig"} may be partially explained by geographic variations in PSA testing and diagnostic follow up, such variation is not actually known, therefore we cannot adjust for this potential confounder. In the neighboring state of Connecticut, geographic variation of invasive prostate cancer incidence was large and revealed some consistency before and after the introduction of PSA testing, while the pattern was completely different and variation was much smaller during the years of PSA introduction \[[@B28]\]. These authors suggest that such a space-time pattern reflects the impact of introducing PSA testing, although this cannot be confirmed in the absence of data on geographic differences in PSA use. + +Some areas appear elevated that are in popular vacation spots, such as the eastern forks of Long Island and the \"north country\" of New York State including the Thousand Islands area along the Saint Lawrence River (near Watertown in Figure [1](#F1){ref-type="fig"}) and the Adirondack region. This may possibly be due to a seasonal residence effect whereby vacation areas tend to have artificially inflated chronic disease rates \[[@B29]\]. This occurs when seasonal residents provide a health care provider with a local address of a vacation home, while their primary residence is where they are counted by the decennial US census. Consequently, if their residence at time of diagnoses is in the vacation area, this record inflates the SIR numerator for that area, while they are counted in the denominator for the area of their primary residence as captured by the census. This effect is enhanced since the population spending extended periods in vacation areas tends to be over age 55, which is the age cohort at highest risk of chronic disease. Boscoe and Mclaughlin \[[@B29]\] have presented evidence of increased overall cancer rates in areas with seasonally resident populations in New York State, especially the Thousand Islands area. This uncertainty is reflected in Figure [3](#F3){ref-type="fig"} where the width of the posterior distribution of SIRs for these areas is relatively large. + +While the smoothed results in Figure [2](#F2){ref-type="fig"} present an advantage over mapping raw data, a limitation of smoothing is that the pattern we decipher is subject to confounding by spatially varying population sizes \[[@B30]\]. In other words, smoothed maps like in Figure [2](#F2){ref-type="fig"} reveal high and low relative risk in areas with larger populations, while areas with small populations tend to be smoothed towards the null value. While this means that areas with small populations that actually have abnormally high or low disease rates may be obscured, it is still well recognized that many extreme values associated with small populations may simply reflect random noise. + +Other methods like the spatial scan statistic can be used in conjunction with Bayes smoothing to strengthen overall spatial analysis. Statistically significant scan statistic circles like those in Figure [1](#F1){ref-type="fig"} can vary in size, potentially encompassing many ZIP codes, so are not restricted to only pre-defined neighborhoods like the conditional autoregressive model used by the Bayes smoothing algorithm. In this regard, the spatial scan statistic is similar to smoothing by spatial filtering with variable-radius circles \[[@B10]\]. General regions of statistically elevated relative risk may be identified by the scan statistic, with supporting evidence from Bayesian posterior distributions to help identify the mapping units that contribute strongly to a scan statistic circle. Indeed, each spatial scan statistic circle reported in Figure [1](#F1){ref-type="fig"} contains at least one elevated ZIP code identified in Figure [4](#F4){ref-type="fig"} by the Bayesian model. + +There is ample flexibility for exploratory analysis by varying the display parameters for results from these two methods. For example, the results in Figure [4](#F4){ref-type="fig"} can be either more generalized or further focused by identifying ZIP codes where, say, 90% or 99%, respectively, of the posterior probability mass exceeds the null value of one. At the same time, we can display the set of non-overlapping scan statistic circles that correspond to lower levels of relative risk than are shown in Figure [1](#F1){ref-type="fig"}, thus capturing more geographic area. In fact, when this is done for scan statistic circles corresponding to 15--49% relative risk (not shown), all of the elevated ZIP codes in Figure [4](#F4){ref-type="fig"} are spatially associated with significant scan statistic circles. + +There is extensive literature on hierarchical Bayes spatial modelling for disease mapping; however, most papers are theoretical in nature and use illustrative examples, often with the same data sets. One exception was recently published by Short et al \[[@B31]\], who applied Bayes modelling to produce maps of cancer control variables. Specifically, they smoothed maps of different outcomes (mortality, incidence, staging and screening) for each of breast, colorectal and lung cancer in Minnesota (USA) counties. Cancer control maps were created for each cancer site by obtaining a weighted sum of each smoothed outcome, and an overall cancer control map was obtained by a weighted sum of the individual cancer control map values. These results can help guide resource allocation for state cancer prevention and control efforts. + +While there are open areas for improvement in the methodology of hierarchical Bayes spatial modelling, it is a valuable tool for geo-spatial assessment of disease patterns that can help identify differences among communities. This may in turn indicate patterns of health care access, screening and diagnostic follow up and possibly indicate etiologic clues about causal relationships. + +# Methods + +## Data + +Observed and expected values of prostate cancer incidence used to calculate SIRs for 1412 New York State ZIP codes \[[@B1]\] were obtained for the years 1994 to 1998 from the New York State Cancer Registry (NYSCR). The expected values used in the SIR denominator are based on indirect standardization using the age-by-race distribution in each ZIP code and the statewide age- and race-specific incidence rates as a reference. Age and race distributions correspond to the year 1997, as estimated by the Claritas Corporation™ based on prior census values. ZIP code boundaries were delineated by the GDT Corporation™ in 1999. + +ZIP Code delivery areas are prone to change over time \[[@B32]\], particularly in rapidly growing parts of the country. According to the NYSCR \[personal communication\], a review of all of the issues of the Postal Bulletin, where these changes are documented, from 1990 to the present revealed that New York has had stable ZIP Code delivery areas. Approximately 50 small, rural post offices were closed, 3 new post offices were added, and none were realigned. ZIP codes were combined in instances where service delivery area changed between 1990 and 1999 or for confidentiality reasons where necessary \[[@B33]\]. + +## Modelling + +Letting the geographic domain (New York State) be subdivided into *i* = 1, \..., *n* distinct mapping units (*n* = 1412 ZIP codes for our application), the number of cases within each unit, *Y*~*i*~, conditional on location *i*, is defined as a Poisson random variable with expectation *E*~*i*~*θ*~*i*~, where *E*~*i*~ equals the age- and race-adjusted expected number of prostate cancer cases, and *θ*~*i*~ equals the area-specific relative risk. Given an observed response *y*~*i*~, note that the maximum likelihood estimate of relative risk is ![](1476-072X-3-29-i1.gif) = *y*~*i*~ / *E*~*i*~, the standardized incidence ratio (SIR). + +The relative risk parameter *θ*~*i*~ is assigned a log-normal prior distribution, log(*θ*~*i*~) \~ N(*μ*~*i*~, ![](1476-072X-3-29-i2.gif)), where the expectation and variance are defined by a linear function of a common value (intercept), *α*, and two independent random effects, a heterogeneous component, *u*~*i*~, that does not depend on geographic location (exchangeable) and an autocorrelated component, *v*~*i*~, that reflects local spatial structure by incorporating the influence of neighboring geographic units. Prior distributions are then assigned to these linear terms and consequent hyperprior distributions are assigned to the variance terms, thus creating a 4-level hierarchical model as follows. + +Level 1, define the likelihood: *Y*~*i*~ \~ Poisson(*E*~*i*~*θ*~*i*~)     (1) + +Level 2, link to a linear function: log(*θ*~*i*~) = *α* + *u*~*i*~ + *v*~*i*~     (2) + +Level 3, assign prior distributions: *α* \~ N(0,0.0001), noting that 0.0001 is the precision, thus defining a vague prior, + +![](1476-072X-3-29-i3.gif) + +![](1476-072X-3-29-i4.gif) + +for a neighborhood of geographic units *δ*~*i*~ with respect to unit *i* and *w*~*ij*~ is a weight defining the relationship between geographic unit *i* and its neighbor *j*. The weight is defined simply as *w*~*ij*~ = 1 if ZIP codes *i* and *j* are adjacent (share a common border) and *w*~*ij*~ = 0 otherwise. + +Level 4, assign hyperprior distributions to precision terms: + +*τ*~*u*~ = 1 / ![](1476-072X-3-29-i5.gif) \~ Gamma(a,b) and *τ*~*v*~ = 1 / ![](1476-072X-3-29-i6.gif) \~ Gamma(c,d)     (5) + +for shape parameters a and c, and inverse scale parameters b and d. + +This is the convolution model originally proposed by Besag, York and Mollie \[[@B18]\], where the random effect associated with spatial autocorrelation, *v*~*i*~, is defined according to the conditional auto-regressive model (CAR) \[[@B14]\]. Note that the distribution of *v*~*i*~ is conditional on geographic location, whereby its expectation equals a local neighborhood average. The Bayesian model puts increasing emphasis on this term as the underlying population at location *i* decreases. + +Although covariates can be incorporated into the log-linear expression at the second level of the model, our interest is with estimating and mapping the relative risk, *θ*~*i*~ = exp(*α* + *u*~*i*~ + *v*~*i*~). + +## Choosing Gamma Hyperpriors + +While it is established that a vague prior is acceptable for the linear term *α* in Equation (2) (i.e. Ghosh et al \[[@B20]\]), the model should be evaluated for sensitivity to choice of the Gamma hyperprior distributions of the precision terms, as in Equation (5). Two very different hyperprior specifications that appear in the literature for this convolution model were experimented with. Hyperparameters were specified for one model as Γ(1,1), which yields a probability of 99% that the precision lays between 0.01 and 4.6, and for the other model as Γ(0.5,0.0005), which yields a probability of 99% that the precision lies between 0.16 and 6635, with most of the probability concentrated towards 0. Note that these parameter choices also satisfy sufficient conditions for ensuring a proper joint posterior distribution of all the stochastic nodes \[[@B20]\]. For the statewide collection of New York ZIP code log(SIR)s to be smoothed in this paper, the sample precision equals 6.25 (variance = 0.16) and the precision of first order neighborhood means equals 20 (variance = 0.05). Therefore, it may be desired to retain the model with hyperprior specification of Γ(0.5,0.0005) to at least capture the sample-based precision estimates, while also defining a vague hyperprior that allows more learning from the data. + +Final smoothed results from each model are compared in Figure [6](#F6){ref-type="fig"} where we see, in agreement with Bernardinelli et al \[[@B34]\], that it essentially makes no difference which hyperprior is used. Therefore, the Γ(0.5,0.0005) hyperprior specification was chosen. It is not the intention of this paper to perform a rigorous sensitivity analysis with respect to hyperprior specification; however, the two models assessed in Figure [6](#F6){ref-type="fig"} represent very different distributions and therefore indicate that the fully Bayes hierarchical model is quite robust with respect to hyperprior specification when smoothed relative risks are the objective. + +
+

+
Comparison of smoothed standardized incidence ratios for two different specifications of the hyperprior distributions. Each point represents a ZIP code with two medians of the posterior distribution of standardized incidence ratios obtained from different models.
+
+ +## Running the Gibbs Sampler + +WINBUGS 1.4 \[[@B35]\] was used for running three independent Markov Chains. Initial values of all stochastic nodes of the model were chosen to provide dispersed initial values without being excessively overdispersed. For the common intercept, *α*, and heterogeneous random effect, *u*~*i*~, zero (0) was used to initiate one chain, plus/minus four standard deviations of the statewide log(SIR) were used to initiate the other respective chains. Zero is the statewide average log(SIR) that provides a point estimate for *α*, plus it is the expected value of *u*~*i*~. For the random effect associated with local spatial clustering, *v*~*i*~, the initial values were based on the average plus/minus four standard deviations of the log of first order neighborhood average SIRs. For the precision terms *τ*~*u*~ and *τ*~*v*~, the inverse of the sample variances of the log(SIR) and the log of the first order neighborhood average SIRs were used respectively for one chain, plus lower and higher values were chosen for the other two chains in order to be well dispersed from the middle value, but not wildly so with respect to what is reasonably expected based on the observed spatial variability. + +Convergence of relative risk for the three independent chains was confirmed by graphing their traces and observing random mixing of all chains, which revealed white noise variation around a common value, with no trend. This was supported by observing Brooks-Gelman-Rubin diagnostics that clearly satisfied convergence criteria \[[@B36]\]. After a burn-in of 10,000 iterations, which was far more than actually necessary, the following 1000 iterations were sampled from each of the three chains by choosing every third iteration to help avoid possible autocorrelation within a chain. This large sample approximation of the stationary posterior distribution for each ZIP code relative risk was then summarized in WINBUGS and brought into a Geographic Information System \[[@B37]\] for mapping. + +## Model Selection + +Variations of the model defined above were compared by evaluating the mean deviance of 1000 iterations chosen from the three independent Markov Chains after burn-in. This was done by obtaining the mean of -2(log likelihood) for each iteration, as provided by the *deviance* node in WINBUGS. The mean deviance was then calculated as *D*(**y**, *μ*) = 2 \[*l*(**y**, **y**) - *l*(**y**, *μ*)\], where *l*(**y**, **y**) is the mean maximum achievable log likelihood, obtained for a saturated model where a parameter is assigned to each datum, and *l*(**y**, *μ*) is the mean log likelihood obtained for the model in question. This takes the conventional assessment of deviance for generalized linear models \[[@B12]\] and applies it to the many outcomes of Monte Carlo simulation, as per Spiegelhalter et al \[[@B38]\]. + +Incorporating a random effect associated with local spatial structure (CAR term) provides much stronger prior information than the exchangeable random effect alone (Table [1](#T1){ref-type="table"}), which assumes purely heterogeneous variation across the state. This agrees with findings by Spiegelhalter et al \[[@B38]\], who developed the Deviance Information Criterion as a penalized version of deviance and applied it to Scottish lip cancer data. The convolution model was therefore chosen, which incorporates both random effects. + +::: {#T1 .table-wrap} +::: caption +Deviance analysis. See text for explanation. +::: + + *Model* *Mean (-2 LL)* *Mean Deviance* + ------------------- ---------------- ----------------- + Saturated 8082.0 + Convolution 8194.0 112.0 + CAR only 8191.0 109.0 + Exchangeable only 10800.0 2718.0 +::: + +### Acknowledgements + +The author sincerely appreciates the encouragement and dialogue with colleagues throughout this project; namely, from the New York State Department of Health, Colleen McLaughlin and Maria Schymura of the New York State Cancer Registry, and Edward Fitzgerald, Syni-An Hwang, Thomas Talbot and Francis Boscoe of the Bureau of Environmental and Occupational Epidemiology, along with Thomas Richards of the US Centers for Disease Control. + +Contract/grant sponsor: This publication was made possible through a Cooperative Agreement between the Centers for Disease Control and Prevention (CDC) and the Association of Schools of Public Health (ASPH), award number S1355-20/22. Its contents are the responsibility of the author and do not necessarily reflect the official views of the CDC or ASPH. diff --git a/pubmedcentral/example/PMC553995.md b/pubmedcentral/example/PMC553995.md new file mode 100644 index 0000000..523b1eb --- /dev/null +++ b/pubmedcentral/example/PMC553995.md @@ -0,0 +1,124 @@ +# Background + +Delight over recent survival gains for the very premature infant has been tempered by the frequent presence of cerebral injury and developmental impairment. One quarter of those born before 26 weeks postmenstrual age (at least 11 weeks premature) show evidence of severe cerebral injury including cognitive dysfunction by 30 months of age \[[@B1]\]. Preterm children without any disability remain at risk of a range of motor, cognitive, behavioural and psychological deficits during childhood even if not born so close to the margin of viability \[[@B2]\]. To date, the pathophysiological processes leading to such impairment remain largely occult. In particular, cerebral imaging has failed to identify structural correlates of impaired higher function \[[@B3]\] although imaging can predict many cases of motor abnormality (such as cerebral palsy) due to the presence of periventricular white matter injury \[[@B4]\]. + +Three factors seem to play important roles in the aetiology of preterm cerebral injury. Firstly, exposure to inflammatory stimuli is associated with white matter injury and cerebral palsy in the preterm \[[@B5]\]. Secondly, reduced glucose and oxygen delivery to the developing brain (hypoxia-ischaemia: local cerebral or systemic) may cause excito-toxic neurotransmitter release followed by neuronal death \[[@B6]\]. Thirdly, free-radicals may damage the oligodendrocytes of white matter of the preterm brain \[[@B6]\]. Damage to the primitive white matter prevents the normal formation of grey matter connections which may influence cognitive development in childhood \[[@B7]\]. + +Candidate systems that might influence motor or cognitive outcome after premature birth are likely to be those which affect these responses. The human renin-angiotensin systems may be such a system. Angiotensin converting enzyme (ACE), a key component of the circulating (or endocrine) renin-angiotensin system (RAS), cleaves angiotensin I to yield the potent vasoconstrictor angiotensin II. In addition, ACE degrades vasodilator kinins. In these ways, endocrine RAS plays an important role in circulatory homeostasis. However, local RAS also exist in diverse human tissues including lung, myocardium, vasculature, lymphocyte and brain tissue. These are powerful regulators of mitochondrial respiration and whole-cell metabolism \[[@B8]\] and exert profound effects on whole-human metabolism and metabolic efficiency: elevated ACE may impair cellular aerobic metabolism \[[@B9]\]. RAS also plays a key role in the regulation of tissue inflammatory responses; ACE, through generation of angiotensin II, stimulates the synthesis of pro-inflammatory cytokines, including IL-6 which itself is thought to exert major neurocytotoxic effects with the genesis of functionally significant lesions in the developing preterm brain \[[@B5]\]. It has also been noted that the inhibition of RAS may reduce the effects of excitotoxic neurotransmitters and free radicals \[[@B10]\]. It is possible therefore that enhanced ACE activity may adversely influence the development of the child born prematurely. + +A common variant of the human ACE gene provides a tool to determine if ACE activity does influence developmental progress after preterm birth. The presence (insertion, or \'I\' allele) rather than the absence (deletion, or \'D\' allele) of a 284-base-pair fragment in the human ACE gene is associated with lower ACE activity in organs including both circulating inflammatory cells \[[@B11]\] and the circulation itself \[[@B12]\]. Given the likely causal association of pro-inflammatory responses, ischaemic-hypoxia, excitotoxic neurotransmitters, and free radical attack with impaired neuro-outcome; and given the potential role of increased RAS activity in amplifying these effects, we might expect the DD genotype (encoding raised ACE activity) to be associated with poorer neuro-developmental progress after pretem birth. Comparable findings have been described with respect to the deterioration of cognitive function in the elderly by some authors \[[@B13]-[@B15]\]. We have tested this hypothesis by studying the association of the ACE I/D polymorphism with measures of neuro-developmental progress at 2 and 5 1/2 years of age in children who had participated in a neuro-developmental outcome study (The Avon Premature Infant Project, APIP \[[@B16]\]). All the patients were born at less than 33 weeks postmenstrual age (normal gestation is 37--40 weeks). + +# Methods + +## Patients + +The study was approved by the ethical committees of Southmead Hospital and United Bristol Health Care Trust. Parental consent was obtained for participation in neurodevelopmental follow-up \[[@B16]\] (see below). Consent was not required for the genetic component of this study as all personal information was held separately from the genetic information and patients were identified only by study codes. + +All children were born at 32 weeks gestation or less, between December 1990 and July 1993 at Southmead Hospital or St. Michael\'s Hospital, Bristol. All had participated in the Avon Premature Infant Project (APIP) \[[@B16]\]. Briefly, this was a randomised controlled trial in which developmental support (Portage) or supportive counselling (parental adviser), each started at discharge and continued for up to 2 years, were found to confer some measurable (3--4 DQ points (below)) but clinically insignificant benefit to development at 2 years of age, when given in addition to appropriate primary care and community support, after adjusting for social variables. + +## Neuro-developmental outcome + +The Griffiths Mental Development Scales, used to assess motor and cognitive performance, was performed at 2 years corrected age \[[@B17]\]. The Griffiths scales comprise five subscales, including personal and social, hearing and speech, locomotor, eye hand co-ordination and performance domains, from which is derived an overall developmental quotient (DQ). A lower Griffiths DQ reflects a poorer neuro-developmental performance, with a difference DQ of five points being clinically apparent. DQ was standardised originally to a mean of 100, with a standard deviation of 15, but secular drifts in population scores have resulted in a higher population mean. Thus for severe disability a score of 70 (-2 standard deviations (sd)) would indicate severe disability. Cognitive developmental progress at 5.5 years of age was assessed using the British Ability Scales \[[@B18]\]. The BAS-II was standardised in the early 1990s and was used to compute general cognitive ability (GCA) together with visuospatial, verbal and non-verbal subscales. The GCA is a developmental quotient, equivalent to an IQ estimate, normalised at 100 (sd +/- 15) in which a lower score again indicates poorer conceptual ability. The Movement ABC scales were used to assess manual dexterity, ball skills, and balance over ten tests at 5 1/2 years of age. Scores of each component are summed to produce a composite score ranging from 0--40, with high scores indicating a more impaired motor skills and 0 indicating normal skills. + +A psychologist performed the Griffiths Scales of Mental Development and a second psychologist performed the British Ability Scales (second edition) (BAS). The ABC Movement tests were performed by a trained research nurse. All assessments were blind to the child\'s neonatal course and subsequent progress. + +## ACE genotyping + +DNA was extracted from the Guthrie card blood spots (newborn metabolic screening cards). ACE genotype was determined using 3-primer PCR amplification \[[@B9]\]. Primer ratios corresponded to 50 pmol of an I-specific oligonucleotide in a 20-υl reaction volume. The PCR was performed using Taq polymeraase yielding amplification products of 84 bp for the D allele, and 65 bp for the I allele. Amplification products were visualised using a 7.5% polyacrylamide gel stained with ethidium bromide. Genotyping was performed by staff blind to all clinical data. + +## Study Size + +An estimate of sample size suggested that 144 patients would be needed for this study. The assumptions made for this calculation were that DD genotype infants had a mean DQ of 92.5 (1/2 SD below the norm) compared to a mean DQ of 100 in the ID+II group, assumed typical genotype distributions, and a significance of 0.05 with 80% power. + +## Statistical analysis + +Data were stored in SPPS v9.0 for Windows. Lymphocyte \[[@B11]\] and tissue ACE \[[@B12]\] activity is primarily raised in DD genotype when compared to either ID or II genotype, and so data for those of DD genotype were compared to those from I-allele carriers. Categorical data were analysed by Chi square and continuous data by Student\'s T Test if normally distributed or Mann-Whitney U test as appropriate. + +# Results + +Guthrie cards were located for 230 of 308 children. After exclusion of non-Caucasians and, at random, 1 child of any identical twin pairs (based on genotypes and gender) 176 babies with ACE genotype formed the study population (median birthweight 1475 g, range 645--2480 g; gestation 30 weeks, range 22--32) with follow-up data at 2 years. 122 of these also had follow-up at 5 1/2 years. The ACE genotype distribution was 49 \[27.8%\] DD, 73 \[41.5%\] ID, 54 \[30.7%\] II, demonstrated Hardy-Weinberg equilibrium, and was similar to that observed in the newborn term population from the same region of the UK (203 \[24.1%\] DD, 433 \[51.5%\] ID, 205 \[24.4%) II). Baseline characteristics were independent of genotype, except that fewer individuals of DD genotype were from twin births (*p* = 0.047) (table [1](#T1){ref-type="table"}). There was no association between markers of neonatal cerebral injury: severe intraventricular haemorrrhage or white matter injury (table [1](#T1){ref-type="table"}). There was no association with the presence of any disability at 2 years of age (DD 17% vs ID/II 15%, *p* = 0.65). + +::: {#T1 .table-wrap} +::: caption +Perinatal and social factors +::: + + DD Genotype (n = 49) ID/II Genotype (n = 127) + --------------------------------------- ---------------------- -------------------------- + + No maternal antenatal corticosteroids 44 (80%) 112 (81%) + No. of children from twin pregnancy\* 4 (8%) 27 (21%) + Male 32 (65%) 76 (60%) + Gestation, weeks (± SEM) 29.7 (± 0.3) 30.0 (± 0.2) + Birth weight, g (± SEM) 1453 (± 56) 1461 (± 34) + Portage, parent adviser 17 (31%), 19 (35%) 46 (33%), 42 (30%) + Severe intraventricular haemorrhage 5 (11%) 7 (6%) + White matter injury 7 (14%) 14 (11%) + Maternal age (± SEM) 27.2 (± 0.8) 27.4 (± 0.8) + Manual occupation 28 (57%) 76 (60%) + Maternal car use 30 (61%) 75 (60%) + Mother educated beyond 16 yrs. 17 (35%) 48 (38%) + +\*p = 0.047 (Fisher\'s Exact Probability Test) + +Continuous data is shown as mean (± standard error of mean). +::: + +Measures of developmental cognitive and motor outcome were entirely independent of genotype (table [2](#T2){ref-type="table"}). The findings were unchanged after post hoc subgroup analysis of singletons, infants with normal cranial scans, amongst children without disability and after adjusting for potential influential variables (including twin birth) using multiple regression (data not shown). + +::: {#T2 .table-wrap} +::: caption +ACE genotype and developmental performance at 2 and 5 1/2 years of age. Data shown is mean (± SEM). +::: + + Developmental tests DQ for DD DQ for ID/II *p* + -------------------------------------------------------- ------------- -------------- ------ + + Griffith DQ at 2 years 96.2 (3.1) 96.3 (1.3) 0.95 + +  Locomotor subscale 92.7 (2.7) 92.4 (1.3) 0.92 +  Personal & social subscale 101.9 (3.0) 101.0 (1.6) 0.80 +  Hearing and speech subscale 92.9 (4.1) 94.0 (2.1) 0.80 +  Eye hand co-ordination subscale 90.8 (3.1) 92.8 (1.2) 0.46 +  Performance subscale 102.2 (4.3) 101.3 (1.6) 0.79 + + Griffith DQ at 2 years (adjusted for social variables) 100.0 (0.9) 99.3 (0.6) 0.43 + + ABC Movement summative score 8.1 (1.8) 8.0 (0.9) 0.97 + + GCA at 5 1/2 years 99.2 (3.4) 100.2 (2.0) 0.80 + +  Verbal ability subscale 98.0 (4.0) 103.2 (1.7) 0.22 +  Pictoral ability subscale 99.9 (3.3) 98.7 (1.7) 0.99 +  Spatial ability subscale 98.4 (3.3) 97.3 (1.9) 0.67 +::: + +# Discussion + +After a search of Embase and Medline we believe that this study is the first to attempt to dissect out the contribution of genetic variation in the ACE gene to developmental progress after pre-term delivery. Despite much physiological and biochemical evidence to support our hypothesis, we found that ACE DD genotype was not associated with adverse long term developmental outcome in infants of \< 33 weeks gestation in this study. + +These data are perhaps at variance with previous studies of Alzheimer\'s disease, age-associated memory impairment and vascular dementia, all of which have implicated the ACE D allele in having a role in mental decline \[[@B13]-[@B15]\]. However this is not a universal finding. Furthermore although ACE inhibitors appear to reduce inflammatory responses, ischaemic effects, and excitotoxic and free radical induced injury \[[@B10]\], angiotensin II does not (indeed angiotensin II may actually enhance ischaemic and excitotoxic neural injury via the AT2 receptor). In addition, both captopril and losartan (RAS inhibitors) appear to improve cognitive performance in mice \[[@B19]\] and humans \[[@B20]\]. It should be noted however that little is known about the ontogeny of the RAS in the human foetus. Certainly RAS (and angiotensin II receptors in particular) play a role in blood-brain barrier and central nervous system development in mice, and alterations in RAS receptor expression over foetal and neonatal life are recognised. It is thus possible that developmentally regulated patterns of AT1 receptor expression might offer some level of protection against the potentially detrimental effects of ACE-mediated angiotensin II synthesis. + +Although there may be similar molecular pathways that effect cerebral injury in the preterm infant and the elderly, ontological differences in the expression of genes involved in predisposition to neural injury are well described. In particular reactive production of nitric oxide may be enhanced in the elderly and the ability to protect the brain from oxidants may be reduced in the elderly (22). Thus the effect of any one polymorphism, with a relatively minor effect, may be swamped in the newborn infant by other protective mechanisms. + +The lack of any association between ACE genotype and scores of developmental progress was also surprising because we have demonstrated an association between DD genotype and markers of poor cardio-respiratory instability in the perinatal period in this patient group \[[@B21]\]. This association (between genotype and worse early cardio-respiratory status) could predispose to death, which would in turn weaken any association (if it exists) between DD genotype and worse developmental quotients. It is of course possible that our sample size was insufficient to demonstrate any association with ACE genotype and developmental progress. However, similar-sized studies have been sufficient to demonstrate an association between ACE D allele and cognitive decline in the elderly \[[@B13]-[@B15]\], and power calculations suggested we had enough patients to demonstrate at least a trend. If an undetected genotype-association does exist such an effect is weak. + +# Conclusion + +We cannot support an association of ACE genotype with cognitive or motor development in survivors born preterm or, thus, the use of RAS inhibition as a neuroprotective agent in the preterm. Given the current lack of understanding of the mechanisms leading to cerebral injury and subsequent impairment -- particularly of higher function -- in such patients, further genetic association studies of other candidate genes are warranted. + +# List of abbreviations used + +ACE, angiotensin-1 converting enzyme; DQ developmental quotient, BAS, British ability scales (second edition); GCA, general cognitive ability; RAS, renin angiotensin system; PCR, polymerase chain rection. + +# Competing interests + +The author(s) declare that they have no competing interests. + +# Authors\' contributions + +DH, HM, AW, NM conceived the study and its design and wrote the manuscript. DH, DD and SD performed data collection, DNA extraction and PCR and participated in analysis of the data with SH and HM. NM reviewed all cranial imaging. All authors participated in the writing of the manuscript and approved the final manuscript. + +### Acknowledgements + +The developmental assessments were performed by Dr Margaret Robinson (Griffiths Assessments), Pat Anderson (BAS-II) and Wendy Ring (Movement ABC). This research was supported by awards from The Southmead Hospital Research Foundation to AW and DH. The British Heart Foundation (grant numbers RG200015, SP98003, FS01XXX) SHE, HM, and SD. The original APIP study was supported by Action Research (Grant to NM). diff --git a/pubmedcentral/get-filelist.sh b/pubmedcentral/get-filelist.sh new file mode 100644 index 0000000..da434be --- /dev/null +++ b/pubmedcentral/get-filelist.sh @@ -0,0 +1,14 @@ +#!/usr/bash/env sh + +NONPERMISSIVE_LICENSES="NO-CC CODE\|CC BY-NC\|CC BY-ND\|CC BY-NC-ND\|CC BY-NC-NA\|CC BY-NC-SA" + +# download full filelist +# -nc: no clobber, do not overwrite existing files +wget -nc https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt -P data/ + +PERMISSIVE_FILELIST_DEST="data/permissive_filelist.txt" +# -v invert match (find lines without these patterns), -w match whole word +grep -vw "${NONPERMISSIVE_LICENSES}" data/oa_file_list.txt > "${PERMISSIVE_FILELIST_DEST}" + +# don't count the header line as a file +echo "Found $(($(wc -l < ${PERMISSIVE_FILELIST_DEST})-1)) permissive licenses in data/oa_file_list.txt" diff --git a/pubmedcentral/run.sh b/pubmedcentral/run.sh new file mode 100644 index 0000000..b33830e --- /dev/null +++ b/pubmedcentral/run.sh @@ -0,0 +1,28 @@ +#!/usr/bash/env sh + +TOTAL_DOCS="${1:-0}" + +# downloads the list of articles and keeps only permissively licensed articles +bash get-filelist.sh + +# if total_docs is not 0, then only write TOTAL_DOCS lines into data/permissive_filelist.txt +if [ "${TOTAL_DOCS}" -ne 0 ]; then + # -c -1 is to remove the trailing newline + head -n "$((${TOTAL_DOCS}+1))" data/permissive_filelist.txt > data/permissive_filelist.txt.tmp + + mv data/permissive_filelist.txt.tmp data/permissive_filelist.txt + echo "Reduced data/permissive_filelist.txt to ${TOTAL_DOCS} documents" +fi + +# if the last line is empty, remove it +if [[ $(tail -c1 data/permissive_filelist.txt | wc -l) -gt 0 ]]; then + truncate -s -1 data/permissive_filelist.txt +fi + +echo "Downloading and converting to markdown..." +bash download-convert-to-md.sh +echo "Finished downloading and converting to markdown" + +echo "Converting to dolma format..." +python to-dolma.py +echo "Finished converting to dolma format" diff --git a/pubmedcentral/to-dolma.py b/pubmedcentral/to-dolma.py new file mode 100644 index 0000000..9c1b17c --- /dev/null +++ b/pubmedcentral/to-dolma.py @@ -0,0 +1,92 @@ +import argparse +import datetime +import functools +import json +import os + +from licensed_pile.licenses import PermissiveLicenses +from licensed_pile.write import to_dolma + +parser = argparse.ArgumentParser(description="Collect PubMedCentral into Dolma format.") +parser.add_argument( + "--filelist", + default="data/permissive_filelist.txt", + help="The path to the filelist.txt file.", +) +parser.add_argument( + "--data_dir", default="data/md/", help="Path to the directory of markdown files." +) +parser.add_argument( + "--metadata_dir", default="data/metadata/", help="Where the metadata files go." +) +parser.add_argument( + "--output_dir", + default="data/pubmedcentral/", + help="Where the dolma formatted data goes.", +) +parser.add_argument( + "--filename", + default="pubmedcentral.jsonl.gz", + help="The base filename for our books.", +) +parser.add_argument( + "--shard_size", type=int, default=1, help="Size, in GB, for each shard." +) + + +LICENSE_MAP = { + "CC0": PermissiveLicenses.CC0, + "CC BY": PermissiveLicenses.CC_BY, + "CC BY-SA": PermissiveLicenses.CC_BY_SA, +} +SOURCE_NAME = "PubMed Central" + + +def format_dolma( + file: str, + data_dir: str, + source_name: str = SOURCE_NAME, + base_url: str = "https://www.ncbi.nlm.nih.gov/pmc/articles", +): + file, journal, accessionID, _, lic = file.split("\t") + file = os.path.basename(file).replace("tar.gz", "md") + with open(os.path.join(data_dir, file)) as f: + text = f.read() + + with open( + os.path.join(args.metadata_dir, f"{os.path.splitext(file)[0]}.json"), + encoding="utf-8", + ) as f: + metadata = json.load(f) + + return { + "id": accessionID, + "text": text, + "source": source_name, + "added": datetime.datetime.utcnow().isoformat(), + "created": metadata["created"], + "metadata": { + "license": str(LICENSE_MAP[lic]), + "url": f"{base_url}/{accessionID}/", + "journal": journal, + "authors": metadata["authors"], + }, + } + + +def main(args): + os.makedirs(args.output_dir, exist_ok=True) + + with open(args.filelist) as f: + files = f.read().split("\n") + + # Remove the header + files = files[1:] + + files = map(functools.partial(format_dolma, data_dir=args.data_dir), files) + to_dolma(files, args.output_dir, args.filename, args.shard_size) + + +if __name__ == "__main__": + args = parser.parse_args() + main(args)