CAST-genomics · aryarm · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/docs/commands/simgenotype.rst b/docs/commands/simgenotype.rst
@@ -75,6 +75,10 @@ If speed is important, it's generally faster to use PGEN files than VCFs.
   --pop_field \
   --out tests/data/example_simgenotype.pgen
 
+.. warning::
+  Writing PGEN files will require more memory than writing VCFs. The memory will depend on the number of simulated samples and variants.
+  You can reduce the memory required for this step by writing the variants in chunks. Just specify a ``--chunk-size`` value.
+
 All files used in these examples are described :doc:`here </project_info/example_files>`.
 
 

diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst
@@ -38,6 +38,12 @@ To convert a VCF containing tandem repeats to PGEN, use this command, instead.
 
 	plink2 --vcf-half-call m --make-pgen 'pvar-cols=vcfheader,qual,filter,info' --vcf input.vcf --make-pgen --out output
 
+If you are seeing cryptic errors with haptools and your PGEN file, please validate it first:
+
+.. code-block:: bash
+
+	plink2 --pfile output --validate
+
 Tandem repeats
 ~~~~~~~~~~~~~~
 VCFs containing tandem repeats usually have a *type* indicating the tool from which they originated. We support whichever types are supported by `TRTools <https://trtools.readthedocs.io/en/stable/CALLERS.html>`_.

diff --git a/haptools/__main__.py b/haptools/__main__.py
@@ -202,6 +202,17 @@ def karyogram(bp, sample, out, title, centromeres, colors, verbosity):
         " continue to simulate a vcf file."
     ),
 )
+@click.option(
+    "-c",
+    "--chunk-size",
+    type=int,
+    default=None,
+    show_default="all variants",
+    help=(
+        "If requesting a PGEN output file, write genotypes in chunks of X variants; "
+        "reduces memory"
+    ),
+)
 @click.option(
     "-v",
     "--verbosity",
@@ -224,6 +235,7 @@ def simgenotype(
     sample_field,
     no_replacement,
     only_breakpoint,
+    chunk_size,
     verbosity,
 ):
     """
@@ -307,6 +319,7 @@ def simgenotype(
             no_replacement,
             out,
             log,
+            chunk_size,
         )
     end = time.time()
 

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
@@ -1590,6 +1590,7 @@ def write(self):
                 end = start + chunks
                 if end > len(self.variants):
                     end = len(self.variants)
+                self.log.debug(f"Writing variant #{start} through variant #{end}")
                 size = end - start
                 try:
                     missing = np.ascontiguousarray(
@@ -1599,31 +1600,37 @@ def write(self):
                     # https://stackoverflow.com/a/46575580
                     allele_cts = self._num_unique_alleles(data[start:end])
                     subset_data = np.ascontiguousarray(data[start:end], dtype=np.int32)
-                except np.core._exceptions._ArrayMemoryError as e:
+                    subset_data.resize((size, len(self.samples) * 2))
+                    missing.resize((size, len(self.samples) * 2))
+                except (np.core._exceptions._ArrayMemoryError, MemoryError) as e:
                     raise ValueError(
                         "You don't have enough memory to write these genotypes! Try"
                         " specifying a value to the chunk_size parameter, instead"
                     ) from e
-                subset_data.resize((len(self.variants), len(self.samples) * 2))
-                missing.resize((len(self.variants), len(self.samples) * 2))
                 # convert any missing genotypes to -9
                 subset_data[missing] = -9
-                # finally, append the genotypes to the PGEN file
-                if self._prephased or self.data.shape[2] < 3:
-                    pgen.append_alleles_batch(
-                        subset_data,
-                        all_phased=True,
-                        allele_cts=allele_cts,
-                    )
-                else:
-                    # TODO: figure out why this sometimes leads to a corrupted file?
-                    subset_phase = self.data[:, start:end, 2].T.copy(order="C")
-                    pgen.append_partially_phased_batch(
-                        subset_data,
-                        subset_phase,
-                        allele_cts=allele_cts,
-                    )
-                    del subset_phase
+                try:
+                    # finally, append the genotypes to the PGEN file
+                    if self._prephased or self.data.shape[2] < 3:
+                        pgen.append_alleles_batch(
+                            subset_data,
+                            all_phased=True,
+                            allele_cts=allele_cts,
+                        )
+                    else:
+                        # TODO: why does this sometimes leads to a corrupted file?
+                        subset_phase = self.data[:, start:end, 2].T.copy(order="C")
+                        pgen.append_partially_phased_batch(
+                            subset_data,
+                            subset_phase,
+                            allele_cts=allele_cts,
+                        )
+                        del subset_phase
+                except RuntimeError as e:
+                    if not np.all(allele_cts <= max_allele_ct):
+                        raise ValueError("Variant(s) have more alleles than expected")
+                    else:
+                        raise e
             del subset_data
             del missing
             gc.collect()

diff --git a/haptools/sim_genotype.py b/haptools/sim_genotype.py
@@ -1,12 +1,11 @@
+# fmt: off
 from __future__ import annotations
 
 import os
 import re
-import sys
 import glob
 import numpy as np
 from cyvcf2 import VCF
-from pysam import VariantFile
 from collections import defaultdict
 from .admix_storage import GeneticMarker, HaplotypeSegment
 from .data import GenotypesVCF, GenotypesPLINK
@@ -23,8 +22,9 @@ def output_vcf(
         pop_field, 
         sample_field, 
         no_replacement,
-        out, 
-        log
+        out,
+        log,
+        chunk_size = None,
     ):
     """
     Takes in simulated breakpoints and uses reference files, vcf and sampleinfo, 
@@ -70,6 +70,8 @@ def output_vcf(
         output prefix
     log: log object
         Outputs messages to the appropriate channel.
+    chunk_size: int, optional
+        The max number of variants to write to a PGEN file together
     """
 
     log.info(f"Outputting file {out}")
@@ -110,6 +112,7 @@ def output_vcf(
         vcf.read()
     else:
         vcf.read(region=f"{region['chr']}:{region['start']}-{region['end']}")
+    vcf.check_missing()
 
     log.debug(f"Read in variants from {variant_file}")
 
@@ -126,8 +129,11 @@ def output_vcf(
     # Use search_sorted() numpy function to find indices in populations/samples in order to determine population and sample info
     ref_vars = vcf.variants
     output_gts = np.empty((int(len(breakpoints)//2), len(vcf.variants), 2), dtype=vcf.data.dtype)
-    output_pops = np.empty((int(len(breakpoints)//2), len(vcf.variants), 2), dtype=np.uint8)
-    output_labels = np.empty((int(len(breakpoints)//2), len(vcf.variants), 2), dtype=object)
+    if not out.endswith(".pgen"):
+        if pop_field:
+            output_pops = np.empty((int(len(breakpoints)//2), len(vcf.variants), 2), dtype=np.uint8)
+        if sample_field:
+            output_labels = np.empty((int(len(breakpoints)//2), len(vcf.variants), 2), dtype=object)
 
     # cover "chr" prefix cases
     if ref_vars["chrom"][0].startswith("chr"):
@@ -187,10 +193,11 @@ def output_vcf(
             cur_var = end_var
 
         output_gts[int(hap_ind//2), : , hap_ind % 2] = ref_gts
-        if pop_field:
-            output_pops[int(hap_ind//2), : , hap_ind % 2] = ref_pops
-        if sample_field:
-            output_labels[int(hap_ind//2), : , hap_ind % 2] = ref_labels
+        if not out.endswith(".pgen"):
+            if pop_field:
+                output_pops[int(hap_ind//2), : , hap_ind % 2] = ref_pops
+            if sample_field:
+                output_labels[int(hap_ind//2), : , hap_ind % 2] = ref_labels
 
     # output vcf header to new vcf file we create
     output_samples = [f"Sample_{hap+1}" for hap in range(int(len(breakpoints)/2))]
@@ -215,7 +222,7 @@ def output_vcf(
             gts = GenotypesVCF(out, log=log)
 
     else:
-        gts = GenotypesPLINK(out, log=log)
+        gts = GenotypesPLINK(out, chunk_size=chunk_size, log=log)
 
     gts.samples = output_samples
     gts.variants = vcf.variants
@@ -305,6 +312,76 @@ def _find_coord(cur_hap, chrom, start_coord, end_coord):
     cur_hap.append((chrom, start_coord, end_coord))
     return False
 
+def _prepare_coords(coords_dir, chroms, region):
+    # coord file structure chr variant cMcoord bpcoord
+    # NOTE coord files in directory should have chr{1-22, X} in the name
+    def numeric_alpha(x):
+        chrom = re.search(r'(?<=chr)(X|\d+)', x).group()
+        if chrom == 'X':
+            return 23
+        else:
+            return int(chrom)
+
+    # sort coordinate files to ensure coords read are in sorted order
+    # remove all chr files not found in chroms list
+    all_coord_files = glob.glob(f'{coords_dir}/*.map')
+    all_coord_files = [coord_file for coord_file in all_coord_files \
+                if re.search(r'(?<=chr)(X|\d+)', coord_file) and \
+                   re.search(r'(?<=chr)(X|\d+)', coord_file).group() in chroms]
+    all_coord_files.sort(key=numeric_alpha)
+
+    if len(all_coord_files) != len(chroms):
+        raise Exception(f"Unable to find all chromosomes {chroms} in map file directory.")
+
+    # coords list has form chroms x coords
+    coords = []
+    for coords_file in all_coord_files:
+        file_coords = []
+        with open(coords_file, 'r') as cfile:
+            prev_coord = None
+            for line in cfile:
+                # create marker from each line and append to coords
+                data = line.strip().split()
+                if len(data) != 4:
+                    raise Exception(f"Map file contains an incorrect amount of fields {len(data)}. It should contain 4.")
+
+                if data[0] == 'X':
+                    chrom = 23
+                else:
+                    chrom = int(data[0])
+                gen_mark = GeneticMarker(chrom, float(data[2]), 
+                                         int(data[3]), prev_coord)
+                prev_coord = gen_mark
+                file_coords.append(gen_mark)
+        coords.append(file_coords)
+
+    # subset the coordinates to be within the region
+    if region:
+        start_ind = -1
+        for ind, marker in enumerate(coords[0]):
+            if marker.get_bp_pos() >= region['start'] and start_ind < 0:
+                start_ind = ind
+
+            if marker.get_bp_pos() >= region['end']:
+                end_ind = ind+1
+                break
+            else:
+                end_ind = len(coords[0])
+        coords = [coords[0][start_ind:end_ind]]
+
+    # Update end coords of each chromosome to have max int as the bp coordinate to
+    #    prevent issues with variants in VCF file beyond the specified coordinate
+    for chrom_coord in coords:
+        chrom_coord[-1].bp_map_pos = np.iinfo(np.int32).max
+
+    # store end coords 
+    end_coords = [chrom_coord[-1] for chrom_coord in coords]
+
+    # convert coords to numpy array for easy masking
+    max_coords = max([len(chrom_coord) for chrom_coord in coords])
+    np_coords = np.zeros((len(coords), max_coords)).astype(object)
+    return coords, np_coords, max_coords, end_coords
+
 def simulate_gt(model_file, coords_dir, chroms, region, popsize, log, seed=None):
     """
     Simulate admixed genotypes based on the parameters of model_file.
@@ -366,66 +443,8 @@ def simulate_gt(model_file, coords_dir, chroms, region, popsize, log, seed=None)
     for pop_ind, pop in enumerate(pops):
         pop_dict[pop_ind] = pop
 
-    # coord file structure chr variant cMcoord bpcoord
-    # NOTE coord files in directory should have chr{1-22, X} in the name
-    def numeric_alpha(x):
-        chrom = re.search(r'(?<=chr)(X|\d+)', x).group()
-        if chrom == 'X':
-            return 23
-        else:
-            return int(chrom)
-
-    # sort coordinate files to ensure coords read are in sorted order
-    # remove all chr files not found in chroms list
-    all_coord_files = glob.glob(f'{coords_dir}/*.map')
-    all_coord_files = [coord_file for coord_file in all_coord_files \
-                if re.search(r'(?<=chr)(X|\d+)', coord_file) and \
-                   re.search(r'(?<=chr)(X|\d+)', coord_file).group() in chroms]
-    all_coord_files.sort(key=numeric_alpha)
-
-    if len(all_coord_files) != len(chroms):
-        raise Exception(f"Unable to find all chromosomes {chroms} in map file directory.")
-
-    # coords list has form chroms x coords
-    coords = []
-    for coords_file in all_coord_files:
-        file_coords = []
-        with open(coords_file, 'r') as cfile:
-            prev_coord = None
-            for line in cfile:
-                # create marker from each line and append to coords
-                data = line.strip().split()
-                if len(data) != 4:
-                    raise Exception(f"Map file contains an incorrect amount of fields {len(data)}. It should contain 4.")
-
-                if data[0] == 'X':
-                    chrom = 23
-                else:
-                    chrom = int(data[0])
-                gen_mark = GeneticMarker(chrom, float(data[2]), 
-                                         int(data[3]), prev_coord)
-                prev_coord = gen_mark
-                file_coords.append(gen_mark)
-        coords.append(file_coords)
-
-    # subset the coordinates to be within the region
-    if region:
-        start_ind = -1
-        for ind, marker in enumerate(coords[0]):
-            if marker.get_bp_pos() >= region['start'] and start_ind < 0:
-                start_ind = ind
-
-            if marker.get_bp_pos() >= region['end']:
-                end_ind = ind+1
-                break
-        coords = [coords[0][start_ind:end_ind]]
-
-    # store end coords 
-    end_coords = [chrom_coord[-1] for chrom_coord in coords]
-
-    # convert coords to numpy array for easy masking
-    max_coords = max([len(chrom_coord) for chrom_coord in coords])
-    np_coords = np.zeros((len(coords), max_coords)).astype(object)
+    # Load coordinates to use for simulating
+    coords, np_coords, max_coords, end_coords = _prepare_coords(coords_dir, chroms, region)
 
     # precalculate recombination probabilities (given map pos in cM and we want M)
     #     shape: len(chroms) x max number of coords (max so not uneven)

diff --git a/tests/data/var_greater.bp b/tests/data/var_greater.bp
@@ -0,0 +1,12 @@
+Sample_1_1
+YRI	1	59423086	85.107755
+CEU	1	2147483647	266.495714
+Sample_1_2
+YRI	1	59423086	85.107755
+YRI	1	2147483647	266.495714
+Sample_2_1
+CEU	1	59423086	85.107755
+YRI	1	2147483647	266.495714
+Sample_2_2
+CEU	1	59423086	85.107755
+CEU	1	2147483647	266.495714
diff --git a/tests/data/var_greater.vcf b/tests/data/var_greater.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##filedate=20180225
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG00096	HG00097	HG00099	HG00100	HG00101
+1	10114	chr1:10114:T:C	T	C	.	.	.	GT	1|1	1|1	0|0	0|0	0|0
+1	249403765	chr1:249403765:A:G	A	G	.	.	.	GT	0|0	0|0	1|1	1|1	1|1
diff --git a/tests/data/var_greater.vcf.gz b/tests/data/var_greater.vcf.gz
diff --git a/tests/data/var_greater.vcf.gz.tbi b/tests/data/var_greater.vcf.gz.tbi