Skip to content
This repository has been archived by the owner on Oct 15, 2020. It is now read-only.

Precompute fixed-length string dtypes #20

Merged
merged 3 commits into from
Aug 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions sgkit_plink/pysnptools.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""PLINK 1.9 reader implementation"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, List, Mapping, Optional, Tuple, Union

import dask.array as da
import dask.dataframe as dd
import numpy as np
from dask.array import Array
from dask.dataframe import DataFrame
from pysnptools.snpreader import Bed
from xarray import Dataset
Expand Down Expand Up @@ -98,11 +99,26 @@ def close(self) -> None:
self.bed._close_bed() # pragma: no cover


def _to_dict(df: dd.DataFrame, dtype: Any = None) -> Dict[str, da.Array]:
return {
c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
for c in df
}
def _max_str_len(arr: Array) -> Array:
return arr.map_blocks(lambda s: np.char.str_len(s.astype(str)), dtype=np.int8).max()


def _to_dict(
df: DataFrame, dtype: Optional[Mapping[str, Any]] = None
) -> Mapping[str, Any]:
arrs = {}
for c in df:
a = df[c].to_dask_array(lengths=True)
dt = df[c].dtype
if dtype:
dt = dtype[c]
kind = np.dtype(dt).kind
if kind in ["U", "S"]:
# Compute fixed-length string dtype for array
max_len = _max_str_len(a).compute()
dt = f"{kind}{max_len}"
arrs[c] = a.astype(dt)
return arrs


def read_fam(path: PathType, sep: str = " ") -> DataFrame:
Expand Down
134 changes: 67 additions & 67 deletions sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim
Original file line number Diff line number Diff line change
@@ -1,100 +1,100 @@
1 1:1:A:C 0.0 1 C A
1 1:2:A:C 0.0 2 C A
1 1:3:A:C 0.0 3 C A
1 1:4:A:C 0.0 4 C A
1 1:5:A:C 0.0 5 C A
1 1:6:A:C 0.0 6 C A
1 1:7:A:C 0.0 7 C A
1 1:8:A:C 0.0 8 C A
1 1:9:A:C 0.0 9 C A
1 1:1:G:CGCGCG 0.0 1 CGCGCG G
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you document how you generated this file?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm how about this:

  • I'll add some notes for now saying this was created using software in a separate environment (hail)
  • Create an issue to document this properly and include the associated code that created it
  • Wait to see where the multi-repo conversation goes (https://github.com/pystatgen/sgkit/issues/65#issuecomment-670049733)
  • Add this to the validation folder if we merge repos since it is essentially the same process as the one I used in the REGENIE and HWE PRs

That sound good? I'd rather not bootstrap another validation-like concept with all the associated CI changes if I can help it.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tomwhite does @eric-czech's proposal sound good?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes (sorry forgot to reply) - sounds great!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#29

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add some notes for now saying this was created using software in a separate environment (hail)

467d4bd#diff-33e9afa89862f603b25f9c5abf5ef334R6

1 1:2:ACT:G 0.0 2 G ACT
1 1:3:ACT:G 0.0 3 G ACT
1 1:4:G:CGCGCG 0.0 4 CGCGCG G
1 1:5:G:CGCGCG 0.0 5 CGCGCG G
1 1:6:ACT:G 0.0 6 G ACT
1 1:7:G:CGCGCG 0.0 7 CGCGCG G
1 1:8:T:GTGG 0.0 8 GTGG T
1 1:9:T:GTGG 0.0 9 GTGG T
1 1:10:A:C 0.0 10 C A
1 1:11:A:C 0.0 11 C A
1 1:12:A:C 0.0 12 C A
1 1:13:A:C 0.0 13 C A
1 1:14:A:C 0.0 14 C A
1 1:15:A:C 0.0 15 C A
1 1:11:ACT:G 0.0 11 G ACT
1 1:12:G:CGCGCG 0.0 12 CGCGCG G
1 1:13:G:CGCGCG 0.0 13 CGCGCG G
1 1:14:T:GTGG 0.0 14 GTGG T
1 1:15:ACT:G 0.0 15 G ACT
1 1:16:A:C 0.0 16 C A
1 1:17:A:C 0.0 17 C A
1 1:18:A:C 0.0 18 C A
1 1:17:ACT:G 0.0 17 G ACT
1 1:18:T:GTGG 0.0 18 GTGG T
1 1:19:A:C 0.0 19 C A
1 1:20:A:C 0.0 20 C A
1 1:21:A:C 0.0 21 C A
1 1:22:A:C 0.0 22 C A
1 1:23:A:C 0.0 23 C A
1 1:21:T:GTGG 0.0 21 GTGG T
1 1:22:G:CGCGCG 0.0 22 CGCGCG G
1 1:23:T:GTGG 0.0 23 GTGG T
1 1:24:A:C 0.0 24 C A
1 1:25:A:C 0.0 25 C A
1 1:26:A:C 0.0 26 C A
1 1:27:A:C 0.0 27 C A
1 1:28:A:C 0.0 28 C A
1 1:29:A:C 0.0 29 C A
1 1:26:ACT:G 0.0 26 G ACT
1 1:27:G:CGCGCG 0.0 27 CGCGCG G
1 1:28:ACT:G 0.0 28 G ACT
1 1:29:T:GTGG 0.0 29 GTGG T
1 1:30:A:C 0.0 30 C A
1 1:31:A:C 0.0 31 C A
1 1:32:A:C 0.0 32 C A
1 1:33:A:C 0.0 33 C A
1 1:34:A:C 0.0 34 C A
1 1:31:T:GTGG 0.0 31 GTGG T
1 1:32:G:CGCGCG 0.0 32 CGCGCG G
1 1:33:ACT:G 0.0 33 G ACT
1 1:34:G:CGCGCG 0.0 34 CGCGCG G
1 1:35:A:C 0.0 35 C A
1 1:36:A:C 0.0 36 C A
1 1:37:A:C 0.0 37 C A
1 1:36:G:CGCGCG 0.0 36 CGCGCG G
1 1:37:T:GTGG 0.0 37 GTGG T
1 1:38:A:C 0.0 38 C A
1 1:39:A:C 0.0 39 C A
1 1:40:A:C 0.0 40 C A
1 1:40:T:GTGG 0.0 40 GTGG T
1 1:41:A:C 0.0 41 C A
1 1:42:A:C 0.0 42 C A
1 1:43:A:C 0.0 43 C A
1 1:44:A:C 0.0 44 C A
1 1:45:A:C 0.0 45 C A
1 1:46:A:C 0.0 46 C A
1 1:47:A:C 0.0 47 C A
1 1:42:G:CGCGCG 0.0 42 CGCGCG G
1 1:43:T:GTGG 0.0 43 GTGG T
1 1:44:ACT:G 0.0 44 G ACT
1 1:45:G:CGCGCG 0.0 45 CGCGCG G
1 1:46:ACT:G 0.0 46 G ACT
1 1:47:G:CGCGCG 0.0 47 CGCGCG G
1 1:48:A:C 0.0 48 C A
1 1:49:A:C 0.0 49 C A
1 1:50:A:C 0.0 50 C A
1 1:51:A:C 0.0 51 C A
1 1:51:G:CGCGCG 0.0 51 CGCGCG G
1 1:52:A:C 0.0 52 C A
1 1:53:A:C 0.0 53 C A
1 1:53:ACT:G 0.0 53 G ACT
1 1:54:A:C 0.0 54 C A
1 1:55:A:C 0.0 55 C A
1 1:56:A:C 0.0 56 C A
1 1:57:A:C 0.0 57 C A
1 1:55:G:CGCGCG 0.0 55 CGCGCG G
1 1:56:T:GTGG 0.0 56 GTGG T
1 1:57:G:CGCGCG 0.0 57 CGCGCG G
1 1:58:A:C 0.0 58 C A
1 1:59:A:C 0.0 59 C A
1 1:60:A:C 0.0 60 C A
1 1:61:A:C 0.0 61 C A
1 1:59:T:GTGG 0.0 59 GTGG T
1 1:60:G:CGCGCG 0.0 60 CGCGCG G
1 1:61:ACT:G 0.0 61 G ACT
1 1:62:A:C 0.0 62 C A
1 1:63:A:C 0.0 63 C A
1 1:64:A:C 0.0 64 C A
1 1:65:A:C 0.0 65 C A
1 1:66:A:C 0.0 66 C A
1 1:67:A:C 0.0 67 C A
1 1:68:A:C 0.0 68 C A
1 1:69:A:C 0.0 69 C A
1 1:70:A:C 0.0 70 C A
1 1:71:A:C 0.0 71 C A
1 1:72:A:C 0.0 72 C A
1 1:63:G:CGCGCG 0.0 63 CGCGCG G
1 1:64:T:GTGG 0.0 64 GTGG T
1 1:65:T:GTGG 0.0 65 GTGG T
1 1:66:ACT:G 0.0 66 G ACT
1 1:67:T:GTGG 0.0 67 GTGG T
1 1:68:ACT:G 0.0 68 G ACT
1 1:69:G:CGCGCG 0.0 69 CGCGCG G
1 1:70:G:CGCGCG 0.0 70 CGCGCG G
1 1:71:ACT:G 0.0 71 G ACT
1 1:72:G:CGCGCG 0.0 72 CGCGCG G
1 1:73:A:C 0.0 73 C A
1 1:74:A:C 0.0 74 C A
1 1:75:A:C 0.0 75 C A
1 1:75:T:GTGG 0.0 75 GTGG T
1 1:76:A:C 0.0 76 C A
1 1:77:A:C 0.0 77 C A
1 1:78:A:C 0.0 78 C A
1 1:77:ACT:G 0.0 77 G ACT
1 1:78:ACT:G 0.0 78 G ACT
1 1:79:A:C 0.0 79 C A
1 1:80:A:C 0.0 80 C A
1 1:81:A:C 0.0 81 C A
1 1:82:A:C 0.0 82 C A
1 1:82:T:GTGG 0.0 82 GTGG T
1 1:83:A:C 0.0 83 C A
1 1:84:A:C 0.0 84 C A
1 1:84:ACT:G 0.0 84 G ACT
1 1:85:A:C 0.0 85 C A
1 1:86:A:C 0.0 86 C A
1 1:87:A:C 0.0 87 C A
1 1:86:G:CGCGCG 0.0 86 CGCGCG G
1 1:87:ACT:G 0.0 87 G ACT
1 1:88:A:C 0.0 88 C A
1 1:89:A:C 0.0 89 C A
1 1:90:A:C 0.0 90 C A
1 1:91:A:C 0.0 91 C A
1 1:92:A:C 0.0 92 C A
1 1:90:T:GTGG 0.0 90 GTGG T
1 1:91:T:GTGG 0.0 91 GTGG T
1 1:92:T:GTGG 0.0 92 GTGG T
1 1:93:A:C 0.0 93 C A
1 1:94:A:C 0.0 94 C A
1 1:95:A:C 0.0 95 C A
1 1:96:A:C 0.0 96 C A
1 1:97:A:C 0.0 97 C A
1 1:98:A:C 0.0 98 C A
1 1:99:A:C 0.0 99 C A
1 1:97:T:GTGG 0.0 97 GTGG T
1 1:98:ACT:G 0.0 98 G ACT
1 1:99:T:GTGG 0.0 99 GTGG T
1 1:100:A:C 0.0 100 C A
20 changes: 10 additions & 10 deletions sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
0 0 0 0 0 NA
0 1 0 0 0 NA
0 2 0 0 0 NA
0 3 0 0 0 NA
0 4 0 0 0 NA
0 5 0 0 0 NA
0 6 0 0 0 NA
0 7 0 0 0 NA
0 8 0 0 0 NA
0 9 0 0 0 NA
0 000 0 0 0 NA
0 001 0 0 0 NA
0 002 0 0 0 NA
0 003 0 0 0 NA
0 004 0 0 0 NA
0 005 0 0 0 NA
0 006 0 0 0 NA
0 007 0 0 0 NA
0 008 0 0 0 NA
0 009 0 0 0 NA
16 changes: 16 additions & 0 deletions sgkit_plink/tests/test_pysnptools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
import xarray as xr
from sgkit_plink.pysnptools import read_plink

# This data was generated externally using Hail
# for 10 samples, 100 variants, and genotype calls
# that are missing in ~10% of cases.
# TODO: document and move code to central location
# (cf. https://github.com/pystatgen/sgkit-plink/pull/20#discussion_r466907811)
example_dataset_1 = "plink_sim_10s_100v_10pmiss"


Expand Down Expand Up @@ -32,6 +37,17 @@ def test_raise_on_both_path_types():
read_plink(path="x", bed_path="x")


def test_fixlen_str_variable(ds1):
assert ds1["sample_id"].dtype == np.dtype("<U3")
assert ds1["variant_id"].dtype == np.dtype("<U13")
assert ds1["variant_allele"].dtype == np.dtype("|S6")
assert ds1["sample_family_id"].dtype == np.dtype("<U1")
# TODO: Remove 'None' strings https://github.com/pystatgen/sgkit-plink/issues/16
# which should make these <U1
assert ds1["sample_maternal_id"].dtype == np.dtype("<U4")
assert ds1["sample_paternal_id"].dtype == np.dtype("<U4")


def test_read_slicing(ds1):
gt = ds1["call_genotype"]
shape = gt.shape
Expand Down