Skip to content

Commit

Permalink
fix max_allele
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjaynagi committed Oct 8, 2024
1 parent bf06193 commit c9067cd
Showing 1 changed file with 51 additions and 19 deletions.
70 changes: 51 additions & 19 deletions workflow/notebooks/allele-frequencies.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "cd3f429b",
"metadata": {
"tags": [
Expand All @@ -20,9 +20,7 @@
" \"\"\"\n",
" Load VCF and filter poor-quality samples\n",
" \"\"\"\n",
" \n",
" sampleIDs = metadata.sampleID.to_list()\n",
" \n",
" \n",
" # load vcf and get genotypes and positions\n",
" vcf = allel.read_vcf(vcf_path, fields='*')\n",
" samples = vcf['samples']\n",
Expand Down Expand Up @@ -74,6 +72,7 @@
" anns.append(ann_value)\n",
"\n",
" return np.array(anns)\n",
"\n",
"def vcf_to_snp_dataframe(vcf_path, metadata):\n",
"\n",
" geno, pos, contig, samples, ref, alt, ann = load_vcf(vcf_path=vcf_path, metadata=metadata)\n",
Expand Down Expand Up @@ -117,10 +116,8 @@
" for coh in cohs:\n",
" coh_dict[coh] = np.where(metadata[cohort_col] == coh)[0]\n",
" \n",
" tot_ac = geno.count_alleles()\n",
"\n",
" # get allele counts for each population\n",
" ac = geno.count_alleles_subpops(coh_dict)\n",
" ac = geno.count_alleles_subpops(coh_dict, max_allele=3)\n",
" \n",
" for coh in cohs:\n",
" total_counts = []\n",
Expand Down Expand Up @@ -175,7 +172,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "d4571035",
"metadata": {
"tags": [
Expand All @@ -185,11 +182,11 @@
},
"outputs": [],
"source": [
"dataset = 'ampseq-vigg-002'\n",
"dataset = 'ampseq-colony'\n",
"metadata_path = \"../../results/config/metadata.qcpass.tsv\"\n",
"cohort_cols = 'taxon,location'\n",
"cohort_cols = 'strain' #taxon,location'\n",
"bed_path = \"../../config/ag-vampir.bed\"\n",
"vcf_path = \"../../results/vcfs/targets/ampseq-vigg002.annot.vcf\"\n",
"vcf_path = \"../../results/vcfs/targets/ampseq-colony.annot.vcf\"\n",
"wkdir = \"../..\""
]
},
Expand All @@ -209,7 +206,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "ee8c132c",
"metadata": {
"tags": [
Expand All @@ -236,15 +233,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "19a63cb1",
"metadata": {
"scrolled": false,
"tags": [
"remove-input"
]
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/snagi/miniconda3/lib/python3.11/site-packages/allel/io/vcf_read.py:1732: UserWarning: invalid INFO header: '##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">\\n'\n",
" warnings.warn('invalid INFO header: %r' % header)\n"
]
},
{
"ename": "IndexError",
"evalue": "index 2 is out of bounds for axis 1 with size 2",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[11], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m frq_dfs \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cohort_col \u001b[38;5;129;01min\u001b[39;00m cohort_cols:\n\u001b[0;32m----> 6\u001b[0m freq_df \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_frequencies_cohort\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43msnp_df\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msnp_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mgeno\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgeno\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mcohort_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcohort_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43maf_filter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43mmissense_filter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m frq_dfs\u001b[38;5;241m.\u001b[39mappend(freq_df\u001b[38;5;241m.\u001b[39mreset_index(drop\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m))\n\u001b[1;32m 16\u001b[0m plot_allele_frequencies(\n\u001b[1;32m 17\u001b[0m df\u001b[38;5;241m=\u001b[39mfreq_df\u001b[38;5;241m.\u001b[39mfilter(like\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfrq_\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 18\u001b[0m cohort_col\u001b[38;5;241m=\u001b[39mcohort_col\n\u001b[1;32m 19\u001b[0m )\n",
"Cell \u001b[0;32mIn[1], line 116\u001b[0m, in \u001b[0;36mcalculate_frequencies_cohort\u001b[0;34m(snp_df, metadata, geno, cohort_col, af_filter, missense_filter)\u001b[0m\n\u001b[1;32m 114\u001b[0m alt_idx \u001b[38;5;241m=\u001b[39m row[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124malt_index\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 115\u001b[0m total_counts\u001b[38;5;241m.\u001b[39mappend(ac[coh][var_idx,:]\u001b[38;5;241m.\u001b[39msum())\n\u001b[0;32m--> 116\u001b[0m alt_counts\u001b[38;5;241m.\u001b[39mappend(\u001b[43mac\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcoh\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mvar_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43malt_idx\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 118\u001b[0m df\u001b[38;5;241m.\u001b[39mloc[:, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcount_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcoh\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(alt_counts)\n\u001b[1;32m 119\u001b[0m df\u001b[38;5;241m.\u001b[39mloc[:, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfrq_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcoh\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mround(np\u001b[38;5;241m.\u001b[39marray(alt_counts)\u001b[38;5;241m/\u001b[39mnp\u001b[38;5;241m.\u001b[39marray(total_counts), \u001b[38;5;241m3\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/allel/model/ndarray.py:2624\u001b[0m, in \u001b[0;36mAlleleCountsArray.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 2623\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, item):\n\u001b[0;32m-> 2624\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mindex_allele_counts_array\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mtype\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/allel/model/generic.py:143\u001b[0m, in \u001b[0;36mindex_allele_counts_array\u001b[0;34m(ac, item, cls)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mindex_allele_counts_array\u001b[39m(ac, item, \u001b[38;5;28mcls\u001b[39m):\n\u001b[1;32m 141\u001b[0m \n\u001b[1;32m 142\u001b[0m \u001b[38;5;66;03m# apply indexing operation on underlying values\u001b[39;00m\n\u001b[0;32m--> 143\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mac\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m[\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;66;03m# decide whether to wrap the result as HaplotypeArray\u001b[39;00m\n\u001b[1;32m 146\u001b[0m wrap_array \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28mhasattr\u001b[39m(out, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mndim\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m out\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;66;03m# dimensionality preserved\u001b[39;00m\n\u001b[1;32m 148\u001b[0m ac\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m out\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;66;03m# number of alleles preserved\u001b[39;00m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m contains_newaxis(item)\n\u001b[1;32m 150\u001b[0m )\n",
"\u001b[0;31mIndexError\u001b[0m: index 2 is out of bounds for axis 1 with size 2"
]
}
],
"source": [
"snp_df, geno = vcf_to_snp_dataframe(vcf_path, metadata)\n",
"\n",
Expand Down Expand Up @@ -277,15 +297,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "72ffa152",
"metadata": {
"scrolled": false,
"tags": [
"remove-input"
]
},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'snp_df' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[8], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m pd\u001b[38;5;241m.\u001b[39mset_option(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdisplay.max_rows\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m200\u001b[39m)\n\u001b[1;32m 2\u001b[0m pd\u001b[38;5;241m.\u001b[39mset_option(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdisplay.max_columns\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m100\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m snp_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat([\u001b[43msnp_df\u001b[49m] \u001b[38;5;241m+\u001b[39m frq_dfs, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 5\u001b[0m snp_df\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mwkdir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/results/snp_frequencies_summary.tsv\u001b[39m\u001b[38;5;124m\"\u001b[39m, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m snp_df\n",
"\u001b[0;31mNameError\u001b[0m: name 'snp_df' is not defined"
]
}
],
"source": [
"pd.set_option(\"display.max_rows\", 200)\n",
"pd.set_option('display.max_columns', 100)\n",
Expand Down Expand Up @@ -347,9 +379,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "AmpSeq_python",
"display_name": "base",
"language": "python",
"name": "ampseq_python"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -361,7 +393,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.11.6"
}
},
"nbformat": 4,
Expand Down

0 comments on commit c9067cd

Please sign in to comment.