fix max_allele

sanjaynagi · Oct 8, 2024 · c9067cd · c9067cd
1 parent bf06193
commit c9067cd
Showing 1 changed file with 51 additions and 19 deletions.
diff --git a/workflow/notebooks/allele-frequencies.ipynb b/workflow/notebooks/allele-frequencies.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "cd3f429b",
    "metadata": {
     "tags": [
@@ -20,9 +20,7 @@
     "    \"\"\"\n",
     "    Load VCF and filter poor-quality samples\n",
     "    \"\"\"\n",
-    "    \n",
-    "    sampleIDs = metadata.sampleID.to_list()\n",
-    "    \n",
+    "      \n",
     "    # load vcf and get genotypes and positions\n",
     "    vcf = allel.read_vcf(vcf_path, fields='*')\n",
     "    samples = vcf['samples']\n",
@@ -74,6 +72,7 @@
     "            anns.append(ann_value)\n",
     "\n",
     "    return np.array(anns)\n",
+    "\n",
     "def vcf_to_snp_dataframe(vcf_path, metadata):\n",
     "\n",
     "    geno, pos, contig, samples, ref, alt, ann = load_vcf(vcf_path=vcf_path, metadata=metadata)\n",
@@ -117,10 +116,8 @@
     "    for coh in cohs:\n",
     "        coh_dict[coh] = np.where(metadata[cohort_col] == coh)[0]\n",
     "    \n",
-    "    tot_ac = geno.count_alleles()\n",
-    "\n",
     "    # get allele counts for each population\n",
-    "    ac = geno.count_alleles_subpops(coh_dict)\n",
+    "    ac = geno.count_alleles_subpops(coh_dict, max_allele=3)\n",
     "    \n",
     "    for coh in cohs:\n",
     "        total_counts = []\n",
@@ -175,7 +172,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "d4571035",
    "metadata": {
     "tags": [
@@ -185,11 +182,11 @@
    },
    "outputs": [],
    "source": [
-    "dataset = 'ampseq-vigg-002'\n",
+    "dataset = 'ampseq-colony'\n",
     "metadata_path = \"../../results/config/metadata.qcpass.tsv\"\n",
-    "cohort_cols = 'taxon,location'\n",
+    "cohort_cols = 'strain' #taxon,location'\n",
     "bed_path = \"../../config/ag-vampir.bed\"\n",
-    "vcf_path = \"../../results/vcfs/targets/ampseq-vigg002.annot.vcf\"\n",
+    "vcf_path = \"../../results/vcfs/targets/ampseq-colony.annot.vcf\"\n",
     "wkdir = \"../..\""
    ]
   },
@@ -209,7 +206,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "id": "ee8c132c",
    "metadata": {
     "tags": [
@@ -236,15 +233,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "19a63cb1",
    "metadata": {
     "scrolled": false,
     "tags": [
      "remove-input"
     ]
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/snagi/miniconda3/lib/python3.11/site-packages/allel/io/vcf_read.py:1732: UserWarning: invalid INFO header: '##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">\\n'\n",
+      "  warnings.warn('invalid INFO header: %r' % header)\n"
+     ]
+    },
+    {
+     "ename": "IndexError",
+     "evalue": "index 2 is out of bounds for axis 1 with size 2",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[11], line 6\u001b[0m\n\u001b[1;32m      3\u001b[0m frq_dfs \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cohort_col \u001b[38;5;129;01min\u001b[39;00m cohort_cols:\n\u001b[0;32m----> 6\u001b[0m     freq_df \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_frequencies_cohort\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      7\u001b[0m \u001b[43m        \u001b[49m\u001b[43msnp_df\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msnp_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m      8\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m      9\u001b[0m \u001b[43m        \u001b[49m\u001b[43mgeno\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgeno\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m     10\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcohort_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcohort_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     11\u001b[0m \u001b[43m        \u001b[49m\u001b[43maf_filter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     12\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmissense_filter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m     13\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     14\u001b[0m     frq_dfs\u001b[38;5;241m.\u001b[39mappend(freq_df\u001b[38;5;241m.\u001b[39mreset_index(drop\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m))\n\u001b[1;32m     16\u001b[0m     plot_allele_frequencies(\n\u001b[1;32m     17\u001b[0m         df\u001b[38;5;241m=\u001b[39mfreq_df\u001b[38;5;241m.\u001b[39mfilter(like\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfrq_\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m     18\u001b[0m         cohort_col\u001b[38;5;241m=\u001b[39mcohort_col\n\u001b[1;32m     19\u001b[0m     )\n",
+      "Cell \u001b[0;32mIn[1], line 116\u001b[0m, in \u001b[0;36mcalculate_frequencies_cohort\u001b[0;34m(snp_df, metadata, geno, cohort_col, af_filter, missense_filter)\u001b[0m\n\u001b[1;32m    114\u001b[0m     alt_idx \u001b[38;5;241m=\u001b[39m row[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124malt_index\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m    115\u001b[0m     total_counts\u001b[38;5;241m.\u001b[39mappend(ac[coh][var_idx,:]\u001b[38;5;241m.\u001b[39msum())\n\u001b[0;32m--> 116\u001b[0m     alt_counts\u001b[38;5;241m.\u001b[39mappend(\u001b[43mac\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcoh\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mvar_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43malt_idx\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m    118\u001b[0m df\u001b[38;5;241m.\u001b[39mloc[:, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcount_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcoh\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(alt_counts)\n\u001b[1;32m    119\u001b[0m df\u001b[38;5;241m.\u001b[39mloc[:, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfrq_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcoh\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mround(np\u001b[38;5;241m.\u001b[39marray(alt_counts)\u001b[38;5;241m/\u001b[39mnp\u001b[38;5;241m.\u001b[39marray(total_counts), \u001b[38;5;241m3\u001b[39m)\n",
+      "File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/allel/model/ndarray.py:2624\u001b[0m, in \u001b[0;36mAlleleCountsArray.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m   2623\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, item):\n\u001b[0;32m-> 2624\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mindex_allele_counts_array\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mtype\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/allel/model/generic.py:143\u001b[0m, in \u001b[0;36mindex_allele_counts_array\u001b[0;34m(ac, item, cls)\u001b[0m\n\u001b[1;32m    140\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mindex_allele_counts_array\u001b[39m(ac, item, \u001b[38;5;28mcls\u001b[39m):\n\u001b[1;32m    141\u001b[0m \n\u001b[1;32m    142\u001b[0m     \u001b[38;5;66;03m# apply indexing operation on underlying values\u001b[39;00m\n\u001b[0;32m--> 143\u001b[0m     out \u001b[38;5;241m=\u001b[39m \u001b[43mac\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m[\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m    145\u001b[0m     \u001b[38;5;66;03m# decide whether to wrap the result as HaplotypeArray\u001b[39;00m\n\u001b[1;32m    146\u001b[0m     wrap_array \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m    147\u001b[0m         \u001b[38;5;28mhasattr\u001b[39m(out, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mndim\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m out\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m  \u001b[38;5;66;03m# dimensionality preserved\u001b[39;00m\n\u001b[1;32m    148\u001b[0m         ac\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m out\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;129;01mand\u001b[39;00m  \u001b[38;5;66;03m# number of alleles preserved\u001b[39;00m\n\u001b[1;32m    149\u001b[0m         \u001b[38;5;129;01mnot\u001b[39;00m contains_newaxis(item)\n\u001b[1;32m    150\u001b[0m     )\n",
+      "\u001b[0;31mIndexError\u001b[0m: index 2 is out of bounds for axis 1 with size 2"
+     ]
+    }
+   ],
    "source": [
     "snp_df, geno = vcf_to_snp_dataframe(vcf_path, metadata)\n",
     "\n",
@@ -277,15 +297,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "72ffa152",
    "metadata": {
     "scrolled": false,
     "tags": [
      "remove-input"
     ]
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'snp_df' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[8], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m pd\u001b[38;5;241m.\u001b[39mset_option(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdisplay.max_rows\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m200\u001b[39m)\n\u001b[1;32m      2\u001b[0m pd\u001b[38;5;241m.\u001b[39mset_option(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdisplay.max_columns\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m100\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m snp_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat([\u001b[43msnp_df\u001b[49m] \u001b[38;5;241m+\u001b[39m frq_dfs, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m      5\u001b[0m snp_df\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mwkdir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/results/snp_frequencies_summary.tsv\u001b[39m\u001b[38;5;124m\"\u001b[39m, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      6\u001b[0m snp_df\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'snp_df' is not defined"
+     ]
+    }
+   ],
    "source": [
     "pd.set_option(\"display.max_rows\", 200)\n",
     "pd.set_option('display.max_columns', 100)\n",
@@ -347,9 +379,9 @@
  "metadata": {
   "celltoolbar": "Tags",
   "kernelspec": {
-   "display_name": "AmpSeq_python",
+   "display_name": "base",
    "language": "python",
-   "name": "ampseq_python"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -361,7 +393,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.18"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,