From 2f88f562ed2b0447cca2bde30d214bebd0200b63 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Thu, 7 Mar 2024 15:30:38 +0100 Subject: [PATCH 1/9] exception for the oktoberfest workshop ipynb --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8926250a..7149efa6 100644 --- a/.gitignore +++ b/.gitignore @@ -145,6 +145,7 @@ hash.file # output files in tutorials folder tutorials/ !tutorials/Oktoberfest Tutorial.ipynb +!tutorials/Oktoberfest_workshop.ipynb # example data data/ From 8c3796b45516209e1851972f444be438acdbce93 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Thu, 7 Mar 2024 15:39:29 +0100 Subject: [PATCH 2/9] added a new workshop for Oktoberfest --- tutorials/Oktoberfest_workshop.ipynb | 935 +++++++++++++++++++++++++++ 1 file changed, 935 insertions(+) create mode 100644 tutorials/Oktoberfest_workshop.ipynb diff --git a/tutorials/Oktoberfest_workshop.ipynb b/tutorials/Oktoberfest_workshop.ipynb new file mode 100644 index 00000000..3ba0090c --- /dev/null +++ b/tutorials/Oktoberfest_workshop.ipynb @@ -0,0 +1,935 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "v4RyWlO0elRo" + }, + "source": [ + "# Oktoberfest Workshop\n", + "\n", + "This notebook is prepared to be run in Google [Colaboratory](https://colab.research.google.com/).\n", + "\n", + "This notebook contains tasks that are designed to guide new users through the following topics:\n", + "\n", + "1. How to install oktoberfest and load packages\n", + "2. How to get the required data\n", + "3. How to prepare a configuration file\n", + "4. How to run a job and interpret the output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qr-3HR_3Dymy" + }, + "source": [ + "# 1. Installation\n", + "\n", + "Before using Oktoberfest, the package and dependencies need to be installed. 
This step is only required once on your notebook, but it may need to be repeated in Google Colab.\n", + "\n", + "## Task 1.1\n", + "\n", + "What are the requirements for Oktoberfest and where do you find this information? (Hint: Search the Oktoberfest documentation at readthedocs using your favourite search engine).\n", + "\n", + "## Task 1.2\n", + "\n", + "Execute the below code cell, which installs percolator and Oktoberfest and restart the session if asked." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "iYBkB4ygeMNP", + "outputId": "0d7722fc-7cdc-4d30-fb1f-fde52d980b67" + }, + "outputs": [], + "source": [ + "!wget https://github.com/percolator/percolator/releases/download/rel-3-06-01/percolator-v3-06-linux-amd64.deb\n", + "!dpkg -i percolator-v3-06-linux-amd64.deb\n", + "!pip install oktoberfest" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "h5-R5ogFtNge" + }, + "source": [ + "For this notebook to work, a few packages need to be imported that provide the functions used in the following. Should you get an error here, check that installation of the required packages was successful.\n", + "\n", + "## Task 1.3\n", + "\n", + "Import the below packages and functions by executing the code in the cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NhIY7CQueku_" + }, + "outputs": [], + "source": [ + "from oktoberfest.runner import run_job\n", + "from oktoberfest import __version__ as version\n", + "import os\n", + "import json\n", + "import urllib.request\n", + "import shutil\n", + "from tqdm.auto import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I91JIxGPGUql" + }, + "source": [ + "If this works, you have installed Oktoberfest correctly.\n", + "\n", + "1.4 How can you check that you are using the current stable version? 
(check the output of __version__ using the below code cell and the Oktoberfest documentation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "2RM3nbzjGGGK", + "outputId": "620b568e-0db2-4916-dc38-537bf3a9be5e" + }, + "outputs": [], + "source": [ + " # add code here to check the version of the imported oktoberfest package" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MPB8spZ8gCiP" + }, + "source": [ + "# Task 2: Getting the data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PJz0mVfKgHt7" + }, + "source": [ + "The data used in this notebook is provided as a zip archive that can be downloaded from zenodo from this record https://zenodo.org/records/10793943\n", + "\n", + "## Task 2.1\n", + "\n", + "Find the download link in the public zenodo record. You can copy the link by hovering over the download button, click your right mouse button and choose the option to copy the download link.\n", + "\n", + "## Task 2.2\n", + "\n", + "Define variables for the download link (URL), the download directory, and the local file name using the below code cell and execute the cell afterwards.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eulzl9fOgKZs" + }, + "outputs": [], + "source": [ + "url = # here goes the download link of the file to download from the zenodo record, it should look like \"https://zenodo.org/records/10782588/...\", make sure to include the \"\"\n", + "download_dir = # you can choose any directory, e.g. \"Oktoberfest_input/\", make sure to include the \"\"\n", + "file_name = # you can choose any filename, e.g. \"sample_data.zip\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_gtbcdaoskvq" + }, + "source": [ + "## Task 2.3\n", + "\n", + "Download and unpack the data using the below code cell. You should see a progress bar while it is downloading the file (86MB, approx. 
1 minute)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "4dfab2054a374c0f9de63da7b19f74f1", + "45def39bdf6b4e0d9b63d329392a08ce", + "5d77990432714181a9f21545939c8775", + "39d0c356fcfd48b1b1d42142397ab294", + "c5edae3fc6894d29aebb4aa3463bb667", + "c13cb4073ce549d09626ca546231010a", + "fcdeeacca4f747368ee07b0a9b925383", + "8311cf488ddb4fac88e454f07f5eceb5", + "94108a7aabcd4dd5a53d4e137fe6d5a6", + "f36332d9421345598bbbf690b39efef5", + "27123654fb134ea79526e74d2cbb3f89" + ] + }, + "id": "IgxrLYTzsk3-", + "outputId": "b26b731f-107e-4f90-d621-f91cfa051640" + }, + "outputs": [], + "source": [ + "if not os.path.isdir(download_dir):\n", + " os.mkdir(download_dir)\n", + "download_file = os.path.join(download_dir, 'HLA_sample.zip')\n", + "with tqdm(unit=\"B\", total=70958154, unit_scale=True, unit_divisor=1000, miniters=1, desc=url.split(\"/\")[-1]) as t:\n", + " urllib.request.urlretrieve(url=url, filename=download_file, reporthook=lambda blocks, block_size, _: t.update(blocks * block_size - t.n))\n", + "shutil.unpack_archive(download_file, download_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xCpLdxnuK9f7" + }, + "source": [ + "## Task 2.4\n", + "\n", + "Check that the download was successful. Hint: Use the file browser on the left side to search for the folder you defined using the __download_dir__ variable above and check the content. What do you find here?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Sw6EevVMnVRR" + }, + "source": [ + "# Task 3: Rescoring with Oktoberfest\n", + "\n", + "The main feature of oktoberfest is to perform rescoring. 
This requires two main inputs:\n", + "- unfiltered search results, for MaxQuant, this would mean a run with 100% PSM and peptide FDR\n", + "- acquired spectra, either in ThermoFisher .RAW, Bruker .d, or mzML format\n", + "\n", + "In addition, Oktoberfest can acquire predictions from various data dependent models, that are provided by a Koina instance." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DnWbSve1gLpb" + }, + "source": [ + "## Task 3.1\n", + "\n", + "Where do you find information about the configuration options, example configurations, and the supported prediction models (Hint: Check the [Usage principles](https://oktoberfest.readthedocs.io/en/latest/usage.html) in the Oktoberfest documentation)? Define below variables accordingly.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HN7YqI6pOhaD" + }, + "outputs": [], + "source": [ + "spectra = # this is the location of the mzML file containing the measured spectra, i.e. \"/.mzml\"\n", + "spectra_type = # this is the format the spectra are provided in (\"mzml\", \"RAW\", \"d\"), which one is correct here?\n", + "\n", + "search_results = # this is the location of the search engine output, i.e. \"/\"\n", + "search_results_type = # this is the name of the search engine that produced the search results, which is the correct search engine here?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JBa27_d5gSpV" + }, + "source": [ + "## Task 3.2\n", + "\n", + "The data we are working with here was acquired using beam-type collision induced dissociation (HCD) without tandem mass tags (TMT).\n", + "\n", + "Which are the models to use for fragment intensity prediction and retention time prediction and the server URL that provides access to these models (Hint: Check the [Usage principles](https://oktoberfest.readthedocs.io/en/latest/usage.html) in the Oktoberfest documentation)?\n", + "\n", + "Also specify the directory you want to store all the outputs from Oktoberfest in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EQEFvoOxgTuE" + }, + "outputs": [], + "source": [ + "intensity_model = # this is the model used for fragment intensity prediction, e.g. \"some model\"\n", + "retention_time_model = # this is the model used for retention time prediction, e.g. \"some model\"\n", + "prediction_server = # the Koina server that provides access to the specified models, e.g. \":\"\n", + "\n", + "output_directory = # this is the output folder for everything Oktoberfest produces during rescoring, e.g. \"rescore_out\"\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "17D9Zkb4xEU3" + }, + "source": [ + "## Task 3.3\n", + "\n", + "Save the variables you have defined above in the below configuration and store it to disk. For simplicity, this is providing a minimal configuration for this task, so you can simply execute the code cell.\n", + "\n", + "A detailed explanation of all available configuration options can be found in the [Usage principles](https://oktoberfest.readthedocs.io/en/latest/usage.html) in the Oktoberfest documentation.\n", + "\n", + "What are the mass tolerance and unit variables for?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QJWBZP1fgiXL" + }, + "outputs": [], + "source": [ + "task_config_rescoring = {\n", + " \"type\": \"Rescoring\",\n", + " \"inputs\":{\n", + " \"search_results\": search_results,\n", + " \"search_results_type\": search_results_type,\n", + " \"spectra\": spectra,\n", + " \"spectra_type\": spectra_type\n", + " },\n", + " \"output\": output_directory,\n", + " \"models\": {\n", + " \"intensity\": intensity_model,\n", + " \"irt\": retention_time_model\n", + " },\n", + " \"prediction_server\": prediction_server,\n", + " \"ssl\": True,\n", + " \"numThreads\": 1,\n", + " \"fdr_estimation_method\": \"percolator\",\n", + " \"massTolerance\": 20,\n", + " \"unitMassTolerance\": \"ppm\"\n", + "}\n", + "\n", + "# this is for storing the file on disk\n", + "with open('./rescoring_config.json', 'w') as fp:\n", + " json.dump(task_config_rescoring, fp)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m7TJIKwljrOV" + }, + "source": [ + "(Optional) You can now check the configuration file on disk, to see if it looks correctly by finding it with the file browser on the left.\n", + "\n", + "The oktoberfest documentation provides [example configurations](https://oktoberfest.readthedocs.io/en/latest/jobs.html#c-rescoring) that show you how a typical rescoring run for MaxQuant is set up with all the available options.\n", + "\n", + "If you want to get detailed information about individual options and allowed values, you can check the documentation for the [full configuration](https://oktoberfest.readthedocs.io/en/latest/config.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cPZcc7qlkC9w" + }, + "source": [ + "## Task 3.4\n", + "\n", + "Start the rescoring run.\n", + "\n", + "After preparation of the configuration file, oktoberfest can be instructed to run a job with the provided configuration file. This step may take a while (approx. 
3-5 minutes) and provide you with log output that tracks the progress of rescoring.\n", + "Oktoberfest will perform the following steps:\n", + "\n", + "- read the search results from maxquant and translate them to the internal format used by Oktoberfest. The specification for this format can be found in the documentation.\n", + "- parse the mzml data to retrieve MS2 spectra, then merge with the search results to generate PSMs, filtering out spectra without a search result\n", + "- annotation of spectra for all y- and b-fragments in charge states 1-3\n", + "- perform a NCE calibration using the top 1000 highest scoring target PSMs, to determine the NCE for which the highest spectral angle can be achieved with the acquired predictions\n", + "- fragment intensity and retention time prediction for all PSMs\n", + "- retention time alignment, spectral angle and further feature calculation for rescoring using percolator\n", + "- rescoring using features from intensity and retention time prediction and the original search engine score\n", + "- plotting summaries of the rescoring run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1_OsKhWZmeP1", + "outputId": "22954f37-4125-4b6c-af49-b7898203f466" + }, + "outputs": [], + "source": [ + "run_job(\"./rescoring_config.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Odldu8aDms4f" + }, + "source": [ + "## Task 3.5\n", + "\n", + "Explore the output folder of Oktoberfest using the file browser on the left.\n", + "\n", + "Where do you find information about the output folder structure and what you can find where (Hint: Check the [Usage principles](https://oktoberfest.readthedocs.io/en/latest/usage.html) in the Oktoberfest documentation)?\n", + "\n", + "Did rescoring work and provide better results (Results discussion follows)?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xyS7rx9enKeH" + }, + "source": [ + "# Task 4: Spectral library generation\n", + "\n", + "A second feature of Oktoberfest is the generation of spectral libraries, which can be used for DIA analysis. Similarly to rescoring, a configuration file needs to be prepared. In this case, one main input is required:\n", + "\n", + "- fasta file, to perform an in-silico digestion with given settings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "esXnhTu9o16z" + }, + "source": [ + "## Task 4.1\n", + "\n", + "What inputs are required for spectral library generation? You can check the documentation again, and fill out the below code cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AVtH70bWzXZj" + }, + "outputs": [], + "source": [ + "library_input = # this is the location of the fasta or peptide list, e.g. \"/path/to/.fasta\"\n", + "library_input_type = # this is the format of the input you provide, e.g. \"fasta\" or \"peptides\", which one is correct here?\n", + "\n", + "output_directory = # this is the output folder for everything Oktoberfest produces during spectral library generation, e.g. \"speclib_out\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5pUZGiCx7B2e" + }, + "source": [ + "## Task 4.2\n", + "\n", + "Choose some settings for the library generation. You can check the documentation for detailed information and play around with the values below. Beware that more freedom in missed cleavages or more than one precursor charge will lead to longer prediction time and larger file sizes. It makes sense to try this out with minimal values first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AEQinnTN67KQ" + }, + "outputs": [], + "source": [ + "collisionEnergy = # the collision energy for which the spectral library should be produced, e.g. 
30\n", + "precursorCharge = # the precursor charges that should be considered when creating the library, e.g. 0 or [2,3], or [1,2,3] (more than one increases prediction time / file size)\n", + "format = # the desired format for the library, e.g. \"spectronaut\" or \"msp\", \"msp\" is smaller, \"spectronaut\"\n", + "\n", + "missedCleavages = # this is the number of missed cleavages that should be allowed (higher values increase prediction time / file size)\n", + "minLength = # minimal allowed peptide length, prosit accepts everything >= 7\n", + "maxLength = # maximal allowed peptide length, prosit accepts everything <= 30\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2gokulhU8one" + }, + "source": [ + "## Task 4.3\n", + "\n", + "Save the variables you have defined above in the below configuration and store it to disk. For simplicity, this is providing a minimal configuration for this task, so you can simply execute the code cell.\n", + "\n", + "A detailed explanation of all available configuration options can be found in the [Usage principles](https://oktoberfest.readthedocs.io/en/latest/usage.html) in the Oktoberfest documentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ybaf6vuSpAPH" + }, + "outputs": [], + "source": [ + "task_config_spectral_lib = {\n", + " \"type\": \"SpectralLibraryGeneration\",\n", + " \"tag\": \"\",\n", + " \"inputs\": {\n", + " \"library_input\": library_input,\n", + " \"library_input_type\": library_input_type\n", + " },\n", + " \"output\": output_directory,\n", + " \"models\": {\n", + " \"intensity\": intensity_model,\n", + " \"irt\": retention_time_model\n", + " },\n", + " \"prediction_server\": prediction_server,\n", + " \"ssl\": True,\n", + " \"numThreads\": 1,\n", + " \"spectralLibraryOptions\": {\n", + " \"fragmentation\": \"HCD\",\n", + " \"collisionEnergy\": collisionEnergy,\n", + " \"precursorCharge\": precursorCharge,\n", + " \"minIntensity\": 5e-4,\n", + " \"batchsize\": 10000,\n", + " \"format\": format,\n", + " },\n", + " \"fastaDigestOptions\": {\n", + " \"digestion\": \"full\",\n", + " \"missedCleavages\": missedCleavages,\n", + " \"minLength\": minLength,\n", + " \"maxLength\": maxLength,\n", + " \"enzyme\": \"trypsin\",\n", + " \"specialAas\": \"KR\",\n", + " \"db\": \"target\"\n", + " },\n", + "}\n", + "\n", + "with open('./spectral_library_config.json', 'w') as fp:\n", + " json.dump(task_config_spectral_lib, fp)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cawfWOoKqMiX" + }, + "source": [ + "## Task 4.4\n", + "\n", + "Start the spectral library generation.\n", + "\n", + "This step may take a while (TODO minutes). The log output tracks the progress of library generation. 
Oktoberfest will perform the following steps:\n", + "\n", + "- read the fasta file and perform an in-silico digest according to the settings provided in the configuration file\n", + "- acquire fragment intensity and retention time predictions in batches and write them to disk on the fly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "EPLOR3gvpuS7", + "outputId": "5cae34b1-e7e1-483b-8ddf-b5e0e7ad3fcf" + }, + "outputs": [], + "source": [ + "run_job(\"./spectral_library_config.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8r-akZkrpzHC" + }, + "source": [ + "After Oktoberfest is done generating the library, the specified output folder contains a file called \"myPrositLib.msp\" (MSP) or \"myPrositLib.csv\" (spectronaut). Check to see if everything worked out correctly." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "27123654fb134ea79526e74d2cbb3f89": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "39d0c356fcfd48b1b1d42142397ab294": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f36332d9421345598bbbf690b39efef5", + "placeholder": "​", + "style": "IPY_MODEL_27123654fb134ea79526e74d2cbb3f89", + "value": " 85.8M/? [00:05<00:00, 22.7MB/s]" + } + }, + "45def39bdf6b4e0d9b63d329392a08ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c13cb4073ce549d09626ca546231010a", + "placeholder": "​", + "style": "IPY_MODEL_fcdeeacca4f747368ee07b0a9b925383", + "value": "HLA_sample.zip?download=1: " + } + }, + "4dfab2054a374c0f9de63da7b19f74f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_45def39bdf6b4e0d9b63d329392a08ce", + "IPY_MODEL_5d77990432714181a9f21545939c8775", + "IPY_MODEL_39d0c356fcfd48b1b1d42142397ab294" + ], + "layout": "IPY_MODEL_c5edae3fc6894d29aebb4aa3463bb667" + } + }, + "5d77990432714181a9f21545939c8775": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8311cf488ddb4fac88e454f07f5eceb5", + "max": 90001961, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_94108a7aabcd4dd5a53d4e137fe6d5a6", + "value": 90001961 + } + }, + "8311cf488ddb4fac88e454f07f5eceb5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "94108a7aabcd4dd5a53d4e137fe6d5a6": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c13cb4073ce549d09626ca546231010a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c5edae3fc6894d29aebb4aa3463bb667": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f36332d9421345598bbbf690b39efef5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fcdeeacca4f747368ee07b0a9b925383": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 3823df5571f6c0f4c36a1f46cca2d69331d3feaa Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Thu, 7 Mar 2024 15:40:25 +0100 Subject: [PATCH 3/9] added additional sequence filters --- oktoberfest/preprocessing/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oktoberfest/preprocessing/preprocessing.py b/oktoberfest/preprocessing/preprocessing.py index 794a7306..31aabbe9 100644 --- a/oktoberfest/preprocessing/preprocessing.py +++ b/oktoberfest/preprocessing/preprocessing.py @@ -180,7 +180,7 @@ def filter_peptides(peptides: pd.DataFrame, min_length: int, max_length: int, ma & (~peptides["MODIFIED_SEQUENCE"].str.contains(r"\(ac\)")) & (~peptides["MODIFIED_SEQUENCE"].str.contains(r"\(Acetyl \(Protein N-term\)\)")) & (~peptides["MODIFIED_SEQUENCE"].str.contains(r"\[UNIMOD\:21\]")) - & (~peptides["SEQUENCE"].str.contains("U|X")) + & (~peptides["SEQUENCE"].str.contains(r"B|\*|\.|U|X|Z")) ] From 5b40977d4a1dda063f76672640e18ca7aa938aa7 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Thu, 7 Mar 2024 15:45:43 +0100 Subject: [PATCH 4/9] separate prediction step test --- oktoberfest/data/spectra.py | 6 +++++ oktoberfest/runner.py | 47 +++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git 
a/oktoberfest/data/spectra.py b/oktoberfest/data/spectra.py index 20a3c104..649e8b56 100644 --- a/oktoberfest/data/spectra.py +++ b/oktoberfest/data/spectra.py @@ -214,6 +214,12 @@ def from_hdf5(cls: Type[SpectraT], input_file: Union[str, Path]) -> SpectraT: sparse_raw_intensities = hdf5.read_file(input_file, f"sparse_{hdf5.INTENSITY_RAW_KEY}") if not sparse_raw_intensities.empty: spectra.add_matrix_from_hdf5(sparse_raw_intensities, FragmentType.RAW) + try: + sparse_pred_intensities = hdf5.read_file(input_file, f"sparse_{hdf5.INTENSITY_PRED_KEY}") + if not sparse_pred_intensities.empty: + spectra.add_matrix_from_hdf5(sparse_pred_intensities, FragmentType.PRED) + except Exception as e: + logger.warning(e) sparse_raw_mzs = hdf5.read_file(input_file, f"sparse_{hdf5.MZ_RAW_KEY}") if not sparse_raw_mzs.empty: spectra.add_matrix_from_hdf5(sparse_raw_mzs, FragmentType.MZ) diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py index 05387f52..02d0f15f 100644 --- a/oktoberfest/runner.py +++ b/oktoberfest/runner.py @@ -351,6 +351,14 @@ def generate_spectral_lib(config_path: Union[str, Path]): # Create a pool for producer processes predictor_pool = pool.Pool(config.num_threads) + consumer_process = Process( + target=speclib.async_write, + args=( + shared_queue, + writing_progress, + ), + ) + try: results = [] for i in batches: @@ -376,13 +384,7 @@ def generate_spectral_lib(config_path: Union[str, Path]): total=n_batches, desc="Writing library", postfix={"successful": 0, "missing": 0} ) as writer_pbar: # Start the consumer process - consumer_process = Process( - target=speclib.async_write, - args=( - shared_queue, - writing_progress, - ), - ) + consumer_process.start() with tqdm( total=n_batches, desc="Getting predictions", postfix={"successful": 0, "failed": 0} @@ -475,23 +477,28 @@ def _calculate_features(spectra_file: Path, config: Config): if calc_feature_step.is_done(): return - predict_kwargs = { - "server_url": config.prediction_server, - "ssl": config.ssl, - } + 
predict_step = ProcessStep(config.output, "predict." + spectra_file.stem) + if not predict_step.is_done(): - pred_intensities = pr.predict( - data=library.spectra_data, - model_name=config.models["intensity"], - **predict_kwargs, - ) + predict_kwargs = { + "server_url": config.prediction_server, + "ssl": config.ssl, + } - pred_irts = pr.predict(data=library.spectra_data, model_name=config.models["irt"], **predict_kwargs) + pred_intensities = pr.predict( + data=library.spectra_data, + model_name=config.models["intensity"], + **predict_kwargs, + ) - library.add_matrix(pd.Series(pred_intensities["intensities"].tolist(), name="intensities"), FragmentType.PRED) - library.add_column(pred_irts["irt"], name="PREDICTED_IRT") + pred_irts = pr.predict(data=library.spectra_data, model_name=config.models["irt"], **predict_kwargs) - library.write_pred_as_hdf5(config.output / "data" / spectra_file.with_suffix(".mzml.pred.hdf5").name).join() + library.add_matrix(pd.Series(pred_intensities["intensities"].tolist(), name="intensities"), FragmentType.PRED) + library.add_column(pred_irts["irt"], name="PREDICTED_IRT") + + library.write_pred_as_hdf5(config.output / "data" / spectra_file.with_suffix(".mzml.pred.hdf5").name).join() + + predict_step.mark_done() # produce percolator tab files fdr_dir = config.output / "results" / config.fdr_estimation_method From 2926950805b9287b1891b5bbe43d7fa68396ad18 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Thu, 7 Mar 2024 15:55:42 +0100 Subject: [PATCH 5/9] updated everything to new koina URL --- docs/config.rst | 2 +- docs/jobs.rst | 6 +++--- docs/predictions.rst | 20 +++++++++---------- oktoberfest/predict/koina.py | 6 +++--- oktoberfest/utils/example_configs.py | 6 +++--- tests/unit_tests/configs/ce_calib_ransac.json | 2 +- .../configs/spectral_library_with_digest.json | 2 +- tests/unit_tests/test_predictions.py | 6 +++--- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/config.rst b/docs/config.rst index 
c14f9860..0474ba36 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -22,7 +22,7 @@ Always required +----------------------------+--------------------------------------------------------------------------------------------------------------------+ | irt | Name of the model used for indexed retention time prediction | +----------------------------+--------------------------------------------------------------------------------------------------------------------+ - | prediction_server | Server and port for obtaining peptide property predictions; default: "koina.proteomicsdb.org:443" | + | prediction_server | Server and port for obtaining peptide property predictions; default: "koina.wilhelmlab.org:443" | +----------------------------+--------------------------------------------------------------------------------------------------------------------+ | ssl | Use ssl when making requests to the prediction server, can be true or false; default = true | +----------------------------+--------------------------------------------------------------------------------------------------------------------+ diff --git a/docs/jobs.rst b/docs/jobs.rst index 8dc747e5..0b416cc0 100644 --- a/docs/jobs.rst +++ b/docs/jobs.rst @@ -65,7 +65,7 @@ Example config file: "intensity": "Prosit_2020_intensity_HCD", "irt": "Prosit_2019_irt" }, - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "numThreads": 1, "regressionMethod": "spline", "ssl": true, @@ -138,7 +138,7 @@ Example config file: "specialAas": "KR", "db": "concat" }, - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "numThreads": 1, "ssl": true } @@ -190,7 +190,7 @@ Example config file: "intensity": "Prosit_2020_intensity_HCD", "irt": "Prosit_2019_irt" }, - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "numThreads": 1, "fdr_estimation_method": "mokapot", "allFeatures": 
false, diff --git a/docs/predictions.rst b/docs/predictions.rst index 7ccfbec2..c61d762a 100644 --- a/docs/predictions.rst +++ b/docs/predictions.rst @@ -1,18 +1,18 @@ Retrieving predictions ====================== -Oktoberfest relies on retrieving predictions from a `Koina `_ server that hosts specific models for peptide property prediction. Users can use any publicly available community server or host their own server. +Oktoberfest relies on retrieving predictions from a `Koina `_ server that hosts specific models for peptide property prediction. Users can use any publicly available community server or host their own server. Connecting to a community server -------------------------------- -Our publicly available community server is available at `koina.proteomicsdb.org:443`. +Our publicly available community server is available at `koina.wilhelmlab.org:443`. If you want to connect to it, you need to have the following flags in your config file (default settings): .. code-block:: json { - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "ssl": true, } @@ -31,13 +31,13 @@ This is the list of currently supported and tested models for Oktoberfest provid +==================================================================================================================+==============================================================================================================================================================================================+ | Prosit_2019_intensity | Developed for HCD tryptic peptides only. We recommend using the Prosit_2020_intensity_HCD model instead, since it showed slightly superior performance on tryptic peptides as well. 
| +------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | `Prosit_2020_intensity_HCD `_ | Developed for HCD tryptic and non-tryptic peptides. Supported modifications are oxidation and carbamidomethylation. Latest version we recommend to use for HCD. | + | `Prosit_2020_intensity_HCD `_ | Developed for HCD tryptic and non-tryptic peptides. Supported modifications are oxidation and carbamidomethylation. Latest version we recommend to use for HCD. | +------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | `Prosit_2020_intensity_CID `_ | Developed for CID tryptic and non-tryptic peptides. Supported modifications are oxidation and carbamidomethylation. Latest version we recommend to use for CID. | + | `Prosit_2020_intensity_CID `_ | Developed for CID tryptic and non-tryptic peptides. Supported modifications are oxidation and carbamidomethylation. Latest version we recommend to use for CID. | +------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | `Prosit_2020_intensity_TMT `_ | Developed for HCD and CID, tryptic and non-tryptic peptides. Latest version we commend for TMT labeled peptides in general. | + | `Prosit_2020_intensity_TMT `_ | Developed for HCD and CID, tryptic and non-tryptic peptides. 
Latest version we commend for TMT labeled peptides in general. | +------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | `Prosit_2023_intensity_timsTOF `_ | Developed for timsTOF, tryptic and non-tryptic peptides. Latest version we commend to use for timsTOF. | + | `Prosit_2023_intensity_timsTOF `_ | Developed for timsTOF, tryptic and non-tryptic peptides. Latest version we commend to use for timsTOF. | +------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ .. table:: @@ -46,9 +46,9 @@ This is the list of currently supported and tested models for Oktoberfest provid +-----------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ | iRT models | Description | +===============================================================================================+===========================================================================================================================+ - | `Prosit_2019_irt `_ | While developed for tryptic peptides only, we did not observe a drop in prediction performance for non-tryptic peptides. | + | `Prosit_2019_irt `_ | While developed for tryptic peptides only, we did not observe a drop in prediction performance for non-tryptic peptides. 
| +-----------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ - | `Prosit_2020_irt_TMT `_ | Developed for TMT labeled peptides. | + | `Prosit_2020_irt_TMT `_ | Developed for TMT labeled peptides. | +-----------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ Once support for additional models is implemented in Oktoberfest, they will be added here. @@ -56,4 +56,4 @@ Once support for additional models is implemented in Oktoberfest, they will be a Hosting and adding your own models ---------------------------------- -In case you are planning to host your own private or public instance of Koina or want us to host your model, please refer to the official `Koina documentation `_. \ No newline at end of file +In case you are planning to host your own private or public instance of Koina or want us to host your model, please refer to the official `Koina documentation `_. \ No newline at end of file diff --git a/oktoberfest/predict/koina.py b/oktoberfest/predict/koina.py index 20bc79fc..48496e88 100644 --- a/oktoberfest/predict/koina.py +++ b/oktoberfest/predict/koina.py @@ -34,7 +34,7 @@ class Koina: def __init__( self, model_name: str, - server_url: str = "koina.proteomicsdb.org:443", + server_url: str = "koina.wilhelmlab.org:443", ssl: bool = True, targets: Optional[List[str]] = None, disable_progress_bar: bool = False, @@ -49,7 +49,7 @@ def __init__( and that the specified model is available on the server. :param model_name: The name of the Koina model to be used for inference. - :param server_url: The URL of the inference server. Defaults to "koina.proteomicsdb.org:443". + :param server_url: The URL of the inference server. 
Defaults to "koina.wilhelmlab.org:443". :param ssl: Indicates whether to use SSL for communication with the server. Defaults to True. :param targets: An optional list of targets to predict. If this is None, all model targets are predicted and received. @@ -100,7 +100,7 @@ def _is_server_ready(self): if not self.client.is_server_live(): raise ValueError("Server not yet started.") except InferenceServerException as e: - if self.url == "koina.proteomicsdb.org:443": + if self.url in ["koina.wilhelmlab.org:443", "koina.proteomicsdb.org:443"]: if self.ssl: raise InferenceServerException( "The public koina network seems to be inaccessible at the moment. " diff --git a/oktoberfest/utils/example_configs.py b/oktoberfest/utils/example_configs.py index b5cc6ea6..55298ac3 100644 --- a/oktoberfest/utils/example_configs.py +++ b/oktoberfest/utils/example_configs.py @@ -4,7 +4,7 @@ "inputs": {"search_results": "msms.txt", "search_results_type": "Maxquant", "spectra": "./", "spectra_type": "raw"}, "output": "./out", "models": {"intensity": "Prosit_2020_intensity_HCD", "irt": "Prosit_2019_irt"}, - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "ssl": True, "thermoExe": "ThermoRawFileParser.exe", "numThreads": 1, @@ -25,7 +25,7 @@ "inputs": {"search_results": "msms.txt", "search_results_type": "Maxquant", "spectra": "./", "spectra_type": "raw"}, "output": "./out", "models": {"intensity": "Prosit_2020_intensity_HCD", "irt": "Prosit_2019_irt"}, - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "ssl": True, "thermoExe": "ThermoRawFileParser.exe", "numThreads": 1, @@ -48,7 +48,7 @@ }, "output": "./out", "models": {"intensity": "Prosit_2020_intensity_HCD", "irt": "Prosit_2019_irt"}, - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "ssl": True, "spectralLibraryOptions": { "fragmentation": "HCD", diff --git 
a/tests/unit_tests/configs/ce_calib_ransac.json b/tests/unit_tests/configs/ce_calib_ransac.json index e73d1d5d..159cec6e 100644 --- a/tests/unit_tests/configs/ce_calib_ransac.json +++ b/tests/unit_tests/configs/ce_calib_ransac.json @@ -12,7 +12,7 @@ "intensity": "Prosit_2023_intensity_timsTOF", "irt": "Prosit_2019_irt" }, - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "ssl": true, "numThreads": 1, "massTolerance": 20, diff --git a/tests/unit_tests/configs/spectral_library_with_digest.json b/tests/unit_tests/configs/spectral_library_with_digest.json index 165d016e..e76de025 100644 --- a/tests/unit_tests/configs/spectral_library_with_digest.json +++ b/tests/unit_tests/configs/spectral_library_with_digest.json @@ -11,7 +11,7 @@ "irt": "Prosit_2019_irt" }, "outputFormat": "spectronaut", - "prediction_server": "koina.proteomicsdb.org:443", + "prediction_server": "koina.wilhelmlab.org:443", "ssl": true, "spectralLibraryOptions": { "fragmentation": "HCD", diff --git a/tests/unit_tests/test_predictions.py b/tests/unit_tests/test_predictions.py index e4aba786..ebcf40fc 100644 --- a/tests/unit_tests/test_predictions.py +++ b/tests/unit_tests/test_predictions.py @@ -19,12 +19,12 @@ def test_prosit_tmt(self): pred_intensities = predict( input_data, model_name="Prosit_2020_intensity_TMT", - server_url="koina.proteomicsdb.org:443", + server_url="koina.wilhelmlab.org:443", ssl=True, targets=["intensities", "annotation"], ) pred_irt = predict( - input_data, model_name="Prosit_2020_irt_TMT", server_url="koina.proteomicsdb.org:443", ssl=True + input_data, model_name="Prosit_2020_irt_TMT", server_url="koina.wilhelmlab.org:443", ssl=True ) library.add_matrix(pd.Series(pred_intensities["intensities"].tolist(), name="intensities"), FragmentType.PRED) @@ -48,7 +48,7 @@ def test_failing_koina(self): predict, input_data, model_name="Prosit_2020_intensity_HCD", - server_url="koina.proteomicsdb.org:443", + 
server_url="koina.wilhelmlab.org:443", ssl=True, targets=["intensities", "annotation"], ) From ecf70fd5d6290f30a5797db3d5ed328c8925b1e7 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 10 Mar 2024 10:21:49 +0100 Subject: [PATCH 6/9] fixed typos and download link example --- tutorials/Oktoberfest_workshop.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/Oktoberfest_workshop.ipynb b/tutorials/Oktoberfest_workshop.ipynb index 3ba0090c..0f27bb48 100644 --- a/tutorials/Oktoberfest_workshop.ipynb +++ b/tutorials/Oktoberfest_workshop.ipynb @@ -146,9 +146,9 @@ }, "outputs": [], "source": [ - "url = # here goes the download link of the file to download from the zenodo record, it should look like \"https://zenodo.org/records/10782588/...\", make sure to include the \"\"\n", - "download_dir = # you can chose any directory, e.g. \"Oktoberfest_input/\", make sure to include the \"\"\n", - "file_name = # you can chose any filename, e.g. \"sample_data.zip\"" + "url = # here goes the download link of the file to download from the zenodo record, it should look like \"https://zenodo.org/records/10793943/...\", make sure to include the \"\"\n", + "download_dir = # you can choose any directory, e.g. \"Oktoberfest_input/\", make sure to include the \"\"\n", + "file_name = # you can choose any filename, e.g. 
\"sample_data.zip\"" ] }, { From 36482dda83b0ee9c1776a0536605734fd6af9db2 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 10 Mar 2024 11:01:42 +0100 Subject: [PATCH 7/9] show rescoring summary plots --- oktoberfest/plotting/plotting.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/oktoberfest/plotting/plotting.py b/oktoberfest/plotting/plotting.py index 479b3b56..e3451666 100644 --- a/oktoberfest/plotting/plotting.py +++ b/oktoberfest/plotting/plotting.py @@ -118,11 +118,16 @@ def joint_plot( height=10, joint_kws={"rasterized": True, "edgecolor": "none", "s": 10}, ) + jplot.ax_joint.axhline(y=0, c="red") + jplot.ax_joint.axvline(x=0, c="red") + jplot.ax_marg_y.axhline(y=0, c="red") + jplot.ax_marg_x.axvline(x=0, c="red") + jplot.ax_joint.set_ylabel("Score\n(peptide property prediction)") jplot.ax_joint.set_xlabel("Score\n(search engine)") jplot.fig.suptitle(f"Score distribution ({level.capitalize()})", y=0.99) plt.savefig(filename, dpi=300) - plt.plot() + plt.show() plt.close() @@ -196,7 +201,7 @@ def plot_gain_loss(prosit_target: pd.DataFrame, andromeda_target: pd.DataFrame, ax.spines["top"].set_visible(False) ax.spines["bottom"].set_visible(False) # grid - ax.set_ylabel("number of lost-common-shared targets below 1% FDR") + ax.set_ylabel(f"number of target {level.lower()}s below 1% FDR") ax.set_axisbelow(True) ax.yaxis.grid(color="black") ax.tick_params(axis="y", which="major") @@ -207,7 +212,7 @@ def plot_gain_loss(prosit_target: pd.DataFrame, andromeda_target: pd.DataFrame, legend_label = ["Common", "Gained", "Lost"] plt.legend(legend_label, ncol=1, bbox_to_anchor=([1.2, 0.5, 0, 0]), frameon=False) plt.savefig(filename, dpi=300, bbox_inches="tight") - plt.plot() + plt.show() plt.close() @@ -236,7 +241,11 @@ def plot_violin_sa_ce(sa_ce_df: pd.DataFrame, filename: Union[str, Path]): """ fig, ax = plt.subplots(figsize=(8, 8)) sns.violinplot(data=sa_ce_df, x="COLLISION_ENERGY", y="SPECTRAL_ANGLE", ax=ax, 
color="#1f77b4") - ax.axvline(x=sa_ce_df["COLLISION_ENERGY"][sa_ce_df["SPECTRAL_ANGLE"].idxmax()], color="red") + ax.axvline( + x=sa_ce_df["COLLISION_ENERGY"][sa_ce_df["SPECTRAL_ANGLE"].idxmax()] - sa_ce_df["COLLISION_ENERGY"].min(), + color="red", + ) + plt.xticks(rotation=90) plt.grid() plt.savefig(filename, dpi=300) plt.plot() From 70a77068419e036ac598fdcc0af2bdf87d26a4e5 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 10 Mar 2024 11:08:11 +0100 Subject: [PATCH 8/9] fixed some typos and added zip result folder --- tutorials/Oktoberfest_workshop.ipynb | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tutorials/Oktoberfest_workshop.ipynb b/tutorials/Oktoberfest_workshop.ipynb index 0f27bb48..3db2e997 100644 --- a/tutorials/Oktoberfest_workshop.ipynb +++ b/tutorials/Oktoberfest_workshop.ipynb @@ -217,9 +217,9 @@ "\n", "The main feature of oktoberfest is to perform rescoring. This requires two main inputs:\n", "- unfiltered search results, for MaxQuant, this would mean a run with 100% PSM and peptide FDR\n", - "- acquired spectra, either in ThermoFisher .RAW, Bruker .d, or mzML format\n", + "- measured spectra, either in ThermoFisher .RAW, Bruker .d, or mzML format\n", "\n", - "In addition, Oktoberfest can acquire predictions from various data dependent models, that are provided by a Koina instance." + "In addition, Oktoberfest can get predictions from various data dependent models, that are provided by a Koina instance." ] }, { @@ -358,7 +358,7 @@ "- read the search results from maxquant and translate them to the internal format used by Oktoberfest.
The specification for this format can be found in the documentation.\n", "- parse the mzml data to retrieve MS2 spectra, then merge with the search results to generate PSMs, filtering out spectra without a search result\n", "- annotation of spectra for all y- and b-fragments in charge states 1-3\n", - "- perform a NCE calibration using the top 1000 highest scoring target PSMs, to determine the NCE for which the highest spectral angle can be achieved with the acquired predictions\n", + "- perform a normalized collision energy (NCE) calibration using the top 1000 highest scoring target PSMs, to determine the NCE for which the highest spectral angle can be achieved\n", "- fragment intensity and retention time prediction for all PSMs\n", "- retention time alignment, spectral angle and further feature calculation for rescoring using percolator\n", "- rescoring using features from intensity and retention time prediction and the original search engine score\n", @@ -392,7 +392,18 @@ "\n", "Where do you find information about the output folder structure and what you can find where (Hint: Check the [Usage principles](https://oktoberfest.readthedocs.io/en/latest/usage.html) in the Oktoberfest documentation)?\n", "\n", - "Did rescoring work and provide better results (Results discussion follows)?" + "Did rescoring work and provide better results (Results discussion follows)?\n", + "\n", + "You can use the following code cell to create a zip file of the Oktoberfest output folder you specified, then download the zip file \"oktoberfest.zip\" for easy exploration:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!zip -r \"oktoberfest.zip\" $output_directory " ] }, { @@ -457,7 +468,7 @@ "format = # the desired format for the library, e.g.
\"spectronaut\" or \"msp\", \"msp\" is smaller, \"spectronaut\"\n", "\n", "missedCleavages = # this is the number of missed cleavages that should be allowed (higher values increase prediction time / file size)\n", - "minLength = # minimal allowed peptide length, prosit accepts everythin >= 7\n", + "minLength = # minimal allowed peptide length, prosit accepts everything >= 7\n", "maxLength = # maximal allowed peptide length, prosit accepts everything <= 30\n" ] }, @@ -530,10 +541,10 @@ "\n", "Start the spectral library generation.\n", "\n", - "This step may take a while (TODO minutes). The log output tracks the progress of library generation. Oktoberfest will perform the following steps:\n", + "This step may take a while (3-5 minutes). The log output tracks the progress of library generation. Oktoberfest will perform the following steps:\n", "\n", "- read the fasta file and perform an in-silico digest according to the settings provided in the configuration file\n", - "- acquire fragment intensity and retention time predictions in batches and write them to disk on the fly" + "- get fragment intensity and retention time predictions in batches and write them to disk on the fly" ] }, { From be2cf27c051dcbbc748e9f2897be2134cf962de9 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Sun, 10 Mar 2024 11:26:54 +0100 Subject: [PATCH 9/9] Bump version from 0.6.0 to 0.6.1 --- .cookietemple.yml | 2 +- .github/release-drafter.yml | 4 ++-- cookietemple.cfg | 2 +- docs/conf.py | 4 ++-- oktoberfest/__init__.py | 2 +- pyproject.toml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.cookietemple.yml b/.cookietemple.yml index 2d9e94ca..8b61c22a 100644 --- a/.cookietemple.yml +++ b/.cookietemple.yml @@ -15,5 +15,5 @@ full_name: Victor Giurcoiu email: victor.giurcoiu@tum.de project_name: oktoberfest project_short_description: Public repo oktoberfest -version: 0.6.0 +version: 0.6.1 license: MIT diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml 
index 8f8af9be..96aaae27 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -1,5 +1,5 @@ -name-template: "0.6.0 🌈" # <> -tag-template: 0.6.0 # <> +name-template: "0.6.1 🌈" # <> +tag-template: 0.6.1 # <> exclude-labels: - "skip-changelog" diff --git a/cookietemple.cfg b/cookietemple.cfg index cad364db..0970cb29 100644 --- a/cookietemple.cfg +++ b/cookietemple.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.0 +current_version = 0.6.1 [bumpversion_files_whitelisted] init_file = oktoberfest/__init__.py diff --git a/docs/conf.py b/docs/conf.py index 3371e6e6..f2b0cb20 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -54,9 +54,9 @@ # the built documents. # # The short X.Y version. -version = "0.6.0" +version = "0.6.1" # The full version, including alpha/beta/rc tags. -release = "0.6.0" +release = "0.6.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/oktoberfest/__init__.py b/oktoberfest/__init__.py index 4a8cef93..e0e138f2 100644 --- a/oktoberfest/__init__.py +++ b/oktoberfest/__init__.py @@ -5,7 +5,7 @@ __author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)""" __copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich" __license__ = "MIT" -__version__ = "0.6.0" +__version__ = "0.6.1" import logging.handlers import sys diff --git a/pyproject.toml b/pyproject.toml index 7d1e60f4..defc59e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "oktoberfest" -version = "0.6.0" # <> +version = "0.6.1" # <> description = "Public repo oktoberfest" authors = ["Wilhelmlab at Technical University of Munich"] license = "MIT"