diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py index b6fa3158a..657162946 100644 --- a/qiita_db/metadata_template/base_metadata_template.py +++ b/qiita_db/metadata_template/base_metadata_template.py @@ -823,7 +823,7 @@ def _common_extend_steps(self, md_template): new_cols = set(headers).difference(self.categories()) if not new_cols and not new_samples: - return + return None, None is_extendable, error_msg = self.can_be_extended(new_samples, new_cols) @@ -1723,7 +1723,7 @@ def _identify_column_names_with_invalid_characters(cls, column_names): set of words containing invalid (illegal) characters. """ valid_initial_char = ascii_letters - valid_rest = set(ascii_letters+digits+'_') + valid_rest = set(ascii_letters+digits+'_:|') invalid = [] for s in column_names: if s[0] not in valid_initial_char: diff --git a/qiita_db/metadata_template/test/test_base_metadata_template.py b/qiita_db/metadata_template/test/test_base_metadata_template.py index eace46dad..48a49c045 100644 --- a/qiita_db/metadata_template/test/test_base_metadata_template.py +++ b/qiita_db/metadata_template/test/test_base_metadata_template.py @@ -84,14 +84,17 @@ def test_identify_invalid_characters(self): 'sampleid', 'sample_id', '{', - 'this|is', + 'bla:1', + 'bla|2', + 'bla1:2|3', + 'this&is', '4column', 'just_fine2']) self.assertCountEqual(set(results), {'tax on', 'bla.', '.', '{', - 'this|is', + 'this&is', '4column'}) diff --git a/qiita_db/metadata_template/test/test_sample_template.py b/qiita_db/metadata_template/test/test_sample_template.py index 5d49c7a47..fc7518dc0 100644 --- a/qiita_db/metadata_template/test/test_sample_template.py +++ b/qiita_db/metadata_template/test/test_sample_template.py @@ -838,7 +838,7 @@ def test_clean_validate_template_no_invalid_chars2(self): def test_clean_validate_template_no_invalid_chars3(self): ST = qdb.metadata_template.sample_template.SampleTemplate - self.metadata.rename(columns={'taxon_id': 
'this|is'}, inplace=True) + self.metadata.rename(columns={'taxon_id': 'this&is'}, inplace=True) with self.assertRaises(qdb.exceptions.QiitaDBColumnError): ST._clean_validate_template(self.metadata, 2) @@ -1781,6 +1781,10 @@ def test_extend_update(self): st = qdb.metadata_template.sample_template.SampleTemplate.create( self.metadata, self.new_study) + # test updating with same data, none of the rest of the code/tests + # should change + st.extend_and_update(self.metadata) + self.metadata_dict['Sample4'] = { 'physical_specimen_location': 'location1', 'physical_specimen_remaining': 'true', diff --git a/qiita_pet/handlers/base_handlers.py b/qiita_pet/handlers/base_handlers.py index aa5df62e5..96bf2e4ed 100644 --- a/qiita_pet/handlers/base_handlers.py +++ b/qiita_pet/handlers/base_handlers.py @@ -16,10 +16,10 @@ class BaseHandler(RequestHandler): def get_current_user(self): '''Overrides default method of returning user curently connected''' - username = self.get_secure_cookie(b"user") + username = self.get_secure_cookie("user") if username is not None: # strip off quotes added by get_secure_cookie - username = username.strip("\"' ") + username = username.decode('ascii').strip("\"' ") return User(username) else: self.clear_cookie("user") diff --git a/qiita_pet/handlers/qiita_redbiom.py b/qiita_pet/handlers/qiita_redbiom.py index 194f95f08..4072c2c01 100644 --- a/qiita_pet/handlers/qiita_redbiom.py +++ b/qiita_pet/handlers/qiita_redbiom.py @@ -14,6 +14,7 @@ import redbiom.util import redbiom.fetch from tornado.gen import coroutine, Task +from tornado.web import HTTPError from qiita_core.util import execute_as_transaction from qiita_db.util import generate_study_list_without_artifacts @@ -58,7 +59,7 @@ def _redbiom_feature_search(self, query, contexts): study_artifacts = defaultdict(lambda: defaultdict(list)) query = [f for f in query.split(' ')] for ctx in contexts: - for idx in redbiom.util.ids_from(query, True, 'feature', ctx): + for idx in redbiom.util.ids_from(query, 
False, 'feature', ctx): aid, sample_id = idx.split('_', 1) sid = sample_id.split('.', 1)[0] study_artifacts[sid][aid].append(sample_id) @@ -71,7 +72,12 @@ def _redbiom_taxon_search(self, query, contexts): # find the features with those taxonomies and then search # those features in the samples features = redbiom.fetch.taxon_descendents(ctx, query) - for idx in redbiom.util.ids_from(features, True, 'feature', ctx): + # from empirical evidence we saw that when we return more than 600 + # features we'll reach issue #2312 so avoiding saturating the + # workers and raise this error quickly + if len(features) > 600: + raise HTTPError(504) + for idx in redbiom.util.ids_from(features, False, 'feature', ctx): aid, sample_id = idx.split('_', 1) sid = sample_id.split('.', 1)[0] study_artifacts[sid][aid].append(sample_id) diff --git a/qiita_pet/support_files/doc/source/checklist-for-ebi-ena-submission.rst b/qiita_pet/support_files/doc/source/checklist-for-ebi-ena-submission.rst index 29e2f67e3..7b551f53d 100644 --- a/qiita_pet/support_files/doc/source/checklist-for-ebi-ena-submission.rst +++ b/qiita_pet/support_files/doc/source/checklist-for-ebi-ena-submission.rst @@ -1,8 +1,44 @@ -Checklist to send data to EBI-ENA -================================= +.. role:: red + +Send data to EBI-ENA +==================== + +Qiita allows users to deposit their study, sample, experiment and sequence data to the +`European Nucleotide Archive (ENA) `__, which is the permanent data +repository of the `European Bioinformatics Institute (EBI) `__. Submitting to +this repository will provide you with a unique identifier for your study, which is generally a +requirement for publications. Your study will be housed with all other Qiita submissions +and so we require adherence to the `MiXs standard `__. `Here `__ you will find a document outlining these requirements, with examples, when possible. +Note that submissions are time consuming and need full collaboration from the user. 
+:red:`Do not wait until the last minute to request help.` In general, the best +time to request a submission is when you are writing your paper. Remember that the +data can be submitted to EBI and can be kept private and simply made public when +the paper is accepted. Note that EBI/ENA takes up to 15 days to change the status +from private to public, so consider this when submitting data and your manuscript. + +.. note:: + For convenience Qiita allows you to upload a QIIME mapping file to process your data. However, + the QIIME mapping file, in general, does not have all the EBI/ENA fields. Thus, you will need to + update your information files (sample or preparation) via the update option. To simplify this process, + you can download the system generated files and add/modify these fields for each file. + + +EBI-ENA NULL values vocabulary +------------------------------ + +We support only the following values: *not applicable*, *not collected*, *not provided*, *restricted access*. + +For the latest definitions and explanation visit the `EBI/ENA Missing value reporting `__. + +.. warning:: + Column names in your information files cannot be Postgres reserved words. For example, a column cannot be named `CONDITION`, but could instead be named `DISEASE_CONDITION`. For a full list of these reserved words, see this `link `__. + +Checklist +--------- + For each preparation that needs to be uploaded to EBI-ENA we will check: 1. Data processing @@ -27,6 +63,40 @@ For each preparation that needs to be uploaded to EBI-ENA we will check: 7. *elevation*, *latitude*, *longitude* 8. *empo_1*, *empo_2*, *empo_3* + .. 
table:: + :widths: auto + + =============== ================= ======================= ================================================================================ + empo_1 empo_2 empo_3 Examples + Free-living Non-saline Water (non-saline) fresh water from lake, pond, or river (<5 psu) + Free-living Non-saline Sediment (non-saline) sediment from lake, pond, or river (<5 psu) + Free-living Non-saline Soil (non-saline) soil from forest, grassland, tundra, desert, etc. + Free-living Non-saline Surface (non-saline) biofilm from wet (<5 psu) or dry surface, wood, dust, or microbial mat + Free-living Non-saline Subsurface (non-saline) deep or subsurface environment + Free-living Non-saline Aerosol (non-saline) aerosolized dust or liquid + Free-living Saline Water (saline) salt water from ocean, sea, estuary, mangrove, or coral reef (>5 psu) + Free-living Saline Sediment (saline) sediment from ocean, sea, estuary, mangrove, or beach (>5 psu) + Free-living Saline Hypersaline (saline) water from hypersaline sample or brine (>50 psu) + Free-living Saline Surface (saline) biofilm from wet or underwater surface or microbial mat (>5 psu) + Free-living Saline Aerosol (saline) seaspray or other aerosolized saline material (>5 psu) + Host-associated Animal-associated Animal distal gut feces, stool + Host-associated Animal-associated Animal proximal gut digesta + Host-associated Animal-associated Animal secretion gut intestine, gizzard, crop, lumen, or mucosa + Host-associated Animal-associated Animal surface skin, sebum, mucus, slime + Host-associated Animal-associated Animal corpus tissue of sponge, coral, gill, siphon, carcass, etc. 
or whole small animal + Host-associated Fungus-associated Fungus corpus tissue of mushroom or other fungi + Host-associated Fungus-associated Fungus surface biofilm of mushroom + Host-associated Plant-associated Plant secretion pollen or sap + Host-associated Plant-associated Plant surface leaf or kelp surface biofilm + Host-associated Plant-associated Plant rhizosphere plant root system, may include some soil + Host-associated Plant-associated Plant corpus tissue of leaf, stem, fruit, or algae + Control Negative Sterile water blank sterile water blank used as negative control for extraction, PCR, and sequencing + Control Positive Mock community known mixed community used as positive control + Control Positive Single strain known single strain control culture + Unknown Contradictory Unknown (contradictory) unknown sample type because other metadata is contradictory + Unknown Missing Unknown (missing) unknown sample type because metadata is unavailable + =============== ================= ======================= ================================================================================ + c. Extra minimal information for host associated studies: 1. *host_body_habitat*, *host_body_site*, *host_body_product* diff --git a/qiita_pet/support_files/doc/source/europeanbioinformaticsinstitute.rst b/qiita_pet/support_files/doc/source/europeanbioinformaticsinstitute.rst deleted file mode 100755 index 4ab6a2a8b..000000000 --- a/qiita_pet/support_files/doc/source/europeanbioinformaticsinstitute.rst +++ /dev/null @@ -1,121 +0,0 @@ -.. role:: red - -EBI Submission via Qiita -======================== - - 1. Upload sample information and preparation information with their required fields for amplicon sequencing - 2. Link appropriate sequence files - 3. Run split libraries, trimming, then deblur or closed reference commands - -* Note: when using closed reference make sure you're using Greengenes for 16S :ref:`[7]`, Silva for 18S :ref:`[8]`, and UNITE for ITS :ref:`[9]`. 
- -Qiita allows users to deposit their study, sample, experiment and sequence data to the -`European Nucleotide Archive (ENA) `__, which is the permanent data -repository of the `European Bioinformatics Institute (EBI) `__. Submitting to -this repository will provide you with a unique identifier for your study, which is generally a -requirement for publications. Your study will be housed with all other Qiita submissions -and so we require adherence to the `MiXs standard `__. - -EBI/ENA requires a given set of column fields to describe your samples and experiments described below. - -Creating a sample information template on our website using `Qiimp `__ will ensure your -data is EBI/ENA compliant. Alternatively, you can refer to the example template which can be found on -the `Knight Lab website `__ under "MetaData Template" and "Prep Template". -Without these, **Qiita Admins** will not be able to submit your data to EBI. If you want to submit your data or need -help send an email to `qiita.help@gmail.com `__ and please include your study ID. Help will include -advice on additional fields to add to ensure MiXs compliance. - -Note that submissions are time consuming and need full collaboration from the user. -:red:`Do not wait until the last minute to request help.` In general, the best -time to request a submission is when you are writing your paper. Remember that the -data can be submitted to EBI and can be kept private and simply make public when -the paper is accepted. Note that EBI/ENA takes up to 15 days to change the status -from private to public, so consider this when submitting data and your manuscript. - -.. note:: - For convenience Qiita allows you to upload a QIIME mapping file to process your data. However, - the QIIME mapping file, in general, does not have all the EBI/ENA fields. Thus, you will need to - update your information files (sample or preparation) via the update option. 
To simplify this process, - you can download the system generated files and add/modify these fields for each file. - - -EBI-ENA NULL values vocabulary ------------------------------- - -We support only the following values: *not applicable*, *not collected*, *not provided*, *restricted access*. - -For the latest definitions and explanation visit the `EBI/ENA Missing value reporting `__. - -.. warning:: - Column names in your information files cannot be named as a Postgres reserved word. For example, a column cannot be named `CONDITION`, but could instead be named `DISEASE_CONDITION`. For a full list of these reserved words, see this `link `__. - - -Required Sample Information Fields for EBI submission -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -These are the columns required for successfully submit any data to EBI via Qiita: -sample_name, collection_device, collection_method, collection_timestamp, description, dna_extracted, elevation, elevation_units, empo_1, -empo_2, empo_3, env_biome, env_feature, env_material, env_package, geo_loc_name, host_subject_id, latitude, latitude_units, longitude, -longitude_units, physical_specimen_location, physical_specimen_remaining, sample_type, scientific_name, taxon_id, taxon_id_units, title, tube_id. - -If your samples are related to animals you will also need: -host_age, host_age_units, host_body_habitat, host_body_product, host_body_site, host_common_name, host_height, host_height_units, -host_scientific_name, host_taxid, host_taxid_units, host_weight, host_weight_units, life_stage, sex, time_point, time_point_units. - -If your samples are related to humans you will also need: -host_body_mass_index, host_body_mass_index_units, irb_institute, irb_protocol_id. - -Please note that personally identifiable health information and protected health information (PHI) should NOT be supplied. 
For -information regarding the rules for defining PHI you can reference the -`CMI User Information Sheet: The De-Identification of Protected Health Information `__. - -We recommend creation of a metadata template for your study using `Qiimp `__ as this tool -enables you to automatically ensure compliance with EBI and MIMARKS standards and enable your data to be consistent with other -studies used in Qiita to maximize your ability to perform meta-analyses. - -Alternatively, you can refer to the example sample information spread sheet under "MetaData Template" -at the `Knight Lab website `__. - -Without these columns you will not be able to submit to EBI. - -Required Prep Information Fields for EBI submission -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To successfully submit your data to EBI, you will also need to include a minimal number of columns in your -preparation information depending on your data type: - -Amplicon Sequencing Data (16S, 18S, ITS, etc.) -sample_name, barcode, linkerprimer (use primer for older 454), experiment_design_description, library, library_construction_protocol, -linker, platform, run_center, run_date, run_prefix, pcr_primers, sequencing_meth, target_gene, target_subfragment, center_name, center_project_name, -instrument_model, runid. - -Metagenomic Sequencing Data (WGS, WGMS, "shotgun", etc.) -sample_name, experiment_design_description, library_construction_protocol, platform, run_center, run_date, run_prefix, sequencing_meth, center_name, -center_project_name, instrument_model, run_id, forward_read, reverse_read, sample_plate, sample_well, i7_index_id (Illumina only), index, -i5_index_id (Illumina only), index2, sample_project, well_description. - -Metabolomics Data: -sample_name, experiment_design_description, run_center, run_date, run_prefix, extraction_solvent, center_name, center_project_name, sample_plate, -sample_well, well_description. 
- -For descriptions of these fields, you can view the required columns listed on the preparation information spread sheet under "Prep Template" -on the `Knight Lab website `__. - -Without these columns you will not be able to submit to EBI. - -For sequencing data, all valid values for instrument_model per platform, view the values in the table below: - -+-----------------+----------------------------------------------------------------------------------------------------------+ -| Platform | Valid instrument_model options | -+=================+==========================================================================================================+ -| ``LS454`` | ``454 GS``, ``454 GS 20``, ``454 GS FLX``, ``454 GS FLX+``, ``454 GS FLX Titanium``, ``454 GS Junior``, | -| | ``454 GS Junior`` or ``unspecified`` | -+-----------------+----------------------------------------------------------------------------------------------------------+ -| ``Illumina`` | ``HiSeq X Five``, ``HiSeq X Ten``, ``Illumina Genome Analyzer``, ``Illumina Genome Analyzer II``, | -| | ``Illumina Genome Analyzer IIx``, ``Illumina HiScanSQ``, ``Illumina HiSeq 1000``, | -| | ``Illumina HiSeq 1500``,, ``Illumina HiSeq 2000``, ``Illumina HiSeq 2500``, ``Illumina HiSeq 3000``, | -| | ``Illumina HiSeq 4000``, ``Illumina MiSeq``, ``Illumina MiniSeq``, ``Illumina NovaSeq 6000``, | -| | ``NextSeq 500``, ``NextSeq 550``, or ``unspecified`` | -+-----------------+----------------------------------------------------------------------------------------------------------+ -| ``Ion Torrent`` | ``Ion Torrent PGM``, ``Ion Torrent Proton``, ``Ion Torrent S5``, ``Ion Torrent S5 XL`` | -+-----------------+----------------------------------------------------------------------------------------------------------+ diff --git a/qiita_pet/support_files/doc/source/index.rst b/qiita_pet/support_files/doc/source/index.rst index 22316e563..18efc7f97 100755 --- a/qiita_pet/support_files/doc/source/index.rst +++ 
b/qiita_pet/support_files/doc/source/index.rst @@ -38,7 +38,6 @@ Looking for information about submitting your files to EBI? Please see the docum .. toctree:: - europeanbioinformaticsinstitute.rst checklist-for-ebi-ena-submission Looking for comparable studies? Please see the document here: diff --git a/qiita_pet/support_files/doc/source/redbiom.rst b/qiita_pet/support_files/doc/source/redbiom.rst index 68d2ef60d..e58ee97e6 100755 --- a/qiita_pet/support_files/doc/source/redbiom.rst +++ b/qiita_pet/support_files/doc/source/redbiom.rst @@ -15,22 +15,29 @@ Search Options -------------- * **Metadata**: - * The search will be on the full metadata. - * The metadata search engine uses natural language processing to search for word stems within a samples metadata. A word stem disregards modifiers and plurals, so for instance, a search for "antibiotics" will actually perform a search for "antibiot". Similarly, a search for "crying" will actually search for "cry". The words specified can be combined with set-based operations, so for instance, a search for "antibiotics & crying" will obtain the set of samples in which each sample has "antibiot" in its metadata as well as "cry". N.B., the specific category in which a stem is found is not assured to be the same, "antibiot" could be in one category and "cry" in another. A set intersection can be performed with "&", a union with "|" and a difference with "-". - * In addition to the stem-based search, value based searches can also be a applied. These use a Python-like grammar and allow for a rich set of comparisons to be performed based on a metadata category of interest. For example, "where qiita_study_id == 10317" will find all samples which have the qiita_study_id metadata category, and in which the value for that sample is "10317." - * Examples: + * The search will be on the **full metadata**. 
+ * **Natural language processing:** The metadata search engine uses natural language processing to search for word stems within a sample's metadata. A word stem disregards modifiers and plurals, so for instance, a search for *antibiotics* will actually perform a search for *antibiot*. Similarly, a search for *crying* will actually search for *cry*. The words specified can be combined with set-based operations, so for instance, a search for *antibiotics & crying* will obtain the set of samples in which each sample has *antibiot* in its metadata as well as *cry*. - * Find all samples in which the word infant exists, as well as antibiotics, where the infants are under a year old: + N.B., the specific category in which a stem is found is not assured to be the same: *antibiot* could be in one category and *cry* in another. A set intersection can be performed with "&", a union with "|" and a difference with "-". + * **Value search:** In addition to the stem-based search, value-based searches can also be applied. These use a Python-like grammar and allow for a rich set of comparisons to be performed based on a metadata category of interest. For example, *where qiita_study_id == 10317* will find all samples which have the *qiita_study_id* metadata category, and in which the value for that sample is *10317*. 
+ * **Examples:** + + * Find all samples in which both the words 'infant' and 'antibiotics' exist, and where the infants are under a year old: * *infant & antibiotics where age_years <= 1* - * Find all samples only belonging to the EMP in which the ph is under 7 for a variety of sample types: + * Find all samples only belonging to the EMP in which the pH is under 7, for a variety of sample types: + + * soil: + *soil where ph < 7 and emp_release1 == 'True'* + + * ocean water: + *water & ocean where ph > 7 and emp_release1 == 'True'* - * soil: *soil where ph < 7 and emp_release1 == 'True'* - * ocean water: *water & ocean where ph > 7 and emp_release1 == 'True'* - * non-ocean water: *water - ocean where ph > 7 and emp_release1 == 'True'* + * non-ocean water: + *water - ocean where ph > 7 and emp_release1 == 'True'* - * Or instead of ph you could search for a different metadata category: + * Or instead of pH you could search for a different metadata category: * *water & ocean where salinity > 20* @@ -43,17 +50,18 @@ Search Options * **Feature**: - * The search will be on all the features, in specific: OTU ids for close reference and exact sequences for deblur. + * The search will be on all the features, specifically: **OTU ids for closed reference** or **exact sequences for deblur**. - * Examples: + * **Examples:** - * Find all samples in which the Greengenes feature 4479944 is found: "4479944" + * Find all samples in which the Greengenes feature 4479944 is found: *4479944* + * Find all samples in which the following sequence exists: *TACGAAGGGTGCAAGCATTACTCGGAATTACTGGGCGTAAAGCGTGCGTAGGTGGTTCGTTAAGTCTGATGTGAAAGCCCTGGGCTCAACCTGGGAACTG* * **Taxon**: - * The search will be only on closed reference and based on the taxonomies available. Only exact matches are returned. Note that currently only the Greengenes taxonomy is searchable, and that it requires nomenclature of a rank prefix, two underscores, and then the name. 
+ * The search will be **only on closed reference data** and based on the taxonomies available. Only exact matches are returned. Note that currently **only the Greengenes taxonomy** is searchable, and that it requires nomenclature of a rank prefix, two underscores, and then the name. - * Examples: + * **Examples:** - * Find all samples in which the genera Escherichia is found: "g__Escherichia" - * Find all samples in which the order Clostridiales is found: "o__Clostridiales" + * Find all samples in which the genera Escherichia is found: *g__Escherichia* + * Find all samples in which the phylum Tenericutes is found: *p__Tenericutes* diff --git a/qiita_pet/templates/redbiom.html b/qiita_pet/templates/redbiom.html index 62aa7f472..749dfe6ae 100644 --- a/qiita_pet/templates/redbiom.html +++ b/qiita_pet/templates/redbiom.html @@ -174,104 +174,9 @@ Redbiom only searches on public data. Last update: December 18th, 2018. Note that you will only be able to expand and add artifacts to analyses if you are signed into Qiita.

- + Help and examples?
-
-
- We have 3 search options: -
    -
  • - Metadata
    - The search will be on the full metadata. -

    - The metadata search engine uses natural language processing to search for - word stems within a samples metadata. A word stem disregards modifiers and - plurals, so for instance, a search for "antibiotics" will actually perform - a search for "antibiot". Similarly, a search for "crying" will actually - search for "cry". The words specified can be combined with set-based - operations, so for instance, a search for "antibiotics & crying" will - obtain the set of samples in which each sample has "antibiot" in its - metadata as well as "cry". N.B., the specific category in which a stem is - found is not assured to be the same, "antibiot" could be in one category - and "cry" in another. A set intersection can be performed with "&", a - union with "|" and a difference with "-". -

    - In addition to the stem-based search, value based searches can also be a - applied. These use a Python-like grammar and allow for a rich set of - comparisons to be performed based on a metadata category of interest. For - example, "where qiita_study_id == 10317" will find all samples which have - the qiita_study_id metadata category, and in which the value for that - sample is "10317." -

    - Examples: -
    -
      -
    • - Find all samples in which the word infant exists, as well as antibiotics, - where the infants are under a year old: -
        -
      • - infant & antibiotics where age_years <= 1 -
      • -
      -
    • -
    • - Find all samples only belonging to the EMP in which the ph is under 7 for a variety of sample types: -
        -
      • soil: soil where ph < 7 and emp_release1 == 'True'
      • -
      • ocean water: water & ocean where ph > 7 and emp_release1 == 'True'
      • -
      • non-ocean water: water - ocean where ph > 7 and emp_release1 == 'True'
      • -
      -
    • Or instead of ph you could search for a different metadata category:
    • -
        -
      • water & ocean where salinity > 20
      • -
      - -
    • Some other interesting examples: -
        -
      • feces & canine
      • -
      • (beer | cider | wine | alcohol)
      • -
      • where sample_type == 'stool'
      • -
      • usa where sample_type == 'stool' and host_taxid == 9606
      • -
      -
    • - - -
    -
  • -
  • - Feature:
    - The search will be on all the features, in specific: OTU ids for close reference and exact sequences for deblur. -

    - Examples: -
    -
      -
    • - Find all samples in which the Greengenes feature 4479944 is found: "4479944" -
    • -
    -
  • -
  • Taxon:
    - The search will be only on closed reference and based on the taxonomies available. - Only exact matches are returned. Note that currently only the Greengenes taxonomy is - searchable, and that it requires nomenclature of a rank prefix, two underscores, and then the - name. -

    - Examples: -
    -
      -
    • - Find all samples in which the genera Escherichia is found: "g__Escherichia" -
    • -
    • - Find all samples in which the order Clostridiales is found: "o__Clostridiales" -
    • -
    -
  • -
-
-

diff --git a/qiita_pet/test/test_qiita_redbiom.py b/qiita_pet/test/test_qiita_redbiom.py index c79bd6b20..11a8fe0ae 100644 --- a/qiita_pet/test/test_qiita_redbiom.py +++ b/qiita_pet/test/test_qiita_redbiom.py @@ -91,11 +91,17 @@ def test_post_taxon(self): } data = deepcopy(DATA) data[0]['artifact_biom_ids'] = { - '5': ['1.SKM3.640197'], '4': ['1.SKM3.640197']} + '5': sorted(['1.SKD2.640178', '1.SKM3.640197']), + '4': sorted(['1.SKM3.640197', '1.SKD2.640178'])} response = self.post('/redbiom/', post_args) exp = {'status': 'success', 'message': '', 'data': data} + # making sure they are in the same order + obs = loads(response.body) + obs['data'][0]['artifact_biom_ids'] = { + '4': sorted(obs['data'][0]['artifact_biom_ids']['4']), + '5': sorted(obs['data'][0]['artifact_biom_ids']['5'])} self.assertEqual(response.code, 200) - self.assertEqual(loads(response.body), exp) + self.assertEqual(obs, exp) post_args = { 'search': 'o_0319-7L14', diff --git a/qiita_ware/commands.py b/qiita_ware/commands.py index 534b46570..aae4c83b1 100644 --- a/qiita_ware/commands.py +++ b/qiita_ware/commands.py @@ -212,13 +212,7 @@ def submit_EBI(artifact_id, action, send, test=False, test_size=False): LogEntry.create( 'Runtime', 'The submission: %d is larger than allowed (%d), will ' 'try to fix: %d' % (artifact_id, max_size, total_size)) - # let's confirm that we are only dealing with the latest samples and - # then convert them to a DataFrame for easier cleanup - new_samples = { - sample for sample, accession in viewitems( - ebi_submission.prep_template.ebi_experiment_accessions) - if accession is None} - new_samples = new_samples.intersection(ebi_submission.samples) + # transform current metadata to dataframe for easier curation rows = {k: dict(v) for k, v in viewitems(ebi_submission.samples)} df = pd.DataFrame.from_dict(rows, orient='index') # remove unique columns and same value in all columns @@ -226,21 +220,18 @@ def submit_EBI(artifact_id, action, send, test=False, test_size=False): 
nsamples = len(df.index) cols_to_drop = set( nunique[(nunique == 1) | (nunique == nsamples)].index) + # maximize deletion by removing also columns that are almost all the + # same or almost all unique cols_to_drop = set( nunique[(nunique <= int(nsamples * .01)) | (nunique >= int(nsamples * .5))].index) cols_to_drop = cols_to_drop - {'taxon_id', 'scientific_name', 'description'} - df.drop(columns=cols_to_drop, inplace=True) - # let's overwrite samples - ebi_submission.samples = {k: r.to_dict() for k, r in df.iterrows()} + all_samples = ebi_submission.sample_template.ebi_sample_accessions + samples = {k: all_samples[k] for k in ebi_submission.samples} ebi_submission.write_xml_file( - ebi_submission.generate_sample_xml(new_samples), + ebi_submission.generate_sample_xml(samples, cols_to_drop), ebi_submission.sample_xml_fp) - # let's do the same with the prep - ebi_submission.write_xml_file( - ebi_submission.generate_experiment_xml(new_samples), - ebi_submission.experiment_xml_fp) # now let's recalculate the size to make sure it's fine new_total_size = sum([stat(tr).st_size diff --git a/qiita_ware/ebi.py b/qiita_ware/ebi.py index 912c32632..41d1995b9 100644 --- a/qiita_ware/ebi.py +++ b/qiita_ware/ebi.py @@ -380,7 +380,7 @@ def generate_study_xml(self): return study_set - def generate_sample_xml(self, samples=None): + def generate_sample_xml(self, samples=None, ignore_columns=None): """Generates the sample XML file Parameters @@ -388,6 +388,9 @@ def generate_sample_xml(self, samples=None): samples : list of str, optional The list of samples to be included in the sample xml. 
If not provided or an empty list is provided, all the samples are used + ignore_columns : list of str, optional + The list of columns to ignore during submission; helpful for when + the submissions are too large Returns ------- @@ -433,6 +436,9 @@ def generate_sample_xml(self, samples=None): description.text = escape(clean_whitespace(text)) if sample_info: + if ignore_columns is not None: + for key in ignore_columns: + del sample_info[key] sample_attributes = ET.SubElement(sample, 'SAMPLE_ATTRIBUTES') self._add_dict_as_tags_and_values(sample_attributes, 'SAMPLE_ATTRIBUTE',