varfish-org · holtgrewe · Feb 24, 2023 · Feb 21, 2023 · Feb 24, 2023 · Feb 24, 2023
diff --git a/.gitattributes b/.gitattributes
@@ -1,3 +1,5 @@
 src/static_data/**/*.json* filter=lfs diff=lfs merge=lfs -text
-tests/data/**.gz
-tests/data/data/*.gz filter=lfs diff=lfs merge=lfs -text
+tests/data/*.gz filter=lfs diff=lfs merge=lfs -text
+tests/data/*/*.gz filter=lfs diff=lfs merge=lfs -text
+tests/data/*.fasta filter=lfs diff=lfs merge=lfs -text
+tests/data/*/*.fasta filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -31,6 +31,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
+        with:
+          lfs: true
 
       - name: Install stable toolchain
         uses: actions-rs/toolchain@v1
@@ -69,12 +71,14 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v2
         with:
-          lfs: true
+          lfs: 'true'
 
       - name: Import test database.
         run: |
+          set -euo pipefail
           zcat tests/data/data/uta_20210129-subset.pgd.gz \
           | psql -v ON_ERROR_STOP=1 -U uta_admin -h 0.0.0.0 -d uta
+        shell: bash
         env:
           PGPASSWORD: uta_admin
 
@@ -94,6 +98,8 @@ jobs:
         env:
           TEST_UTA_DATABASE_URL: postgres://uta_admin:uta_admin@0.0.0.0/uta
           TEST_UTA_DATABASE_SCHEMA: uta_20210129
+          TEST_SEQREPO_CACHE_MODE: read
+          TEST_SEQREPO_CACHE_PATH: tests/data/seqrepo_cache.fasta
 
       - name: Codecov
         uses: codecov/codecov-action@v3

diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@
 
 *~
 .*.sw?
+/.vscode
diff --git a/Cargo.toml b/Cargo.toml
@@ -5,14 +5,21 @@ edition = "2021"
 
 [dependencies]
 anyhow = "1.0.69"
+bio = "1.1.0"
 chrono = "0.4.23"
 enum-map = "2.4.2"
 flate2 = "1.0.25"
 lazy_static = "1.4.0"
 linked-hash-map = "0.5.6"
+log = "0.4.17"
 nom = "7.1.3"
 postgres = { version = "0.19.4", features = ["with-chrono-0_4"] }
 pretty_assertions = "1.3.0"
 regex = "1.7.1"
+seqrepo = { git = "https://github.com/bihealth/seqrepo-rs.git", branch = "27-whole-fasta-sequences-are-read" }
 serde = { version = "1.0.152", features = ["derive"] }
 serde_json = "1.0.93"
+
+[dev-dependencies]
+env_logger = "0.10.0"
+test-log = "0.2.11"
diff --git a/README.md b/README.md
@@ -15,5 +15,47 @@ To use the public database:
 ```
 export TEST_UTA_DATABASE_URL=postgres://anonymous:anonymous@uta.biocommons.org:/uta
 export TEST_UTA_DATABASE_SCHEMA=uta_20210129
-$ cargo test
 ```
+
+Note that [seqrepo-rs](https://github.com/bihealth/seqrepo-rs) is used for access to the genome contig sequence.
+It is inconvenient to provide subsets of sequences in SeqRepo format.
+Instead, we use a build-cache/read-cache approach that is also used by `biocommons/hgvs`.
+
+To build the cache, you will first need a download of the seqrepo [as described in biocommons/biocommons.seqrepo Quickstart](https://github.com/biocommons/biocommons.seqrepo#quick-start).
+Then, you configure the running of tests for `hgvs-rs` as follows:
+
+```
+export TEST_SEQREPO_CACHE_MODE=write
+export TEST_SEQREPO_PATH=path/to/seqrepo/instance
+export TEST_SEQREPO_CACHE_PATH=tests/data/seqrepo_cache.fasta
+```
+
+When running the tests with `cargo test`, the cache file will be (re-)written.
+If you don't want to regenerate the cache then you can use the following settings.
+With these settings, the cache will only be read.
+
+```
+export TEST_SEQREPO_CACHE_MODE=read
+export TEST_SEQREPO_CACHE_PATH=tests/data/seqrepo_cache.fasta
+```
+
+After either of these, you can run the tests.
+
+```
+cargo test
+```
+
+## Creating Reduced UTA Databases
+
+The script `tests/data/data/bootstrap.sh` allows you to easily build a reduced set of the UTA database given a list of genes.
+The process is as follows:
+
+1. You edit `bootstrap.sh` to include the HGNC gene symbols of the transcripts that you want to use.
+2. You run the bootstrap script.
+   This will download the given UTA dump and reduce it to the information related to these transcripts.
+
+```
+$ bootstrap.sh http://dl.biocommons.org/uta uta_20210129
+```
+
+The `*.pgd.gz` file is added to the Git repository via `git-lfs` and in CI, this minimal database will be used.
diff --git a/src/data/interface.rs b/src/data/interface.rs
@@ -214,7 +214,9 @@ pub trait Provider {
     /// # Arguments
     ///
     /// * `ac` -- accession
-    fn get_seq(&self, ac: &str) -> Result<String, anyhow::Error>;
+    fn get_seq(&self, ac: &str) -> Result<String, anyhow::Error> {
+        self.get_seq_part(ac, None, None)
+    }
 
     /// Return sequence part for the given accession.
     ///

diff --git a/src/data/mod.rs b/src/data/mod.rs
@@ -1,3 +1,4 @@
 ///! Datatypes, interfaces, and data acess.
 pub mod interface;
 pub mod uta;
+pub mod uta_sr;
diff --git a/src/data/uta.rs b/src/data/uta.rs
@@ -21,8 +21,7 @@ pub struct Config {
     /// URL with the connection string, e.g.
     /// `"postgresql://anonymous:anonymous@uta.biocommons.org/uta'"`.
     pub db_url: String,
-    /// The databaser schema to use, corresponds to the data version, e.g.,
-    /// `uta_20210129`.
+    /// The database schema to use, corresponds to the data version, e.g., `uta_20210129`.
     pub db_schema: String,
 }
 
@@ -155,6 +154,10 @@ impl TryFrom<Row> for TxMappingOptionsRecord {
     }
 }
 
+/// This provider provides information from a UTA Postgres database only.
+///
+/// The sequences are also read from the database which implies that no genome contig information
+/// is available.  Use `uta_sr::Provider` for a variant that is enabled to use a SeqRepo.
 pub struct Provider {
     /// Configuration for the access.
     config: Config,
@@ -238,10 +241,6 @@ impl ProviderInterface for Provider {
         Ok(None)
     }
 
-    fn get_seq(&self, ac: &str) -> Result<String, anyhow::Error> {
-        self.get_seq_part(ac, None, None)
-    }
-
     fn get_seq_part(
         &self,
         ac: &str,