Add test - need to test with spliced UTRs

GMOD · Dec 10, 2024 · 6cf24fd
1 parent 3d18e6c
commit 6cf24fd
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 23 deletions.
diff --git a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts
@@ -201,7 +201,10 @@ describe('CDS without exons', () => {
   it('Convert mRNA with CDS but without exon', () => {
     const [gffFeature] = readFeatureFile('test_data/cds_without_exon.gff')
     const actual = gff3ToAnnotationFeature(gffFeature)
-    assert.deepEqual(JSON.stringify(actual), '')
+    const expected = readAnnotationFeatureSnapshot(
+      'test_data/cds_without_exon.json',
+    )
+    compareFeatures(actual, expected)
   })
 })
 

diff --git a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts
@@ -183,21 +183,23 @@ function convertChildren(
       convertedChildren[child._id] = child
     }
   }
-  const processedCDS =
-    cdsFeatures.length > 0 ? processCDS(cdsFeatures, refSeq, featureIds) : []
 
-  for (const cds of processedCDS) {
-    convertedChildren[cds._id] = cds
-  }
+  if (cdsFeatures.length > 0) {
+    const processedCDS = processCDS(cdsFeatures, refSeq, featureIds)
 
-  const missingExons = inferMissingExons(
-    cdsFeatures,
-    exonFeatures,
-    utrFeatures,
-    refSeq,
-  )
-  for (const exon of missingExons) {
-    convertedChildren[exon._id] = exon
+    for (const cds of processedCDS) {
+      convertedChildren[cds._id] = cds
+    }
+
+    const missingExons = inferMissingExons(
+      cdsFeatures,
+      exonFeatures,
+      utrFeatures,
+      processedCDS[0].refSeq,
+    )
+    for (const exon of missingExons) {
+      convertedChildren[exon._id] = exon
+    }
   }
 
   if (Object.keys(convertedChildren).length > 0) {
@@ -210,15 +212,12 @@ function inferMissingExons(
   cdsFeatures: GFF3Feature[],
   existingExons: GFF3Feature[],
   utrFeatures: GFF3Feature[],
-  refSeq?: string,
+  refSeq: string,
 ): AnnotationFeatureSnapshot[] {
-  if (!refSeq) {
-    return []
-    // throw new Error('refSeq is missing')
-  }
   const missingExons: AnnotationFeatureSnapshot[] = []
   for (const protein of cdsFeatures) {
     for (const cds of protein) {
+      // For each CDS, check whether an existing exon contains it. If not, create an exon with the same coordinates as the CDS.
       let exonFound = false
       for (const x of existingExons) {
         if (x.length != 1) {
@@ -241,19 +240,25 @@ function inferMissingExons(
         if (!cds.start || !cds.end) {
           throw new Error('Invalid CDS feature')
         }
+        let strand: 1 | -1 | undefined = undefined
+        if (cds.strand === '+') {
+          strand = 1
+        } else if (cds.strand === '-') {
+          strand = -1
+        }
         const newExon: AnnotationFeatureSnapshot = {
           _id: new ObjectID().toHexString(),
           refSeq,
           type: 'exon',
           min: cds.start - 1,
           max: cds.end,
-          strand: cds.strand === '+' ? 1 : cds.strand === '-' ? -1 : undefined,
+          strand,
         }
         for (const utr of utrFeatures) {
+          // If the new exon is adjacent to a UTR, merge the UTR
           if (utr.length != 1 || !utr[0].start || !utr[0].end) {
-            throw new Error('Too many  UTRs')
+            throw new Error('Too many UTRs or invalid UTR')
           }
-          // If the new exon is adjacent to a UTR, merge the UTR
           if (utr[0].end === newExon.min) {
             newExon.min = utr[0].start - 1
             break

diff --git a/packages/apollo-shared/test_data/cds_without_exon.gff b/packages/apollo-shared/test_data/cds_without_exon.gff
@@ -3,7 +3,6 @@
 ctgA	example	gene	1050	9000	.	+	.	ID=eden
 ctgA	example	mRNA	1050	9000	.	+	.	ID=eden.1;Parent=eden
 ctgA	example	five_prime_UTR	1050	1210	.	+	0	ID=five1;Parent=eden.1
-ctgA	example	exon	1211	1510	.	+	0	ID=exon2;Parent=eden.1
 ctgA	example	CDS	1211	1510	.	+	0	ID=cds2;Parent=eden.1
 ctgA	example	CDS	1611	1710	.	+	0	ID=cds2;Parent=eden.1
 ctgA	example	three_prime_UTR	1711	1800	.	+	0	ID=three1;Parent=eden.1

diff --git a/packages/apollo-shared/test_data/cds_without_exon.json b/packages/apollo-shared/test_data/cds_without_exon.json
@@ -0,0 +1,74 @@
+{
+  "_id": "67581b7d5890a8eb1bedab6e",
+  "refSeq": "ctgA",
+  "type": "gene",
+  "min": 1049,
+  "max": 9000,
+  "strand": 1,
+  "children": {
+    "67581b7d5890a8eb1bedab6c": {
+      "_id": "67581b7d5890a8eb1bedab6c",
+      "refSeq": "ctgA",
+      "type": "mRNA",
+      "min": 1049,
+      "max": 9000,
+      "strand": 1,
+      "children": {
+        "67581b7d5890a8eb1bedab66": {
+          "_id": "67581b7d5890a8eb1bedab66",
+          "refSeq": "ctgA",
+          "type": "exon",
+          "min": 1200,
+          "max": 1500,
+          "strand": 1,
+          "attributes": { "gff_source": ["example"], "gff_id": ["exon1"] }
+        },
+        "67581b7d5890a8eb1bedab67": {
+          "_id": "67581b7d5890a8eb1bedab67",
+          "refSeq": "ctgA",
+          "type": "CDS",
+          "min": 1210,
+          "max": 1710,
+          "strand": 1,
+          "attributes": { "gff_source": ["example"], "gff_id": ["cds2"] }
+        },
+        "67581b7d5890a8eb1bedab68": {
+          "_id": "67581b7d5890a8eb1bedab68",
+          "refSeq": "ctgA",
+          "type": "CDS",
+          "min": 1200,
+          "max": 1700,
+          "strand": 1,
+          "attributes": { "gff_source": ["example"], "gff_id": ["cds1"] }
+        },
+        "67581b7d5890a8eb1bedab69": {
+          "_id": "67581b7d5890a8eb1bedab69",
+          "refSeq": "ctgA",
+          "type": "exon",
+          "min": 1049,
+          "max": 1510,
+          "strand": 1
+        },
+        "67581b7d5890a8eb1bedab6b": {
+          "_id": "67581b7d5890a8eb1bedab6b",
+          "refSeq": "ctgA",
+          "type": "exon",
+          "min": 1600,
+          "max": 1800,
+          "strand": 1
+        }
+      },
+      "attributes": { "gff_source": ["example"], "gff_id": ["eden.1"] }
+    },
+    "67581b7d5890a8eb1bedab6d": {
+      "_id": "67581b7d5890a8eb1bedab6d",
+      "refSeq": "ctgA",
+      "type": "TF_binding_site",
+      "min": 1049,
+      "max": 1100,
+      "strand": 1,
+      "attributes": { "gff_source": ["example"] }
+    }
+  },
+  "attributes": { "gff_source": ["example"], "gff_id": ["eden"] }
+}