Skip to content

Commit

Permalink
Add feature import conversion tests (GMOD#441)
Browse files Browse the repository at this point in the history
* Temp commit

* Use chai to deeply compare annotation features

* Some working tests

* Fix import

* Tests for feature conversion (GMOD#429)

Test `Convert example 3` fails because there should be 4 CDSs in the
converted feature object, but the current code detects only 3

Test `Convert braker gff` fails because the converted feature still includes
intron, start_codon, and stop_codon features

* Fix linter

* Fix CDS overlap calculation

* Ignore more calculated properties on conversion

---------

Co-authored-by: Garrett Stevens <stevens.garrett.j@gmail.com>
  • Loading branch information
dariober and garrettjstevens authored Sep 21, 2024
1 parent 41cf781 commit 1b94250
Show file tree
Hide file tree
Showing 19 changed files with 1,230 additions and 76 deletions.
2 changes: 1 addition & 1 deletion packages/apollo-cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
"@istanbuljs/esm-loader-hook": "^0.2.0",
"@istanbuljs/nyc-config-typescript": "^1.0.2",
"@oclif/test": "^3.1.3",
"@types/chai": "^4",
"@types/chai": "^4.3.19",
"@types/cli-progress": "^3",
"@types/inquirer": "^9.0.7",
"@types/mocha": "^10",
Expand Down
3 changes: 3 additions & 0 deletions packages/apollo-shared/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@
"devDependencies": {
"@nestjs/common": "^10.1.0",
"@nestjs/core": "^10.1.0",
"@types/chai": "^4.3.19",
"@types/node": "^18.14.2",
"@types/rimraf": "^3",
"chai": "^5.1.1",
"chai-exclude": "^3.0.0",
"glob": "^11.0.0",
"mobx": "^6.6.1",
"mobx-state-tree": "^5.1.7",
Expand Down
170 changes: 165 additions & 5 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
/* eslint-disable @typescript-eslint/no-floating-promises */
import { strict as assert } from 'node:assert'
import { describe, it } from 'node:test'
import gff from '@gmod/gff'
import { readFileSync } from 'node:fs'

import gff, { GFF3Feature } from '@gmod/gff'
import { assert, use } from 'chai'
import chaiExclude from 'chai-exclude'

import { gff3ToAnnotationFeature } from './gff3ToAnnotationFeature'
import { AnnotationFeatureSnapshot } from '@apollo-annotation/mst'

use(chaiExclude)

const testCases: [string, string, AnnotationFeatureSnapshot][] = [
[
'a feature with no children',
Expand All @@ -23,18 +28,173 @@ const testCases: [string, string, AnnotationFeatureSnapshot][] = [
},
},
],
[
'a feature with two children',
`ctgA est EST_match 1050 3202 . + . ID=Match1;Name=agt830.5;Target=agt830.5 1 654
ctgA est match_part 1050 1500 . + . Parent=Match1;Name=agt830.5;Target=agt830.5 1 451
ctgA est match_part 3000 3202 . + . Parent=Match1;Name=agt830.5;Target=agt830.5 452 654
`,
{
_id: '66cf9fbb4e947fa2c27d3d6a',
refSeq: 'ctgA',
type: 'EST_match',
min: 1049,
max: 3202,
strand: 1,
children: {
'66cf9fbb4e947fa2c27d3d68': {
_id: '66cf9fbb4e947fa2c27d3d68',
refSeq: 'ctgA',
type: 'match_part',
min: 1049,
max: 1500,
strand: 1,
attributes: {
gff_source: ['est'],
gff_name: ['agt830.5'],
gff_target: ['agt830.5 1 451'],
},
},
'66cf9fbb4e947fa2c27d3d69': {
_id: '66cf9fbb4e947fa2c27d3d69',
refSeq: 'ctgA',
type: 'match_part',
min: 2999,
max: 3202,
strand: 1,
attributes: {
gff_source: ['est'],
gff_name: ['agt830.5'],
gff_target: ['agt830.5 452 654'],
},
},
},
attributes: {
gff_source: ['est'],
gff_id: ['Match1'],
gff_name: ['agt830.5'],
gff_target: ['agt830.5 1 654'],
},
},
],
]

/**
 * Variant of `AnnotationFeatureSnapshot` used only for comparisons in these
 * tests: every property is inherited unchanged except `children`, which is
 * re-declared as an array of this same type (recursively).
 *
 * NOTE(review): presumably this exists because the snapshot's `children` is an
 * object keyed by each child's `_id`, and generated ids differ between runs —
 * confirm against the `AnnotationFeatureSnapshot` definition.
 */
interface AnnotationFeatureSnapshotWithChildrenArray
extends Omit<AnnotationFeatureSnapshot, 'children'> {
// Optional: a feature without children simply omits the property.
children?: AnnotationFeatureSnapshotWithChildrenArray[]
}

/**
 * Recursively replace the `children` collection of a feature snapshot with an
 * array holding the converted children, in `Object.values` order.
 *
 * @param feature - Snapshot whose `children` (if any) should be turned into an array.
 * @returns The very same object when `feature` has no `children`; otherwise a
 *   shallow copy of `feature` whose `children` is the array of converted children.
 */
function childrenToArray(
  feature: AnnotationFeatureSnapshot,
): AnnotationFeatureSnapshotWithChildrenArray {
  if (feature.children) {
    const converted: AnnotationFeatureSnapshotWithChildrenArray[] = []
    for (const child of Object.values(feature.children)) {
      converted.push(childrenToArray(child))
    }
    // Copy rather than mutate so the caller's snapshot is left untouched.
    return { ...feature, children: converted }
  }
  // Nothing to convert: hand back the original object as-is.
  return feature as AnnotationFeatureSnapshotWithChildrenArray
}

/**
 * Assert that two feature snapshots are deeply equal while ignoring `_id`.
 *
 * NOTE(review): this span comes from a rendered diff and interleaves two
 * versions of the body. The unterminated `assert.deepEqual(` call (its three
 * lines directly below the signature) looks like the removed pre-change code,
 * and the `assert.deepEqualExcludingEvery(...)` call looks like its
 * replacement — confirm against the repository before relying on this text.
 */
function compareFeatures(
feature1: AnnotationFeatureSnapshot,
feature2: AnnotationFeatureSnapshot,
) {
assert.deepEqual(
{ ...feature1, _id: undefined },
{ ...feature2, _id: undefined },
// Both snapshots go through childrenToArray before being compared, with the
// '_id' key excluded from the comparison.
assert.deepEqualExcludingEvery(
childrenToArray(feature1),
childrenToArray(feature2),
'_id',
)
}

/**
 * Read a GFF3 file from disk and parse it into features.
 *
 * Every line that starts with `#` is removed before the text is handed to the
 * parser, so only the remaining lines are parsed.
 *
 * @param fn - Path of the GFF3 file to read.
 * @returns The parser output, typed as an array of GFF3 features.
 */
function readFeatureFile(fn: string): GFF3Feature[] {
  const gffText = readFileSync(fn)
    .toString()
    .split('\n')
    .filter((line) => !line.startsWith('#'))
    .join('\n')
  return gff.parseStringSync(gffText) as GFF3Feature[]
}

/**
 * Load an expected feature snapshot from a JSON file.
 *
 * The parsed value is not validated; it is simply asserted to have the
 * snapshot type.
 *
 * @param fn - Path of the JSON file to read.
 * @returns The parsed JSON content, typed as an `AnnotationFeatureSnapshot`.
 * @throws SyntaxError if the file content is not valid JSON.
 */
function readAnnotationFeatureSnapshot(fn: string): AnnotationFeatureSnapshot {
  const parsed: unknown = JSON.parse(readFileSync(fn).toString())
  return parsed as AnnotationFeatureSnapshot
}

// Parse the shared examples file once, at module load, and keep the first four
// features it yields; they are the inputs of the "Convert example 1"-"4" tests.
const [ex1, ex2, ex3, ex4] = readFeatureFile(
'test_data/gene_representations.gff3',
)

// Fixture-driven conversion tests: each case converts a parsed GFF3 feature with
// gff3ToAnnotationFeature and checks the result against a stored JSON snapshot
// and/or against counts of feature types found in the serialized output.
describe('gff3ToAnnotationFeature examples', () => {
it('Convert one CDS', () => {
const actual = gff3ToAnnotationFeature(
readFeatureFile('test_data/one_cds.gff3')[0],
)
const expected = readAnnotationFeatureSnapshot('test_data/one_cds.json')
compareFeatures(actual, expected)
})
it('Convert two CDSs', () => {
const actual = gff3ToAnnotationFeature(
readFeatureFile('test_data/two_cds.gff3')[0],
)
const expected = readAnnotationFeatureSnapshot('test_data/two_cds.json')
compareFeatures(actual, expected)
})
it('Convert example 1', () => {
const actual = gff3ToAnnotationFeature(ex1)
// Serialize with 2-space indentation so the `"type": "..."` patterns below
// match the pretty-printed key/value spacing.
const txt = JSON.stringify(actual, null, 2)

// The converted feature must contain exactly 4 CDS features and 1 TF_binding_site.
assert.equal(txt.match(/"type": "CDS"/g)?.length, 4)
assert.equal(txt.match(/"type": "TF_binding_site"/g)?.length, 1)

const expected = readAnnotationFeatureSnapshot('test_data/example01.json')
compareFeatures(actual, expected)
})
it('Convert example 2', () => {
const actual = gff3ToAnnotationFeature(ex2)
const txt = JSON.stringify(actual, null, 2)
assert.equal(txt.match(/"type": "CDS"/g)?.length, 4)
const expected = readAnnotationFeatureSnapshot('test_data/example02.json')
compareFeatures(actual, expected)
})
it('Convert example 3', () => {
// NB: In example 3 (and in the other examples) mRNA10003 produces two proteins.
// In the other examples the two proteins are identified by sharing the same cds id.
// In example 3 instead each cds has a unique id so the two proteins are identified by the order they
// appear in the gff.
const actual = gff3ToAnnotationFeature(ex3)
const txt = JSON.stringify(actual, null, 2)
assert.equal(txt.match(/"type": "CDS"/g)?.length, 4)

// The snapshot comparison is disabled for this example; only the CDS count
// above is checked.
// const expected = readAnnotationFeatureSnapshot('test_data/example03.json')
// compareFeatures(actual, expected)
})
it('Convert example 4', () => {
// Sanity-check the parsed GFF3 input first: it is expected to contain
// 6 five_prime_UTR and 3 three_prime_UTR entries before conversion.
const ft = JSON.stringify(ex4, null, 2)
assert.equal(ft.match(/"type": "five_prime_UTR"/g)?.length, 6)
assert.equal(ft.match(/"type": "three_prime_UTR"/g)?.length, 3)

const actual = gff3ToAnnotationFeature(ex4)
const txt = JSON.stringify(actual, null, 2)
assert.equal(txt.match(/"type": "CDS"/g)?.length, 4)
// No UTR of either kind may appear anywhere in the converted output
// (String.prototype.match returns null when there is no match).
assert.equal(txt.match(/prime_UTR/g), null)

const expected = readAnnotationFeatureSnapshot('test_data/example04.json')
compareFeatures(actual, expected)
})
it('Convert braker gff', () => {
const [gffFeature] = readFeatureFile('test_data/braker.gff')
const actual = gff3ToAnnotationFeature(gffFeature)
const txt = JSON.stringify(actual, null, 2)
// The converted output must not mention introns or start/stop codons
// anywhere; there is no snapshot comparison for this fixture.
assert.equal(txt.match(/intron/g), null)
assert.equal(txt.match(/_codon/g), null)
})
})

describe('gff3ToAnnotationFeature', () => {
for (const testCase of testCases) {
const [description, featureLine, convertedFeature] = testCase
Expand Down
20 changes: 10 additions & 10 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,10 @@ function convertChildren(
const [firstChildFeatureLocation] = childFeature
if (
firstChildFeatureLocation.type === 'three_prime_UTR' ||
firstChildFeatureLocation.type === 'five_prime_UTR'
firstChildFeatureLocation.type === 'five_prime_UTR' ||
firstChildFeatureLocation.type === 'intron' ||
firstChildFeatureLocation.type === 'start_codon' ||
firstChildFeatureLocation.type === 'stop_codon'
) {
continue
}
Expand Down Expand Up @@ -232,20 +235,17 @@ function processCDS(
groupedLocations.push([location])
continue
}
const lastGroupLastLocation = lastGroup.at(-1)
if (!lastGroupLastLocation) {
throw new Error('Got group with no locations')
}
if (
const overlaps = lastGroup.some((lastGroupLoc) =>
doesIntersect2(
/* eslint-disable @typescript-eslint/no-non-null-assertion */
lastGroupLastLocation.start!,
lastGroupLastLocation.end!,
lastGroupLoc.start!,
lastGroupLoc.end!,
location.start!,
location.end!,
/* eslint-enable @typescript-eslint/no-non-null-assertion */
)
) {
),
)
if (overlaps) {
groupedLocations.push([location])
} else {
lastGroup.push(location)
Expand Down
13 changes: 13 additions & 0 deletions packages/apollo-shared/test_data/braker.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
##gff-version 3
CM033580.1 AUGUSTUS gene 15529 16566 0.92 - . ID=g1;
CM033580.1 AUGUSTUS mRNA 15529 16566 0.92 - . ID=g1.t1;Parent=g1;
CM033580.1 AUGUSTUS stop_codon 15529 15531 . - 0 ID=g1.t1.stop1;Parent=g1.t1;
CM033580.1 AUGUSTUS CDS 15529 15659 0.92 - 2 ID=g1.t1.CDS1;Parent=g1.t1;
CM033580.1 AUGUSTUS exon 15529 15659 . - . ID=g1.t1.exon1;Parent=g1.t1;
CM033580.1 AUGUSTUS intron 15660 16112 0.96 - . ID=g1.t1.intron1;Parent=g1.t1;
CM033580.1 AUGUSTUS CDS 16113 16314 0.96 - 0 ID=g1.t1.CDS2;Parent=g1.t1;
CM033580.1 AUGUSTUS exon 16113 16314 . - . ID=g1.t1.exon2;Parent=g1.t1;
CM033580.1 AUGUSTUS intron 16315 16536 0.96 - . ID=g1.t1.intron2;Parent=g1.t1;
CM033580.1 AUGUSTUS CDS 16537 16566 0.99 - 0 ID=g1.t1.CDS3;Parent=g1.t1;
CM033580.1 AUGUSTUS exon 16537 16566 . - . ID=g1.t1.exon3;Parent=g1.t1;
CM033580.1 AUGUSTUS start_codon 16564 16566 . - 0 ID=g1.t1.start1;Parent=g1.t1;
26 changes: 26 additions & 0 deletions packages/apollo-shared/test_data/example01.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
##gff-version 3
##sequence-region chr1 1000 9000
#example01
chr1 . gene 1000 9000 . + . ID=gene10001;Name=EDEN
chr1 . TF_binding_site 1000 1012 . + . ID=tfbs10001;Parent=gene10001
chr1 . mRNA 1050 9000 . + . ID=mRNA10001;Parent=gene10001;Name=EDEN.1
chr1 . mRNA 1050 9000 . + . ID=mRNA10002;Parent=gene10001;Name=EDEN.2
chr1 . mRNA 1300 9000 . + . ID=mRNA10003;Parent=gene10001;Name=EDEN.3
chr1 . exon 1050 1500 . + . ID=exon10001;Parent=mRNA10001,mRNA10002
chr1 . exon 1300 1500 . + . ID=exon10002;Parent=mRNA10003
chr1 . exon 3000 3902 . + . ID=exon10003;Parent=mRNA10001,mRNA10003
chr1 . exon 5000 5500 . + . ID=exon10004;Parent=mRNA10001,mRNA10002,mRNA10003
chr1 . exon 7000 9000 . + . ID=exon10005;Parent=mRNA10001,mRNA10002,mRNA10003
chr1 . CDS 1201 1500 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 3000 3902 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 5000 5500 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 7000 7600 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 1201 1500 . + 0 ID=cds10002;Parent=mRNA10002;Name=edenprotein.2
chr1 . CDS 5000 5500 . + 0 ID=cds10002;Parent=mRNA10002;Name=edenprotein.2
chr1 . CDS 7000 7600 . + 0 ID=cds10002;Parent=mRNA10002;Name=edenprotein.2
chr1 . CDS 3301 3902 . + 0 ID=cds10003;Parent=mRNA10003;Name=edenprotein.3
chr1 . CDS 5000 5500 . + 1 ID=cds10003;Parent=mRNA10003;Name=edenprotein.3
chr1 . CDS 7000 7600 . + 1 ID=cds10003;Parent=mRNA10003;Name=edenprotein.3
chr1 . CDS 3391 3902 . + 0 ID=cds10004;Parent=mRNA10003;Name=edenprotein.4
chr1 . CDS 5000 5500 . + 1 ID=cds10004;Parent=mRNA10003;Name=edenprotein.4
chr1 . CDS 7000 7600 . + 1 ID=cds10004;Parent=mRNA10003;Name=edenprotein.4
Loading

0 comments on commit 1b94250

Please sign in to comment.