Skip to content

Commit

Permalink
Handling of gzip input and bgzip'd & indexed fasta
Browse files Browse the repository at this point in the history
* CLI can upload and add assembly from gzip files (#405 and #407)

* CLI can add assembly in non-editable mode with `--no-db` flag, i.e. without loading
  sequence to mongodb (#406). Only bgzip'd & indexed fasta files are
  supported, not plain fa/fai files.

See tests in test.py: `testFeatureChecksIndexed`, `testFileUploadGzip`,
`testAddAssemblyWithoutLoadingInMongo`
  • Loading branch information
dariober authored and garrettjstevens committed Aug 30, 2024
1 parent 706c9d0 commit 1f59e38
Show file tree
Hide file tree
Showing 21 changed files with 529 additions and 14 deletions.
51 changes: 49 additions & 2 deletions packages/apollo-cli/src/commands/assembly/add-fasta.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
/* eslint-disable @typescript-eslint/no-unnecessary-condition */
import * as fs from 'node:fs'
import * as path from 'node:path'

import { Flags } from '@oclif/core'
import { ObjectId } from 'bson'

Expand Down Expand Up @@ -38,12 +37,18 @@ export default class AddFasta extends FileCommand {
index: Flags.string({
char: 'x',
description:
'URL of the index. Required if input is an external source and ignored if input is a local file',
'URL of the index. Required if input is an external source',
}),
force: Flags.boolean({
char: 'f',
description: 'Delete existing assembly, if it exists',
}),
'no-db': Flags.boolean({
char: 'n',
description: wrapLines("Do not load the fasta sequence into the Apollo database. \
This option assumes the fasta file is bgzip'd with `bgzip` and indexed with `samtools faidx`.\
Indexes should be named <my.fasta.gz>.gzi and <my.fasta.gz>.fai"),
}),
}

public async run(): Promise<void> {
Expand Down Expand Up @@ -76,6 +81,48 @@ export default class AddFasta extends FileCommand {
body,
flags.force,
)
} else if (flags['no-db']) {
const gzi = `${flags['input-file']}.gzi`
const fai = `${flags['input-file']}.fai`
if (!fs.existsSync(gzi) || !fs.existsSync(fai)) {
this.error("Only bgzip'd and indexed fasta files are supported at the moment")
}
// Upload fasta file
const faId = await this.uploadFile(
access.address,
access.accessToken,
flags['input-file'],
'text/x-fasta',
)
// Upload fai index
const faiId = await this.uploadFile(
access.address,
access.accessToken,
fai,
'text/x-fasta',
)
// Upload gzi index
const gziId = await this.uploadFile(
access.address,
access.accessToken,
gzi,
'text/x-fasta',
)
const body = {
assemblyName,
typeName: 'AddAssemblyFromFileIdChange',
fileIds: {
fa: faId,
fai: faiId,
gzi: gziId,
},
}
rec = await submitAssembly(
access.address,
access.accessToken,
body,
flags.force,
)
} else {
if (!isExternal && !fs.existsSync(flags['input-file'])) {
this.error(`File ${flags['input-file']} does not exist`)
Expand Down
2 changes: 1 addition & 1 deletion packages/apollo-cli/src/commands/assembly/add-gff.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ export default class AddGff extends FileCommand {
static flags = {
'input-file': Flags.string({
char: 'i',
description: 'Input gff or gtf file',
description: 'Input gff file',
required: true,
}),
assembly: Flags.string({
Expand Down
2 changes: 1 addition & 1 deletion packages/apollo-cli/src/commands/feature/import.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ export default class Import extends FileCommand {
static flags = {
'input-file': Flags.string({
char: 'i',
description: 'Input gff or gtf file',
description: 'Input gff file',
required: true,
}),
assembly: Flags.string({
Expand Down
8 changes: 5 additions & 3 deletions packages/apollo-cli/src/commands/file/upload.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ export default class Upload extends FileCommand {
type: Flags.string({
char: 't',
description:
'File type or "autodetect" for automatic detection.\nNB: There is no check for whether the file complies to this type',
'File type or "autodetect" for automatic detection.\n\
NB: There is no check for whether the file complies to this type',
options: ['text/x-fasta', 'text/x-gff3', 'autodetect'],
default: 'autodetect',
}),
Expand All @@ -42,9 +43,10 @@ export default class Upload extends FileCommand {

let { type } = flags
if (type === 'autodetect') {
if (/\.fasta$|\.fas$|\.fa$|\.fna$/.test(flags['input-file'])) {
const infile = flags['input-file'].replace(/\.gz$/, '')
if (/\.fasta$|\.fas$|\.fa$|\.fna$/.test(infile)) {
type = 'text/x-fasta'
} else if (/\.gff$|\.gff3/.test(flags['input-file'])) {
} else if (/\.gff$|\.gff3/.test(infile)) {
type = 'text/x-gff3'
} else {
this.error(
Expand Down
7 changes: 7 additions & 0 deletions packages/apollo-cli/src/fileCommand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ export abstract class FileCommand extends BaseCommand<typeof FileCommand> {
throw error
}
})

let contentEncoding = ''
if (file.endsWith('.gz')) {
contentEncoding = 'gzip'
}

const init: RequestInit = {
method: 'POST',
body,
Expand All @@ -70,6 +76,7 @@ export abstract class FileCommand extends BaseCommand<typeof FileCommand> {
Authorization: `Bearer ${accessToken}`,
'Content-Type': type,
'Content-Length': String(size),
'Content-Encoding': contentEncoding,
},
dispatcher: new Agent({ headersTimeout: 60 * 60 * 1000 }),
}
Expand Down
13 changes: 11 additions & 2 deletions packages/apollo-cli/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -384,11 +384,20 @@ interface bodyExternalFile {
fai: string
}
}
/**
 * Request body accepted by `submitAssembly` for an assembly whose sequence
 * files were uploaded beforehand and are referenced by id.
 * The CLI sends it with `typeName: 'AddAssemblyFromFileIdChange'` and fills
 * `fileIds` with the values returned by its file-upload calls.
 */
interface bodyFileId {
assemblyName: string
typeName: string
fileIds: {
// Id of the uploaded fasta file
fa: string
// Id of the uploaded .fai index
fai: string
// Id of the uploaded .gzi index; the key is required but may hold undefined
gzi: string | undefined
}
}

export async function submitAssembly(
address: string,
accessToken: string,
body: bodyLocalFile | bodyExternalFile,
body: bodyLocalFile | bodyExternalFile | bodyFileId,
force: boolean,
): Promise<object> {
let assemblies = await queryApollo(address, accessToken, 'assemblies')
Expand All @@ -401,7 +410,7 @@ export async function submitAssembly(
}
}
}

const auth: RequestInit = {
method: 'POST',
body: JSON.stringify(body),
Expand Down
66 changes: 66 additions & 0 deletions packages/apollo-cli/test/test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
#!/usr/bin/env python3

"""USAGE: Change to Apollo3/packages/apollo-cli, make this script executable:
chmod a+x ./test/test.py
and run it:
./test/test.py
./test/test.py TestCLI.testAddAssemblyFromGff # Run only this test
"""

import hashlib
import json
import os
import sys
Expand Down Expand Up @@ -855,6 +866,40 @@ def testFeatureChecks(self):
p = shell(f"{apollo} feature check {P} -i {xid}")
self.assertTrue("InternalStopCodonCheck" in p.stdout)

def testFeatureChecksIndexed(self):
    # Run the feature checks against an assembly added with --no-db from
    # test_data/tiny.fasta.gz.
    shell(
        f"{apollo} assembly add-fasta {P} -a v1 -i test_data/tiny.fasta.gz --no-db -f"
    )
    shell(f"{apollo} feature import {P} -a v1 -i test_data/tiny.fasta.gff3 -d")
    # shell(f"{apollo} assembly add-gff {P} -i test_data/tiny.fasta.gff3 -a v1 -f")
    shell(f"{apollo} assembly check {P} -a v1 -c CDSCheck")
    p = shell(f"{apollo} feature check {P} -a v1")
    ## If we don't edit a feature, checks are not activated (!?)
    self.assertEqual(p.stdout.strip(), "[]")

    p = shell(f"{apollo} feature get {P} -a v1")
    ff = json.loads(p.stdout)
    g1 = [x for x in ff if x["gffId"] == "MyGene"][0]
    g2 = [x for x in ff if x["gffId"] == "AnotherGene"][0]

    # Editing the end coordinate of both genes is what makes the checks
    # report something (see the note above about unedited features).
    shell(f"{apollo} feature edit-coords {P} -i {g1['_id']} -e 201")
    shell(f"{apollo} feature edit-coords {P} -i {g2['_id']} -e 251")
    p = shell(f"{apollo} feature check {P} -a v1")
    out = json.loads(p.stdout)
    # assertGreater/assertIn print the offending values when they fail,
    # unlike assertTrue on a pre-computed boolean.
    self.assertGreater(len(out), 1)
    self.assertIn("InternalStopCodonCheck", p.stdout)

    ## Ids with checks
    ids = []
    for x in out:
        ids.extend(x["ids"])
    self.assertGreater(len(set(ids)), 1)

    ## Retrieve by feature id
    xid = " ".join(ids)
    p = shell(f"{apollo} feature check {P} -i {xid}")
    self.assertIn("InternalStopCodonCheck", p.stdout)

def testUser(self):
p = shell(f"{apollo} user get {P}")
out = json.loads(p.stdout)
Expand Down Expand Up @@ -983,6 +1028,27 @@ def testFileUpload(self):
p = shell(f"{apollo} file upload {P} -i test_data/guest.yaml", strict=False)
self.assertTrue(p.returncode != 0)

def testFileUploadGzip(self):
    # A file that is already gzipped must be stored untouched: the checksum
    # reported for the upload has to equal the md5 of the local .gz file.
    with open("test_data/tiny.fasta.gz", "rb") as handle:
        local_md5 = hashlib.md5(handle.read()).hexdigest()

    upload = shell(f"{apollo} file upload {P} -i test_data/tiny.fasta.gz")
    uploaded = json.loads(upload.stdout)
    self.assertEqual(local_md5, uploaded["checksum"])

    # The uploaded file must also be usable to create an assembly.
    shell(f"{apollo} assembly add-file {P} -f -i {uploaded['_id']}")

def testAddAssemblyWithoutLoadingInMongo(self):
    # It would be good to check that really there was no sequence loading
    shell(f"{apollo} assembly add-fasta {P} -f --no-db -i test_data/tiny.fasta.gz")
    p = shell(f"{apollo} assembly sequence {P} -a tiny.fasta.gz")
    # The sequence must still be retrievable, as fasta-formatted output.
    self.assertTrue(p.stdout.startswith(">"))

    # The same command on the plain (not gzipped) fasta is expected to fail.
    p = shell(
        f"{apollo} assembly add-fasta {P} -f --no-db -i test_data/tiny.fasta",
        strict=False,
    )
    # assertNotEqual reports the actual return code if the command succeeds.
    self.assertNotEqual(p.returncode, 0)

def testAddAssemblyFromFileId(self):
p = shell(f"{apollo} file upload {P} -i test_data/tiny.fasta")
fid = json.loads(p.stdout)["_id"]
Expand Down
3 changes: 3 additions & 0 deletions packages/apollo-cli/test_data/tiny.fasta.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ctgA 420 6 60 61
ctgB 800 439 100 101
ctgC 420 1253 60 61
Binary file added packages/apollo-cli/test_data/tiny.fasta.gz
Binary file not shown.
3 changes: 3 additions & 0 deletions packages/apollo-cli/test_data/tiny.fasta.gz.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ctgA 420 6 60 61
ctgB 800 439 100 101
ctgC 420 1253 60 61
Binary file added packages/apollo-cli/test_data/tiny.fasta.gz.gzi
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ import { MessagesModule } from '../messages/messages.module'
import { RefSeqsModule } from '../refSeqs/refSeqs.module'
import { ChecksController } from './checks.controller'
import { ChecksService } from './checks.service'
import { FilesModule } from '../files/files.module'

@Module({
providers: [ChecksService],
imports: [
FilesModule,
MessagesModule,
RefSeqsModule,
MongooseModule.forFeatureAsync([
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import {
CheckResult,
CheckResultDocument,
FeatureDocument,
File,
FileDocument,
RefSeq,
RefSeqChunk,
RefSeqChunkDocument,
Expand All @@ -20,15 +22,19 @@ import {
import { BgzipIndexedFasta, IndexedFasta } from '@gmod/indexedfasta'
import { Injectable, Logger } from '@nestjs/common'
import { InjectModel } from '@nestjs/mongoose'
import { RemoteFile } from 'generic-filehandle'
import { LocalFile, RemoteFile } from 'generic-filehandle'
import { Model } from 'mongoose'

import { FeatureRangeSearchDto } from '../entity/gff3Object.dto'
import { RefSeqsService } from '../refSeqs/refSeqs.service'
import { LocalFileGzip } from '@apollo-annotation/shared'
import path from 'node:path'

@Injectable()
export class ChecksService {
constructor(
@InjectModel(File.name)
private readonly fileModel: Model<FileDocument>,
@InjectModel(CheckResult.name)
private readonly checkResultModel: Model<CheckResultDocument>,
private readonly refSeqsService: RefSeqsService,
Expand Down Expand Up @@ -143,6 +149,54 @@ export class ChecksService {
}
return sequence
}

// Assembly whose sequence is held in files uploaded to this server and
// referenced by file id: read the requested range from those files and
// return it, instead of falling through to the chunk lookup below.
if (assemblyDoc?.fileIds) {
  const { fa, fai, gzi } = assemblyDoc.fileIds
  this.logger.debug(
    `Local fasta file = ${fa}, Local fasta index file = ${fai}`,
  )
  const { FILE_UPLOAD_FOLDER } = process.env
  if (!FILE_UPLOAD_FOLDER) {
    throw new Error('No FILE_UPLOAD_FOLDER found in .env file')
  }
  // A missing gzi id used to surface as "No checksum for file document null"
  // because the gzi document was looked up unconditionally; that also made
  // the plain-fasta fallback unreachable. Reject the case explicitly.
  if (!gzi) {
    throw new Error(
      "Only bgzip'd and indexed fasta files are supported: the assembly has no gzi file id",
    )
  }

  // Uploaded files are located on disk by the checksum stored in their file
  // document. Errors name the file id rather than interpolating the document.
  const findChecksum = async (label: string, fileId: unknown) => {
    const fileDoc = await this.fileModel.findById(fileId)
    const checksum = fileDoc?.checksum
    if (!checksum) {
      throw new Error(
        `No checksum for ${label} file document with id "${String(fileId)}"`,
      )
    }
    return checksum
  }
  const faChecksum = await findChecksum('fasta', fa)
  const faiChecksum = await findChecksum('fai', fai)
  const gziChecksum = await findChecksum('gzi', gzi)

  // NOTE(review): the indexes are opened with LocalFileGzip and the fasta with
  // LocalFile, presumably because the server gzips uploads whose name does not
  // end in ".gz" while the bgzip'd fasta is stored as uploaded - confirm.
  const sequenceAdapter = new BgzipIndexedFasta({
    fasta: new LocalFile(path.join(FILE_UPLOAD_FOLDER, faChecksum)),
    // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-call
    fai: new LocalFileGzip(path.join(FILE_UPLOAD_FOLDER, faiChecksum)),
    // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-call
    gzi: new LocalFileGzip(path.join(FILE_UPLOAD_FOLDER, gziChecksum)),
  })
  const sequence = await sequenceAdapter.getSequence(name, start, end)
  if (sequence === undefined) {
    throw new Error('Sequence not found')
  }
  return sequence
}


const startChunk = Math.floor(start / chunkSize)
const endChunk = Math.floor(end / chunkSize)
const seq: string[] = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ export class FileStorageEngine implements StorageEngine {
)
return
}

const checksum = await writeFileAndCalculateHash(
file,
FILE_UPLOAD_FOLDER,
Expand Down
9 changes: 6 additions & 3 deletions packages/apollo-collaboration-server/src/files/filesUtil.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,12 @@ export async function writeFileAndCalculateHash(
})

const fileWriteStream = createWriteStream(tmpFileName)
const gz = createGzip()
await pipeline(stream, gz, fileWriteStream)

if (originalname.endsWith('.gz')) {
await pipeline(stream, fileWriteStream)
} else {
const gz = createGzip()
await pipeline(stream, gz, fileWriteStream)
}
const fileChecksum = hash.digest('hex')
logger.debug(`Uploaded file checksum: "${fileChecksum}"`)

Expand Down
Loading

0 comments on commit 1f59e38

Please sign in to comment.