No binary parser
cmdcolin committed Jul 31, 2024
1 parent ee4e6b7 commit 08e4090
Showing 3 changed files with 86 additions and 132 deletions.
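For orientation, the refactor drops the declarative binary-parser objects in favor of plain functions that read from a Buffer at a given offset and hand back the parsed value together with the advanced offset. A minimal sketch of that calling convention, using an illustrative helper rather than the repository's real parsers:

// Illustrative only: the same Buffer-plus-offset convention the diff adopts
// (compare getCramBlockCrc32 below), not code from the repository.
function readUint32be(b: Buffer, offset: number) {
  const dataView = new DataView(b.buffer, b.byteOffset, b.length)
  const value = dataView.getUint32(offset) // DataView reads big-endian by default
  return { value, offset: offset + 4 }
}

// Callers thread the returned offset into the next read instead of relying on
// a binary-parser schema to advance a cursor for them.
const buf = Buffer.from([0, 0, 0, 42, 0, 0, 0, 7])
const first = readUint32be(buf, 0)
const second = readUint32be(buf, first.offset)
console.log(first.value, second.value) // 42 7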
152 changes: 55 additions & 97 deletions src/cramFile/sectionParsers.ts
@@ -1,5 +1,5 @@
import { TupleOf } from '../typescript'
import { ParsedItem, parseItf8, parseLtf8 } from './util'
import { parseItf8, parseLtf8 } from './util'
import { DataSeriesEncodingMap } from './codecs/dataSeriesTypes'
import { CramEncoding } from './encoding'

@@ -77,23 +77,16 @@ export function getCramBlockHeader(buffer: Buffer, startOffset = 0) {
}
}

export function getCramBlockCrc32(
buffer: Buffer,
dataView: DataView,
offset: number,
) {
export function getCramBlockCrc32(b: Buffer, offset: number) {
const dataView = new DataView(b.buffer, b.byteOffset, b.length)
const crc32 = dataView.getUint32(offset)
offset += 4
return { crc32, offset }
}

export type CramTagDictionary = string[][]

export function getCramTagDictionary(
buffer: Buffer,
dataView: DataView,
offset: number,
) {
export function getCramTagDictionary(buffer: Buffer, offset: number) {
const [size, newOffset1] = parseItf8(buffer, offset)
offset += newOffset1
const subbuf = buffer.subarray(offset, offset + size).toString('utf8')
@@ -179,11 +172,7 @@ export function getCramPreservationMap(
})
offset += 5
} else if (key === 'TD') {
const { offset: offsetRet, value } = getCramTagDictionary(
buffer,
dataView,
offset,
)
const { offset: offsetRet, value } = getCramTagDictionary(buffer, offset)
ents.push({ key, value })
offset = offsetRet
}
@@ -225,7 +214,7 @@ export interface MappedSliceHeader {
numContentIds: number
contentIds: number[]
refBaseBlockId: number
md5: TupleOf<number, 16>
md5?: TupleOf<number, 16>
}

export interface UnmappedSliceHeader {
@@ -234,11 +223,11 @@ export interface UnmappedSliceHeader {
numBlocks: number
numContentIds: number
contentIds: number[]
md5: TupleOf<number, 16>
md5?: TupleOf<number, 16>
}

export function isMappedSliceHeader(
header: MappedSliceHeader | UnmappedSliceHeader,
header: unknown,
): header is MappedSliceHeader {
return typeof (header as any).refSeqId === 'number'
}
@@ -263,7 +252,6 @@ interface Value {
function getCramUnmappedSliceHeader(
majorVersion: number,
buffer: Buffer,
dataView: DataView,
offset: number,
) {
let maxLength = 0
@@ -301,7 +289,7 @@ function getCramUnmappedSliceHeader(
// the md5 sum is missing in cram v1
let md5
if (majorVersion >= 2) {
md5 = buffer.subarray(offset, offset + 16)
md5 = buffer.subarray(offset, offset + 16) as unknown as TupleOf<number, 16>
offset += 16
maxLength += 16
}
@@ -325,7 +313,6 @@
function getCramMappedSliceHeader(
majorVersion: number,
buffer: Buffer,
dataView: DataView,
offset: number,
) {
const [refSeqId, newOffset1] = parseItf8(buffer, offset)
@@ -338,7 +325,7 @@ function getCramMappedSliceHeader(
offset += newOffset4
let maxLength = 5 * 4

let recordCounter
let recordCounter = 0
if (majorVersion >= 3) {
const [rc, newOffset5] = parseLtf8(buffer, offset)
offset += newOffset5
@@ -349,6 +336,8 @@ function getCramMappedSliceHeader(
offset += newOffset5
recordCounter = rc
maxLength += 5
} else {
console.warn('majorVersion is <2, recordCounter set to 0')
}

const [numBlocks, newOffset6] = parseItf8(buffer, offset)
@@ -368,7 +357,7 @@ function getCramMappedSliceHeader(
// the md5 sum is missing in cram v1
let md5
if (majorVersion >= 2) {
md5 = buffer.subarray(offset, offset + 16)
md5 = buffer.subarray(offset, offset + 16) as unknown as TupleOf<number, 16>
offset += 16
maxLength += 16
}
@@ -380,6 +369,7 @@ function getCramMappedSliceHeader(
md5,
numBlocks,
numRecords,
numContentIds,
refSeqSpan,
refSeqId,
refSeqStart,
@@ -388,6 +378,7 @@ function getCramMappedSliceHeader(
contentIds,
},
maxLength: maxLen,
offset,
}
}
function getCramEncoding(
@@ -721,18 +712,6 @@ function getCramContainerHeader2(
}
}

// each of these is a function of the major and minor version
const versionedParsers = {
getCramUnmappedSliceHeader,
getCramMappedSliceHeader,
getCramEncoding,
getCramDataSeriesEncodingMap,
getCramTagEncodingMap,
getCramCompressionHeader,
getCramContainerHeader1,
getCramContainerHeader2,
}

export type CompressionMethod =
| 'raw'
| 'gzip'
@@ -758,69 +737,48 @@ export interface BlockHeader {
uncompressedSize: number
}

export type CramCompressionHeader = ParsedItem<{
export interface CramCompressionHeader {
preservation: CramPreservationMap
dataSeriesEncoding: DataSeriesEncodingMap
tagEncoding: Record<string, CramEncoding>
}>

function getSectionParsers(majorVersion: number): {
cramFileDefinition: {
parser: Parser<{
magic: string
majorVersion: number
minorVersion: number
fileId: string
}>
maxLength: number
}
cramContainerHeader1: {
parser: Parser<{
length: number
refSeqId: number
refSeqStart: number
alignmentSpan: number
numRecords: number
recordCounter: number
numBases: number
numBlocks: number
numLandmarks: number
}>
maxLength: number
}
cramContainerHeader2: {
parser: Parser<{
numLandmarks: number
landmarks: number[]
crc32: number
}>
maxLength: (x: number) => number
}
cramBlockHeader: {
parser: Parser<BlockHeader>
maxLength: number
}
cramBlockCrc32: {
parser: Parser<{ crc32: number }>
maxLength: number
}
cramCompressionHeader: {
parser: Parser<CramCompressionHeader>
}
cramMappedSliceHeader: {
parser: Parser<MappedSliceHeader>
maxLength: (numContentIds: number) => number
}
cramUnmappedSliceHeader: {
parser: Parser<UnmappedSliceHeader>
maxLength: (numContentIds: number) => number
}
} {
const parsers: any = Object.assign({}, unversionedParsers)
Object.keys(versionedParsers).forEach(parserName => {
parsers[parserName] = (versionedParsers as any)[parserName](majorVersion)
})
return parsers
}

export { cramFileDefinition, getSectionParsers }
export function getSectionParsers(majorVersion: number) {
return {
...unversionedParsers,
getCramUnmappedSliceHeader: (
buffer: Buffer,
dataView: DataView,
offset: number,
) => getCramUnmappedSliceHeader(majorVersion, buffer, offset),
getCramMappedSliceHeader: (buffer: Buffer, offset: number) =>
getCramMappedSliceHeader(majorVersion, buffer, offset),
getCramDataSeriesEncodingMap: (
buffer: Buffer,
dataView: DataView,
offset: number,
) => getCramDataSeriesEncodingMap(majorVersion, buffer, dataView, offset),
getCramTagEncodingMap: (
buffer: Buffer,
dataView: DataView,
offset: number,
) => getCramTagEncodingMap(majorVersion, buffer, dataView, offset),
getCramCompressionHeader: (
buffer: Buffer,
dataView: DataView,
offset: number,
) => getCramCompressionHeader(majorVersion, buffer, dataView, offset),
getCramContainerHeader1: (
buffer: Buffer,
dataView: DataView,
offset: number,
) => getCramContainerHeader1(majorVersion, buffer, dataView, offset),
getCramContainerHeader2: (
buffer: Buffer,
dataView: DataView,
offset: number,
) => getCramContainerHeader2(majorVersion, buffer, dataView, offset),
getCramEncoding: (buffer: Buffer, dataView: DataView, offset: number) =>
getCramEncoding(majorVersion, buffer, dataView, offset),
}
}
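Assuming the new exports above, a hedged usage sketch of the hand-rolled API (the buffer bytes and the relative import path are made up for illustration):

import { getCramBlockCrc32, getSectionParsers } from './sectionParsers'

// Four bytes standing in for a block's CRC32 field.
const block = Buffer.from([0xde, 0xad, 0xbe, 0xef])

// Each parser returns the parsed value plus the advanced offset.
const { crc32, offset } = getCramBlockCrc32(block, 0)
console.log(crc32.toString(16), offset) // 'deadbeef' 4

// Version-dependent parsers close over the CRAM major version.
const parsers = getSectionParsers(3)
// e.g. parsers.getCramMappedSliceHeader(sliceHeaderBytes, 0) on real CRAM data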
38 changes: 25 additions & 13 deletions src/cramFile/slice/index.ts
@@ -1,5 +1,5 @@
import { CramArgumentError, CramMalformedError } from '../../errors'
import { parseItem, sequenceMD5, tinyMemoize } from '../util'
import { sequenceMD5, tinyMemoize } from '../util'

import Constants from '../constants'
import decodeRecord, { DataSeriesDecoder } from './decodeRecord'
@@ -191,7 +191,7 @@ export default class CramSlice {
}

// memoize
async getHeader(): Promise<SliceHeader> {
async getHeader() {
// fetch and parse the slice header
const sectionParsers = await this.file.getSectionParsers()
const containerHeader = await this.container.getHeader()
@@ -202,24 +202,36 @@ export default class CramSlice {
throw new Error('block header undefined')
}
if (header.contentType === 'MAPPED_SLICE_HEADER') {
const content = parseItem(
const { offset, value } = sectionParsers.getCramMappedSliceHeader(
header.content,
sectionParsers.cramMappedSliceHeader.parser,
0,
containerHeader._endPosition,
)
return { ...header, parsedContent: content }

const _endPosition = containerHeader._endPosition
const _size = offset
return {
...header,
parsedContent: value,
_size,
_endPosition,
}
} else if (header.contentType === 'UNMAPPED_SLICE_HEADER') {
const content = parseItem(
const { offset, value } = sectionParsers.getCramMappedSliceHeader(
header.content,
sectionParsers.cramUnmappedSliceHeader.parser,
0,
containerHeader._endPosition,
)
return { ...header, parsedContent: content }

const _endPosition = containerHeader._endPosition
const _size = offset
return {
...header,
parsedContent: value,
_size,
_endPosition,
}
} else {
throw new CramMalformedError(
`error reading slice header block, invalid content type ${header.contentType}`,
`error reading slice header, invalid content type ${header.contentType}`,
)
}
}
@@ -363,14 +375,14 @@ export default class CramSlice {
this.file.options.checkSequenceMD5 &&
isMappedSliceHeader(sliceHeader.parsedContent) &&
sliceHeader.parsedContent.refSeqId >= 0 &&
sliceHeader.parsedContent.md5.join('') !== '0000000000000000'
sliceHeader.parsedContent.md5?.join('') !== '0000000000000000'
) {
const refRegion = await this.getReferenceRegion()
if (refRegion) {
const { seq, start, end } = refRegion
const seqMd5 = sequenceMD5(seq)
const storedMd5 = sliceHeader.parsedContent.md5
.map(byte => (byte < 16 ? '0' : '') + byte.toString(16))
?.map(byte => (byte < 16 ? '0' : '') + byte.toString(16))
.join('')
if (seqMd5 !== storedMd5) {
throw new CramMalformedError(
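One closing note on the slice changes above: with md5 now optional and isMappedSliceHeader accepting unknown, downstream code narrows the parsed header and uses optional chaining, roughly as in this simplified sketch (trimmed-down stand-in types, not the library's full definitions):

// Trimmed-down stand-in for the header shape in sectionParsers.ts.
interface MappedSliceHeaderLike {
  refSeqId: number
  md5?: number[]
}

function isMappedSliceHeaderLike(header: unknown): header is MappedSliceHeaderLike {
  return typeof (header as any)?.refSeqId === 'number'
}

// md5 is absent in CRAM v1, hence the optional chaining.
function md5Hex(parsed: MappedSliceHeaderLike) {
  return parsed.md5
    ?.map(byte => (byte < 16 ? '0' : '') + byte.toString(16))
    .join('')
}

const parsedContent: unknown = { refSeqId: 0, md5: [171, 205] }
if (isMappedSliceHeaderLike(parsedContent)) {
  console.log(md5Hex(parsedContent)) // 'abcd'
}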
