Skip to content

Commit

Permalink
Merge pull request #498 from scodec/topic/support-comments-in-hex-and…
Browse files Browse the repository at this point in the history
…-bin

Support comments in hex and bin strings
  • Loading branch information
mpilquist authored May 13, 2024
2 parents e967c3e + 49cffb4 commit 81152b5
Show file tree
Hide file tree
Showing 5 changed files with 200 additions and 50 deletions.
6 changes: 5 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ ThisBuild / mimaBinaryIssueFilters ++= Seq(
ProblemFilters.exclude[IncompatibleMethTypeProblem]("scodec.bits.HexDumpFormat.print"),
ProblemFilters.exclude[DirectMissingMethodProblem]("scodec.bits.HexDumpFormat.this"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("scodec.bits.ByteVector.fromHexInternal"),
ProblemFilters.exclude[DirectMissingMethodProblem]("scodec.bits.ByteVector#AtEmpty.apply")
ProblemFilters.exclude[DirectMissingMethodProblem]("scodec.bits.ByteVector#AtEmpty.apply"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("scodec.bits.ByteVector.fromBinInternal"),
ProblemFilters.exclude[DirectMissingMethodProblem](
"scodec.bits.ByteVector.fromBinInternal$default$2"
)
)

lazy val root = tlCrossRootProject.aggregate(core, benchmark)
Expand Down
44 changes: 37 additions & 7 deletions core/shared/src/main/scala/scodec/bits/Bases.scala
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ package scodec.bits
/** Provides types related to base conversion -- e.g., binary, hexadecimal, and base 64. */
object Bases {

/** Sentinel result of `Alphabet#toIndex` that indicates the character should be ignored.
  *
  * The value is negative, so a decoder can tell it apart from a real index with a `>= 0` check.
  */
final val IgnoreChar: Int = -1

/** Sentinel result of `Alphabet#toIndex` that indicates the character and the rest of the line
  * should be ignored -- i.e., the character starts a single-line comment.
  *
  * The value is negative, so a decoder can tell it apart from a real index with a `>= 0` check.
  */
final val IgnoreRestOfLine: Int = -2

/** Partial mapping between characters and indices used in base conversions.
*/
trait Alphabet {
Expand Down Expand Up @@ -76,14 +82,26 @@ object Bases {
/** Predefined alphabets for use in base conversions. */
object Alphabets {

/** Extractor that matches the characters which start a single-line comment in hex and binary
  * strings: `#`, `;`, and `|`. A successful match yields the comment character itself.
  */
private[bits] object HexBinCommentChar {
  def unapply(c: Char): Option[Char] =
    if (c == '#' || c == ';' || c == '|') Some(c) else None
}

/** Binary alphabet that uses `{0, 1}` and allows whitespace and underscores for separation.
  *
  * The characters `#`, `;`, and `|` start a comment that runs to the end of the line.
  */
object Binary extends BinaryAlphabet {
  def toChar(i: Int) = if (i == 0) '0' else '1'

  /** Maps `c` to its bit value, to `IgnoreChar` for a separator, or to `IgnoreRestOfLine` for a
    * comment character.
    *
    * @throws IllegalArgumentException if `c` is not a bit, a separator, or a comment character
    */
  def toIndex(c: Char) =
    c match {
      case '0'                  => 0
      case '1'                  => 1
      case c if ignore(c)       => IgnoreChar
      case HexBinCommentChar(_) => IgnoreRestOfLine
      // Must stay last: placed any earlier it would shadow the ignore/comment cases.
      case _ => throw new IllegalArgumentException
    }

  /** Separators: whitespace and underscore. */
  def ignore(c: Char) = c.isWhitespace || c == '_'
}
Expand All @@ -93,9 +111,11 @@ object Bases {
def toChar(i: Int) = if (i == 0) 't' else 'f'
/** Maps `t`/`T` to 0 and `f`/`F` to 1, a separator to `IgnoreChar`, and a comment character to
  * `IgnoreRestOfLine`.
  *
  * @throws IllegalArgumentException if `c` is none of the above
  */
def toIndex(c: Char) =
  c match {
    case 't' | 'T'            => 0
    case 'f' | 'F'            => 1
    case c if ignore(c)       => IgnoreChar
    case HexBinCommentChar(_) => IgnoreRestOfLine
    // Must stay last: placed any earlier it would shadow the ignore/comment cases.
    case _ => throw new IllegalArgumentException
  }
def ignore(c: Char) = c.isWhitespace || c == '_'
}
Expand All @@ -105,7 +125,12 @@ object Bases {
private[bits] abstract class LenientHex extends HexAlphabet {

  /** Maps `c` to its hexadecimal digit value, to `IgnoreChar` for a separator, or to
    * `IgnoreRestOfLine` for a comment character (`#`, `;`, `|`).
    *
    * @throws IllegalArgumentException if `c` is not a hex digit, a separator, or a comment
    *                                  character
    */
  def toIndex(c: Char) = {
    val i = Character.digit(c, 16)
    if (i >= 0) i
    else
      c match {
        case c if ignore(c)       => IgnoreChar
        case HexBinCommentChar(_) => IgnoreRestOfLine
        // Without this case an invalid character raises scala.MatchError, which the decoders
        // that catch IllegalArgumentException would let escape to the caller.
        case _ => throw new IllegalArgumentException
      }
  }

  /** Separators: whitespace and underscore. */
  def ignore(c: Char) = c.isWhitespace || c == '_'
}
Expand Down Expand Up @@ -144,6 +169,7 @@ object Bases {
val lookupIndex = c - indicesMin
if (lookupIndex >= 0 && lookupIndex < indices.length && indices(lookupIndex) >= 0)
indices(lookupIndex)
else if (ignore(c)) IgnoreChar
else throw new IllegalArgumentException
}
def ignore(c: Char) = c.isWhitespace
Expand Down Expand Up @@ -172,6 +198,7 @@ object Bases {
val lookupIndex = c - indicesMin
if (lookupIndex >= 0 && lookupIndex < indices.length && indices(lookupIndex) >= 0)
indices(lookupIndex)
else if (ignore(c)) IgnoreChar
else throw new IllegalArgumentException
}
def ignore(c: Char) = c == '-' || c.isWhitespace
Expand All @@ -194,6 +221,7 @@ object Bases {
case c if c >= 'P' && c <= 'Z' => c - 'P' + 9 + 8 + 5
case c if c >= 'a' && c <= 'k' => c - 'a' + 9 + 8 + 5 + 11
case c if c >= 'm' && c <= 'z' => c - 'm' + 9 + 8 + 5 + 11 + 11
case c if ignore(c) => IgnoreChar
case _ => throw new IllegalArgumentException
}

Expand All @@ -214,6 +242,7 @@ object Bases {
case c if c >= '0' && c <= '9' => c - '0' + 26 + 26
case '+' => 62
case '/' => 63
case c if ignore(c) => IgnoreChar
case _ => throw new IllegalArgumentException
}
override def ignore(c: Char) = c.isWhitespace
Expand Down Expand Up @@ -248,6 +277,7 @@ object Bases {
case c if c >= '0' && c <= '9' => c - '0' + 26 + 26
case '-' => 62
case '_' => 63
case c if ignore(c) => IgnoreChar
case _ => throw new IllegalArgumentException
}
override def ignore(c: Char) = c.isWhitespace
Expand Down
9 changes: 6 additions & 3 deletions core/shared/src/main/scala/scodec/bits/BitVector.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1500,13 +1500,16 @@ object BitVector extends BitVectorCompanionCrossPlatform {
str: String,
alphabet: Bases.BinaryAlphabet = Bases.Alphabets.Binary
): Either[String, BitVector] =
ByteVector.fromBinInternal(str, alphabet).map { case (bytes, size) =>
val toDrop = size match {
try {
val (bytes, count) = ByteVector.fromBinInternal(str, alphabet)
val toDrop = count match {
case 0 => 0
case n if n % 8 == 0 => 0
case n => 8 - (n % 8)
}
bytes.toBitVector.drop(toDrop.toLong)
Right(bytes.toBitVector.drop(toDrop.toLong))
} catch {
case t: IllegalArgumentException => Left(t.getMessage)
}

/** Constructs a `BitVector` from a binary string or returns `None` if the string is not valid
Expand Down
110 changes: 71 additions & 39 deletions core/shared/src/main/scala/scodec/bits/ByteVector.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1753,6 +1753,10 @@ object ByteVector extends ByteVectorCompanionCrossPlatform {
* is not valid hexadecimal.
*
* The string may start with a `0x` and it may contain whitespace or underscore characters.
*
* Single-line comments are supported - by default, any text after a #, ;, or | character is ignored
* until the start of the next line, though the comment characters are determined by the specified alphabet.
*
* @group base
*/
def fromHexDescriptive(
Expand Down Expand Up @@ -1785,11 +1789,11 @@ object ByteVector extends ByteVectorCompanionCrossPlatform {
val c = withoutPrefix.charAt(idx)
val nibble =
if (defaults) {
Character.digit(c, 16) match {
case i if i >= 0 => i
case i if Character.isWhitespace(c) || c == '_' => -1
case _ => throw new IllegalArgumentException
}
val i = Character.digit(c, 16)
if (i >= 0) i
else if (Character.isWhitespace(c) || c == '_') Bases.IgnoreChar
else if (Bases.Alphabets.HexBinCommentChar.unapply(c).isDefined) Bases.IgnoreRestOfLine
else throw new IllegalArgumentException
} else alphabet.toIndex(c)
if (nibble >= 0) {
if (midByte) {
Expand All @@ -1801,6 +1805,9 @@ object ByteVector extends ByteVectorCompanionCrossPlatform {
midByte = true
}
count += 1
} else if (nibble == Bases.IgnoreRestOfLine) {
// Ignore rest of line
while (idx < length && withoutPrefix.charAt(idx) != '\n') idx += 1
}
idx += 1
}
Expand Down Expand Up @@ -1855,55 +1862,80 @@ object ByteVector extends ByteVectorCompanionCrossPlatform {
* not valid binary.
*
* The string may start with a `0b` and it may contain whitespace or underscore characters.
*
* Single-line comments are supported - by default, any text after a #, ;, or | character is ignored
* until the start of the next line, though the comment characters are determined by the specified alphabet.
*
* @group base
*/
def fromBinDescriptive(
    str: String,
    alphabet: Bases.BinaryAlphabet = Bases.Alphabets.Binary
): Either[String, ByteVector] =
  // fromBinInternal signals invalid input by throwing IllegalArgumentException; surface its
  // message as a Left. The bit count in the second tuple slot is not needed here.
  try Right(fromBinInternal(str, alphabet)._1)
  catch {
    case t: IllegalArgumentException => Left(t.getMessage)
  }

/** Decodes a binary string into bytes, also reporting how many bits were read.
  *
  * An optional `0b`/`0B` prefix is skipped. Each remaining character is mapped through the
  * alphabet: a non-negative result is a bit, `Bases.IgnoreRestOfLine` skips to the end of the
  * current line, and any other negative result skips just that character. Bits are packed
  * most-significant first; when the bit count is not a multiple of 8 the result is shifted right
  * by the number of missing bits.
  *
  * @param str      the text to decode
  * @param alphabet the alphabet used to classify each character
  * @return the decoded bytes paired with the number of bits that were read
  * @throws IllegalArgumentException naming the offending character and its index in `str` when
  *                                  the alphabet rejects a character
  */
private[bits] def fromBinInternal(
    str: String,
    alphabet: Bases.BinaryAlphabet
): (ByteVector, Int) = {
  val prefixed = str.length >= 2 && str.charAt(0) == '0' && {
    val second = str.charAt(1)
    second == 'b' || second == 'B'
  }
  val withoutPrefix = if (prefixed) str.substring(2) else str
  var idx, byte, bits, count = 0
  val length = withoutPrefix.length
  // Sized for the worst case (every character is a bit); only the first `j` bytes get written.
  val out = new Array[Byte]((length + 7) / 8)
  var j = 0
  // Fast path: classify characters inline when the default alphabet is in use.
  val defaults = alphabet eq Bases.Alphabets.Binary
  try
    while (idx < length) {
      val c = withoutPrefix.charAt(idx)
      val bit =
        if (defaults) {
          c match {
            case '0'                                        => 0
            case '1'                                        => 1
            case _ if Character.isWhitespace(c) || c == '_' => Bases.IgnoreChar
            case Bases.Alphabets.HexBinCommentChar(_)       => Bases.IgnoreRestOfLine
            case _                                          => throw new IllegalArgumentException
          }
        } else alphabet.toIndex(c)
      if (bit >= 0) {
        byte = (byte << 1) | bit
        bits += 1
        count += 1
        if (bits == 8) {
          out(j) = byte.toByte
          j += 1
          bits = 0
          byte = 0
        }
      } else if (bit == Bases.IgnoreRestOfLine) {
        // Advance to the newline (or end of input); the increment below then steps past it.
        while (idx < length && withoutPrefix.charAt(idx) != '\n') idx += 1
      }
      idx += 1
    }
  catch {
    case _: IllegalArgumentException =>
      // idx still points at the rejected character because the loop increments it last.
      val c = withoutPrefix.charAt(idx)
      throw new IllegalArgumentException(
        s"Invalid binary character '$c' at index ${idx + (if (prefixed) 2 else 0)}"
      )
  }

  val result = if (bits > 0) {
    // Left-align the trailing partial byte, then shift the whole value right.
    out(j) = (byte << (8 - bits)).toByte
    j += 1
    // Trim to the bytes actually written before shifting: `out` is sized from the character
    // count, so separators and comments would otherwise leave spurious trailing zero bytes.
    ByteVector.view(out).take(j.toLong).shiftRight((8 - bits).toLong, false)
  } else {
    ByteVector.view(out).take(j.toLong)
  }
  (result, count)
}

/** Constructs a `ByteVector` from a binary string or returns `None` if the string is not valid
Expand Down
81 changes: 81 additions & 0 deletions core/shared/src/test/scala/scodec/bits/ByteVectorTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,50 @@ class ByteVectorTest extends BitsSuite {
)
}

test("fromHexDescriptive with comments") {
// Default alphabet: ';' and '#' each start a comment that runs to the end of the line.
assertEquals(
ByteVector.fromHexDescriptive("""
deadbeef ; first line
01020304 # second line
05060708
"""),
Right(hex"deadbeef0102030405060708")
)

// A custom alphabet chooses its own comment character by returning IgnoreRestOfLine from
// toIndex; every other character is delegated to LenientHex.
object CustomAlphabet extends Bases.Alphabets.LenientHex {
private val Chars =
Array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f')
def toChar(i: Int) = Chars(i)
override def toIndex(c: Char): Int = c match {
case '$' => Bases.IgnoreRestOfLine
case _ => super.toIndex(c)
}
}

assertEquals(
ByteVector.fromHexDescriptive(
"""
deadbeef $ first line
01020304 $ second line
05060708
""",
CustomAlphabet
),
Right(hex"deadbeef0102030405060708")
)
}

// Example-based check with no generated input, so it is registered with `test` like the other
// fixed-example cases in this suite rather than as a property.
test("hex with comments example") {
val packet = hex"""
; Start of first packet from https://wiki.wireshark.org/uploads/__moin_import__/attachments/SampleCaptures/mpeg2_mp2t_with_cc_drop01.pcap
01 00 5e 7b ad 47 00 0c db 78 7d 00 08 00 ; Ethernet header
45 00 05 40 b6 9f 40 00 0c 11 de 95 51 a3 96 3c e9 70 03 28 ; IPv4 header
c3 50 15 7c 05 2c 00 00 ; UDP header
47 02 00 1e ; MP2T header
"""
// 14 + 20 + 8 + 4 bytes across the four commented lines.
assertEquals(packet.size, 46L)
}

property("toHex fromHex roundtrip") {
forAll((b: ByteVector) => ByteVector.fromHex(b.toHex).get == b)
}
Expand All @@ -214,6 +258,43 @@ class ByteVectorTest extends BitsSuite {
)
}

test("fromBinDescriptive with comments") {
// Default alphabet: ';' and '#' each start a comment that runs to the end of the line.
assertEquals(
ByteVector.fromBinDescriptive("""
00110011 ; first line
11001100 # second line
11110000
"""),
Right(bin"001100111100110011110000".bytes)
)

// Custom alphabet: '$' starts a comment, and every character other than '0', '1', and '$'
// is reported as ignorable.
object CustomAlphabet extends Bases.BinaryAlphabet {
def toChar(i: Int) = i match {
case 0 => '0'
case 1 => '1'
}
def toIndex(c: Char): Int = c match {
case '0' => 0
case '1' => 1
case '$' => Bases.IgnoreRestOfLine
case _ => Bases.IgnoreChar
}
def ignore(c: Char): Boolean = c.isWhitespace
}

assertEquals(
ByteVector.fromBinDescriptive(
"""
00110011 $ first line
11001100 $ second line
11110000
""",
CustomAlphabet
),
Right(bin"001100111100110011110000".bytes)
)
}

test("fromValidBin") {
assert(ByteVector.fromValidBin(deadbeef.toBin) == deadbeef)
intercept[IllegalArgumentException](ByteVector.fromValidBin("1101a000"))
Expand Down

0 comments on commit 81152b5

Please sign in to comment.