R1CS parser continuation (#388)
* r1cs: initial commit, circom binary format

* r1cs: references for IR design

* r1cs: move IR research to research folder [skip ci]

* r1cs: dump circom parser progress [skip ci]

* [parsing] return regular `bool`

I don't think ParseBool is defined anywhere at the moment

* [io] explicitly forbid `seq` in `readInto` for `var T`

Otherwise it conflicts with the `var openArray[T]` overload (and was indeed
picked over it, which is wrong)

* [io] add convenience `open` taking `string` and returning `File`

* [r1cs] finish parser of R1CS files (w/o custom gate list / apps)

* add `kInvalid` field to section type enum

* export all R1CS related fields, remove `when isMainModule`

* [tests] add test case for R1CS binary parser

Based on the test provided in
https://github.com/iden3/r1csfile/tree/master/test/testutils
and
https://github.com/iden3/r1csfile/blob/master/test/r1csfile.js

* [tests] add R1CS parser test to testDesc, update path to binary file

* use `seq[byte]` for `Factor.value` field instead of `seq[BaseType]`

To be platform independent. We'll have to verify the inputs at a later
time, at which point we can also do the unmarshalling.

* be explicit about `int` -> `int64` in parser
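
The `seq[byte]` decision above defers field-element interpretation to a later validation pass. A minimal Python sketch of that deferred unmarshalling, for illustration only (the prime below is a placeholder, not the one in the test file):

```python
def unmarshal_le(value: bytes, prime: int) -> int:
    """Interpret a factor value stored as raw little-endian bytes,
    validating field membership only at unmarshalling time."""
    x = int.from_bytes(value, "little")
    if x >= prime:
        raise ValueError("value is not a canonical field element")
    return x

# A 32-byte little-endian encoding of 5, as a raw-bytes value field would store it.
raw = (5).to_bytes(32, "little")
assert unmarshal_le(raw, prime=2**255 - 19) == 5  # placeholder prime
```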

---------

Co-authored-by: Mamy Ratsimbazafy <mamy_github@numforge.co>
Vindaar and mratsim authored Jun 13, 2024
1 parent b33479d commit 05dae3b
Showing 15 changed files with 1,429 additions and 11 deletions.
4 changes: 4 additions & 0 deletions constantine.nimble
@@ -555,6 +555,10 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/t_ethereum_eip4844_deneb_kzg_parallel.nim", false),
("tests/t_ethereum_verkle_primitives.nim", false),
("tests/t_ethereum_verkle_ipa_primitives.nim", false),

# Proof systems
# ----------------------------------------------------------
("tests/proof_systems/t_r1cs_parser.nim", false),
]

const testDescNvidia: seq[string] = @[
17 changes: 13 additions & 4 deletions constantine/platforms/fileio.nim
@@ -74,7 +74,7 @@ else:
proc c_fileno(f: File): cint {.importc: "fileno", header: "<fcntl.h>", sideeffect.}
proc c_fstat(a1: cint, a2: var Stat): cint {.importc: "fstat", header: "<sys/stat.h>", sideeffect.}

proc close*(f: File) =
proc close*(f: File) {.inline.} =
if not f.isNil:
discard f.c_fclose()

@@ -92,6 +92,9 @@ proc open*(f: var File, filepath: cstring, mode = kRead): bool =

return true

proc open*(filepath: string, mode = kRead): File =
doAssert open(result, filepath.cstring, mode), "Failed to open file " & $filepath & " for " & $mode


# Navigating files
# ------------------------------------------------------------
@@ -108,17 +111,23 @@ else:

proc c_fread(buffer: pointer, len, count: csize_t, f: File): csize_t {.importc: "fread", header: "<stdio.h>", sideeffect, tags:[ReadIOEffect].}

proc readInto*(f: File, buffer: pointer, len: int): int =
proc readInto*(f: File, buffer: pointer, len: int): int {.inline.} =
## Read data into buffer, return the number of bytes read
cast[int](c_fread(buffer, 1, cast[csize_t](len), f))

proc readInto*[T](f: File, buf: var T): bool =
proc readInto*[T: not seq](f: File, buf: var T): bool {.inline.} =
## Read data into buffer,
## return true if the number of bytes read
## matches the output type size
return f.readInto(buf.addr, sizeof(buf)) == sizeof(T)

proc read*(f: File, T: typedesc): T =
proc readInto*[T](f: File, buf: var openArray[T]): bool {.inline.} =
## Read data into buffer,
## return true if the number of bytes read
## matches the total byte size of the buffer
return f.readInto(buf[0].addr, sizeof(T) * buf.len) == sizeof(T) * buf.len

proc read*(f: File, T: typedesc): T {.inline.} =
## Interpret next bytes as type `T`
## Panics if the number of bytes read does not match
## the size of `T`
9 changes: 9 additions & 0 deletions constantine/proof_systems/README.md
@@ -0,0 +1,9 @@
# Proof systems

## Implementation "details"

For proof systems, we relax some requirements used in the rest of Constantine:
- Nim heap-allocated types like sequences and strings may be used.

Proof systems are not expected to run on resource-restricted devices
or trusted enclaves.
724 changes: 724 additions & 0 deletions constantine/proof_systems/constraint_systems/r1cs_circom_bin_format.md


258 changes: 258 additions & 0 deletions constantine/proof_systems/constraint_systems/r1cs_circom_parser.nim
@@ -0,0 +1,258 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# This file implements serialization
# for Circom's Rank-1 (Quadratic) Constraint System
# described in r1cs_bin_format.md
#
# We do not hardcode the prime or the bitsize at compile-time.
# This is not in the critical path (bench?) and we don't want monomorphization code-bloat.
# Furthermore converting to/from this format
# - should not need recompilation to allow CLI-tools
# - should allow parallelization, down to field -> raw bytes, before writing to file or inversely

import
../../serialization/[io_limbs, parsing],
../../platforms/[fileio, abstractions]

# We use `sortedByIt` to sort the different sections in the file by their
# `R1csSectionKind`
from std / algorithm import sortedByIt

type
R1csSectionKind* = enum # `kInvalid` used to indicate unset & to make the enum work as field discriminator
kInvalid = 0
kHeader = 1
kConstraints = 2
kWire2LabelId = 3
kCustomGatesList = 4
kCustomGatesApplication = 5

# Note: We parse the `value` of the `Factor` into a `seq[byte]` to be platform
# independent (32 vs 64bit), because `BaseType` depends on the platform
Factor = tuple[index: int32, value: seq[byte]]
LinComb* = seq[Factor]
# A struct-of-arrays (SoA) is more efficient than seq[Constraint] array-of-structs (AoS)
# but at the moment we use a seq[byte] indirection for field elements
# so data access will occur a cache-miss anyway.
# And we optimize for simplicity, so we use the same format as .r1cs files.
# For heavy processing, constraints/linear combinations should use an AoS data structure.
#
# Data-structure wise, this is a sparse vector
Constraint* = tuple[A, B, C: LinComb]
# A .* B = C with .* pointwise/elementwise mul (Hadamard Product)

#[
Header section example
┏━━━━┳━━━━━━━━━━━━━━━━━┓
┃ 4 │ 20 00 00 00 ┃ Field Size in bytes (fs)
┗━━━━┻━━━━━━━━━━━━━━━━━┛
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ fs │ 010000f0 93f5e143 9170b979 48e83328 5d588181 b64550b8 29a031e1 724e6430 ┃ Prime size
┗━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
┏━━━━┳━━━━━━━━━━━━━━━━━┓
┃ 32 │ 01 00 00 00 ┃ nWires
┗━━━━┻━━━━━━━━━━━━━━━━━┛
┏━━━━┳━━━━━━━━━━━━━━━━━┓
┃ 32 │ 01 00 00 00 ┃ nPubOut
┗━━━━┻━━━━━━━━━━━━━━━━━┛
┏━━━━┳━━━━━━━━━━━━━━━━━┓
┃ 32 │ 01 00 00 00 ┃ nPubIn
┗━━━━┻━━━━━━━━━━━━━━━━━┛
┏━━━━┳━━━━━━━━━━━━━━━━━┓
┃ 32 │ 01 00 00 00 ┃ nPrvIn
┗━━━━┻━━━━━━━━━━━━━━━━━┛
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ 64 │ 01 00 00 00 00 00 00 00 ┃ nLabels
┗━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
┏━━━━┳━━━━━━━━━━━━━━━━━┓
┃ 32 │ 01 00 00 00 ┃ mConstraints
┗━━━━┻━━━━━━━━━━━━━━━━━┛
]#
Header* = object
fieldSize*: uint32 # field size in bytes (fs)
prime*: seq[byte] # XXX: What type to use with RT size info?
nWires*: uint32
nPubOut*: uint32
nPubIn*: uint32
nPrvIn*: uint32
nLabels*: uint64
nConstraints*: uint32

#[
Wire 2 Label section example
┏━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┏━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ 64 │ labelId of Wire_0 ┃ 64 │ labelId of Wire_1 ┃ ... ┃ 64 │ labelId of Wire_n ┃
┗━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━┻━━━━━━━━━━━━━━━━━━━┛ ┗━━━━┻━━━━━━━━━━━━━━━━━━━┛
]#
Wire2Label* = object
wireIds*: seq[uint64]

Section* = object
size*: uint64 # NOTE: in the real file the section type is *FIRST* and then the size
# But we cannot map this with a variant type in Nim (without using different
# names for each variant branch)
case sectionType*: R1csSectionKind
of kInvalid: discard
of kHeader: header*: Header
of kConstraints: constraints*: seq[Constraint]
of kWire2LabelId: w2l*: Wire2Label
of kCustomGatesList: cGatesList*: R1csCustomGatesList
of kCustomGatesApplication: cGatesApp*: R1csCustomGatesApp

## `R1csBin` is binary compatible with an R1CS binary file. Meaning it follows the structure
## of the file (almost) exactly. The only difference is in the section header. The size comes
## *after* the kind, which we don't reproduce in `Section` above
R1csBin* = object
magic*: array[4, char]
version*: uint32
numberSections*: uint32
sections*: seq[Section] # Note: Because of the unordered nature of the sections
# the `sections` seq won't follow the binary order of
# the data in the file. Instead, we first record (kind, file position)
# of each different section in the file and then parse them in increasing
# order of the section types

R1csCustomGatesList* = object
R1csCustomGatesApp* = object

proc initSection(kind: R1csSectionKind, size: uint64): Section =
result = Section(sectionType: kind, size: size)

proc parseLinComb(f: File, lincomb: var LinComb, fieldSize: int32): bool =
## Parse a linear combination, returns true on success
## This does not validate that the elements parsed are part of the field
var nnz: int32
?f.parseInt(nnz, littleEndian) # Sparse vector, non-zero values
lincomb.setLen(int(nnz))

var last = low(int32)
# fieldSize is in bytes!
var buf = newSeq[byte](fieldSize)

for i in 0 ..< nnz:
?f.parseInt(lincomb[i].index, littleEndian) # wireID of the factor
?(last < lincomb[i].index) # The factors MUST be sorted in ascending order.
last = lincomb[i].index
?f.readInto(buf) # value of the factor
lincomb[i].value = buf # copy!

return true

proc parseConstraint(f: File, constraint: var Constraint, fieldSize: int32): bool =
?f.parseLinComb(constraint.A, fieldSize)
?f.parseLinComb(constraint.B, fieldSize)
?f.parseLinComb(constraint.C, fieldSize)
return true

template r1csSection(sectionSize, body: untyped): untyped =
let startOffset = f.getFilePosition()

body

return sectionSize.int == f.getFilePosition() - startOffset

proc parseConstraints(f: File, constraints: var seq[Constraint], sectionSize: uint64, numConstraints, fieldSize: int32): bool =
r1csSection(sectionSize):
constraints.setLen(numConstraints)
for constraint in constraints.mitems():
?f.parseConstraint(constraint, fieldSize)

proc parseMagicHeader(f: File, mh: var array[4, char]): bool =
let num = f.readChars(mh, 0, 4) # from start of file!
result = num == 4

proc parseSectionKind(f: File, v: var R1csSectionKind): bool =
var val: uint32
result = f.parseInt(val, littleEndian)
v = R1csSectionKind(val.int)

proc parseHeader(f: File, h: var Header): bool =
?f.parseInt(h.fieldSize, littleEndian) # byte size of the prime number
# allocate for the prime
h.prime = newSeq[byte](h.fieldSize)
?f.readInto(h.prime)
?f.parseInt(h.nWires, littleEndian)
?f.parseInt(h.nPubOut, littleEndian)
?f.parseInt(h.nPubIn, littleEndian)
?f.parseInt(h.nPrvIn, littleEndian)
?f.parseInt(h.nLabels, littleEndian)
?f.parseInt(h.nConstraints, littleEndian)
result = true # would have returned before due to `?` otherwise

proc parseWire2Label(f: File, v: var Wire2Label, sectionSize: uint64): bool =
r1csSection(sectionSize):
let numWires = sectionSize div 8
v.wireIds.setLen(numWires)
for labelId in v.wireIds.mitems():
?f.parseInt(labelId, littleEndian)

## TODO: to be written :)
proc parseCustomGatesList(f: File, v: var R1csCustomGatesList): bool = discard
proc parseCustomGatesApplication(f: File, v: var R1csCustomGatesApp): bool = discard

proc getNumConstraints(r1cs: R1csBin): int32 =
## Returns the number of constraints in the file, based on the header
## This means the header must have been parsed and is located in `sections[0]`.
doAssert r1cs.sections[0].sectionType == kHeader
result = r1cs.sections[0].header.nConstraints.int32

proc getFieldSize(r1cs: R1csBin): int32 =
## Returns the field size (size of the prime) in the file, based on the header
## This means the header must have been parsed and is located in `sections[0]`.
doAssert r1cs.sections[0].sectionType == kHeader
result = r1cs.sections[0].header.fieldSize.int32

proc parseSection(f: File, s: var Section, kind: R1csSectionKind, size: uint64, r1cs: R1csBin): bool =
# NOTE: The `r1cs` object is there to provide the header information to
# the constraints section
s = initSection(kind, size)
case kind
of kHeader: ?f.parseHeader(s.header)
of kConstraints: ?f.parseConstraints(s.constraints, size, r1cs.getNumConstraints(), r1cs.getFieldSize())
of kWire2LabelId: ?f.parseWire2Label(s.w2l, size)
of kCustomGatesList: ?f.parseCustomGatesList(s.cGatesList)
of kCustomGatesApplication: ?f.parseCustomGatesApplication(s.cGatesApp)
of kInvalid: return false

result = true # would have returned otherwise due to `?`

proc parseR1csFile*(path: string): R1csBin =
var f = fileio.open(path, kRead)

doAssert f.parseMagicHeader(result.magic), "Failed to read magic header"
doAssert f.parseInt(result.version, littleEndian), "Failed to read version"
doAssert f.parseInt(result.numberSections, littleEndian), "Failed to read number of sections"

result.sections = newSeq[Section](result.numberSections)
# 1. determine the section kind, size & file position for each section in the file
var pos = newSeq[(R1csSectionKind, uint64, int64)](result.numberSections)
let fp = f.getFilePosition()
for i in 0 ..< result.numberSections:
var kind: R1csSectionKind
var size: uint64 # section size
doAssert f.parseSectionKind(kind), "Failed to read section type in section " & $i
doAssert f.parseInt(size, littleEndian), "Failed to read section size in section " & $i
# compute position of next section
pos[i] = (kind, size, f.getFilePosition())
let np = f.getFilePosition() + size.int64 # compute beginning of next section header
# set file pos to after
doAssert f.setFilePosition(np) == 0, "Failed to set file position to " & $np

# Sort the positions by the section type
pos = pos.sortedByIt(it[0]) # sort by the section kind
doAssert pos[0][0] == kHeader, "No header present in the file"

# 2. iterate the different kinds / positions & parse them
for i, (kind, size, p) in pos: # set position to start of section & parse
doAssert f.setFilePosition(p) == 0, "Failed to set file position to " & $p
doAssert f.parseSection(result.sections[i], kind, size, result), "Failed to parse section " & $i
if i == 0: # assert we read the header!
doAssert kind == kHeader

fileio.close(f)
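
For comparison, the two-pass strategy in `parseR1csFile` (first scan each section's kind, size and file offset while skipping the bodies, then parse in ascending kind order so the header comes first) can be sketched in Python. This is an illustrative re-implementation restricted to the header section, not part of the commit:

```python
import struct
from io import BytesIO

def parse_r1cs(data: bytes) -> dict:
    """Minimal two-pass R1CS section scan; parses only the header section."""
    f = BytesIO(data)
    assert f.read(4) == b"r1cs", "bad magic bytes"
    version, n_sections = struct.unpack("<II", f.read(8))
    # Pass 1: record (kind, size, offset) of each section, skipping its body.
    sections = []
    for _ in range(n_sections):
        kind, size = struct.unpack("<IQ", f.read(12))
        sections.append((kind, size, f.tell()))
        f.seek(size, 1)  # jump to the next section header
    # Pass 2: sort by section kind so the header (kind 1) is parsed first.
    sections.sort(key=lambda s: s[0])
    assert sections[0][0] == 1, "no header section present"
    f.seek(sections[0][2])
    (field_size,) = struct.unpack("<I", f.read(4))
    prime = f.read(field_size)
    n_wires, n_pub_out, n_pub_in, n_prv_in = struct.unpack("<IIII", f.read(16))
    (n_labels,) = struct.unpack("<Q", f.read(8))
    (n_constraints,) = struct.unpack("<I", f.read(4))
    return {"version": version, "field_size": field_size, "prime": prime,
            "n_wires": n_wires, "n_labels": n_labels,
            "n_constraints": n_constraints}
```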
40 changes: 40 additions & 0 deletions constantine/serialization/parsing.nim
@@ -0,0 +1,40 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
../platforms/[primitives, fileio],
./endians

template `?`*(parseCall: bool): untyped =
## Return early if parsing failed
## Syntax `?parseCall()`
if not parseCall:
return false

proc parseInt*(f: File, v: var SomeInteger, endianness: static Endianness): bool =
when endianness == cpuEndian:
return f.readInto(v)
else:
var raw {.noInit.}: array[sizeof(v), byte]
if not f.readInto(raw):
return false

# endianness / raw bytes are fundamentally unsigned
type T = typeof(v)
when sizeof(v) == 8:
v = cast[T](uint64.fromBytes(raw, endianness))
elif sizeof(v) == 4:
v = cast[T](uint32.fromBytes(raw, endianness))
elif sizeof(v) == 2:
v = cast[T](uint16.fromBytes(raw, endianness))
elif sizeof(v) == 1:
v = cast[T](uint8.fromBytes(raw, endianness))
else:
unreachable()

return true
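
The byte-order handling above only swaps when the requested endianness differs from `cpuEndian`; the net effect is the same conversion Python's `int.from_bytes` performs, shown here purely for illustration:

```python
def parse_int_le(raw: bytes, signed: bool = False) -> int:
    """Interpret raw bytes as a little-endian integer,
    independent of the host byte order."""
    # int.from_bytes always performs the conversion; the Nim version
    # short-circuits when the host is already little-endian.
    return int.from_bytes(raw, "little", signed=signed)

print(parse_int_le(b"\x20\x00\x00\x00"))        # prints 32 (e.g. a fieldSize of 32 bytes)
print(parse_int_le(b"\xff\xff\xff\xff", True))  # prints -1
```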
6 changes: 3 additions & 3 deletions research/README.md
@@ -2,15 +2,15 @@

This folder stashes experimentations before they are productionized into the library.

- `GLV`: Scalar multiplication with endomorphism acceleration\
- `codegen`: LLVM IR codegen to produce optimal assembly `.asm`
- `constraint_systems`: Research into how constraint systems like R1CS, UltraPlonk or CCS can reuse research from Deep Learning Compiler, Polyhedral compilers, integer programming and constraint programming.
- `endomorphisms`: Scalar multiplication with endomorphism acceleration\
- Faster Point Multiplication on Elliptic Curves with Efficient Endomorphisms\
Robert P. Gallant, Robert J. Lambert, and Scott A. Vanstone
https://www.iacr.org/archive/crypto2001/21390189.pdf
- Efficient and Secure Algorithms for GLV-Based Scalar Multiplication and their Implementation on GLV-GLS Curves (Extended Version)\
Armando Faz-Hernández, Patrick Longa, Ana H. Sánchez, 2013\
https://eprint.iacr.org/2013/158.pdf


- `kzg`: KZG Polynomial Commitments\
- Constant-Size Commitments to Polynomials and Their Applications\
Aniket Kate, Gregory M. Zaverucha, Ian Goldberg, 2010\
