Kwxm/costing/bitwise 4 #6301

Merged
merged 24 commits on Jul 24, 2024
Changes from 5 commits
12 changes: 6 additions & 6 deletions plutus-benchmark/bls12-381-costs/bench/Bench.hs
@@ -17,27 +17,27 @@ import Data.ByteString qualified as BS (empty)

benchHashAndAddG1 :: EvaluationContext -> Integer -> Benchmark
benchHashAndAddG1 ctx n =
let prog = mkHashAndAddG1Script (listOfSizedByteStrings n 4)
let prog = mkHashAndAddG1Script (listOfByteStringsOfLength n 4)
Contributor: Great rename by the way.

in bench (show n) $ benchProgramCek ctx prog

benchHashAndAddG2 :: EvaluationContext -> Integer -> Benchmark
benchHashAndAddG2 ctx n =
let prog = mkHashAndAddG2Script (listOfSizedByteStrings n 4)
let prog = mkHashAndAddG2Script (listOfByteStringsOfLength n 4)
in bench (show n) $ benchProgramCek ctx prog

benchUncompressAndAddG1 :: EvaluationContext -> Integer -> Benchmark
benchUncompressAndAddG1 ctx n =
let prog = mkUncompressAndAddG1Script (listOfSizedByteStrings n 4)
let prog = mkUncompressAndAddG1Script (listOfByteStringsOfLength n 4)
in bench (show n) $ benchProgramCek ctx prog

benchUncompressAndAddG2 :: EvaluationContext -> Integer -> Benchmark
benchUncompressAndAddG2 ctx n =
let prog = mkUncompressAndAddG2Script (listOfSizedByteStrings n 4)
let prog = mkUncompressAndAddG2Script (listOfByteStringsOfLength n 4)
in bench (show n) $ benchProgramCek ctx prog

benchPairing :: EvaluationContext -> Benchmark
benchPairing ctx =
case listOfSizedByteStrings 4 4 of
case listOfByteStringsOfLength 4 4 of
[b1, b2, b3, b4] ->
let emptyDst = Tx.toBuiltin BS.empty
p1 = Tx.bls12_381_G1_hashToGroup (Tx.toBuiltin b1) emptyDst
@@ -46,7 +46,7 @@ benchPairing ctx =
q2 = Tx.bls12_381_G2_hashToGroup (Tx.toBuiltin b4) emptyDst
prog = mkPairingScript p1 p2 q1 q2
in bench "pairing" $ benchProgramCek ctx prog
_ -> error "Unexpected list returned by listOfSizedByteStrings"
_ -> error "Unexpected list returned by listOfByteStringsOfLength"

benchGroth16Verify :: EvaluationContext -> Benchmark
benchGroth16Verify ctx = bench "groth16Verify" $ benchProgramCek ctx mkGroth16VerifyScript
@@ -21,22 +21,22 @@ import Prelude (IO, mapM_)

printCosts_HashAndAddG1 :: Handle -> Integer -> IO ()
printCosts_HashAndAddG1 h n =
let script = mkHashAndAddG1Script (listOfSizedByteStrings n 4)
let script = mkHashAndAddG1Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_HashAndAddG2 :: Handle -> Integer -> IO ()
printCosts_HashAndAddG2 h n =
let script = mkHashAndAddG2Script (listOfSizedByteStrings n 4)
let script = mkHashAndAddG2Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_UncompressAndAddG1 :: Handle -> Integer -> IO ()
printCosts_UncompressAndAddG1 h n =
let script = mkUncompressAndAddG1Script (listOfSizedByteStrings n 4)
let script = mkUncompressAndAddG1Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_UncompressAndAddG2 :: Handle -> Integer -> IO ()
printCosts_UncompressAndAddG2 h n =
let script = mkUncompressAndAddG2Script (listOfSizedByteStrings n 4)
let script = mkUncompressAndAddG2Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_Pairing :: Handle -> IO ()
@@ -16,7 +16,7 @@
-}
module PlutusBenchmark.BLS12_381.Scripts
( checkGroth16Verify_Haskell
, listOfSizedByteStrings
, listOfByteStringsOfLength
, mkGroth16VerifyScript
, mkHashAndAddG1Script
, mkHashAndAddG2Script
Expand Down Expand Up @@ -61,9 +61,9 @@ import System.IO.Unsafe (unsafePerformIO)
import Prelude (fromIntegral)

-- Create a list containing n bytestrings of length l. This could be better.
{-# NOINLINE listOfSizedByteStrings #-}
listOfSizedByteStrings :: Integer -> Integer -> [ByteString]
listOfSizedByteStrings n l = unsafePerformIO . G.sample $
{-# NOINLINE listOfByteStringsOfLength #-}
listOfByteStringsOfLength :: Integer -> Integer -> [ByteString]
listOfByteStringsOfLength n l = unsafePerformIO . G.sample $
G.list (R.singleton $ fromIntegral n)
(G.bytes (R.singleton $ fromIntegral l))
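As a point of reference, a hedged usage sketch of the renamed generator (the call and the sizes here are illustrative only, not taken from this PR):

-- Editorial sketch: ten pseudo-random 4-byte bytestrings, the same shape of
-- call that the BLS12-381 benchmarks above make via listOfByteStringsOfLength n 4.
exampleSeeds :: [ByteString]
exampleSeeds = listOfByteStringsOfLength 10 4
-- length exampleSeeds == 10, and each element has length 4.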

@@ -71,9 +71,9 @@ builtinHash :: BuiltinHashFun
builtinHash = Tx.sha2_256

-- Create a list containing n bytestrings of length l. This could be better.
{-# NOINLINE listOfSizedByteStrings #-}
listOfSizedByteStrings :: Integer -> Integer -> [ByteString]
listOfSizedByteStrings n l = unsafePerformIO . G.sample $
{-# NOINLINE listOfByteStringsOfLength #-}
listOfByteStringsOfLength :: Integer -> Integer -> [ByteString]
listOfByteStringsOfLength n l = unsafePerformIO . G.sample $
G.list (R.singleton $ fromIntegral n)
(G.bytes (R.singleton $ fromIntegral l))

@@ -94,7 +94,7 @@ mkInputs :: forall v msg .
mkInputs n toMsg hash =
Inputs $ map mkOneInput (zip seeds1 seeds2)
where seedSize = 128
(seeds1, seeds2) = splitAt n $ listOfSizedByteStrings (2*n) seedSize
(seeds1, seeds2) = splitAt n $ listOfByteStringsOfLength (2*n) seedSize
-- ^ Seeds for key generation. For some algorithms the seed has to be
-- a certain minimal size and there's a SeedBytesExhausted error if
-- it's not big enough; 128 is big enough for everything here though.
@@ -0,0 +1,6 @@
<!--
Contributor: You probably want to remove the <!-- and --> parts.

### Added

- Added costing for the new bitwise builtins (see CIP-0058), which will probably become available at the Chang+1 HF.

-->
243 changes: 199 additions & 44 deletions plutus-core/cost-model/budgeting-bench/Benchmarks/Bitwise.hs
@@ -1,45 +1,60 @@
-- editorconfig-checker-disable-file

{-# LANGUAGE TypeOperators #-}

module Benchmarks.Bitwise (makeBenchmarks) where

import Common
import Generators

import PlutusCore
import PlutusCore.Evaluation.Machine.ExMemoryUsage
import PlutusCore.Evaluation.Machine.CostStream (sumCostStream)
import PlutusCore.Evaluation.Machine.ExMemoryUsage (ExMemoryUsage, IntegerCostedLiterally (..),
ListCostedByLength (..),
NumBytesCostedAsNumWords (..), flattenCostRose,
memoryUsage)

import Criterion.Main
import Data.ByteString qualified as BS
import Data.SatInt (fromSatInt)
import Hedgehog qualified as H

{- | Costing benchmarks for bitwise bytestring builtins and integer/bytestring conversions. -}

{- Most of the initial exploratory benchmarks were run with a set of small input
bytestrings (up to size 160 / 1280 bytes) and then again with a set of large
inputs (up to size 1600 / 12800 bytes). In the final budgeting benchmarks we
mostly go up to size 150 (= 1200 bytes).
-}

---------------- ByteString builtins ----------------
numSamples :: Int
numSamples = 150

sampleSizes :: [Int]
sampleSizes = [1..numSamples]

-- Smallish bytestring inputs: 150 entries. Note that the length of a
-- bytestring is eight times the size.
smallerByteStrings150 :: H.Seed -> [BS.ByteString]
smallerByteStrings150 seed = makeSizedByteStrings seed [1..150]
makeSample :: H.Seed -> [BS.ByteString]
makeSample seed = makeSizedByteStrings seed sampleSizes

-- Make an integer of size n which encodes to 0xFF...FF
allFF :: Int -> Integer
allFF n = 256^(8*n) - 1

------------------------- ByteStringToInteger -------------------------

{- Experiments show that the times for big-endian and little-endian conversions
are very similar, with big-endian conversion perhaps taking a fraction
longer. We just generate a costing function for big-endian conversion and
use that for the little-endian conversion as well. A quadratic function
fitted to inputs of size up to 150 gives a good fit and extrapolates well to
larger inputs. -}
repunitOfSize :: Int -> Integer
repunitOfSize n = 256^(8*n) - 1

-- Calculate the index of the top (ie, rightmost) bit in a bytestring.
topBitIndex :: BS.ByteString -> Integer
topBitIndex s = fromIntegral $ 8*(BS.length s)-1

memoryUsageAsNumBytes :: ExMemoryUsage a => a -> Integer
memoryUsageAsNumBytes = (8*) . fromSatInt . sumCostStream . flattenCostRose . memoryUsage
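As an editorial sanity check of the two helpers above (the arithmetic here is spelled out rather than taken from the PR): repunitOfSize n is the largest integer that fits in 8*n bytes, so its encoded width, and hence memoryUsageAsNumBytes (assuming the Integer memory-usage instance counts 64-bit words), should be exactly 8*n.

-- Editorial sketch, not part of the PR: the number of bytes needed to encode a
-- positive integer, computed directly by repeated division by 256.
bytesNeededFor :: Integer -> Int
bytesNeededFor k = length $ takeWhile (> 0) $ iterate (`div` 256) k
-- Expected: bytesNeededFor (repunitOfSize 1) == 8
--           bytesNeededFor (repunitOfSize 150) == 1200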

{- Experiments show that the times for big-endian and little-endian
`byteStringToInteger` conversions are very similar, with big-endian
conversion perhaps taking a fraction longer. We just generate a costing
function for big-endian conversion and use that for the little-endian
conversion as well. A quadratic function fitted to inputs of size up to 150
gives a good fit and extrapolates well to larger inputs. -}
benchByteStringToInteger :: Benchmark
benchByteStringToInteger = createTwoTermBuiltinBenchElementwise ByteStringToInteger []
(repeat True) (smallerByteStrings150 seedA)


------------------------- IntegerToByteString -------------------------
benchByteStringToInteger =
createTwoTermBuiltinBenchElementwise ByteStringToInteger [] $ fmap (\x -> (True,x)) (makeSample seedA)

{- We have four possibilities for integer to bytestring conversions: they can be
big- or little-endian, and they can also be of bounded or unbounded width.
@@ -53,31 +68,171 @@ benchByteStringToInteger = createTwoTermBuiltinBenchElementwise ByteStringToInt
a single function call to generate the padding and experiments show that the
time required for this is negligible in comparison to the conversion time.
It's important to make sure that the memory cost does take account of the width
though. -}

-- Make sure that the input integer really does require the full width so that
-- the conversion does the maximum amount of work.
though. The sample we use gives us bytestrings up to 8*150 = 1200 bytes long.
This is well within the 8192-byte limit. -}
benchIntegerToByteString :: Benchmark
benchIntegerToByteString =
let b = IntegerToByteString
widths = [1..150]
inputs = fmap allFF widths
-- This is like createThreeTermBuiltinBenchElementwise, but we want to
-- make sure that the width appears literally in the benchmark name.
createBench l =
let mkOneBM (e, width, n) =
-- Widths are in words: we need to convert those to widths in bytes for the implementation
let width' = 8 * fromIntegral width
in bgroup (showMemoryUsage e) [
bgroup (showMemoryUsage (LiteralByteSize width')) [mkBM e width' n]
]
where mkBM x y z = benchDefault (showMemoryUsage z) $ mkApp3 b [] x y z
in bgroup (show b) $ fmap mkOneBM l

in createBench $ zip3 (repeat True) widths inputs
inputs = fmap repunitOfSize sampleSizes
-- The minimum width of bytestring needed to fit the inputs into.
widthsInBytes = fmap memoryUsageAsNumBytes inputs
in createThreeTermBuiltinBenchElementwiseWithWrappers
(id, NumBytesCostedAsNumWords, id) b [] $
zip3 (repeat True) widthsInBytes inputs
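For orientation, an editorial illustration of the first few (flag, width, input) triples produced by the zip3 above, assuming the Integer memory-usage instance counts 64-bit words:

-- Editorial sketch reusing the definitions above; not part of the benchmark code.
exampleTriples :: [(Bool, Integer, Integer)]
exampleTriples =
  let ins = fmap repunitOfSize sampleSizes
  in take 3 $ zip3 (repeat True) (fmap memoryUsageAsNumBytes ins) ins
-- Expected: [(True, 8, repunitOfSize 1), (True, 16, repunitOfSize 2), (True, 24, repunitOfSize 3)]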

{- For `andByteString` with different-sized inputs, calling it with extension
semantics (ie, first argument=True) takes up to about 5% longer than with
truncation semantics for small arguments and up to about 15% for larger inputs.
Fitting t~min(x,y) gives a reasonable prediction for small values of x and y but
this doesn't extend too well to larger values. There are two factors in play:
with extension semantics there's less copying work to do but more allocation work
(which is a lot cheaper). If we fit a model of the form t~pmin(x,y) then this
accounts for the copying but not the allocation. If we add a factor for the
allocation as well (t ~ pmin(x,y) + abs(x-y)) then we get a model that extends well to
larger data. Equivalently we can fit t~x+y to the data for y<=x, but then we'd
have to swap the inputs for y>x.

A model for t~x+y does a good job though: we get within +/-5% for the small data
and -20% to +5% for big data. We could also try fitting t=a+bx along x=y for the
small data and then extrapolate that to a/2 + (b/2)(x+y) elsewhere.

We assume that the costs of `orByteString` and `xorByteString` are the same as
those of `andByteString` and re-use the `andByteString` costing function for
those.
-}
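Written out explicitly, the two candidate model shapes discussed above look like the following (an editorial sketch; a, b and c are placeholder coefficients, not fitted values):

-- Editorial sketch of the candidate andByteString cost models for argument
-- sizes x and y: a min(x,y) term for the AND-ed overlap plus an abs(x-y) term
-- for the extension, and the simpler t ~ x+y form also mentioned above.
modelMinPlusDiff :: Double -> Double -> Double -> Double -> Double -> Double
modelMinPlusDiff a b c x y = a + b * min x y + c * abs (x - y)

modelSum :: Double -> Double -> Double -> Double -> Double
modelSum a b x y = a + b * (x + y)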
benchAndByteString :: Benchmark
benchAndByteString =
let inputSizes = fmap (20*) [1..25] -- 20..400: 625 cases, which should take an hour or so.
xs = makeSizedByteStrings seedA inputSizes
ys = makeSizedByteStrings seedB inputSizes
in createTwoTermBuiltinBenchWithFlag AndByteString [] True xs ys
-- This requires a special case in the costing code because we don't include
-- the first argument (the flag).

{- For `complementByteString`, the time taken is linear in the length. A model
based on small input sizes extrapolates well to results for large inputs -}
benchComplementByteString :: Benchmark
benchComplementByteString =
let xs = makeSample seedA
in createOneTermBuiltinBench ComplementByteString [] xs

{- `readBit` is pretty much constant time regardless of input size and the position of
the bit to be read. -}
benchReadBit :: Benchmark
benchReadBit =
let xs = makeSample seedA
in createTwoTermBuiltinBenchElementwise ReadBit [] $ pairWith topBitIndex xs

{- Benchmarks show that the time taken by `writeBits` depends mostly on the size
of the list of updates, although it may take a little longer to write bits
with larger indices. We run benchmarks involving increasing numbers of
updates to 1024-byte bytestrings, always writing the highest-indexed bit to
take account of this. We use a fresh bytestring for each set of updates.
-}
benchWriteBits :: Benchmark
benchWriteBits =
let fun = WriteBits
size = 128 -- Size 128 corresponds to a bytestring length of 1024 bytes.
xs = makeSizedByteStrings seedA $ take numSamples $ repeat size
l = zip xs [1..numSamples]
-- Given a bytestring s and an integer k, return a pair (s,u) where u is a
-- list of updates which write the highest bit in s 10*k times. Here k
-- will range from 1 to numSamples, which is 150.
mkUpdatesFor (s,k) =
let topIndex = topBitIndex s
updates = take (10*k) $ cycle [(topIndex, False), (topIndex, True)]
in (s, updates)
inputs = fmap mkUpdatesFor l
mkBM x y = benchDefault (showMemoryUsage (ListCostedByLength y)) $ mkApp2 fun [] x y
in bgroup (show fun) $ fmap (\(s,u) -> bgroup (showMemoryUsage s) $ [mkBM s u]) inputs
{- This is like createTwoTermBuiltinBenchElementwise except that the benchmark name contains
the length of the list of updates, not the memory usage. The denotation of WriteBits
in Default.Builtins must wrap its second argument in ListCostedByLength to make sure
that the correct ExMemoryUsage instance is called for costing. -}
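To make the generated inputs concrete (an editorial illustration using the definitions above): for a 1024-byte bytestring the top bit index is 8*1024 - 1 = 8191, so for k = 1 the update list consists of ten writes to that single bit:

-- Editorial illustration only; not part of the benchmark code.
exampleUpdates :: [(Integer, Bool)]
exampleUpdates = take 10 $ cycle [(8191, False), (8191, True)]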

{- For small inputs `replicateByte` looks constant-time. For larger inputs it's
linear. We're limiting the output to 8192 bytes (size 1024), so we may as
well test the whole legal range. NB: if we change the value of
integerToByteStringMaximumOutputLength then we probably need to change the
limits here too.
-}
benchReplicateByte :: Benchmark
benchReplicateByte =
let numCases = 128 :: Int
xs = fmap (fromIntegral . (64*)) [1..numCases] :: [Integer]
-- ^ This gives us replication counts up to 64*128 = 8192, the maximum allowed.
inputs = pairWith (const (0xFF::Integer)) xs
in createTwoTermBuiltinBenchElementwiseWithWrappers
(NumBytesCostedAsNumWords, id) ReplicateByte [] inputs

{- Benchmarks with varying sizes of bytestrings and varying amounts of shifting
show that the execution time of `shiftByteString` depends linearly on the
length of the bytestring and (to a much smaller degree) the size of the
shift, except that shifts which involve shifting bits between bytes are
significantly more expensive than shifts by a whole number of bytes. For
bytestrings of size 50 the ratio between the former and the latter is about
1.5 and for size 400 it's about 4. We could add a special case for costing
whole-byte shifts, but for the time being we run benchmarks for a single-bit
shift and fit a linear model to the time taken versus the length of the
bytestring. This gives a model which is very accurate for small shifts and
overestimates times for large shifts by maybe 4% or so. A model fitted to
smaller data extrapolates very well to larger data.
-}
benchShiftByteString :: Benchmark
benchShiftByteString =
let xs = makeSample seedA
inputs = pairWith (const 1) xs
in createTwoTermBuiltinBenchElementwiseWithWrappers
(id, IntegerCostedLiterally) ShiftByteString [] inputs

{- The behaviour of `rotateByteString` is very similar to that of
`shiftByteString` except that the time taken depends pretty much linearly on
the length of the bytestring and the effect of the size of the rotation is
negligible. We could add a special case for costing whole-byte rotations,
but for the time being we run benchmarks for a single-bit shift and fit a
straight line to the time taken. A model fitted to smaller data extrapolates
well to larger data.
-}
benchRotateBytestring :: Benchmark
benchRotateBytestring =
let xs = makeSample seedA
inputs = pairWith (const 1) xs
in createTwoTermBuiltinBenchElementwiseWithWrappers
(id, IntegerCostedLiterally) RotateByteString [] inputs
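As the two comments above note, both shiftByteString and rotateByteString end up being costed with the same shape: a straight line in the bytestring length, with the shift or rotation amount contributing very little. Written out (editorial sketch; a and b are placeholders, the real coefficients come from fitting the benchmark results in the costing pipeline):

-- Editorial sketch: predicted time as a function of the bytestring length only.
linearShiftModel :: Double -> Double -> Double -> Double
linearShiftModel a b len = a + b * len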

{- For `countSetBits`, the time taken is linear in the length. A model based on
small input sizes (up to 1280 bytes) extrapolates well to results for large
inputs (up to 12800 bytes). Counting the bits in an all-0xFF bytestring may
take 1% or so longer than for an all-0x00 bytestring. -}
benchCountSetBits :: Benchmark
benchCountSetBits =
let xs = fmap (\n -> BS.replicate (8*n) 0xFF) sampleSizes -- length 8, 16, ..., 1200
in createOneTermBuiltinBench CountSetBits [] xs
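For reference, a plain-Haskell sketch of what countSetBits computes (an editorial sketch, not the Plutus implementation; it assumes an extra import Data.Bits (popCount) alongside the qualified Data.ByteString import above), which also makes the linear cost obvious:

-- Editorial reference: the total number of set bits is the sum of the
-- per-byte popcounts, so the work is proportional to the length.
countSetBitsRef :: BS.ByteString -> Int
countSetBitsRef = BS.foldl' (\acc w -> acc + popCount w) 0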

{- For `findFirstSetBit` the time taken is pretty much linear in the length, with
occasional bumps. Unsurprisingly the function takes longest for an all-0x00
bytestring because it has to examine every byte in that case. For costing we
use 0x8000...00 just to avoid the all-zeros case in case someone attempts to
optimise for that case at some time in the future. For small data the worst
case takes up to 8% longer than the best case (0x00..01) and for large data
it can take up to 40% longer. A model based on small input sizes extrapolates
well to results for large inputs. -}
benchFindFirstSetBit :: Benchmark
benchFindFirstSetBit =
let xs = fmap (\n -> BS.cons 0x80 (BS.replicate (8*n-1) 0x00)) sampleSizes
in createOneTermBuiltinBench FindFirstSetBit [] xs

makeBenchmarks :: [Benchmark]
makeBenchmarks =
[ benchByteStringToInteger
, benchIntegerToByteString
[ benchIntegerToByteString
, benchByteStringToInteger
, benchAndByteString
, benchComplementByteString
, benchReadBit
, benchWriteBits
, benchReplicateByte
, benchShiftByteString
, benchRotateBytestring
, benchCountSetBits
, benchFindFirstSetBit
]