Kwxm/costing/bitwise 4 #6301

Merged
merged 24 commits on Jul 24, 2024
Changes from 5 commits
12 changes: 6 additions & 6 deletions plutus-benchmark/bls12-381-costs/bench/Bench.hs
@@ -17,27 +17,27 @@ import Data.ByteString qualified as BS (empty)

benchHashAndAddG1 :: EvaluationContext -> Integer -> Benchmark
benchHashAndAddG1 ctx n =
let prog = mkHashAndAddG1Script (listOfSizedByteStrings n 4)
let prog = mkHashAndAddG1Script (listOfByteStringsOfLength n 4)
Contributor: Great rename by the way.

in bench (show n) $ benchProgramCek ctx prog

benchHashAndAddG2 :: EvaluationContext -> Integer -> Benchmark
benchHashAndAddG2 ctx n =
let prog = mkHashAndAddG2Script (listOfSizedByteStrings n 4)
let prog = mkHashAndAddG2Script (listOfByteStringsOfLength n 4)
in bench (show n) $ benchProgramCek ctx prog

benchUncompressAndAddG1 :: EvaluationContext -> Integer -> Benchmark
benchUncompressAndAddG1 ctx n =
let prog = mkUncompressAndAddG1Script (listOfSizedByteStrings n 4)
let prog = mkUncompressAndAddG1Script (listOfByteStringsOfLength n 4)
in bench (show n) $ benchProgramCek ctx prog

benchUncompressAndAddG2 :: EvaluationContext -> Integer -> Benchmark
benchUncompressAndAddG2 ctx n =
let prog = mkUncompressAndAddG2Script (listOfSizedByteStrings n 4)
let prog = mkUncompressAndAddG2Script (listOfByteStringsOfLength n 4)
in bench (show n) $ benchProgramCek ctx prog

benchPairing :: EvaluationContext -> Benchmark
benchPairing ctx =
case listOfSizedByteStrings 4 4 of
case listOfByteStringsOfLength 4 4 of
[b1, b2, b3, b4] ->
let emptyDst = Tx.toBuiltin BS.empty
p1 = Tx.bls12_381_G1_hashToGroup (Tx.toBuiltin b1) emptyDst
@@ -46,7 +46,7 @@ benchPairing ctx =
q2 = Tx.bls12_381_G2_hashToGroup (Tx.toBuiltin b4) emptyDst
prog = mkPairingScript p1 p2 q1 q2
in bench "pairing" $ benchProgramCek ctx prog
_ -> error "Unexpected list returned by listOfSizedByteStrings"
_ -> error "Unexpected list returned by listOfByteStringsOfLength"

benchGroth16Verify :: EvaluationContext -> Benchmark
benchGroth16Verify ctx = bench "groth16Verify" $ benchProgramCek ctx mkGroth16VerifyScript
@@ -21,22 +21,22 @@ import Prelude (IO, mapM_)

printCosts_HashAndAddG1 :: Handle -> Integer -> IO ()
printCosts_HashAndAddG1 h n =
let script = mkHashAndAddG1Script (listOfSizedByteStrings n 4)
let script = mkHashAndAddG1Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_HashAndAddG2 :: Handle -> Integer -> IO ()
printCosts_HashAndAddG2 h n =
let script = mkHashAndAddG2Script (listOfSizedByteStrings n 4)
let script = mkHashAndAddG2Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_UncompressAndAddG1 :: Handle -> Integer -> IO ()
printCosts_UncompressAndAddG1 h n =
let script = mkUncompressAndAddG1Script (listOfSizedByteStrings n 4)
let script = mkUncompressAndAddG1Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_UncompressAndAddG2 :: Handle -> Integer -> IO ()
printCosts_UncompressAndAddG2 h n =
let script = mkUncompressAndAddG2Script (listOfSizedByteStrings n 4)
let script = mkUncompressAndAddG2Script (listOfByteStringsOfLength n 4)
in printSizeStatistics h (TestSize n) script

printCosts_Pairing :: Handle -> IO ()
@@ -16,7 +16,7 @@
-}
module PlutusBenchmark.BLS12_381.Scripts
( checkGroth16Verify_Haskell
, listOfSizedByteStrings
, listOfByteStringsOfLength
, mkGroth16VerifyScript
, mkHashAndAddG1Script
, mkHashAndAddG2Script
Expand Down Expand Up @@ -61,9 +61,9 @@ import System.IO.Unsafe (unsafePerformIO)
import Prelude (fromIntegral)

-- Create a list containing n bytestrings of length l. This could be better.
{-# NOINLINE listOfSizedByteStrings #-}
listOfSizedByteStrings :: Integer -> Integer -> [ByteString]
listOfSizedByteStrings n l = unsafePerformIO . G.sample $
{-# NOINLINE listOfByteStringsOfLength #-}
listOfByteStringsOfLength :: Integer -> Integer -> [ByteString]
listOfByteStringsOfLength n l = unsafePerformIO . G.sample $
G.list (R.singleton $ fromIntegral n)
(G.bytes (R.singleton $ fromIntegral l))
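As a point of reference, a hedged usage sketch of the renamed generator (the call and the sizes here are illustrative only, not taken from this PR):

-- Editorial sketch: ten pseudo-random 4-byte bytestrings, the same shape of
-- call that the BLS12-381 benchmarks above make via listOfByteStringsOfLength n 4.
exampleSeeds :: [ByteString]
exampleSeeds = listOfByteStringsOfLength 10 4
-- length exampleSeeds == 10, and each element has length 4.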

@@ -71,9 +71,9 @@ builtinHash :: BuiltinHashFun
builtinHash = Tx.sha2_256

-- Create a list containing n bytestrings of length l. This could be better.
{-# NOINLINE listOfSizedByteStrings #-}
listOfSizedByteStrings :: Integer -> Integer -> [ByteString]
listOfSizedByteStrings n l = unsafePerformIO . G.sample $
{-# NOINLINE listOfByteStringsOfLength #-}
listOfByteStringsOfLength :: Integer -> Integer -> [ByteString]
listOfByteStringsOfLength n l = unsafePerformIO . G.sample $
G.list (R.singleton $ fromIntegral n)
(G.bytes (R.singleton $ fromIntegral l))

@@ -94,7 +94,7 @@ mkInputs :: forall v msg .
mkInputs n toMsg hash =
Inputs $ map mkOneInput (zip seeds1 seeds2)
where seedSize = 128
(seeds1, seeds2) = splitAt n $ listOfSizedByteStrings (2*n) seedSize
(seeds1, seeds2) = splitAt n $ listOfByteStringsOfLength (2*n) seedSize
-- ^ Seeds for key generation. For some algorithms the seed has to be
-- a certain minimal size and there's a SeedBytesExhausted error if
-- it's not big enough; 128 is big enough for everything here though.
@@ -0,0 +1,6 @@
<!--
Contributor: You probably want to remove the <!-- and --> parts.

### Added

- Added costing for the new bitwise builtins (see CIP-0058), which will probably become available at the Chang+1 HF.

-->
243 changes: 199 additions & 44 deletions plutus-core/cost-model/budgeting-bench/Benchmarks/Bitwise.hs
@@ -1,45 +1,60 @@
-- editorconfig-checker-disable-file

{-# LANGUAGE TypeOperators #-}

module Benchmarks.Bitwise (makeBenchmarks) where

import Common
import Generators

import PlutusCore
import PlutusCore.Evaluation.Machine.ExMemoryUsage
import PlutusCore.Evaluation.Machine.CostStream (sumCostStream)
import PlutusCore.Evaluation.Machine.ExMemoryUsage (ExMemoryUsage, IntegerCostedLiterally (..),
ListCostedByLength (..),
NumBytesCostedAsNumWords (..), flattenCostRose,
memoryUsage)

import Criterion.Main
import Data.ByteString qualified as BS
import Data.SatInt (fromSatInt)
import Hedgehog qualified as H

{- | Costing benchmarks for bitwise bytestring builtins and integer/bytestring conversions. -}

{- Most of the initial exploratory benchmarks were run with a set of small input
bytestrings (up to size 160 / 1280 bytes) and then again with a set of large
inputs (up to size 1600 / 12800 bytes). In the final budgeting benchmarks we
mostly go up to size 150 (= 1200 bytes).
-}

---------------- ByteString builtins ----------------
numSamples :: Int
numSamples = 150

sampleSizes :: [Int]
sampleSizes = [1..numSamples]

-- Smallish bytestring inputs: 150 entries. Note that the length of a
-- bytestring is eight times the size.
smallerByteStrings150 :: H.Seed -> [BS.ByteString]
smallerByteStrings150 seed = makeSizedByteStrings seed [1..150]
makeSample :: H.Seed -> [BS.ByteString]
makeSample seed = makeSizedByteStrings seed sampleSizes

-- Make an integer of size n which encodes to 0xFF...FF
allFF :: Int -> Integer
allFF n = 256^(8*n) - 1

------------------------- ByteStringToInteger -------------------------

{- Experiments show that the times for big-endian and little-endian conversions
are very similar, with big-endian conversion perhaps taking a fraction
longer. We just generate a costing function for big-endian conversion and
use that for the little-endian conversion as well. A quadratic function
fitted to inputs of size up to 150 gives a good fit and extrapolates well to
larger inputs. -}
repunitOfSize :: Int -> Integer
repunitOfSize n = 256^(8*n) - 1

-- Calculate the index of the top (ie, rightmost) bit in a bytestring.
topBitIndex :: BS.ByteString -> Integer
topBitIndex s = fromIntegral $ 8*(BS.length s)-1

memoryUsageAsNumBytes :: ExMemoryUsage a => a -> Integer
memoryUsageAsNumBytes = (8*) . fromSatInt . sumCostStream . flattenCostRose . memoryUsage
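As an editorial sanity check of the two helpers above (the arithmetic here is spelled out rather than taken from the PR): repunitOfSize n is the largest integer that fits in 8*n bytes, so its encoded width, and hence memoryUsageAsNumBytes (assuming the Integer memory-usage instance counts 64-bit words), should be exactly 8*n.

-- Editorial sketch, not part of the PR: the number of bytes needed to encode a
-- positive integer, computed directly by repeated division by 256.
bytesNeededFor :: Integer -> Int
bytesNeededFor k = length $ takeWhile (> 0) $ iterate (`div` 256) k
-- Expected: bytesNeededFor (repunitOfSize 1) == 8
--           bytesNeededFor (repunitOfSize 150) == 1200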

{- Experiments show that the times for big-endian and little-endian
`byteStringToInteger` conversions are very similar, with big-endian
conversion perhaps taking a fraction longer. We just generate a costing
function for big-endian conversion and use that for the little-endian
conversion as well. A quadratic function fitted to inputs of size up to 150
gives a good fit and extrapolates well to larger inputs. -}
benchByteStringToInteger :: Benchmark
benchByteStringToInteger = createTwoTermBuiltinBenchElementwise ByteStringToInteger []
(repeat True) (smallerByteStrings150 seedA)


------------------------- IntegerToByteString -------------------------
benchByteStringToInteger =
createTwoTermBuiltinBenchElementwise ByteStringToInteger [] $ fmap (\x -> (True,x)) (makeSample seedA)

{- We have four possibilities for integer to bytestring conversions: they can be
big- or little-endian, and they can also be of bounded or unbounded width.
@@ -53,31 +68,171 @@ benchByteStringToInteger = createTwoTermBuiltinBenchElementwise ByteStringToInt
a single function call to generate the padding and experiments show that the
time required for this is negligible in comparison to the conversion time.
It's important to make sure that the memory cost does take account of the width
though. -}

-- Make sure that the input integer really does require the full width so that
-- the conversion does the maximum amount of work.
though. The sample we use gives us bytestrings up to 8*150 = 1200 bytes long.
This is well within the 8192-byte limit. -}
benchIntegerToByteString :: Benchmark
benchIntegerToByteString =
let b = IntegerToByteString
widths = [1..150]
inputs = fmap allFF widths
-- This is like createThreeTermBuiltinBenchElementwise, but we want to
-- make sure that the width appears literally in the benchmark name.
createBench l =
let mkOneBM (e, width, n) =
-- Widths are in words: we need to convert those to widths in bytes for the implementation
let width' = 8 * fromIntegral width
in bgroup (showMemoryUsage e) [
bgroup (showMemoryUsage (LiteralByteSize width')) [mkBM e width' n]
]
where mkBM x y z = benchDefault (showMemoryUsage z) $ mkApp3 b [] x y z
in bgroup (show b) $ fmap mkOneBM l

in createBench $ zip3 (repeat True) widths inputs
inputs = fmap repunitOfSize sampleSizes
-- The minimum width of bytestring needed to fit the inputs into.
widthsInBytes = fmap memoryUsageAsNumBytes inputs
in createThreeTermBuiltinBenchElementwiseWithWrappers
(id, NumBytesCostedAsNumWords, id) b [] $
zip3 (repeat True) widthsInBytes inputs
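For orientation, an editorial illustration of the first few (flag, width, input) triples produced by the zip3 above, assuming the Integer memory-usage instance counts 64-bit words:

-- Editorial sketch reusing the definitions above; not part of the benchmark code.
exampleTriples :: [(Bool, Integer, Integer)]
exampleTriples =
  let ins = fmap repunitOfSize sampleSizes
  in take 3 $ zip3 (repeat True) (fmap memoryUsageAsNumBytes ins) ins
-- Expected: [(True, 8, repunitOfSize 1), (True, 16, repunitOfSize 2), (True, 24, repunitOfSize 3)]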

{- For `andByteString` with different-sized inputs, calling it with extension
semantics (ie, first argument=True) takes up to about 5% longer than with
truncation semantics for small arguments and up to about 15% for larger inputs.
Fitting t~min(x,y) gives a reasonable prediction for small values of x and y but
this doesn't extend too well to larger values. There are two factors in play:
with extension semantics there's less copying work to do but more allocation work
(which is a lot cheaper). If we fit a model of the form t~pmin(x,y) then this
accounts for the copying but not the allocation. If we add a factor for the
allocation as well (t ~ pmin(x,y) + abs(x-y)) then we get a model that extends well to
larger data. Equivalently we can fit t~x+y to the data for y<=x, but then we'd
have to swap the inputs for y>x.

A model for t~x+y does a good job though: we get within +/-5% for the small data
and -20% to +5% for big data. We could also try fitting t=a+bx along x=y for the
small data and then extrapolate that to a/2 + (b/2)(x+y) elsewhere.

We assume that the costs of `orByteString` and `xorByteString` are the same as
those of `andByteString` and re-use the `andByteString` costing function for
those.
-}
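Written out explicitly, the two candidate model shapes discussed above look like the following (an editorial sketch; a, b and c are placeholder coefficients, not fitted values):

-- Editorial sketch of the candidate andByteString cost models for argument
-- sizes x and y: a min(x,y) term for the AND-ed overlap plus an abs(x-y) term
-- for the extension, and the simpler t ~ x+y form also mentioned above.
modelMinPlusDiff :: Double -> Double -> Double -> Double -> Double -> Double
modelMinPlusDiff a b c x y = a + b * min x y + c * abs (x - y)

modelSum :: Double -> Double -> Double -> Double -> Double
modelSum a b x y = a + b * (x + y)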
benchAndByteString :: Benchmark
benchAndByteString =
let inputSizes = fmap (20*) [1..25] -- 20..400: 625 cases, which should take an hour or so.
xs = makeSizedByteStrings seedA inputSizes
ys = makeSizedByteStrings seedB inputSizes
in createTwoTermBuiltinBenchWithFlag AndByteString [] True xs ys
-- This requires a special case in the costing code because we don't include
-- the first argument (the flag).

{- For `complementByteString`, the time taken is linear in the length. A model
based on small input sizes extrapolates well to results for large inputs -}
benchComplementByteString :: Benchmark
benchComplementByteString =
let xs = makeSample seedA
in createOneTermBuiltinBench ComplementByteString [] xs

{- `readBit` is pretty much constant time regardless of input size and the position of
the bit to be read. -}
benchReadBit :: Benchmark
benchReadBit =
let xs = makeSample seedA
in createTwoTermBuiltinBenchElementwise ReadBit [] $ pairWith topBitIndex xs

{- Benchmarks show that the time taken by `writeBits` depends mostly on the size
of the list of updates, although it may take a little longer to write bits
with larger indices. We run benchmarks involving increasing numbers of
updates to 1024-byte bytestrings, always writing the highest-indexed bit to
take account of this. We use a fresh bytestring for each set of updates.
-}
benchWriteBits :: Benchmark
benchWriteBits =
let fun = WriteBits
size = 128 -- Size 128 corresponds to a bytestring length of 1024 bytes.
xs = makeSizedByteStrings seedA $ take numSamples $ repeat size
l = zip xs [1..numSamples]
-- Given a bytestring s and an integer k, return a pair (s,u) where u is a
-- list of updates which write the highest bit in s 10*k times. Here k
-- will range from 1 to numSamples, which is 150.
mkUpdatesFor (s,k) =
let topIndex = topBitIndex s
updates = take (10*k) $ cycle [(topIndex, False), (topIndex, True)]
in (s, updates)
inputs = fmap mkUpdatesFor l
mkBM x y = benchDefault (showMemoryUsage (ListCostedByLength y)) $ mkApp2 fun [] x y
in bgroup (show fun) $ fmap (\(s,u) -> bgroup (showMemoryUsage s) $ [mkBM s u]) inputs
{- This is like createTwoTermBuiltinBenchElementwise except that the benchmark name contains
the length of the list of updates, not the memory usage. The denotation of WriteBits
in Default.Builtins must wrap its second argument in ListCostedByLength to make sure
that the correct ExMemoryUsage instance is called for costing. -}
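To make the generated inputs concrete (an editorial illustration using the definitions above): for a 1024-byte bytestring the top bit index is 8*1024 - 1 = 8191, so for k = 1 the update list consists of ten writes to that single bit:

-- Editorial illustration only; not part of the benchmark code.
exampleUpdates :: [(Integer, Bool)]
exampleUpdates = take 10 $ cycle [(8191, False), (8191, True)]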

{- For small inputs `replicateByte` looks constant-time. For larger inputs it's
linear. We're limiting the output to 8192 bytes (size 1024), so we may as
well test the whole legal range. NB: if we change the value of
integerToByteStringMaximumOutputLength then we probably need to change the
limits here too.
-}
benchReplicateByte :: Benchmark
benchReplicateByte =
let numCases = 128 :: Int
xs = fmap (fromIntegral . (64*)) [1..numCases] :: [Integer]
-- ^ This gives us replication counts up to 64*128 = 8192, the maximum allowed.
inputs = pairWith (const (0xFF::Integer)) xs
in createTwoTermBuiltinBenchElementwiseWithWrappers
(NumBytesCostedAsNumWords, id) ReplicateByte [] inputs

{- Benchmarks with varying sizes of bytestrings and varying amounts of shifting
show that the execution time of `shiftByteString` depends linearly on the
length of the bytestring and (to a much smaller degree) the size of the
shift, except that shifts which involve shifting bits between bytes are
significantly more expensive than shifts by a whole number of bytes. For
bytestrings of size 50 the ratio between the former and the latter is about
1.5 and for size 400 it's about 4. We could add a special case for costing
whole-byte shifts, but for the time being we run benchmarks for a single-bit
shift and fit a linear model to the time taken versus the length of the
bytestring. This gives a model which is very accurate for small shifts and
overestimates times for large shifts by maybe 4% or so. A model fitted to
smaller data extrapolates very well to larger data.
-}
benchShiftByteString :: Benchmark
benchShiftByteString =
let xs = makeSample seedA
inputs = pairWith (const 1) xs
in createTwoTermBuiltinBenchElementwiseWithWrappers
(id, IntegerCostedLiterally) ShiftByteString [] inputs

{- The behaviour of `rotateByteString` is very similar to that of
`shiftByteString` except that the time taken depends pretty much linearly on
the length of the bytestring and the effect of the size of the rotation is
negligible. We could add a special case for costing whole-byte rotations,
but for the time being we run benchmarks for a single-bit shift and fit a
straight line to the time taken. A model fitted to smaller data extrapolates
well to larger data.
-}
benchRotateBytestring :: Benchmark
benchRotateBytestring =
let xs = makeSample seedA
inputs = pairWith (const 1) xs
in createTwoTermBuiltinBenchElementwiseWithWrappers
(id, IntegerCostedLiterally) RotateByteString [] inputs
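As the two comments above note, both shiftByteString and rotateByteString end up being costed with the same shape: a straight line in the bytestring length, with the shift or rotation amount contributing very little. Written out (editorial sketch; a and b are placeholders, the real coefficients come from fitting the benchmark results in the costing pipeline):

-- Editorial sketch: predicted time as a function of the bytestring length only.
linearShiftModel :: Double -> Double -> Double -> Double
linearShiftModel a b len = a + b * len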

{- For `countSetBits`, the time taken is linear in the length. A model based on
small input sizes (up to 1280 bytes) extrapolates well to results for large
inputs (up to 12800 bytes). Counting the bits in an all-0xFF bytestring may
take 1% or so longer than for an all-0x00 bytestring. -}
benchCountSetBits :: Benchmark
benchCountSetBits =
let xs = fmap (\n -> BS.replicate (8*n) 0xFF) sampleSizes -- length 8, 16, ..., 1200
in createOneTermBuiltinBench CountSetBits [] xs
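For reference, a plain-Haskell sketch of what countSetBits computes (an editorial sketch, not the Plutus implementation; it assumes an extra import Data.Bits (popCount) alongside the qualified Data.ByteString import above), which also makes the linear cost obvious:

-- Editorial reference: the total number of set bits is the sum of the
-- per-byte popcounts, so the work is proportional to the length.
countSetBitsRef :: BS.ByteString -> Int
countSetBitsRef = BS.foldl' (\acc w -> acc + popCount w) 0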

{- For `findFirstSetBit` the time taken is pretty much linear in the length, with
occasional bumps. Unsurprisingly the function takes longest for an all-0x00
bytestring because it has to examine every byte in that case. For costing we
use 0x8000...00 just to avoid the all-zeros case in case someone attempts to
optimise for that case at some time in the future. For small data the worst
case takes up to 8% longer than the best case (0x00..01) and for large data
it can take up to 40% longer. A model based on small input sizes extrapolates
well to results for large inputs. -}
benchFindFirstSetBit :: Benchmark
benchFindFirstSetBit =
let xs = fmap (\n -> BS.cons 0x80 (BS.replicate (8*n-1) 0x00)) sampleSizes
in createOneTermBuiltinBench FindFirstSetBit [] xs

makeBenchmarks :: [Benchmark]
makeBenchmarks =
[ benchByteStringToInteger
, benchIntegerToByteString
[ benchIntegerToByteString
, benchByteStringToInteger
, benchAndByteString
, benchComplementByteString
, benchReadBit
, benchWriteBits
, benchReplicateByte
, benchShiftByteString
, benchRotateBytestring
, benchCountSetBits
, benchFindFirstSetBit
]