diff --git a/Cargo.lock b/Cargo.lock index dad95ed252..e3fd286e47 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3002,6 +3002,18 @@ dependencies = [ "tracer", ] +[[package]] +name = "jolt-inlines-poseidon2-goldilocks" +version = "0.1.0" +dependencies = [ + "jolt-inlines-sdk", + "p3-field", + "p3-goldilocks", + "p3-poseidon2", + "p3-symmetric", + "tracer", +] + [[package]] name = "jolt-inlines-sdk" version = "0.1.0" @@ -3909,6 +3921,175 @@ dependencies = [ "jolt-sdk", ] +[[package]] +name = "p3-challenger" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8972ccd1d5dc90e46cdb1f2ab4ee2bae49b3917e5e98aa533f0c2b779c010445" +dependencies = [ + "p3-field", + "p3-maybe-rayon", + "p3-monty-31", + "p3-symmetric", + "p3-util", + "tracing", +] + +[[package]] +name = "p3-dft" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17771aca44632f9cc11f2718d7ea7ec06794946c4190ef3a985bfc893f14c18a" +dependencies = [ + "itertools 0.14.0", + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-util", + "spin 0.10.0", + "tracing", +] + +[[package]] +name = "p3-field" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f3eb24d0591fd4d282d89cbe4e4efba5571c699375006f80b2cbf53ce83461c" +dependencies = [ + "itertools 0.14.0", + "num-bigint", + "p3-maybe-rayon", + "p3-util", + "paste", + "rand 0.10.1", + "serde", + "tracing", +] + +[[package]] +name = "p3-goldilocks" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5751c6591a0d2397d726620c2c29a7436ec6c5e19d2ed74ca5d078d4fbb18eb5" +dependencies = [ + "num-bigint", + "p3-challenger", + "p3-dft", + "p3-field", + "p3-mds", + "p3-poseidon1", + "p3-poseidon2", + "p3-symmetric", + "p3-util", + "paste", + "rand 0.10.1", + "serde", +] + +[[package]] +name = "p3-matrix" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ea9c94c0714944e7b8a9a62e6340b1e3e1d3f8ecfd3e35c08798360200e73eff" +dependencies = [ + "itertools 0.14.0", + "p3-field", + "p3-maybe-rayon", + "p3-util", + "rand 0.10.1", + "serde", + "tracing", +] + +[[package]] +name = "p3-maybe-rayon" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eebc233a34b1ab0273f35b4052fa2eeb3114b22ba4575bd7da00716e878ffb77" + +[[package]] +name = "p3-mds" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b5441fa8116246ec9e6c835f15273cb27777ca572960ec87476b67fef13e01e" +dependencies = [ + "p3-dft", + "p3-field", + "p3-symmetric", + "p3-util", + "rand 0.10.1", +] + +[[package]] +name = "p3-monty-31" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8724f330ea6d19dd4f2436aa0f88b5fcbf88f0f55ca7fccd3fea8b736dbcddad" +dependencies = [ + "itertools 0.14.0", + "num-bigint", + "p3-dft", + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-mds", + "p3-poseidon1", + "p3-poseidon2", + "p3-symmetric", + "p3-util", + "paste", + "rand 0.10.1", + "serde", + "spin 0.10.0", + "tracing", +] + +[[package]] +name = "p3-poseidon1" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e2a562fea210baae390a32f9ecf0dd8724ae3f4352d1c8e413077b6f00a162" +dependencies = [ + "p3-field", + "p3-symmetric", + "rand 0.10.1", +] + +[[package]] +name = "p3-poseidon2" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06394851c161d17e4aa4ad2aad5557d32f14cadd1dc838f965d8e1821a63b8c5" +dependencies = [ + "p3-field", + "p3-mds", + "p3-symmetric", + "p3-util", + "rand 0.10.1", +] + +[[package]] +name = "p3-symmetric" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac1a276d421f8ef3361bb7d8c39a02c93c6b3f10eeaa559cc4c50222f9a5b82" +dependencies = [ + "itertools 0.14.0", + "p3-field", + "p3-util", 
+ "serde", +] + +[[package]] +name = "p3-util" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d08a58162a4c264269ef454f0b28dcda89939490eecacb2b2cf5b00f719b80f6" +dependencies = [ + "serde", + "transpose", +] + [[package]] name = "page_size" version = "0.6.0" @@ -4295,7 +4476,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - "heck 0.4.1", + "heck 0.5.0", "itertools 0.12.1", "log", "multimap", @@ -4329,7 +4510,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.117", @@ -4427,6 +4608,15 @@ dependencies = [ "serde", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -4466,6 +4656,12 @@ dependencies = [ "serde", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -5567,6 +5763,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "strsim" version = "0.11.1" @@ -5982,6 +6184,16 @@ dependencies = [ "zklean-extractor", ] 
+[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + [[package]] name = "tree-sitter" version = "0.20.9" diff --git a/Cargo.toml b/Cargo.toml index d643be1a5a..9ece2d7559 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ members = [ "jolt-inlines/secp256k1", "jolt-inlines/grumpkin", "jolt-inlines/p256", + "jolt-inlines/poseidon2-goldilocks", "examples/btreemap/host", "examples/btreemap/guest", "examples/collatz", @@ -254,6 +255,10 @@ sha3 = "0.11" blake2 = "0.11.0-rc.6" blake3 = { version = "1.8.5" } light-poseidon = "0.4" +p3-field = "0.5.3" +p3-goldilocks = "0.5.3" +p3-poseidon2 = "0.5.3" +p3-symmetric = "0.5.3" digest = "0.11" jolt-optimizations = { git = "https://github.com/a16z/arkworks-algebra", branch = "dev/twist-shout" } dory = { package = "dory-pcs", version = "0.3.0", features = [ @@ -400,3 +405,4 @@ jolt-inlines-bigint = { path = "./jolt-inlines/bigint", default-features = false jolt-inlines-secp256k1 = { path = "./jolt-inlines/secp256k1", default-features = false } jolt-inlines-grumpkin = { path = "./jolt-inlines/grumpkin", default-features = false } jolt-inlines-p256 = { path = "./jolt-inlines/p256", default-features = false } +jolt-inlines-poseidon2-goldilocks = { path = "./jolt-inlines/poseidon2-goldilocks", default-features = false } diff --git a/crates/jolt-riscv/src/profile.rs b/crates/jolt-riscv/src/profile.rs index 26839dc187..6f8ef12bfb 100644 --- a/crates/jolt-riscv/src/profile.rs +++ b/crates/jolt-riscv/src/profile.rs @@ -36,6 +36,7 @@ pub enum InlineExtension { Secp256k1, Grumpkin, P256, + Poseidon2Goldilocks, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -84,6 +85,7 @@ pub const RV64IMAC_JOLT_ALL_INLINES: JoltInstructionProfile = JoltInstructionPro InlineExtension::Secp256k1, InlineExtension::Grumpkin, 
[package]
name = "jolt-inlines-poseidon2-goldilocks"
version = "0.1.0"
edition = "2021"
description = "Poseidon2-Goldilocks inline implementation for Jolt VM"
license = "MIT"
homepage = "https://github.com/a16z/jolt/README.md"
repository = "https://github.com/a16z/jolt"

[features]
default = []
# Gates the host-side modules in lib.rs (`exec`, `sequence_builder`, `host`).
# The `dep/feature` form also activates the optional `jolt-inlines-sdk`
# dependency declared below.
host = ["jolt-inlines-sdk/host"]

[dependencies]
# Optional: only reachable through the `host` feature above.
jolt-inlines-sdk = { workspace = true, optional = true }

[dev-dependencies]
# p3-* crates: presumably the reference implementation the tests compare
# against (exec.rs documents a comparison with Plonky3) -- confirm in tests.
p3-field.workspace = true
p3-goldilocks.workspace = true
p3-poseidon2.workspace = true
p3-symmetric.workspace = true
tracer = { workspace = true, features = ["std", "test-utils"] }
The canonical definitions live in +// `lib.rs` because the SDK guest path needs them in `no_std` builds. +pub use crate::{add_mod, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8}; + +/// Diagonal matrix for the internal-round diffusion step. +#[rustfmt::skip] +pub const POSEIDON2_INTERNAL_DIAG: [u64; STATE_WIDTH] = [ + 0xfffffffeffffffff, + 1, + 2, + 0x7fffffff80000001, + 3, + 0x7fffffff80000000, + 0xfffffffefffffffe, + 0xfffffffefffffffd, +]; + +#[inline] +pub fn mul_mod(a: u64, b: u64) -> u64 { + let res = (a as u128) * (b as u128); + let lo = res as u64; + let hi = (res >> 64) as u64; + let hi_hi = hi >> 32; + let hi_lo = hi as u32 as u64; + + // `add_term` is `lo + (hi_lo << 32)`. This sum can exceed 2^64. + // The naive wrapping_add loses 2^64 worth of magnitude in that + // case — and since 2^64 ≡ (2^32 - 1) mod P, the result is short + // by (2^32 - 1) mod P when the overflow happens. Detect and + // compensate. + let (add_term, add_overflow) = lo.overflowing_add(hi_lo << 32); + let sub_term = hi_lo + hi_hi; + + let mut r = add_term.wrapping_sub(sub_term); + if add_term < sub_term { + r = r.wrapping_add(GOLDILOCKS_MODULUS); + } + + if add_overflow { + // Add (2^32 - 1) to recover the lost magnitude. If THIS add + // overflows u64, the wrap is equivalent to subtracting another + // 2^64 ≡ (2^32 - 1) mod P from the result — so we add + // (2^32 - 1) one more time. + let (r1, wrapped) = r.overflowing_add(0xFFFFFFFF); + r = r1; + if wrapped { + r = r.wrapping_add(0xFFFFFFFF); + } + } + + while r >= GOLDILOCKS_MODULUS { + r -= GOLDILOCKS_MODULUS; + } + r +} + +/// S-box: `x^7` over Goldilocks. Computed as `x^4 * x^2 * x` (3 mults). +#[inline] +pub fn sbox(x: u64) -> u64 { + let x2 = mul_mod(x, x); + let x4 = mul_mod(x2, x2); + let x3 = mul_mod(x2, x); + mul_mod(x4, x3) +} + +/// External MDS layer: 8-wide matrix multiply via two m4 sub-blocks +/// plus the cross-mixing step. 
+pub fn external_mds(state: &mut [u64; STATE_WIDTH]) { + fn m4(s: &mut [u64]) { + let (a, b, c, d) = (s[0], s[1], s[2], s[3]); + let sum = add_mod(add_mod(a, b), add_mod(c, d)); + s[0] = add_mod(sum, add_mod(a, add_mod(b, b))); + s[1] = add_mod(sum, add_mod(b, add_mod(c, c))); + s[2] = add_mod(sum, add_mod(c, add_mod(d, d))); + s[3] = add_mod(sum, add_mod(d, add_mod(a, a))); + } + let mut left = [state[0], state[1], state[2], state[3]]; + let mut right = [state[4], state[5], state[6], state[7]]; + m4(&mut left); + m4(&mut right); + for i in 0..4 { + state[i] = add_mod(left[i], right[i]); + state[i + 4] = add_mod(left[i], right[i]); + } + for i in 0..4 { + state[i] = add_mod(state[i], left[i]); + state[i + 4] = add_mod(state[i + 4], right[i]); + } +} + +/// Internal-round diffusion: multiply by diagonal, then add row-sum +/// to every coordinate. +pub fn internal_diffusion(state: &mut [u64; STATE_WIDTH]) { + let mut sum = 0; + for &s in state.iter() { + sum = add_mod(sum, s); + } + for i in 0..STATE_WIDTH { + state[i] = add_mod(mul_mod(POSEIDON2_INTERNAL_DIAG[i], state[i]), sum); + } +} + +/// The Poseidon2 permutation in full. 
+pub fn execute_poseidon2_permutation(state: &mut Poseidon2GoldilocksState) { + let mut rc_idx = 0; + + external_mds(state); + + // 4 external initial rounds + for _ in 0..4 { + for s in state.iter_mut() { + *s = add_mod(*s, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[rc_idx]); + rc_idx += 1; + } + for s in state.iter_mut() { + *s = sbox(*s); + } + external_mds(state); + } + + // 22 internal rounds + for _ in 0..22 { + state[0] = add_mod(state[0], POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[rc_idx]); + rc_idx += 1; + state[0] = sbox(state[0]); + internal_diffusion(state); + } + + // 4 external final rounds + for _ in 0..4 { + for s in state.iter_mut() { + *s = add_mod(*s, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[rc_idx]); + rc_idx += 1; + } + for s in state.iter_mut() { + *s = sbox(*s); + } + external_mds(state); + } +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/host.rs b/jolt-inlines/poseidon2-goldilocks/src/host.rs new file mode 100644 index 0000000000..3e5d4300e0 --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/host.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: MIT + +//! Host-side registration of the Poseidon2-Goldilocks inline with +//! the Jolt prover/tracer. +//! +//! The `register_inlines!` macro generates the dispatcher that maps +//! our `(INLINE_OPCODE, FUNCT3, FUNCT7)` triple to +//! `Poseidon2GoldilocksPermutation::build_sequence`. + +use crate::sequence_builder::Poseidon2GoldilocksPermutation; + +jolt_inlines_sdk::register_inlines! { + trace_file: "poseidon2_goldilocks_trace.joltinline", + extension: jolt_inlines_sdk::host::InlineExtension::Poseidon2Goldilocks, + ops: [Poseidon2GoldilocksPermutation], +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/lib.rs b/jolt-inlines/poseidon2-goldilocks/src/lib.rs new file mode 100644 index 0000000000..8b0899818a --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/lib.rs @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT + +//! Goldilocks Poseidon2 inline for the Jolt zkVM. +//! +//! 
/// Custom inline opcode used in the `.insn r` encoding emitted by the
/// guest and as `InlineOp::OPCODE` on the host side. The crate docs
/// describe it as shared with the other `jolt-inlines-*` crates.
pub const INLINE_OPCODE: u32 = 0x0B;

/// `funct3` for the Goldilocks Poseidon2 permutation opcode.
pub const POSEIDON2_GOLDILOCKS_FUNCT3: u32 = 0x00;

/// `funct7` for the Goldilocks Poseidon2 permutation opcode.
pub const POSEIDON2_GOLDILOCKS_FUNCT7: u32 = 0x08;

/// Human-readable inline name, exposed as `InlineOp::NAME` by the
/// sequence builder.
pub const POSEIDON2_GOLDILOCKS_NAME: &str = "POSEIDON2_GOLDILOCKS_INLINE";

/// State width for our Poseidon2 instance. Hard-coded to 8; v0 is
/// not generic over width.
pub const STATE_WIDTH: usize = 8;

/// Convenience: an 8-element Goldilocks state.
pub type Poseidon2GoldilocksState = [u64; STATE_WIDTH];

/// Goldilocks field modulus `p = 2^64 - 2^32 + 1`.
pub const GOLDILOCKS_MODULUS: u64 = 0xFFFF_FFFF_0000_0001;
+/// +/// Lives in `lib.rs` (not `exec.rs`) because the SDK's +/// `poseidon2_hash_pair` absorbs inputs via `add_mod` in BOTH host and +/// no_std/guest builds. The guest path can't see `exec` (host-only). +#[inline] +pub fn add_mod(a: u64, b: u64) -> u64 { + let (mut sum, overflow) = a.overflowing_add(b); + if overflow { + sum = sum.wrapping_sub(GOLDILOCKS_MODULUS); + } + if sum >= GOLDILOCKS_MODULUS { + sum -= GOLDILOCKS_MODULUS; + } + sum +} + +/// 86 Poseidon2 round constants for the Goldilocks 8-wide instance. +/// +/// Layout: 32 external initial (4 rounds × 8 elements) + 22 internal +/// (state[0] only) + 32 external final (4 rounds × 8 elements). +/// +/// Kept in the crate root so both the host reference and sequence +/// builder share the same table. +#[rustfmt::skip] +pub static POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8: [u64; 86] = [ + // External initial: 4 rounds × 8 elements + 0xdd5743e7f2a5a5d9, 0xcb3a864e58ada44b, 0xffa2449ed32f8cdc, 0x42025f65d6bd13ee, + 0x7889175e25506323, 0x34b98bb03d24b737, 0xbdcc535ecc4faa2a, 0x5b20ad869fc0d033, + 0xf1dda5b9259dfcb4, 0x27515210be112d59, 0x4227d1718c766c3f, 0x26d333161a5bd794, + 0x49b938957bf4b026, 0x4a56b5938b213669, 0x1120426b48c8353d, 0x6b323c3f10a56cad, + 0xce57d6245ddca6b2, 0xb1fc8d402bba1eb1, 0xb5c5096ca959bd04, 0x6db55cd306d31f7f, + 0xc49d293a81cb9641, 0x1ce55a4fe979719f, 0xa92e60a9d178a4d1, 0x002cc64973bcfd8c, + 0xcea721cce82fb11b, 0xe5b55eb8098ece81, 0x4e30525c6f1ddd66, 0x43c6702827070987, + 0xaca68430a7b5762a, 0x3674238634df9c93, 0x88cee1c825e33433, 0xde99ae8d74b57176, + // Internal: 22 scalars (state[0] only) + 0x488897d85ff51f56, 0x1140737ccb162218, 0xa7eeb9215866ed35, 0x9bd2976fee49fcc9, + 0xc0c8f0de580a3fcc, 0x4fb2dae6ee8fc793, 0x343a89f35f37395b, 0x223b525a77ca72c8, + 0x56ccb62574aaa918, 0xc4d507d8027af9ed, 0xa080673cf0b7e95c, 0xf0184884eb70dcf8, + 0x044f10b0cb3d5c69, 0xe9e3f7993938f186, 0x1b761c80e772f459, 0x606cec607a1b5fac, + 0x14a0c2e1d45f03cd, 0x4eace8855398574f, 0xf905ca7103eff3e6, 
0xf8c8f8d20862c059, + 0xb524fe8bdd678e5a, 0xfbb7865901a1ec41, + // External final: 4 rounds × 8 elements + 0x014ef1197d341346, 0x9725e20825d07394, 0xfdb25aef2c5bae3b, 0xbe5402dc598c971e, + 0x93a5711f04cdca3d, 0xc45a9a5b2f8fb97b, 0xfe8946a924933545, 0x2af997a27369091c, + 0xaa62c88e0b294011, 0x058eb9d810ce9f74, 0xb3cb23eced349ae4, 0xa3648177a77b4a84, + 0x43153d905992d95d, 0xf4e2a97cda44aa4b, 0x5baa2702b908682f, 0x082923bdf4f750d1, + 0x98ae09a325893803, 0xf8a6475077968838, 0xceb0735bf00b2c5f, 0x0a1a5d953888e072, + 0x2fcb190489f94475, 0xb5be06270dec69fc, 0x739cb934b09acf8b, 0x537750b75ec7f25b, + 0xe9dd318bae1f3961, 0xf7462137299efe1a, 0xb1f6b8eee9adb940, 0xbdebcc8a809dfe6b, + 0x40fc1f791b178113, 0x3ac1c3362d014864, 0x9a016184bdb8aeba, 0x95f2394459fbc25e, +]; + +pub mod sdk; +pub use sdk::*; + +#[cfg(feature = "host")] +pub mod exec; + +#[cfg(feature = "host")] +pub mod sequence_builder; + +#[cfg(feature = "host")] +pub mod host; + +#[cfg(all(test, feature = "host"))] +mod test_constants; + +#[cfg(all(test, feature = "host"))] +mod tests; diff --git a/jolt-inlines/poseidon2-goldilocks/src/sdk.rs b/jolt-inlines/poseidon2-goldilocks/src/sdk.rs new file mode 100644 index 0000000000..fc45ee6e32 --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/sdk.rs @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: MIT + +//! Public Poseidon2-Goldilocks API for guests and hosts. +//! +//! In a RISC-V guest build (no_std), `poseidon2_permute` emits the +//! custom inline opcode and the Jolt prover dispatches it to the +//! `sequence_builder`. +//! +//! In a host build (feature = "host"), `poseidon2_permute` calls the +//! reference implementation in `exec.rs`. +//! +//! On non-RISC-V non-host targets (rare — basically tooling builds +//! that don't enable the `host` feature) the function panics with a +//! clear message. + +use crate::Poseidon2GoldilocksState; + +/// Permute an 8-element Goldilocks state in place. 
+/// +/// # Safety +/// +/// `state` must point to exactly `STATE_WIDTH` (= 8) contiguous u64 +/// values that are writable for the duration of the call. The pointer +/// must be 8-byte aligned. +#[inline(always)] +pub fn poseidon2_permute(state: &mut Poseidon2GoldilocksState) { + unsafe { + poseidon2_permute_inner(state.as_mut_ptr()); + } +} + +// ──────────────────────────────────────────────────────────────────────── +// Custom inline opcode dispatch +// ──────────────────────────────────────────────────────────────────────── + +/// RISC-V guest path: emit the custom inline opcode. +/// +/// Memory contract enforced by the sequence builder: +/// - `rs1` → pointer to the 8-element state (read+written in place). +/// - Round constants are embedded in the inline expansion as virtual +/// immediates; `rs2` is unused. +/// +/// # Safety +/// +/// `state` must be a valid, 8-byte-aligned pointer to 64 bytes of +/// readable+writable memory. +#[cfg(all( + not(feature = "host"), + any(target_arch = "riscv32", target_arch = "riscv64") +))] +#[inline(always)] +unsafe fn poseidon2_permute_inner(state: *mut u64) { + use crate::{INLINE_OPCODE, POSEIDON2_GOLDILOCKS_FUNCT3, POSEIDON2_GOLDILOCKS_FUNCT7}; + core::arch::asm!( + ".insn r {opcode}, {funct3}, {funct7}, x0, {rs1}, x0", + opcode = const INLINE_OPCODE, + funct3 = const POSEIDON2_GOLDILOCKS_FUNCT3, + funct7 = const POSEIDON2_GOLDILOCKS_FUNCT7, + rs1 = in(reg) state, + options(nostack) + ); +} + +/// Host path: dispatch to the reference implementation in `exec.rs`. +#[cfg(feature = "host")] +#[inline(always)] +unsafe fn poseidon2_permute_inner(state: *mut u64) { + let slice = core::slice::from_raw_parts_mut(state, crate::STATE_WIDTH); + let arr: &mut [u64; 8] = slice + .try_into() + .expect("Poseidon2 state must be exactly 8 u64 elements"); + crate::exec::execute_poseidon2_permutation(arr); +} + +/// Non-RISC-V, non-host targets: fail loudly. 
+#[cfg(all( + not(feature = "host"), + not(any(target_arch = "riscv32", target_arch = "riscv64")) +))] +#[inline(always)] +unsafe fn poseidon2_permute_inner(_state: *mut u64) { + panic!( + "poseidon2_permute requires either the `host` feature or a \ + RISC-V target. Add `features = [\"host\"]` for tooling builds." + ); +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs b/jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs new file mode 100644 index 0000000000..cf1cfcc358 --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs @@ -0,0 +1,749 @@ +// SPDX-License-Identifier: MIT + +//! Sequence builder for the Goldilocks Poseidon2 inline. +//! +//! Emits a flat sequence of virtual RISC-V instructions that permutes +//! an 8-element Goldilocks state in place. Operates over ~35 virtual +//! registers so the entire permutation runs without spilling state to +//! memory between rounds. +//! +//! Memory layout: +//! - `rs1`: pointer to the 8-element state (64 bytes), permuted in place +//! +use core::array; + +use jolt_inlines_sdk::host::{ + instruction::{ + add::ADD, addi::ADDI, and::AND, ld::LD, mul::MUL, mulhu::MULHU, sd::SD, slli::SLLI, + sltu::SLTU, srli::SRLI, sub::SUB, + }, + FormatInline, InlineOp, InstrAssembler, Instruction, VirtualRegisterGuard, +}; + +use crate::exec::POSEIDON2_INTERNAL_DIAG; +use crate::STATE_WIDTH; + +/// Virtual-register count. +/// +/// Layout: +/// - `vr[0..8]` — state `S[0..7]` (live across all rounds) +/// - `vr[8..16]` — temp state `T[0..7]` for MDS reorganization +/// - `vr[16]` — P (Goldilocks modulus) loaded once +/// - `vr[17..24]` — mul_mod scratch: lo, hi, hi_lo, hi_hi, +/// shifted, add_term, sub_term +/// - `vr[24..28]` — add_mod / final-reduction scratch (4 regs) +/// - `vr[28..32]` — generic scratch (round constants, diff sums, etc.) 
+/// - `vr[32]` — internal-diffusion row-sum accumulator +/// - `vr[33..35]` — extra scratch for shifts and intermediates +/// - `vr[35]` — mask_low_32 (constant 2^32 - 1, loaded once) +/// - `vr[36]` — mm_add_ovf (mul_mod add_term overflow flag) +pub const NEEDED_REGISTERS: u8 = 37; + +const P_REG: usize = 16; + +const STATE_LEN: usize = STATE_WIDTH; + +/// Inline operation tag, registered with the Jolt prover via +/// [`crate::host`]. +pub struct Poseidon2GoldilocksPermutation; + +impl InlineOp for Poseidon2GoldilocksPermutation { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = crate::POSEIDON2_GOLDILOCKS_FUNCT3; + const FUNCT7: u32 = crate::POSEIDON2_GOLDILOCKS_FUNCT7; + const NAME: &'static str = crate::POSEIDON2_GOLDILOCKS_NAME; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + Poseidon2GoldilocksSequenceBuilder::new(asm, operands).build() + } +} + +pub(crate) struct Poseidon2GoldilocksSequenceBuilder { + asm: InstrAssembler, + vr: [VirtualRegisterGuard; NEEDED_REGISTERS as usize], + operands: FormatInline, +} + +impl Poseidon2GoldilocksSequenceBuilder { + fn new(asm: InstrAssembler, operands: FormatInline) -> Self { + let vr = array::from_fn(|_| asm.allocator.allocate_for_inline()); + Poseidon2GoldilocksSequenceBuilder { asm, vr, operands } + } + + // ── Register accessors ──────────────────────────────────────────── + + fn s(&self, i: usize) -> u8 { + *self.vr[i] + } + fn t(&self, i: usize) -> u8 { + *self.vr[STATE_LEN + i] + } + fn p_reg(&self) -> u8 { + *self.vr[P_REG] + } + // Named scratch registers for mul_mod + fn mm_lo(&self) -> u8 { + *self.vr[17] + } + fn mm_hi(&self) -> u8 { + *self.vr[18] + } + fn mm_hi_lo(&self) -> u8 { + *self.vr[19] + } + fn mm_hi_hi(&self) -> u8 { + *self.vr[20] + } + fn mm_shifted(&self) -> u8 { + *self.vr[21] + } + fn mm_add_term(&self) -> u8 { + *self.vr[22] + } + fn mm_sub_term(&self) -> u8 { + *self.vr[23] + } + // add_mod / final-reduction scratch + fn am_ovf(&self) -> u8 { + 
    /// Emits the whole permutation and returns the finalized inline
    /// sequence.
    ///
    /// Order: load constants, load state, initial external MDS,
    /// 4 external rounds, 22 internal rounds, 4 external rounds, store
    /// state. `rc_idx` tracks the position in the round-constant table
    /// and must end at 86.
    fn build(mut self) -> Vec {
        // 1. Load the Goldilocks modulus P into p_reg and the low-32-bit
        //    mask into mask_low32 (`load_p` makes two `emit_i` calls).
        self.load_p();

        // 2. Load state from memory into vr[0..8].
        self.load_state();

        // 3. Initial external MDS.
        self.external_mds();

        // 4. 4 external initial rounds (8 constants each).
        let mut rc_idx: usize = 0;
        for _ in 0..4 {
            self.add_round_constants_full(rc_idx);
            rc_idx += STATE_LEN;
            self.sbox_full();
            self.external_mds();
        }

        // 5. 22 internal rounds (1 constant each).
        for _ in 0..22 {
            self.add_round_constant_partial(rc_idx);
            rc_idx += 1;
            self.sbox_state_0();
            self.internal_diffusion();
        }

        // 6. 4 external final rounds (8 constants each).
        for _ in 0..4 {
            self.add_round_constants_full(rc_idx);
            rc_idx += STATE_LEN;
            self.sbox_full();
            self.external_mds();
        }
        // 4*8 + 22 + 4*8 = 86: every table entry consumed exactly once.
        debug_assert_eq!(rc_idx, 86, "round-constant index off");

        // 7. Store state back to memory.
        self.store_state();

        // Drop the register guards before finalizing; presumably this
        // releases the virtual registers -- confirm against the SDK.
        drop(self.vr);
        self.asm.finalize_inline()
    }
    /// Emits `dst = (a + b) mod P`.
    ///
    /// 11 instructions. `dst` may alias `a` or `b`: `a` is snapshotted
    /// into the `am_tmp` scratch register before `dst` is written, and
    /// `b` is only read by the instruction that first writes `dst`.
    ///
    /// Clobbers the `am_ovf`, `am_corr`, `am_less` and `am_tmp` scratch
    /// registers, so none of `dst`, `a`, `b` may be one of them. Mirrors
    /// the arithmetic of `crate::add_mod`, including its expectation of
    /// canonical (`< P`) operands.
    fn add_mod_into(&mut self, dst: u8, a: u8, b: u8) {
        let p = self.p_reg();
        let ovf = self.am_ovf();
        let corr = self.am_corr();
        let less = self.am_less();
        let tmp = self.am_tmp();

        // 0. Snapshot `a` into tmp to survive the dst-write aliasing.
        // Common call shape is `add_mod_into(s, s, rc)` where dst == a.
        self.asm.emit_r::(tmp, a, 0);
        // 1. dst = a + b (wrapping)
        self.asm.emit_r::(dst, a, b);
        // 2. ovf = (dst < tmp) ? 1 : 0 -- a wrapped sum is smaller than `a`
        self.asm.emit_r::(ovf, dst, tmp);
        // 3-4. corr = ovf * (2^32 - 1) = (ovf << 32) - ovf
        // (2^32 - 1 is 2^64 mod P, the magnitude lost by the wrap)
        self.asm.emit_i::(corr, ovf, 32);
        self.asm.emit_r::(corr, corr, ovf);
        // 5. dst = dst + corr -- if no overflow, corr = 0
        self.asm.emit_r::(dst, dst, corr);
        // 6-10. Final reduction: if dst >= P, dst -= P.
        // Branch-free: compute dst - P, then add P back when dst < P.
        self.asm.emit_r::(less, dst, p);
        self.asm.emit_r::(tmp, dst, p);
        self.asm.emit_r::(corr, 0, less); // corr = -less = 0 or all-ones
        self.asm.emit_r::(corr, corr, p); // corr = P if less else 0
        self.asm.emit_r::(dst, tmp, corr);
    }
    /// Emits `dst = (a + b) mod P`, for call sites where `dst != a`.
    ///
    /// Same arithmetic as `add_mod_into`, but one instruction shorter
    /// (10 instead of 11): the snapshot of `a` is skipped because the
    /// overflow check can compare against the still-live left operand.
    /// `dst` may alias `b`, since `b` is only read by the first
    /// instruction.
    ///
    /// Clobbers the `am_ovf`, `am_corr`, `am_less` and `am_tmp` scratch
    /// registers. Debug builds assert `dst != a`.
    fn add_mod_into_dst_not_a(&mut self, dst: u8, a: u8, b: u8) {
        debug_assert_ne!(dst, a, "dst must not alias a");

        let p = self.p_reg();
        let ovf = self.am_ovf();
        let corr = self.am_corr();
        let less = self.am_less();
        let tmp = self.am_tmp();

        // 1. dst = a + b (wrapping)
        self.asm.emit_r::(dst, a, b);
        // 2. ovf = (dst < a) ? 1 : 0 -- valid only because `a` is intact
        self.asm.emit_r::(ovf, dst, a);
        // 3-4. corr = ovf * (2^32 - 1) = (ovf << 32) - ovf
        self.asm.emit_i::(corr, ovf, 32);
        self.asm.emit_r::(corr, corr, ovf);
        // 5. dst = dst + corr -- if no overflow, corr = 0
        self.asm.emit_r::(dst, dst, corr);
        // 6-10. Final reduction: if dst >= P, dst -= P.
        // Branch-free: compute dst - P, then add P back when dst < P.
        self.asm.emit_r::(less, dst, p);
        self.asm.emit_r::(tmp, dst, p);
        self.asm.emit_r::(corr, 0, less); // corr = -less = 0 or all-ones
        self.asm.emit_r::(corr, corr, p); // corr = P if less else 0
        self.asm.emit_r::(dst, tmp, corr);
    }
+ fn mul_mod_into(&mut self, dst: u8, a: u8, b: u8) { + let p = self.p_reg(); + let mask = self.mask_low32(); + let lo = self.mm_lo(); + let hi = self.mm_hi(); + let hi_lo = self.mm_hi_lo(); + let hi_hi = self.mm_hi_hi(); + let shifted = self.mm_shifted(); + let add_term = self.mm_add_term(); + let sub_term = self.mm_sub_term(); + let add_ovf = self.mm_add_ovf(); + let sub_ovf = self.am_ovf(); + let corr = self.am_corr(); + let less = self.am_less(); + let tmp = self.am_tmp(); + + // 1. lo = low 64 bits of a*b + self.asm.emit_r::(lo, a, b); + // 2. hi = high 64 bits of a*b + self.asm.emit_r::(hi, a, b); + // 3. hi_hi = hi >> 32 + self.asm.emit_i::(hi_hi, hi, 32); + // 4. hi_lo = hi & (2^32 - 1) + self.asm.emit_r::(hi_lo, hi, mask); + // 5. shifted = hi_lo << 32. Shifting `hi` directly is + // equivalent because the upper half shifts out of the word. + self.asm.emit_i::(shifted, hi, 32); + // 6. add_term = lo + shifted (wrapping) + self.asm.emit_r::(add_term, lo, shifted); + // 7. add_ovf = (add_term < lo) ? 1 : 0 -- detect 2^64 overflow + self.asm.emit_r::(add_ovf, add_term, lo); + // 8. sub_term = hi_lo + hi_hi + self.asm.emit_r::(sub_term, hi_lo, hi_hi); + // 9. r = add_term - sub_term (wrapping) + self.asm.emit_r::(dst, add_term, sub_term); + // 10. sub_ovf = (add_term < sub_term) ? 1 : 0 + self.asm.emit_r::(sub_ovf, add_term, sub_term); + // 11-13. If underflow, add P. Compute corr = (0 - sub_ovf) AND P; r += corr. + self.asm.emit_r::(corr, 0, sub_ovf); + self.asm.emit_r::(corr, corr, p); + self.asm.emit_r::(dst, dst, corr); + // 14-21. add_term-overflow correction: if add_ovf, add + // (2^32 - 1). Snapshot dst, do the conditional add, detect + // wrap, conditional second add of (2^32 - 1).
+ self.asm.emit_r::(corr, 0, add_ovf); + self.asm.emit_r::(corr, corr, mask); // corr = (2^32-1) if add_ovf else 0 + self.asm.emit_r::(tmp, dst, 0); // snapshot dst for wrap detection + self.asm.emit_r::(dst, dst, corr); // dst += corr + self.asm.emit_r::(less, dst, tmp); // less = 1 if wrap (dst < snapshot) + self.asm.emit_r::(corr, 0, less); + self.asm.emit_r::(corr, corr, mask); + self.asm.emit_r::(dst, dst, corr); // if wrap, add another (2^32-1) + // 22-26. Final reduction: if dst >= P, dst -= P. + // After all corrections, dst < 2^64 < 2P, so one sub + // suffices. + self.asm.emit_r::(less, dst, p); + self.asm.emit_r::(tmp, dst, p); + self.asm.emit_r::(corr, 0, less); + self.asm.emit_r::(corr, corr, p); + self.asm.emit_r::(dst, tmp, corr); + } + + /// dst = x^7 over Goldilocks. 4 mul_mod calls. + /// + /// Now that `mul_mod_into` correctly handles `add_term` overflow, + /// the natural `x^7 = x^6 * x` decomposition works correctly. + /// (Previously this triggered a bug in `mul_mod_into`; the + /// workaround was the `x^7 = x^4 * x^3` decomposition. Fixed in + /// the v1 mul_mod correction.) + fn sbox_into(&mut self, dst: u8, x: u8) { + let x2 = self.sc_a(); + let x4 = self.sc_b(); + let x6 = self.sc_c(); + + // x2 = x * x + self.mul_mod_into(x2, x, x); + // x4 = x2 * x2 + self.mul_mod_into(x4, x2, x2); + // x6 = x4 * x2 + self.mul_mod_into(x6, x4, x2); + // dst = x6 * x = x^7 + self.mul_mod_into(dst, x6, x); + } + + // ── Round-constant loading ──────────────────────────────────────── + + /// Load round constant at index `idx` into sc_rc. + /// 1 instruction. + fn load_rc(&mut self, idx: usize) { + let dst = self.sc_rc(); + self.load_u64_immediate(dst, crate::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[idx]); + } + + /// Load the i'th diagonal constant into sc_diag. + /// Uses inline u64 immediate construction since DIAG only has 8 entries. 
+ fn load_diag(&mut self, i: usize) { + let value = POSEIDON2_INTERNAL_DIAG[i]; + let dst = self.sc_diag(); + self.load_u64_immediate(dst, value); + } + + /// Load a 64-bit constant into `dst`. Single instruction: the Jolt + /// inline `emit_i::` accepts a full u64 immediate. + fn load_u64_immediate(&mut self, dst: u8, value: u64) { + self.asm.emit_i::(dst, 0, value); + } + + /// For each i in 0..8: S[i] = (S[i] + RC[rc_base + i]) mod P. + fn add_round_constants_full(&mut self, rc_base: usize) { + for i in 0..STATE_LEN { + self.load_rc(rc_base + i); + let s_i = self.s(i); + let rc = self.sc_rc(); + self.add_mod_into(s_i, s_i, rc); + } + } + + /// S[0] = (S[0] + RC[idx]) mod P. Used in internal rounds. + fn add_round_constant_partial(&mut self, idx: usize) { + self.load_rc(idx); + let s0 = self.s(0); + let rc = self.sc_rc(); + self.add_mod_into(s0, s0, rc); + } + + /// For each i in 0..8: S[i] = sbox(S[i]). + fn sbox_full(&mut self) { + for i in 0..STATE_LEN { + let s_i = self.s(i); + self.sbox_into(s_i, s_i); + } + } + + fn sbox_state_0(&mut self) { + let s0 = self.s(0); + self.sbox_into(s0, s0); + } + + // ── External MDS ────────────────────────────────────────────────── + + /// Apply the m4 sub-block to four registers in place. + /// + /// Reference (from exec.rs::external_mds): + /// ```text + /// let (a, b, c, d) = (s[0], s[1], s[2], s[3]); + /// let sum = add_mod(add_mod(a, b), add_mod(c, d)); + /// s[0] = add_mod(sum, add_mod(a, add_mod(b, b))); + /// s[1] = add_mod(sum, add_mod(b, add_mod(c, c))); + /// s[2] = add_mod(sum, add_mod(c, add_mod(d, d))); + /// s[3] = add_mod(sum, add_mod(d, add_mod(a, a))); + /// ``` + /// + /// We snapshot a,b,c,d into named scratch (sc_a/b/c/d) so we can + /// freely overwrite s[]. + fn m4_apply(&mut self, s: [u8; 4]) { + // Snapshot inputs. 
+ let a = self.sc_a(); + let b = self.sc_b(); + let c = self.sc_c(); + let d = self.sc_d(); + self.asm.emit_r::(a, s[0], 0); + self.asm.emit_r::(b, s[1], 0); + self.asm.emit_r::(c, s[2], 0); + self.asm.emit_r::(d, s[3], 0); + + // sum = (a+b) + (c+d), using a temp register (mm_lo is free here). + let ab = self.mm_lo(); + let cd = self.mm_hi(); + let sum = self.sum_reg(); + self.add_mod_into_dst_not_a(ab, a, b); + self.add_mod_into_dst_not_a(cd, c, d); + self.add_mod_into_dst_not_a(sum, ab, cd); + + // s[0] = sum + a + 2b + let bb = self.mm_hi_lo(); + let a_plus_bb = self.mm_hi_hi(); + self.add_mod_into_dst_not_a(bb, b, b); + self.add_mod_into_dst_not_a(a_plus_bb, a, bb); + self.add_mod_into_dst_not_a(s[0], sum, a_plus_bb); + + // s[1] = sum + b + 2c + let cc = self.mm_hi_lo(); + let b_plus_cc = self.mm_hi_hi(); + self.add_mod_into_dst_not_a(cc, c, c); + self.add_mod_into_dst_not_a(b_plus_cc, b, cc); + self.add_mod_into_dst_not_a(s[1], sum, b_plus_cc); + + // s[2] = sum + c + 2d + let dd = self.mm_hi_lo(); + let c_plus_dd = self.mm_hi_hi(); + self.add_mod_into_dst_not_a(dd, d, d); + self.add_mod_into_dst_not_a(c_plus_dd, c, dd); + self.add_mod_into_dst_not_a(s[2], sum, c_plus_dd); + + // s[3] = sum + d + 2a + let aa = self.mm_hi_lo(); + let d_plus_aa = self.mm_hi_hi(); + self.add_mod_into_dst_not_a(aa, a, a); + self.add_mod_into_dst_not_a(d_plus_aa, d, aa); + self.add_mod_into_dst_not_a(s[3], sum, d_plus_aa); + } + + /// External MDS: two m4 sub-blocks (on left and right halves of the + /// state) then the cross-mix described in exec.rs::external_mds. + fn external_mds(&mut self) { + let left: [u8; 4] = [self.s(0), self.s(1), self.s(2), self.s(3)]; + let right: [u8; 4] = [self.s(4), self.s(5), self.s(6), self.s(7)]; + + self.m4_apply(left); + self.m4_apply(right); + + // After m4, left = transformed(S[0..4]), right = transformed(S[4..8]). 
+ // Reference: + // for i in 0..4 { state[i] = left[i] + right[i]; state[i+4] = left[i] + right[i]; } + // for i in 0..4 { state[i] = state[i] + left[i]; state[i+4] = state[i+4] + right[i]; } + // + // After these two passes: + // state[i] = (left[i] + right[i]) + left[i] = 2*left[i] + right[i] + // state[i+4] = (left[i] + right[i]) + right[i] = left[i] + 2*right[i] + // + // Implement directly: snapshot left[i] and right[i] before + // overwriting them. + for i in 0..4 { + // We can't freely snapshot here because left/right ARE the + // state registers. Use t[i] and t[i+4] as scratch. + let l = left[i]; // = S[i] + let r = right[i]; // = S[i+4] + let t_l = self.t(i); + let t_r = self.t(i + 4); + + // t_l = l, t_r = r + self.asm.emit_r::(t_l, l, 0); + self.asm.emit_r::(t_r, r, 0); + + // l = 2*t_l + t_r = (t_l + t_r) + t_l + let sum_lr = self.sc_a(); + self.add_mod_into_dst_not_a(sum_lr, t_l, t_r); + self.add_mod_into_dst_not_a(l, sum_lr, t_l); + + // r = t_l + 2*t_r = (t_l + t_r) + t_r + self.add_mod_into_dst_not_a(r, sum_lr, t_r); + } + } + + // ── Internal diffusion ──────────────────────────────────────────── + + /// Compute row-sum into sum_reg, then state[i] = diag[i]*state[i] + sum. + fn internal_diffusion(&mut self) { + // 1. sum = S[0] + S[1] + ... + S[7] + let sum = self.sum_reg(); + // Start with sum = S[0]+S[1]. + let s0 = self.s(0); + let s1 = self.s(1); + self.add_mod_into_dst_not_a(sum, s0, s1); + for i in 2..STATE_LEN { + let s_i = self.s(i); + self.add_mod_into(sum, sum, s_i); + } + + // 2. For each i in 0..8: S[i] = (diag[i] * S[i]) + sum. + for i in 0..STATE_LEN { + self.load_diag(i); + let diag = self.sc_diag(); + let s_i = self.s(i); + // S[i] = diag * S[i] + self.mul_mod_into(s_i, diag, s_i); + // S[i] = S[i] + sum + self.add_mod_into(s_i, s_i, sum); + } + } +} + +// Compile-time sanity: the constants we pull in MUST be the same shape +// the reference implementation uses. 
If these `_` bindings fail to +// type-check, the implementer has the wrong constants. +const _: [u64; STATE_WIDTH] = POSEIDON2_INTERNAL_DIAG; + +// ── Test-only helpers for sub-operation isolation ──────────────────── + +#[cfg(test)] +#[allow(dead_code)] +impl Poseidon2GoldilocksSequenceBuilder { + pub fn new_for_test(asm: InstrAssembler, operands: FormatInline) -> Self { + Self::new(asm, operands) + } + + pub fn test_load_p_and_state_and_add_rc_full(&mut self, rc_base: usize) { + self.load_p(); + self.load_state(); + self.add_round_constants_full(rc_base); + } + + pub fn test_load_p_state_addrc_sbox(&mut self, rc_base: usize) { + self.load_p(); + self.load_state(); + self.add_round_constants_full(rc_base); + self.sbox_full(); + } + + pub fn test_load_p_state_addrc_sbox_mds(&mut self, rc_base: usize) { + self.load_p(); + self.load_state(); + self.add_round_constants_full(rc_base); + self.sbox_full(); + self.external_mds(); + } + + pub fn test_load_p_state_mds_only(&mut self) { + self.load_p(); + self.load_state(); + self.external_mds(); + } + + pub fn test_load_p_state_intdiff_only(&mut self) { + self.load_p(); + self.load_state(); + self.internal_diffusion(); + } + + /// For each i in 0..8: S[i] = S[i] * S[i] mod P (single squaring). + pub fn test_load_p_state_square_only(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + self.mul_mod_into(s_i, s_i, s_i); + } + } + + /// For each i in 0..8: S[i] = sbox(S[i]) (without the surrounding RC add). + pub fn test_load_p_state_sbox_only(&mut self) { + self.load_p(); + self.load_state(); + self.sbox_full(); + } + + /// For each i: S[i] = x^4 mod P (just two squares). + pub fn test_load_p_state_x4_only(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let scratch = self.sc_a(); + self.mul_mod_into(scratch, s_i, s_i); // x^2 + self.mul_mod_into(s_i, scratch, scratch); // x^4 + } + } + + /// For each i: S[i] = x^3 = x^2 * x. 
Tests asymmetric mul. + pub fn test_load_p_state_x3(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let scratch = self.sc_a(); + self.mul_mod_into(scratch, s_i, s_i); // x^2 + self.mul_mod_into(s_i, scratch, s_i); // x^3 = x^2 * x + } + } + + /// For each i: S[i] = x^6 = x^4 * x^2. + pub fn test_load_p_state_x6(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let x2 = self.sc_a(); + let x4 = self.sc_b(); + self.mul_mod_into(x2, s_i, s_i); // x^2 + self.mul_mod_into(x4, x2, x2); // x^4 + self.mul_mod_into(s_i, x4, x2); // x^6 = x^4 * x^2 + } + } + + /// For each i: S[i] = S[i] * S[i+8] mod P. Used to stress + /// mul_mod_into with arbitrary (a, b) pairs supplied by the test + /// harness. The state's first 8 elements are a's, next 8 are b's. + /// Output is written back to the first 8 (overwriting a's). + /// + /// Note: this requires a 16-element state buffer at rs1. The + /// emulator test allocates 128 bytes for this. + pub fn test_load_p_state_mul_pairs(&mut self) { + self.load_p(); + // Load 16 u64s from rs1. + for i in 0..8 { + let s_i = self.s(i); + self.asm + .emit_ld::(s_i, self.operands.rs1, (i * 8) as i64); + } + // Load the b's into T[0..8] (vr[8..16]). + for i in 0..8 { + let t_i = self.t(i); + self.asm + .emit_ld::(t_i, self.operands.rs1, ((i + 8) * 8) as i64); + } + // For each i: S[i] = S[i] * T[i] mod P + for i in 0..8 { + let s_i = self.s(i); + let t_i = self.t(i); + self.mul_mod_into(s_i, s_i, t_i); + } + // Store the 8 results back to the first 8 slots.
+ for i in 0..8 { + let s_i = self.s(i); + self.asm + .emit_s::(self.operands.rs1, s_i, (i * 8) as i64); + } + } + + /// For each i: S[i] = x * x (square), but written as + /// scratch = x; mul_mod(s_i, scratch, s_i). Tests dst-aliases-b + /// when a is a non-aliasing reg. + pub fn test_load_p_state_mul_dst_aliases_b(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let scratch = self.sc_a(); + // scratch = s_i (just a copy via ADD scratch, s_i, 0) + self.asm.emit_r::(scratch, s_i, 0); + // s_i = scratch * s_i (dst aliases b) + self.mul_mod_into(s_i, scratch, s_i); + } + } + + /// For each i: S[i] = x^7 via INLINED sbox using the natural + /// `x^7 = x^6 * x` decomposition. Previously diagnosed as + /// triggering a `mul_mod_into` bug; now fixed by the add_term- + /// overflow correction. This test stays in place as a regression + /// guard. + pub fn test_load_p_state_sbox_inlined(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let a = self.sc_a(); + let b = self.sc_b(); + let c = self.sc_c(); + self.mul_mod_into(a, s_i, s_i); // a = x^2 + self.mul_mod_into(b, a, a); // b = x^4 + self.mul_mod_into(c, b, a); // c = x^6 = x^4 * x^2 + self.mul_mod_into(s_i, c, s_i); // s_i = x^7 = x^6 * x + } + } + + pub fn test_store_and_finalize(mut self) -> Vec { + self.store_state(); + drop(self.vr); + self.asm.finalize_inline() + } +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/test_constants.rs b/jolt-inlines/poseidon2-goldilocks/src/test_constants.rs new file mode 100644 index 0000000000..d315a0d5dd --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/test_constants.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT + +use crate::Poseidon2GoldilocksState; + +pub(crate) struct Poseidon2GoldilocksKat { + pub(crate) input: Poseidon2GoldilocksState, + pub(crate) output: Poseidon2GoldilocksState, +} + +pub(crate) const POSEIDON2_GOLDILOCKS_KATS: &[Poseidon2GoldilocksKat] = &[ + Poseidon2GoldilocksKat { + input: [ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000,
+ 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + ], + output: [ + 0x4411ec57c44145f5, + 0x5ff55b96baa2f47b, + 0xdee0e2ae35662802, + 0x023c96b32c07981d, + 0x777e4afeaaf2e6a1, + 0x606c248e5ef427da, + 0x862e82242b2c5001, + 0x61ea532cc4c908c7, + ], + }, + Poseidon2GoldilocksKat { + input: [ + 0x0000000000000001, + 0x0000000000000002, + 0x0000000000000003, + 0x0000000000000004, + 0x0000000000000005, + 0x0000000000000006, + 0x0000000000000007, + 0x0000000000000008, + ], + output: [ + 0xd7314da15817d57e, + 0x298d56d49f1937a1, + 0x197376572d00355f, + 0xd302ce06a83b7f6e, + 0xcbfaa68735b06b4a, + 0x01a9337c49e10228, + 0x4a81976fb5dfc0ee, + 0xa98941ad4ca9232e, + ], + }, + Poseidon2GoldilocksKat { + input: [ + 0xffffffff00000000, + 0xfffffffeffffffff, + 0xfffffffefffffffe, + 0xfffffffefffffffd, + 0xfffffffefffffffc, + 0xfffffffefffffffb, + 0xfffffffefffffffa, + 0xfffffffefffffff9, + ], + output: [ + 0xa785ac3b187380e2, + 0xbaf871af4702a41a, + 0xfdfa9a3998f6a535, + 0xd5ffa984d60dfc7c, + 0x847f180534bd6dc1, + 0xe07d1ab55263f732, + 0x0a84ee6f62e263ad, + 0x4068ba2f6dce11b2, + ], + }, + Poseidon2GoldilocksKat { + input: [ + 0x0123456789abcdef, + 0xfedcba9876543210, + 0x1111111122222222, + 0x3333333344444444, + 0x5555555566666666, + 0x7777777788888888, + 0x00099999999aaaaa, + 0xbbbbbbbbcccccccc, + ], + output: [ + 0x4b3aa0d92edfae34, + 0x33dea26f790576cb, + 0xbd45605508de7e43, + 0x38259456bd796aae, + 0xad1b11a37b4ab584, + 0x7490a81972b0fa70, + 0x0a7e73b3531a193f, + 0x12ba121bc764e5c6, + ], + }, +]; diff --git a/jolt-inlines/poseidon2-goldilocks/src/tests.rs b/jolt-inlines/poseidon2-goldilocks/src/tests.rs new file mode 100644 index 0000000000..698e555f61 --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/tests.rs @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: MIT + +//! Parity and emulator tests for the Poseidon2-Goldilocks inline. 
+ +use crate::exec::{execute_poseidon2_permutation, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8}; + +const P: u64 = crate::GOLDILOCKS_MODULUS; + +fn u128_mul_mod(a: u64, b: u64) -> u64 { + ((a as u128) * (b as u128) % (P as u128)) as u64 +} + +#[test] +fn exec_mul_mod_matches_u128_for_known_cases() { + let cases: &[(u64, u64)] = &[ + (0, 0), + (1, 1), + (P - 1, P - 1), + (P - 1, 1), + (2, 3), + (0xC0000000_00000000, 0xC0000000_00000000), + (0x80000000_00000001, 0x80000000_00000001), + (0xFFFFFFFF_FFFFFFFF_u64 % P, 0xFFFFFFFF_FFFFFFFF_u64 % P), + (12345, P - 1), + ]; + for &(a, b) in cases { + assert_eq!(crate::exec::mul_mod(a, b), u128_mul_mod(a, b)); + } +} + +#[test] +fn exec_mul_mod_matches_u128_random_stress() { + let mut seed: u64 = 0xDEADBEEFCAFEBABE; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + seed + }; + for _ in 0..100_000 { + let a = next() % P; + let b = next() % P; + assert_eq!(crate::exec::mul_mod(a, b), u128_mul_mod(a, b)); + } +} + +#[test] +fn opcode_allocation_matches_inline_extension_namespace() { + assert_eq!(crate::POSEIDON2_GOLDILOCKS_FUNCT3, 0x00); + assert_eq!(crate::POSEIDON2_GOLDILOCKS_FUNCT7, 0x08); +} + +#[test] +fn known_answer_vectors_match_reference() { + for vector in crate::test_constants::POSEIDON2_GOLDILOCKS_KATS { + let mut state = vector.input; + execute_poseidon2_permutation(&mut state); + assert_eq!(state, vector.output); + } +} + +mod plonky3_parity { + use p3_goldilocks::{default_goldilocks_poseidon2_8, Goldilocks}; + use p3_symmetric::Permutation; + + use super::{execute_poseidon2_permutation, P}; + + fn plonky3_permute(state_u64: [u64; 8]) -> [u64; 8] { + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + + let perm = default_goldilocks_poseidon2_8(); + let mut state: [Goldilocks; 8] = state_u64.map(Goldilocks::from_u64); + perm.permute_mut(&mut state); + state.map(|f| f.as_canonical_u64()) + } + + fn plonky3_permute_generic(state_u64: [u64; 8]) -> [u64; 
8] { + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + use p3_goldilocks::{ + Poseidon2ExternalLayerGoldilocks, Poseidon2InternalLayerGoldilocks, + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL, GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL, + GOLDILOCKS_POSEIDON2_RC_8_INTERNAL, + }; + use p3_poseidon2::{ExternalLayerConstants, Poseidon2}; + + let external = ExternalLayerConstants::::new( + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(), + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(), + ); + let internal = GOLDILOCKS_POSEIDON2_RC_8_INTERNAL.to_vec(); + let perm: Poseidon2< + Goldilocks, + Poseidon2ExternalLayerGoldilocks<8>, + Poseidon2InternalLayerGoldilocks, + 8, + 7, + > = Poseidon2::new(external, internal); + + let mut state: [Goldilocks; 8] = state_u64.map(Goldilocks::from_u64); + perm.permute_mut(&mut state); + state.map(|f| f.as_canonical_u64()) + } + + fn assert_matches_plonky3(initial: [u64; 8]) { + let mut ours = initial; + execute_poseidon2_permutation(&mut ours); + assert_eq!(ours, plonky3_permute(initial)); + assert_eq!(ours, plonky3_permute_generic(initial)); + } + + #[test] + fn permute_all_zero_matches_plonky3() { + assert_matches_plonky3([0u64; 8]); + } + + #[test] + fn permute_known_input_matches_plonky3() { + assert_matches_plonky3([1, 2, 3, 4, 5, 6, 7, 8]); + } + + #[test] + fn permute_large_values_match_plonky3() { + assert_matches_plonky3([P - 1, P - 2, P - 3, P - 4, P - 5, P - 6, P - 7, P - 8]); + } + + #[test] + fn permute_stress_matches_plonky3() { + let mut seed: u64 = 0x0BADC0DEF00DCAFE; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + seed + }; + + for _ in 0..200 { + assert_matches_plonky3([ + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + ]); + } + } + + #[test] + fn round_constants_match_plonky3_layout() { + use p3_field::PrimeField64; + use p3_goldilocks::{ + 
GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL, GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL, + GOLDILOCKS_POSEIDON2_RC_8_INTERNAL, + }; + + let mut idx = 0; + for round_constants in &GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL { + for constant in round_constants { + assert_eq!( + super::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[idx], + constant.as_canonical_u64() + ); + idx += 1; + } + } + for constant in &GOLDILOCKS_POSEIDON2_RC_8_INTERNAL { + assert_eq!( + super::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[idx], + constant.as_canonical_u64() + ); + idx += 1; + } + for round_constants in &GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL { + for constant in round_constants { + assert_eq!( + super::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[idx], + constant.as_canonical_u64() + ); + idx += 1; + } + } + assert_eq!(idx, 86); + } + + #[test] + fn internal_diagonal_matches_plonky3() { + use crate::exec::POSEIDON2_INTERNAL_DIAG; + use p3_field::PrimeField64; + use p3_goldilocks::MATRIX_DIAG_8_GOLDILOCKS; + + for i in 0..8 { + assert_eq!( + POSEIDON2_INTERNAL_DIAG[i], + MATRIX_DIAG_8_GOLDILOCKS[i].as_canonical_u64() + ); + } + } +} + +#[test] +fn sequence_builder_emits_expected_instruction_count() { + use jolt_inlines_sdk::host::InlineOp; + use tracer::utils::inline_sequence_writer::SequenceInputs; + + const EXPECTED_INSTRUCTION_COUNT: usize = 22_315; + + let inputs = SequenceInputs::default(); + let instructions = crate::sequence_builder::Poseidon2GoldilocksPermutation::build_sequence( + (&inputs).into(), + (&inputs).into(), + ); + assert_eq!(instructions.len(), EXPECTED_INSTRUCTION_COUNT); +} + +#[test] +fn sequence_builder_emission_is_deterministic() { + use jolt_inlines_sdk::host::InlineOp; + use tracer::utils::inline_sequence_writer::SequenceInputs; + + let inputs1 = SequenceInputs::default(); + let inputs2 = SequenceInputs::default(); + let seq1 = crate::sequence_builder::Poseidon2GoldilocksPermutation::build_sequence( + (&inputs1).into(), + (&inputs1).into(), + ); + let seq2 = 
crate::sequence_builder::Poseidon2GoldilocksPermutation::build_sequence( + (&inputs2).into(), + (&inputs2).into(), + ); + assert_eq!(seq1.len(), seq2.len()); + let dbg1: Vec = seq1.iter().map(|i| format!("{i:?}")).collect(); + let dbg2: Vec = seq2.iter().map(|i| format!("{i:?}")).collect(); + assert_eq!(dbg1, dbg2); +} + +#[cfg(test)] +mod emulator { + use core::array; + + use super::*; + use jolt_inlines_sdk::host::{ + instruction::{ld::LD, sd::SD}, + FormatInline, InlineOp, InlineOp as InlineOpTrait, InstrAssembler, Instruction, + VirtualRegisterGuard, + }; + use tracer::utils::inline_test_harness::{InlineMemoryLayout, InlineTestHarness}; + + fn create_harness(output_size: usize) -> InlineTestHarness { + let layout = InlineMemoryLayout::single_input(0, output_size); + InlineTestHarness::new(layout) + } + + fn execute_inline_permutation(initial_state: &[u64; 8]) -> [u64; 8] { + let mut harness = create_harness(64); + harness.setup_registers(); + harness.load_state64(initial_state); + let inline_instr = InlineTestHarness::create_default_instruction( + crate::INLINE_OPCODE, + crate::POSEIDON2_GOLDILOCKS_FUNCT3, + crate::POSEIDON2_GOLDILOCKS_FUNCT7, + ); + harness.execute_inline(inline_instr); + let result_vec = harness.read_output64(8); + let mut result = [0u64; 8]; + result.copy_from_slice(&result_vec); + result + } + + #[test] + fn emulator_permute_all_zero_matches_reference() { + let mut reference = [0u64; 8]; + execute_poseidon2_permutation(&mut reference); + assert_eq!(execute_inline_permutation(&[0u64; 8]), reference); + } + + #[test] + fn emulator_permute_known_input_matches_reference() { + let initial = [1u64, 2, 3, 4, 5, 6, 7, 8]; + let mut reference = initial; + execute_poseidon2_permutation(&mut reference); + assert_eq!(execute_inline_permutation(&initial), reference); + } + + #[test] + fn emulator_permute_stress_matches_reference() { + let mut seed: u64 = 0xFACEFEED0BADBEEF; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + 
.wrapping_add(1442695040888963407); + seed + }; + for _ in 0..50 { + let initial = [ + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + ]; + let mut reference = initial; + execute_poseidon2_permutation(&mut reference); + assert_eq!(execute_inline_permutation(&initial), reference); + } + } + + struct IdentityPermutation; + + impl InlineOpTrait for IdentityPermutation { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x05; + const FUNCT7: u32 = 0x05; + const NAME: &'static str = "IDENTITY_TEST_INLINE"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let vr: [VirtualRegisterGuard; 8] = + array::from_fn(|_| asm.allocator.allocate_for_inline()); + let mut asm = asm; + for (i, reg) in vr.iter().enumerate() { + asm.emit_ld::(**reg, operands.rs1, (i * 8) as i64); + } + for (i, reg) in vr.iter().enumerate() { + asm.emit_s::(operands.rs1, **reg, (i * 8) as i64); + } + drop(vr); + asm.finalize_inline() + } + } + + struct AddRcOnlyTest; + + impl InlineOpTrait for AddRcOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x06; + const FUNCT7: u32 = 0x06; + const NAME: &'static str = "ADD_RC_ONLY_TEST_INLINE"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_and_state_and_add_rc_full(0); + builder.test_store_and_finalize() + } + } + + struct MdsOnlyTest; + + impl InlineOpTrait for MdsOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x09; + const NAME: &'static str = "MDS_ONLY_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_mds_only(); + 
builder.test_store_and_finalize() + } + } + + struct IntDiffOnlyTest; + + impl InlineOpTrait for IntDiffOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x0A; + const NAME: &'static str = "INT_DIFF_ONLY_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_intdiff_only(); + builder.test_store_and_finalize() + } + } + + struct SboxOnlyTest; + + impl InlineOpTrait for SboxOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x0C; + const NAME: &'static str = "SBOX_ONLY_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_sbox_only(); + builder.test_store_and_finalize() + } + } + + struct MulPairsTest; + + impl InlineOpTrait for MulPairsTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x12; + const NAME: &'static str = "MUL_PAIRS_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_mul_pairs(); + builder.test_store_and_finalize() + } + } + + jolt_inlines_sdk::register_inlines! 
{ + trace_file: "poseidon2_test_inlines_trace.joltinline", + extension: jolt_inlines_sdk::host::InlineExtension::Poseidon2Goldilocks, + ops: [IdentityPermutation, AddRcOnlyTest, MdsOnlyTest, IntDiffOnlyTest, SboxOnlyTest, MulPairsTest], + } + + fn run_inline_with_state(funct3: u32, funct7: u32, initial: &[u64; 8]) -> [u64; 8] { + let mut harness = create_harness(64); + harness.setup_registers(); + harness.load_state64(initial); + let instr = + InlineTestHarness::create_default_instruction(crate::INLINE_OPCODE, funct3, funct7); + harness.execute_inline(instr); + let v = harness.read_output64(8); + let mut out = [0u64; 8]; + out.copy_from_slice(&v); + out + } + + #[test] + fn inline_mul_mod_stress_vs_u128() { + let mut seed: u64 = 0xBADC0FFEE0DDF00D; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + seed + }; + for _ in 0..100 { + let mut state = [0u64; 16]; + for i in 0..8 { + state[i] = next() % P; + state[i + 8] = next() % P; + } + let layout = InlineMemoryLayout::single_input(0, 128); + let mut harness = InlineTestHarness::new(layout); + harness.setup_registers(); + harness.load_state64(&state); + let instr = InlineTestHarness::create_default_instruction( + crate::INLINE_OPCODE, + MulPairsTest::FUNCT3, + MulPairsTest::FUNCT7, + ); + harness.execute_inline(instr); + let result = harness.read_output64(8); + for i in 0..8 { + assert_eq!(result[i], u128_mul_mod(state[i], state[i + 8])); + } + } + } + + #[test] + fn add_rc_only_inline_matches_reference() { + let result = + run_inline_with_state(AddRcOnlyTest::FUNCT3, AddRcOnlyTest::FUNCT7, &[0u64; 8]); + let mut expected = [0u64; 8]; + expected.copy_from_slice(&POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[..8]); + assert_eq!(result, expected); + } + + #[test] + fn mds_only_matches_reference() { + let initial = [1u64, 2, 3, 4, 5, 6, 7, 8]; + let got = run_inline_with_state(MdsOnlyTest::FUNCT3, MdsOnlyTest::FUNCT7, &initial); + let mut expected = initial; + 
crate::exec::external_mds(&mut expected); + assert_eq!(got, expected); + } + + #[test] + fn int_diff_only_matches_reference() { + let initial = [11u64, 22, 33, 44, 55, 66, 77, 88]; + let got = run_inline_with_state(IntDiffOnlyTest::FUNCT3, IntDiffOnlyTest::FUNCT7, &initial); + let mut expected = initial; + crate::exec::internal_diffusion(&mut expected); + assert_eq!(got, expected); + } + + #[test] + fn sbox_only_matches_reference() { + let initial: [u64; 8] = POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[..8] + .try_into() + .unwrap(); + let got = run_inline_with_state(SboxOnlyTest::FUNCT3, SboxOnlyTest::FUNCT7, &initial); + let mut expected = [0u64; 8]; + for i in 0..8 { + expected[i] = crate::exec::sbox(initial[i]); + } + assert_eq!(got, expected); + } + + #[test] + fn identity_inline_preserves_state() { + let layout = InlineMemoryLayout::single_input(0, 64); + let mut harness = InlineTestHarness::new(layout); + harness.setup_registers(); + let initial: [u64; 8] = [11, 22, 33, 44, 55, 66, 77, 88]; + harness.load_state64(&initial); + let instr = InlineTestHarness::create_default_instruction( + crate::INLINE_OPCODE, + IdentityPermutation::FUNCT3, + IdentityPermutation::FUNCT7, + ); + harness.execute_inline(instr); + assert_eq!(harness.read_output64(8), initial); + } +} diff --git a/specs/1570-poseidon2-goldilocks-inline.md b/specs/1570-poseidon2-goldilocks-inline.md new file mode 100644 index 0000000000..3369faf137 --- /dev/null +++ b/specs/1570-poseidon2-goldilocks-inline.md @@ -0,0 +1,238 @@ +# Spec: Poseidon2-Goldilocks Inline + +| Field | Value | +|-------------|--------------------------------| +| Author(s) | @jay-clarke | +| Created | 2026-05-24 | +| Revised | 2026-06-10 | +| Status | proposed | +| PR | #1570 | + +## Summary + +Add a Jolt inline for the canonical 8-wide Poseidon2 permutation over the Goldilocks field. +Poseidon2-Goldilocks is a common ZK-native permutation used for proof-friendly commitments and +Merkle-tree style constructions. 
Executing it as ordinary guest Rust expands into many traced RISC-V +instructions; an inline lets Jolt recognize this specific operation and replace it with a +deterministic, tested virtual-instruction expansion. + +## Intent + +### Goal + +Provide a `jolt-inlines-poseidon2-goldilocks` crate that exposes a guest-callable +`poseidon2_permute(&mut [u64; 8])`, registers a `Poseidon2Goldilocks` inline extension with the Jolt +tracer, and expands the custom instruction into a virtual-instruction sequence whose output is identical to that of Plonky3's +canonical `Poseidon2Goldilocks<8>` permutation. + +### Invariants + +1. The host reference implementation produces the same output as Plonky3's canonical + `Poseidon2Goldilocks<8>` for every tested state. +2. The sequence-builder output, when executed through Jolt's inline emulator harness, produces the + same output as the host reference implementation. +3. Goldilocks arithmetic stays in the field `p = 2^64 - 2^32 + 1`; multiplication reduction must + match `u128` modular arithmetic for edge cases and random stress inputs. +4. Round constants are materialized in the same order as the permutation executes them: external + initial constants, internal constants, then external final constants. +5. The internal diagonal matches Plonky3's `MATRIX_DIAG_8_GOLDILOCKS`. +6. The inline mutates only the 8-limb state buffer supplied by `rs1`; round constants are embedded + in the inline expansion as virtual immediates. +7. The inline is gated behind a distinct `InlineExtension::Poseidon2Goldilocks` entry so profiles can + opt into it explicitly. +8. The `(opcode, funct3, funct7)` encoding occupies a funct7 namespace not used by any existing + `jolt-inlines-*` crate (see "Opcode allocation"). + +No `jolt-eval` invariant is proposed in this initial patch because existing inline crates primarily +validate these properties through crate-local unit and emulator tests.
A follow-up could add a shared +inline-permutation equivalence invariant if maintainers want a broader framework-level check. + +### Non-Goals + +1. Supporting Poseidon2 widths other than 8. +2. Supporting fields other than Goldilocks. +3. Providing a sponge/hash API beyond the raw 8-limb permutation. +4. Changing existing Poseidon transcript code over BN254. +5. Replacing or modifying any existing hash/curve inline. +6. Claiming a specific performance improvement before benchmark review. + +## Opcode allocation + +The existing `jolt-inlines-*` crates follow a consistent convention under the shared +`INLINE_OPCODE = 0x0B`: each crate owns one `funct7` namespace, and `funct3` enumerates operations +within that crate. Current allocations on main: + +| funct7 | Crate | funct3 values in use | +|--------|-------------|--------------------------------------| +| 0x00 | sha2 | 0x00 (SHA256), 0x01 (SHA256_INIT) | +| 0x01 | keccak256 | 0x00 | +| 0x02 | blake2 | 0x00 | +| 0x03 | blake3 | 0x00, 0x01 (KEYED64) | +| 0x04 | bigint | 0x00 | +| 0x05 | secp256k1 | 0x00-0x07 | +| 0x06 | grumpkin | 0x00, 0x01 | +| 0x07 | p256 | 0x00-0x07 | + +This crate therefore claims: + +```text +POSEIDON2_GOLDILOCKS_FUNCT7 = 0x08 +POSEIDON2_GOLDILOCKS_FUNCT3 = 0x00 +``` + +which also matches the crate's `inline_extension_code = 8`. Future operations in this crate, such as +a fused two-to-one compression, should take `funct3 = 0x01, 0x02, ...` under `funct7 = 0x08`. + +Pre-merge check: reconfirm the allocation against the authoritative dispatch table in +`jolt-inlines/sdk` on current main after rebase. + +## Evaluation + +### Acceptance Criteria + +- [ ] `cargo check -p jolt-inlines-poseidon2-goldilocks` passes. +- [ ] `cargo check -p jolt-inlines-poseidon2-goldilocks --features host` passes. +- [ ] `cargo test -p jolt-inlines-poseidon2-goldilocks --features host` passes. +- [ ] `cargo test -p jolt-riscv` passes after adding the new inline extension. 
+- [ ] `cargo fmt --check` and workspace clippy pass on the rebased branch. +- [ ] Host permutation tests match Plonky3's default `Poseidon2Goldilocks<8>` path. +- [ ] Host permutation tests match an explicitly constructed generic Plonky3 `Poseidon2` path. +- [ ] Host permutation tests match committed known-answer vectors. +- [ ] Inline emulator tests match the host reference for fixed and randomized states. +- [ ] Goldilocks multiplication tests match `u128` modular arithmetic for edge and random stress cases. +- [ ] The new inline is registered through the existing `register_inlines!` mechanism. +- [ ] The `(0x0B, 0x00, 0x08)` encoding is confirmed free on rebased main. + +### Testing Strategy + +Existing tests that must continue passing: + +- `cargo test -p jolt-riscv` + +New tests added under `jolt-inlines/poseidon2-goldilocks`: + +- Field multiplication reduction tests against `u128`. +- Plonky3 parity tests for all-zero, known, near-modulus, and randomized states. +- Known-answer tests: fixed `(input state -> output state)` vectors committed as constants. +- Round-constant layout tests against Plonky3 constants. +- Internal diagonal tests against Plonky3 constants. +- Sequence-builder determinism tests. +- Inline emulator tests for full permutation and isolated sub-operations. + +Feature coverage is `--features host`, matching the existing inline-crate testing pattern. No `zk` +feature coverage is required for this crate-local inline expansion patch. + +### Performance + +This PR does not claim an end-to-end speedup over ordinary guest Rust. The initial goal is a +reviewable, deterministic, Plonky3-compatible inline surface that gives Jolt a dedicated hook for +future Poseidon2-specific optimization. + +The current inline expansion emits **22,315 virtual instructions per permutation**, including the +virtual-register reset instructions appended by `finalize_inline`. 
The crate-local deterministic +emission test pins this count so future refactors cannot silently regress trace size. + +As a local sanity check, the same permutation compiled as ordinary no-std guest Rust and measured +with Jolt's `start_cycle_tracking` / `end_cycle_tracking` markers emitted: + +```text +"poseidon2_plain": 20504 RV64IMAC cycles + 8 virtual instructions = 20512 total cycles +``` + +That comparison is intentionally not a merge-blocking benchmark: it is a single traced-row count for +one marked guest region, not wall-clock time, prover time, padded proof trace length, or a complete +`jolt-eval` objective. It does show that this first inline is not yet smaller than optimized guest +Rust on raw traced-row count. The main value of this patch is correctness, API shape, opcode +allocation, and a deterministic expansion that can be improved behind the same guest call. + +The inline's 22,315-instruction count is produced by: + +- embedding all 86 round constants as virtual immediates instead of loading them from guest memory, +- using the corrected Goldilocks `mul_mod` reduction path, +- using a no-left-alias `add_mod` variant at call sites where the destination cannot alias the left + operand, and +- leaving the more invasive lazy-reduction optimization out of this initial PR. + +The follow-up performance path is explicit: lazy reduction with a written bounds argument, +diagonal-constant specialization where profitable, a two-to-one compression operation for +Merkle-style use cases, and eventually a prover-side precompile if maintainers want Poseidon2 to +move beyond ordinary RV instruction semantics. If maintainers want a maintained comparison against +plain guest Rust, the natural follow-up is a `jolt-eval` objective comparing the two paths +end-to-end. + +## Design + +### Architecture + +The change follows the existing `jolt-inlines/*` crate pattern: + +- `jolt-inlines/poseidon2-goldilocks/src/sdk.rs` exposes the guest API.
In RISC-V guest builds, it + emits the custom inline instruction. In host builds, it calls the host reference implementation. + On non-RISC-V, non-host targets it panics with a clear message. +- `jolt-inlines/poseidon2-goldilocks/src/exec.rs` contains the standalone host reference + implementation of the 8-wide Poseidon2-Goldilocks permutation. +- `jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs` expands the inline instruction into + virtual RISC-V instructions. +- `jolt-inlines/poseidon2-goldilocks/src/host.rs` registers the inline using the existing + `register_inlines!` macro. +- `crates/jolt-riscv/src/profile.rs` adds `InlineExtension::Poseidon2Goldilocks`. +- The root workspace includes the new crate and adds Plonky3 crates as dev/test dependencies for + parity checks. + +### Instruction contract + +```text +.insn r INLINE_OPCODE, POSEIDON2_GOLDILOCKS_FUNCT3, POSEIDON2_GOLDILOCKS_FUNCT7, x0, rs1, x0 +``` + +- `rs1` points to 8 writable, 8-byte-aligned `u64` limbs representing the state, permuted in place. +- `rs2` is unused and should be encoded as `x0` by SDK callers. +- The inline writes only the 8-limb state buffer and reads no guest memory other than that state. +- Round constants are embedded in the expansion as virtual immediates. + +### Alternatives Considered + +1. **Leave Poseidon2 as ordinary guest Rust.** Rejected because Poseidon2-Goldilocks is a + proof-native primitive likely to appear in Jolt guest programs; inline support gives Jolt a + dedicated, testable hook for this hot operation and future optimization work. +2. **Expose a hash/sponge API instead of the raw permutation.** Rejected for the initial version. + The raw permutation is the narrowest reusable primitive and avoids committing to one + absorption/domain-separation policy. +3. **Support multiple widths immediately.** Rejected to keep the review surface small. Width 8 is + the concrete Plonky3-compatible instance covered by the current implementation and tests. +4. 
**Use fixed test vectors only.** Considered as an alternative to Plonky3 dev dependencies. This + patch does both: Plonky3 parity tests for breadth, plus committed known-answer vectors that + survive if maintainers later choose to drop the Plonky3 dev-dependencies. +5. **Load round constants from a guest-memory table via `rs2`.** Rejected. The Jolt inline + assembler can materialize a full `u64` immediate in one virtual `ADDI`, so a memory table does not + reduce instruction count. Embedding constants also removes 86 guest-memory reads, avoids adding a + 688-byte constants table to guest data, and eliminates the possibility of callers passing the + wrong `rs2` pointer. + +## Documentation + +No Jolt book changes are required for the initial patch because this adds an internal inline crate +and does not change user-facing Jolt APIs or examples. If maintainers want to advertise the inline, +a follow-up can add a short entry to the inline documentation alongside the existing hash and curve +inlines. + +## Execution + +The implementation should: + +1. Add the new inline crate under `jolt-inlines/poseidon2-goldilocks`. +2. Add a `Poseidon2Goldilocks` inline extension entry. +3. Register the inline with `register_inlines!`. +4. Implement field addition, multiplication, S-box, external MDS, internal diffusion, and round + scheduling for the 8-wide Goldilocks instance. +5. Encode at `(INLINE_OPCODE = 0x0B, funct3 = 0x00, funct7 = 0x08)` per "Opcode allocation". +6. Align SPDX headers with the crate license. +7. Keep the module documentation in sync with the `rs1`-only instruction contract. +8. Add parity, KAT, and emulator tests described above. +9. Rebase onto current main; rerun `fmt`/`clippy`/workspace tests; reconfirm the opcode table. + +## References + +- [Poseidon2 paper](https://eprint.iacr.org/2023/323) +- [Plonky3 repository](https://github.com/Plonky3/Plonky3) +- Existing Jolt inline crates under `jolt-inlines/`