From 91ed4475b14a1feca8667b358c6304e03933dcc1 Mon Sep 17 00:00:00 2001
From: Guilleag01
Date: Thu, 25 Sep 2025 20:54:10 +0200
Subject: [PATCH] AVX512, my cpu doesn't support it, lmao

Store the Keccak state as bytes again and view it as 8-byte lanes through
`as_chunks_mut::<8>()`, replacing the `*_lane2` helpers. The theta column
XOR now goes through AVX-512 intrinsics, and main() reports the average of
1000 hashing runs instead of a single timing.

The two vectors are built with `_mm512_setr_epi64` so that element y holds
lane (x, y). `_mm512_set_epi64` takes its arguments highest element first,
which would leave `out[0..3]` zero after the store and write the remaining
lanes back in swapped order.

---
Review notes (ignored by `git am`):
 * The AVX-512 path has not been run on AVX-512F hardware. There is no
   `is_x86_feature_detected!("avx512f")` guard, so on any other CPU the
   theta step is undefined behaviour (in practice an illegal-instruction
   fault). A guarded helper marked `#[target_feature(enable = "avx512f")]`
   with the scalar `xor_lane` loop as fallback is the follow-up.
 * The blob ids on the `index` lines were not regenerated after the
   `setr` change; hunk sizes and the diffstat are unchanged.

 src/main.rs | 20 ++++++++-----
 src/sha3.rs | 84 ++++++++++++++++++++++++++++-------------------------
 2 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 724791b..659cadb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,18 +17,24 @@ fn main() {
 
     file.read_to_end(&mut file_data).unwrap();
 
-    let mut sha = Sha3_256::default();
-
     // println!("{:?}", (0x01 as u64).to_ne_bytes());
 
     // let text = "hola";
 
-    let now = time::Instant::now();
+    let mut time = 0_f32;
 
-    sha.absorb(&file_data);
-    let res: [u8; 32] = sha.squeeze();
+    let mut res: [u8; 32] = [0_u8; 32];
 
-    let elapsed = now.elapsed().as_micros() as f32;
+    for _ in 0..1000 {
+        let mut sha = Sha3_256::default();
+        let now = time::Instant::now();
+
+        sha.absorb(&file_data);
+        res = sha.squeeze();
+
+        let elapsed = now.elapsed().as_micros() as f32;
+        time += elapsed;
+    }
 
     // let expected_res: [u8; 32] = [
     //     0x8a, 0xf1, 0x3d, 0x92, 0x44, 0x61, 0x8e, 0xee, 0x87, 0x6d, 0x04, 0x31, 0xf3, 0x44, 0x9a,
@@ -42,7 +48,7 @@ fn main() {
     }
     println!();
 
-    println!("Time taken: {} ms", elapsed / 1000_f32);
+    println!("Avg Time taken: {} ms", (time / 1000_f32) / 1000_f32);
 
     // assert!(res == expected_res);
 }
diff --git a/src/sha3.rs b/src/sha3.rs
index c596786..958e14c 100644
--- a/src/sha3.rs
+++ b/src/sha3.rs
@@ -1,7 +1,7 @@
 // Rate: 1088
 // Capacity: 512
 
-// use std::arch::x86_64::_mm256_xor_epi64;
+use std::arch::x86_64::*;
 
 use std::array;
 
@@ -9,28 +9,24 @@ use crate::consts::LFSR_LUT;
 
 const RATE_256: usize = 136;
 const TOTAL_STATE_SIZE: usize = 200;
-const TOTAL_STATE_SIZE_U64: usize = 25;
 const ROUNDS: usize = 24;
 const DELIMITER_SUFFIX: u8 = 0x06; // delimiter suffix for sha3
 
 #[derive(Debug)]
 pub struct Sha3_256 {
-    state: [u64; TOTAL_STATE_SIZE_U64],
+    state: [u8; TOTAL_STATE_SIZE],
 }
 
 impl Default for Sha3_256 {
     fn default() -> Self {
         Self {
-            state: [0; TOTAL_STATE_SIZE_U64],
+            state: [0; TOTAL_STATE_SIZE],
         }
     }
 }
 
 impl Sha3_256 {
     pub fn absorb(&mut self, input: &[u8]) {
-        // let (inputs_u64, rem) = input.as_chunks::<8>();
-        let state_u8 = [u8; 200];
-
         // Xor input with rate
         let mut remaining = input.len();
         let mut off = 0;
@@ -84,29 +80,51 @@ impl Sha3_256 {
 }
 
 fn keccak_permute(input: &mut [u8; TOTAL_STATE_SIZE]) {
-    // let (lanes, _) = input.as_chunks_mut::<8>();
+    let (lanes, _) = input.as_chunks_mut::<8>();
 
     let mut lfsr_state = 0x01_u8;
     for _ in 0..ROUNDS {
         // θ step
         let c: [u64; 5] = array::from_fn(|x| {
-            get_lane2(input, x, 0)
-                ^ get_lane2(input, x, 1)
-                ^ get_lane2(input, x, 2)
-                ^ get_lane2(input, x, 3)
-                ^ get_lane2(input, x, 4)
+            get_lane(lanes, x, 0)
+                ^ get_lane(lanes, x, 1)
+                ^ get_lane(lanes, x, 2)
+                ^ get_lane(lanes, x, 3)
+                ^ get_lane(lanes, x, 4)
         });
         let mut d: u64;
+
         for x in 0..5 {
             d = c[(x + 4) % 5] ^ rol64(c[(x + 1) % 5], 1);
-            for y in 0..5 {
-                xor_lane2(d, input, x, y);
+            // Column XOR via AVX-512. `setr` keeps lane y in element y, so
+            // `out[y]` lines up with the write-back loop below.
+            // SAFETY: `out` holds 8 u64s (64 bytes); requires AVX-512F at runtime.
+            let mut out = [0_u64; 8];
+            unsafe {
+                let a: __m512i =
+                    _mm512_setr_epi64(d as i64, d as i64, d as i64, d as i64, d as i64, 0, 0, 0);
+
+                let b: __m512i = _mm512_setr_epi64(
+                    get_lane(lanes, x, 0) as i64,
+                    get_lane(lanes, x, 1) as i64,
+                    get_lane(lanes, x, 2) as i64,
+                    get_lane(lanes, x, 3) as i64,
+                    get_lane(lanes, x, 4) as i64,
+                    0,
+                    0,
+                    0,
+                );
+                let res = _mm512_xor_epi64(a, b);
+                _mm512_storeu_epi64(out.as_mut_ptr() as *mut i64, res);
             }
+            for i in 0..5 {
+                set_lane(out[i], x, i, lanes);
+            }
         }
 
         // ρ and π steps
         let (mut x, mut y) = (1, 0);
-        let mut current = get_lane2(input, x, y);
+        let mut current = get_lane(lanes, x, y);
         let mut temp: u64;
 
         for t in 0..24 {
@@ -115,23 +133,24 @@ fn keccak_permute(input: &mut [u8; TOTAL_STATE_SIZE]) {
             x = y;
             y = y2;
 
-            temp = get_lane2(input, x, y);
-            set_lane2(rol64(current, r), x, y, input);
+            temp = get_lane(lanes, x, y);
+            set_lane(rol64(current, r), x, y, lanes);
             current = temp;
         }
 
         // χ step
-        let mut temp2 = [0_u64; 5];
         for y in 0..5 {
+            // let mut temp2 = [0_u64; 5];
+            // for x in 0..5 {
+            //     temp2[x] = get_lane(lanes, x, y);
+            // }
+            let temp2: [u64; 5] = array::from_fn(|x| get_lane(lanes, x, y));
             for x in 0..5 {
-                temp2[x] = get_lane2(input, x, y);
-            }
-            for x in 0..5 {
-                set_lane2(
+                set_lane(
                     temp2[x] ^ ((!temp2[(x + 1) % 5]) & temp2[(x + 2) % 5]),
                     x,
                     y,
-                    input,
+                    lanes,
                 );
             }
         }
@@ -147,7 +166,7 @@ fn keccak_permute(input: &mut [u8; TOTAL_STATE_SIZE]) {
             // }
 
             if lfsr_out {
-                xor_lane2((1 as u64) << bit_pos, input, 0, 0);
+                xor_lane((1 as u64) << bit_pos, lanes, 0, 0);
             }
         }
     }
@@ -158,22 +177,11 @@ fn get_lane(lanes: &[[u8; 8]], x: usize, y: usize) -> u64 {
     u64::from_ne_bytes(lanes[x + 5 * y])
 }
 
-#[inline]
-fn get_lane2(lanes: &[64; TOTAL_STATE_SIZE], x: usize, y: usize) -> u64 {
-    u64::from_ne_bytes(lanes[x + 5 * y..x + 5 * y + 8].try_into().unwrap())
-}
-
 #[inline]
 fn set_lane(lane: u64, x: usize, y: usize, lanes: &mut [[u8; 8]]) {
     lanes[x + 5 * y] = lane.to_ne_bytes();
 }
 
-#[inline]
-fn set_lane2(lane: u64, x: usize, y: usize, lanes: &mut [u8; TOTAL_STATE_SIZE]) {
-    // lanes[x + 5 * y] = lane.to_ne_bytes();
-    lanes[x + 5 * y..x + 5 * y + 8].copy_from_slice(&lane.to_ne_bytes());
-}
-
 #[inline]
 fn rol64(v: u64, off: usize) -> u64 {
     ((v) << off) ^ ((v) >> (64 - off))
@@ -183,10 +191,6 @@ fn rol64(v: u64, off: usize) -> u64 {
 fn xor_lane(lane: u64, lanes: &mut [[u8; 8]], x: usize, y: usize) {
     set_lane(get_lane(lanes, x, y) ^ lane, x, y, lanes);
 }
-#[inline]
-fn xor_lane2(lane: u64, lanes: &mut [u8; TOTAL_STATE_SIZE], x: usize, y: usize) {
-    set_lane2(get_lane2(lanes, x, y) ^ lane, x, y, lanes);
-}
 
 // Function that computes the linear feedback shift register (LFSR)
 // I have absolutely no idea wtf is this shit. Copied from a github repo lol.