Modernizing SFMT-19937 for Zig

A development note on moving SFMT-19937 from a C-shaped port to Zig vector code, and why the useful benchmark only appeared in ReleaseFast.

June 6, 2026 · View on GitHub

The first version of SFMT-19937 in zig-prng was correct, but it still thought like C. It carried a 128-bit word as a struct of four u32 values, walked loops with manual indices, and used scalar helper functions to imitate operations that the original algorithm expected the CPU to do as SIMD. That is a reasonable starting point for a port. It is also a good way to hide the actual shape of the algorithm from Zig. SFMT is not a scalar generator with a SIMD paint job. The name is literal: SIMD-oriented Fast Mersenne Twister. If a modern implementation makes the vector path slower, something specific has gone wrong. astro-island,astro-slot,astro-static-slot{display:contents}(()=>{var a=(s,i,o)=>{let r=async()=>{await(await s())()},t=typeof i.value=="object"?i.value:void 0,c={rootMargin:t==null?void 0:t.rootMargin},n=new IntersectionObserver(e=>{for(let l of e)if(l.isIntersecting){n.disconnect(),r();break}},c);for(let e of o.children)n.observe(e)};(self.Astro||(self.Astro={})).visible=a;window.dispatchEvent(new Event("astro:visible"));})();(()=>{var g=Object.defineProperty;var w=(a,s,c)=>s in a?g(a,s,{enumerable:!0,configurable:!0,writable:!0,value:c}):a[s]=c;var l=(a,s,c)=>w(a,typeof s!="symbol"?s+"":s,c);var E=new Set(["__proto__","constructor","prototype"]);{let a={0:t=>y(t),1:t=>c(t),2:t=>new RegExp(t),3:t=>new Date(t),4:t=>new Map(c(t)),5:t=>new Set(c(t)),6:t=>BigInt(t),7:t=>new URL(t),8:t=>new Uint8Array(t),9:t=>new Uint16Array(t),10:t=>new Uint32Array(t),11:t=>Number.POSITIVE_INFINITY*t},s=t=>{let[p,e]=t;return p in a?a[p](e):void 0},c=t=>t.map(s),y=t=>typeof t!="object"||t===null?t:Object.fromEntries(Object.entries(t).map(([p,e])=>[p,s(e)]));class f extends HTMLElement{constructor(){super(...arguments);l(this,"Component");l(this,"hydrator");l(this,"hydrate",async()=>{var b;if(!this.hydrator||!this.isConnected)return;let e=(b=this.parentElement)==null?void 0:b.closest("astro-island[ssr]");if(e){e.addEventListener("astro:hydrate",this.hydrate,{once:!0});return}let 
r=this.querySelectorAll("astro-slot"),n={},d=this.querySelectorAll("template[data-astro-template]");for(let o of d){let i=o.closest(this.tagName);i!=null&&i.isSameNode(this)&&(n[o.getAttribute("data-astro-template")||"default"]=o.innerHTML,o.remove())}for(let o of r){let i=o.closest(this.tagName);i!=null&&i.isSameNode(this)&&(n[o.getAttribute("name")||"default"]=o.innerHTML)}let u;try{u=this.hasAttribute("props")?y(JSON.parse(this.getAttribute("props"))):{}}catch(o){let i=this.getAttribute("component-url")||"<unknown>",v=this.getAttribute("component-export");throw v&&(i+=` (export ${v})`),console.error(`[hydrate] Error parsing props for component ${i}`,this.getAttribute("props"),o),o}let h;await this.hydrator(this)(this.Component,u,n,{client:this.getAttribute("client")}),this.removeAttribute("ssr"),this.dispatchEvent(new CustomEvent("astro:hydrate"))});l(this,"unmount",()=>{this.isConnected||this.dispatchEvent(new CustomEvent("astro:unmount"))})}disconnectedCallback(){document.removeEventListener("astro:after-swap",this.unmount),document.addEventListener("astro:after-swap",this.unmount,{once:!0})}connectedCallback(){if(!this.hasAttribute("await-children")||document.readyState==="interactive"||document.readyState==="complete")this.childrenConnectedCallback();else{let e=()=>{document.removeEventListener("DOMContentLoaded",e),r.disconnect(),this.childrenConnectedCallback()},r=new MutationObserver(()=>{var n;((n=this.lastChild)==null?void 0:n.nodeType)===Node.COMMENT_NODE&&this.lastChild.nodeValue==="astro:end"&&(this.lastChild.remove(),e())});r.observe(this,{childList:!0}),document.addEventListener("DOMContentLoaded",e)}}async childrenConnectedCallback(){let e=this.getAttribute("before-hydration-url");e&&await import(e),this.start()}getRetryImportUrl(e){let r=new URL(e,document.baseURI);return r.searchParams.set("astro-retry",Date.now().toString()),r.toString()}async importWithRetry(e){try{return await import(e)}catch(r){return await new 
Promise(n=>setTimeout(n,1e3)),import(this.getRetryImportUrl(e))}}handleHydrationError(e){let r=this.getAttribute("component-url"),n=new CustomEvent("astro:hydration-error",{cancelable:!0,bubbles:!0,composed:!0,detail:{error:e,componentUrl:r}});this.dispatchEvent(n)&&console.error(`[astro-island] Error hydrating ${r}`,e)}async start(){let e=JSON.parse(this.getAttribute("opts")),r=this.getAttribute("client");if(Astro[r]===void 0){window.addEventListener(`astro:${r}`,()=>this.start(),{once:!0});return}try{await Astro[r](async()=>{let n=this.getAttribute("renderer-url");try{let[d,{default:u}]=await Promise.all([this.importWithRetry(this.getAttribute("component-url")),n?this.importWithRetry(n):Promise.resolve({default:()=>()=>{}})]),h=this.getAttribute("component-export")||"default";if(h.includes(".")){this.Component=d;for(let m of h.split(".")){if(E.has(m)||!this.Component||typeof this.Component!="object"&&typeof this.Component!="function"||!Object.hasOwn(this.Component,m))throw new Error(`Invalid component export path: ${h}`);this.Component=this.Component[m]}}else{if(E.has(h))throw new Error(`Invalid component export path: ${h}`);this.Component=d[h]}return this.hydrator=u,this.hydrate}catch(d){return this.handleHydrationError(d),()=>{}}},e,this)}catch(n){this.handleHydrationError(n)}}attributeChangedCallback(){this.hydrate()}}l(f,"observedAttributes",["props"]),customElements.get("astro-island")||customElements.define("astro-island",f)}})();ReleaseFast throughputThe takeoff only appears when the optimizer can see the SIMD shape.5.0xStruct baselineOriginal C-shaped state representation332 M/s@Vector rewriteByte-lane shifts expressed with @shuffle1,618 M/sDocumented finalCorrectness held against SFMT 1.5.1 vectors1,675 M/sDebug mode told the wrong story: the vector rewrite looked slower because the conversions and helper calls were still visible to the compiler.Baseline82 M/sNaive vector48 M/sPointer rollback57 M/s@shuffle70 M/sOne billion generated u32 values per run. 
The checksum stayed fixed at 0x7c7d388d.

Establishing the invariant

Before changing the implementation, I added a benchmark that generated one billion u32 values and reported throughput. The first measurement was boring in the useful way: 82 M/s in Debug mode, with the reference vectors still passing. The important number was not the throughput. It was the checksum, 0x7c7d388d. That value became the guardrail. Every refactor could move the code around, but it had to keep producing the same stream.

The easy cleanup did not move speed

The first pass replaced C-style while loops with Zig range for loops where the iteration count was fixed. That made the recurrence code read like Zig instead of a translation unit with different punctuation. It did not change performance. That was expected. Loop syntax was not the bottleneck, and the benchmark stayed at 82 M/s. I left the initByArray loops alone. They carry interdependent counters and modular arithmetic, so the explicit while form is clearer there. Idiomatic code is not code that removes every older-looking construct. It is code that uses the control flow that matches the job.

The vector rewrite got slower first

The next pass replaced the 128-bit word struct with @Vector(4, u32). On paper, that was the right move. The state now had a type that matched the algorithm. The benchmark disagreed. Debug throughput dropped to 48 M/s. My first guess was that passing 128-bit vectors by value had introduced extra copying. Changing the recursion helper back to pointer parameters improved the number to 57 M/s, but that was still slower than the baseline. The signature was not the real problem. The real problem was a shift.

The byte-lane shift was the leak

The SSE2 reference uses two different kinds of right shift. One shifts each u32 element. The other shifts the whole 128-bit register by bytes. Those are different operations, even though they sit near each other in the recurrence.
The Zig code was simulating the byte-lane shift with scalar u64 composition. It packed halves of the register, shifted them, then unpacked the result. That preserved the bits, but it destroyed the compiler’s ability to see a single vector operation. The fix was to express the operation as a byte shuffle. Reinterpret the vector as sixteen bytes, use @shuffle with a zero vector as the second input, and let negative mask lanes pull zeros into the vacated bytes. That matches the _mm_srli_si128 semantics without routing through scalar arithmetic. Once that changed, the structure of the code matched the structure of the algorithm. The ReleaseFast benchmark finally showed it: 332 M/s for the old struct-based version, 1,618 M/s for the vector implementation, and 1,675 M/s after the final cleanup.

Debug mode was useful, then misleading

Debug mode helped catch regressions quickly, but it was the wrong place to judge the vector rewrite. The extra conversions and helper calls had not been inlined away, so the benchmark was measuring scaffolding that ReleaseFast would remove. That does not make the Debug measurements useless. They still told me something was off during the false starts. They just could not answer the final performance question. The lesson is narrow but durable: if a SIMD-designed algorithm gets slower after moving to vector types, inspect the places where the abstraction leaks. In this case, correctness hid the problem. The byte-lane shift was bit-exact, but the implementation no longer looked like SIMD to the compiler. The finished version keeps compatibility with the SFMT 1.5.1 reference vectors for both init_gen_rand and init_by_array. It also reads like the implementation it wanted to be: vector state, vector recurrence, byte-lane shifts written as byte-lane shifts.