diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4dcbf5438fe..8209933db42 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## **[Unreleased]**
 
+- [#1303](https://github.com/wasmerio/wasmer/pull/1303) Add NaN canonicalization for the singlepass backend.
 - [#1305](https://github.com/wasmerio/wasmer/pull/1305) Handle panics from DynamicFunc.
 - [#1301](https://github.com/wasmerio/wasmer/pull/1301) Update supported stable Rust version to 1.41.1.
 - [#1300](https://github.com/wasmerio/wasmer/pull/1300) Add support for multiple versions of WASI tests: wasitests now test all versions of WASI.
diff --git a/lib/clif-backend/src/code.rs b/lib/clif-backend/src/code.rs
index ada71d0a394..3b9bdf3940c 100644
--- a/lib/clif-backend/src/code.rs
+++ b/lib/clif-backend/src/code.rs
@@ -19,7 +19,7 @@ use std::mem;
 use std::sync::{Arc, RwLock};
 use wasmer_runtime_core::error::CompileError;
 use wasmer_runtime_core::{
-    backend::{CacheGen, Token},
+    backend::{CacheGen, CompilerConfig, Token},
     cache::{Artifact, Error as CacheError},
     codegen::*,
     memory::MemoryType,
@@ -36,7 +36,7 @@ use wasmparser::Type as WpType;
 static BACKEND_ID: &str = "cranelift";
 
 pub struct CraneliftModuleCodeGenerator {
-    isa: Box<dyn isa::TargetIsa>,
+    isa: Option<Box<dyn isa::TargetIsa>>,
     signatures: Option<Arc<Map<SigIndex, FuncSig>>>,
     pub clif_signatures: Map<SigIndex, ir::Signature>,
     function_signatures: Option<Arc<Map<FuncIndex, SigIndex>>>,
@@ -47,9 +47,8 @@ impl ModuleCodeGenerator<CraneliftFunctionCodeGenerator, Caller, CodegenError>
     for CraneliftModuleCodeGenerator
 {
     fn new() -> Self {
-        let isa = get_isa();
         CraneliftModuleCodeGenerator {
-            isa,
+            isa: None,
             clif_signatures: Map::new(),
             functions: vec![],
             function_signatures: None,
@@ -100,7 +99,7 @@ impl ModuleCodeGenerator<CraneliftFunctionCodeGenerator, Caller, CodegenError>
             position: Position::default(),
             func_env: FunctionEnvironment {
                 module_info: Arc::clone(&module_info),
-                target_config: self.isa.frontend_config().clone(),
+                target_config: self.isa.as_ref().unwrap().frontend_config().clone(),
                 clif_signatures: self.clif_signatures.clone(),
             },
             loc,
@@ -162,9 +161,9 @@ impl ModuleCodeGenerator<CraneliftFunctionCodeGenerator, Caller, CodegenError>
         }
 
         let (func_resolver_builder, debug_metadata, handler_data) =
-            FuncResolverBuilder::new(&*self.isa, func_bodies, module_info)?;
+            FuncResolverBuilder::new(&**self.isa.as_ref().unwrap(), func_bodies, module_info)?;
 
-        let trampolines = Arc::new(Trampolines::new(&*self.isa, module_info));
+        let trampolines = Arc::new(Trampolines::new(&**self.isa.as_ref().unwrap(), module_info));
 
         let signatures_empty = Map::new();
         let signatures = if self.signatures.is_some() {
@@ -191,9 +190,19 @@ impl ModuleCodeGenerator<CraneliftFunctionCodeGenerator, Caller, CodegenError>
         ))
     }
 
+    fn feed_compiler_config(&mut self, config: &CompilerConfig) -> Result<(), CodegenError> {
+        self.isa = Some(get_isa(Some(config)));
+        Ok(())
+    }
+
     fn feed_signatures(&mut self, signatures: Map<SigIndex, FuncSig>) -> Result<(), CodegenError> {
         self.signatures = Some(Arc::new(signatures));
-        let call_conv = self.isa.frontend_config().default_call_conv;
+        let call_conv = self
+            .isa
+            .as_ref()
+            .unwrap()
+            .frontend_config()
+            .default_call_conv;
         for (_sig_idx, func_sig) in self.signatures.as_ref().unwrap().iter() {
             self.clif_signatures
                 .push(convert_func_sig(func_sig, call_conv));
@@ -1302,7 +1311,10 @@ fn generate_signature(
 }
 
 fn pointer_type(mcg: &CraneliftModuleCodeGenerator) -> ir::Type {
-    ir::Type::int(u16::from(mcg.isa.frontend_config().pointer_bits())).unwrap()
+    // Integer type as wide as a target pointer. `isa` starts out as `None` in `new()` and is
+    // filled in by `feed_compiler_config`, so a missing ISA here is a call-order bug.
+    let isa = mcg.isa.as_ref().expect("ISA not set; call feed_compiler_config first");
+    ir::Type::int(u16::from(isa.frontend_config().pointer_bits())).unwrap()
 }
 
 /// Declare local variables for the signature parameters that correspond to WebAssembly locals.
diff --git a/lib/clif-backend/src/lib.rs b/lib/clif-backend/src/lib.rs
index 95fd334a4ad..c4fb65736c3 100644
--- a/lib/clif-backend/src/lib.rs
+++ b/lib/clif-backend/src/lib.rs
@@ -29,6 +29,7 @@ use cranelift_codegen::{
     settings::{self, Configurable},
 };
 use target_lexicon::Triple;
+use wasmer_runtime_core::{backend::CompilerConfig, codegen::SimpleStreamingCompilerGen};
 
 #[macro_use]
 extern crate serde_derive;
@@ -36,7 +37,7 @@ extern crate serde_derive;
 extern crate rayon;
 extern crate serde;
 
-fn get_isa() -> Box<dyn isa::TargetIsa> {
+fn get_isa(config: Option<&CompilerConfig>) -> Box<dyn isa::TargetIsa> {
     let flags = {
         let mut builder = settings::builder();
         builder.set("opt_level", "speed_and_size").unwrap();
@@ -48,6 +49,12 @@ fn get_isa() -> Box<dyn isa::TargetIsa> {
             builder.set("enable_verifier", "false").unwrap();
         }
 
+        if let Some(config) = config {
+            if config.nan_canonicalization {
+                builder.set("enable_nan_canonicalization", "true").unwrap();
+            }
+        }
+
         let flags = settings::Flags::new(builder);
         debug_assert_eq!(flags.opt_level(), settings::OptLevel::SpeedAndSize);
         flags
@@ -58,8 +65,6 @@ fn get_isa() -> Box<dyn isa::TargetIsa> {
 /// The current version of this crate
 pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 
-use wasmer_runtime_core::codegen::SimpleStreamingCompilerGen;
-
 /// Streaming compiler implementation for the Cranelift backed. Compiles web assembly binary into
 /// machine code.
 pub type CraneliftCompiler = SimpleStreamingCompilerGen<
diff --git a/lib/clif-backend/src/trampoline.rs b/lib/clif-backend/src/trampoline.rs
index 70854c1ea90..6b6c5d7ea08 100644
--- a/lib/clif-backend/src/trampoline.rs
+++ b/lib/clif-backend/src/trampoline.rs
@@ -212,8 +212,7 @@ fn wasm_ty_to_clif(ty: Type) -> ir::types::Type {
 }
 
 fn generate_trampoline_signature() -> ir::Signature {
-    let isa = super::get_isa();
-    let call_convention = isa.default_call_conv();
+    let call_convention = super::get_isa(None).default_call_conv();
     let mut sig = ir::Signature::new(call_convention);
 
     let ptr_param = ir::AbiParam {
@@ -229,8 +228,7 @@ fn generate_trampoline_signature() -> ir::Signature {
 }
 
 fn generate_export_signature(func_sig: &FuncSig) -> ir::Signature {
-    let isa = super::get_isa();
-    let call_convention = isa.default_call_conv();
+    let call_convention = super::get_isa(None).default_call_conv();
     let mut export_clif_sig = ir::Signature::new(call_convention);
 
     let func_sig_iter = func_sig.params().iter().map(|wasm_ty| ir::AbiParam {
diff --git a/lib/llvm-backend/src/code.rs b/lib/llvm-backend/src/code.rs
index cebde5daefa..822447076d5 100644
--- a/lib/llvm-backend/src/code.rs
+++ b/lib/llvm-backend/src/code.rs
@@ -3701,7 +3701,7 @@ impl<'ctx> FunctionCodeGenerator<CodegenError> for LLVMFunctionCodeGenerator<'ct
                     .try_as_basic_value()
                     .left()
                     .unwrap();
-                state.push1_extra(res, i);
+                state.push1_extra(res, i | ExtraInfo::pending_f32_nan());
             }
             Operator::F64Trunc => {
                 let (v, i) = state.pop1_extra()?;
@@ -3714,7 +3714,7 @@ impl<'ctx> FunctionCodeGenerator<CodegenError> for LLVMFunctionCodeGenerator<'ct
                     .try_as_basic_value()
                     .left()
                     .unwrap();
-                state.push1_extra(res, i);
+                state.push1_extra(res, i | ExtraInfo::pending_f64_nan());
             }
             Operator::F32Nearest => {
                 let (v, i) = state.pop1_extra()?;
@@ -3727,7 +3727,7 @@ impl<'ctx> FunctionCodeGenerator<CodegenError> for LLVMFunctionCodeGenerator<'ct
                     .try_as_basic_value()
                     .left()
                     .unwrap();
-                state.push1_extra(res, i);
+                state.push1_extra(res, i | ExtraInfo::pending_f32_nan());
             }
             Operator::F64Nearest => {
                 let (v, i) = state.pop1_extra()?;
@@ -3740,7 +3740,7 @@ impl<'ctx> FunctionCodeGenerator<CodegenError> for LLVMFunctionCodeGenerator<'ct
                     .try_as_basic_value()
                     .left()
                     .unwrap();
-                state.push1_extra(res, i);
+                state.push1_extra(res, i | ExtraInfo::pending_f64_nan());
             }
             Operator::F32Abs => {
                 let (v, i) = state.pop1_extra()?;
diff --git a/lib/runtime-core/src/backend.rs b/lib/runtime-core/src/backend.rs
index 4aca2d2a71c..eabe526d8c7 100644
--- a/lib/runtime-core/src/backend.rs
+++ b/lib/runtime-core/src/backend.rs
@@ -132,6 +132,10 @@ pub struct CompilerConfig {
     /// When enabled there can be a small amount of runtime performance overhead.
     pub full_preemption: bool,
 
+    /// Always choose a unique bit representation for NaN.
+    /// Enabling this makes execution deterministic but increases runtime overhead.
+    pub nan_canonicalization: bool,
+
     pub features: Features,
 
     // Target info. Presently only supported by LLVM.
diff --git a/lib/singlepass-backend/src/codegen_x64.rs b/lib/singlepass-backend/src/codegen_x64.rs
index 5903508fca1..38df4e67b6b 100644
--- a/lib/singlepass-backend/src/codegen_x64.rs
+++ b/lib/singlepass-backend/src/codegen_x64.rs
@@ -39,8 +39,8 @@ use wasmer_runtime_core::{
     structures::{Map, TypedIndex},
     typed_func::{Trampoline, Wasm},
     types::{
-        FuncIndex, FuncSig, GlobalIndex, LocalFuncIndex, LocalOrImport, MemoryIndex, SigIndex,
-        TableIndex, Type,
+        FuncIndex, FuncSig, GlobalIndex, ImportedGlobalIndex, LocalFuncIndex, LocalGlobalIndex,
+        LocalOrImport, MemoryIndex, SigIndex, TableIndex, Type,
     },
     vm::{self, LocalGlobal, LocalTable, INTERNALS_SIZE},
     wasmparser::{MemoryImmediate, Operator, Type as WpType, TypeOrFuncType as WpTypeOrFuncType},
@@ -220,8 +220,12 @@ pub struct X64FunctionCode {
     returns: SmallVec<[WpType; 1]>,
     locals: Vec<Location>,
     num_params: usize,
-    num_locals: usize,
+    local_types: Vec<WpType>,
     value_stack: Vec<Location>,
+
+    /// Metadata about floating point values on the stack.
+    fp_stack: Vec<FloatValue>,
+
     control_stack: Vec<ControlFrame>,
     machine: Machine,
     unreachable_depth: usize,
@@ -231,6 +235,127 @@ pub struct X64FunctionCode {
     exception_table: Option<ExceptionTable>,
 }
 
+/// Metadata about a floating-point value on the value stack (one entry of `fp_stack`).
+#[derive(Copy, Clone, Debug)]
+struct FloatValue {
+    /// If `Some(ty)`, canonicalize as `ty` before the value's bit pattern is next observed.
+    canonicalization: Option<CanonicalizeType>,
+
+    /// Index of the corresponding entry in the main value stack (`value_stack`).
+    depth: usize,
+}
+
+impl FloatValue {
+    /// Builds the metadata from an explicit canonicalization state and value-stack index.
+    fn with_state(canonicalization: Option<CanonicalizeType>, depth: usize) -> Self {
+        Self {
+            canonicalization,
+            depth,
+        }
+    }
+
+    /// Value at stack index `depth` with no canonicalization pending.
+    fn new(depth: usize) -> Self {
+        Self::with_state(None, depth)
+    }
+
+    /// Value at stack index `depth` with an F32 canonicalization pending.
+    fn cncl_f32(depth: usize) -> Self {
+        Self::with_state(Some(CanonicalizeType::F32), depth)
+    }
+
+    /// Value at stack index `depth` with an F64 canonicalization pending.
+    fn cncl_f64(depth: usize) -> Self {
+        Self::with_state(Some(CanonicalizeType::F64), depth)
+    }
+
+    /// Metadata for index `depth` derived from `self`: a pending F32 canonicalization becomes
+    /// a pending F64 one, `None` stays `None`. Panics if an F64 one is already pending.
+    fn promote(self, depth: usize) -> FloatValue {
+        let widened = self.canonicalization.map(|pending| match pending {
+            CanonicalizeType::F32 => CanonicalizeType::F64,
+            CanonicalizeType::F64 => panic!("cannot promote F64"),
+        });
+        Self::with_state(widened, depth)
+    }
+
+    /// Metadata for index `depth` derived from `self`: a pending F64 canonicalization becomes
+    /// a pending F32 one, `None` stays `None`. Panics if an F32 one is already pending.
+    fn demote(self, depth: usize) -> FloatValue {
+        let narrowed = self.canonicalization.map(|pending| match pending {
+            CanonicalizeType::F64 => CanonicalizeType::F32,
+            CanonicalizeType::F32 => panic!("cannot demote F32"),
+        });
+        Self::with_state(narrowed, depth)
+    }
+}
+
+/// Floating-point type of a value whose NaN canonicalization is still pending.
+/// The type is tracked here because it is not always available elsewhere when it is needed.
+#[derive(Copy, Clone, Debug)]
+enum CanonicalizeType {
+    F32,
+    F64,
+}
+
+impl CanonicalizeType {
+    fn to_size(&self) -> Size {
+        match self {
+            CanonicalizeType::F32 => Size::S32,
+            CanonicalizeType::F64 => Size::S64,
+        }
+    }
+}
+
+trait PopMany<T> {
+    fn peek1(&self) -> Result<&T, CodegenError>;
+    fn pop1(&mut self) -> Result<T, CodegenError>;
+    fn pop2(&mut self) -> Result<(T, T), CodegenError>;
+}
+
+impl<T> PopMany<T> for Vec<T> {
+    /// Borrows the last element; an empty vector yields a `CodegenError`.
+    fn peek1(&self) -> Result<&T, CodegenError> {
+        self.last().ok_or_else(|| CodegenError {
+            message: "peek1() expects at least 1 element".into(),
+        })
+    }
+
+    /// Removes and returns the last element; an empty vector yields a `CodegenError`.
+    fn pop1(&mut self) -> Result<T, CodegenError> {
+        self.pop().ok_or_else(|| CodegenError {
+            message: "pop1() expects at least 1 element".into(),
+        })
+    }
+
+    /// Removes the last two elements and returns them as `(second-to-last, last)`.
+    /// With fewer than two elements the vector is unchanged and a `CodegenError` is returned.
+    fn pop2(&mut self) -> Result<(T, T), CodegenError> {
+        if self.len() >= 2 {
+            let top = self.pop().unwrap();
+            let below = self.pop().unwrap();
+            Ok((below, top))
+        } else {
+            Err(CodegenError {
+                message: "pop2() expects at least 2 elements".into(),
+            })
+        }
+    }
+}
+
+trait WpTypeExt {
+    fn is_float(&self) -> bool;
+}
+
+impl WpTypeExt for WpType {
+    fn is_float(&self) -> bool {
+        match self {
+            WpType::F32 | WpType::F64 => true,
+            _ => false,
+        }
+    }
+}
+
 enum FuncPtrInner {}
 #[repr(transparent)]
 #[derive(Copy, Clone, Debug)]
@@ -282,6 +407,7 @@ pub struct ControlFrame {
     pub if_else: IfElseState,
     pub returns: SmallVec<[WpType; 1]>,
     pub value_stack_depth: usize,
+    pub fp_stack_depth: usize,
     pub state: MachineState,
     pub state_diff_id: usize,
 }
@@ -645,6 +771,7 @@ struct CodegenConfig {
     enforce_stack_check: bool,
     track_state: bool,
     full_preemption: bool,
+    nan_canonicalization: bool,
 }
 
 impl ModuleCodeGenerator<X64FunctionCode, X64ExecutionContext, CodegenError>
@@ -738,9 +865,10 @@ impl ModuleCodeGenerator<X64FunctionCode, X64ExecutionContext, CodegenError>
             breakpoints: Some(breakpoints),
             returns: smallvec![],
             locals: vec![],
+            local_types: vec![],
             num_params: 0,
-            num_locals: 0,
             value_stack: vec![],
+            fp_stack: vec![],
             control_stack: vec![],
             machine,
             unreachable_depth: 0,
@@ -1021,6 +1149,7 @@ impl ModuleCodeGenerator<X64FunctionCode, X64ExecutionContext, CodegenError>
             enforce_stack_check: config.enforce_stack_check,
             track_state: config.track_state,
             full_preemption: config.full_preemption,
+            nan_canonicalization: config.nan_canonicalization,
         }));
         Ok(())
     }
@@ -1103,6 +1232,53 @@ impl X64FunctionCode {
         ret
     }
 
+    /// Emits code that writes the float at `input` to `output`, canonicalizing it if it is a NaN.
+    fn canonicalize_nan(
+        a: &mut Assembler,
+        m: &mut Machine,
+        sz: Size,
+        input: Location,
+        output: Location,
+    ) {
+        let tmp1 = m.acquire_temp_xmm().unwrap();
+        let tmp2 = m.acquire_temp_xmm().unwrap();
+        let tmp3 = m.acquire_temp_xmm().unwrap();
+        let tmpg1 = m.acquire_temp_gpr().unwrap();
+
+        Self::emit_relaxed_binop(a, m, Assembler::emit_mov, sz, input, Location::XMM(tmp1));
+
+        match sz {
+            Size::S32 => {
+                a.emit_vcmpunordss(tmp1, XMMOrMemory::XMM(tmp1), tmp2);
+                a.emit_mov(
+                    Size::S32,
+                    Location::Imm32(0x7FC0_0000), // Canonical f32 NaN bit pattern
+                    Location::GPR(tmpg1),
+                );
+                a.emit_mov(Size::S64, Location::GPR(tmpg1), Location::XMM(tmp3));
+                a.emit_vblendvps(tmp2, XMMOrMemory::XMM(tmp3), tmp1, tmp1);
+            }
+            Size::S64 => {
+                a.emit_vcmpunordsd(tmp1, XMMOrMemory::XMM(tmp1), tmp2);
+                a.emit_mov(
+                    Size::S64,
+                    Location::Imm64(0x7FF8_0000_0000_0000), // Canonical f64 NaN bit pattern
+                    Location::GPR(tmpg1),
+                );
+                a.emit_mov(Size::S64, Location::GPR(tmpg1), Location::XMM(tmp3));
+                a.emit_vblendvpd(tmp2, XMMOrMemory::XMM(tmp3), tmp1, tmp1);
+            }
+            _ => unreachable!(),
+        }
+
+        Self::emit_relaxed_binop(a, m, Assembler::emit_mov, sz, Location::XMM(tmp1), output);
+
+        m.release_temp_gpr(tmpg1);
+        m.release_temp_xmm(tmp3);
+        m.release_temp_xmm(tmp2);
+        m.release_temp_xmm(tmp1);
+    }
+
     /// Moves `loc` to a valid location for `div`/`idiv`.
     fn emit_relaxed_xdiv(
         a: &mut Assembler,
@@ -2506,14 +2682,14 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
         Ok(())
     }
 
-    fn feed_param(&mut self, _ty: WpType) -> Result<(), CodegenError> {
+    fn feed_param(&mut self, ty: WpType) -> Result<(), CodegenError> {
         self.num_params += 1;
-        self.num_locals += 1;
+        self.local_types.push(ty);
         Ok(())
     }
 
-    fn feed_local(&mut self, _ty: WpType, n: usize, _loc: u32) -> Result<(), CodegenError> {
-        self.num_locals += n;
+    fn feed_local(&mut self, ty: WpType, n: usize, _loc: u32) -> Result<(), CodegenError> {
+        self.local_types.extend(iter::repeat(ty).take(n));
         Ok(())
     }
 
@@ -2550,7 +2726,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
 
         self.locals = self
             .machine
-            .init_locals(a, self.num_locals, self.num_params);
+            .init_locals(a, self.local_types.len(), self.num_params);
 
         self.machine.state.register_values
             [X64Register::GPR(Machine::get_vmctx_reg()).to_index().0] = MachineValue::Vmctx;
@@ -2578,6 +2754,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
             if_else: IfElseState::None,
             returns: self.returns.clone(),
             value_stack_depth: 0,
+            fp_stack_depth: 0,
             state: self.machine.state.clone(),
             state_diff_id,
         });
@@ -2631,6 +2808,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
         module_info: &ModuleInfo,
         _source_loc: u32,
     ) -> Result<(), CodegenError> {
+        assert!(self.fp_stack.len() <= self.value_stack.len());
+
         let a = self.assembler.as_mut().unwrap();
 
         match ev {
@@ -2797,12 +2976,14 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                             Location::Memory(tmp, (local_index.index() as i32) * 8),
                             Location::GPR(tmp),
                         );
+                        let ty = type_to_wp_type(module_info.globals[local_index].desc.ty);
+                        if ty.is_float() {
+                            // The value is not on `value_stack` yet, so its slot is `len()`.
+                            self.fp_stack.push(FloatValue::new(self.value_stack.len()));
+                        }
                         self.machine.acquire_locations(
                             a,
-                            &[(
-                                type_to_wp_type(module_info.globals[local_index].desc.ty),
-                                MachineValue::WasmStack(self.value_stack.len()),
-                            )],
+                            &[(ty, MachineValue::WasmStack(self.value_stack.len()))],
                             false,
                         )[0]
                     }
@@ -2820,12 +3001,14 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                             Location::Memory(tmp, (import_index.index() as i32) * 8),
                             Location::GPR(tmp),
                         );
+                        let ty = type_to_wp_type(module_info.imported_globals[import_index].1.ty);
+                        if ty.is_float() {
+                            // The value is not on `value_stack` yet, so its slot is `len()`.
+                            self.fp_stack.push(FloatValue::new(self.value_stack.len()));
+                        }
                         self.machine.acquire_locations(
                             a,
-                            &[(
-                                type_to_wp_type(module_info.imported_globals[import_index].1.ty),
-                                MachineValue::WasmStack(self.value_stack.len()),
-                            )],
+                            &[(ty, MachineValue::WasmStack(self.value_stack.len()))],
                             false,
                         )[0]
                     }
@@ -2850,7 +3033,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
 
                 let tmp = self.machine.acquire_temp_gpr().unwrap();
 
-                if global_index < module_info.imported_globals.len() {
+                let ty = if global_index < module_info.imported_globals.len() {
                     a.emit_mov(
                         Size::S64,
                         Location::Memory(
@@ -2859,6 +3042,11 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         ),
                         Location::GPR(tmp),
                     );
+                    type_to_wp_type(
+                        module_info.imported_globals[ImportedGlobalIndex::new(global_index)]
+                            .1
+                            .ty,
+                    )
                 } else {
                     global_index -= module_info.imported_globals.len();
                     if global_index >= module_info.globals.len() {
@@ -2874,21 +3062,54 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         ),
                         Location::GPR(tmp),
                     );
-                }
+                    type_to_wp_type(
+                        module_info.globals[LocalGlobalIndex::new(global_index)]
+                            .desc
+                            .ty,
+                    )
+                };
                 a.emit_mov(
                     Size::S64,
                     Location::Memory(tmp, (global_index as i32) * 8),
                     Location::GPR(tmp),
                 );
-                Self::emit_relaxed_binop(
-                    a,
-                    &mut self.machine,
-                    Assembler::emit_mov,
-                    Size::S64,
-                    loc,
-                    Location::Memory(tmp, LocalGlobal::offset_data() as i32),
-                );
-
+                if ty.is_float() {
+                    let fp = self.fp_stack.pop1()?;
+                    if a.arch_supports_canonicalize_nan()
+                        && self.config.nan_canonicalization
+                        && fp.canonicalization.is_some()
+                    {
+                        Self::canonicalize_nan(
+                            a,
+                            &mut self.machine,
+                            match ty {
+                                WpType::F32 => Size::S32,
+                                WpType::F64 => Size::S64,
+                                _ => unreachable!(),
+                            },
+                            loc,
+                            Location::Memory(tmp, LocalGlobal::offset_data() as i32),
+                        );
+                    } else {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S64,
+                            loc,
+                            Location::Memory(tmp, LocalGlobal::offset_data() as i32),
+                        );
+                    }
+                } else {
+                    Self::emit_relaxed_binop(
+                        a,
+                        &mut self.machine,
+                        Assembler::emit_mov,
+                        Size::S64,
+                        loc,
+                        Location::Memory(tmp, LocalGlobal::offset_data() as i32),
+                    );
+                }
                 self.machine.release_temp_gpr(tmp);
             }
             Operator::LocalGet { local_index } => {
@@ -2907,33 +3128,95 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     ret,
                 );
                 self.value_stack.push(ret);
+                if self.local_types[local_index].is_float() {
+                    self.fp_stack
+                        .push(FloatValue::new(self.value_stack.len() - 1));
+                }
             }
             Operator::LocalSet { local_index } => {
                 let local_index = local_index as usize;
                 let loc =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
 
-                Self::emit_relaxed_binop(
-                    a,
-                    &mut self.machine,
-                    Assembler::emit_mov,
-                    Size::S64,
-                    loc,
-                    self.locals[local_index],
-                );
+                if self.local_types[local_index].is_float() {
+                    let fp = self.fp_stack.pop1()?;
+                    if a.arch_supports_canonicalize_nan()
+                        && self.config.nan_canonicalization
+                        && fp.canonicalization.is_some()
+                    {
+                        Self::canonicalize_nan(
+                            a,
+                            &mut self.machine,
+                            match self.local_types[local_index] {
+                                WpType::F32 => Size::S32,
+                                WpType::F64 => Size::S64,
+                                _ => unreachable!(),
+                            },
+                            loc,
+                            self.locals[local_index],
+                        );
+                    } else {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S64,
+                            loc,
+                            self.locals[local_index],
+                        );
+                    }
+                } else {
+                    Self::emit_relaxed_binop(
+                        a,
+                        &mut self.machine,
+                        Assembler::emit_mov,
+                        Size::S64,
+                        loc,
+                        self.locals[local_index],
+                    );
+                }
             }
             Operator::LocalTee { local_index } => {
                 let local_index = local_index as usize;
                 let loc = *self.value_stack.last().unwrap();
 
-                Self::emit_relaxed_binop(
-                    a,
-                    &mut self.machine,
-                    Assembler::emit_mov,
-                    Size::S64,
-                    loc,
-                    self.locals[local_index],
-                );
+                if self.local_types[local_index].is_float() {
+                    let fp = self.fp_stack.peek1()?;
+                    if a.arch_supports_canonicalize_nan()
+                        && self.config.nan_canonicalization
+                        && fp.canonicalization.is_some()
+                    {
+                        Self::canonicalize_nan(
+                            a,
+                            &mut self.machine,
+                            match self.local_types[local_index] {
+                                WpType::F32 => Size::S32,
+                                WpType::F64 => Size::S64,
+                                _ => unreachable!(),
+                            },
+                            loc,
+                            self.locals[local_index],
+                        );
+                    } else {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S64,
+                            loc,
+                            self.locals[local_index],
+                        );
+                    }
+                } else {
+                    Self::emit_relaxed_binop(
+                        a,
+                        &mut self.machine,
+                        Assembler::emit_mov,
+                        Size::S64,
+                        loc,
+                        self.locals[local_index],
+                    );
+                }
             }
             Operator::I32Const { value } => {
                 self.value_stack.push(Location::Imm32(value as u32));
@@ -3905,36 +4188,61 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
 
             Operator::F32Const { value } => {
                 self.value_stack.push(Location::Imm32(value.bits()));
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
                 self.machine
                     .state
                     .wasm_stack
                     .push(WasmAbstractValue::Const(value.bits() as u64));
             }
-            Operator::F32Add => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vaddss,
-            )?,
-            Operator::F32Sub => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vsubss,
-            )?,
-            Operator::F32Mul => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vmulss,
-            )?,
-            Operator::F32Div => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vdivss,
-            )?,
+            Operator::F32Add => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vaddss,
+                )?;
+            }
+            Operator::F32Sub => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vsubss,
+                )?
+            }
+            Operator::F32Mul => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vmulss,
+                )?
+            }
+            Operator::F32Div => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vdivss,
+                )?
+            }
             Operator::F32Max => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 2));
                 if !a.arch_supports_canonicalize_nan() {
                     Self::emit_fp_binop_avx(
                         a,
@@ -4058,6 +4366,9 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                 }
             }
             Operator::F32Min => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 2));
                 if !a.arch_supports_canonicalize_nan() {
                     Self::emit_fp_binop_avx(
                         a,
@@ -4186,72 +4497,115 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     self.machine.release_temp_xmm(tmp1);
                 }
             }
-            Operator::F32Eq => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpeqss,
-            )?,
-            Operator::F32Ne => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpneqss,
-            )?,
-            Operator::F32Lt => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpltss,
-            )?,
-            Operator::F32Le => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpless,
-            )?,
-            Operator::F32Gt => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpgtss,
-            )?,
-            Operator::F32Ge => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpgess,
-            )?,
-            Operator::F32Nearest => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundss_nearest,
-            )?,
-            Operator::F32Floor => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundss_floor,
-            )?,
-            Operator::F32Ceil => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundss_ceil,
-            )?,
-            Operator::F32Trunc => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundss_trunc,
-            )?,
-            Operator::F32Sqrt => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vsqrtss,
-            )?,
+            Operator::F32Eq => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpeqss,
+                )?
+            }
+            Operator::F32Ne => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpneqss,
+                )?
+            }
+            Operator::F32Lt => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpltss,
+                )?
+            }
+            Operator::F32Le => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpless,
+                )?
+            }
+            Operator::F32Gt => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpgtss,
+                )?
+            }
+            Operator::F32Ge => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpgess,
+                )?
+            }
+            Operator::F32Nearest => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundss_nearest,
+                )?
+            }
+            Operator::F32Floor => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundss_floor,
+                )?
+            }
+            Operator::F32Ceil => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundss_ceil,
+                )?
+            }
+            Operator::F32Trunc => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundss_trunc,
+                )?
+            }
+            Operator::F32Sqrt => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f32(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vsqrtss,
+                )?
+            }
 
             Operator::F32Copysign => {
                 let loc_b =
@@ -4265,10 +4619,34 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                 )[0];
                 self.value_stack.push(ret);
 
+                let (fp_src1, fp_src2) = self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
+
                 let tmp1 = self.machine.acquire_temp_gpr().unwrap();
                 let tmp2 = self.machine.acquire_temp_gpr().unwrap();
-                a.emit_mov(Size::S32, loc_a, Location::GPR(tmp1));
-                a.emit_mov(Size::S32, loc_b, Location::GPR(tmp2));
+
+                if a.arch_supports_canonicalize_nan() && self.config.nan_canonicalization {
+                    for (fp, loc, tmp) in [(fp_src1, loc_a, tmp1), (fp_src2, loc_b, tmp2)].iter() {
+                        match fp.canonicalization {
+                            Some(_) => {
+                                Self::canonicalize_nan(
+                                    a,
+                                    &mut self.machine,
+                                    Size::S32,
+                                    *loc,
+                                    Location::GPR(*tmp),
+                                );
+                            }
+                            None => {
+                                a.emit_mov(Size::S32, *loc, Location::GPR(*tmp));
+                            }
+                        }
+                    }
+                } else {
+                    a.emit_mov(Size::S32, loc_a, Location::GPR(tmp1));
+                    a.emit_mov(Size::S32, loc_b, Location::GPR(tmp2));
+                }
                 a.emit_and(
                     Size::S32,
                     Location::Imm32(0x7fffffffu32),
@@ -4286,6 +4664,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
             }
 
             Operator::F32Abs => {
+                // No fp_stack update: the result inherits the operand's canonicalization state.
+
                 let loc =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let ret = self.machine.acquire_locations(
@@ -4306,6 +4686,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
             }
 
             Operator::F32Neg => {
+                // No fp_stack update: the result inherits the operand's canonicalization state.
+
                 let loc =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let ret = self.machine.acquire_locations(
@@ -4346,36 +4728,62 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
 
             Operator::F64Const { value } => {
                 self.value_stack.push(Location::Imm64(value.bits()));
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
                 self.machine
                     .state
                     .wasm_stack
                     .push(WasmAbstractValue::Const(value.bits()));
             }
-            Operator::F64Add => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vaddsd,
-            )?,
-            Operator::F64Sub => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vsubsd,
-            )?,
-            Operator::F64Mul => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vmulsd,
-            )?,
-            Operator::F64Div => Self::emit_fp_binop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vdivsd,
-            )?,
+            Operator::F64Add => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vaddsd,
+                )?
+            }
+            Operator::F64Sub => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vsubsd,
+                )?
+            }
+            Operator::F64Mul => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vmulsd,
+                )?
+            }
+            Operator::F64Div => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 2));
+                Self::emit_fp_binop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vdivsd,
+                )?
+            }
             Operator::F64Max => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 2));
+
                 if !a.arch_supports_canonicalize_nan() {
                     Self::emit_fp_binop_avx(
                         a,
@@ -4499,6 +4907,10 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                 }
             }
             Operator::F64Min => {
+                self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 2));
+
                 if !a.arch_supports_canonicalize_nan() {
                     Self::emit_fp_binop_avx(
                         a,
@@ -4627,72 +5039,115 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     self.machine.release_temp_xmm(tmp1);
                 }
             }
-            Operator::F64Eq => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpeqsd,
-            )?,
-            Operator::F64Ne => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpneqsd,
-            )?,
-            Operator::F64Lt => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpltsd,
-            )?,
-            Operator::F64Le => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmplesd,
-            )?,
-            Operator::F64Gt => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpgtsd,
-            )?,
-            Operator::F64Ge => Self::emit_fp_cmpop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcmpgesd,
-            )?,
-            Operator::F64Nearest => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundsd_nearest,
-            )?,
-            Operator::F64Floor => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundsd_floor,
-            )?,
-            Operator::F64Ceil => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundsd_ceil,
-            )?,
-            Operator::F64Trunc => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vroundsd_trunc,
-            )?,
-            Operator::F64Sqrt => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vsqrtsd,
-            )?,
+            Operator::F64Eq => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpeqsd,
+                )?
+            }
+            Operator::F64Ne => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpneqsd,
+                )?
+            }
+            Operator::F64Lt => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpltsd,
+                )?
+            }
+            Operator::F64Le => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmplesd,
+                )?
+            }
+            Operator::F64Gt => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpgtsd,
+                )?
+            }
+            Operator::F64Ge => {
+                self.fp_stack.pop2()?;
+                Self::emit_fp_cmpop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcmpgesd,
+                )?
+            }
+            Operator::F64Nearest => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundsd_nearest,
+                )?
+            }
+            Operator::F64Floor => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundsd_floor,
+                )?
+            }
+            Operator::F64Ceil => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundsd_ceil,
+                )?
+            }
+            Operator::F64Trunc => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vroundsd_trunc,
+                )?
+            }
+            Operator::F64Sqrt => {
+                self.fp_stack.pop1()?;
+                self.fp_stack
+                    .push(FloatValue::cncl_f64(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vsqrtsd,
+                )?
+            }
 
             Operator::F64Copysign => {
                 let loc_b =
@@ -4706,12 +5161,36 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                 )[0];
                 self.value_stack.push(ret);
 
+                let (fp_src1, fp_src2) = self.fp_stack.pop2()?;
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
+
                 let tmp1 = self.machine.acquire_temp_gpr().unwrap();
                 let tmp2 = self.machine.acquire_temp_gpr().unwrap();
-                let c = self.machine.acquire_temp_gpr().unwrap();
 
-                a.emit_mov(Size::S64, loc_a, Location::GPR(tmp1));
-                a.emit_mov(Size::S64, loc_b, Location::GPR(tmp2));
+                if a.arch_supports_canonicalize_nan() && self.config.nan_canonicalization {
+                    for (fp, loc, tmp) in [(fp_src1, loc_a, tmp1), (fp_src2, loc_b, tmp2)].iter() {
+                        match fp.canonicalization {
+                            Some(_) => {
+                                Self::canonicalize_nan(
+                                    a,
+                                    &mut self.machine,
+                                    Size::S64,
+                                    *loc,
+                                    Location::GPR(*tmp),
+                                );
+                            }
+                            None => {
+                                a.emit_mov(Size::S64, *loc, Location::GPR(*tmp));
+                            }
+                        }
+                    }
+                } else {
+                    a.emit_mov(Size::S64, loc_a, Location::GPR(tmp1));
+                    a.emit_mov(Size::S64, loc_b, Location::GPR(tmp2));
+                }
+
+                let c = self.machine.acquire_temp_gpr().unwrap();
 
                 a.emit_mov(
                     Size::S64,
@@ -4736,6 +5215,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
             }
 
             Operator::F64Abs => {
+                // No fp_stack update: the result inherits the operand's canonicalization state.
+
                 let loc =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let ret = self.machine.acquire_locations(
@@ -4762,6 +5243,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
             }
 
             Operator::F64Neg => {
+                // No fp_stack update: the result inherits the operand's canonicalization state.
+
                 let loc =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let ret = self.machine.acquire_locations(
@@ -4799,18 +5282,26 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                 }
             }
 
-            Operator::F64PromoteF32 => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcvtss2sd,
-            )?,
-            Operator::F32DemoteF64 => Self::emit_fp_unop_avx(
-                a,
-                &mut self.machine,
-                &mut self.value_stack,
-                Assembler::emit_vcvtsd2ss,
-            )?,
+            Operator::F64PromoteF32 => {
+                let fp = self.fp_stack.pop1()?;
+                self.fp_stack.push(fp.promote(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcvtss2sd,
+                )?
+            }
+            Operator::F32DemoteF64 => {
+                let fp = self.fp_stack.pop1()?;
+                self.fp_stack.push(fp.demote(self.value_stack.len() - 1));
+                Self::emit_fp_unop_avx(
+                    a,
+                    &mut self.machine,
+                    &mut self.value_stack,
+                    Assembler::emit_vcvtsd2ss,
+                )?
+            }
 
             Operator::I32ReinterpretF32 => {
                 let loc =
@@ -4821,16 +5312,24 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                let fp = self.fp_stack.pop1()?;
 
-                if loc != ret {
-                    Self::emit_relaxed_binop(
-                        a,
-                        &mut self.machine,
-                        Assembler::emit_mov,
-                        Size::S32,
-                        loc,
-                        ret,
-                    );
+                if !a.arch_supports_canonicalize_nan()
+                    || !self.config.nan_canonicalization
+                    || fp.canonicalization.is_none()
+                {
+                    if loc != ret {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S32,
+                            loc,
+                            ret,
+                        );
+                    }
+                } else {
+                    Self::canonicalize_nan(a, &mut self.machine, Size::S32, loc, ret);
                 }
             }
             Operator::F32ReinterpretI32 => {
@@ -4842,6 +5341,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
 
                 if loc != ret {
                     Self::emit_relaxed_binop(
@@ -4864,16 +5365,24 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                let fp = self.fp_stack.pop1()?;
 
-                if loc != ret {
-                    Self::emit_relaxed_binop(
-                        a,
-                        &mut self.machine,
-                        Assembler::emit_mov,
-                        Size::S64,
-                        loc,
-                        ret,
-                    );
+                if !a.arch_supports_canonicalize_nan()
+                    || !self.config.nan_canonicalization
+                    || fp.canonicalization.is_none()
+                {
+                    if loc != ret {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S64,
+                            loc,
+                            ret,
+                        );
+                    }
+                } else {
+                    Self::canonicalize_nan(a, &mut self.machine, Size::S64, loc, ret);
                 }
             }
             Operator::F64ReinterpretI64 => {
@@ -4885,6 +5394,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
 
                 if loc != ret {
                     Self::emit_relaxed_binop(
@@ -4907,6 +5418,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -4967,6 +5479,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5018,6 +5531,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -5078,6 +5592,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5136,6 +5651,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -5196,6 +5712,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5254,6 +5771,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -5338,6 +5856,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5415,6 +5934,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -5476,6 +5996,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5528,6 +6049,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -5594,6 +6116,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5657,6 +6180,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -5718,6 +6242,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5776,6 +6301,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 if a.arch_has_itruncf() {
                     let tmp_out = self.machine.acquire_temp_gpr().unwrap();
@@ -5861,6 +6387,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack.pop1()?;
 
                 let tmp_out = self.machine.acquire_temp_gpr().unwrap();
                 let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -5938,6 +6465,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i32 to f32 never results in NaN.
 
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
@@ -5982,6 +6511,9 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i32 to f32 never results in NaN.
+
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
                     let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -6025,6 +6557,9 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i64 to f32 never results in NaN.
+
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
                     let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -6068,6 +6603,9 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i64 to f32 never results in NaN.
+
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
                     let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -6128,6 +6666,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i32 to f64 never results in NaN.
 
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
@@ -6172,6 +6712,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i32 to f64 never results in NaN.
 
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
@@ -6216,6 +6758,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i64 to f64 never results in NaN.
 
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
@@ -6260,6 +6804,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1)); // Converting i64 to f64 never results in NaN.
 
                 if a.arch_has_fconverti() {
                     let tmp_out = self.machine.acquire_temp_xmm().unwrap();
@@ -6339,6 +6885,31 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
 
                 self.machine.release_locations_only_osr_state(params.len());
 
+                // Pop arguments off the FP stack and canonicalize them if needed.
+                //
+                // Canonicalization state will be lost across function calls, so early canonicalization
+                // is necessary here.
+                while let Some(fp) = self.fp_stack.last() {
+                    if fp.depth >= self.value_stack.len() {
+                        let index = fp.depth - self.value_stack.len();
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization
+                            && fp.canonicalization.is_some()
+                        {
+                            Self::canonicalize_nan(
+                                a,
+                                &mut self.machine,
+                                fp.canonicalization.unwrap().to_size(),
+                                params[index],
+                                params[index],
+                            );
+                        }
+                        self.fp_stack.pop().unwrap();
+                    } else {
+                        break;
+                    }
+                }
+
                 Self::emit_call_sysv_label(
                     a,
                     &mut self.machine,
@@ -6359,13 +6930,12 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         false,
                     )[0];
                     self.value_stack.push(ret);
-                    match return_types[0] {
-                        WpType::F32 | WpType::F64 => {
-                            a.emit_mov(Size::S64, Location::XMM(XMM::XMM0), ret);
-                        }
-                        _ => {
-                            a.emit_mov(Size::S64, Location::GPR(GPR::RAX), ret);
-                        }
+                    if return_types[0].is_float() {
+                        a.emit_mov(Size::S64, Location::XMM(XMM::XMM0), ret);
+                        self.fp_stack
+                            .push(FloatValue::new(self.value_stack.len() - 1));
+                    } else {
+                        a.emit_mov(Size::S64, Location::GPR(GPR::RAX), ret);
                     }
                 }
             }
@@ -6390,6 +6960,31 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     .collect();
                 self.machine.release_locations_only_regs(&params);
 
+                // Pop arguments off the FP stack and canonicalize them if needed.
+                //
+                // Canonicalization state will be lost across function calls, so early canonicalization
+                // is necessary here.
+                while let Some(fp) = self.fp_stack.last() {
+                    if fp.depth >= self.value_stack.len() {
+                        let index = fp.depth - self.value_stack.len();
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization
+                            && fp.canonicalization.is_some()
+                        {
+                            Self::canonicalize_nan(
+                                a,
+                                &mut self.machine,
+                                fp.canonicalization.unwrap().to_size(),
+                                params[index],
+                                params[index],
+                            );
+                        }
+                        self.fp_stack.pop().unwrap();
+                    } else {
+                        break;
+                    }
+                }
+
                 let table_base = self.machine.acquire_temp_gpr().unwrap();
                 let table_count = self.machine.acquire_temp_gpr().unwrap();
                 let sigidx = self.machine.acquire_temp_gpr().unwrap();
@@ -6505,13 +7100,12 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         false,
                     )[0];
                     self.value_stack.push(ret);
-                    match return_types[0] {
-                        WpType::F32 | WpType::F64 => {
-                            a.emit_mov(Size::S64, Location::XMM(XMM::XMM0), ret);
-                        }
-                        _ => {
-                            a.emit_mov(Size::S64, Location::GPR(GPR::RAX), ret);
-                        }
+                    if return_types[0].is_float() {
+                        a.emit_mov(Size::S64, Location::XMM(XMM::XMM0), ret);
+                        self.fp_stack
+                            .push(FloatValue::new(self.value_stack.len() - 1));
+                    } else {
+                        a.emit_mov(Size::S64, Location::GPR(GPR::RAX), ret);
                     }
                 }
             }
@@ -6536,6 +7130,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         }
                     },
                     value_stack_depth: self.value_stack.len(),
+                    fp_stack_depth: self.fp_stack.len(),
                     state: self.machine.state.clone(),
                     state_diff_id: Self::get_state_diff(
                         &self.machine,
@@ -6559,19 +7154,49 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
 
                 if !was_unreachable && frame.returns.len() > 0 {
                     let loc = *self.value_stack.last().unwrap();
-                    Self::emit_relaxed_binop(
-                        a,
-                        &mut self.machine,
-                        Assembler::emit_mov,
-                        Size::S64,
-                        loc,
-                        Location::GPR(GPR::RAX),
-                    );
+                    if frame.returns[0].is_float() {
+                        let fp = self.fp_stack.peek1()?;
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization
+                            && fp.canonicalization.is_some()
+                        {
+                            Self::canonicalize_nan(
+                                a,
+                                &mut self.machine,
+                                match frame.returns[0] {
+                                    WpType::F32 => Size::S32,
+                                    WpType::F64 => Size::S64,
+                                    _ => unreachable!(),
+                                },
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        } else {
+                            Self::emit_relaxed_binop(
+                                a,
+                                &mut self.machine,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        }
+                    } else {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S64,
+                            loc,
+                            Location::GPR(GPR::RAX),
+                        );
+                    }
                 }
 
                 let released: &[Location] = &self.value_stack[frame.value_stack_depth..];
                 self.machine.release_locations(a, released);
                 self.value_stack.truncate(frame.value_stack_depth);
+                self.fp_stack.truncate(frame.fp_stack_depth);
 
                 match frame.if_else {
                     IfElseState::If(label) => {
@@ -6593,6 +7218,18 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let v_a =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
+                let cncl: Option<(Option<CanonicalizeType>, Option<CanonicalizeType>)> =
+                    if self.fp_stack.len() >= 2
+                        && self.fp_stack[self.fp_stack.len() - 2].depth == self.value_stack.len()
+                        && self.fp_stack[self.fp_stack.len() - 1].depth
+                            == self.value_stack.len() + 1
+                    {
+                        let (left, right) = self.fp_stack.pop2()?;
+                        self.fp_stack.push(FloatValue::new(self.value_stack.len()));
+                        Some((left.canonicalization, right.canonicalization))
+                    } else {
+                        None
+                    };
                 let ret = self.machine.acquire_locations(
                     a,
                     &[(WpType::I64, MachineValue::WasmStack(self.value_stack.len()))],
@@ -6612,27 +7249,47 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     cond,
                 );
                 a.emit_jmp(Condition::Equal, zero_label);
-                if v_a != ret {
-                    Self::emit_relaxed_binop(
-                        a,
-                        &mut self.machine,
-                        Assembler::emit_mov,
-                        Size::S64,
-                        v_a,
-                        ret,
-                    );
+                match cncl {
+                    Some((Some(fp), _))
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization =>
+                    {
+                        Self::canonicalize_nan(a, &mut self.machine, fp.to_size(), v_a, ret);
+                    }
+                    _ => {
+                        if v_a != ret {
+                            Self::emit_relaxed_binop(
+                                a,
+                                &mut self.machine,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                v_a,
+                                ret,
+                            );
+                        }
+                    }
                 }
                 a.emit_jmp(Condition::None, end_label);
                 a.emit_label(zero_label);
-                if v_b != ret {
-                    Self::emit_relaxed_binop(
-                        a,
-                        &mut self.machine,
-                        Assembler::emit_mov,
-                        Size::S64,
-                        v_b,
-                        ret,
-                    );
+                match cncl {
+                    Some((_, Some(fp)))
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization =>
+                    {
+                        Self::canonicalize_nan(a, &mut self.machine, fp.to_size(), v_b, ret);
+                    }
+                    _ => {
+                        if v_b != ret {
+                            Self::emit_relaxed_binop(
+                                a,
+                                &mut self.machine,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                v_b,
+                                ret,
+                            );
+                        }
+                    }
                 }
                 a.emit_label(end_label);
             }
@@ -6651,6 +7308,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         }
                     },
                     value_stack_depth: self.value_stack.len(),
+                    fp_stack_depth: self.fp_stack.len(),
                     state: self.machine.state.clone(),
                     state_diff_id: Self::get_state_diff(
                         &self.machine,
@@ -6680,6 +7338,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         }
                     },
                     value_stack_depth: self.value_stack.len(),
+                    fp_stack_depth: self.fp_stack.len(),
                     state: self.machine.state.clone(),
                     state_diff_id,
                 });
@@ -6844,6 +7503,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
 
                 Self::emit_memory_op(
                     module_info,
@@ -7038,6 +7699,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let target_addr =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
+                let fp = self.fp_stack.pop1()?;
+                let config_nan_canonicalization = self.config.nan_canonicalization;
 
                 Self::emit_memory_op(
                     module_info,
@@ -7050,14 +7713,28 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                     4,
                     |a, m, addr| {
-                        Self::emit_relaxed_binop(
-                            a,
-                            m,
-                            Assembler::emit_mov,
-                            Size::S32,
-                            target_value,
-                            Location::Memory(addr, 0),
-                        );
+                        if !a.arch_supports_canonicalize_nan()
+                            || !config_nan_canonicalization
+                            || fp.canonicalization.is_none()
+                        {
+                            Self::emit_relaxed_binop(
+                                a,
+                                m,
+                                Assembler::emit_mov,
+                                Size::S32,
+                                target_value,
+                                Location::Memory(addr, 0),
+                            );
+                        } else {
+                            Self::canonicalize_nan(
+                                a,
+                                m,
+                                Size::S32,
+                                target_value,
+                                Location::Memory(addr, 0),
+                            );
+                        }
+
                         Ok(())
                     },
                 )?;
@@ -7162,6 +7839,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                 )[0];
                 self.value_stack.push(ret);
+                self.fp_stack
+                    .push(FloatValue::new(self.value_stack.len() - 1));
 
                 Self::emit_memory_op(
                     module_info,
@@ -7438,6 +8117,8 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let target_addr =
                     get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
+                let fp = self.fp_stack.pop1()?;
+                let config_nan_canonicalization = self.config.nan_canonicalization;
 
                 Self::emit_memory_op(
                     module_info,
@@ -7450,14 +8131,27 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     false,
                     8,
                     |a, m, addr| {
-                        Self::emit_relaxed_binop(
-                            a,
-                            m,
-                            Assembler::emit_mov,
-                            Size::S64,
-                            target_value,
-                            Location::Memory(addr, 0),
-                        );
+                        if !a.arch_supports_canonicalize_nan()
+                            || !config_nan_canonicalization
+                            || fp.canonicalization.is_none()
+                        {
+                            Self::emit_relaxed_binop(
+                                a,
+                                m,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                target_value,
+                                Location::Memory(addr, 0),
+                            );
+                        } else {
+                            Self::canonicalize_nan(
+                                a,
+                                m,
+                                Size::S64,
+                                target_value,
+                                Location::Memory(addr, 0),
+                            );
+                        }
                         Ok(())
                     },
                 )?;
@@ -7568,14 +8262,43 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         });
                     }
                     let loc = *self.value_stack.last().unwrap();
-                    Self::emit_relaxed_binop(
-                        a,
-                        &mut self.machine,
-                        Assembler::emit_mov,
-                        Size::S64,
-                        loc,
-                        Location::GPR(GPR::RAX),
-                    );
+                    if frame.returns[0].is_float() {
+                        let fp = self.fp_stack.peek1()?;
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization
+                            && fp.canonicalization.is_some()
+                        {
+                            Self::canonicalize_nan(
+                                a,
+                                &mut self.machine,
+                                match frame.returns[0] {
+                                    WpType::F32 => Size::S32,
+                                    WpType::F64 => Size::S64,
+                                    _ => unreachable!(),
+                                },
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        } else {
+                            Self::emit_relaxed_binop(
+                                a,
+                                &mut self.machine,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        }
+                    } else {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S64,
+                            loc,
+                            Location::GPR(GPR::RAX),
+                        );
+                    }
                 }
                 let released = &self.value_stack[frame.value_stack_depth..];
                 self.machine.release_locations_keep_state(a, released);
@@ -7592,8 +8315,39 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         });
                     }
                     let loc = *self.value_stack.last().unwrap();
-                    a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+
+                    if frame.returns[0].is_float() {
+                        let fp = self.fp_stack.peek1()?;
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization
+                            && fp.canonicalization.is_some()
+                        {
+                            Self::canonicalize_nan(
+                                a,
+                                &mut self.machine,
+                                match frame.returns[0] {
+                                    WpType::F32 => Size::S32,
+                                    WpType::F64 => Size::S64,
+                                    _ => unreachable!(),
+                                },
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        } else {
+                            Self::emit_relaxed_binop(
+                                a,
+                                &mut self.machine,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        }
+                    } else {
+                        a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+                    }
                 }
+
                 let released = &self.value_stack[frame.value_stack_depth..];
                 self.machine.release_locations_keep_state(a, released);
                 a.emit_jmp(Condition::None, frame.label);
@@ -7622,7 +8376,36 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         });
                     }
                     let loc = *self.value_stack.last().unwrap();
-                    a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+                    if frame.returns[0].is_float() {
+                        let fp = self.fp_stack.peek1()?;
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization
+                            && fp.canonicalization.is_some()
+                        {
+                            Self::canonicalize_nan(
+                                a,
+                                &mut self.machine,
+                                match frame.returns[0] {
+                                    WpType::F32 => Size::S32,
+                                    WpType::F64 => Size::S64,
+                                    _ => unreachable!(),
+                                },
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        } else {
+                            Self::emit_relaxed_binop(
+                                a,
+                                &mut self.machine,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        }
+                    } else {
+                        a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+                    }
                 }
                 let released = &self.value_stack[frame.value_stack_depth..];
                 self.machine.release_locations_keep_state(a, released);
@@ -7673,7 +8456,36 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                             });
                         }
                         let loc = *self.value_stack.last().unwrap();
-                        a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+                        if frame.returns[0].is_float() {
+                            let fp = self.fp_stack.peek1()?;
+                            if a.arch_supports_canonicalize_nan()
+                                && self.config.nan_canonicalization
+                                && fp.canonicalization.is_some()
+                            {
+                                Self::canonicalize_nan(
+                                    a,
+                                    &mut self.machine,
+                                    match frame.returns[0] {
+                                        WpType::F32 => Size::S32,
+                                        WpType::F64 => Size::S64,
+                                        _ => unreachable!(),
+                                    },
+                                    loc,
+                                    Location::GPR(GPR::RAX),
+                                );
+                            } else {
+                                Self::emit_relaxed_binop(
+                                    a,
+                                    &mut self.machine,
+                                    Assembler::emit_mov,
+                                    Size::S64,
+                                    loc,
+                                    Location::GPR(GPR::RAX),
+                                );
+                            }
+                        } else {
+                            a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+                        }
                     }
                     let released = &self.value_stack[frame.value_stack_depth..];
                     self.machine.release_locations_keep_state(a, released);
@@ -7691,7 +8503,36 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                             });
                         }
                         let loc = *self.value_stack.last().unwrap();
-                        a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+                        if frame.returns[0].is_float() {
+                            let fp = self.fp_stack.peek1()?;
+                            if a.arch_supports_canonicalize_nan()
+                                && self.config.nan_canonicalization
+                                && fp.canonicalization.is_some()
+                            {
+                                Self::canonicalize_nan(
+                                    a,
+                                    &mut self.machine,
+                                    match frame.returns[0] {
+                                        WpType::F32 => Size::S32,
+                                        WpType::F64 => Size::S64,
+                                        _ => unreachable!(),
+                                    },
+                                    loc,
+                                    Location::GPR(GPR::RAX),
+                                );
+                            } else {
+                                Self::emit_relaxed_binop(
+                                    a,
+                                    &mut self.machine,
+                                    Assembler::emit_mov,
+                                    Size::S64,
+                                    loc,
+                                    Location::GPR(GPR::RAX),
+                                );
+                            }
+                        } else {
+                            a.emit_mov(Size::S64, loc, Location::GPR(GPR::RAX));
+                        }
                     }
                     let released = &self.value_stack[frame.value_stack_depth..];
                     self.machine.release_locations_keep_state(a, released);
@@ -7706,20 +8547,54 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
             }
             Operator::Drop => {
                 get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
+                if let Some(x) = self.fp_stack.last() {
+                    if x.depth == self.value_stack.len() {
+                        self.fp_stack.pop1()?;
+                    }
+                }
             }
             Operator::End => {
                 let frame = self.control_stack.pop().unwrap();
 
                 if !was_unreachable && frame.returns.len() > 0 {
                     let loc = *self.value_stack.last().unwrap();
-                    Self::emit_relaxed_binop(
-                        a,
-                        &mut self.machine,
-                        Assembler::emit_mov,
-                        Size::S64,
-                        loc,
-                        Location::GPR(GPR::RAX),
-                    );
+                    if frame.returns[0].is_float() {
+                        let fp = self.fp_stack.peek1()?;
+                        if a.arch_supports_canonicalize_nan()
+                            && self.config.nan_canonicalization
+                            && fp.canonicalization.is_some()
+                        {
+                            Self::canonicalize_nan(
+                                a,
+                                &mut self.machine,
+                                match frame.returns[0] {
+                                    WpType::F32 => Size::S32,
+                                    WpType::F64 => Size::S64,
+                                    _ => unreachable!(),
+                                },
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        } else {
+                            Self::emit_relaxed_binop(
+                                a,
+                                &mut self.machine,
+                                Assembler::emit_mov,
+                                Size::S64,
+                                loc,
+                                Location::GPR(GPR::RAX),
+                            );
+                        }
+                    } else {
+                        Self::emit_relaxed_binop(
+                            a,
+                            &mut self.machine,
+                            Assembler::emit_mov,
+                            Size::S64,
+                            loc,
+                            Location::GPR(GPR::RAX),
+                        );
+                    }
                 }
 
                 if self.control_stack.len() == 0 {
@@ -7744,6 +8619,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                     let released = &self.value_stack[frame.value_stack_depth..];
                     self.machine.release_locations(a, released);
                     self.value_stack.truncate(frame.value_stack_depth);
+                    self.fp_stack.truncate(frame.fp_stack_depth);
 
                     if !frame.loop_like {
                         a.emit_label(frame.label);
@@ -7769,6 +8645,11 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
                         )[0];
                         a.emit_mov(Size::S64, Location::GPR(GPR::RAX), loc);
                         self.value_stack.push(loc);
+                        if frame.returns[0].is_float() {
+                            self.fp_stack
+                                .push(FloatValue::new(self.value_stack.len() - 1));
+                            // No canonicalization is left pending: the value was already canonicalized at the `Br*` instruction or earlier in this `End` handler.
+                        }
                     }
                 }
             }
diff --git a/lib/spectests/spectests/wasmer.wast b/lib/spectests/spectests/wasmer.wast
index cf3841f9bfc..1a7647ee1e7 100644
--- a/lib/spectests/spectests/wasmer.wast
+++ b/lib/spectests/spectests/wasmer.wast
@@ -3,15 +3,21 @@
 (module
   ;; Auxiliary definitions
   (type $out-i32 (func (result i32)))
+  (type $f32-id (func (param f32) (result f32)))
+  (type $f64-id (func (param f64) (result f64)))
 
   (func $const-i32 (type $out-i32) (i32.const 0x132))
 
   (table funcref
     (elem
       $const-i32
+      $nan-canonicalization-f32-func-call-target
+      $nan-canonicalization-f64-func-call-target
     )
   )
 
+  (memory 1)
+
   ;; https://github.com/wasmerio/wasmer/pull/1191
   (func (export "call-indirect-from-spilled-stack") (result i32)
     (i64.add (i64.const 0) (i64.const 0))
@@ -28,6 +34,182 @@
     (call_indirect (type $out-i32))
     (return)
   )
+
+  ;; NaN canonicalization tests.
+  ;; Things that are covered by spectests canonicalization (`fabs`, `fneg`, `fcopysign`, `reinterpret`, `const`) won't be duplicated here.
+
+  (func (export "nan-canonicalization-f32-add") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.add (f32.reinterpret_i32 (get_local 0)) (f32.const 0)))
+  )
+  (func (export "nan-canonicalization-f32-sub") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.sub (f32.reinterpret_i32 (get_local 0)) (f32.const 0)))
+  )
+  (func (export "nan-canonicalization-f32-mul") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.mul (f32.reinterpret_i32 (get_local 0)) (f32.const 0)))
+  )
+  (func (export "nan-canonicalization-f32-div") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.div (f32.reinterpret_i32 (get_local 0)) (f32.const 1)))
+  )
+  (func (export "nan-canonicalization-f32-max") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.max (f32.reinterpret_i32 (get_local 0)) (f32.const 1)))
+  )
+  (func (export "nan-canonicalization-f32-min") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.min (f32.reinterpret_i32 (get_local 0)) (f32.const 1)))
+  )
+  (func (export "nan-canonicalization-f32-nearest") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.nearest (f32.reinterpret_i32 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f32-floor") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.floor (f32.reinterpret_i32 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f32-ceil") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.ceil (f32.reinterpret_i32 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f32-trunc") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.trunc (f32.reinterpret_i32 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f32-sqrt") (param i32) (result i32)
+    (i32.reinterpret_f32 (f32.sqrt (f32.reinterpret_i32 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f32-mem") (param i32) (result i32)
+    (f32.store (i32.const 0) (f32.reinterpret_i32 (get_local 0)))
+    (i32.reinterpret_f32 (f32.load (i32.const 0)))
+  )
+  (func (export "nan-canonicalization-f32-mem-cncl") (param i32) (result i32)
+    (f32.store (i32.const 0) (f32.add (f32.reinterpret_i32 (get_local 0)) (f32.const 0)))
+    (i32.reinterpret_f32 (f32.load (i32.const 0)))
+  )
+  (func (export "nan-canonicalization-f32-local") (param i32) (result i32)
+    (local f32)
+    (set_local 1 (f32.reinterpret_i32 (get_local 0)))
+    (i32.reinterpret_f32 (get_local 1))
+  )
+  (func (export "nan-canonicalization-f32-local-cncl") (param i32) (result i32)
+    (local f32)
+    (set_local 1 (f32.add (f32.reinterpret_i32 (get_local 0)) (f32.const 0)))
+    (i32.reinterpret_f32 (get_local 1))
+  )
+  (func $nan-canonicalization-f32-func-call-target (param f32) (result f32)
+    (get_local 0)
+  )
+  (func (export "nan-canonicalization-f32-func-call") (param i32) (result i32)
+    (i32.reinterpret_f32 (call $nan-canonicalization-f32-func-call-target (f32.reinterpret_i32 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f32-func-call-cncl") (param i32) (result i32)
+    (i32.reinterpret_f32 (call $nan-canonicalization-f32-func-call-target (f32.add (f32.reinterpret_i32 (get_local 0)) (f32.const 0))))
+  )
+  (func (export "nan-canonicalization-f32-func-call-indirect") (param i32) (result i32)
+    (i32.reinterpret_f32 (call_indirect (type $f32-id) (f32.reinterpret_i32 (get_local 0)) (i32.const 1)))
+  )
+  (func (export "nan-canonicalization-f32-func-call-indirect-cncl") (param i32) (result i32)
+    (i32.reinterpret_f32 (call_indirect (type $f32-id) (f32.add (f32.reinterpret_i32 (get_local 0)) (f32.const 0)) (i32.const 1)))
+  )
+
+  (func (export "nan-canonicalization-f64-add") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.add (f64.reinterpret_i64 (get_local 0)) (f64.const 0)))
+  )
+  (func (export "nan-canonicalization-f64-sub") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.sub (f64.reinterpret_i64 (get_local 0)) (f64.const 0)))
+  )
+  (func (export "nan-canonicalization-f64-mul") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.mul (f64.reinterpret_i64 (get_local 0)) (f64.const 0)))
+  )
+  (func (export "nan-canonicalization-f64-div") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.div (f64.reinterpret_i64 (get_local 0)) (f64.const 1)))
+  )
+  (func (export "nan-canonicalization-f64-max") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.max (f64.reinterpret_i64 (get_local 0)) (f64.const 1)))
+  )
+  (func (export "nan-canonicalization-f64-min") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.min (f64.reinterpret_i64 (get_local 0)) (f64.const 1)))
+  )
+  (func (export "nan-canonicalization-f64-nearest") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.nearest (f64.reinterpret_i64 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f64-floor") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.floor (f64.reinterpret_i64 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f64-ceil") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.ceil (f64.reinterpret_i64 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f64-trunc") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.trunc (f64.reinterpret_i64 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f64-sqrt") (param i64) (result i64)
+    (i64.reinterpret_f64 (f64.sqrt (f64.reinterpret_i64 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f64-mem") (param i64) (result i64)
+    (f64.store (i32.const 0) (f64.reinterpret_i64 (get_local 0)))
+    (i64.reinterpret_f64 (f64.load (i32.const 0)))
+  )
+  (func (export "nan-canonicalization-f64-mem-cncl") (param i64) (result i64)
+    (f64.store (i32.const 0) (f64.add (f64.reinterpret_i64 (get_local 0)) (f64.const 0)))
+    (i64.reinterpret_f64 (f64.load (i32.const 0)))
+  )
+  (func (export "nan-canonicalization-f64-local") (param i64) (result i64)
+    (local f64)
+    (set_local 1 (f64.reinterpret_i64 (get_local 0)))
+    (i64.reinterpret_f64 (get_local 1))
+  )
+  (func (export "nan-canonicalization-f64-local-cncl") (param i64) (result i64)
+    (local f64)
+    (set_local 1 (f64.add (f64.reinterpret_i64 (get_local 0)) (f64.const 0)))
+    (i64.reinterpret_f64 (get_local 1))
+  )
+  (func $nan-canonicalization-f64-func-call-target (param f64) (result f64)
+    (get_local 0)
+  )
+  (func (export "nan-canonicalization-f64-func-call") (param i64) (result i64)
+    (i64.reinterpret_f64 (call $nan-canonicalization-f64-func-call-target (f64.reinterpret_i64 (get_local 0))))
+  )
+  (func (export "nan-canonicalization-f64-func-call-cncl") (param i64) (result i64)
+    (i64.reinterpret_f64 (call $nan-canonicalization-f64-func-call-target (f64.add (f64.reinterpret_i64 (get_local 0)) (f64.const 0))))
+  )
+  (func (export "nan-canonicalization-f64-func-call-indirect") (param i64) (result i64)
+    (i64.reinterpret_f64 (call_indirect (type $f64-id) (f64.reinterpret_i64 (get_local 0)) (i32.const 2)))
+  )
+  (func (export "nan-canonicalization-f64-func-call-indirect-cncl") (param i64) (result i64)
+    (i64.reinterpret_f64 (call_indirect (type $f64-id) (f64.add (f64.reinterpret_i64 (get_local 0)) (f64.const 0)) (i32.const 2)))
+  )
 )
 
-(assert_return (invoke "call-indirect-from-spilled-stack") (i32.const 0x132))
\ No newline at end of file
+(assert_return (invoke "call-indirect-from-spilled-stack") (i32.const 0x132))
+(assert_return (invoke "nan-canonicalization-f32-add" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-sub" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-mul" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-div" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-max" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-min" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-nearest" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-floor" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-ceil" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-trunc" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-sqrt" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-mem" (i32.const 0x7fc00001)) (i32.const 0x7fc00001))
+(assert_return (invoke "nan-canonicalization-f32-mem-cncl" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-local" (i32.const 0x7fc00001)) (i32.const 0x7fc00001))
+(assert_return (invoke "nan-canonicalization-f32-local-cncl" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-func-call" (i32.const 0x7fc00001)) (i32.const 0x7fc00001))
+(assert_return (invoke "nan-canonicalization-f32-func-call-cncl" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+(assert_return (invoke "nan-canonicalization-f32-func-call-indirect" (i32.const 0x7fc00001)) (i32.const 0x7fc00001))
+(assert_return (invoke "nan-canonicalization-f32-func-call-indirect-cncl" (i32.const 0x7fc00001)) (i32.const 0x7fc00000))
+
+(assert_return (invoke "nan-canonicalization-f64-add" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-sub" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-mul" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-div" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-max" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-min" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-nearest" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-floor" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-ceil" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-trunc" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-sqrt" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-mem" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000001))
+(assert_return (invoke "nan-canonicalization-f64-mem-cncl" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-local" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000001))
+(assert_return (invoke "nan-canonicalization-f64-local-cncl" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-func-call" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000001))
+(assert_return (invoke "nan-canonicalization-f64-func-call-cncl" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
+(assert_return (invoke "nan-canonicalization-f64-func-call-indirect" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000001))
+(assert_return (invoke "nan-canonicalization-f64-func-call-indirect-cncl" (i64.const 0x7ff8000000000001)) (i64.const 0x7ff8000000000000))
diff --git a/lib/spectests/tests/excludes.txt b/lib/spectests/tests/excludes.txt
index a64d96b488a..54c886a89a7 100644
--- a/lib/spectests/tests/excludes.txt
+++ b/lib/spectests/tests/excludes.txt
@@ -244,10 +244,6 @@ clif:fail:exports.wast:167:windows # Module - caught panic Any
 clif:fail:exports.wast:168:windows # Module - caught panic Any
 clif:fail:exports.wast:169:windows # Module - caught panic Any
 clif:fail:exports.wast:170:windows # Module - caught panic Any
-clif:fail:f32.wast:2496:windows # "AssertReturnArithmeticNan" - value is not arithmetic nan F32(NaN)
-clif:fail:f32.wast:2498:windows # "AssertReturnArithmeticNan" - value is not arithmetic nan F32(NaN)
-clif:fail:f64.wast:2496:windows # "AssertReturnArithmeticNan" - value is not arithmetic nan F64(NaN)
-clif:fail:f64.wast:2498:windows # "AssertReturnArithmeticNan" - value is not arithmetic nan F64(NaN)
 clif:fail:func.wast:289:windows # Module - caught panic Any
 clif:fail:memory.wast:3:windows # Module - caught panic Any
 clif:fail:memory.wast:4:windows # Module - caught panic Any
@@ -523,4 +519,36 @@ singlepass:fail:traps.wast:53:*:aarch64 # AssertTrap - expected trap, got []
 singlepass:fail:traps.wast:54:*:aarch64 # AssertTrap - expected trap, got []
 singlepass:fail:traps.wast:55:*:aarch64 # AssertTrap - expected trap, got []
 singlepass:fail:traps.wast:56:*:aarch64 # AssertTrap - expected trap, got []
-singlepass:fail:traps.wast:57:*:aarch64 # AssertTrap - expected trap, got []
\ No newline at end of file
+singlepass:fail:traps.wast:57:*:aarch64 # AssertTrap - expected trap, got []
+
+# NaN canonicalization is not yet implemented for aarch64.
+singlepass:fail:wasmer.wast:177:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:178:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:179:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:180:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:181:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:182:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:183:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:184:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:185:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:186:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:187:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:189:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:191:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:193:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:195:*:aarch64 # AssertReturn - result I32(2143289345) ("0x7fc00001") does not match expected I32(2143289344) ("0x7fc00000")
+singlepass:fail:wasmer.wast:197:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:198:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:199:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:200:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:201:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:202:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:203:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:204:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:205:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:206:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:207:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:209:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:211:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:213:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
+singlepass:fail:wasmer.wast:215:*:aarch64 # AssertReturn - result I64(9221120237041090561) ("0x7ff8000000000001") does not match expected I64(9221120237041090560) ("0x7ff8000000000000")
\ No newline at end of file
diff --git a/lib/spectests/tests/spectest.rs b/lib/spectests/tests/spectest.rs
index 409a9d90829..c3d1ecce723 100644
--- a/lib/spectests/tests/spectest.rs
+++ b/lib/spectests/tests/spectest.rs
@@ -336,6 +336,7 @@ mod tests {
                                 simd: true,
                                 threads: true,
                             },
+                            nan_canonicalization: true,
                             ..Default::default()
                         };
                         let module = compile_with_config(&module.into_vec(), config)
@@ -774,6 +775,7 @@ mod tests {
                                 simd: true,
                                 threads: true,
                             },
+                            nan_canonicalization: true,
                             ..Default::default()
                         };
                         compile_with_config(&module.into_vec(), config)
@@ -826,6 +828,7 @@ mod tests {
                                 simd: true,
                                 threads: true,
                             },
+                            nan_canonicalization: true,
                             ..Default::default()
                         };
                         compile_with_config(&module.into_vec(), config)
@@ -877,6 +880,7 @@ mod tests {
                             simd: true,
                             threads: true,
                         },
+                        nan_canonicalization: true,
                         ..Default::default()
                     };
                     let module = compile_with_config(&module.into_vec(), config)
@@ -972,6 +976,7 @@ mod tests {
                                 simd: true,
                                 threads: true,
                             },
+                            nan_canonicalization: true,
                             ..Default::default()
                         };
                         let module = compile_with_config(&module.into_vec(), config)