diff --git a/common/goal_constants.h b/common/goal_constants.h
index b484be1e99..974e291620 100644
--- a/common/goal_constants.h
+++ b/common/goal_constants.h
@@ -15,7 +15,7 @@ constexpr int ARRAY_DATA_OFFSET = 12;  // not including type tag
 constexpr s32 GOAL_MAX_SYMBOLS = 0x2000;
 constexpr s32 SYM_INFO_OFFSET = 0xff34;
 
-enum class RegKind { GPR_64, FLOAT, INT_128, FLOAT_4X, INVALID };
+enum class RegClass { GPR_64, FLOAT, INT_128, VECTOR_FLOAT, INVALID };
 
 constexpr u32 GOAL_NEW_METHOD = 0;       // method ID of GOAL new
 constexpr u32 GOAL_DEL_METHOD = 1;       // method ID of GOAL delete
diff --git a/common/type_system/Type.cpp b/common/type_system/Type.cpp
index 541fc93da9..39da0cf744 100644
--- a/common/type_system/Type.cpp
+++ b/common/type_system/Type.cpp
@@ -9,18 +9,18 @@
 #include "Type.h"
 
 namespace {
-std::string reg_kind_to_string(RegKind kind) {
+std::string reg_kind_to_string(RegClass kind) {  // printable name for a register class
   switch (kind) {
-    case RegKind::GPR_64:
+    case RegClass::GPR_64:
       return "gpr64";
-    case RegKind::INT_128:
+    case RegClass::INT_128:
       return "int128";
-    case RegKind::FLOAT:
+    case RegClass::FLOAT:
       return "float";
-    case RegKind::FLOAT_4X:
+    case RegClass::VECTOR_FLOAT:
       return "float-4x";
     default:
-      throw std::runtime_error("Unsupported RegKind");
+      throw std::runtime_error("Unsupported RegClass");  // INVALID or any unlisted value
   }
 }
 
@@ -268,8 +268,8 @@ int NullType::get_size_in_memory() const {
   throw std::runtime_error("get_size_in_memory called on NullType");
 }
 
-RegKind NullType::get_preferred_reg_kind() const {
-  throw std::runtime_error("get_preferred_reg_kind called on NullType");
+RegClass NullType::get_preferred_reg_class() const {
+  throw std::runtime_error("get_preferred_reg_class called on NullType");
 }
 
 int NullType::get_offset() const {
@@ -306,7 +306,7 @@ ValueType::ValueType(std::string parent,
                      bool is_boxed,
                      int size,
                      bool sign_extend,
-                     RegKind reg)
+                     RegClass reg)
     : Type(std::move(parent), std::move(name), is_boxed),
       m_size(size),
       m_sign_extend(sign_extend),
@@ -339,7 +339,7 @@ int ValueType::get_size_in_memory() const {
 /*!
  * The type of register that this value likes to be loaded into.
  */
-RegKind ValueType::get_preferred_reg_kind() const {
+RegClass ValueType::get_preferred_reg_class() const {
   return m_reg_kind;
 }
 
@@ -447,8 +447,8 @@ int ReferenceType::get_load_size() const {
 /*!
  * Pointers go in GPRs
  */
-RegKind ReferenceType::get_preferred_reg_kind() const {
-  return RegKind::GPR_64;
+RegClass ReferenceType::get_preferred_reg_class() const {
+  return RegClass::GPR_64;
 }
 
 std::string ReferenceType::print() const {
@@ -592,7 +592,7 @@ bool BitField::operator==(const BitField& other) const {
 }
 
 BitFieldType::BitFieldType(std::string parent, std::string name, int size, bool sign_extend)
-    : ValueType(std::move(parent), std::move(name), false, size, sign_extend, RegKind::GPR_64) {}
+    : ValueType(std::move(parent), std::move(name), false, size, sign_extend, RegClass::GPR_64) {}
 
 bool BitFieldType::lookup_field(const std::string& name, BitField* out) const {
   for (auto& field : m_fields) {
diff --git a/common/type_system/Type.h b/common/type_system/Type.h
index 081af6fb5b..1653fe22f7 100644
--- a/common/type_system/Type.h
+++ b/common/type_system/Type.h
@@ -44,7 +44,7 @@ class Type {
   virtual int get_size_in_memory() const = 0;
 
   // if we have no other information, what kind of register should we load into?
-  virtual RegKind get_preferred_reg_kind() const = 0;
+  virtual RegClass get_preferred_reg_class() const = 0;
 
   // get the "offset" applied to boxed objects
   virtual int get_offset() const = 0;
@@ -106,7 +106,7 @@ class NullType : public Type {
   bool get_load_signed() const override;
   int get_size_in_memory() const override;
   int get_inline_array_alignment() const override;
-  RegKind get_preferred_reg_kind() const override;
+  RegClass get_preferred_reg_class() const override;
   int get_offset() const override;
   int get_in_memory_alignment() const override;
   std::string print() const override;
@@ -125,12 +125,12 @@ class ValueType : public Type {
             bool is_boxed,
             int size,
             bool sign_extend,
-            RegKind reg);
+            RegClass reg);
   bool is_reference() const override;
   int get_load_size() const override;
   bool get_load_signed() const override;
   int get_size_in_memory() const override;
-  RegKind get_preferred_reg_kind() const override;
+  RegClass get_preferred_reg_class() const override;
   int get_offset() const override;
   int get_in_memory_alignment() const override;
   int get_inline_array_alignment() const override;
@@ -146,7 +146,7 @@ class ValueType : public Type {
   int m_size = -1;
   int m_offset = 0;
   bool m_sign_extend = false;
-  RegKind m_reg_kind = RegKind::INVALID;
+  RegClass m_reg_kind = RegClass::INVALID;
 };
 
 /*!
@@ -159,7 +159,7 @@ class ReferenceType : public Type {
   bool is_reference() const override;
   int get_load_size() const override;
   bool get_load_signed() const override;
-  RegKind get_preferred_reg_kind() const override;
+  RegClass get_preferred_reg_class() const override;
   std::string print() const override;
   ~ReferenceType() = default;
 };
diff --git a/common/type_system/TypeSystem.cpp b/common/type_system/TypeSystem.cpp
index 28184c52e3..705f3fba4f 100644
--- a/common/type_system/TypeSystem.cpp
+++ b/common/type_system/TypeSystem.cpp
@@ -123,11 +123,11 @@ DerefInfo TypeSystem::get_deref_info(const TypeSpec& ts) const {
   }
 
   // default to GPR
-  info.reg = RegKind::GPR_64;
+  info.reg = RegClass::GPR_64;
   info.mem_deref = true;
 
   if (typecheck(TypeSpec("float"), ts, "", false, false)) {
-    info.reg = RegKind::FLOAT;
+    info.reg = RegClass::FLOAT;
   }
 
   if (ts.base_type() == "inline-array") {
@@ -165,7 +165,7 @@ DerefInfo TypeSystem::get_deref_info(const TypeSpec& ts) const {
       // an array of values, which should be loaded in the correct way to the correct register
       info.stride = result_type->get_size_in_memory();
       info.sign_extend = result_type->get_load_signed();
-      info.reg = result_type->get_preferred_reg_kind();
+      info.reg = result_type->get_preferred_reg_class();
       info.load_size = result_type->get_load_size();
       assert(result_type->get_size_in_memory() == result_type->get_load_size());
     }
@@ -640,7 +640,7 @@ void TypeSystem::add_builtin_types() {
 
   // OBJECT
   auto obj_type = add_type(
-      "object", std::make_unique<ValueType>("object", "object", false, 4, false, RegKind::GPR_64));
+      "object", std::make_unique<ValueType>("object", "object", false, 4, false, RegClass::GPR_64));
 
   auto structure_type = add_builtin_structure("object", "structure");
   auto basic_type = add_builtin_basic("structure", "basic");
@@ -660,7 +660,7 @@ void TypeSystem::add_builtin_types() {
   inline_array_type->set_runtime_type("pointer");
 
   add_builtin_value_type("object", "number", 8);  // sign extend?
-  add_builtin_value_type("number", "float", 4, false, false, RegKind::FLOAT);
+  add_builtin_value_type("number", "float", 4, false, false, RegClass::FLOAT);
   add_builtin_value_type("number", "integer", 8, false, false);   // sign extend?
   add_builtin_value_type("integer", "binteger", 8, true, false);  // sign extend?
   add_builtin_value_type("integer", "sinteger", 8, false, true);
@@ -668,13 +668,13 @@ void TypeSystem::add_builtin_types() {
   add_builtin_value_type("sinteger", "int16", 2, false, true);
   add_builtin_value_type("sinteger", "int32", 4, false, true);
   add_builtin_value_type("sinteger", "int64", 8, false, true);
-  add_builtin_value_type("sinteger", "int128", 16, false, true, RegKind::INT_128);
+  add_builtin_value_type("sinteger", "int128", 16, false, true, RegClass::INT_128);
   add_builtin_value_type("integer", "uinteger", 8);
   add_builtin_value_type("uinteger", "uint8", 1);
   add_builtin_value_type("uinteger", "uint16", 2);
   add_builtin_value_type("uinteger", "uint32", 4);
   add_builtin_value_type("uinteger", "uint64", 8);
-  add_builtin_value_type("uinteger", "uint128", 16, false, false, RegKind::INT_128);
+  add_builtin_value_type("uinteger", "uint128", 16, false, false, RegClass::INT_128);
 
   auto int_type = add_builtin_value_type("integer", "int", 8, false, true);
   int_type->disallow_in_runtime();
@@ -948,7 +948,7 @@ ValueType* TypeSystem::add_builtin_value_type(const std::string& parent,
                                               int size,
                                               bool boxed,
                                               bool sign_extend,
-                                              RegKind reg) {
+                                              RegClass reg) {
   add_type(type_name,
            std::make_unique<ValueType>(parent, type_name, boxed, size, sign_extend, reg));
   return get_type_of_type<ValueType>(type_name);
diff --git a/common/type_system/TypeSystem.h b/common/type_system/TypeSystem.h
index 7fc0b4ec15..82388d3de2 100644
--- a/common/type_system/TypeSystem.h
+++ b/common/type_system/TypeSystem.h
@@ -36,7 +36,7 @@ struct DerefInfo {
   bool can_deref = false;
   bool mem_deref = false;
   bool sign_extend = false;
-  RegKind reg = RegKind::INVALID;
+  RegClass reg = RegClass::INVALID;
   int stride = -1;
   int load_size = -1;
   TypeSpec result_type;
@@ -68,7 +68,7 @@ struct ReverseDerefInfo {
 struct ReverseDerefInputInfo {
   int offset = -1;
   bool mem_deref = false;
-  RegKind reg = RegKind::INVALID;
+  RegClass reg = RegClass::INVALID;
   int load_size = -1;
   bool sign_extend = false;
   TypeSpec input_type;
@@ -81,7 +81,7 @@ struct DerefKind {
   bool is_store = false;     // when true, the sign extension shouldn't matter
   int size = -1;             // how many bytes
   bool sign_extend = false;  // for loads only (4 bytes and under), do we sign extend?
-  RegKind reg_kind = RegKind::INVALID;
+  RegClass reg_kind = RegClass::INVALID;
 };
 
 struct FieldReverseLookupInput {
@@ -239,7 +239,7 @@ class TypeSystem {
                                     int size,
                                     bool boxed = false,
                                     bool sign_extend = false,
-                                    RegKind reg = RegKind::GPR_64);
+                                    RegClass reg = RegClass::GPR_64);
   void builtin_structure_inherit(StructureType* st);
 
   enum ForwardDeclareKind { TYPE, STRUCTURE, BASIC };
diff --git a/common/versions.h b/common/versions.h
index 47a654f168..d415879b4f 100644
--- a/common/versions.h
+++ b/common/versions.h
@@ -13,7 +13,7 @@
 namespace versions {
 // language version (OpenGOAL)
 constexpr s32 GOAL_VERSION_MAJOR = 0;
-constexpr s32 GOAL_VERSION_MINOR = 4;
+constexpr s32 GOAL_VERSION_MINOR = 5;
 
 // these versions are from the game
 constexpr u32 ART_FILE_VERSION = 6;
diff --git a/decompiler/IR/IR_TypeAnalysis.cpp b/decompiler/IR/IR_TypeAnalysis.cpp
index 4485653388..c8da6cd1f8 100644
--- a/decompiler/IR/IR_TypeAnalysis.cpp
+++ b/decompiler/IR/IR_TypeAnalysis.cpp
@@ -65,12 +65,12 @@ bool get_as_reg_offset(const IR* ir, RegOffset* out) {
   return false;
 }
 
-RegKind get_reg_kind(const Register& r) {
+RegClass get_reg_kind(const Register& r) {
   switch (r.get_kind()) {
     case Reg::GPR:
-      return RegKind::GPR_64;
+      return RegClass::GPR_64;
     case Reg::FPR:
-      return RegKind::FLOAT;
+      return RegClass::FLOAT;
     default:
       assert(false);
   }
diff --git a/decompiler/config/all-types.gc b/decompiler/config/all-types.gc
index a720141863..a177fb1efd 100644
--- a/decompiler/config/all-types.gc
+++ b/decompiler/config/all-types.gc
@@ -733,21 +733,13 @@
 ; ;;(define-extern rgba object) ;; unknown type
 ; (define-extern seekl function)
 
-; ;; vector-h
-(deftype vector (structure)
-  ((data float 4 :offset-assert 0)
-   (x float :offset 0)
-   (y float :offset 4)
-   (z float :offset 8)
-   (w float :offset 12)
-   (quad uint128 :offset 0)
-   )
-  :method-count-assert 9
-  :size-assert         #x10
-  :flag-assert         #x900000010
-  )
 
-;; vector-h
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; VECTOR-H         ;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~;
+
 (deftype bit-array (basic)
   ((length           int32  :offset-assert 4)
    (allocated-length int32  :offset-assert 8)
@@ -921,11 +913,30 @@
   :flag-assert         #x900000010
   )
 
-;; what's the s here?
+(deftype vector (structure)
+  ((data float 4 :offset-assert 0)
+   (x float :offset 0)
+   (y float :offset 4)
+   (z float :offset 8)
+   (w float :offset 12)
+   (quad uint128 :offset 0)
+   )
+  :method-count-assert 9
+  :size-assert         #x10
+  :flag-assert         #x900000010
+  )
+
+(define-extern *null-vector* vector)
+(define-extern *identity-vector* vector)
+(define-extern *x-vector* vector)
+(define-extern *y-vector* vector)
+(define-extern *z-vector* vector)
+(define-extern *up-vector* vector)
+
 (deftype vector4s-3 (structure)
   ((data   float 12 :offset-assert 0) ;; guess
    (quad   uint128 3  :offset 0)
-   (vector vector4w 3  :offset 0)     ;; guess
+   (vector vector 3 :inline :offset 0)     ;; guess
    )
   :method-count-assert 9
   :size-assert         #x30
@@ -970,6 +981,8 @@
   :flag-assert         #x900000010
   )
 
+;; todo isphere
+
 (deftype box8s (structure)
   ((data   float 8       :offset-assert 0)
    (quad   uint128 2       :offset 0)
@@ -1019,7 +1032,6 @@
     )
   )
 
-;; vector-h
 (deftype vertical-planes (structure)
   ((data uint128 4 :offset-assert 0) ;; probably wrong
    )
@@ -1064,6 +1076,18 @@
   :flag-assert         #x90000000c
   )
 
+(define-extern vector-dot (function vector vector float))
+(define-extern vector-dot-vu (function vector vector float))
+(define-extern vector4-dot (function vector vector float))
+(define-extern vector4-dot-vu (function vector vector float))
+(define-extern vector+! (function vector vector vector vector))
+(define-extern vector-! (function vector vector vector vector))
+(define-extern vector-zero! (function vector vector))
+(define-extern vector-reset! (function vector vector))
+(define-extern vector-copy! (function vector vector vector))
+(define-extern *zero-vector* vector)
+
+
 
 ;; bounding-box-h
 (deftype bounding-box (structure)
@@ -31307,52 +31331,7 @@
 ;;(define-extern time-frame object) ;; unknown type
 ;;(define-extern part-id object) ;; unknown type
 
-;;(define-extern vector2h object) ;; unknown type
-;;(define-extern vector4s-3 object) ;; unknown type
-;;(define-extern *identity-vector* object) ;; unknown type
-;;(define-extern vector4ub object) ;; unknown type
-;;(define-extern vector2w object) ;; unknown type
-;;(define-extern isphere object) ;; unknown type
-(define-extern vector-dot-vu (function vector vector float))
-;;(define-extern vertical-planes object) ;; unknown type
-;;(define-extern cylinder object) ;; unknown type
-;;(define-extern *x-vector* object) ;; unknown type
-;;(define-extern *y-vector* object) ;; unknown type
-;;(define-extern *z-vector* object) ;; unknown type
-(define-extern vector-zero! function)
-;;(define-extern bit-array object) ;; unknown type
-(define-extern vector-! function)
-;;(define-extern vector16b object) ;; unknown type
-;;(define-extern vector-array object) ;; unknown type
-;;(define-extern rgbaf object) ;; unknown type
-;;(define-extern cylinder-flat object) ;; unknown type
-;;(define-extern plane object) ;; unknown type
-;;(define-extern vector3s object) ;; unknown type
-(define-extern vector-dot (function vector vector float))
-;;(define-extern *zero-vector* object) ;; unknown type
-;;(define-extern vector4h object) ;; unknown type
-;;(define-extern vector4w object) ;; unknown type
-(define-extern box8s-array type)
-;;(define-extern *null-vector* object) ;; unknown type
-;;(define-extern qword object) ;; unknown type
-;;(define-extern *up-vector* object) ;; unknown type
-;;(define-extern vector4b object) ;; unknown type
-;;(define-extern vector4w-4 object) ;; unknown type
-;;(define-extern vertical-planes-array object) ;; unknown type
-(define-extern vector4-dot (function vector vector float))
-;;(define-extern vector2uh object) ;; unknown type
-;;(define-extern sphere object) ;; unknown type
-;;(define-extern vector3h object) ;; unknown type
-;;(define-extern vector4w-3 object) ;; unknown type
-(define-extern vector+! function)
-(define-extern vector4-dot-vu (function vector vector float))
-;;(define-extern box8s object) ;; unknown type
-;;(define-extern vector3w object) ;; unknown type
-(define-extern vector-reset! function)
-;;(define-extern vector4w-2 object) ;; unknown type
-;;(define-extern vector8h object) ;; unknown type
-(define-extern vector-copy! function)
-(define-extern vector type)
+
 ;;(define-extern bounding-box object) ;; unknown type
 ;;(define-extern bounding-box4w object) ;; unknown type
 ;;(define-extern bounding-box-both object) ;; unknown type
diff --git a/doc/changelog.md b/doc/changelog.md
index 919fa4fb83..1ad7dc616a 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -90,3 +90,12 @@
 - Made string/float constants go in the main segment when they are declared in the top-level segment, instead of the top-level segment. This is what GOAL seems to do (not 100% sure yet) and avoids issues where you set something to a string constant in the top-level. This avoids the possibility of memory bugs at the cost of more memory usage (likely very little additional memory).
 - Added support for boxed arrays. They can be created with `new` and indexed with `->`. The compound type `(array <elt-type>)` is used to describe an array with a given content type.
 - Added `reset-here` option for `rlet`.
+
+## V0.5
+- Breaking change: the register class `xmm` for a single float was renamed to `fpr` to distinguish it from other uses of `xmm` registers.
+- Breaking change: the message format for reset and shutdown messages sent between the listener and runtime has changed.
+- Improved code-generation quality when accessing a field (or similar) at an offset of zero from a base register.
+- The listener now uses message IDs to more robustly handle the situation where a response message arrives extremely late, or where some sent messages are skipped.
+- Fixed bug where references to the debug segment using RIP-relative links were not set to zero by the linker when the debug segment isn't loaded.
+- The `rlet` form now supports 128-bit vector float registers with the `vf` register class.
+- Added support for "vector float" assembly operations, including `lvf`, `svf`, `xor`, `sub`, `add`, and `blend`.
\ No newline at end of file
diff --git a/doc/goal_doc.md b/doc/goal_doc.md
index 886538e75e..d98f03f66e 100644
--- a/doc/goal_doc.md
+++ b/doc/goal_doc.md
@@ -1169,7 +1169,7 @@ Not implemented well yet.
   body...
   )
 ```
-Create register variables. You can optionally specify a register with the `:reg` option and a register name like `rax` or `xmm3`. The initial value of the register is not set. If you don't specify a register, a GPR will be chosen for you by the coloring system and it will behave like a `let`.  If you don't specify a register, you can specify a register class (`gpr` or `xmm`) and the compiler will pick a GPR or XMM for you.
+Create register variables. You can optionally specify a register with the `:reg` option and a register name like `rax` or `xmm3`. The initial value of the register is not set. If you don't specify a register, a GPR will be chosen for you by the coloring system and it will behave like a `let`.  If you don't specify a register, you can instead specify a register class (`gpr`, a normal 64-bit integer register; `fpr`, a 32-bit single precision float; or `vf`, a 128-bit floating point vector register) and the compiler will pick a GPR or XMM for you.
 
 If you pick a callee-saved register and use it within the coloring system, the compiler will back it up for you in the prologue and restore it in the epilogue.
 If you pick a special register like `rsp`, it won't be backed up.  
@@ -1188,17 +1188,17 @@ Here is an example of using an `rlet` to access registers:
 ```
 
 ## General assembly forms
-In general, assembly forms have a name that begins with a `.`. They all evaluate to `none` and copy the form of an x86-64 instruction. For example `(.sub dst src)`. A destination must be a settable register (ok if it's spilled). So you can't do something like `(.sub (-> obj field) x)`. Instead, do `(set! temp (-> obj field))`, `(.sub temp x)`, `(set! (-> obj field) temp)`.   The sources can be any expression.
+In general, assembly forms have a name that begins with a `.`. They all evaluate to `none` and copy the form of an x86-64 instruction. For example `(.sub dst src)`. A destination must be a settable register (ok if it's spilled). So you can't do something like `(.sub (-> obj field) x)`. Instead, do `(set! temp (-> obj field))`, `(.sub temp x)`, `(set! (-> obj field) temp)`.   The sources can be any expression, or a register. This allows you to mix high-level code with assembly easily, like `(.mov rax (-> obj field))` or `(.push (+ 1 (-> obj field)))`.
 
 By default, assembly forms work with the coloring system. This means that assembly and high level expression can be mixed together without clobbering each other. It also means use of callee-saved registers will cause them to be backed up/restored in the function prologue and epilogue.  Use of weird registers like `r15`, `r14`, and `rsp` works as you would expect with the coloring system. 
  
-But you can also request to skip this with `:color #f` option, like `(.push my-reg-var :color #f)`. Be very careful with this. The `:color #f` option will only work with register variables from `rlet` which have a manually specified register. It will entirely bypass the coloring system and use this register. Use of this with other GOAL code is extremely dangerous and should be done very carefully or avoided.
+But you can also request to skip this with `:color #f` option, like `(.push my-reg-var :color #f)`. Be very careful with this. The `:color #f` option will only work with register variables from `rlet` which have a manually specified register. It will entirely bypass the coloring system and use this register. Use of this near high level GOAL variables is extremely dangerous and should be done very carefully or avoided, as the GOAL compiler will not know that you could be modifying its registers.  In a form with `:color #f`, you cannot use higher level code or variables - all variables must be defined in `rlet`s. This is because higher level expressions and variables cannot be used without the coloring system.
 
 ## `.sub`
 ```lisp
 (.sub dest src [:color #t|#f])
 ```
-x86-64 subtraction. If coloring is on (the default), the `dest` must be a settable register (`rlet` var, `let` var, function argument, ...). It can't be a place like a symbol, field, stack variable, etc.  If coloring is off, both `src` and `dest` must be registers defined and constrained in an enclosing `rlet`.
+x86-64 subtraction (64-bit). If coloring is on (the default), the `dest` must be a settable register (`rlet` var, `let` var, function argument, ...). It can't be a place like a symbol, field, stack variable, etc.  If coloring is off, both `src` and `dest` must be registers defined and constrained in an enclosing `rlet`.
 
 Example:
 ```
@@ -1219,27 +1219,88 @@ Example:
   )
 ```
 
+## `.add`
+```lisp
+(.add dest src [:color #t|#f])
+```
+Addition (64-bit). Similar to subtraction.
+
+## `.jr`
+```lisp
+(.jr address-reg [:color #t|#f])
+```
+Jump-register. Jumps to the address given. The address is treated as a 64-bit pointer, not a GOAL pointer.
+
+## `.load-sym`
+```lisp
+(.load-sym dest symbol-name [:sext #t|#f] [:color #t|#f])
+```
+Load the value of a symbol into a register.  By default, it will look at the type of the symbol to determine if it should be sign extended or not. You can override this with the `:sext` option if needed. The symbol must be known to the type system.
+
 ## `.push`
 ```lisp
 (.push src [:color #t|#f])
 ```
 
-The x86-64 push instruction. Does a 64-bit GPR.  The `src` can be any expression if color is on. Otherwise it must be a register defined and constrained in an enclosing `rlet`.
+The x86-64 push instruction. Does a 64-bit GPR.  The `src` can be any expression that can be put in a gpr if color is on. Otherwise it must be a register defined and constrained in an enclosing `rlet`.
 
 ## `.pop`
 ```lisp
 (.pop dst [:color #t|#f])
 ```
 
-The x86-64 pop instruction.  Does a 64-bit GPR. The `dst` can be any settable register if color is on. Otherwise it must be a register defined and constrained in an enclosing `rlet`.
+The x86-64 pop instruction.  Does a 64-bit GPR. The `dst` can be any expression which evaluates to a settable register if color is on. Otherwise it must be a register defined and constrained in an enclosing `rlet`.
 
 ## `.ret`
 ```lisp
 (.ret [:color #t|#f])
 ```
 
-The x86-64 ret instruction. The color option does nothing. This is not recognized as a control flow instruction by the coloring system.
+The x86-64 ret instruction. The color option does nothing. This is not recognized as a control flow instruction by the coloring system. It does not touch the return register `rax`.
 
+## `.mov`
+```lisp
+(.mov dst src [:color #t|#f])
+```
+Move between two registers. The `dst` should be a register (either `rlet` or `let` variable), and the `src` can be a register or any expression.  The following moves are supported:
+- `gpr` to `gpr`
+- `fpr` to `fpr` (only moves lower 32-bits of the xmms, uses `movss`)
+- `vf` to `vf` (moves all 128-bits of the xmms, uses `vmovaps`)
+- `gpr` to `fpr` (only moves 32-bits, uses `movd`)
+- `fpr` to `gpr` (only moves 32-bits, upper 32-bits are zero, uses `movd`)
+This code generation is identical to using a `(set! dst src)` form.
+  
+## `.lvf`
+```lisp
+(.lvf dst-reg src-loc [:color #t|#f])
+```
+Load a vector float register from `src-loc`. The `dst-reg` must be a vector float register. The `src-loc` can be a gpr containing a GOAL pointer or expression which gives a GOAL pointer. There is no type checking on the `src-loc` so be careful. The load uses `vmovaps`, so the source must be 16-byte aligned. 
+
+If the source is in the form `base-reg + constant-offset`, like from a `(&-> my-object my-inline-vector-field)`, the constant offset will be folded into the load instruction like `vmovaps xmm1, [r15 + rax + 12]`.
+
+If the source is an immediate `(new 'static ...)` form that results in a statically allocated variable, it will use the `RIP`-relative addressing form (32-bit immediate). This means that the code:
+```lisp
+(.lvf vf1 (new 'static 'vector :x 1.2 :y 2.3 :z 3.4 :w 5.6))
+```
+will be just a single instruction to do a `vmovaps xmm1, [rip + XXX]`.
+
+## `.svf`
+```lisp
+(.svf dst-loc src-reg [:color #t|#f])
+```
+Store a vector float. Works similarly to the `.lvf` form, but there is no optimized case for storing into a static because this isn't allowed in GOAL.
+
+## Three operand vector float operations.
+```lisp
+(.<op-name>.vf dst src0 src1 [:color #t|#f])
+```
+All the three operand forms work similarly. You can do something like `(.add.vf vf1 vf2 vf3)`. All operations use the similarly named `v<op-name>ps` instruction, xmm128 VEX encoding. We support `xor`, `sub`, and `add` so far.
+
+## `.blend.vf`
+```lisp
+(.blend.vf dst src0 src1 mask [:color #t|#f])
+```
+Wrapper around `vblendps` (VEX xmm128 version) instruction. The `mask` must evaluate to a constant integer at compile time. The integer must be in the range of 0-15. 
 
 # Compiler Forms - Unsorted
 
diff --git a/goal_src/engine/math/vector-h.gc b/goal_src/engine/math/vector-h.gc
index ef5a71fd60..17a857d29c 100644
--- a/goal_src/engine/math/vector-h.gc
+++ b/goal_src/engine/math/vector-h.gc
@@ -268,7 +268,7 @@
 (deftype vector4s-3 (structure)
   ((data   float 12 :offset-assert 0) ;; guess
    (quad   uint128 3  :offset 0)
-   (vector vector4w 3  :offset 0)     ;; guess
+   (vector vector 3 :inline :offset 0)     ;; guess
    )
   :method-count-assert 9
   :size-assert         #x30
@@ -315,6 +315,7 @@
   :flag-assert         #x900000010
   )
 
+; todo
 ; (deftype isphere (vec4s)
 ;   ()
 ;   :method-count-assert 9
@@ -417,8 +418,6 @@
   :flag-assert         #x90000000c
   )
 
-;; todo dot, dot-vu, 4-dot, 4-dot-vu, +!, -!, zero!, reset!, copy!
-
 (defun vector-dot ((a vector) (b vector))
   "Take the dot product of two vectors.
    Only does the x, y, z compoments.
@@ -461,4 +460,79 @@
    (vector4-dot a b)
   )
 
+(defun vector+! ((dst vector) (a vector) (b vector))
+  "Set dst = a + b. The w component of dst is set to 0. Returns dst."
+  (declare (inline))
+  (rlet ((vf0 :class vf :reset-here #t)
+         (vf1 :class vf :reset-here #t)
+         (vf2 :class vf :reset-here #t)
+         (vf3 :class vf :reset-here #t))
+        ; load both source vectors (a -> vf2, b -> vf3)
+        (.lvf vf2 a)
+        (.lvf vf3 b)
+        ; set vf0 to zero by xor-ing it with itself
+        (.xor.vf vf0 vf0 vf0)
+        ; vf1 = a + b
+        (.add.vf vf1 vf2 vf3)
+        ; set w = 0 by blending in the zeroed vf0 with mask #b1000
+        (.blend.vf vf1 vf1 vf0 #b1000)
+        ; store the result into dst
+        (.svf dst vf1)
+        )
+  dst
+  )
+
+(defun vector-! ((dst vector) (a vector) (b vector))
+  "Set dst = a - b. The w component of dst is set to 0. Returns dst."
+  (declare (inline))
+  (rlet ((vf0 :class vf :reset-here #t)
+         (vf1 :class vf :reset-here #t)
+         (vf2 :class vf :reset-here #t)
+         (vf3 :class vf :reset-here #t))
+        ; load both source vectors (a -> vf2, b -> vf3)
+        (.lvf vf2 a)
+        (.lvf vf3 b)
+        ; set vf0 to zero by xor-ing it with itself
+        (.xor.vf vf0 vf0 vf0)
+        ; vf1 = a - b
+        (.sub.vf vf1 vf2 vf3)
+        ; set w = 0 by blending in the zeroed vf0 with mask #b1000
+        (.blend.vf vf1 vf1 vf0 #b1000)
+        ; store the result into dst
+        (.svf dst vf1)
+        )
+  dst
+  )
+
+(defun vector-zero! ((dest vector))
+  "Set all four components (x, y, z, w) of dest to 0. Returns dest."
+  (declare (inline))
+  (rlet ((vf1 :class vf :reset-here #t))
+        ; set vf1 = 0 by xor-ing it with itself
+        (.xor.vf vf1 vf1 vf1)
+        ; store the zeroed register into dest
+        (.svf dest vf1)
+        )
+  dest
+  )
+
+(defun vector-reset! ((dst vector))
+  "Set dst to (0, 0, 0, 1): zero the whole vector, then set w to 1.0. Returns dst."
+  (declare (inline))
+  (vector-zero! dst)
+  (set! (-> dst w) 1.0)
+  dst
+  )
+
+(defun vector-copy! ((dst vector) (src vector))
+  "Copy vector src to dst. Copies the entire quadword (xyzw) through one vf register.
+   The vectors must be 16-byte aligned. Returns dst."
+  (declare (inline))
+  (rlet ((vf1 :class vf :reset-here #t))
+        (.lvf vf1 src)
+        (.svf dst vf1)
+        )
+  dst
+  )
+
 (define *zero-vector* (new 'static 'vector :x 0. :y 0. :z 0. :w 0.))
\ No newline at end of file
diff --git a/goalc/compiler/Compiler.h b/goalc/compiler/Compiler.h
index 0cd8bf73e0..9cb6e28a98 100644
--- a/goalc/compiler/Compiler.h
+++ b/goalc/compiler/Compiler.h
@@ -65,6 +65,11 @@ class Compiler {
   Val* compile_get_symbol_value(const goos::Object& form, const std::string& name, Env* env);
   Val* compile_function_or_method_call(const goos::Object& form, Env* env);
 
+  Val* compile_asm_vf_math3(const goos::Object& form,
+                            const goos::Object& rest,
+                            IR_VFMath3Asm::Kind kind,
+                            Env* env);
+
   Val* get_field_of_structure(const StructureType* type,
                               Val* object,
                               const std::string& field_name,
@@ -112,7 +117,7 @@ class Compiler {
 
   TypeSpec parse_typespec(const goos::Object& src);
   bool is_local_symbol(const goos::Object& obj, Env* env);
-  emitter::RegKind get_preferred_reg_kind(const TypeSpec& ts);
+  emitter::HWRegKind get_preferred_reg_kind(const TypeSpec& ts);
   Val* compile_real_function_call(const goos::Object& form,
                                   RegVal* function,
                                   const std::vector<RegVal*>& args,
@@ -281,6 +286,13 @@ class Compiler {
   Val* compile_asm_load_sym(const goos::Object& form, const goos::Object& rest, Env* env);
   Val* compile_asm_jr(const goos::Object& form, const goos::Object& rest, Env* env);
   Val* compile_asm_mov(const goos::Object& form, const goos::Object& rest, Env* env);
+  Val* compile_asm_lvf(const goos::Object& form, const goos::Object& rest, Env* env);
+  Val* compile_asm_svf(const goos::Object& form, const goos::Object& rest, Env* env);
+  Val* compile_asm_xor_vf(const goos::Object& form, const goos::Object& rest, Env* env);
+  Val* compile_asm_sub_vf(const goos::Object& form, const goos::Object& rest, Env* env);
+  Val* compile_asm_add_vf(const goos::Object& form, const goos::Object& rest, Env* env);
+  Val* compile_asm_blend_vf(const goos::Object& form, const goos::Object& rest, Env* env);
+
   // Atoms
 
   // Block
diff --git a/goalc/compiler/Env.cpp b/goalc/compiler/Env.cpp
index b4beb388bd..71ba6da63e 100644
--- a/goalc/compiler/Env.cpp
+++ b/goalc/compiler/Env.cpp
@@ -18,8 +18,8 @@ void Env::emit(std::unique_ptr<IR> ir) {
 /*!
  * Allocate an IRegister with the given type.
  */
-RegVal* Env::make_ireg(TypeSpec ts, emitter::RegKind kind) {
-  return m_parent->make_ireg(std::move(ts), kind);
+RegVal* Env::make_ireg(TypeSpec ts, RegClass reg_class) {
+  return m_parent->make_ireg(std::move(ts), reg_class);
 }
 
 /*!
@@ -41,11 +41,11 @@ BlockEnv* Env::find_block(const std::string& name) {
 }
 
 RegVal* Env::make_gpr(const TypeSpec& ts) {
-  return make_ireg(coerce_to_reg_type(ts), emitter::RegKind::GPR);
+  return make_ireg(coerce_to_reg_type(ts), RegClass::GPR_64);
 }
 
-RegVal* Env::make_xmm(const TypeSpec& ts) {
-  return make_ireg(coerce_to_reg_type(ts), emitter::RegKind::XMM);
+RegVal* Env::make_fpr(const TypeSpec& ts) {
+  return make_ireg(coerce_to_reg_type(ts), RegClass::FLOAT);
 }
 
 std::unordered_map<std::string, Label>& Env::get_label_map() {
@@ -77,9 +77,9 @@ void GlobalEnv::emit(std::unique_ptr<IR> ir) {
 /*!
  * Allocate an IRegister with the given type.
  */
-RegVal* GlobalEnv::make_ireg(TypeSpec ts, emitter::RegKind kind) {
+RegVal* GlobalEnv::make_ireg(TypeSpec ts, RegClass reg_class) {
   (void)ts;
-  (void)kind;
+  (void)reg_class;
   throw std::runtime_error("cannot alloc reg in GlobalEnv");
 }
 
@@ -231,13 +231,13 @@ void FunctionEnv::resolve_gotos() {
   }
 }
 
-RegVal* FunctionEnv::make_ireg(TypeSpec ts, emitter::RegKind kind) {
+RegVal* FunctionEnv::make_ireg(TypeSpec ts, RegClass reg_class) {
   IRegister ireg;
-  ireg.kind = kind;
+  ireg.reg_class = reg_class;
   ireg.id = m_iregs.size();
   auto rv = std::make_unique<RegVal>(ireg, coerce_to_reg_type(ts));
   m_iregs.push_back(std::move(rv));
-  assert(kind != emitter::RegKind::INVALID);
+  assert(reg_class != RegClass::INVALID);
   return m_iregs.back().get();
 }
 
diff --git a/goalc/compiler/Env.h b/goalc/compiler/Env.h
index 19a5fb45df..f64bd7a0c9 100644
--- a/goalc/compiler/Env.h
+++ b/goalc/compiler/Env.h
@@ -31,13 +31,13 @@ class Env {
   explicit Env(Env* parent) : m_parent(parent) {}
   virtual std::string print() = 0;
   virtual void emit(std::unique_ptr<IR> ir);
-  virtual RegVal* make_ireg(TypeSpec ts, emitter::RegKind kind);
+  virtual RegVal* make_ireg(TypeSpec ts, RegClass reg_class);
   virtual void constrain_reg(IRegConstraint constraint);  // todo, remove!
   virtual RegVal* lexical_lookup(goos::Object sym);
   virtual BlockEnv* find_block(const std::string& name);
   virtual std::unordered_map<std::string, Label>& get_label_map();
   RegVal* make_gpr(const TypeSpec& ts);
-  RegVal* make_xmm(const TypeSpec& ts);
+  RegVal* make_fpr(const TypeSpec& ts);
   virtual ~Env() = default;
   Env* parent() { return m_parent; }
 
@@ -58,7 +58,7 @@ class GlobalEnv : public Env {
   GlobalEnv();
   std::string print() override;
   void emit(std::unique_ptr<IR> ir) override;
-  RegVal* make_ireg(TypeSpec ts, emitter::RegKind kind) override;
+  RegVal* make_ireg(TypeSpec ts, RegClass reg_class) override;
   void constrain_reg(IRegConstraint constraint) override;
   RegVal* lexical_lookup(goos::Object sym) override;
   BlockEnv* find_block(const std::string& name) override;
@@ -159,7 +159,7 @@ class FunctionEnv : public DeclareEnv {
   void set_segment(int seg) { segment = seg; }
   void emit(std::unique_ptr<IR> ir) override;
   void finish();
-  RegVal* make_ireg(TypeSpec ts, emitter::RegKind kind) override;
+  RegVal* make_ireg(TypeSpec ts, RegClass reg_class) override;
   const std::vector<std::unique_ptr<IR>>& code() const { return m_code; }
   int max_vars() const { return m_iregs.size(); }
   const std::vector<IRegConstraint>& constraints() { return m_constraints; }
diff --git a/goalc/compiler/IR.cpp b/goalc/compiler/IR.cpp
index 9e11ecdda9..39a127ee31 100644
--- a/goalc/compiler/IR.cpp
+++ b/goalc/compiler/IR.cpp
@@ -41,6 +41,12 @@ Register get_no_color_reg(const RegVal* rv) {
   return rv->rlet_constraint().value();
 }
 
+Register get_reg_asm(const RegVal* rv,  // register to use for rv in an asm-style IR op
+                     const AllocationResult& allocs,
+                     emitter::IR_Record irec,
+                     bool use_coloring) {  // true: allocator's register, false: rlet constraint
+  return use_coloring ? get_reg(rv, allocs, irec) : get_no_color_reg(rv);
+}
 void load_constant(u64 value,
                    emitter::ObjectGenerator* gen,
                    emitter::IR_Record irec,
@@ -64,6 +70,49 @@ void load_constant(u64 value,
     }
   }
 }
+
+/*!
+ * Emit the register-to-register move "dst = src" used by IR_RegSet and IR_RegSetAsm.
+ * gpr, float and vector-float values can be moved within their own class (IGen::null() is
+ * emitted when both already share a register), and float <-> gpr uses the movd forms.
+ * use_coloring picks between allocator-assigned registers and rlet constraints (get_reg_asm).
+ * Throws std::runtime_error for any other class pair; assert(false) alone would let builds
+ * without asserts emit nothing for the move.
+ */
+void regset_common(emitter::ObjectGenerator* gen,
+                   const AllocationResult& allocs,
+                   emitter::IR_Record irec,
+                   const RegVal* dst,
+                   const RegVal* src,
+                   bool use_coloring) {
+  auto src_reg = get_reg_asm(src, allocs, irec, use_coloring);
+  auto dst_reg = get_reg_asm(dst, allocs, irec, use_coloring);
+  auto src_class = src->ireg().reg_class;
+  auto dst_class = dst->ireg().reg_class;
+
+  if (src_class == dst_class) {
+    // Same-class move: only gpr, float and vector float are supported.
+    if (src_class != RegClass::GPR_64 && src_class != RegClass::FLOAT &&
+        src_class != RegClass::VECTOR_FLOAT) {
+      throw std::runtime_error("regset_common: unsupported register class for move");
+    }
+    if (src_reg == dst_reg) {
+      gen->add_instr(IGen::null(), irec);  // eliminate move
+    } else if (src_class == RegClass::GPR_64) {
+      gen->add_instr(IGen::mov_gpr64_gpr64(dst_reg, src_reg), irec);
+    } else if (src_class == RegClass::FLOAT) {
+      gen->add_instr(IGen::mov_xmm32_xmm32(dst_reg, src_reg), irec);
+    } else {
+      gen->add_instr(IGen::mov_vf_vf(dst_reg, src_reg), irec);
+    }
+  } else if (src_class == RegClass::FLOAT && dst_class == RegClass::GPR_64) {
+    gen->add_instr(IGen::movd_gpr32_xmm32(dst_reg, src_reg), irec);  // xmm 1x -> gpr
+  } else if (src_class == RegClass::GPR_64 && dst_class == RegClass::FLOAT) {
+    gen->add_instr(IGen::movd_xmm32_gpr32(dst_reg, src_reg), irec);  // gpr -> xmm 1x
+  } else {
+    throw std::runtime_error("regset_common: unsupported register class for move");
+  }
+}
 }  // namespace
 
 ///////////
@@ -79,7 +128,7 @@ RegAllocInstr IR_Return::to_rai() {
   RegAllocInstr rai;
   rai.write.push_back(m_return_reg->ireg());
   rai.read.push_back(m_value->ireg());
-  if (m_value->ireg().kind == m_return_reg->ireg().kind) {
+  if (m_value->ireg().reg_class == m_return_reg->ireg().reg_class) {
     rai.is_move = true;  // only true if we aren't moving from register kind to register kind
   }
   return rai;
@@ -233,7 +282,7 @@ RegAllocInstr IR_RegSet::to_rai() {
   RegAllocInstr rai;
   rai.write.push_back(m_dest->ireg());
   rai.read.push_back(m_src->ireg());
-  if (m_dest->ireg().kind == m_src->ireg().kind) {
+  if (m_dest->ireg().reg_class == m_src->ireg().reg_class) {
     rai.is_move = true;  // only true if we aren't moving from register kind to register kind
   }
   return rai;
@@ -242,22 +291,7 @@ RegAllocInstr IR_RegSet::to_rai() {
 void IR_RegSet::do_codegen(emitter::ObjectGenerator* gen,
                            const AllocationResult& allocs,
                            emitter::IR_Record irec) {
-  auto val_reg = get_reg(m_src, allocs, irec);
-  auto dest_reg = get_reg(m_dest, allocs, irec);
-
-  if (val_reg == dest_reg) {
-    gen->add_instr(IGen::null(), irec);
-  } else if (val_reg.is_gpr() && dest_reg.is_gpr()) {
-    gen->add_instr(IGen::mov_gpr64_gpr64(dest_reg, val_reg), irec);
-  } else if (val_reg.is_xmm() && dest_reg.is_gpr()) {
-    gen->add_instr(IGen::movd_gpr32_xmm32(dest_reg, val_reg), irec);
-  } else if (val_reg.is_gpr() && dest_reg.is_xmm()) {
-    gen->add_instr(IGen::movd_xmm32_gpr32(dest_reg, val_reg), irec);
-  } else if (val_reg.is_xmm() && dest_reg.is_xmm()) {
-    gen->add_instr(IGen::mov_xmm32_xmm32(dest_reg, val_reg), irec);
-  } else {
-    assert(false);
-  }
+  regset_common(gen, allocs, irec, m_dest, m_src, true);
 }
 
 std::string IR_RegSet::print() {
@@ -643,13 +677,18 @@ void IR_StaticVarLoad::do_codegen(emitter::ObjectGenerator* gen,
   auto load_info = m_src->get_load_info();
   assert(m_src->get_addr_offset() == 0);
 
-  if (m_dest->ireg().kind == emitter::RegKind::XMM) {
+  if (m_dest->ireg().reg_class == RegClass::FLOAT) {
     assert(load_info.load_signed == false);
     assert(load_info.load_size == 4);
     assert(load_info.requires_load == true);
 
     auto instr = gen->add_instr(IGen::static_load_xmm32(get_reg(m_dest, allocs, irec), 0), irec);
     gen->link_instruction_static(instr, m_src->rec, 0);
+  } else if (m_dest->ireg().reg_class == RegClass::VECTOR_FLOAT) {
+    // we don't check the load info intentionally because we want to allow loading an entire
+    // vector structure.
+    auto instr = gen->add_instr(IGen::loadvf_rip_plus_s32(get_reg(m_dest, allocs, irec), 0), irec);
+    gen->link_instruction_static(instr, m_src->rec, 0);
   } else {
     assert(false);
   }
@@ -766,8 +805,9 @@ void IR_ConditionalBranch::do_codegen(emitter::ObjectGenerator* gen,
 IR_LoadConstOffset::IR_LoadConstOffset(const RegVal* dest,
                                        int offset,
                                        const RegVal* base,
-                                       MemLoadInfo info)
-    : m_dest(dest), m_offset(offset), m_base(base), m_info(info) {}
+                                       MemLoadInfo info,
+                                       bool use_coloring)
+    : IR_Asm(use_coloring), m_dest(dest), m_offset(offset), m_base(base), m_info(info) {}
 
 std::string IR_LoadConstOffset::print() {
   return fmt::format("mov {}, [{} + {}]", m_dest->print(), m_base->print(), m_offset);
@@ -783,17 +823,22 @@ RegAllocInstr IR_LoadConstOffset::to_rai() {
 void IR_LoadConstOffset::do_codegen(emitter::ObjectGenerator* gen,
                                     const AllocationResult& allocs,
                                     emitter::IR_Record irec) {
-  if (m_dest->ireg().kind == emitter::RegKind::GPR) {
-    gen->add_instr(IGen::load_goal_gpr(get_reg(m_dest, allocs, irec), get_reg(m_base, allocs, irec),
-                                       emitter::gRegInfo.get_offset_reg(), m_offset, m_info.size,
-                                       m_info.sign_extend),
+  auto dest_reg = m_use_coloring ? get_reg(m_dest, allocs, irec) : get_no_color_reg(m_dest);
+  auto base_reg = m_use_coloring ? get_reg(m_base, allocs, irec) : get_no_color_reg(m_base);
+
+  if (m_dest->ireg().reg_class == RegClass::GPR_64) {
+    gen->add_instr(IGen::load_goal_gpr(dest_reg, base_reg, emitter::gRegInfo.get_offset_reg(),
+                                       m_offset, m_info.size, m_info.sign_extend),
                    irec);
-  } else if (m_dest->ireg().kind == emitter::RegKind::XMM && m_info.size == 4 &&
-             m_info.sign_extend == false && m_info.reg == ::RegKind::FLOAT) {
+  } else if (m_dest->ireg().reg_class == RegClass::FLOAT && m_info.size == 4 &&
+             m_info.sign_extend == false && m_info.reg == RegClass::FLOAT) {
     gen->add_instr(
-        IGen::load_goal_xmm32(get_reg(m_dest, allocs, irec), get_reg(m_base, allocs, irec),
-                              emitter::gRegInfo.get_offset_reg(), m_offset),
+        IGen::load_goal_xmm32(dest_reg, base_reg, emitter::gRegInfo.get_offset_reg(), m_offset),
         irec);
+  } else if (m_dest->ireg().reg_class == RegClass::VECTOR_FLOAT && m_info.size == 16 &&
+             m_info.sign_extend == false && m_info.reg == RegClass::VECTOR_FLOAT) {
+    gen->add_instr(
+        IGen::load_goal_vf(dest_reg, base_reg, emitter::gRegInfo.get_offset_reg(), m_offset), irec);
   } else {
     throw std::runtime_error("IR_LoadConstOffset::do_codegen not supported");
   }
@@ -805,8 +850,9 @@ void IR_LoadConstOffset::do_codegen(emitter::ObjectGenerator* gen,
 IR_StoreConstOffset::IR_StoreConstOffset(const RegVal* value,
                                          int offset,
                                          const RegVal* base,
-                                         int size)
-    : m_value(value), m_offset(offset), m_base(base), m_size(size) {}
+                                         int size,
+                                         bool use_coloring)
+    : IR_Asm(use_coloring), m_value(value), m_offset(offset), m_base(base), m_size(size) {}
 
 std::string IR_StoreConstOffset::print() {
   return fmt::format("move [{} + {}], {}", m_base->print(), m_offset, m_value->print());
@@ -822,15 +868,20 @@ RegAllocInstr IR_StoreConstOffset::to_rai() {
 void IR_StoreConstOffset::do_codegen(emitter::ObjectGenerator* gen,
                                      const AllocationResult& allocs,
                                      emitter::IR_Record irec) {
-  if (m_value->ireg().kind == emitter::RegKind::GPR) {
+  auto base_reg = m_use_coloring ? get_reg(m_base, allocs, irec) : get_no_color_reg(m_base);
+  auto value_reg = m_use_coloring ? get_reg(m_value, allocs, irec) : get_no_color_reg(m_value);
+
+  if (m_value->ireg().reg_class == RegClass::GPR_64) {
+    gen->add_instr(IGen::store_goal_gpr(base_reg, value_reg, emitter::gRegInfo.get_offset_reg(),
+                                        m_offset, m_size),
+                   irec);
+  } else if (m_value->ireg().reg_class == RegClass::FLOAT && m_size == 4) {
     gen->add_instr(
-        IGen::store_goal_gpr(get_reg(m_base, allocs, irec), get_reg(m_value, allocs, irec),
-                             emitter::gRegInfo.get_offset_reg(), m_offset, m_size),
+        IGen::store_goal_xmm32(base_reg, value_reg, emitter::gRegInfo.get_offset_reg(), m_offset),
         irec);
-  } else if (m_value->ireg().kind == emitter::RegKind::XMM && m_size == 4) {
+  } else if (m_value->ireg().reg_class == RegClass::VECTOR_FLOAT && m_size == 16) {
     gen->add_instr(
-        IGen::store_goal_xmm32(get_reg(m_base, allocs, irec), get_reg(m_value, allocs, irec),
-                               emitter::gRegInfo.get_offset_reg(), m_offset),
+        IGen::store_goal_vf(base_reg, value_reg, emitter::gRegInfo.get_offset_reg(), m_offset),
         irec);
   } else {
     throw std::runtime_error("IR_StoreConstOffset::do_codegen can't handle this");
@@ -1203,20 +1254,95 @@ RegAllocInstr IR_RegSetAsm::to_rai() {
 void IR_RegSetAsm::do_codegen(emitter::ObjectGenerator* gen,
                               const AllocationResult& allocs,
                               emitter::IR_Record irec) {
-  auto val_reg = m_use_coloring ? get_reg(m_src, allocs, irec) : get_no_color_reg(m_src);
-  auto dest_reg = m_use_coloring ? get_reg(m_dst, allocs, irec) : get_no_color_reg(m_dst);
+  regset_common(gen, allocs, irec, m_dst, m_src, m_use_coloring);
+}
 
-  if (val_reg == dest_reg) {
-    gen->add_instr(IGen::null(), irec);
-  } else if (val_reg.is_gpr() && dest_reg.is_gpr()) {
-    gen->add_instr(IGen::mov_gpr64_gpr64(dest_reg, val_reg), irec);
-  } else if (val_reg.is_xmm() && dest_reg.is_gpr()) {
-    gen->add_instr(IGen::movd_gpr32_xmm32(dest_reg, val_reg), irec);
-  } else if (val_reg.is_gpr() && dest_reg.is_xmm()) {
-    gen->add_instr(IGen::movd_xmm32_gpr32(dest_reg, val_reg), irec);
-  } else if (val_reg.is_xmm() && dest_reg.is_xmm()) {
-    gen->add_instr(IGen::mov_xmm32_xmm32(dest_reg, val_reg), irec);
-  } else {
-    assert(false);
+///////////////////////
+// AsmVF3
+///////////////////////
+
+IR_VFMath3Asm::IR_VFMath3Asm(bool use_color,  // forwarded to the IR_Asm base
+                             const RegVal* dst,
+                             const RegVal* src1,
+                             const RegVal* src2,
+                             Kind kind)  // which operation: XOR, SUB or ADD
+    : IR_Asm(use_color), m_dst(dst), m_src1(src1), m_src2(src2), m_kind(kind) {}
+
+/*!
+ * Text form: "<mnemonic><color suffix> dst, src1, src2".
+ * An unknown Kind throws, so no path can run off the end of this non-void function.
+ */
+std::string IR_VFMath3Asm::print() {
+  const char* mnemonic = m_kind == Kind::XOR   ? ".xor.vf"
+                         : m_kind == Kind::SUB ? ".sub.vf"
+                         : m_kind == Kind::ADD ? ".add.vf"
+                                               : nullptr;
+  if (!mnemonic) {
+    throw std::runtime_error("Unknown IR_VFMath3Asm kind");
+  }
+  return fmt::format("{}{} {}, {}, {}", mnemonic, get_color_suffix_string(), m_dst->print(),
+                     m_src1->print(), m_src2->print());
+}
+
+RegAllocInstr IR_VFMath3Asm::to_rai() {
+  RegAllocInstr usage;  // stays empty for an uncolored op: no registers are reported
+  if (m_use_coloring) {
+    usage.read.push_back(m_src1->ireg());
+    usage.read.push_back(m_src2->ireg());
+    usage.write.push_back(m_dst->ireg());  // dst is the only register written
+  }
+  return usage;
+}
+
+/*!
+ * Emit the one vector-float instruction selected by m_kind (xor, sub or add) on dst, src1, src2.
+ * An unrecognized kind trips assert(false).
+ */
+void IR_VFMath3Asm::do_codegen(emitter::ObjectGenerator* gen,
+                               const AllocationResult& allocs,
+                               emitter::IR_Record irec) {
+  // Operand registers come from the allocator, or from rlet constraints when uncolored.
+  auto dst_reg = get_reg_asm(m_dst, allocs, irec, m_use_coloring);
+  auto lhs_reg = get_reg_asm(m_src1, allocs, irec, m_use_coloring);
+  auto rhs_reg = get_reg_asm(m_src2, allocs, irec, m_use_coloring);
+  if (m_kind == Kind::XOR) {
+    gen->add_instr(IGen::xor_vf(dst_reg, lhs_reg, rhs_reg), irec);
+  } else if (m_kind == Kind::SUB) {
+    gen->add_instr(IGen::sub_vf(dst_reg, lhs_reg, rhs_reg), irec);
+  } else if (m_kind == Kind::ADD) {
+    gen->add_instr(IGen::add_vf(dst_reg, lhs_reg, rhs_reg), irec);
+  } else {
+    assert(false);
+  }
+}
+
+IR_BlendVF::IR_BlendVF(bool use_color,  // forwarded to the IR_Asm base
+                       const RegVal* dst,
+                       const RegVal* src1,
+                       const RegVal* src2,
+                       u8 mask)  // stored in m_mask
+    : IR_Asm(use_color), m_dst(dst), m_src1(src1), m_src2(src2), m_mask(mask) {}
+
+std::string IR_BlendVF::print() {  // ".blend.vf<color suffix> dst, src1, src2, mask"
+  return fmt::format(".blend.vf{} {}, {}, {}, {}", get_color_suffix_string(), m_dst->print(),
+                     m_src1->print(), m_src2->print(), m_mask);
+}
+
+RegAllocInstr IR_BlendVF::to_rai() {
+  RegAllocInstr usage;  // stays empty for an uncolored op: no registers are reported
+  if (m_use_coloring) {
+    usage.read.push_back(m_src1->ireg());
+    usage.read.push_back(m_src2->ireg());
+    usage.write.push_back(m_dst->ireg());  // dst is the only register written
+  }
+  return usage;
+}
+
+void IR_BlendVF::do_codegen(emitter::ObjectGenerator* gen,
+                            const AllocationResult& allocs,
+                            emitter::IR_Record irec) {
+  auto dst = get_reg_asm(m_dst, allocs, irec, m_use_coloring);  // registers honor m_use_coloring
+  auto src1 = get_reg_asm(m_src1, allocs, irec, m_use_coloring);
+  auto src2 = get_reg_asm(m_src2, allocs, irec, m_use_coloring);
+  gen->add_instr(IGen::blend_vf(dst, src1, src2, m_mask), irec);  // m_mask is passed as-is
+}
\ No newline at end of file
diff --git a/goalc/compiler/IR.h b/goalc/compiler/IR.h
index 708cc199d7..1f369062dd 100644
--- a/goalc/compiler/IR.h
+++ b/goalc/compiler/IR.h
@@ -274,38 +274,6 @@ class IR_ConditionalBranch : public IR {
   bool m_resolved = false;
 };
 
-class IR_LoadConstOffset : public IR {
- public:
-  IR_LoadConstOffset(const RegVal* dest, int offset, const RegVal* base, MemLoadInfo info);
-  std::string print() override;
-  RegAllocInstr to_rai() override;
-  void do_codegen(emitter::ObjectGenerator* gen,
-                  const AllocationResult& allocs,
-                  emitter::IR_Record irec) override;
-
- private:
-  const RegVal* m_dest = nullptr;
-  int m_offset = 0;
-  const RegVal* m_base = nullptr;
-  MemLoadInfo m_info;
-};
-
-class IR_StoreConstOffset : public IR {
- public:
-  IR_StoreConstOffset(const RegVal* value, int offset, const RegVal* base, int size);
-  std::string print() override;
-  RegAllocInstr to_rai() override;
-  void do_codegen(emitter::ObjectGenerator* gen,
-                  const AllocationResult& allocs,
-                  emitter::IR_Record irec) override;
-
- private:
-  const RegVal* m_value = nullptr;
-  int m_offset = 0;
-  const RegVal* m_base = nullptr;
-  int m_size = 0;
-};
-
 class IR_Null : public IR {
  public:
   IR_Null() = default;
@@ -380,6 +348,46 @@ class IR_Asm : public IR {
   bool m_use_coloring;
 };
 
+class IR_LoadConstOffset : public IR_Asm {  // load: dest = [base + offset]
+ public:
+  IR_LoadConstOffset(const RegVal* dest,
+                     int offset,
+                     const RegVal* base,
+                     MemLoadInfo info,
+                     bool use_coloring = true);  // defaulted, so 4-argument callers still compile
+  std::string print() override;
+  RegAllocInstr to_rai() override;
+  void do_codegen(emitter::ObjectGenerator* gen,
+                  const AllocationResult& allocs,
+                  emitter::IR_Record irec) override;
+
+ private:
+  const RegVal* m_dest = nullptr;  // register that receives the loaded value
+  int m_offset = 0;  // constant offset applied to m_base
+  const RegVal* m_base = nullptr;  // register holding the base address
+  MemLoadInfo m_info;  // size, sign extension and register class of the load
+};
+
+class IR_StoreConstOffset : public IR_Asm {  // store: [base + offset] = value
+ public:
+  IR_StoreConstOffset(const RegVal* value,
+                      int offset,
+                      const RegVal* base,
+                      int size,
+                      bool use_coloring = true);  // defaulted, so 4-argument callers still compile
+  std::string print() override;
+  RegAllocInstr to_rai() override;
+  void do_codegen(emitter::ObjectGenerator* gen,
+                  const AllocationResult& allocs,
+                  emitter::IR_Record irec) override;
+
+ private:
+  const RegVal* m_value = nullptr;  // register whose contents are stored
+  int m_offset = 0;  // constant offset applied to m_base
+  const RegVal* m_base = nullptr;  // register holding the base address
+  int m_size = 0;  // store size; do_codegen requires 4 for float and 16 for vector-float values
+};
+
 class IR_AsmRet : public IR_Asm {
  public:
   IR_AsmRet(bool use_coloring);
@@ -486,4 +494,40 @@ class IR_RegSetAsm : public IR_Asm {
   const RegVal* m_src = nullptr;
 };
 
+class IR_VFMath3Asm : public IR_Asm {  // three-operand vector-float op: dst, src1, src2
+ public:
+  enum class Kind { XOR, SUB, ADD };  // selects which IGen *_vf instruction is emitted
+  IR_VFMath3Asm(bool use_color,
+                const RegVal* dst,
+                const RegVal* src1,
+                const RegVal* src2,
+                Kind kind);
+  std::string print() override;
+  RegAllocInstr to_rai() override;
+  void do_codegen(emitter::ObjectGenerator* gen,
+                  const AllocationResult& allocs,
+                  emitter::IR_Record irec) override;
+
+ protected:
+  const RegVal* m_dst = nullptr;  // written by the operation
+  const RegVal* m_src1 = nullptr;  // first operand read
+  const RegVal* m_src2 = nullptr;  // second operand read
+  Kind m_kind;  // always set by the constructor
+};
+
+class IR_BlendVF : public IR_Asm {  // .blend.vf: dst from src1 and src2, controlled by a mask
+ public:
+  IR_BlendVF(bool use_color, const RegVal* dst, const RegVal* src1, const RegVal* src2, u8 mask);
+  std::string print() override;
+  RegAllocInstr to_rai() override;
+  void do_codegen(emitter::ObjectGenerator* gen,
+                  const AllocationResult& allocs,
+                  emitter::IR_Record irec) override;
+
+ protected:
+  const RegVal* m_dst = nullptr;  // written by the operation
+  const RegVal* m_src1 = nullptr;  // first operand read
+  const RegVal* m_src2 = nullptr;  // second operand read
+  u8 m_mask = 0xff;  // fourth argument given to IGen::blend_vf
+};
 #endif  // JAK_IR_H
diff --git a/goalc/compiler/Util.cpp b/goalc/compiler/Util.cpp
index 3b7b00ea8a..f396b060b4 100644
--- a/goalc/compiler/Util.cpp
+++ b/goalc/compiler/Util.cpp
@@ -136,12 +136,12 @@ bool Compiler::is_local_symbol(const goos::Object& obj, Env* env) {
   return false;
 }
 
-emitter::RegKind Compiler::get_preferred_reg_kind(const TypeSpec& ts) {
-  switch (m_ts.lookup_type(ts)->get_preferred_reg_kind()) {
-    case RegKind::GPR_64:
-      return emitter::RegKind::GPR;
-    case RegKind::FLOAT:
-      return emitter::RegKind::XMM;
+emitter::HWRegKind Compiler::get_preferred_reg_kind(const TypeSpec& ts) {
+  switch (m_ts.lookup_type(ts)->get_preferred_reg_class()) {
+    case RegClass::GPR_64:
+      return emitter::HWRegKind::GPR;
+    case RegClass::FLOAT:
+      return emitter::HWRegKind::XMM;
     default:
       throw std::runtime_error("Unknown preferred register kind");
   }
diff --git a/goalc/compiler/Val.cpp b/goalc/compiler/Val.cpp
index d35a44d60f..ccbc722b8b 100644
--- a/goalc/compiler/Val.cpp
+++ b/goalc/compiler/Val.cpp
@@ -9,7 +9,7 @@
 RegVal* Val::to_gpr(Env* fe) {
   // TODO - handle 128-bit stuff here!
   auto rv = to_reg(fe);
-  if (rv->ireg().kind == emitter::RegKind::GPR) {
+  if (rv->ireg().reg_class == RegClass::GPR_64) {
     return rv;
   } else {
     auto re = fe->make_gpr(coerce_to_reg_type(m_ts));
@@ -19,14 +19,14 @@ RegVal* Val::to_gpr(Env* fe) {
 }
 
 /*!
- * Fallback to_xmm if a more optimized one is not provided.
+ * Fallback to_fpr if a more optimized one is not provided.
  */
-RegVal* Val::to_xmm(Env* fe) {
+RegVal* Val::to_fpr(Env* fe) {
   auto rv = to_reg(fe);
-  if (rv->ireg().kind == emitter::RegKind::XMM) {
+  if (rv->ireg().reg_class == RegClass::FLOAT) {
     return rv;
   } else {
-    auto re = fe->make_xmm(coerce_to_reg_type(m_ts));
+    auto re = fe->make_fpr(coerce_to_reg_type(m_ts));
     fe->emit(std::make_unique<IR_RegSet>(re, rv));
     return re;
   }
@@ -39,7 +39,7 @@ RegVal* RegVal::to_reg(Env* fe) {
 
 RegVal* RegVal::to_gpr(Env* fe) {
   (void)fe;
-  if (m_ireg.kind == emitter::RegKind::GPR) {
+  if (m_ireg.reg_class == RegClass::GPR_64) {
     return this;
   } else {
     auto re = fe->make_gpr(coerce_to_reg_type(m_ts));
@@ -48,12 +48,12 @@ RegVal* RegVal::to_gpr(Env* fe) {
   }
 }
 
-RegVal* RegVal::to_xmm(Env* fe) {
+RegVal* RegVal::to_fpr(Env* fe) {
   (void)fe;
-  if (m_ireg.kind == emitter::RegKind::XMM) {
+  if (m_ireg.reg_class == RegClass::FLOAT) {
     return this;
   } else {
-    auto re = fe->make_xmm(coerce_to_reg_type(m_ts));
+    auto re = fe->make_fpr(coerce_to_reg_type(m_ts));
     fe->emit(std::make_unique<IR_RegSet>(re, this));
     return re;
   }
@@ -104,15 +104,20 @@ RegVal* InlinedLambdaVal::to_reg(Env* fe) {
 }
 
 RegVal* FloatConstantVal::to_reg(Env* fe) {
-  auto re = fe->make_xmm(coerce_to_reg_type(m_ts));
+  auto re = fe->make_fpr(coerce_to_reg_type(m_ts));
   fe->emit(std::make_unique<IR_StaticVarLoad>(re, m_value));
   return re;
 }
 
 RegVal* MemoryOffsetConstantVal::to_reg(Env* fe) {
   auto re = fe->make_gpr(coerce_to_reg_type(m_ts));
-  fe->emit(std::make_unique<IR_LoadConstant64>(re, int64_t(offset)));
-  fe->emit(std::make_unique<IR_IntegerMath>(IntegerMathKind::ADD_64, re, base->to_gpr(fe)));
+  if (offset == 0) {
+    fe->emit_ir<IR_RegSet>(re, base->to_gpr(fe));
+  } else {
+    fe->emit(std::make_unique<IR_LoadConstant64>(re, int64_t(offset)));
+    fe->emit(std::make_unique<IR_IntegerMath>(IntegerMathKind::ADD_64, re, base->to_gpr(fe)));
+  }
+
   return re;
 }
 
@@ -139,16 +144,16 @@ RegVal* MemoryDerefVal::to_reg(Env* fe) {
   }
 }
 
-RegVal* MemoryDerefVal::to_xmm(Env* fe) {
+RegVal* MemoryDerefVal::to_fpr(Env* fe) {
   // todo, support better loads/stores from the stack
   auto base_as_co = dynamic_cast<MemoryOffsetConstantVal*>(base);
   if (base_as_co) {
-    auto re = fe->make_xmm(coerce_to_reg_type(m_ts));
+    auto re = fe->make_fpr(coerce_to_reg_type(m_ts));
     fe->emit(std::make_unique<IR_LoadConstOffset>(re, base_as_co->offset,
                                                   base_as_co->base->to_gpr(fe), info));
     return re;
   } else {
-    auto re = fe->make_xmm(coerce_to_reg_type(m_ts));
+    auto re = fe->make_fpr(coerce_to_reg_type(m_ts));
     auto addr = base->to_gpr(fe);
     fe->emit(std::make_unique<IR_LoadConstOffset>(re, 0, addr, info));
     return re;
@@ -157,7 +162,7 @@ RegVal* MemoryDerefVal::to_xmm(Env* fe) {
 
 RegVal* AliasVal::to_reg(Env* fe) {
   auto as_old_type = base->to_reg(fe);
-  auto result = fe->make_ireg(m_ts, as_old_type->ireg().kind);
+  auto result = fe->make_ireg(m_ts, as_old_type->ireg().reg_class);
   fe->emit(std::make_unique<IR_RegSet>(result, as_old_type));
   return result;
 }
@@ -174,7 +179,7 @@ RegVal* PairEntryVal::to_reg(Env* fe) {
   int offset = is_car ? -2 : 2;
   auto re = fe->make_gpr(coerce_to_reg_type(m_ts));
   MemLoadInfo info;
-  info.reg = RegKind::GPR_64;
+  info.reg = RegClass::GPR_64;
   info.sign_extend = true;
   info.size = 4;
   fe->emit(std::make_unique<IR_LoadConstOffset>(re, offset, base->to_gpr(fe), info));
@@ -197,7 +202,7 @@ RegVal* BitFieldVal::to_reg(Env* env) {
   auto parent_reg = m_parent->to_gpr(env);
 
   auto fe = get_parent_env_of_type<FunctionEnv>(env);
-  auto result = fe->make_ireg(coerce_to_reg_type(m_ts), emitter::RegKind::GPR);
+  auto result = fe->make_ireg(coerce_to_reg_type(m_ts), RegClass::GPR_64);
   env->emit(std::make_unique<IR_RegSet>(result, parent_reg));
 
   int start_bit = m_offset;
diff --git a/goalc/compiler/Val.h b/goalc/compiler/Val.h
index 142e82ae6a..a968befcfd 100644
--- a/goalc/compiler/Val.h
+++ b/goalc/compiler/Val.h
@@ -40,7 +40,7 @@ class Val {
     throw std::runtime_error("to_reg called on invalid Val: " + print());
   }
   virtual RegVal* to_gpr(Env* fe);
-  virtual RegVal* to_xmm(Env* fe);
+  virtual RegVal* to_fpr(Env* fe);
 
   const TypeSpec& type() const { return m_ts; }
   void set_type(TypeSpec ts) { m_ts = std::move(ts); }
@@ -74,7 +74,7 @@ class RegVal : public Val {
   std::string print() const override { return m_ireg.to_string(); };
   RegVal* to_reg(Env* fe) override;
   RegVal* to_gpr(Env* fe) override;
-  RegVal* to_xmm(Env* fe) override;
+  RegVal* to_fpr(Env* fe) override;
   void set_rlet_constraint(emitter::Register reg);
   const std::optional<emitter::Register>& rlet_constraint() const;
 
@@ -157,7 +157,7 @@ struct MemLoadInfo {
     reg = di.reg;
   }
 
-  RegKind reg = RegKind::INVALID;
+  RegClass reg = RegClass::INVALID;
   bool sign_extend = false;
   int size = -1;
 };
@@ -207,7 +207,7 @@ class MemoryDerefVal : public Val {
       : Val(std::move(ts)), base(_base), info(_info) {}
   std::string print() const override { return "[" + base->print() + "]"; }
   RegVal* to_reg(Env* fe) override;
-  RegVal* to_xmm(Env* fe) override;
+  RegVal* to_fpr(Env* fe) override;
   Val* base = nullptr;
   MemLoadInfo info;
 };
diff --git a/goalc/compiler/compilation/Asm.cpp b/goalc/compiler/compilation/Asm.cpp
index 68113aa761..71796c60c2 100644
--- a/goalc/compiler/compilation/Asm.cpp
+++ b/goalc/compiler/compilation/Asm.cpp
@@ -58,20 +58,22 @@ Val* Compiler::compile_rlet(const goos::Object& form, const goos::Object& rest,
     }
 
     // figure out the class
-    emitter::RegKind register_kind = emitter::RegKind::GPR;
+    RegClass register_class = RegClass::GPR_64;
     if (def_args.has_named("class")) {
       auto& class_name = def_args.named.at("class").as_symbol()->name;
       if (class_name == "gpr") {
-        register_kind = emitter::RegKind::GPR;
-      } else if (class_name == "xmm") {
-        register_kind = emitter::RegKind::XMM;
+        register_class = RegClass::GPR_64;
+      } else if (class_name == "fpr") {
+        register_class = RegClass::FLOAT;
+      } else if (class_name == "vf") {
+        register_class = RegClass::VECTOR_FLOAT;
       } else {
         throw_compiler_error(o, "Register class {} is unknown.", class_name);
       }
     }
 
     // alloc a register:
-    auto new_place_reg = env->make_ireg(ts, register_kind);
+    auto new_place_reg = env->make_ireg(ts, register_class);
     new_place_reg->mark_as_settable();
 
     if (def_args.has_named("reg")) {
@@ -231,11 +233,174 @@ Val* Compiler::compile_asm_mov(const goos::Object& form, const goos::Object& res
   if (args.has_named("color")) {
     color = get_true_or_false(form, args.named.at("color"));
   }
-  auto dest = compile_error_guard(args.unnamed.at(0), env)->to_gpr(env);
+  auto dest = compile_error_guard(args.unnamed.at(0), env)->to_reg(env);
   if (!dest->settable()) {
     throw_compiler_error(form, "Cannot .mov this. Got a {}.", dest->print());
   }
-  auto src = compile_error_guard(args.unnamed.at(1), env)->to_gpr(env);
+  auto src = compile_error_guard(args.unnamed.at(1), env)->to_reg(env);
   env->emit_ir<IR_RegSetAsm>(color, dest, src);
+  return get_none();
+}
+
+/*!
+ * .lvf dst src [:color]: load a 16-byte vector float from memory into dst. Does an aligned load.
+ */
+Val* Compiler::compile_asm_lvf(const goos::Object& form, const goos::Object& rest, Env* env) {
+  auto args = get_va(form, rest);
+  va_check(form, args, {{}, {}}, {{"color", {false, goos::ObjectType::SYMBOL}}});
+  bool color = true;
+  if (args.has_named("color")) {
+    color = get_true_or_false(form, args.named.at("color"));
+  }
+
+  auto dest = compile_error_guard(args.unnamed.at(0), env)->to_reg(env);
+  if (!dest->settable() || dest->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(form, "Cannot .lvf into this. Got a {}.", dest->print());
+  }
+  auto src = compile_error_guard(args.unnamed.at(1), env);
+  auto as_co = dynamic_cast<MemoryOffsetConstantVal*>(src);
+  auto as_sv = dynamic_cast<StaticVal*>(src);
+  MemLoadInfo info;
+  info.sign_extend = false;
+  info.size = 16;
+  info.reg = RegClass::VECTOR_FLOAT;
+  if (as_co) {
+    // can do a clever offset here, but this path is still blocked by the assert below
+    assert(false);
+    env->emit_ir<IR_LoadConstOffset>(dest, as_co->offset, as_co->base->to_gpr(env), info, color);
+  } else if (as_sv) {
+    if (!color) {
+      throw std::runtime_error("no color nyi for static loads");
+    }
+    env->emit_ir<IR_StaticVarLoad>(dest, as_sv->obj);
+  } else {
+    env->emit_ir<IR_LoadConstOffset>(dest, 0, src->to_gpr(env), info, color);
+  }
+  return get_none();
+}
+
+/*!
+ * .svf dst src [:color]: store a 16-byte vector float into memory. Does an aligned store.
+ */
+Val* Compiler::compile_asm_svf(const goos::Object& form, const goos::Object& rest, Env* env) {
+  auto args = get_va(form, rest);
+  va_check(form, args, {{}, {}}, {{"color", {false, goos::ObjectType::SYMBOL}}});
+  bool color = true;
+  if (args.has_named("color")) {
+    color = get_true_or_false(form, args.named.at("color"));
+  }
+
+  auto dest = compile_error_guard(args.unnamed.at(0), env);
+  auto src = compile_error_guard(args.unnamed.at(1), env)->to_reg(env);
+
+  if (!src->settable() || src->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(form, "Cannot .svf from this. Got a {}.", src->print());
+  }
+
+  auto as_co = dynamic_cast<MemoryOffsetConstantVal*>(dest);
+  MemLoadInfo info;
+  info.sign_extend = false;
+  info.size = 16;
+  info.reg = RegClass::VECTOR_FLOAT;
+  if (as_co) {
+    // can do a clever offset here, but this path is still blocked by the assert below
+    assert(false);
+    env->emit_ir<IR_StoreConstOffset>(src, as_co->offset, as_co->base->to_gpr(env), 16, color);
+  } else {
+    env->emit_ir<IR_StoreConstOffset>(src, 0, dest->to_gpr(env), 16, color);
+  }
+  return get_none();
+}
+
+Val* Compiler::compile_asm_xor_vf(const goos::Object& form, const goos::Object& rest, Env* env) {
+  return compile_asm_vf_math3(form, rest, IR_VFMath3Asm::Kind::XOR, env);
+}
+
+Val* Compiler::compile_asm_sub_vf(const goos::Object& form, const goos::Object& rest, Env* env) {
+  return compile_asm_vf_math3(form, rest, IR_VFMath3Asm::Kind::SUB, env);
+}
+
+Val* Compiler::compile_asm_add_vf(const goos::Object& form, const goos::Object& rest, Env* env) {
+  return compile_asm_vf_math3(form, rest, IR_VFMath3Asm::Kind::ADD, env);
+}
+
+Val* Compiler::compile_asm_blend_vf(const goos::Object& form, const goos::Object& rest, Env* env) {
+  auto args = get_va(form, rest);
+  va_check(form, args, {{}, {}, {}, {}}, {{"color", {false, goos::ObjectType::SYMBOL}}});
+  bool color = true;
+  if (args.has_named("color")) {
+    color = get_true_or_false(form, args.named.at("color"));
+  }
+
+  auto dest = compile_error_guard(args.unnamed.at(0), env)->to_reg(env);
+  if (!dest->settable() || dest->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(
+        form, "Invalid destination register for a vector float 3-arg math form. Got a {}.",
+        dest->print());
+  }
+
+  auto src1 = compile_error_guard(args.unnamed.at(1), env)->to_reg(env);
+  if (src1->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(
+        form, "Invalid first source register for a vector float 3-arg math form. Got a {}.",
+        src1->print());
+  }
+
+  auto src2 = compile_error_guard(args.unnamed.at(2), env)->to_reg(env);
+  if (src2->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(
+        form, "Invalid second source register for a vector float 3-arg math form. Got a {}.",
+        src2->print());
+  }
+
+  int64_t mask;
+  if (!try_getting_constant_integer(args.unnamed.at(3), &mask, env)) {
+    throw_compiler_error(form,
+                         "The value {} is invalid for a blend mask, it could not be evaluated as a "
+                         "constant integer.",
+                         args.unnamed.at(3).print());
+  }
+
+  if (mask < 0 || mask > 15) {
+    throw_compiler_error(form, "The value {} is out of range for a blend mask.", mask);
+  }
+  env->emit_ir<IR_BlendVF>(color, dest, src1, src2, mask);
+  return get_none();
+}
+
+Val* Compiler::compile_asm_vf_math3(const goos::Object& form,
+                                    const goos::Object& rest,
+                                    IR_VFMath3Asm::Kind kind,
+                                    Env* env) {
+  auto args = get_va(form, rest);
+  va_check(form, args, {{}, {}, {}}, {{"color", {false, goos::ObjectType::SYMBOL}}});
+  bool color = true;
+  if (args.has_named("color")) {
+    color = get_true_or_false(form, args.named.at("color"));
+  }
+
+  auto dest = compile_error_guard(args.unnamed.at(0), env)->to_reg(env);
+  if (!dest->settable() || dest->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(
+        form, "Invalid destination register for a vector float 3-arg math form. Got a {}.",
+        dest->print());
+  }
+
+  auto src1 = compile_error_guard(args.unnamed.at(1), env)->to_reg(env);
+  if (src1->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(
+        form, "Invalid first source register for a vector float 3-arg math form. Got a {}.",
+        src1->print());
+  }
+
+  auto src2 = compile_error_guard(args.unnamed.at(2), env)->to_reg(env);
+  if (src2->ireg().reg_class != RegClass::VECTOR_FLOAT) {
+    throw_compiler_error(
+        form, "Invalid second source register for a vector float 3-arg math form. Got a {}.",
+        src2->print());
+  }
+
+  env->emit_ir<IR_VFMath3Asm>(color, dest, src1, src2, kind);
+
   return get_none();
 }
\ No newline at end of file
diff --git a/goalc/compiler/compilation/Atoms.cpp b/goalc/compiler/compilation/Atoms.cpp
index 359242b021..88520ea0c7 100644
--- a/goalc/compiler/compilation/Atoms.cpp
+++ b/goalc/compiler/compilation/Atoms.cpp
@@ -23,6 +23,12 @@ static const std::unordered_map<
         {".add", &Compiler::compile_asm_add},
         {".load-sym", &Compiler::compile_asm_load_sym},
         {".mov", &Compiler::compile_asm_mov},
+        {".lvf", &Compiler::compile_asm_lvf},
+        {".svf", &Compiler::compile_asm_svf},
+        {".xor.vf", &Compiler::compile_asm_xor_vf},
+        {".sub.vf", &Compiler::compile_asm_sub_vf},
+        {".add.vf", &Compiler::compile_asm_add_vf},
+        {".blend.vf", &Compiler::compile_asm_blend_vf},
 
         // BLOCK FORMS
         {"top-level", &Compiler::compile_top_level},
diff --git a/goalc/compiler/compilation/ControlFlow.cpp b/goalc/compiler/compilation/ControlFlow.cpp
index e2d490e007..d01170df61 100644
--- a/goalc/compiler/compilation/ControlFlow.cpp
+++ b/goalc/compiler/compilation/ControlFlow.cpp
@@ -90,8 +90,8 @@ Condition Compiler::compile_condition(const goos::Object& condition, Env* env, b
 
         // pick between a floating point and an integer comparison.
         if (is_float(first_arg->type())) {
-          gc.a = first_arg->to_xmm(env);
-          gc.b = second_arg->to_xmm(env);
+          gc.a = first_arg->to_fpr(env);
+          gc.b = second_arg->to_fpr(env);
           gc.is_float = true;
         } else {
           gc.a = first_arg->to_gpr(env);
diff --git a/goalc/compiler/compilation/Function.cpp b/goalc/compiler/compilation/Function.cpp
index 3319407f51..2f421b9b61 100644
--- a/goalc/compiler/compilation/Function.cpp
+++ b/goalc/compiler/compilation/Function.cpp
@@ -153,7 +153,7 @@ Val* Compiler::compile_lambda(const goos::Object& form, const goos::Object& rest
     for (u32 i = 0; i < lambda.params.size(); i++) {
       IRegConstraint constr;
       constr.instr_idx = 0;  // constraint at function start
-      auto ireg = new_func_env->make_ireg(lambda.params.at(i).type, emitter::RegKind::GPR);
+      auto ireg = new_func_env->make_gpr(lambda.params.at(i).type);
       ireg->mark_as_settable();
       constr.ireg = ireg->ireg();
       constr.desired_register = emitter::gRegInfo.get_arg_reg(i);
@@ -165,7 +165,7 @@ Val* Compiler::compile_lambda(const goos::Object& form, const goos::Object& rest
     place->func = new_func_env.get();
 
     // nasty function block env setup
-    auto return_reg = new_func_env->make_ireg(get_none()->type(), emitter::RegKind::GPR);
+    auto return_reg = new_func_env->make_gpr(get_none()->type());
     auto func_block_env = new_func_env->alloc_env<BlockEnv>(new_func_env.get(), "#f");
     func_block_env->return_value = return_reg;
     func_block_env->end_label = Label(new_func_env.get());
@@ -357,7 +357,7 @@ Val* Compiler::compile_function_or_method_call(const goos::Object& form, Env* en
       // note, inlined functions will get a more specific type if possible
       // todo, is this right?
       auto type = eval_args.at(i)->type();
-      auto copy = env->make_ireg(type, get_preferred_reg_kind(type));
+      auto copy = env->make_ireg(type, m_ts.lookup_type(type)->get_preferred_reg_class());
       env->emit(std::make_unique<IR_RegSet>(copy, eval_args.at(i)));
       copy->mark_as_settable();
       lexical_env->vars[head_as_lambda->lambda.params.at(i).name] = copy;
@@ -368,8 +368,7 @@ Val* Compiler::compile_function_or_method_call(const goos::Object& form, Env* en
     RegVal* result_reg_if_return_from = nullptr;
     if (auto_inline || got_inlined_lambda) {
       inlined_block_env = fe->alloc_env<BlockEnv>(inlined_compile_env, "#f");
-      result_reg_if_return_from =
-          inlined_compile_env->make_ireg(get_none()->type(), emitter::RegKind::GPR);
+      result_reg_if_return_from = inlined_compile_env->make_gpr(get_none()->type());
       inlined_block_env->return_value = result_reg_if_return_from;
       inlined_block_env->end_label = Label(fe);
       inlined_compile_env = inlined_block_env;
@@ -474,7 +473,7 @@ Val* Compiler::compile_real_function_call(const goos::Object& form,
     return_ts = function->type().last_arg();
   }
 
-  auto return_reg = env->make_ireg(return_ts, emitter::RegKind::GPR);
+  auto return_reg = env->make_gpr(return_ts);
 
   // check arg count:
   if (function->type().arg_count() && !is_varargs_function(function->type())) {
@@ -501,7 +500,7 @@ Val* Compiler::compile_real_function_call(const goos::Object& form,
   // set args (introducing a move here makes coloring more likely to be possible)
   std::vector<RegVal*> arg_outs;
   for (auto& arg : args) {
-    arg_outs.push_back(env->make_ireg(arg->type(), emitter::RegKind::GPR));
+    arg_outs.push_back(env->make_gpr(arg->type()));
     arg_outs.back()->mark_as_settable();
     env->emit(std::make_unique<IR_RegSet>(arg_outs.back(), arg));
   }
diff --git a/goalc/compiler/compilation/Math.cpp b/goalc/compiler/compilation/Math.cpp
index 6cf729a318..2c7c3cac3f 100644
--- a/goalc/compiler/compilation/Math.cpp
+++ b/goalc/compiler/compilation/Math.cpp
@@ -46,7 +46,7 @@ Val* Compiler::number_to_integer(const goos::Object& form, Val* in, Env* env) {
   } else if (is_float(ts)) {
     auto fe = get_parent_env_of_type<FunctionEnv>(env);
     auto result = fe->make_gpr(m_ts.make_typespec("int"));
-    env->emit(std::make_unique<IR_FloatToInt>(result, in->to_xmm(env)));
+    env->emit(std::make_unique<IR_FloatToInt>(result, in->to_fpr(env)));
     return result;
   } else if (is_integer(ts)) {
     return in;
@@ -84,7 +84,7 @@ Val* Compiler::number_to_float(const goos::Object& form, Val* in, Env* env) {
     return in;
   } else if (is_integer(ts)) {
     auto fe = get_parent_env_of_type<FunctionEnv>(env);
-    auto result = fe->make_xmm(m_ts.make_typespec("float"));
+    auto result = fe->make_fpr(m_ts.make_typespec("float"));
     env->emit(std::make_unique<IR_IntToFloat>(result, in->to_gpr(env)));
     return result;
   }
@@ -132,14 +132,14 @@ Val* Compiler::compile_add(const goos::Object& form, const goos::Object& rest, E
     }
 
     case MATH_FLOAT: {
-      auto result = env->make_xmm(first_type);
-      env->emit(std::make_unique<IR_RegSet>(result, first_val->to_xmm(env)));
+      auto result = env->make_fpr(first_type);
+      env->emit(std::make_unique<IR_RegSet>(result, first_val->to_fpr(env)));
 
       for (size_t i = 1; i < args.unnamed.size(); i++) {
         env->emit(std::make_unique<IR_FloatMath>(
             FloatMathKind::ADD_SS, result,
             to_math_type(form, compile_error_guard(args.unnamed.at(i), env), math_type, env)
-                ->to_xmm(env)));
+                ->to_fpr(env)));
       }
       return result;
     }
@@ -178,14 +178,14 @@ Val* Compiler::compile_mul(const goos::Object& form, const goos::Object& rest, E
       return result;
     }
     case MATH_FLOAT: {
-      auto result = env->make_xmm(first_type);
-      env->emit(std::make_unique<IR_RegSet>(result, first_val->to_xmm(env)));
+      auto result = env->make_fpr(first_type);
+      env->emit(std::make_unique<IR_RegSet>(result, first_val->to_fpr(env)));
 
       for (size_t i = 1; i < args.unnamed.size(); i++) {
         env->emit(std::make_unique<IR_FloatMath>(
             FloatMathKind::MUL_SS, result,
             to_math_type(form, compile_error_guard(args.unnamed.at(i), env), math_type, env)
-                ->to_xmm(env)));
+                ->to_fpr(env)));
       }
       return result;
     }
@@ -210,14 +210,14 @@ Val* Compiler::compile_fmin(const goos::Object& form, const goos::Object& rest,
   if (get_math_mode(first_val->type()) != MATH_FLOAT) {
     throw_compiler_error(form, "Must use floats in fmin");
   }
-  auto result = env->make_xmm(first_val->type());
-  env->emit(std::make_unique<IR_RegSet>(result, first_val->to_xmm(env)));
+  auto result = env->make_fpr(first_val->type());
+  env->emit(std::make_unique<IR_RegSet>(result, first_val->to_fpr(env)));
   for (size_t i = 1; i < args.unnamed.size(); i++) {
     auto val = compile_error_guard(args.unnamed.at(i), env);
     if (get_math_mode(val->type()) != MATH_FLOAT) {
       throw_compiler_error(form, "Must use floats in fmin");
     }
-    env->emit(std::make_unique<IR_FloatMath>(FloatMathKind::MIN_SS, result, val->to_xmm(env)));
+    env->emit(std::make_unique<IR_FloatMath>(FloatMathKind::MIN_SS, result, val->to_fpr(env)));
   }
   return result;
 }
@@ -233,14 +233,14 @@ Val* Compiler::compile_fmax(const goos::Object& form, const goos::Object& rest,
   if (get_math_mode(first_val->type()) != MATH_FLOAT) {
     throw_compiler_error(form, "Must use floats in fmax");
   }
-  auto result = env->make_xmm(first_val->type());
-  env->emit(std::make_unique<IR_RegSet>(result, first_val->to_xmm(env)));
+  auto result = env->make_fpr(first_val->type());
+  env->emit(std::make_unique<IR_RegSet>(result, first_val->to_fpr(env)));
   for (size_t i = 1; i < args.unnamed.size(); i++) {
     auto val = compile_error_guard(args.unnamed.at(i), env);
     if (get_math_mode(val->type()) != MATH_FLOAT) {
       throw_compiler_error(form, "Must use floats in fmax");
     }
-    env->emit(std::make_unique<IR_FloatMath>(FloatMathKind::MAX_SS, result, val->to_xmm(env)));
+    env->emit(std::make_unique<IR_FloatMath>(FloatMathKind::MAX_SS, result, val->to_fpr(env)));
   }
   return result;
 }
@@ -316,23 +316,23 @@ Val* Compiler::compile_sub(const goos::Object& form, const goos::Object& rest, E
     case MATH_FLOAT:
       if (args.unnamed.size() == 1) {
         auto result =
-            compile_float(0, env, get_parent_env_of_type<FunctionEnv>(env)->segment)->to_xmm(env);
+            compile_float(0, env, get_parent_env_of_type<FunctionEnv>(env)->segment)->to_fpr(env);
         env->emit(std::make_unique<IR_FloatMath>(
             FloatMathKind::SUB_SS, result,
             to_math_type(form, compile_error_guard(args.unnamed.at(0), env), math_type, env)
-                ->to_xmm(env)));
+                ->to_fpr(env)));
         return result;
       } else {
-        auto result = env->make_xmm(first_type);
+        auto result = env->make_fpr(first_type);
         env->emit(std::make_unique<IR_RegSet>(
             result, to_math_type(form, compile_error_guard(args.unnamed.at(0), env), math_type, env)
-                        ->to_xmm(env)));
+                        ->to_fpr(env)));
 
         for (size_t i = 1; i < args.unnamed.size(); i++) {
           env->emit(std::make_unique<IR_FloatMath>(
               FloatMathKind::SUB_SS, result,
               to_math_type(form, compile_error_guard(args.unnamed.at(i), env), math_type, env)
-                  ->to_xmm(env)));
+                  ->to_fpr(env)));
         }
         return result;
       }
@@ -360,7 +360,7 @@ Val* Compiler::compile_div(const goos::Object& form, const goos::Object& rest, E
     case MATH_INT: {
       auto fe = get_parent_env_of_type<FunctionEnv>(env);
       auto first_thing = first_val->to_gpr(env);
-      auto result = env->make_ireg(first_type, emitter::RegKind::GPR);
+      auto result = env->make_gpr(first_type);
       env->emit(std::make_unique<IR_RegSet>(result, first_thing));
 
       IRegConstraint result_rax_constraint;
@@ -377,12 +377,12 @@ Val* Compiler::compile_div(const goos::Object& form, const goos::Object& rest, E
     }
 
     case MATH_FLOAT: {
-      auto result = env->make_xmm(first_type);
-      env->emit(std::make_unique<IR_RegSet>(result, first_val->to_xmm(env)));
+      auto result = env->make_fpr(first_type);
+      env->emit(std::make_unique<IR_RegSet>(result, first_val->to_fpr(env)));
       env->emit(std::make_unique<IR_FloatMath>(
           FloatMathKind::DIV_SS, result,
           to_math_type(form, compile_error_guard(args.unnamed.at(1), env), math_type, env)
-              ->to_xmm(env)));
+              ->to_fpr(env)));
       return result;
     }
 
diff --git a/goalc/compiler/compilation/Type.cpp b/goalc/compiler/compilation/Type.cpp
index 94547952ac..c875b6b5d0 100644
--- a/goalc/compiler/compilation/Type.cpp
+++ b/goalc/compiler/compilation/Type.cpp
@@ -65,7 +65,7 @@ RegVal* Compiler::compile_get_method_of_object(const goos::Object& form,
     MemLoadInfo info;
     info.size = 4;
     info.sign_extend = false;
-    info.reg = RegKind::GPR_64;
+    info.reg = RegClass::GPR_64;
     env->emit(std::make_unique<IR_LoadConstOffset>(runtime_type, -4, object, info));
   } else {
     // can't look up at runtime
@@ -174,8 +174,8 @@ Val* Compiler::generate_inspector_for_type(const goos::Object& form, Env* env, T
   method_env->set_segment(DEBUG_SEGMENT);
 
   // Create a register which will hold the input to the inspect method
-  auto input = method_env->make_ireg(structured_type->get_name(), emitter::RegKind::GPR);
-  // "Constraint" this register to be the register that the function argument is passed in
+  auto input = method_env->make_gpr(structured_type->get_name());
+  // "Constrain" this register to be the register that the function argument is passed in
   IRegConstraint constraint;
   constraint.instr_idx = 0;         // constraint at the start of the function
   constraint.ireg = input->ireg();  // constrain this register
@@ -326,7 +326,7 @@ Val* Compiler::compile_defmethod(const goos::Object& form, const goos::Object& _
   for (u32 i = 0; i < lambda.params.size(); i++) {
     IRegConstraint constr;
     constr.instr_idx = 0;  // constraint at function start
-    auto ireg = new_func_env->make_ireg(lambda.params.at(i).type, emitter::RegKind::GPR);
+    auto ireg = new_func_env->make_gpr(lambda.params.at(i).type);
     ireg->mark_as_settable();
     constr.ireg = ireg->ireg();
     constr.desired_register = emitter::gRegInfo.get_arg_reg(i);
@@ -338,7 +338,7 @@ Val* Compiler::compile_defmethod(const goos::Object& form, const goos::Object& _
   place->func = new_func_env.get();
 
   // nasty function block env setup
-  auto return_reg = new_func_env->make_ireg(get_none()->type(), emitter::RegKind::GPR);
+  auto return_reg = new_func_env->make_gpr(get_none()->type());
   auto func_block_env = new_func_env->alloc_env<BlockEnv>(new_func_env.get(), "#f");
   func_block_env->return_value = return_reg;
   func_block_env->end_label = Label(new_func_env.get());
diff --git a/goalc/emitter/IGen.h b/goalc/emitter/IGen.h
index 3a2ed06a75..d5f6d32f4a 100644
--- a/goalc/emitter/IGen.h
+++ b/goalc/emitter/IGen.h
@@ -121,7 +121,6 @@ class IGen {
 
   // todo - GPR64 -> XMM64 (zext)
   // todo - XMM -> GPR64
-  // todo - XMM128 - XMM128
 
   //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   //   GOAL Loads and Stores
@@ -706,6 +705,17 @@ class IGen {
     return instr;
   }
 
+  static Instruction store_goal_vf(Register addr, Register value, Register off, s64 offset) {
+    if (offset == 0) {
+      return storevf_gpr64_plus_gpr64(value, addr, off);
+    } else if (offset >= INT8_MIN && offset <= INT8_MAX) {
+      return storevf_gpr64_plus_gpr64_plus_s8(value, addr, off, offset);
+    }
+    // must fit in s32; every path returns, so nothing falls off the end when asserts are off
+    assert(offset >= INT32_MIN && offset <= INT32_MAX);
+    return storevf_gpr64_plus_gpr64_plus_s32(value, addr, off, offset);
+  }
+
   static Instruction store_goal_gpr(Register addr,
                                     Register value,
                                     Register off,
@@ -757,6 +767,18 @@ class IGen {
     }
   }
 
+  static Instruction load_goal_vf(Register dst, Register addr, Register off, int offset) {
+    // Use the shortest addressing form that can hold the displacement.
+    if (offset == 0) {
+      return loadvf_gpr64_plus_gpr64(dst, addr, off);
+    } else if (offset >= INT8_MIN && offset <= INT8_MAX) {
+      return loadvf_gpr64_plus_gpr64_plus_s8(dst, addr, off, offset);
+    }
+    // Every remaining offset takes the s32 form, which asserts the range itself. Returning
+    // unconditionally here leaves no path that ends without a return value.
+    return loadvf_gpr64_plus_gpr64_plus_s32(dst, addr, off, offset);
+  }
+
   /*!
    * Load memory at addr + offset, where addr is a GOAL pointer and off is the offset register.
    * This will pick the appropriate fancy addressing mode instruction.
@@ -1820,6 +1842,203 @@ class IGen {
     i.is_null = true;
     return i;
   }
+
+  /////////////////////////////
+  // AVX (VF - Vector Float) //
+  /////////////////////////////
+  static Instruction mov_vf_vf(Register dst, Register src) {
+    assert(dst.is_xmm());
+    assert(src.is_xmm());
+
+    if (src.hw_id() >= 8 && dst.hw_id() < 8) {
+      // in this case, we can use the 0x29 encoding, which swaps src and dst, in order to use the
+      // 2 byte VEX prefix, where the 0x28 encoding would require an extra byte.
+      // compilers/assemblers seem to prefer 0x28, unless 0x29 would save you a byte.
+      Instruction instr(0x29);
+      instr.set_vex_modrm_and_rex(src.hw_id(), dst.hw_id(), 3, VEX3::LeadingBytes::P_0F, false);
+      return instr;
+    } else {
+      Instruction instr(0x28);
+      instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, VEX3::LeadingBytes::P_0F, false);
+      return instr;
+    }
+  }
+
+  static Instruction loadvf_gpr64_plus_gpr64(Register dst, Register addr1, Register addr2) {
+    assert(dst.is_xmm());
+    assert(addr1.is_gpr());
+    assert(addr2.is_gpr());
+    assert(addr1 != addr2);
+    assert(addr1 != RSP);
+    assert(addr2 != RSP);
+    Instruction instr(0x28);
+    instr.set_vex_modrm_and_rex_for_reg_plus_reg_addr(dst.hw_id(), addr1.hw_id(), addr2.hw_id(),
+                                                      VEX3::LeadingBytes::P_0F, false);
+    return instr;
+  }
+
+  static Instruction loadvf_gpr64_plus_gpr64_plus_s8(Register dst,
+                                                     Register addr1,
+                                                     Register addr2,
+                                                     s64 offset) {
+    assert(dst.is_xmm());
+    assert(addr1.is_gpr());
+    assert(addr2.is_gpr());
+    assert(addr1 != addr2);
+    assert(addr1 != RSP);
+    assert(addr2 != RSP);
+    assert(offset >= INT8_MIN && offset <= INT8_MAX);
+    Instruction instr(0x28);
+    instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s8(dst.hw_id(), addr1.hw_id(), addr2.hw_id(),
+                                                         offset, VEX3::LeadingBytes::P_0F, false);
+    return instr;
+  }
+
+  static Instruction loadvf_gpr64_plus_gpr64_plus_s32(Register dst,
+                                                      Register addr1,
+                                                      Register addr2,
+                                                      s64 offset) {
+    assert(dst.is_xmm());
+    assert(addr1.is_gpr());
+    assert(addr2.is_gpr());
+    assert(addr1 != addr2);
+    assert(addr1 != RSP);
+    assert(addr2 != RSP);
+    assert(offset >= INT32_MIN && offset <= INT32_MAX);
+    Instruction instr(0x28);
+    instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s32(dst.hw_id(), addr1.hw_id(), addr2.hw_id(),
+                                                          offset, VEX3::LeadingBytes::P_0F, false);
+    return instr;
+  }
+
+  static Instruction storevf_gpr64_plus_gpr64(Register value, Register addr1, Register addr2) {
+    assert(value.is_xmm());
+    assert(addr1.is_gpr());
+    assert(addr2.is_gpr());
+    assert(addr1 != addr2);
+    assert(addr1 != RSP);
+    assert(addr2 != RSP);
+    Instruction instr(0x29);
+    instr.set_vex_modrm_and_rex_for_reg_plus_reg_addr(value.hw_id(), addr1.hw_id(), addr2.hw_id(),
+                                                      VEX3::LeadingBytes::P_0F, false);
+    return instr;
+  }
+
+  static Instruction storevf_gpr64_plus_gpr64_plus_s8(Register value,
+                                                      Register addr1,
+                                                      Register addr2,
+                                                      s64 offset) {
+    assert(value.is_xmm());
+    assert(addr1.is_gpr());
+    assert(addr2.is_gpr());
+    assert(addr1 != addr2);
+    assert(addr1 != RSP);
+    assert(addr2 != RSP);
+    assert(offset >= INT8_MIN && offset <= INT8_MAX);
+    Instruction instr(0x29);
+    instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s8(
+        value.hw_id(), addr1.hw_id(), addr2.hw_id(), offset, VEX3::LeadingBytes::P_0F, false);
+    return instr;
+  }
+
+  static Instruction storevf_gpr64_plus_gpr64_plus_s32(Register value,
+                                                       Register addr1,
+                                                       Register addr2,
+                                                       s64 offset) {
+    assert(value.is_xmm());
+    assert(addr1.is_gpr());
+    assert(addr2.is_gpr());
+    assert(addr1 != addr2);
+    assert(addr1 != RSP);
+    assert(addr2 != RSP);
+    assert(offset >= INT32_MIN && offset <= INT32_MAX);
+    Instruction instr(0x29);
+    instr.set_vex_modrm_and_rex_for_reg_plus_reg_plus_s32(
+        value.hw_id(), addr1.hw_id(), addr2.hw_id(), offset, VEX3::LeadingBytes::P_0F, false);
+    return instr;
+  }
+
+  static Instruction loadvf_rip_plus_s32(Register dest, s64 offset) {
+    assert(dest.is_xmm());
+    assert(offset >= INT32_MIN);
+    assert(offset <= INT32_MAX);
+    Instruction instr(0x28);
+    instr.set_vex_modrm_and_rex_for_rip_plus_s32(dest.hw_id(), offset);
+    return instr;
+  }
+
+  // todo, rip relative stores (a rip relative load is provided by loadvf_rip_plus_s32).
+
+  static Instruction mul_vf(Register dst, Register src1, Register src2) {
+    assert(dst.is_xmm());
+    assert(src1.is_xmm());
+    assert(src2.is_xmm());
+    Instruction instr(0x59);
+    instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id());
+    return instr;
+  }
+
+  static Instruction shuffle_vf(Register dst, Register src, u8 dx, u8 dy, u8 dz, u8 dw) {
+    assert(dst.is_xmm());
+    assert(src.is_xmm());
+    assert(dx < 4);
+    assert(dy < 4);
+    assert(dz < 4);
+    assert(dw < 4);
+    u8 imm = dx + (dy << 2) + (dz << 4) + (dw << 6);
+    // we use the AVX "VEX" encoding here. This is a three-operand form, but we just set both
+    // sources to the same register. It seems like this is one byte longer but is faster maybe?
+    Instruction instr(0xc6);
+    instr.set_vex_modrm_and_rex(dst.hw_id(), src.hw_id(), VEX3::LeadingBytes::P_0F, src.hw_id());
+    instr.set(Imm(1, imm));
+    return instr;
+
+    // SSE encoding version:
+    //    Instruction instr(0x0f);
+    //    instr.set_op2(0xc6);
+    //    instr.set_modrm_and_rex(dst.hw_id(), src.hw_id(), 3, false);
+    //    instr.set(Imm(1, imm));
+    //    return instr;
+  }
+
+  static Instruction xor_vf(Register dst, Register src1, Register src2) {
+    assert(dst.is_xmm());
+    assert(src1.is_xmm());
+    assert(src2.is_xmm());
+    Instruction instr(0x57);
+    instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id());
+    return instr;
+  }
+
+  static Instruction sub_vf(Register dst, Register src1, Register src2) {
+    assert(dst.is_xmm());
+    assert(src1.is_xmm());
+    assert(src2.is_xmm());
+    Instruction instr(0x5c);
+    instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id());
+    return instr;
+  }
+
+  static Instruction add_vf(Register dst, Register src1, Register src2) {
+    assert(dst.is_xmm());
+    assert(src1.is_xmm());
+    assert(src2.is_xmm());
+    Instruction instr(0x58);
+    instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F, src1.hw_id());
+    return instr;
+  }
+
+  static Instruction blend_vf(Register dst, Register src1, Register src2, u8 mask) {
+    assert(!(mask & 0b11110000));
+    assert(dst.is_xmm());
+    assert(src1.is_xmm());
+    assert(src2.is_xmm());
+    Instruction instr(0x0c);
+    instr.set_vex_modrm_and_rex(dst.hw_id(), src2.hw_id(), VEX3::LeadingBytes::P_0F_3A,
+                                src1.hw_id(), false, VexPrefix::P_66);
+    instr.set(Imm(1, mask));
+    return instr;
+  }
 };
 }  // namespace emitter
 
diff --git a/goalc/emitter/Instruction.h b/goalc/emitter/Instruction.h
index b4e19d6b35..8c004a0f0a 100644
--- a/goalc/emitter/Instruction.h
+++ b/goalc/emitter/Instruction.h
@@ -54,6 +54,83 @@ struct REX {
   uint8_t operator()() const { return (1 << 6) | (W << 3) | (R << 2) | (X << 1) | (B << 0); }
 };
 
+// Values of the 2-bit VEX "pp" field, which stands in for a legacy 66 / F3 / F2 prefix.
+enum class VexPrefix : u8 { P_NONE = 0, P_66 = 1, P_F3 = 2, P_F2 = 3 };
+/*!
+ * The 3-byte VEX prefix for AVX instructions: 0xC4, then ~R ~X ~B mmmmm, then W ~vvvv L pp.
+ */
+struct VEX3 {
+  bool W, R, X, B;  // same roles as the REX bits; R, X and B are inverted when emitted
+  enum class LeadingBytes : u8 { P_INVALID = 0, P_0F = 1, P_0F_38 = 2, P_0F_3A = 3 } leading_bytes;
+  u8 reg_id;         // additional register operand (vvvv), emitted in inverted form
+  VexPrefix prefix;  // implied legacy prefix (pp)
+  bool L;            // vector length bit (0 selects 128-bit operation)
+
+  u8 emit(u8 byte) const {  // encode prefix byte number `byte` (0, 1 or 2)
+    if (byte == 0) {
+      return 0b11000100;  // 0xC4: three-byte VEX escape
+    } else if (byte == 1) {
+      u8 result = 0;
+      result |= ((!R) << 7);
+      result |= ((!X) << 6);
+      result |= ((!B) << 5);
+      result |= (0b11111 & u8(leading_bytes));  // mmmmm: opcode map select
+      return result;
+    } else if (byte == 2) {
+      u8 result = 0;
+      result |= (W << 7);  // VEX.W is stored as-is; only R, X, B and vvvv are inverted
+      result |= ((~reg_id) & 0b1111) << 3;
+      result |= (L << 2);
+      result |= (u8(prefix) & 0b11);
+      return result;
+    }
+    assert(false);  // a VEX3 prefix has exactly three bytes
+    return 0;       // never flow off the end of a value-returning function when asserts are off
+  }
+
+  VEX3(bool w,
+       bool r,
+       bool x,
+       bool b,
+       LeadingBytes _leading_bytes,
+       u8 _reg_id = 0,
+       VexPrefix _prefix = VexPrefix::P_NONE,
+       bool l = false)
+      : W(w),
+        R(r),
+        X(x),
+        B(b),
+        leading_bytes(_leading_bytes),
+        reg_id(_reg_id),
+        prefix(_prefix),
+        L(l) {}
+};
+
+struct VEX2 {  // 2-byte VEX prefix (0xC5); it has no X, B, W or opcode-map fields
+  bool R;
+  u8 reg_id;
+  VexPrefix prefix;
+  bool L;
+
+  u8 emit(u8 byte) const {  // encode prefix byte number `byte` (0 or 1)
+    if (byte == 0) {
+      return 0b11000101;  // 0xC5: two-byte VEX escape
+    } else if (byte == 1) {
+      u8 result = 0;
+      result |= ((!R) << 7);                // R is stored inverted
+      result |= ((~reg_id) & 0b1111) << 3;  // vvvv is stored inverted
+      result |= (L << 2);
+      result |= (u8(prefix) & 0b11);
+      return result;
+    }
+    assert(false);  // a VEX2 prefix has exactly two bytes
+    return 0;       // never flow off the end of a value-returning function when asserts are off
+  }
+
+  VEX2(bool r, u8 _reg_id = 0, VexPrefix _prefix = VexPrefix::P_NONE, bool l = false)
+      : R(r), reg_id(_reg_id), prefix(_prefix), L(l) {}
+};
+
 /*!
  * A high-level description of an x86-64 opcode.  It can emit itself.
  */
@@ -73,6 +150,9 @@ struct Instruction {
   // flag to indicate it's the first instruction of a function and needs align and type tag
   bool is_function_start = false;
 
+  int n_vex = 0;               // number of VEX prefix bytes to emit: 0 (no VEX), 2 or 3
+  uint8_t vex[3] = {0, 0, 0};  // encoded VEX prefix bytes; only the first n_vex are emitted
+
   // the rex byte
   bool set_rex = false;
   uint8_t m_rex = 0;
@@ -93,10 +173,6 @@ struct Instruction {
   bool set_imm = false;
   Imm imm;
 
-  // which IR instruction does this go with?
-  // this is only set for the first instruction generated from an IR.
-  int ir_index = -1;
-
   /*!
    * Move opcode byte 0 to before the rex prefix.
    */
@@ -123,6 +199,20 @@ struct Instruction {
     set_sib = true;
   }
 
+  void set(VEX3 vex3) {
+    n_vex = 3;  // three-byte form: cache every encoded prefix byte
+    vex[0] = vex3.emit(0);
+    vex[1] = vex3.emit(1);
+    vex[2] = vex3.emit(2);
+  }
+
+  void set(VEX2 vex2) {
+    // Two-byte form: vex[2] is left untouched and is not emitted while n_vex == 2.
+    n_vex = 2;
+    vex[0] = vex2.emit(0);
+    vex[1] = vex2.emit(1);
+  }
+
   void set_disp(Imm i) {
     disp = i;
     set_disp_imm = true;
@@ -187,6 +277,78 @@ struct Instruction {
     }
   }
 
+  void set_vex_modrm_and_rex(uint8_t reg,
+                             uint8_t rm,
+                             VEX3::LeadingBytes lb,
+                             uint8_t vex_reg = 0,
+                             bool rex_w = false,
+                             VexPrefix prefix = VexPrefix::P_NONE) {
+    bool rex_b = false, rex_r = false;
+    // Register-direct form: reg -> ModRM.reg, rm -> ModRM.rm, vex_reg -> VEX.vvvv.
+    if (rm >= 8) {
+      rm -= 8;  // ModRM only holds 3 bits; the fourth bit is carried by the prefix
+      rex_b = true;
+    }
+
+    if (reg >= 8) {
+      reg -= 8;
+      rex_r = true;
+    }
+
+    ModRM modrm;
+    modrm.mod = 3;  // mod = 3: rm names a register, not memory
+    modrm.reg_op = reg;
+    modrm.rm = rm;
+
+    set(modrm);
+    if (rex_b || rex_w || lb != VEX3::LeadingBytes::P_0F) {
+      // the two-byte prefix cannot carry B, W or an opcode map other than 0F
+      set(VEX3(rex_w, rex_r, false, rex_b, lb, vex_reg, prefix));
+    } else {
+      assert(lb == VEX3::LeadingBytes::P_0F);  // vex2 implies 0x0f
+      assert(!rex_b);
+      assert(!rex_w);
+      set(VEX2(rex_r, vex_reg, prefix));  // shorter encoding when nothing needs the extra byte
+    }
+  }
+
+  /*!
+   * Set ModRM (caller-chosen mod) and a VEX prefix for reg/rm; vvvv, pp and L keep their defaults.
+   */
+  void set_vex_modrm_and_rex(uint8_t reg,
+                             uint8_t rm,
+                             uint8_t mod,
+                             VEX3::LeadingBytes lb,
+                             bool rex_w = false) {
+    bool rex_b = false;  // extension bit for rm
+    bool rex_r = false;  // extension bit for reg
+    if (rm >= 8) {
+      rm -= 8;  // ModRM only holds 3 bits; the fourth bit is carried by the prefix
+      rex_b = true;
+    }
+
+    if (reg >= 8) {
+      reg -= 8;
+      rex_r = true;
+    }
+
+    ModRM modrm;
+    modrm.mod = mod;
+    modrm.reg_op = reg;
+    modrm.rm = rm;
+    set(modrm);
+    if (rex_b || rex_w || lb != VEX3::LeadingBytes::P_0F) {
+      // the two-byte prefix cannot carry B, W or an opcode map other than 0F
+      set(VEX3(rex_w, rex_r, false, rex_b, lb));
+    } else {
+      // can get away with two byte version
+      assert(lb == VEX3::LeadingBytes::P_0F);  // vex2 implies 0x0f
+      assert(!rex_b);
+      assert(!rex_w);
+      set(VEX2(rex_r));
+    }
+  }
+
   void set_modrm_and_rex_for_reg_plus_reg_plus_s8(uint8_t reg,
                                                   uint8_t addr1,
                                                   uint8_t addr2,
@@ -245,6 +407,72 @@ struct Instruction {
     set_disp(imm2);
   }
 
+  void set_vex_modrm_and_rex_for_reg_plus_reg_plus_s8(uint8_t reg,
+                                                      uint8_t addr1,
+                                                      uint8_t addr2,
+                                                      s8 offset,
+                                                      VEX3::LeadingBytes lb,
+                                                      bool rex_w) {
+    bool rex_b = false, rex_r = false, rex_x = false;
+    bool addr1_ext = false;
+    bool addr2_ext = false;
+    // VEX form of the [addr1 + addr2 + disp8] operand: ModRM + SIB + 1-byte displacement.
+    if (addr1 >= 8) {
+      addr1 -= 8;  // keep the low 3 bits; remember the fourth for VEX.X or VEX.B
+      addr1_ext = true;
+    }
+
+    if (addr2 >= 8) {
+      addr2 -= 8;
+      addr2_ext = true;
+    }
+
+    if (reg >= 8) {
+      reg -= 8;
+      rex_r = true;
+    }
+
+    ModRM modrm;
+    modrm.mod = 1;  // mod = 1: an 8-bit displacement follows
+    modrm.rm = 4;   // sib!
+    modrm.reg_op = reg;
+
+    SIB sib;
+    sib.scale = 0;  // scale field 0, i.e. index * 1
+
+    Imm imm2(1, offset);
+
+    // low bits 4 in SIB.index encode "no index", so such a register has to be the base
+    if (addr1 == 4) {
+      sib.index = addr2;
+      sib.base = addr1;
+      rex_x = addr2_ext;
+      rex_b = addr1_ext;
+    } else {
+      // normal case: addr1 is the index, addr2 is the base
+      sib.index = addr1;
+      sib.base = addr2;
+      rex_x = addr1_ext;
+      rex_b = addr2_ext;
+    }
+    assert(sib.index != 4);  // fires when both address registers have low bits 4
+
+    if (rex_b || rex_w || rex_x || lb != VEX3::LeadingBytes::P_0F) {
+      // need three byte version
+      set(VEX3(rex_w, rex_r, rex_x, rex_b, lb));
+    } else {
+      assert(lb == VEX3::LeadingBytes::P_0F);  // vex2 implies 0x0f
+      assert(!rex_b);
+      assert(!rex_w);
+      assert(!rex_x);
+      set(VEX2(rex_r));
+    }
+
+    set(modrm);
+    set(sib);
+    set_disp(imm2);
+  }
+
   void set_modrm_and_rex_for_reg_plus_reg_plus_s32(uint8_t reg,
                                                    uint8_t addr1,
                                                    uint8_t addr2,
@@ -303,6 +531,72 @@ struct Instruction {
     set_disp(imm2);
   }
 
+  void set_vex_modrm_and_rex_for_reg_plus_reg_plus_s32(uint8_t reg,
+                                                       uint8_t addr1,
+                                                       uint8_t addr2,
+                                                       s32 offset,
+                                                       VEX3::LeadingBytes lb,
+                                                       bool rex_w) {
+    bool rex_b = false, rex_r = false, rex_x = false;
+    bool addr1_ext = false;
+    bool addr2_ext = false;
+    // VEX form of the [addr1 + addr2 + disp32] operand: ModRM + SIB + 4-byte displacement.
+    if (addr1 >= 8) {
+      addr1 -= 8;  // keep the low 3 bits; remember the fourth for VEX.X or VEX.B
+      addr1_ext = true;
+    }
+
+    if (addr2 >= 8) {
+      addr2 -= 8;
+      addr2_ext = true;
+    }
+
+    if (reg >= 8) {
+      reg -= 8;
+      rex_r = true;
+    }
+
+    ModRM modrm;
+    modrm.mod = 2;  // mod = 2: a 32-bit displacement follows
+    modrm.rm = 4;   // sib!
+    modrm.reg_op = reg;
+
+    SIB sib;
+    sib.scale = 0;  // scale field 0, i.e. index * 1
+
+    Imm imm2(4, offset);
+
+    // low bits 4 in SIB.index encode "no index", so such a register has to be the base
+    if (addr1 == 4) {
+      sib.index = addr2;
+      sib.base = addr1;
+      rex_x = addr2_ext;
+      rex_b = addr1_ext;
+    } else {
+      // normal case: addr1 is the index, addr2 is the base
+      sib.index = addr1;
+      sib.base = addr2;
+      rex_x = addr1_ext;
+      rex_b = addr2_ext;
+    }
+    assert(sib.index != 4);  // fires when both address registers have low bits 4
+
+    if (rex_b || rex_w || rex_x || lb != VEX3::LeadingBytes::P_0F) {
+      // need three byte version
+      set(VEX3(rex_w, rex_r, rex_x, rex_b, lb));
+    } else {
+      assert(lb == VEX3::LeadingBytes::P_0F);  // vex2 implies 0x0f
+      assert(!rex_b);
+      assert(!rex_w);
+      assert(!rex_x);
+      set(VEX2(rex_r));
+    }
+
+    set(modrm);
+    set(sib);
+    set_disp(imm2);
+  }
+
   void set_modrm_and_rex_for_reg_plus_reg_addr(uint8_t reg,
                                                uint8_t addr1,
                                                uint8_t addr2,
@@ -371,6 +665,81 @@ struct Instruction {
     set(sib);
   }
 
+  void set_vex_modrm_and_rex_for_reg_plus_reg_addr(uint8_t reg,
+                                                   uint8_t addr1,
+                                                   uint8_t addr2,
+                                                   VEX3::LeadingBytes lb,
+                                                   bool rex_w = false) {
+    bool rex_b = false, rex_r = false, rex_x = false;
+    bool addr1_ext = false;
+    bool addr2_ext = false;
+    // VEX form of the [addr1 + addr2] operand: ModRM + SIB, normally with no displacement.
+    if (addr1 >= 8) {
+      addr1 -= 8;  // keep the low 3 bits; remember the fourth for VEX.X or VEX.B
+      addr1_ext = true;
+    }
+
+    if (addr2 >= 8) {
+      addr2 -= 8;
+      addr2_ext = true;
+    }
+
+    if (reg >= 8) {
+      reg -= 8;
+      rex_r = true;
+    }
+
+    ModRM modrm;
+    modrm.mod = 0;  // no disp
+    modrm.rm = 4;   // sib!
+    modrm.reg_op = reg;
+
+    SIB sib;
+    sib.scale = 0;  // scale field 0, i.e. index * 1
+
+    if (addr1 == 5 && addr2 == 5) {  // with mod = 0, base 5 would mean "disp32, no base"
+      sib.index = addr1;
+      sib.base = addr2;
+      rex_x = addr1_ext;
+      rex_b = addr2_ext;
+      modrm.mod = 1;  // so switch to the disp8 form...
+      set_disp(Imm(1, 0));  // ...with a zero displacement
+
+    } else {
+      // swap roles if addr1 cannot be an index (low bits 4) or addr2 cannot be a mod-0 base (5)
+      bool flipped = (addr1 == 4) || (addr2 == 5);
+
+      if (flipped) {
+        sib.index = addr2;
+        sib.base = addr1;
+        rex_x = addr2_ext;
+        rex_b = addr1_ext;
+      } else {
+        // normal case: addr1 is the index, addr2 is the base
+        sib.index = addr1;
+        sib.base = addr2;
+        rex_x = addr1_ext;
+        rex_b = addr2_ext;
+      }
+      assert(sib.base != 5);
+      assert(sib.index != 4);
+    }
+
+    if (rex_b || rex_w || rex_x || lb != VEX3::LeadingBytes::P_0F) {
+      // need three byte version
+      set(VEX3(rex_w, rex_r, rex_x, rex_b, lb));
+    } else {
+      assert(lb == VEX3::LeadingBytes::P_0F);  // vex2 implies 0x0f
+      assert(!rex_b);
+      assert(!rex_w);
+      assert(!rex_x);
+      set(VEX2(rex_r));
+    }
+
+    set(modrm);
+    set(sib);
+  }
+
   /*!
    * Set modrm and rex as needed for two regs for an addressing mode.
    * Will set SIB if R12 or RSP indexing is used.
@@ -440,6 +809,35 @@ struct Instruction {
     }
   }
 
+  void set_vex_modrm_and_rex_for_rip_plus_s32(uint8_t reg,
+                                              s32 offset,
+                                              VEX3::LeadingBytes lb = VEX3::LeadingBytes::P_0F,
+                                              bool rex_w = false) {
+    bool rex_r = false;
+    // VEX form of the [rip + disp32] operand; reg is the only register that gets encoded.
+    if (reg >= 8) {
+      reg -= 8;  // ModRM only holds 3 bits; the fourth bit is carried by VEX.R
+      rex_r = true;
+    }
+
+    ModRM modrm;
+    modrm.mod = 0;
+    modrm.reg_op = reg;
+    modrm.rm = 5;  // use the RIP addressing mode
+    set(modrm);
+
+    if (rex_w || lb != VEX3::LeadingBytes::P_0F) {
+      // need three byte version
+      set(VEX3(rex_w, rex_r, false, false, lb));  // X and B stay clear: no base or index here
+    } else {
+      assert(lb == VEX3::LeadingBytes::P_0F);  // vex2 implies 0x0f
+      assert(!rex_w);
+      set(VEX2(rex_r));
+    }
+
+    set_disp(Imm(4, offset));
+  }
+
   /*!
    * Set up modrm and rex for the commonly used 32-bit immediate displacement indexing mode.
    */
@@ -484,6 +882,7 @@ struct Instruction {
       return 0;
     assert(set_disp_imm);
     int offset = 0;
+    offset += n_vex;
     if (set_rex)
       offset++;
     offset++;  // opcode
@@ -506,6 +905,7 @@ struct Instruction {
       return 0;
     assert(set_imm);
     int offset = 0;
+    offset += n_vex;
     if (set_rex)
       offset++;
     offset++;  // opcode
@@ -529,6 +929,11 @@ struct Instruction {
     if (is_null)
       return 0;
     uint8_t count = 0;
+
+    for (int i = 0; i < n_vex; i++) {
+      buffer[count++] = vex[i];
+    }
+
     if (set_rex) {
       buffer[count++] = m_rex;
     }
@@ -569,6 +974,9 @@ struct Instruction {
     if (is_null)
       return 0;
     uint8_t count = 0;
+
+    count += n_vex;
+
     if (set_rex) {
       count++;
     }
diff --git a/goalc/emitter/Register.cpp b/goalc/emitter/Register.cpp
index 2b6d06772e..a342f70da6 100644
--- a/goalc/emitter/Register.cpp
+++ b/goalc/emitter/Register.cpp
@@ -70,14 +70,27 @@ RegisterInfo RegisterInfo::make_register_info() {
 
 RegisterInfo gRegInfo = RegisterInfo::make_register_info();
 
-std::string to_string(RegKind kind) {
+std::string to_string(HWRegKind kind) {
   switch (kind) {
-    case RegKind::GPR:
+    case HWRegKind::GPR:
       return "gpr";
-    case RegKind::XMM:
+    case HWRegKind::XMM:
       return "xmm";
     default:
-      throw std::runtime_error("Unsupported RegKind");
+      throw std::runtime_error("Unsupported HWRegKind");
+  }
+}
+
+HWRegKind reg_class_to_hw(RegClass reg_class) {  // register class -> hardware register file
+  switch (reg_class) {
+    case RegClass::VECTOR_FLOAT:
+    case RegClass::FLOAT:
+    case RegClass::INT_128:
+      return HWRegKind::XMM;  // floats, 128-bit ints and float vectors all use xmm registers
+    case RegClass::GPR_64:
+      return HWRegKind::GPR;
+    default:
+      throw std::runtime_error("Unsupported RegClass");  // was assert-only: no return if NDEBUG
   }
 }
 
diff --git a/goalc/emitter/Register.h b/goalc/emitter/Register.h
index 1a28662760..6b5042ff04 100644
--- a/goalc/emitter/Register.h
+++ b/goalc/emitter/Register.h
@@ -13,12 +13,13 @@
 #include <vector>
 #include <string>
 #include "common/common_types.h"
+#include "common/goal_constants.h"
 
 namespace emitter {
 
-enum class RegKind : u8 { GPR, XMM, INVALID };
-
-std::string to_string(RegKind kind);
+enum class HWRegKind : u8 { GPR, XMM, INVALID };
+HWRegKind reg_class_to_hw(RegClass reg_class);
+std::string to_string(HWRegKind kind);
 
 constexpr int GPR_SIZE = 8;
 constexpr int XMM_SIZE = 16;
diff --git a/goalc/regalloc/Allocator.cpp b/goalc/regalloc/Allocator.cpp
index 34f6c8b812..36b2ffe108 100644
--- a/goalc/regalloc/Allocator.cpp
+++ b/goalc/regalloc/Allocator.cpp
@@ -618,13 +618,14 @@ int get_stack_slot_for_var(int var, RegAllocCache* cache) {
 const std::vector<emitter::Register>& get_default_alloc_order_for_var_spill(int v,
                                                                             RegAllocCache* cache) {
   auto& info = cache->iregs.at(v);
-  assert(info.kind != emitter::RegKind::INVALID);
-  if (info.kind == emitter::RegKind::GPR) {
+  assert(info.reg_class != RegClass::INVALID);
+  auto hw_kind = emitter::reg_class_to_hw(info.reg_class);
+  if (hw_kind == emitter::HWRegKind::GPR) {
     return emitter::gRegInfo.get_gpr_spill_alloc_order();
-  } else if (info.kind == emitter::RegKind::XMM) {
+  } else if (hw_kind == emitter::HWRegKind::XMM) {
     return emitter::gRegInfo.get_xmm_spill_alloc_order();
   } else {
-    throw std::runtime_error("Unsupported RegKind");
+    throw std::runtime_error("Unsupported HWRegKind");
   }
 }
 
@@ -632,22 +633,22 @@ const std::vector<emitter::Register>& get_default_alloc_order_for_var(int v,
                                                                       RegAllocCache* cache,
                                                                       bool get_all) {
   auto& info = cache->iregs.at(v);
-  // todo fix this.
-  //  assert(info.kind != emitter::RegKind::INVALID);
-  if (info.kind == emitter::RegKind::GPR || info.kind == emitter::RegKind::INVALID) {
+  assert(info.reg_class != RegClass::INVALID);
+  auto hw_kind = emitter::reg_class_to_hw(info.reg_class);
+  if (hw_kind == emitter::HWRegKind::GPR || hw_kind == emitter::HWRegKind::INVALID) {
     if (!get_all && cache->is_asm_func) {
       return emitter::gRegInfo.get_gpr_temp_alloc_order();
     } else {
       return emitter::gRegInfo.get_gpr_alloc_order();
     }
-  } else if (info.kind == emitter::RegKind::XMM) {
+  } else if (hw_kind == emitter::HWRegKind::XMM) {
     if (!get_all && cache->is_asm_func) {
       return emitter::gRegInfo.get_xmm_temp_alloc_order();
     } else {
       return emitter::gRegInfo.get_xmm_alloc_order();
     }
   } else {
-    throw std::runtime_error("Unsupported RegKind");
+    throw std::runtime_error("Unsupported HWRegKind");
   }
 }
 
diff --git a/goalc/regalloc/IRegister.cpp b/goalc/regalloc/IRegister.cpp
index f78982d2f9..b25efa39c2 100644
--- a/goalc/regalloc/IRegister.cpp
+++ b/goalc/regalloc/IRegister.cpp
@@ -1,3 +1,4 @@
+#include <cassert>
 #include "third-party/fmt/core.h"
 #include "IRegister.h"
 
@@ -10,8 +11,18 @@ std::string IRegister::to_string() const {
   //    }
   //    return result;
   //  } else {
-  return fmt::format("i{}-{}", emitter::to_string(kind), id);
-  //  }
+  switch (reg_class) {
+    case RegClass::GPR_64:
+      return fmt::format("igpr-{}", id);
+    case RegClass::FLOAT:
+      return fmt::format("ifpr-{}", id);
+    case RegClass::INT_128:
+      return fmt::format("ii128-{}", id);
+    case RegClass::VECTOR_FLOAT:
+      return fmt::format("ivf-{}", id);
+    default:
+      assert(false);
+  }
 }
 
 std::string IRegConstraint::to_string() const {
diff --git a/goalc/regalloc/IRegister.h b/goalc/regalloc/IRegister.h
index fb0aed5662..52cba278d7 100644
--- a/goalc/regalloc/IRegister.h
+++ b/goalc/regalloc/IRegister.h
@@ -4,15 +4,12 @@
  * IRegister is the Register for the Intermediate Representation.
  */
 
-#ifndef JAK_IREGISTER_H
-#define JAK_IREGISTER_H
-
 #include <string>
 #include <vector>
 #include "goalc/emitter/Register.h"
 
 struct IRegister {
-  emitter::RegKind kind = emitter::RegKind::INVALID;
+  RegClass reg_class = RegClass::INVALID;
   int id = -1;
   std::string to_string() const;
   struct hash {
@@ -27,5 +24,3 @@ struct IRegConstraint {
   emitter::Register desired_register;
   std::string to_string() const;
 };
-
-#endif  // JAK_IREGISTER_H
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ca7a4b50da..04a1dcb4e3 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -13,6 +13,7 @@ add_executable(goalc-test
         test_type_system.cpp
         test_CodeTester.cpp
         test_emitter.cpp
+        test_emitter_avx.cpp
         test_common_util.cpp
         test_pretty_print.cpp
         test_zydis.cpp
diff --git a/test/goalc/source_templates/with_game/test-basic-vector-math.gc b/test/goalc/source_templates/with_game/test-basic-vector-math.gc
new file mode 100644
index 0000000000..5747ac2819
--- /dev/null
+++ b/test/goalc/source_templates/with_game/test-basic-vector-math.gc
@@ -0,0 +1,21 @@
+(defun test-basic-vector-math ()
+  (let ((vector-0 (new 'stack 'vector))
+        (vector-1 (new 'stack 'vector))
+        (vector-2 (new 'stack 'vector)))
+    (set! (-> vector-0 x) 1.0)
+    (set! (-> vector-0 y) 2.0)
+    (set! (-> vector-0 z) 3.0)
+    (set! (-> vector-0 w) 4.0)
+    
+    (set! (-> vector-1 x) 10.0)
+    (set! (-> vector-1 y) 20.0)
+    (set! (-> vector-1 z) 30.0)
+    (set! (-> vector-1 w) 40.1)
+    
+    (vector-! vector-2 vector-1 vector-0)
+    ; x + y + z = 9 + 18 + 27 = 54.0000; w must add 0 for the expected output (presumably vector-! leaves w alone - TODO confirm)
+    (format #t "~f~%" (+ (-> vector-2 x) (-> vector-2 y) (-> vector-2 z) (-> vector-2 w)))
+    )
+  )
+
+(test-basic-vector-math)
\ No newline at end of file
diff --git a/test/goalc/source_templates/with_game/test-load-static-vector.gc b/test/goalc/source_templates/with_game/test-load-static-vector.gc
new file mode 100644
index 0000000000..84280164a7
--- /dev/null
+++ b/test/goalc/source_templates/with_game/test-load-static-vector.gc
@@ -0,0 +1,7 @@
+(define my-vector (new 'global 'vector))
+(rlet ((vf1 :class vf :reset-here #t))
+      (.lvf vf1 (new 'static 'vector :x 1.0 :y 1.2 :z 1.5 :w 1.6)) ; load a static vector into a vf register
+      (.svf my-vector vf1) ; store it back out to the heap vector
+      )
+
+(format #t "~f~%" (+ (-> my-vector x) (-> my-vector y) (-> my-vector z) (-> my-vector w))) ; 1.0 + 1.2 + 1.5 + 1.6 = 5.3
\ No newline at end of file
diff --git a/test/goalc/source_templates/with_game/test-vf-load-and-store.gc b/test/goalc/source_templates/with_game/test-vf-load-and-store.gc
new file mode 100644
index 0000000000..a96b20a8db
--- /dev/null
+++ b/test/goalc/source_templates/with_game/test-vf-load-and-store.gc
@@ -0,0 +1,21 @@
+(defun vf-test-load-and-store ()
+  (let ((vector-0 (new 'stack 'vector))
+        (vector-1 (new 'stack 'vector4s-3)))
+    (set! (-> vector-0 x) 1.0)
+    (set! (-> vector-0 y) 2.0)
+    (set! (-> vector-0 z) 3.0)
+    (set! (-> vector-0 w) 4.0)
+    
+    (rlet ((vf1 :class vf  :reset-here #t)
+           (vf2 :class vf :reg xmm1 :reset-here #t))
+          (.lvf vf1 vector-0) ; memory -> vf register
+          (.mov vf2 vf1) ; vf register -> vf register (vf2 is pinned to xmm1)
+          (.svf (-> vector-1 vector 0) vf2) ; vf register -> memory
+          )
+    
+    (-> vector-1 vector 0 y) ; y was set to 2.0 in vector-0, so the round trip should give 2.0
+    )
+  )
+
+(format #t "~f~%" (vf-test-load-and-store))
+0
\ No newline at end of file
diff --git a/test/goalc/test_with_game.cpp b/test/goalc/test_with_game.cpp
index 0e2150c15f..ec45b5fe7b 100644
--- a/test/goalc/test_with_game.cpp
+++ b/test/goalc/test_with_game.cpp
@@ -345,6 +345,18 @@ TEST_F(WithGameTests, StaticBoxedArray) {
                          {"4 asdf \"test\" (a b) 0 object 12 12\n0\n"});
 }
 
+TEST_F(WithGameTests, VFLoadAndStore) {
+  runner.run_static_test(env, testCategory, "test-vf-load-and-store.gc", {"2.0000\n0\n"});
+}
+
+TEST_F(WithGameTests, VFSimpleMath) {
+  runner.run_static_test(env, testCategory, "test-basic-vector-math.gc", {"54.0000\n0\n"});
+}
+
+TEST_F(WithGameTests, VFLoadStatic) {
+  runner.run_static_test(env, testCategory, "test-load-static-vector.gc", {"5.3000\n0\n"});
+}
+
 TEST(TypeConsistency, TypeConsistency) {
   Compiler compiler;
   compiler.enable_throw_on_redefines();
diff --git a/test/test_emitter_avx.cpp b/test/test_emitter_avx.cpp
new file mode 100644
index 0000000000..5be9e93e61
--- /dev/null
+++ b/test/test_emitter_avx.cpp
@@ -0,0 +1,211 @@
+#include "gtest/gtest.h"
+#include "goalc/emitter/CodeTester.h"
+#include "goalc/emitter/IGen.h"
+
+using namespace emitter;
+
+TEST(EmitterAVX, MOV_VF) {
+  CodeTester tester;
+  tester.init_code_buffer(10000);
+  for (int i = 0; i < 16; i++) {
+    for (int j = 0; j < 16; j++) {
+      tester.emit(IGen::mov_vf_vf(XMM0 + i, XMM0 + j));
+    }
+  }
+
+  EXPECT_EQ(
+      tester.dump_to_hex_string(true),
+      "C5F828C0C5F828C1C5F828C2C5F828C3C5F828C4C5F828C5C5F828C6C5F828C7C57829C0C57829C8C57829D0C578"
+      "29D8C57829E0C57829E8C57829F0C57829F8C5F828C8C5F828C9C5F828CAC5F828CBC5F828CCC5F828CDC5F828CE"
+      "C5F828CFC57829C1C57829C9C57829D1C57829D9C57829E1C57829E9C57829F1C57829F9C5F828D0C5F828D1C5F8"
+      "28D2C5F828D3C5F828D4C5F828D5C5F828D6C5F828D7C57829C2C57829CAC57829D2C57829DAC57829E2C57829EA"
+      "C57829F2C57829FAC5F828D8C5F828D9C5F828DAC5F828DBC5F828DCC5F828DDC5F828DEC5F828DFC57829C3C578"
+      "29CBC57829D3C57829DBC57829E3C57829EBC57829F3C57829FBC5F828E0C5F828E1C5F828E2C5F828E3C5F828E4"
+      "C5F828E5C5F828E6C5F828E7C57829C4C57829CCC57829D4C57829DCC57829E4C57829ECC57829F4C57829FCC5F8"
+      "28E8C5F828E9C5F828EAC5F828EBC5F828ECC5F828EDC5F828EEC5F828EFC57829C5C57829CDC57829D5C57829DD"
+      "C57829E5C57829EDC57829F5C57829FDC5F828F0C5F828F1C5F828F2C5F828F3C5F828F4C5F828F5C5F828F6C5F8"
+      "28F7C57829C6C57829CEC57829D6C57829DEC57829E6C57829EEC57829F6C57829FEC5F828F8C5F828F9C5F828FA"
+      "C5F828FBC5F828FCC5F828FDC5F828FEC5F828FFC57829C7C57829CFC57829D7C57829DFC57829E7C57829EFC578"
+      "29F7C57829FFC57828C0C57828C1C57828C2C57828C3C57828C4C57828C5C57828C6C57828C7C4417828C0C44178"
+      "28C1C4417828C2C4417828C3C4417828C4C4417828C5C4417828C6C4417828C7C57828C8C57828C9C57828CAC578"
+      "28CBC57828CCC57828CDC57828CEC57828CFC4417828C8C4417828C9C4417828CAC4417828CBC4417828CCC44178"
+      "28CDC4417828CEC4417828CFC57828D0C57828D1C57828D2C57828D3C57828D4C57828D5C57828D6C57828D7C441"
+      "7828D0C4417828D1C4417828D2C4417828D3C4417828D4C4417828D5C4417828D6C4417828D7C57828D8C57828D9"
+      "C57828DAC57828DBC57828DCC57828DDC57828DEC57828DFC4417828D8C4417828D9C4417828DAC4417828DBC441"
+      "7828DCC4417828DDC4417828DEC4417828DFC57828E0C57828E1C57828E2C57828E3C57828E4C57828E5C57828E6"
+      "C57828E7C4417828E0C4417828E1C4417828E2C4417828E3C4417828E4C4417828E5C4417828E6C4417828E7C578"
+      "28E8C57828E9C57828EAC57828EBC57828ECC57828EDC57828EEC57828EFC4417828E8C4417828E9C4417828EAC4"
+      "417828EBC4417828ECC4417828EDC4417828EEC4417828EFC57828F0C57828F1C57828F2C57828F3C57828F4C578"
+      "28F5C57828F6C57828F7C4417828F0C4417828F1C4417828F2C4417828F3C4417828F4C4417828F5C4417828F6C4"
+      "417828F7C57828F8C57828F9C57828FAC57828FBC57828FCC57828FDC57828FEC57828FFC4417828F8C4417828F9"
+      "C4417828FAC4417828FBC4417828FCC4417828FDC4417828FEC4417828FF");
+}
+
+TEST(EmitterAVX, LoadVF_Reg) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 3, RSI, R15));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 3, R12, R15));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 13, RSI, R15));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64(XMM0 + 13, R12, R15));
+  // xmm3/xmm13 x rsi/r12; r15 appears in every case, so all four use the 3-byte (C4) prefix
+  EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178281C37C48178281C3CC44178282C37C40178282C3C");
+}
+
+TEST(EmitterAVX, LoadVF_RegS8) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, RSI, R15, -3));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, R12, R15, -3));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, RSI, R15, -3));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, R12, R15, -3));
+  // xmm3/xmm13 x rsi/r12 with r15; the trailing FD of each instruction is the disp8 for -3
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C4C178285C37FDC48178285C3CFDC44178286C37FDC40178286C3CFD");
+}
+
+TEST(EmitterAVX, LoadVF_RegS32) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, RSI, R15, -0x100));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, R12, R15, -0x100));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, RSI, R15, -0x100));
+  tester.emit(IGen::loadvf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, R12, R15, -0x100));
+  // xmm3/xmm13 x rsi/r12 with r15; 00FFFFFF is -0x100 as a little-endian disp32
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C4C178289C3700FFFFFFC48178289C3C00FFFFFFC4417828AC3700FFFFFFC4017828AC3C00FFFFFF");
+}
+
+TEST(EmitterAVX, StoreVF_Reg) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 3, RSI, R15));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 3, R12, R15));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 13, RSI, R15));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64(XMM0 + 13, R12, R15));
+  // same register matrix as the load test; only the opcode byte differs (29 instead of 28)
+  EXPECT_EQ(tester.dump_to_hex_string(true), "C4C178291C37C48178291C3CC44178292C37C40178292C3C");
+}
+
+TEST(EmitterAVX, StoreVF_RegS8) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, RSI, R15, -3));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 3, R12, R15, -3));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, RSI, R15, -3));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s8(XMM0 + 13, R12, R15, -3));
+  // xmm3/xmm13 x rsi/r12 with r15; the trailing FD of each instruction is the disp8 for -3
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C4C178295C37FDC48178295C3CFDC44178296C37FDC40178296C3CFD");
+}
+
+TEST(EmitterAVX, StoreVF_RegS32) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, RSI, R15, -0x100));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 3, R12, R15, -0x100));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, RSI, R15, -0x100));
+  tester.emit(IGen::storevf_gpr64_plus_gpr64_plus_s32(XMM0 + 13, R12, R15, -0x100));
+  // xmm3/xmm13 x rsi/r12 with r15; 00FFFFFF is -0x100 as a little-endian disp32
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C4C178299C3700FFFFFFC48178299C3C00FFFFFFC4417829AC3700FFFFFFC4017829AC3C00FFFFFF");
+}
+
+TEST(EmitterAVX, MulVF) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::mul_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13));
+  tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::mul_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13));
+  // all 8 low/high (xmm3/xmm13) operand mixes; the prefix is C5 unless the last operand is xmm13
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C5E059DBC4C16059DDC59059DBC4C11059DDC56059EBC4416059EDC51059EBC4411059ED");
+}
+
+TEST(EmitterAVX, ShuffleVF) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);  // selectors (1, 3, 2, 1) show up as imm8 0x6D in each encoding
+  tester.emit(IGen::shuffle_vf(XMM0 + 3, XMM0 + 4, 1, 3, 2, 1));
+  tester.emit(IGen::shuffle_vf(XMM0 + 3, XMM0 + 14, 1, 3, 2, 1));
+  tester.emit(IGen::shuffle_vf(XMM0 + 13, XMM0 + 4, 1, 3, 2, 1));
+  tester.emit(IGen::shuffle_vf(XMM0 + 13, XMM0 + 14, 1, 3, 2, 1));
+  EXPECT_EQ(tester.dump_to_hex_string(true), "C5D8C6DC6DC4C108C6DE6DC558C6EC6DC44108C6EE6D");
+}
+
+TEST(EmitterAVX, XorVF) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::xor_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13));
+  tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::xor_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13));
+  // all 8 low/high (xmm3/xmm13) operand mixes; the prefix is C5 unless src2 is xmm13
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C5E057DBC4C16057DDC59057DBC4C11057DDC56057EBC4416057EDC51057EBC4411057ED");
+}
+
+TEST(EmitterAVX, SubVF) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::sub_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13));
+  tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::sub_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13));
+  // all 8 low/high (xmm3/xmm13) operand mixes; the prefix is C5 unless src2 is xmm13
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C5E05CDBC4C1605CDDC5905CDBC4C1105CDDC5605CEBC441605CEDC5105CEBC441105CED");
+}
+
+TEST(EmitterAVX, AddVF) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::add_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13));
+  tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3));
+  tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13));
+  tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3));
+  tester.emit(IGen::add_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13));
+  // all 8 low/high (xmm3/xmm13) operand mixes; the prefix is C5 unless src2 is xmm13
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C5E058DBC4C16058DDC59058DBC4C11058DDC56058EBC4416058EDC51058EBC4411058ED");
+}
+
+TEST(EmitterAVX, BlendVF) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);
+  tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 3, XMM0 + 3, 3));
+  tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 3, XMM0 + 13, 3));
+  tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 13, XMM0 + 3, 3));
+  tester.emit(IGen::blend_vf(XMM0 + 3, XMM0 + 13, XMM0 + 13, 3));
+  tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 3, XMM0 + 3, 3));
+  tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 3, XMM0 + 13, 3));
+  tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 13, XMM0 + 3, 3));
+  tester.emit(IGen::blend_vf(XMM0 + 13, XMM0 + 13, XMM0 + 13, 3));
+  // the 0F3A opcode map always needs the 3-byte (C4) prefix; the trailing 03 is the blend mask
+  EXPECT_EQ(tester.dump_to_hex_string(true),
+            "C4E3610CDB03C4C3610CDD03C4E3110CDB03C4C3110CDD03C463610CEB03C443610CED03C463110CEB03C4"
+            "43110CED03");
+}
+
+TEST(EmitterAVX, RIP) {
+  CodeTester tester;
+  tester.init_code_buffer(1024);  // -123 == 0xFFFFFF85, which appears as little-endian 85FFFFFF
+  tester.emit(IGen::loadvf_rip_plus_s32(XMM0 + 3, -123));
+  tester.emit(IGen::loadvf_rip_plus_s32(XMM0 + 13, -123));
+  EXPECT_EQ(tester.dump_to_hex_string(true), "C5F8281D85FFFFFFC578282D85FFFFFF");
+}
\ No newline at end of file
diff --git a/test/test_type_system.cpp b/test/test_type_system.cpp
index 44442eb13b..bfdac40b63 100644
--- a/test/test_type_system.cpp
+++ b/test/test_type_system.cpp
@@ -128,7 +128,7 @@ TEST(TypeSystem, DerefInfoNoLoadInfoOrStride) {
   EXPECT_TRUE(info.can_deref);
   EXPECT_TRUE(info.mem_deref);
   EXPECT_FALSE(info.sign_extend);  // it's a memory address being loaded
-  EXPECT_EQ(info.reg, RegKind::GPR_64);
+  EXPECT_EQ(info.reg, RegClass::GPR_64);
   EXPECT_EQ(info.stride, 4);
   EXPECT_EQ(info.result_type.print(), "(function string symbol int32)");
   EXPECT_EQ(info.load_size, 4);
@@ -141,7 +141,7 @@ TEST(TypeSystem, DerefInfoNoLoadInfoOrStride) {
   EXPECT_EQ(info.load_size, 8);
   EXPECT_EQ(info.stride, 8);
   EXPECT_EQ(info.sign_extend, true);
-  EXPECT_EQ(info.reg, RegKind::GPR_64);
+  EXPECT_EQ(info.reg, RegClass::GPR_64);
   EXPECT_EQ(info.result_type.print(), "int64");
 
   // test inline-array (won't work because type is dynamically sized)
@@ -331,7 +331,7 @@ TEST(TypeSystem, DecompLookupsTypeOfBasic) {
   dk.size = 4;
   dk.sign_extend = false;
   dk.is_store = false;
-  dk.reg_kind = RegKind::GPR_64;
+  dk.reg_kind = RegClass::GPR_64;
   input.deref = dk;
   auto result = ts.reverse_field_lookup(input);
 
@@ -356,7 +356,7 @@ TEST(TypeSystem, DecompLookupsMethod) {
   dk.size = 4;
   dk.sign_extend = false;
   dk.is_store = false;
-  dk.reg_kind = RegKind::GPR_64;
+  dk.reg_kind = RegClass::GPR_64;
   input.deref = dk;
   auto result = ts.reverse_field_lookup(input);
 
@@ -373,7 +373,7 @@ TEST(TypeSystem, DecompLookupsMethod) {
   dk.size = 4;
   dk.sign_extend = false;
   dk.is_store = false;
-  dk.reg_kind = RegKind::GPR_64;
+  dk.reg_kind = RegClass::GPR_64;
   input.deref = dk;
   result = ts.reverse_field_lookup(input);