diff --git a/src/Common/TYCON.sig b/src/Common/TYCON.sig index 92d2d093d..a93203996 100644 --- a/src/Common/TYCON.sig +++ b/src/Common/TYCON.sig @@ -26,6 +26,7 @@ signature TYCON = val tycon_WORD64 : tycon val tycon_REAL : tycon val tycon_F64 : tycon (* Internal *) + val tycon_F256 : tycon (* Internal *) val tycon_STRING : tycon val tycon_CHAR : tycon val tycon_EXN : tycon diff --git a/src/Common/TYNAME.sig b/src/Common/TYNAME.sig index b58eda38a..97035eae2 100644 --- a/src/Common/TYNAME.sig +++ b/src/Common/TYNAME.sig @@ -74,7 +74,8 @@ signature TYNAME = val tyName_WordDefault : unit -> TyName (* word31 or word32 dependent on tagging *) val tyName_REAL : TyName val tyName_F64 : TyName (* Internal unboxed float type *) - val tyName_STRING : TyName + val tyName_F256 : TyName + val tyName_STRING : TyName (* Internal unboxed float vector type *) val tyName_CHAR : TyName val tyName_LIST : TyName val tyName_FRAG : TyName diff --git a/src/Common/TyCon.sml b/src/Common/TyCon.sml index d0615ff97..c066d6600 100644 --- a/src/Common/TyCon.sml +++ b/src/Common/TyCon.sml @@ -41,6 +41,7 @@ structure TyCon: TYCON = val tycon_WORD64 = TYCON "word64" val tycon_REAL = TYCON "real" val tycon_F64 = TYCON "f64" + val tycon_F256 = TYCON "f256" val tycon_STRING = TYCON "string" val tycon_CHAR = TYCON "char" val tycon_EXN = TYCON "exn" diff --git a/src/Common/TyName.sml b/src/Common/TyName.sml index f97b126d9..2c21a4861 100644 --- a/src/Common/TyName.sml +++ b/src/Common/TyName.sml @@ -109,6 +109,7 @@ structure TyName :> TYNAME = val tyName_WORD64 = predef false{tycon=TyCon.tycon_WORD64, arity=0, equality=true} val tyName_REAL = predef false{tycon=TyCon.tycon_REAL, arity=0, equality=false} val tyName_F64 = predef true {tycon=TyCon.tycon_F64, arity=0, equality=false} + val tyName_F256 = predef true {tycon=TyCon.tycon_F256, arity=0, equality=false} val tyName_STRING = predef false{tycon=TyCon.tycon_STRING, arity=0, equality=true} val tyName_CHAR = predef true {tycon=TyCon.tycon_CHAR, arity=0, equality=true} val tyName_LIST = predef true {tycon=TyCon.tycon_LIST, arity=1, equality=true} diff --git a/src/Compiler/Backend/PrimName.sml b/src/Compiler/Backend/PrimName.sml index dd4897719..dee513e42 100644 --- a/src/Compiler/Backend/PrimName.sml +++ b/src/Compiler/Backend/PrimName.sml @@ -30,26 +30,28 @@ datatype prim = (* other primitives *) Less_real | Lesseq_real | Greater_real | Greatereq_real | Less_f64 | Lesseq_f64 | Greater_f64 | Greatereq_f64 | + Less_f256 | Lesseq_f256 | Greater_f256 | Greatereq_f256 | + All_f256 | Any_f256 | And_f256 | Or_f256 | Not_f256 | Plus_int31 | Plus_int32ub | Plus_int32b | Plus_word31 | Plus_word32ub | Plus_word32b | Plus_int63 | Plus_int64ub | Plus_int64b | Plus_word63 | Plus_word64ub | Plus_word64b | - Plus_real | Plus_f64 | + Plus_real | Plus_f64 | Plus_f256 | Minus_int31 | Minus_int32ub | Minus_int32b | Minus_word31 | Minus_word32ub | Minus_word32b | Minus_int63 | Minus_int64ub | Minus_int64b | Minus_word63 | Minus_word64ub | Minus_word64b | - Minus_real | Minus_f64 | + Minus_real | Minus_f64 | Minus_f256 | Mul_int31 | Mul_int32ub | Mul_int32b | Mul_word31 | Mul_word32ub | Mul_word32b | Mul_int63 | Mul_int64ub | Mul_int64b | Mul_word63 | Mul_word64ub | Mul_word64b | - Mul_real | Mul_f64 | + Mul_real | Mul_f64 | Mul_f256 | - Div_real | Div_f64 | + Div_real | Div_f64 | Div_f256 | Neg_int31 | Neg_int32ub | Neg_int32b | Neg_int63 | Neg_int64ub | Neg_int64b | @@ -153,8 +155,23 @@ datatype prim = Max_f64 | Min_f64 | Real_to_f64 | F64_to_real | Sqrt_f64 | Int_to_f64 | + Broadcast_f256 | 
Blend_f256 | + Blockf64_update_real | Blockf64_sub_real | Blockf64_size | Blockf64_alloc | - Blockf64_update_f64 | Blockf64_sub_f64 + Blockf64_update_f64 | Blockf64_sub_f64 | + + Blockf64_update_m256d | Blockf64_update_f256 | + Blockf64_sub_m256d | Blockf64_sub_f256 | + + F256_box | F256_store | F256_unbox | + + M256d_plus | M256d_minus | M256d_mul | M256d_div | + M256d_less | M256d_lesseq | M256d_greater | M256d_greatereq | + M256d_all | M256d_any | M256d_blend | M256d_broadcast | + M256d_sum | M256d_product | Sum_f256 | Product_f256 | + M256d_and | M256d_or | M256d_not | + + M256d_true | M256d_false | True_f256 | False_f256 local structure M = StringFinMap @@ -191,26 +208,28 @@ local val pairs = [("__less_real", Less_real), ("__lesseq_real", Lesseq_real), ("__greater_real", Greater_real), ("__greatereq_real", Greatereq_real), ("__less_f64", Less_f64), ("__lesseq_f64", Lesseq_f64), ("__greater_f64", Greater_f64), ("__greatereq_f64", Greatereq_f64), + ("__less_f256", Less_f256), ("__lesseq_f256", Lesseq_f256), ("__greater_f256", Greater_f256), ("__greatereq_f256", Greatereq_f256), + ("__all_f256", All_f256), ("__any_f256", Any_f256), ("__and_f256", And_f256), ("__or_f256", Or_f256), ("__not_f256", Not_f256), ("__plus_int31", Plus_int31), ("__plus_int32ub", Plus_int32ub), ("__plus_int32b", Plus_int32b), ("__plus_word31", Plus_word31), ("__plus_word32ub", Plus_word32ub), ("__plus_word32b", Plus_word32b), ("__plus_int63", Plus_int63), ("__plus_int64ub", Plus_int64ub), ("__plus_int64b", Plus_int64b), ("__plus_word63", Plus_word63), ("__plus_word64ub", Plus_word64ub), ("__plus_word64b", Plus_word64b), - ("__plus_real", Plus_real), ("__plus_f64", Plus_f64), + ("__plus_real", Plus_real), ("__plus_f64", Plus_f64), ("__plus_f256", Plus_f256), ("__minus_int31", Minus_int31), ("__minus_int32ub", Minus_int32ub), ("__minus_int32b", Minus_int32b), ("__minus_word31", Minus_word31), ("__minus_word32ub", Minus_word32ub), ("__minus_word32b", Minus_word32b), ("__minus_int63", Minus_int63), ("__minus_int64ub", Minus_int64ub), ("__minus_int64b", Minus_int64b), ("__minus_word63", Minus_word63), ("__minus_word64ub", Minus_word64ub), ("__minus_word64b", Minus_word64b), - ("__minus_real", Minus_real), ("__minus_f64", Minus_f64), + ("__minus_real", Minus_real), ("__minus_f64", Minus_f64), ("__minus_f256", Minus_f256), ("__mul_int31", Mul_int31), ("__mul_int32ub", Mul_int32ub), ("__mul_int32b", Mul_int32b), ("__mul_word31", Mul_word31), ("__mul_word32ub", Mul_word32ub), ("__mul_word32b", Mul_word32b), ("__mul_int63", Mul_int63), ("__mul_int64ub", Mul_int64ub), ("__mul_int64b", Mul_int64b), ("__mul_word63", Mul_word63), ("__mul_word64ub", Mul_word64ub), ("__mul_word64b", Mul_word64b), - ("__mul_real", Mul_real), ("__mul_f64", Mul_f64), + ("__mul_real", Mul_real), ("__mul_f64", Mul_f64), ("__mul_f256", Mul_f256), - ("__div_real", Div_real), ("__div_f64", Div_f64), + ("__div_real", Div_real), ("__div_f64", Div_f64), ("__div_f256", Div_f256), ("__neg_int31", Neg_int31), ("__neg_int32ub", Neg_int32ub), ("__neg_int32b", Neg_int32b), ("__neg_int63", Neg_int63), ("__neg_int64ub", Neg_int64ub), ("__neg_int64b", Neg_int64b), @@ -323,7 +342,49 @@ local ("__blockf64_size", Blockf64_size), ("__blockf64_alloc", Blockf64_alloc), ("__blockf64_update_f64", Blockf64_update_f64), - ("__blockf64_sub_f64", Blockf64_sub_f64) + ("__blockf64_sub_f64", Blockf64_sub_f64), + ("__broadcast_f256", Broadcast_f256), + ("__blend_f256", Blend_f256), + + ("__blockf64_update_m256d", Blockf64_update_m256d), + ("__blockf64_sub_m256d", 
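(* Reference sketch (not part of the patch): the prim datatype above introduces two
   parallel families -- boxed, source-level __m256d_* operations and unboxed __*_f256
   operations that the optimiser substitutes for them.  Below, a model of the unboxed
   value as four reals (in the compiler it lives in a ymm register, and its boxed form
   is a 32-byte blockf64 payload), pinning down the intended lane-wise semantics.  With
   the vblendvpd operand order used later in the code generator, a set mask lane selects
   the element of the second vector. *)
type f256 = real * real * real * real

fun map2 (f: real * real -> real) ((a0,a1,a2,a3): f256, (b0,b1,b2,b3): f256) : f256 =
    (f(a0,b0), f(a1,b1), f(a2,b2), f(a3,b3))

fun plus_f256 ab  = map2 (fn (x,y) => x+y) ab   (* __plus_f256  / __m256d_plus  *)
fun minus_f256 ab = map2 (fn (x,y) => x-y) ab   (* __minus_f256 / __m256d_minus *)
fun mul_f256 ab   = map2 (fn (x,y) => x*y) ab   (* __mul_f256   / __m256d_mul   *)
fun div_f256 ab   = map2 (fn (x,y) => x/y) ab   (* __div_f256   / __m256d_div   *)

(* __broadcast_f256: replicate one double into all four lanes *)
fun broadcast_f256 (x: real) : f256 = (x, x, x, x)

(* __blend_f256 (a, b, mask): a set mask lane selects the lane of b *)
fun blend_f256 ((a0,a1,a2,a3): f256, (b0,b1,b2,b3): f256,
                (m0,m1,m2,m3): bool * bool * bool * bool) : f256 =
    (if m0 then b0 else a0, if m1 then b1 else a1,
     if m2 then b2 else a2, if m3 then b3 else a3)

(* __sum_f256 / __product_f256: horizontal reductions to a scalar *)
fun sum_f256 ((a0,a1,a2,a3): f256) : real = a0 + a1 + a2 + a3
fun product_f256 ((a0,a1,a2,a3): f256) : real = a0 * a1 * a2 * a3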
Blockf64_sub_m256d), + ("__blockf64_update_f256", Blockf64_update_f256), + ("__blockf64_sub_f256", Blockf64_sub_f256), + + ("__f256_box", F256_box), + ("__f256_store", F256_store), + ("__f256_unbox", F256_unbox), + + ("__m256d_less", M256d_less), + ("__m256d_lesseq", M256d_lesseq), + ("__m256d_greater", M256d_greater), + ("__m256d_greatereq", M256d_greatereq), + + ("__m256d_plus", M256d_plus), + ("__m256d_minus", M256d_minus), + ("__m256d_mul", M256d_mul), + ("__m256d_div", M256d_div), + ("__m256d_any", M256d_any), + ("__m256d_all", M256d_all), + ("__m256d_and", M256d_and), + ("__m256d_or", M256d_or), + ("__m256d_not", M256d_not), + + ("__m256d_blend", M256d_blend), + ("__m256d_sum", M256d_sum), + ("__m256d_product", M256d_product), + + ("__m256d_broadcast", M256d_broadcast), + + ("__m256d_sum", M256d_sum), + ("__m256d_product", M256d_product), + ("__sum_f256", Sum_f256), + ("__product_f256", Product_f256), + + ("__false_f256", False_f256), + ("__true_f256", True_f256), + ("__m256d_true", M256d_true), + ("__m256d_false", M256d_false) ] val M = M.fromList pairs @@ -485,6 +546,16 @@ fun pp_prim (p:prim) : string = | Greater_f64 => "Greater_f64" | Greatereq_f64 => "Greatereq_f64" + | Less_f256 => "Less_f256" + | Lesseq_f256 => "Lesseq_f256" + | Greater_f256 => "Greater_f256" + | Greatereq_f256 => "Greatereq_f256" + | All_f256 => "All_f256" + | Any_f256 => "Any_f256" + | And_f256 => "And_f256" + | Or_f256 => "Or_f256" + | Not_f256 => "Not_f256" + | Plus_int31 => "Plus_int31" | Plus_int32ub => "Plus_int32ub" | Plus_int32b => "Plus_int32b" @@ -499,6 +570,7 @@ fun pp_prim (p:prim) : string = | Plus_word64b => "Plus_word64b" | Plus_real => "Plus_real" | Plus_f64 => "Plus_f64" + | Plus_f256 => "Plus_f256" | Minus_int31 => "Minus_int31" | Minus_int32ub => "Minus_int32ub" @@ -514,6 +586,7 @@ fun pp_prim (p:prim) : string = | Minus_word64b => "Minus_word64b" | Minus_real => "Minus_real" | Minus_f64 => "Minus_f64" + | Minus_f256 => "Minus_f256" | Mul_int31 => "Mul_int31" | Mul_int32ub => "Mul_int32ub" @@ -529,9 +602,11 @@ fun pp_prim (p:prim) : string = | Mul_word64b => "Mul_word64b" | Mul_real => "Mul_real" | Mul_f64 => "Mul_f64" + | Mul_f256 => "Mul_f256" | Div_real => "Div_real" | Div_f64 => "Div_f64" + | Div_f256 => "Div_f256" | Neg_int31 => "Neg_int31" | Neg_int32ub => "Neg_int32ub" @@ -700,6 +775,47 @@ fun pp_prim (p:prim) : string = | Blockf64_update_f64 => "Blockf64_update_f64" | Blockf64_sub_f64 => "Blockf64_sub_f64" + | Blockf64_update_m256d => "Blockf64_update_m256d" + | Blockf64_sub_m256d => "Blockf64_sub_m256d" + | Blockf64_update_f256 => "Blockf64_update_f256" + | Blockf64_sub_f256 => "Blockf64_sub_f256" + + | Broadcast_f256 => "Broadcast_f256" + | Blend_f256 => "Blend_f256" + | F256_box => "M256_box" + | F256_store => "M256_store" + | F256_unbox => "M256_unbox" + + | M256d_plus => "M256d_plus" + | M256d_minus => "M256d_minus" + | M256d_mul => "M256d_mul" + | M256d_div => "M256d_div" + + | M256d_less => "M256d_less" + | M256d_lesseq => "M256d_lesseq" + | M256d_greater => "M256d_greater" + | M256d_greatereq => "M256d_greatereq" + + | M256d_any => "M256d_any" + | M256d_all => "M256d_all" + + | M256d_and => "M256d_and" + | M256d_or => "M256d_or" + | M256d_not => "M256d_not" + + | M256d_broadcast => "M256d_broadcast" + | M256d_blend => "M256d_blend" + + | M256d_sum => "M256d_sum" + | M256d_product => "M256d_product" + | Sum_f256 => "Sum_f256" + | Product_f256 => "Product_f256" + + | True_f256 => "True_f256" + | False_f256 => "False_f256" + | M256d_true => "M256d_true" + | M256d_false => 
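(* Reference sketch (not part of the patch): the comparison and mask primitives named
   above, modelled with booleans.  At machine level a true lane is a 64-bit all-ones
   pattern produced by vcmppd (the immediates 0x1, 0x2, 0xE and 0xD used later in the
   code generator are the standard AVX predicates for ordered <, <=, > and >=), which is
   why And_f256/Or_f256 are plain bitwise vandpd/vorpd, Not_f256 is an xor with all-ones,
   and True_f256/False_f256 are the all-ones/all-zero constants; All_f256/Any_f256 are
   implemented with vmovmskpd, which packs the four lane sign bits into an integer.  In
   the patch itself masks share the f256 type; booleans are used here only for
   readability. *)
type f256  = real * real * real * real
type mask4 = bool * bool * bool * bool

fun cmp_f256 (rel: real * real -> bool)
             ((a0,a1,a2,a3): f256, (b0,b1,b2,b3): f256) : mask4 =
    (rel(a0,b0), rel(a1,b1), rel(a2,b2), rel(a3,b3))

fun less_f256 ab      = cmp_f256 (fn (x,y) => x < y) ab    (* vcmppd, predicate 0x1 *)
fun lesseq_f256 ab    = cmp_f256 (fn (x,y) => x <= y) ab   (* vcmppd, predicate 0x2 *)
fun greater_f256 ab   = cmp_f256 (fn (x,y) => x > y) ab    (* vcmppd, predicate 0xE *)
fun greatereq_f256 ab = cmp_f256 (fn (x,y) => x >= y) ab   (* vcmppd, predicate 0xD *)

fun and_f256 ((a0,a1,a2,a3): mask4, (b0,b1,b2,b3): mask4) : mask4 =
    (a0 andalso b0, a1 andalso b1, a2 andalso b2, a3 andalso b3)
fun or_f256 ((a0,a1,a2,a3): mask4, (b0,b1,b2,b3): mask4) : mask4 =
    (a0 orelse b0, a1 orelse b1, a2 orelse b2, a3 orelse b3)
fun not_f256 ((a0,a1,a2,a3): mask4) : mask4 = (not a0, not a1, not a2, not a3)

val true_f256  : mask4 = (true, true, true, true)      (* vpcmpeqd r,r,r: all ones  *)
val false_f256 : mask4 = (false, false, false, false)  (* vpxor r,r,r:   all zeroes *)

(* vmovmskpd: one bit per lane sign bit, lane 0 in bit 0 *)
fun movmskpd ((m0,m1,m2,m3): mask4) : word =
    Word.orb (if m0 then 0w1 else 0w0,
    Word.orb (if m1 then 0w2 else 0w0,
    Word.orb (if m2 then 0w4 else 0w0,
              if m3 then 0w8 else 0w0)))

fun all_f256 m = movmskpd m = 0wxF   (* cmpq $0xF + cmove  *)
fun any_f256 m = movmskpd m <> 0w0   (* cmpq $0x0 + cmovne *)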
"M256d_false" + end end diff --git a/src/Compiler/Backend/X64/CodeGenUtilX64.sml b/src/Compiler/Backend/X64/CodeGenUtilX64.sml index 5fbaa648c..831af623b 100644 --- a/src/Compiler/Backend/X64/CodeGenUtilX64.sml +++ b/src/Compiler/Backend/X64/CodeGenUtilX64.sml @@ -536,6 +536,19 @@ struct else I.movsd (R freg,D("0",base_reg)) :: C + (* Load vector into register (freg) from string *) + fun load_vector (vector_aty, t, size_ff, freg) = + fn C => case vector_aty + of SS.PHREG_ATY x => I.vmovupd(D("8", x),R freg) :: C + | _ => move_aty_into_reg(vector_aty,t,size_ff, + I.vmovupd(D("8", t),R freg) :: C) + + + (* Store vector in string (freg) *) + + fun store_vector (base_reg, t:reg, freg, C) = + I.vmovupd (R freg, D("8", base_reg)) :: C + (* When tag free collection of pairs is enabled, a bit is stored in the region descriptor if the region is an infinite region holding pairs, refs, triples and arrays. Here we arrange that @@ -2071,6 +2084,82 @@ struct copy(b_reg,d_reg, C')))) end + (* unboxed operations on vectors *) + + fun bin_f256_op s v_inst (x,y,d,size_ff:int,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (y, y_C) = resolve_arg_aty(y,tmp_freg1,size_ff) + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in + x_C(y_C(v_inst(R y, R x, R d) :: C')) + end + + val plus_f256 = bin_f256_op "vaddpd" I.vaddpd + val mul_f256 = bin_f256_op "vmulpd" I.vmulpd + val minus_f256 = bin_f256_op "vsubpd" I.vsubpd + val div_f256 = bin_f256_op "vdivpd" I.vdivpd + val and_f256 = bin_f256_op "vandpd" I.vandpd + val or_f256 = bin_f256_op "vorpd" I.vorpd + + fun not_f256 (x,d,size_ff:int,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in + x_C( + I.vpcmpeqd (R tmp_freg0, R tmp_freg0, R tmp_freg0) :: (* This should return 1...1 *) + I.vpxor (R x, R tmp_freg0, R d) :: (* This should return 0...0 *) + C') + end + + fun f256_sum (x,d,size_ff,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in + x_C( + I.vextractf128 (I "0x1", R x, R tmp_freg0) :: + I.vaddpd_128 (R x, R tmp_freg0, R x) :: + I.vunpckhpd_128 (R x, R x, R tmp_freg0) :: + I.vaddsd (R tmp_freg0, R x, R d) :: C' + ) + end + + fun f256_product (x,d,size_ff,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in + x_C( + I.vextractf128 (I "0x1", R x, R tmp_freg0) :: + I.vmulpd_128 (R x, R tmp_freg0, R x) :: + I.vunpckhpd_128 (R x, R x, R tmp_freg0) :: + I.vmulsd (R tmp_freg0, R x, R d) :: C' + ) + end + + fun cmp_f256 mode (x,y,d,size_ff:int,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (y, y_C) = resolve_arg_aty(y,tmp_freg1,size_ff) + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in x_C(y_C(I.vcmppd(I mode, R y, R x, R d) :: C')) + end + + + fun broadcast_f256 (x,d,size_ff:int, C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (d, C') = resolve_aty_def(d,tmp_freg1,size_ff, C) + in + x_C(I.vbroadcastsd(R x, R d) :: C') + end + + fun blend_f256 (x,y,mask,d,size_ff:int, C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (y, y_C) = resolve_arg_aty(y,tmp_freg1,size_ff) + val (mask, mask_C) = resolve_arg_aty(mask,tmp_freg1,size_ff) + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in + x_C(y_C(mask_C( + I.vblendvpd(R mask, R y, R x, R d) :: C'))) + end + (* boxed operations on reals (floats) *) @@ -2815,11 +2904,84 @@ struct end end + fun 
blockf64_update_f256 (t,i,x,d,size_ff,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + in x_C( + move_aty_into_reg(i,tmp_reg1,size_ff, (* tmp_reg1 = i *) + move_aty_into_reg(t,tmp_reg0,size_ff, (* tmp_reg0 = t *) + (I.vmovupd(R x, DD("8",tmp_reg0,tmp_reg1,"8")) :: (* *(8+tmp_reg0+8*tmp_reg1) = freg *) + C)))) + end + + fun blockf64_sub_f256 (t,i,d,size_ff,C) = + let val (t_reg,t_C) = resolve_arg_aty(t,tmp_reg1,size_ff) + val (d,C') = resolve_aty_def(d,tmp_freg0,size_ff,C) + in let val (i_reg,i_C) = resolve_arg_aty(i,tmp_reg0,size_ff) + in t_C(i_C( + I.vmovupd(DD("8",t_reg,i_reg,"8"), R d) :: + C')) + end + end + + fun f256_unbox (x,d,size_ff,C) = + let val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff,C) + in load_vector (x, tmp_reg0, size_ff, d) C' + end + + fun f256_store_kill_tmp01 (x,alloc,d,size_ff,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (a, a_C) = resolve_arg_aty(alloc,tmp_reg1,size_ff) + val (d_reg, C') = resolve_aty_def(d, tmp_reg0, size_ff, C) + in x_C(a_C(store_vector(a,tmp_reg1,x, + copy(a,d_reg, C')))) + end + + fun f256_all (x,d,size_ff:int,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (d, C') = resolve_aty_def(d,tmp_reg1,size_ff, C) + in + x_C( + I.vmovmskpd (R x, R d) :: (* extract one bit from each mask *) + I.cmpq (I "0xF", R d) :: (* if all 4 are set *) + I.movq(I (i2s BI.ml_false), R d) :: + I.movq(I (i2s BI.ml_true), R tmp_reg0) :: + I.cmove(R tmp_reg0, R d) :: (* return true *) + C') + end + + fun f256_any (x,d,size_ff:int,C) = + let val (x, x_C) = resolve_arg_aty(x,tmp_freg0,size_ff) + val (d, C') = resolve_aty_def(d,tmp_reg1,size_ff, C) + in + x_C( + I.vmovmskpd (R x, R d) :: + I.cmpq (I "0x0", R d) :: (* if just one is set *) + I.movq(I (i2s BI.ml_false), R d) :: + I.movq(I (i2s BI.ml_true), R tmp_reg0) :: + I.cmovne(R tmp_reg0, R d) :: (* return true *) + C') + end + + fun f256_true (d,size_ff:int,C) = + let + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in + I.vpcmpeqd (R d, R d, R d) :: (* This should return 1...1 *) + C' + end + + fun f256_false (d,size_ff:int,C) = + let + val (d, C') = resolve_aty_def(d,tmp_freg0,size_ff, C) + in + I.vpxor (R d, R d, R d) :: (* This should return 0...0 *) + C' + end + local fun basic_sw basic_lss (LS.SWITCH(_,xlsss,lss)) = basic_lss lss andalso List.all (fn (_,lss) => basic_lss lss) xlsss - fun basic_regions nil = true | basic_regions _ = false diff --git a/src/Compiler/Backend/X64/CodeGenX64.sml b/src/Compiler/Backend/X64/CodeGenX64.sml index e195f342d..2d869a4d8 100644 --- a/src/Compiler/Backend/X64/CodeGenX64.sml +++ b/src/Compiler/Backend/X64/CodeGenX64.sml @@ -941,6 +941,8 @@ struct move_reg_into_aty(tmp_reg0,d,size_ff,C) | Get_ctx => move_reg_into_aty(r14,d,size_ff,C) + | True_f256 => f256_true (d,size_ff,C) + | False_f256 => f256_false (d,size_ff,C) | _ => die ("unsupported prim with 0 args: " ^ PrimName.pp_prim name)) | [x] => let val arg = (x,d,size_ff,C) @@ -1028,6 +1030,13 @@ struct | Abs_f64 => abs_f64 arg | Int_to_f64 => int_to_f64 arg | Blockf64_size => blockf64_size arg + | Broadcast_f256 => broadcast_f256 arg + | F256_unbox => f256_unbox arg + | Any_f256 => f256_any arg + | All_f256 => f256_all arg + | Product_f256 => f256_product arg + | Sum_f256 => f256_sum arg + | Not_f256 => not_f256 arg | Is_null => cmpi_kill_tmp01_cmov {box=false,quad=false} I.cmoveq (x, SS.INTEGER_ATY{value=IntInf.fromInt 0, @@ -1216,6 +1225,21 @@ struct | F64_to_real => f64_to_real_kill_tmp01 arg | Blockf64_alloc => blockf64_alloc arg | Blockf64_sub_f64 => 
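(* Reference sketch (not part of the patch): the horizontal reductions behind f256_sum
   and f256_product above.  The generated code copies the two high lanes into a second
   register with vextractf128, adds them pairwise onto the two low lanes with a 128-bit
   vaddpd, brings the upper partial sum down with vunpckhpd, and combines the two partial
   sums with vaddsd; f256_product is the same sequence with vmulpd/vmulsd.  Note the
   tree-shaped association (x0+x2)+(x1+x3), which can differ from a left-to-right sum in
   the last bits.  The same reduction written out on a four-tuple of reals: *)
fun sum_f256 (x0: real, x1: real, x2: real, x3: real) : real =
    let val (t0, t1) = (x2, x3)              (* vextractf128 $1: high 128 bits   *)
        val (l0, l1) = (x0 + t0, x1 + t1)    (* vaddpd (xmm): two partial sums   *)
        val hi       = l1                    (* vunpckhpd: upper lane moved down *)
    in l0 + hi                               (* vaddsd: final scalar             *)
    end

fun product_f256 (x0: real, x1: real, x2: real, x3: real) : real =
    (x0 * x2) * (x1 * x3)                    (* vmulpd / vunpckhpd / vmulsd      *)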
blockf64_sub_f64 arg + | Blockf64_sub_f256 => blockf64_sub_f256 arg + + | Plus_f256 => plus_f256 arg + | Minus_f256 => minus_f256 arg + | Mul_f256 => mul_f256 arg + | Div_f256 => div_f256 arg + | And_f256 => and_f256 arg + | Or_f256 => or_f256 arg + | Less_f256 => cmp_f256 "0x1" arg + | Lesseq_f256 => cmp_f256 "0x2" arg + | Greater_f256 => cmp_f256 "0xE" arg + | Greatereq_f256 => cmp_f256 "0xD" arg + + | F256_store => f256_store_kill_tmp01 arg + | M256d_broadcast => die "M256d_broadcast: boxed broadcast not implemented" | _ => die ("unsupported prim with 2 args: " ^ PrimName.pp_prim name) end | [b,x,y] => @@ -1253,6 +1277,8 @@ struct | Blockf64_update_real => blockf64_update_real (b,x,y,d,size_ff,C) | Blockf64_sub_real => blockf64_sub_real (b,x,y,d,size_ff,C) | Blockf64_update_f64 => blockf64_update_f64 (b,x,y,d,size_ff,C) + | Blockf64_update_f256 => blockf64_update_f256 (b,x,y,d,size_ff,C) + | Blend_f256 => blend_f256 (b,x,y,d,size_ff,C) | _ => die ("unsupported prim with 3 args: " ^ PrimName.pp_prim name)) | _ => die ("PRIM(" ^ PrimName.pp_prim name ^ ") not implemented"))) end diff --git a/src/Compiler/Backend/X64/INSTS_X64.sml b/src/Compiler/Backend/X64/INSTS_X64.sml index 5ecbe2afd..814160ec0 100644 --- a/src/Compiler/Backend/X64/INSTS_X64.sml +++ b/src/Compiler/Backend/X64/INSTS_X64.sml @@ -15,6 +15,10 @@ signature INSTS_X64 = | xmm4 | xmm5 | xmm6 | xmm7 | xmm8 | xmm9 | xmm10 | xmm11 | xmm12 | xmm13 | xmm14 | xmm15 + | ymm0 | ymm1 | ymm2 | ymm3 + | ymm4 | ymm5 | ymm6 | ymm7 + | ymm8 | ymm9 | ymm10 | ymm11 + | ymm12 | ymm13 | ymm14 | ymm15 val pr_reg : reg -> string val is_xmm : reg -> bool @@ -55,6 +59,8 @@ signature INSTS_X64 = | movb of ea * ea | movzbq of ea * ea | movslq of ea * ea + | cmove of ea * ea (* conditional move *) + | cmovne of ea * ea (* conditional move *) | push of ea | leaq of ea * ea | pop of ea @@ -91,6 +97,7 @@ signature INSTS_X64 = | shrq of ea * ea (* unsigned *) | salq of ea * ea | cmpq of ea * ea + | testq of ea * ea | btq of ea * ea (* bit test; sets carry flag *) | btrq of ea * ea (* bit test and reset; sets carry flag *) | cmpxchgq of ea * ea @@ -108,6 +115,34 @@ signature INSTS_X64 = | sqrtsd of ea * ea | cvtsi2sdq of ea * ea + | vmovupd of ea * ea + + | vaddpd of ea * ea * ea (* AVX OPERATIONS *) + | vaddpd_128 of ea * ea * ea (* 128 bit version *) + | vaddsd of ea * ea * ea (* 64 bit version *) + + | vsubpd of ea * ea * ea + + | vmulpd of ea * ea * ea + | vmulpd_128 of ea * ea * ea (* 128 bit version *) + | vmulsd of ea * ea * ea (* 64 bit version *) + + | vdivpd of ea * ea * ea + + | vandpd of ea * ea * ea + | vorpd of ea * ea * ea + + | vbroadcastsd of ea * ea + | vblendvpd of ea * ea * ea * ea (* conditional move based on mask *) + | vcmppd of ea * ea * ea * ea (* compare and make mask *) + | vmovmskpd of ea * ea (* extract mask *) + | vpcmpeqd of ea * ea * ea (* equality of packed vector, useful for generating all ones *) + | vpxor of ea * ea * ea (* xor of packed vector, useful for generating all zeroes *) + | vextractf128 of ea * ea * ea (* extract 128 bits of a 256 bit register *) + + | vunpckhpd of ea * ea * ea (* unpack and interleave *) + | vunpckhpd_128 of ea * ea * ea (* unpack and interleave, 128 bit version *) + | fstpq of ea (* store float and pop float stack *) | fldq of ea (* push float onto the float stack *) | fldz (* push 0.0 onto the float stack *) diff --git a/src/Compiler/Backend/X64/InstsX64.sml b/src/Compiler/Backend/X64/InstsX64.sml index d77294740..44c35a8c7 100644 --- a/src/Compiler/Backend/X64/InstsX64.sml +++ 
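(* Reference sketch (not part of the patch): the addressing used by __blockf64_sub_f256
   and __blockf64_update_f256.  The index counts doubles; the generated code skips the
   first machine word of the block (its header word, judging from the existing blockf64
   primitives that use the same base offset) and then moves 32 unaligned bytes at byte
   offset 8 + 8*i with vmovupd, i.e. elements i..i+3, so keeping i+3 within the block is
   the caller's obligation.  Reference behaviour on an ordinary real array standing in
   for the blockf64 payload: *)
fun f256_byte_offset (i: int) : int = 8 + 8 * i   (* DD("8", base, i, "8") *)

fun blockf64_sub_f256 (a: real array, i: int) : real * real * real * real =
    (Array.sub(a,i), Array.sub(a,i+1), Array.sub(a,i+2), Array.sub(a,i+3))

fun blockf64_update_f256 (a: real array, i: int,
                          (v0,v1,v2,v3): real * real * real * real) : unit =
    (Array.update(a,i,v0); Array.update(a,i+1,v1);
     Array.update(a,i+2,v2); Array.update(a,i+3,v3))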
b/src/Compiler/Backend/X64/InstsX64.sml @@ -33,6 +33,10 @@ structure InstsX64: INSTS_X64 = | xmm4 | xmm5 | xmm6 | xmm7 | xmm8 | xmm9 | xmm10 | xmm11 | xmm12 | xmm13 | xmm14 | xmm15 + | ymm0 | ymm1 | ymm2 | ymm3 + | ymm4 | ymm5 | ymm6 | ymm7 + | ymm8 | ymm9 | ymm10 | ymm11 + | ymm12 | ymm13 | ymm14 | ymm15 type freg = int @@ -73,6 +77,8 @@ structure InstsX64: INSTS_X64 = | movb of ea * ea | movzbq of ea * ea | movslq of ea * ea + | cmove of ea * ea (* conditional move *) + | cmovne of ea * ea (* conditional move *) | push of ea | leaq of ea * ea | pop of ea @@ -109,6 +115,7 @@ structure InstsX64: INSTS_X64 = | shrq of ea * ea (* unsigned *) | salq of ea * ea | cmpq of ea * ea + | testq of ea * ea | btq of ea * ea (* bit test; sets carry flag *) | btrq of ea * ea (* bit test and reset; sets carry flag *) | cmpxchgq of ea * ea @@ -126,6 +133,32 @@ structure InstsX64: INSTS_X64 = | sqrtsd of ea * ea | cvtsi2sdq of ea * ea + | vmovupd of ea * ea + + | vaddpd of ea * ea * ea (* AVX OPERATIONS *) + | vaddpd_128 of ea * ea * ea (* 128 bit version *) + | vaddsd of ea * ea * ea (* 64 bit version *) + + | vmulpd of ea * ea * ea + | vmulpd_128 of ea * ea * ea (* 128 bit version *) + | vmulsd of ea * ea * ea (* 64 bit version *) + + | vsubpd of ea * ea * ea + | vdivpd of ea * ea * ea + + | vandpd of ea * ea * ea + | vorpd of ea * ea * ea + + | vbroadcastsd of ea * ea + | vblendvpd of ea * ea * ea * ea (* conditional move based on mask *) + | vcmppd of ea * ea * ea * ea (* compare and make mask *) + | vmovmskpd of ea * ea (* extract mask *) + | vpcmpeqd of ea * ea * ea (* equality of packed vector, useful for generating all ones *) + | vpxor of ea * ea * ea (* xor of packed vector, useful for generating all zeroes *) + | vextractf128 of ea * ea * ea (* extract 128 bits of a 256 bit register *) + | vunpckhpd of ea * ea * ea + | vunpckhpd_128 of ea * ea * ea + | fstpq of ea (* store float and pop float stack *) | fldq of ea (* push float onto the float stack *) | fldz (* push 0.0 onto the float stack *) @@ -245,6 +278,22 @@ structure InstsX64: INSTS_X64 = | pr_reg xmm13 = "%xmm13" | pr_reg xmm14 = "%xmm14" | pr_reg xmm15 = "%xmm15" + | pr_reg ymm0 = "%ymm0" + | pr_reg ymm1 = "%ymm1" + | pr_reg ymm2 = "%ymm2" + | pr_reg ymm3 = "%ymm3" + | pr_reg ymm4 = "%ymm4" + | pr_reg ymm5 = "%ymm5" + | pr_reg ymm6 = "%ymm6" + | pr_reg ymm7 = "%ymm7" + | pr_reg ymm8 = "%ymm8" + | pr_reg ymm9 = "%ymm9" + | pr_reg ymm10 = "%ymm10" + | pr_reg ymm11 = "%ymm11" + | pr_reg ymm12 = "%ymm12" + | pr_reg ymm13 = "%ymm13" + | pr_reg ymm14 = "%ymm14" + | pr_reg ymm15 = "%ymm15" fun is_xmm (r:reg) = case r of @@ -266,6 +315,29 @@ structure InstsX64: INSTS_X64 = | xmm15 => true | _ => false + fun to_ymm (x: ea) = + case x of + R r => R (case r of + xmm0 => ymm0 + | xmm1 => ymm1 + | xmm2 => ymm2 + | xmm3 => ymm3 + | xmm4 => ymm4 + | xmm5 => ymm5 + | xmm6 => ymm6 + | xmm7 => ymm7 + | xmm8 => ymm8 + | xmm9 => ymm9 + | xmm10 => ymm10 + | xmm11 => ymm11 + | xmm12 => ymm12 + | xmm13 => ymm13 + | xmm14 => ymm14 + | xmm15 => ymm15 + | _ => r) + | _ => x + + fun remove_ctrl s = CharVector.map (fn c => if Char.isAlphaNum c orelse c = #"_" orelse c = #"." 
@@ -316,6 +388,15 @@ structure InstsX64: INSTS_X64 = fun emit_bin (s, (ea1, ea2)) = (emit "\t"; emit s; emit " "; emit(pr_ea ea1); emit ","; emit(pr_ea ea2); emit_nl()) + fun emit_ter (s, (ea1, ea2, ea3)) = (emit "\t"; emit s; emit " "; + emit(pr_ea ea1); emit ","; + emit(pr_ea ea2); emit ","; + emit(pr_ea ea3); emit_nl()) + fun emit_quad (s, (ea1, ea2, ea3, ea4)) = (emit "\t"; emit s; emit " "; + emit(pr_ea ea1); emit ","; + emit(pr_ea ea2); emit ","; + emit(pr_ea ea3); emit ","; + emit(pr_ea ea4); emit_nl()) fun emit_unary (s, ea) = (emit "\t"; emit s; emit " "; emit(pr_ea ea); emit_nl()) fun emit_nullary s = (emit "\t"; emit s; emit_nl()) fun emit_nullary0 s = (emit s; emit_nl()) @@ -327,6 +408,8 @@ structure InstsX64: INSTS_X64 = | movb a => emit_bin ("movb", a) | movzbq a => emit_bin ("movzbq", a) | movslq a => emit_bin ("movslq", a) + | cmove a => emit_bin ("cmove", a) + | cmovne a => emit_bin ("cmovne", a) | leaq a => emit_bin ("leaq", a) | push ea => emit_unary ("pushq", ea) | pop ea => emit_unary ("popq", ea) @@ -363,6 +446,7 @@ structure InstsX64: INSTS_X64 = | shrq a => emit_bin("shrq", a) | salq a => emit_bin("salq", a) | cmpq a => emit_bin("cmpq", a) + | testq a => emit_bin("testq", a) | btq a => emit_bin("btq", a) | btrq a => emit_bin("btrq", a) | cmpxchgq a => emit_bin("lock cmpxchgq", a) @@ -380,6 +464,32 @@ structure InstsX64: INSTS_X64 = | sqrtsd a => emit_bin("sqrtsd", a) | cvtsi2sdq a => emit_bin("cvtsi2sdq", a) + | vmovupd (a1, a2) => emit_bin("vmovupd", (to_ymm a1, to_ymm a2)) + | vbroadcastsd (a1, a2) => emit_bin("vbroadcastsd", (a1, to_ymm a2)) + + | vaddpd (a1, a2, a3) => emit_ter("vaddpd", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vaddpd_128 (a1, a2, a3) => emit_ter("vaddpd", (a1, a2, a3)) + | vaddsd (a1, a2, a3) => emit_ter("vaddsd", (a1, a2, a3)) + + | vmulpd (a1, a2, a3) => emit_ter("vmulpd", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vmulpd_128 (a1, a2, a3) => emit_ter("vmulpd", (a1, a2, a3)) + | vmulsd (a1, a2, a3) => emit_ter("vmulsd", (a1, a2, a3)) + + | vdivpd (a1, a2, a3) => emit_ter("vdivpd", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vsubpd (a1, a2, a3) => emit_ter("vsubpd", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vandpd (a1, a2, a3) => emit_ter("vandpd", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vorpd (a1, a2, a3) => emit_ter("vorpd", (to_ymm a1, to_ymm a2, to_ymm a3)) + + | vblendvpd (a1, a2, a3, a4) => emit_quad("vblendvpd", (to_ymm a1, to_ymm a2, to_ymm a3, to_ymm a4)) + | vcmppd (a1, a2, a3, a4) => emit_quad ("vcmppd", (a1, to_ymm a2, to_ymm a3, to_ymm a4)) + | vmovmskpd (a1, a2) => emit_bin ("vmovmskpd", (to_ymm a1, a2)) + | vpcmpeqd (a1, a2, a3) => emit_ter("vpcmpeqd", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vpxor (a1, a2, a3) => emit_ter("vpxor", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vextractf128 (a1, a2, a3) => emit_ter("vextractf128", (a1, to_ymm a2, a3)) + + | vunpckhpd (a1, a2, a3) => emit_ter("vunpckhpd", (to_ymm a1, to_ymm a2, to_ymm a3)) + | vunpckhpd_128 (a1, a2, a3) => emit_ter("vunpckhpd", (a1, a2, a3)) + | fstpq ea => emit_unary("fstpq", ea) | fldq ea => emit_unary("fldq", ea) | fldz => emit_nullary "fldz" @@ -641,6 +751,8 @@ structure InstsX64: INSTS_X64 = | movb (ea1,ea2) => movb (Em ea1,Em ea2) | movzbq (ea1,ea2) => movzbq (Em ea1,Em ea2) | movslq (ea1,ea2) => movslq (Em ea1,Em ea2) + | cmove (ea1,ea2) => cmove (Em ea1,Em ea2) + | cmovne (ea1,ea2) => cmovne (Em ea1,Em ea2) | push ea => push (Em ea) | leaq (ea1,ea2) => leaq (Em ea1,Em ea2) | pop ea => pop (Em ea) @@ -675,6 +787,7 @@ structure InstsX64: INSTS_X64 = | shrq (ea1,ea2) => shrq (Em 
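(* Reference sketch (not part of the patch): operand order of the new three-operand AVX
   constructors emitted above.  Operands are stored in the order in which they are
   printed, which is AT&T order (sources first, destination last), so
   vsubpd (R b, R a, R d) emits "vsubpd %b,%a,%d" and computes d := a - b; this is why
   bin_f256_op earlier passes (R y, R x, R d) to compute x op y.  to_ymm only renames
   xmm<i> to ymm<i>: both name the same physical register, ymm<i> being its full 256-bit
   view, and the _128 constructor variants skip the renaming to get the 128-bit
   encodings.  A toy emitter for illustration: *)
fun emit_ter (mnemonic: string, src2: string, src1: string, dst: string) : string =
    String.concat ["\t", mnemonic, " %", src2, ",%", src1, ",%", dst]

(* emit_ter ("vsubpd", "ymm1", "ymm0", "ymm2") = "\tvsubpd %ymm1,%ymm0,%ymm2",
   i.e. ymm2 := ymm0 - ymm1 *)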
ea1,Em ea2) | salq (ea1,ea2) => salq (Em ea1,Em ea2) | cmpq (ea1,ea2) => cmpq (Em ea1,Em ea2) + | testq (ea1,ea2) => testq (Em ea1,Em ea2) | btq (ea1,ea2) => btq (Em ea1,Em ea2) | btrq (ea1,ea2) => btrq (Em ea1,Em ea2) | cmpxchgq (ea1,ea2) => cmpxchgq (Em ea1,Em ea2) @@ -690,6 +803,32 @@ structure InstsX64: INSTS_X64 = | xorps (ea1,ea2) => xorps (Em ea1,Em ea2) | sqrtsd (ea1,ea2) => sqrtsd (Em ea1,Em ea2) | cvtsi2sdq (ea1,ea2) => cvtsi2sdq (Em ea1,Em ea2) + + | vmovupd (ea1, ea2) => vmovupd (Em ea1,Em ea2) + + | vaddpd (ea1, ea2, ea3) => vaddpd (Em ea1, Em ea2, Em ea3) + | vaddpd_128 (ea1, ea2, ea3) => vaddpd_128 (Em ea1, Em ea2, Em ea3) + | vaddsd (ea1, ea2, ea3) => vaddsd (Em ea1, Em ea2, Em ea3) + + | vmulpd (ea1, ea2, ea3) => vmulpd (Em ea1, Em ea2, Em ea3) + | vmulpd_128 (ea1, ea2, ea3) => vmulpd_128 (Em ea1, Em ea2, Em ea3) + | vmulsd (ea1, ea2, ea3) => vmulsd (Em ea1, Em ea2, Em ea3) + + | vdivpd (ea1, ea2, ea3) => vdivpd (Em ea1, Em ea2, Em ea3) + | vsubpd (ea1, ea2, ea3) => vsubpd (Em ea1, Em ea2, Em ea3) + | vandpd (ea1, ea2, ea3) => vandpd (Em ea1, Em ea2, Em ea3) + | vorpd (ea1, ea2, ea3) => vorpd (Em ea1, Em ea2, Em ea3) + | vpxor (ea1, ea2, ea3) => vpxor (Em ea1, Em ea2, Em ea3) + + | vbroadcastsd (ea1, ea2) => vbroadcastsd (Em ea1, Em ea2) + | vblendvpd (ea1, ea2, ea3, ea4) => vblendvpd (Em ea1, Em ea2, Em ea3, Em ea4) + | vcmppd (ea1, ea2, ea3, ea4) => vcmppd (Em ea1, Em ea2, Em ea3, Em ea4) + | vmovmskpd (a1, a2) => vmovmskpd (Em a1, Em a2) + | vpcmpeqd (ea1, ea2, ea3) => vpcmpeqd (Em ea1, Em ea2, Em ea3) + | vextractf128 (ea1, ea2, ea3) => vextractf128 (Em ea1, Em ea2, Em ea3) + | vunpckhpd (ea1, ea2, ea3) => vunpckhpd (Em ea1, Em ea2, Em ea3) + | vunpckhpd_128 (ea1, ea2, ea3) => vunpckhpd_128 (Em ea1, Em ea2, Em ea3) + | fstpq ea => fstpq (Em ea) | fldq ea => fldq (Em ea) | jmp ea => jmp (Em ea) diff --git a/src/Compiler/CompBasis.sml b/src/Compiler/CompBasis.sml index 1f24ab394..0a554b271 100644 --- a/src/Compiler/CompBasis.sml +++ b/src/Compiler/CompBasis.sml @@ -183,7 +183,7 @@ structure CompBasis: COMP_BASIS = val lvars = lvars' @ lvars val cons = cons' @ cons val tynames = tynames' @ tynames - val tynames = TyName.tyName_F64 :: tynames (* for optimiser float unboxing *) + val tynames = TyName.tyName_F64 :: TyName.tyName_F256 :: tynames (* for optimiser float unboxing *) val TCEnv1 = LambdaStatSem.restrict(TCEnv,{lvars=lvars,tynames=tynames,cons=cons,excons=excons}) val rse1 = RegionStatEnv.restrict(rse,{lvars=lvars,tynames=tynames,cons=cons,excons=excons}) val mulenv1 = Mul.restrict_efenv(mulenv,lvars) diff --git a/src/Compiler/CompBasisToLamb.sml b/src/Compiler/CompBasisToLamb.sml index 43032107e..608de2dff 100644 --- a/src/Compiler/CompBasisToLamb.sml +++ b/src/Compiler/CompBasisToLamb.sml @@ -144,7 +144,7 @@ structure CompBasisToLamb val (OEnv1,lvs',cns',tns') = OptLambda.restrict(OEnv,lvs,cns,tns) val lvs = lvs' @ lvs val cns = cns' @ cns - val tns = TyName.tyName_F64 :: tns' @ tns (* for optimiser float unboxing *) + val tns = TyName.tyName_F64 :: TyName.tyName_F256 :: tns' @ tns (* for optimiser float unboxing *) val TCEnv1 = LambdaStatSem.restrict(TCEnv,{lvars=lvs,tynames=tns,cons=cns,excons=excons}) in ({NEnv=NEnv1, TCEnv=TCEnv1, diff --git a/src/Compiler/Lambda/EliminateEq.sml b/src/Compiler/Lambda/EliminateEq.sml index 5991fbf7f..ac378edab 100644 --- a/src/Compiler/Lambda/EliminateEq.sml +++ b/src/Compiler/Lambda/EliminateEq.sml @@ -675,6 +675,7 @@ structure EliminateEq : ELIMINATE_EQ = | STRING _ => e | REAL _ => e | F64 _ => e + | F256 _ => e | 
FN{pat,body} => FN{pat=pat,body=f body} | LET {pat, bind, scope} => LET {pat=pat, bind=f bind,scope=f scope} | LETREGION {regvars,scope} => LETREGION{regvars=regvars,scope=f scope} diff --git a/src/Compiler/Lambda/LAMBDA_EXP.sml b/src/Compiler/Lambda/LAMBDA_EXP.sml index 4ed846f4e..ac87505c7 100644 --- a/src/Compiler/Lambda/LAMBDA_EXP.sml +++ b/src/Compiler/Lambda/LAMBDA_EXP.sml @@ -57,6 +57,7 @@ signature LAMBDA_EXP = val wordDefaultType : unit -> Type (* word63 if tag_values, otherwise word64 *) val realType: Type val f64Type: Type + val f256Type: Type val stringType: Type val chararrayType: Type @@ -105,6 +106,7 @@ signature LAMBDA_EXP = | STRING of string * regvar option | REAL of string * regvar option | F64 of string + | F256 of string | FN of {pat : (lvar * Type) list, body : LambdaExp} | LET of {pat : (lvar * tyvar list * Type) list, bind : LambdaExp, diff --git a/src/Compiler/Lambda/LambdaBasics.sml b/src/Compiler/Lambda/LambdaBasics.sml index a6b0f0f04..aa410c84e 100644 --- a/src/Compiler/Lambda/LambdaBasics.sml +++ b/src/Compiler/Lambda/LambdaBasics.sml @@ -36,6 +36,7 @@ structure LambdaBasics: LAMBDA_BASICS = | STRING _ => lamb | REAL _ => lamb | F64 _ => lamb + | F256 _ => lamb | FN{pat,body} => FN{pat=pat,body=passTD f body} | LET{pat,bind,scope} => LET{pat=pat,bind=passTD f bind,scope = passTD f scope} | LETREGION{regvars,scope} => LETREGION{regvars=regvars,scope=passTD f scope} @@ -80,6 +81,7 @@ structure LambdaBasics: LAMBDA_BASICS = | STRING _ => lamb | REAL _ => lamb | F64 _ => lamb + | F256 _ => lamb | FN{pat,body} => FN{pat=pat,body=passBU f body} | LET{pat,bind,scope} => LET{pat=pat,bind=passBU f bind,scope = passBU f scope} | LETREGION{regvars,scope} => LETREGION{regvars=regvars,scope=passBU f scope} @@ -122,6 +124,7 @@ structure LambdaBasics: LAMBDA_BASICS = | STRING _ => new_acc | REAL _ => new_acc | F64 _ => new_acc + | F256 _ => new_acc | FN{pat,body} => foldTD f new_acc body | LET{pat,bind,scope} => foldTD f (foldTD f new_acc bind) scope | LETREGION{regvars,scope} => foldTD f new_acc scope @@ -160,6 +163,7 @@ structure LambdaBasics: LAMBDA_BASICS = | WORD _ => lamb | REAL _ => lamb | F64 _ => lamb + | F256 _ => lamb | STRING _ => lamb | FN{pat,body} => FN{pat=pat,body=f body} | LET{pat,bind,scope} => LET{pat=pat,bind=f bind,scope=f scope} @@ -205,6 +209,7 @@ structure LambdaBasics: LAMBDA_BASICS = | WORD _ => () | REAL _ => () | F64 _ => () + | F256 _ => () | STRING _ => () | FN{pat,body} => f body | LET{pat,bind,scope} => (f bind; f scope) @@ -435,6 +440,7 @@ structure LambdaBasics: LAMBDA_BASICS = | STRING _ => lamb | REAL _ => lamb | F64 _ => lamb + | F256 _ => lamb | FN{pat,body} => let val (pat', ren') = new_fnpat pat ren in FN{pat=pat', body=on_e ren' body} end @@ -611,6 +617,7 @@ structure LambdaBasics: LAMBDA_BASICS = | STRING _ => lamb | REAL _ => lamb | F64 _ => lamb + | F256 _ => lamb | FN{pat,body} => FN{pat = map (fn (lv, Type) => (lv, on_Type S Type)) pat, body = f S body} | LET{pat,bind,scope} => let val (S',pat') = on_let_pat S pat @@ -811,6 +818,7 @@ structure LambdaBasics: LAMBDA_BASICS = | STRING _ => e | REAL _ => e | F64 _ => e + | F256 _ => e | FN{pat,body} => let val E = foldl (fn ((lv,_),E) => add(lv,NONE,E)) E pat in FN{pat=pat,body=N E body} @@ -900,6 +908,7 @@ structure LambdaBasics: LAMBDA_BASICS = | STRING _ => e | REAL _ => e | F64 _ => e + | F256 _ => e | FN{pat,body} => FN{pat=pat,body=t true body} | LET{pat,bind,scope} => LET{pat=pat,bind=t false bind,scope=t tail scope} diff --git a/src/Compiler/Lambda/LambdaExp.sml 
b/src/Compiler/Lambda/LambdaExp.sml index 41cca326b..421b93472 100644 --- a/src/Compiler/Lambda/LambdaExp.sml +++ b/src/Compiler/Lambda/LambdaExp.sml @@ -70,6 +70,7 @@ structure LambdaExp: LAMBDA_EXP = val exnType = CONStype([], TyName.tyName_EXN) val realType = CONStype([], TyName.tyName_REAL) val f64Type = CONStype([], TyName.tyName_F64) + val f256Type = CONStype([], TyName.tyName_F256) val stringType = CONStype([], TyName.tyName_STRING) val chararrayType = CONStype([], TyName.tyName_CHARARRAY) val unitType = RECORDtype([]) @@ -125,6 +126,7 @@ structure LambdaExp: LAMBDA_EXP = | STRING of string * regvar option | REAL of string * regvar option | F64 of string + | F256 of string | FN of {pat : (lvar * Type) list, body : LambdaExp} | LET of {pat : (lvar * tyvar list * Type) list, bind : LambdaExp, @@ -178,6 +180,7 @@ structure LambdaExp: LAMBDA_EXP = | STRING _ => new_acc | REAL _ => new_acc | F64 _ => new_acc + | F256 _ => new_acc | FN{pat,body} => foldTD fcns (foldl' (foldType g) new_acc (map #2 pat)) body | LET{pat,bind,scope} => foldTD fcns (foldTD fcns (foldl' (foldType g) new_acc (map #3 pat)) bind) scope | LETREGION {regvars, scope} => foldTD fcns new_acc scope @@ -328,6 +331,7 @@ structure LambdaExp: LAMBDA_EXP = | STRING _ => () | REAL _ => () | F64 _ => () + | F256 _ => () | FN _ => () | LET {bind,scope,...} => (safe bind; safe scope) | LETREGION _ => raise NotSafe (* memo: maybe safe? *) @@ -861,6 +865,7 @@ structure LambdaExp: LAMBDA_EXP = | REAL (r,NONE) => PP.LEAF(r) | REAL (r,SOME rv) => PP.LEAF(r ^ "`" ^ RegVar.pr rv) | F64 r => PP.LEAF(r ^ "f64") + | F256 r => PP.LEAF(r ^ "f256") | FN {pat,body} => PP.NODE{start="(fn ",finish=")", indent=4, children=[layoutFnPat pat, @@ -1563,6 +1568,7 @@ structure LambdaExp: LAMBDA_EXP = | toInt (FRAME _) = 18 | toInt (LETREGION _) = 19 | toInt (F64 _) = 20 + | toInt (F256 _) = 21 fun fun_VAR pu_LambdaExp = Pickle.con1 VAR (fn VAR a => a | _ => die "pu_LambdaExp.VAR") @@ -1643,6 +1649,9 @@ structure LambdaExp: LAMBDA_EXP = fun fun_F64 pu_LambdaExp = Pickle.con1 F64 (fn F64 a => a | _ => die "pu_LambdaExp.F64") Pickle.string + fun fun_F256 pu_LambdaExp = + Pickle.con1 F256 (fn F256 a => a | _ => die "pu_LambdaExp.F256") + Pickle.string in Pickle.dataGen("LambdaExp.LambdaExp",toInt,[fun_VAR, fun_INTEGER, @@ -1664,7 +1673,7 @@ structure LambdaExp: LAMBDA_EXP = fun_PRIM, fun_FRAME, fun_LETREGION, - fun_F64]) + fun_F64, fun_F256]) end structure TyvarSet = NatSet @@ -1705,6 +1714,7 @@ structure LambdaExp: LAMBDA_EXP = | STRING _ => acc | REAL _ => acc | F64 _ => acc + | F256 _ => acc | FN{pat,body} => tyvars_Exp s body (foldl (fn ((_,t),acc) => tyvars_Type s t acc) acc pat) | LET{pat,bind,scope} => let val s' = foldl (fn ((_,tvs,_),s) => TVS.addList tvs s) s pat diff --git a/src/Compiler/Lambda/LambdaStatSem.sml b/src/Compiler/Lambda/LambdaStatSem.sml index 961db8883..7fb811a0c 100644 --- a/src/Compiler/Lambda/LambdaStatSem.sml +++ b/src/Compiler/Lambda/LambdaStatSem.sml @@ -181,6 +181,7 @@ structure LambdaStatSem: LAMBDA_STAT_SEM = (tyName_WORD64, []), (tyName_REAL, []), (tyName_F64, []), + (tyName_F256, []), (tyName_STRING, []), (tyName_CHAR, []), (tyName_LIST, [Con.con_NIL, Con.con_CONS]), @@ -790,6 +791,7 @@ structure LambdaStatSem: LAMBDA_STAT_SEM = | STRING s => Types [CONStype([], tyName_STRING)] | REAL s => Types [CONStype([], tyName_REAL)] | F64 s => Types [CONStype([], tyName_F64)] + | F256 s => Types [CONStype([], tyName_F256)] | FN {pat,body} => let val env' = foldl (fn ((lvar,Type), env) => add_lvar(lvar,([],Type),env)) env pat 
diff --git a/src/Compiler/Lambda/Lvars.sml b/src/Compiler/Lambda/Lvars.sml index b312c7ba5..c5f2e1750 100644 --- a/src/Compiler/Lambda/Lvars.sml +++ b/src/Compiler/Lambda/Lvars.sml @@ -11,12 +11,14 @@ structure Lvars: LVARS = type name = Name.name - type lvar = {name : name, + type lvar = { + name : name, str : string, free : bool ref, inserted : bool ref, use : int ref, - ubf64 : bool ref} + ubf64 : bool ref + } fun new_named_lvar (str : string) : lvar = {name=Name.new(), str=str, diff --git a/src/Compiler/Lambda/OptLambda.sml b/src/Compiler/Lambda/OptLambda.sml index 6265fc537..e1ead174b 100644 --- a/src/Compiler/Lambda/OptLambda.sml +++ b/src/Compiler/Lambda/OptLambda.sml @@ -257,6 +257,66 @@ structure OptLambda: OPT_LAMBDA = val f64_greatereq = f64_cmp "greatereq" end + (* Operations on unboxed vectors *) + + local + type exp = LambdaExp + fun ccall name argtypes restype = + CCALLprim {name=name,instances=[],tyvars=[], + Type=ARROWtype(argtypes,[restype])} + fun f256_bin opr (x:exp,y:exp) : exp = + PRIM(ccall ("__" ^ opr ^ "_f256") [f256Type,f256Type] f256Type, [x,y]) + val allocVector : exp = PRIM(SCRATCHMEMprim 32, []) + in + fun f256_box (x:exp) : exp = PRIM(ccall "__f256_box" [f256Type] stringType,[x]) + (* let val box = allocVector + * val () = storeVector(x, box) + * in box + *) + fun f256_box_alloc (x:exp) : exp = + let val box = Lvars.newLvar () + val boxVar = VAR {lvar = box, instances = [], regvars = []} + val storeExp: exp = + LET { pat = [(Lvars.newLvar (), [], unitType)] + , bind = PRIM(ccall "__f256_store" [f256Type, stringType] unitType, [x, boxVar]) + , scope = boxVar + } + in + LET { pat = [(box, [], stringType)] + , bind = allocVector + , scope = storeExp + } + end + + fun f256_unbox (x:exp) : exp = PRIM(ccall "__f256_unbox" [stringType] f256Type, [x]) + + fun f256_broadcast (x: exp) : exp = PRIM(ccall "__broadcast_f256" [f64Type] f256Type, [x]) + fun f256_blend (a: exp, b: exp, mask: exp) : exp = + PRIM(ccall "__blend_f256" [f256Type, f256Type, f256Type] f256Type, [a,b,mask]) + fun f256_product (a: exp) : exp = + PRIM(ccall "__product_f256" [f256Type] f64Type, [a]) + fun f256_sum (a: exp) : exp = + PRIM(ccall "__sum_f256" [f256Type] f64Type, [a]) + + val f256_plus = f256_bin "plus" + val f256_minus = f256_bin "minus" + val f256_and = f256_bin "and" + val f256_or = f256_bin "or" + val f256_mul = f256_bin "mul" + val f256_div = f256_bin "div" + val f256_less = f256_bin "less" + val f256_lesseq = f256_bin "lesseq" + val f256_greater = f256_bin "greater" + val f256_greatereq = f256_bin "greatereq" + + fun f256_any (x: exp): exp = PRIM(ccall "__any_f256" [f256Type] boolType, [x]) + fun f256_all (x: exp): exp = PRIM(ccall "__all_f256" [f256Type] boolType, [x]) + fun f256_not (x: exp): exp = PRIM(ccall "__not_f256" [f256Type] f256Type, [x]) + + val f256_true: exp = PRIM(ccall "__true_f256" [] f256Type, []) + val f256_false: exp = PRIM(ccall "__false_f256" [] f256Type, []) + end + (* ----------------------------------------------------------------- * Statistical functions * ----------------------------------------------------------------- *) @@ -582,6 +642,7 @@ structure OptLambda: OPT_LAMBDA = case e of REAL _ => true | F64 _ => true + | F256 _ => true | WORD _ => true | INTEGER _ => true | VAR{lvar,...} => if Lvars.eq(lvar,lv) then raise Bad else true @@ -607,6 +668,7 @@ structure OptLambda: OPT_LAMBDA = case e of REAL _ => e | F64 _ => e + | F256 _ => e | WORD _ => e | INTEGER _ => e | VAR _ => e @@ -628,6 +690,112 @@ structure OptLambda: OPT_LAMBDA = in subst lamb end + 
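(* Reference sketch (not part of the patch): what f256_box_alloc above builds.  During
   optimisation __f256_box is kept abstract; the alloc_vectors pass further down expands
   it into a let that allocates a 32-byte scratch block (SCRATCHMEMprim 32), stores the
   register into it with __f256_store, and returns the block.  The same shape with a
   four-element real array standing in for the block: *)
fun f256_store ((v0,v1,v2,v3): real * real * real * real, box: real array) : unit =
    (Array.update(box,0,v0); Array.update(box,1,v1);
     Array.update(box,2,v2); Array.update(box,3,v3))

fun f256_box (v: real * real * real * real) : real array =
    let val box = Array.array (4, 0.0)       (* the 32-byte scratch allocation *)
    in f256_store (v, box); box
    end

fun f256_unbox (box: real array) : real * real * real * real =
    (Array.sub(box,0), Array.sub(box,1), Array.sub(box,2), Array.sub(box,3))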
fun string_lvar_f256_in_lamb (lv:lvar) (lamb:LambdaExp) : bool = + let exception Bad + fun ok n = + case n of + "__m256d_plus" => true + | "__m256d_minus" => true + | "__m256d_mul" => true + | "__m256d_div" => true + | "__m256d_and" => true + | "__m256d_or" => true + | "__m256d_nor" => true + | "__m256d_less" => true + | "__m256d_lesseq" => true + | "__m256d_greater" => true + | "__m256d_greatereq" => true + | "__m256d_all" => true + | "__m256d_any" => true + | "__m256d_blend" => true + | "__m256d_true" => true + | "__m256d_false" => true + | "__m256d_sum" => true + | "__m256d_product" => true + + | "__m256d_broadcast" => true + | "__f256_box" => true + | "__f256_unbox" => true + + | "__plus_f256" => true + | "__minus_f256" => true + | "__mul_f256" => true + | "__div_f256" => true + | "__and_f256" => true + | "__or_f256" => true + | "__not_f256" => true + | "__less_f256" => true + | "__lesseq_f256" => true + | "__greater_f256" => true + | "__greatereq_f256" => true + | "__all_f256" => true + | "__any_f256" => true + | "__blend_f256" => true + | "__true_f256" => true + | "__false_f256" => true + | "__sum_f256" => true + | "__product_f256" => true + + | "__broadcast_f256" => true + | "__real_to_f64" => true + | "__f64_to_real" => true + | "__blockf64_sub_f256" => true + | "__blockf64_update_f256" => true + | _ => false + fun check e = if lvar_in_lamb lv e then raise Bad else false + fun safeLook_sw safeLook (SWITCH(e,es,eopt)) = + if safeLook e then + let val ss = map (safeLook o #2) es + val sopt = Option.map safeLook eopt + in Option.getOpt (sopt,true) andalso List.all (fn x => x) ss + end + else (app (ignore o check o #2) es; Option.app (ignore o check) eopt; false) + fun safeLook e = + case e of + REAL _ => true + | F64 _ => true + | F256 _ => true + | WORD _ => true + | INTEGER _ => true + | VAR{lvar,...} => if Lvars.eq(lvar,lv) then raise Bad else true + | PRIM(CCALLprim{name="__f256_unbox",...},[VAR _]) => true + | PRIM(CCALLprim{name,...},es) => + if ok name then safeLooks es else raise Bad + | LET{pat,bind,scope} => if safeLook bind then safeLook scope + else check scope + | PRIM(SELECTprim _, es) => safeLooks es + | PRIM(RECORDprim _, es) => safeLooks es + | PRIM(BLOCKF64prim, es) => safeLooks es + | PRIM(DROPprim, es) => safeLooks es + | PRIM(CONprim _, es) => safeLooks es + | SWITCH_C sw => safeLook_sw safeLook sw + | _ => check e + and safeLooks es = + List.foldl (fn (e,s) => if s then safeLook e else check e) true es + in + (safeLook lamb; true) handle Bad => false + end + + fun subst_string_lvar_f256_in_lamb (lv:lvar) (lamb:LambdaExp) : LambdaExp = + let fun subst e = + case e of + REAL _ => e + | F64 _ => e + | F256 _ => e + | WORD _ => e + | INTEGER _ => e + | VAR _ => e + | PRIM(CCALLprim{name="__f256_unbox",...},[e' as VAR {lvar,...}]) => + if Lvars.eq(lvar,lv) then e' else e + | PRIM(p,es) => PRIM(p,map subst es) + | LET{pat,bind,scope} => LET{pat=pat,bind=subst bind,scope=subst scope} + | SWITCH_C(SWITCH(e,es,eopt)) => + SWITCH_C(SWITCH(subst e, map (fn (x,e) => (x,subst e)) es, Option.map subst eopt)) + | _ => if lvar_in_lamb lv e then die "subst_string_lvar_f256_in_lamb: impossible" + else e + in subst lamb + end + (* ----------------------------------------------------------------- * Marking Lambda Variables * ----------------------------------------------------------------- *) @@ -837,8 +1005,14 @@ structure OptLambda: OPT_LAMBDA = VAR{instances=[],regvars=[],...} => true | INTEGER (_,t) => if tag_values() then eq_Type(t,int31Type) orelse eq_Type(t,int63Type) else 
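(* Reference sketch (not part of the patch): string_lvar_f256_in_lamb above is a
   whitelist-based escape check.  A string-typed binding may later be retyped as an
   unboxed f256 only if every use of the variable occurs under one of the known vector
   primitives or directly under __f256_unbox; any other use means the boxed value
   escapes and the binding stays boxed.  The essence of the test, with the uses of a
   variable abstracted to the names of the primitives they occur under (a representative
   subset of the whitelist): *)
val vectorOps =
    ["__m256d_plus", "__m256d_minus", "__m256d_mul", "__m256d_div",
     "__m256d_sum", "__m256d_product", "__m256d_blend", "__m256d_broadcast",
     "__f256_box", "__f256_unbox",
     "__blockf64_sub_f256", "__blockf64_update_f256"]

fun whitelisted (name: string) : bool = List.exists (fn n => n = name) vectorOps

(* true iff every use of the candidate variable is a whitelisted vector use *)
fun mayUnbox (uses: string list) : bool = List.all whitelisted uses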
true | F64 _ => true + | F256 _ => true | LET{pat,bind,scope} => simple_nonexpanding bind andalso simple_nonexpanding scope | PRIM(SELECTprim _, [e]) => simple_nonexpanding e + | PRIM(CCALLprim{name,...},[]) => + (case name of + "__false_f256" => true + | "__true_f256" => true + | _ => false) | PRIM(CCALLprim{name,...},[e]) => (case name of "__real_to_f64" => true @@ -846,6 +1020,11 @@ structure OptLambda: OPT_LAMBDA = | "__abs_f64" => true | "__sqrt_f64" => true | "__int_to_f64" => true + | "__f256_unbox" => true + | "__broadcast_f256" => true + | "__sum_f256" => true + | "__product_f256" => true + | "__not_f256" => true | _ => false) andalso simple_nonexpanding e | PRIM(CCALLprim{name,...},[e1,e2]) => (case name of @@ -856,8 +1035,23 @@ structure OptLambda: OPT_LAMBDA = | "__max_f64" => true | "__min_f64" => true | "__blockf64_sub_f64" => true + | "__blockf64_sub_f256" => true | "__less_f64" => true + | "__plus_f256" => true + | "__minus_f256" => true + | "__mul_f256" => true + | "__div_f256" => true + | "__and_f256" => true + | "__or_f256" => true + | "__less_f256" => true + | "__lesseq_f256" => true + | "__greater_f256" => true + | "__greatereq_f256" => true | _ => false) andalso simple_nonexpanding e1 andalso simple_nonexpanding e2 + | PRIM(CCALLprim{name,...},[e1,e2,e3]) => + (case name of + "__blend_f256" => true + | _ => false) andalso simple_nonexpanding e1 andalso simple_nonexpanding e2 andalso simple_nonexpanding e3 | _ => false (* ================================================================= @@ -1141,6 +1335,7 @@ structure OptLambda: OPT_LAMBDA = | WORD _ => NONE | REAL _ => NONE | F64 _ => NONE + | F256 _ => NONE | STRING _ => NONE | FN _ => NONE | HANDLE _ => NONE @@ -1530,6 +1725,19 @@ structure OptLambda: OPT_LAMBDA = (tick "real_to_f64"; (f64_to_real (f64binop (real_to_f64 e1, real_to_f64 e2)), CUNKNOWN)) + fun reduce_f256bin f256binop (e1,e2) = + (tick "f256_unbox"; + (f256_box (f256binop (f256_unbox e1, f256_unbox e2)), CUNKNOWN)) + + fun reduce_f256any e = + (tick "f256_unbox"; (f256_any (f256_unbox e), CUNKNOWN)) + + fun reduce_f256all e = + (tick "f256_unbox"; (f256_all (f256_unbox e), CUNKNOWN)) + + fun reduce_f256not e = + (tick "f256_unbox"; (f256_box (f256_not (f256_unbox e)), CUNKNOWN)) + fun reduce_f64cmp f64cmp (e1,e2) = (tick "real_to_f64"; (f64cmp (real_to_f64 e1, real_to_f64 e2), CUNKNOWN)) @@ -1581,6 +1789,7 @@ structure OptLambda: OPT_LAMBDA = | STRING _ => (lamb, CCONST lamb) | REAL _ => (lamb, CCONST lamb) | F64 _ => (lamb, CCONST lamb) + | F256 _ => (lamb, CCONST lamb) | LET{pat=[(lvar,tyvars,tau)],bind,scope} => let (* maybe let-float f64-binding outwards to open up for other optimisations *) @@ -1606,7 +1815,7 @@ structure OptLambda: OPT_LAMBDA = then hoist() else default() | _ => default() - (* maybe unbox real binding *) + (* maybe unbox real or vector binding *) val (tau,bind,scope,fail) = if unbox_reals() andalso eq_Type(realType,tau) andalso real_lvar_f64_in_lamb lvar scope then (tick "reduce - unbox_real"; @@ -1616,6 +1825,14 @@ structure OptLambda: OPT_LAMBDA = in (tau,bind,scope,(LET{pat=[(lvar,tyvars,tau)],bind=bind,scope=scope}, CUNKNOWN)) end) + else if unbox_reals() andalso eq_Type(stringType,tau) andalso string_lvar_f256_in_lamb lvar scope then + (tick "reduce - unbox_vector"; + let val (tau,bind,scope) = (f256Type,f256_unbox bind, + subst_string_lvar_f256_in_lamb lvar scope) + val () = Lvars.set_ubf64 lvar + in (tau,bind,scope,(LET{pat=[(lvar,tyvars,tau)],bind=bind,scope=scope}, + CUNKNOWN)) + end) else (tau,bind,scope,fail) fun 
do_sw SW (SWITCH(VAR{lvar=lvar',instances,regvars=[]},sel,opt_e)) = if Lvars.eq(lvar,lvar') andalso Lvars.one_use lvar then @@ -1814,6 +2031,18 @@ structure OptLambda: OPT_LAMBDA = end | _ => default() end + | PRIM(CCALLprim{name="__f256_unbox",...},[e]) => + let fun loop e f = + case e of + PRIM(CCALLprim{name="__f256_box",...},[e]) => + (tick "f256 unbox o box elimination - let"; + SOME (f e)) + | LET{pat,bind,scope} => loop scope (f o (fn e => LET{pat=pat,bind=bind,scope=e})) + | _ => NONE + in case loop e (fn x => x) of + NONE => constantFolding env lamb fail + | SOME e' => reduce(env,(e',CUNKNOWN)) + end | PRIM(CCALLprim{name="ord",...}, [WORD (i,t)]) => (tick "ord immed"; (INTEGER(i,intDefaultType()), CUNKNOWN)) | PRIM(CCALLprim{name,Type,...},xs) => @@ -1859,6 +2088,48 @@ structure OptLambda: OPT_LAMBDA = [t,i,#1(reduce(env,(real_to_f64 v,CUNKNOWN)))]), CUNKNOWN) end + | ("__blockf64_sub_m256d",[t,i]) => + let val argTypes = + case Type of + ARROWtype(argTypes, _) => argTypes + | _ => die "prim(__blockf64_sub_m256d): expecting arrow type" + in tick "f256_box"; + (f256_box (PRIM(CCALLprim{name="__blockf64_sub_f256",instances=[],tyvars=[], + Type=ARROWtype(argTypes,[f256Type])}, + [t,i])), + CUNKNOWN) + end + | ("__blockf64_update_m256d",[t,i,v]) => + let val (bType,iType) = + case Type of + ARROWtype([bType,iType,_], _) => (bType,iType) + | _ => die "prim(__blockf64_update_m256d): expecting arrow type with three args" + in tick "f256_unbox"; + (PRIM(CCALLprim{name="__blockf64_update_f256",instances=[],tyvars=[], + Type=ARROWtype([bType,iType,f256Type],[unitType])}, + [t,i,#1(reduce(env,(f256_unbox v,CUNKNOWN)))]), + CUNKNOWN) + end + | ("__m256d_broadcast", [x]) => (f256_box (f256_broadcast (real_to_f64 x)), CUNKNOWN) + | ("__m256d_blend", [a, b, mask]) => + (f256_box (f256_blend (f256_unbox a, f256_unbox b, f256_unbox mask)), CUNKNOWN) + | ("__m256d_plus", [x, y]) => reduce_f256bin f256_plus (x, y) + | ("__m256d_minus", [x, y]) => reduce_f256bin f256_minus (x, y) + | ("__m256d_and", [x, y]) => reduce_f256bin f256_and (x, y) + | ("__m256d_or", [x, y]) => reduce_f256bin f256_or (x, y) + | ("__m256d_mul", [x, y]) => reduce_f256bin f256_mul (x, y) + | ("__m256d_div", [x, y]) => reduce_f256bin f256_div (x, y) + | ("__m256d_less", [x, y]) => reduce_f256bin f256_less (x, y) + | ("__m256d_lesseq", [x, y]) => reduce_f256bin f256_lesseq (x, y) + | ("__m256d_greater", [x, y]) => reduce_f256bin f256_greater (x, y) + | ("__m256d_greatereq", [x, y]) => reduce_f256bin f256_greatereq (x, y) + | ("__m256d_all", [x]) => reduce_f256all x + | ("__m256d_not", [x]) => reduce_f256not x + | ("__m256d_any", [x]) => reduce_f256any x + | ("__m256d_sum", [a]) => (f64_to_real (f256_sum (f256_unbox a)), CUNKNOWN) + | ("__m256d_product", [a]) => (f64_to_real (f256_product (f256_unbox a)), CUNKNOWN) + | ("__m256d_true", []) => (f256_box f256_true, CUNKNOWN) + | ("__m256d_false", []) => (f256_box f256_false, CUNKNOWN) | _ => constantFolding env lamb fail else constantFolding env lamb fail | _ => constantFolding env lamb fail @@ -2273,7 +2544,7 @@ structure OptLambda: OPT_LAMBDA = | _ => die "eliminate_explicit_records2" fun mk_lamb [] [] [] = transf env' scope | mk_lamb (lv::lvs) (tau::taus) (lamb::lambs) = - ((if eq_Type(tau,f64Type) then Lvars.set_ubf64 lv else ()); + ((if eq_Type(tau,f64Type) orelse eq_Type(tau,f256Type) then Lvars.set_ubf64 lv else ()); LET{pat=[(lv,[],tau)], bind=transf env lamb, scope=mk_lamb lvs taus lambs}) @@ -3054,6 +3325,15 @@ structure OptLambda: OPT_LAMBDA = end end +(* will convert 
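(* Reference sketch (not part of the patch): the shape of the rewrites above, on a toy
   expression type.  Each boxed __m256d_* call is first rewritten into its unboxed
   __*_f256 counterpart bracketed by __f256_box/__f256_unbox; a second rule then cancels
   an unbox applied to a box (the rule in the patch also looks through intervening lets),
   so chains of vector operations stay unboxed and only the final result is boxed. *)
datatype exp = Var of string
             | BoxedBin of string * exp * exp     (* __m256d_<op> *)
             | UnboxedBin of string * exp * exp   (* __<op>_f256  *)
             | Box of exp                         (* __f256_box   *)
             | Unbox of exp                       (* __f256_unbox *)

(* rule 1: push operations to the unboxed level *)
fun unboxOps (BoxedBin (opr, a, b)) =
      Box (UnboxedBin (opr, Unbox (unboxOps a), Unbox (unboxOps b)))
  | unboxOps (UnboxedBin (opr, a, b)) = UnboxedBin (opr, unboxOps a, unboxOps b)
  | unboxOps (Box e) = Box (unboxOps e)
  | unboxOps (Unbox e) = Unbox (unboxOps e)
  | unboxOps e = e

(* rule 2: cancel an unbox applied directly to a box *)
fun cancel (Unbox (Box e)) = cancel e
  | cancel (Box e) = Box (cancel e)
  | cancel (Unbox e) = Unbox (cancel e)
  | cancel (BoxedBin (opr, a, b)) = BoxedBin (opr, cancel a, cancel b)
  | cancel (UnboxedBin (opr, a, b)) = UnboxedBin (opr, cancel a, cancel b)
  | cancel e = e

(* cancel (unboxOps (BoxedBin ("mul", BoxedBin ("plus", Var "a", Var "b"), Var "c")))
   ==> Box (UnboxedBin ("mul",
             UnboxedBin ("plus", Unbox (Var "a"), Unbox (Var "b")),
             Unbox (Var "c")))
   -- one box at the very end, unboxed arithmetic in between *)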
the box primop into one the both allocs a blockf64 and then + * stores the vector register + *) +fun alloc_vectors lamb = + case lamb of + PRIM(CCALLprim{name="__f256_box",...},[e]) => f256_box_alloc (alloc_vectors e) + | _ => map_lamb alloc_vectors lamb + + local fun exec (e: LambdaExp) (scope: LambdaExp) : LambdaExp = let val lv = Lvars.newLvar() @@ -3537,6 +3817,7 @@ structure OptLambda: OPT_LAMBDA = val lamb = fix_conversion lamb val (lamb,inveta_env) = inverse_eta_for_fix_bound_lvars inveta_env lamb val lamb = table2d_simplify lamb + val lamb = alloc_vectors lamb in (lamb, (inveta_env, let_env)) end diff --git a/src/Compiler/Regions/MulExp.sml b/src/Compiler/Regions/MulExp.sml index 2d2594e18..7bf005c4c 100644 --- a/src/Compiler/Regions/MulExp.sml +++ b/src/Compiler/Regions/MulExp.sml @@ -2066,6 +2066,7 @@ struct SOME (ty,rho) => (ty,SOME rho) | NONE => (mu,NONE) val () = if R.isF64Type ty then Lvars.set_ubf64 lvar else () + val () = if R.isF256Type ty then Lvars.set_ubf64 lvar else () in [(lvar, ref ([]:R.il ref list), [], ref([]:effect list), ty, place, dummy_'c)] end | RegionExp.RaisedExnBind => [] diff --git a/src/Compiler/Regions/RTYPE.sig b/src/Compiler/Regions/RTYPE.sig index a1b52b2aa..d4c79e8e9 100644 --- a/src/Compiler/Regions/RTYPE.sig +++ b/src/Compiler/Regions/RTYPE.sig @@ -43,6 +43,7 @@ sig val boolType : Type val realType : Type val f64Type : Type + val f256Type : Type val stringType : Type val unitType : Type val chararrayType : Type @@ -51,6 +52,7 @@ sig val runtype : Type -> runType option val isF64Type : Type -> bool + val isF256Type : Type -> bool (* ann_mu(mus)acc is a list of all the places and arrow effects that occur * in mus consed onto acc; word regions are not included in the result. *) diff --git a/src/Compiler/Regions/RType.sml b/src/Compiler/Regions/RType.sml index 65a22326a..0510de65e 100644 --- a/src/Compiler/Regions/RType.sml +++ b/src/Compiler/Regions/RType.sml @@ -1103,6 +1103,7 @@ struct val boolType : Type = CONSTYPE(TyName.tyName_BOOL,[],[],[]) val realType : Type = CONSTYPE(TyName.tyName_REAL,[],[],[]) val f64Type : Type = CONSTYPE(TyName.tyName_F64,[],[],[]) + val f256Type : Type = CONSTYPE(TyName.tyName_F256,[],[],[]) val stringType : Type = CONSTYPE(TyName.tyName_STRING,[],[],[]) val chararrayType : Type = CONSTYPE(TyName.tyName_CHARARRAY,[],[],[]) @@ -1113,6 +1114,11 @@ struct CONSTYPE(tn,_,_,_) => TyName.eq(tn,TyName.tyName_F64) | _ => false + fun isF256Type t = + case t of + CONSTYPE(tn,_,_,_) => TyName.eq(tn,TyName.tyName_F256) + | _ => false + fun unboxed t = case t of RECORD[] => true diff --git a/src/Compiler/Regions/RegionStatEnv.sml b/src/Compiler/Regions/RegionStatEnv.sml index 6dd51ee68..95ac7a216 100644 --- a/src/Compiler/Regions/RegionStatEnv.sml +++ b/src/Compiler/Regions/RegionStatEnv.sml @@ -66,6 +66,7 @@ structure RegionStatEnv: REGION_STAT_ENV = val tyname_env0= TyNameMap.add(TyName.tyName_STRING, (0,[],0), TyNameMap.add(TyName.tyName_REAL, (0,[],0), TyNameMap.add(TyName.tyName_F64, (0,[],0), + TyNameMap.add(TyName.tyName_F256, (0,[],0), TyNameMap.add(TyName.tyName_EXN, (0,[],0), TyNameMap.add(TyName.tyName_REF, (1,[],0), TyNameMap.add(TyName.tyName_BOOL, (0,[],0), @@ -90,7 +91,7 @@ structure RegionStatEnv: REGION_STAT_ENV = TyNameMap.add(TyName.tyName_FOREIGNPTR, (0,[],0), TyNameMap.add(TyName.tyName_ARRAY, (1,[],0), TyNameMap.add(TyName.tyName_VECTOR, (1,[],0), - TyNameMap.empty))))))))))))))))))))))) + TyNameMap.empty)))))))))))))))))))))))) local diff --git a/src/Manager/ManagerObjects0.sml 
b/src/Manager/ManagerObjects0.sml index fbe788e70..6222bc740 100644 --- a/src/Manager/ManagerObjects0.sml +++ b/src/Manager/ManagerObjects0.sml @@ -256,7 +256,8 @@ functor ManagerObjects0(structure Execution : EXECUTION) TyName.tyName_CHAR, (* char is needed for char constants *) TyName.tyName_REF, TyName.tyName_REAL, (* real needed because of overloading *) - TyName.tyName_F64] (* f64 needed because of optimiser *) + TyName.tyName_F64, + TyName.tyName_F256] (* f64 and f256 needed because of optimiser *) @ TyName.Set.list tynames val IB(ife,ise,ce,cb) = ib val {funids, sigids, longstrids, longvids, longtycons} = ids
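(* Reference sketch (not part of the patch): a hypothetical surface layer over the new
   boxed primitives, written in the usual MLKit prim style.  The patch ships no such
   library; the function names, the m256d abbreviation, and the use of string as the
   carrier type for the boxed 32-byte block are assumptions made here for illustration
   only.  After the optimiser rewrites sketched earlier, dot4 should compile to an
   unboxed vmulpd followed by the horizontal-sum sequence, with no intermediate
   boxing. *)
type m256d = string   (* assumed carrier type for the boxed vector *)

fun broadcast (x: real) : m256d      = prim ("__m256d_broadcast", x)
fun add (a: m256d, b: m256d) : m256d = prim ("__m256d_plus", (a, b))
fun mul (a: m256d, b: m256d) : m256d = prim ("__m256d_mul", (a, b))
fun sum (a: m256d) : real            = prim ("__m256d_sum", a)

(* dot product of two four-wide vectors *)
fun dot4 (a: m256d, b: m256d) : real = sum (mul (a, b))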