diff --git a/.gitignore b/.gitignore
index e33b278..89de351 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,4 @@
 *.jl.*.cov
 *.jl.mem
 *.jl.*.mem
-Manifest.toml
\ No newline at end of file
+Manifest.toml
diff --git a/Project.toml b/Project.toml
index abc558c..2ae11b0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,9 +1,10 @@
 name = "SIMD"
 uuid = "fdea26ae-647d-5447-a871-4b548cad5224"
 authors = ["Erik Schnetter <schnetter@gmail.com>"]
-version = "2.6.0"
+version = "2.7.0"
 
 [deps]
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
 [extras]
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
diff --git a/src/SIMD.jl b/src/SIMD.jl
index 196b096..3984809 100644
--- a/src/SIMD.jl
+++ b/src/SIMD.jl
@@ -18,9 +18,9 @@ for sz in (8, 16, 32, 64, 128)
             $Boolsz(b::Bool) =
                 new(ifelse(b, typemax($UIntsz), typemin($UIntsz)))
         end
-        booltype(::Type{Val{$sz}}) = $Boolsz
-        inttype(::Type{Val{$sz}}) = $Intsz
-        uinttype(::Type{Val{$sz}}) = $UIntsz
+        booltype(::Val{$sz}) = $Boolsz  # dispatch on a Val{sz}() instance
+        inttype(::Val{$sz}) = $Intsz  # dispatch on a Val{sz}() instance
+        uinttype(::Val{$sz}) = $UIntsz  # dispatch on a Val{sz}() instance
 
         Base.convert(::Type{Bool}, b::$Boolsz) = b.int != 0
 
@@ -43,9 +43,9 @@ Base.convert{I<:Integer}(::Type{I}, b::Boolean) = I(Bool(b))
 Base.convert{B<:Boolean}(::Type{B}, b::Boolean) = B(Bool(b))
 Base.convert{B<:Boolean}(::Type{B}, i::Integer) = B(i!=0)
 
-booltype{T}(::Type{T}) = booltype(Val{8*sizeof(T)})
-inttype{T}(::Type{T}) = inttype(Val{8*sizeof(T)})
-uinttype{T}(::Type{T}) = uinttype(Val{8*sizeof(T)})
+booltype{T}(::Type{T}) = booltype(Val(8*sizeof(T)))
+inttype{T}(::Type{T}) = inttype(Val(8*sizeof(T)))
+uinttype{T}(::Type{T}) = uinttype(Val(8*sizeof(T)))
 
 =#
 
@@ -290,94 +290,94 @@ function llvmtypedconst(::Type{Bool}, val)
 end
 
 # Type-dependent LLVM intrinsics
-llvmins(::Type{Val{:+}}, N, ::Type{T}) where {T <: IndexTypes} = "add"
-llvmins(::Type{Val{:-}}, N, ::Type{T}) where {T <: IndexTypes} = "sub"
-llvmins(::Type{Val{:*}}, N, ::Type{T}) where {T <: IntegerTypes} = "mul"
-llvmins(::Type{Val{:div}}, N, ::Type{T}) where {T <: IntTypes} = "sdiv"
-llvmins(::Type{Val{:rem}}, N, ::Type{T}) where {T <: IntTypes} = "srem"
-llvmins(::Type{Val{:div}}, N, ::Type{T}) where {T <: UIntTypes} = "udiv"
-llvmins(::Type{Val{:rem}}, N, ::Type{T}) where {T <: UIntTypes} = "urem"
-
-llvmins(::Type{Val{:~}}, N, ::Type{T}) where {T <: IntegerTypes} = "xor"
-llvmins(::Type{Val{:&}}, N, ::Type{T}) where {T <: IntegerTypes} = "and"
-llvmins(::Type{Val{:|}}, N, ::Type{T}) where {T <: IntegerTypes} = "or"
-llvmins(::Type{Val{:⊻}}, N, ::Type{T}) where {T <: IntegerTypes} = "xor"
-
-llvmins(::Type{Val{:<<}}, N, ::Type{T}) where {T <: IntegerTypes} = "shl"
-llvmins(::Type{Val{:>>>}}, N, ::Type{T}) where {T <: IntegerTypes} = "lshr"
-llvmins(::Type{Val{:>>}}, N, ::Type{T}) where {T <: UIntTypes} = "lshr"
-llvmins(::Type{Val{:>>}}, N, ::Type{T}) where {T <: IntTypes} = "ashr"
-
-llvmins(::Type{Val{:(==)}}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp eq"
-llvmins(::Type{Val{:(!=)}}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp ne"
-llvmins(::Type{Val{:(>)}}, N, ::Type{T}) where {T <: IntTypes} = "icmp sgt"
-llvmins(::Type{Val{:(>=)}}, N, ::Type{T}) where {T <: IntTypes} = "icmp sge"
-llvmins(::Type{Val{:(<)}}, N, ::Type{T}) where {T <: IntTypes} = "icmp slt"
-llvmins(::Type{Val{:(<=)}}, N, ::Type{T}) where {T <: IntTypes} = "icmp sle"
-llvmins(::Type{Val{:(>)}}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ugt"
-llvmins(::Type{Val{:(>=)}}, N, ::Type{T}) where {T <: UIntTypes} = "icmp uge"
-llvmins(::Type{Val{:(<)}}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ult"
-llvmins(::Type{Val{:(<=)}}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ule"
-
-llvmins(::Type{Val{:vifelse}}, N, ::Type{T}) where {T} = "select"
-
-llvmins(::Type{Val{:+}}, N, ::Type{T}) where {T <: FloatingTypes} = "fadd"
-llvmins(::Type{Val{:-}}, N, ::Type{T}) where {T <: FloatingTypes} = "fsub"
-llvmins(::Type{Val{:*}}, N, ::Type{T}) where {T <: FloatingTypes} = "fmul"
-llvmins(::Type{Val{:/}}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv"
-llvmins(::Type{Val{:inv}}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv"
-llvmins(::Type{Val{:rem}}, N, ::Type{T}) where {T <: FloatingTypes} = "frem"
-
-llvmins(::Type{Val{:(==)}}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oeq"
-llvmins(::Type{Val{:(!=)}}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp une"
-llvmins(::Type{Val{:(>)}}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ogt"
-llvmins(::Type{Val{:(>=)}}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oge"
-llvmins(::Type{Val{:(<)}}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp olt"
-llvmins(::Type{Val{:(<=)}}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ole"
-
-llvmins(::Type{Val{:^}}, N, ::Type{T}) where {T <: FloatingTypes} =
+llvmins(::Val{:+}, N, ::Type{T}) where {T <: IndexTypes} = "add"
+llvmins(::Val{:-}, N, ::Type{T}) where {T <: IndexTypes} = "sub"
+llvmins(::Val{:*}, N, ::Type{T}) where {T <: IntegerTypes} = "mul"
+llvmins(::Val{:div}, N, ::Type{T}) where {T <: IntTypes} = "sdiv"
+llvmins(::Val{:rem}, N, ::Type{T}) where {T <: IntTypes} = "srem"
+llvmins(::Val{:div}, N, ::Type{T}) where {T <: UIntTypes} = "udiv"
+llvmins(::Val{:rem}, N, ::Type{T}) where {T <: UIntTypes} = "urem"
+
+llvmins(::Val{:~}, N, ::Type{T}) where {T <: IntegerTypes} = "xor"
+llvmins(::Val{:&}, N, ::Type{T}) where {T <: IntegerTypes} = "and"
+llvmins(::Val{:|}, N, ::Type{T}) where {T <: IntegerTypes} = "or"
+llvmins(::Val{:⊻}, N, ::Type{T}) where {T <: IntegerTypes} = "xor"
+
+llvmins(::Val{:<<}, N, ::Type{T}) where {T <: IntegerTypes} = "shl"
+llvmins(::Val{:>>>}, N, ::Type{T}) where {T <: IntegerTypes} = "lshr"
+llvmins(::Val{:>>}, N, ::Type{T}) where {T <: UIntTypes} = "lshr"
+llvmins(::Val{:>>}, N, ::Type{T}) where {T <: IntTypes} = "ashr"
+
+llvmins(::Val{:(==)}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp eq"
+llvmins(::Val{:(!=)}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp ne"
+llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sgt"
+llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sge"
+llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: IntTypes} = "icmp slt"
+llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sle"
+llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ugt"
+llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp uge"
+llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ult"
+llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ule"
+
+llvmins(::Val{:vifelse}, N, ::Type{T}) where {T} = "select"
+
+llvmins(::Val{:+}, N, ::Type{T}) where {T <: FloatingTypes} = "fadd"
+llvmins(::Val{:-}, N, ::Type{T}) where {T <: FloatingTypes} = "fsub"
+llvmins(::Val{:*}, N, ::Type{T}) where {T <: FloatingTypes} = "fmul"
+llvmins(::Val{:/}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv"
+llvmins(::Val{:inv}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv"
+llvmins(::Val{:rem}, N, ::Type{T}) where {T <: FloatingTypes} = "frem"
+
+llvmins(::Val{:(==)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oeq"
+llvmins(::Val{:(!=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp une"
+llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ogt"
+llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oge"
+llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp olt"
+llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ole"
+
+llvmins(::Val{:^}, N, ::Type{T}) where {T <: FloatingTypes} =
     "@llvm.pow.$(suffix(N,T))"
-llvmins(::Type{Val{:abs}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:abs}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.fabs.$(suffix(N,T))"
-llvmins(::Type{Val{:ceil}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:ceil}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.ceil.$(suffix(N,T))"
-llvmins(::Type{Val{:copysign}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:copysign}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.copysign.$(suffix(N,T))"
-llvmins(::Type{Val{:cos}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:cos}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.cos.$(suffix(N,T))"
-llvmins(::Type{Val{:exp}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:exp}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.exp.$(suffix(N,T))"
-llvmins(::Type{Val{:exp2}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:exp2}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.exp2.$(suffix(N,T))"
-llvmins(::Type{Val{:floor}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:floor}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.floor.$(suffix(N,T))"
-llvmins(::Type{Val{:fma}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:fma}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.fma.$(suffix(N,T))"
-llvmins(::Type{Val{:log}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:log}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.log.$(suffix(N,T))"
-llvmins(::Type{Val{:log10}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:log10}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.log10.$(suffix(N,T))"
-llvmins(::Type{Val{:log2}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:log2}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.log2.$(suffix(N,T))"
-llvmins(::Type{Val{:max}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:max}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.maxnum.$(suffix(N,T))"
-llvmins(::Type{Val{:min}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:min}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.minnum.$(suffix(N,T))"
-# llvmins(::Type{Val{:max}}, N, ::Type{T}) where {T<:FloatingTypes} =
+# llvmins(::Val{:max}, N, ::Type{T}) where {T<:FloatingTypes} =
 #     "@llvm.maximum.$(suffix(N,T))"
-# llvmins(::Type{Val{:min}}, N, ::Type{T}) where {T<:FloatingTypes} =
+# llvmins(::Val{:min}, N, ::Type{T}) where {T<:FloatingTypes} =
 #     "@llvm.minimum.$(suffix(N,T))"
-llvmins(::Type{Val{:muladd}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:muladd}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.fmuladd.$(suffix(N,T))"
-llvmins(::Type{Val{:powi}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:powi}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.powi.$(suffix(N,T))"
-llvmins(::Type{Val{:round}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:round}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.rint.$(suffix(N,T))"
-llvmins(::Type{Val{:sin}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:sin}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.sin.$(suffix(N,T))"
-llvmins(::Type{Val{:sqrt}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:sqrt}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.sqrt.$(suffix(N,T))"
-llvmins(::Type{Val{:trunc}}, N, ::Type{T}) where {T<:FloatingTypes} =
+llvmins(::Val{:trunc}, N, ::Type{T}) where {T<:FloatingTypes} =
     "@llvm.trunc.$(suffix(N,T))"
 
 # Convert between LLVM scalars, vectors, and arrays
@@ -461,7 +461,7 @@ end
 # Element-wise access
 
 export setindex
-@generated function setindex(v::Vec{N,T}, x::Number, ::Type{Val{I}}) where {N,T,I}
+@generated function setindex(v::Vec{N,T}, x::Number, ::Val{I}) where {N,T,I}
     @assert isa(I, Integer)
     1 <= I <= N || throw(BoundsError())
     typ = llvmtype(T)
@@ -477,6 +477,9 @@ export setindex
             NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, T}, v.elts, T(x)))
     end
 end
+@inline function setindex(v::Vec{N,T}, x::Number, ::Type{Val{I}}) where {N,T,I}  # accepts the type Val{I}
+    setindex(v, x, Val(I))  # forward to the method that takes a Val(I) instance
+end
 
 @generated function setindex(v::Vec{N,T}, x::Number, i::Int) where {N,T}
     typ = llvmtype(T)
@@ -496,7 +499,8 @@ end
 end
 setindex(v::Vec{N,T}, x::Number, i) where {N,T} = setindex(v, Int(i), x)
 
-Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, ::Type{Val{I}}) where {N,T,I} = v.elts[I].value
+Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, ::Val{I}) where {N,T,I} = v.elts[I].value
+Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, ::Type{Val{I}}) where {N,T,I} = Base.getindex(v, Val(I))
 Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, i) where {N,T} = v.elts[i].value
 
 # Type conversion
@@ -522,14 +526,14 @@ end
 # Generic function wrappers
 
 # Functions taking one argument
-@generated function llvmwrap(::Type{Val{Op}}, v1::Vec{N,T1},
-        ::Type{R} = T1) where {Op,N,T1,R}
+@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1},
+                             ::Type{R} = T1) where {Op,N,T1,R}
     @assert isa(Op, Symbol)
     typ1 = llvmtype(T1)
     vtyp1 = "<$N x $typ1>"
     typr = llvmtype(R)
     vtypr = "<$N x $typr>"
-    ins = llvmins(Val{Op}, N, T1)
+    ins = llvmins(Val(Op), N, T1)
     decls = []
     instrs = []
     if ins[1] == '@'
@@ -557,12 +561,12 @@ end
 end
 
 # Functions taking one Bool argument
-@generated function llvmwrap(::Type{Val{Op}}, v1::Vec{N,Bool},
-        ::Type{Bool} = Bool) where {Op,N}
+@generated function llvmwrap(::Val{Op}, v1::Vec{N,Bool},
+                             ::Type{Bool} = Bool) where {Op,N}
     @assert isa(Op, Symbol)
     btyp = llvmtype(Bool)
     vbtyp = "<$N x $btyp>"
-    ins = llvmins(Val{Op}, N, Bool)
+    ins = llvmins(Val(Op), N, Bool)
     decls = []
     instrs = []
     push!(instrs, "%arg1 = trunc $vbtyp %0 to <$N x i1>")
@@ -578,8 +582,8 @@ end
 end
 
 # Functions taking two arguments
-@generated function llvmwrap(::Type{Val{Op}}, v1::Vec{N,T1},
-        v2::Vec{N,T2}, ::Type{R} = T1) where {Op,N,T1,T2,R}
+@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2},
+        ::Type{R} = T1) where {Op,N,T1,T2,R}
     @assert isa(Op, Symbol)
     typ1 = llvmtype(T1)
     vtyp1 = "<$N x $typ1>"
@@ -587,7 +591,7 @@ end
     vtyp2 = "<$N x $typ2>"
     typr = llvmtype(R)
     vtypr = "<$N x $typr>"
-    ins = llvmins(Val{Op}, N, T1)
+    ins = llvmins(Val(Op), N, T1)
     decls = []
     instrs = []
     if ins[1] == '@'
@@ -606,15 +610,15 @@ end
 end
 
 # Functions taking two arguments, second argument is a scalar
-@generated function llvmwrap(::Type{Val{Op}}, v1::Vec{N,T1},
-        s2::ScalarTypes, ::Type{R} = T1) where {Op,N,T1,R}
+@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, s2::ScalarTypes,
+        ::Type{R} = T1) where {Op,N,T1,R}
     @assert isa(Op, Symbol)
     typ1 = llvmtype(T1)
     vtyp1 = "<$N x $typ1>"
     typ2 = llvmtype(s2)
     typr = llvmtype(R)
     vtypr = "<$N x $typr>"
-    ins = llvmins(Val{Op}, N, T1)
+    ins = llvmins(Val(Op), N, T1)
     decls = []
     instrs = []
     if ins[1] == '@'
@@ -633,8 +637,8 @@ end
 end
 
 # Functions taking two arguments, returning Bool
-@generated function llvmwrap(::Type{Val{Op}}, v1::Vec{N,T1},
-        v2::Vec{N,T2}, ::Type{Bool}) where {Op,N,T1,T2}
+@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2},
+        ::Type{Bool}) where {Op,N,T1,T2}
     @assert isa(Op, Symbol)
     btyp = llvmtype(Bool)
     vbtyp = "<$N x $btyp>"
@@ -645,7 +649,7 @@ end
     typ2 = llvmtype(T2)
     vtyp2 = "<$N x $typ2>"
     atyp2 = "[$N x $typ2]"
-    ins = llvmins(Val{Op}, N, T1)
+    ins = llvmins(Val(Op), N, T1)
     decls = []
     instrs = []
     if false && N == 1
@@ -669,7 +673,7 @@ end
 end
 
 # Functions taking a vector and a scalar argument
-# @generated function llvmwrap{Op,N,T1,T2,R}(::Type{Val{Op}}, v1::Vec{N,T1},
+# @generated function llvmwrap{Op,N,T1,T2,R}(::Val{Op}, v1::Vec{N,T1},
 #         x2::T2, ::Type{R} = T1)
 #     @assert isa(Op, Symbol)
 #     typ1 = llvmtype(T1)
@@ -679,7 +683,7 @@ end
 #     typr = llvmtype(R)
 #     atypr = "[$N x $typr]"
 #     vtypr = "<$N x $typr>"
-#     ins = llvmins(Val{Op}, N, T1)
+#     ins = llvmins(Val(Op), N, T1)
 #     decls = []
 #     instrs = []
 #     append!(instrs, array2vector("%arg1", N, typ1, "%0", "%arg1arr"))
@@ -699,12 +703,12 @@ end
 # end
 
 # Functions taking two Bool arguments, returning Bool
-@generated function llvmwrap(::Type{Val{Op}}, v1::Vec{N,Bool},
-        v2::Vec{N,Bool}, ::Type{Bool} = Bool) where {Op,N}
+@generated function llvmwrap(::Val{Op}, v1::Vec{N,Bool}, v2::Vec{N,Bool},
+        ::Type{Bool} = Bool) where {Op,N}
     @assert isa(Op, Symbol)
     btyp = llvmtype(Bool)
     vbtyp = "<$N x $btyp>"
-    ins = llvmins(Val{Op}, N, Bool)
+    ins = llvmins(Val(Op), N, Bool)
     decls = []
     instrs = []
     push!(instrs, "%arg1 = trunc $vbtyp %0 to <$N x i1>")
@@ -721,8 +725,8 @@ end
 end
 
 # Functions taking three arguments
-@generated function llvmwrap(::Type{Val{Op}}, v1::Vec{N,T1},
-        v2::Vec{N,T2}, v3::Vec{N,T3}, ::Type{R} = T1) where {Op,N,T1,T2,T3,R}
+@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2},
+        v3::Vec{N,T3}, ::Type{R} = T1) where {Op,N,T1,T2,T3,R}
     @assert isa(Op, Symbol)
     typ1 = llvmtype(T1)
     vtyp1 = "<$N x $typ1>"
@@ -732,7 +736,7 @@ end
     vtyp3 = "<$N x $typ3>"
     typr = llvmtype(R)
     vtypr = "<$N x $typr>"
-    ins = llvmins(Val{Op}, N, T1)
+    ins = llvmins(Val(Op), N, T1)
     decls = []
     instrs = []
     if ins[1] == '@'
@@ -752,8 +756,8 @@ end
     end
 end
 
-@generated function llvmwrapshift(::Type{Val{Op}}, v1::Vec{N,T},
-                                  ::Type{Val{I}}) where {Op,N,T,I}
+@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T},
+        ::Val{I}) where {Op,N,T,I}
     @assert isa(Op, Symbol)
     if I >= 0
         op = Op
@@ -775,7 +779,7 @@ end
     @assert i >= 0
     typ = llvmtype(T)
     vtyp = "<$N x $typ>"
-    ins = llvmins(Val{op}, N, T)
+    ins = llvmins(Val(op), N, T)
     decls = []
     instrs = []
     nbits = 8*sizeof(T)
@@ -794,12 +798,12 @@ end
     end
 end
 
-@generated function llvmwrapshift(::Type{Val{Op}}, v1::Vec{N,T},
-                                  x2::Unsigned) where {Op,N,T}
+@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T},
+        x2::Unsigned) where {Op,N,T}
     @assert isa(Op, Symbol)
     typ = llvmtype(T)
     vtyp = "<$N x $typ>"
-    ins = llvmins(Val{Op}, N, T)
+    ins = llvmins(Val(Op), N, T)
     decls = []
     instrs = []
     append!(instrs, scalar2vector("%count", N, typ, "%1"))
@@ -824,8 +828,8 @@ end
     end
 end
 
-@generated function llvmwrapshift(::Type{Val{Op}}, v1::Vec{N,T},
-                                  x2::Integer) where {Op,N,T}
+@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T},
+        x2::Integer) where {Op,N,T}
     if Op === :>> || Op === :>>>
         NegOp = :<<
     else
@@ -836,8 +840,8 @@ end
             NegOp = :>>
         end
     end
-    ValOp = Val{Op}
-    ValNegOp = Val{NegOp}
+    ValOp = Val(Op)
+    ValNegOp = Val(NegOp)
     quote
         $(Expr(:meta, :inline))
         ifelse(x2 >= 0,
@@ -846,13 +850,12 @@ end
     end
 end
 
-@generated function llvmwrapshift(::Type{Val{Op}},
-                                  v1::Vec{N,T},
-                                  v2::Vec{N,U}) where {Op,N,T,U<:UIntTypes}
+@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T},
+        v2::Vec{N,U}) where {Op,N,T,U<:UIntTypes}
     @assert isa(Op, Symbol)
     typ = llvmtype(T)
     vtyp = "<$N x $typ>"
-    ins = llvmins(Val{Op}, N, T)
+    ins = llvmins(Val(Op), N, T)
     decls = []
     instrs = []
     push!(instrs, "%tmp = $ins $vtyp %0, %1")
@@ -877,9 +880,8 @@ end
     end
 end
 
-@generated function llvmwrapshift(::Type{Val{Op}},
-                                  v1::Vec{N,T},
-                                  v2::Vec{N,U}) where {Op,N,T,U<:IntegerTypes}
+@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T},
+        v2::Vec{N,U}) where {Op,N,T,U<:IntegerTypes}
     if Op === :>> || Op === :>>>
         NegOp = :<<
     else
@@ -890,8 +892,8 @@ end
             NegOp = :>>
         end
     end
-    ValOp = Val{Op}
-    ValNegOp = Val{NegOp}
+    ValOp = Val(Op)
+    ValNegOp = Val(NegOp)
     quote
         $(Expr(:meta, :inline))
         vifelse(v2 >= 0,
@@ -905,7 +907,7 @@ end
 for op in (:(==), :(!=), :(<), :(<=), :(>), :(>=))
     @eval begin
         @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T} =
-            llvmwrap(Val{$(QuoteNode(op))}, v1, v2, Bool)
+            llvmwrap(Val($(QuoteNode(op))), v1, v2, Bool)
     end
 end
 @inline function Base.isfinite(v1::Vec{N,T}) where {N,T<:FloatingTypes}
@@ -969,13 +971,13 @@ end
 for op in (:~, :+, :-)
     @eval begin
         @inline Base.$op(v1::Vec{N,T}) where {N,T<:IntegerTypes} =
-            llvmwrap(Val{$(QuoteNode(op))}, v1)
+            llvmwrap(Val($(QuoteNode(op))), v1)
     end
 end
 @inline Base.:!(v1::Vec{N,Bool}) where {N} = ~v1
 @inline function Base.abs(v1::Vec{N,T}) where {N,T<:IntTypes}
     # s = -Vec{N,T}(signbit(v1))
-    s = v1 >> Val{8*sizeof(T)}
+    s = v1 >> Val(8*sizeof(T))
     # Note: -v1 == ~v1 + 1
     (s ⊻ v1) - s
 end
@@ -994,7 +996,7 @@ end
 for op in (:&, :|, :⊻, :+, :-, :*, :div, :rem)
     @eval begin
         @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} =
-            llvmwrap(Val{$(QuoteNode(op))}, v1, v2)
+            llvmwrap(Val($(QuoteNode(op))), v1, v2)
     end
 end
 @inline Base.copysign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntTypes} =
@@ -1018,20 +1020,22 @@ end
 #       ensure vifelse is efficient
 for op in (:<<, :>>, :>>>)
     @eval begin
+        @inline Base.$op(v1::Vec{N,T}, ::Val{I}) where {N,T<:IntegerTypes,I} =
+            llvmwrapshift(Val($(QuoteNode(op))), v1, Val(I))
         @inline Base.$op(v1::Vec{N,T}, ::Type{Val{I}}) where {N,T<:IntegerTypes,I} =
-            llvmwrapshift(Val{$(QuoteNode(op))}, v1, Val{I})
+            Base.$op(v1, Val(I))
         @inline Base.$op(v1::Vec{N,T}, x2::Unsigned) where {N,T<:IntegerTypes} =
-            llvmwrapshift(Val{$(QuoteNode(op))}, v1, x2)
+            llvmwrapshift(Val($(QuoteNode(op))), v1, x2)
         @inline Base.$op(v1::Vec{N,T}, x2::Int) where {N,T<:IntegerTypes} =
-            llvmwrapshift(Val{$(QuoteNode(op))}, v1, x2)
+            llvmwrapshift(Val($(QuoteNode(op))), v1, x2)
         @inline Base.$op(v1::Vec{N,T}, x2::Integer) where {N,T<:IntegerTypes} =
-            llvmwrapshift(Val{$(QuoteNode(op))}, v1, x2)
+            llvmwrapshift(Val($(QuoteNode(op))), v1, x2)
         @inline Base.$op(v1::Vec{N,T},
                          v2::Vec{N,U}) where {N,T<:IntegerTypes,U<:UIntTypes} =
-            llvmwrapshift(Val{$(QuoteNode(op))}, v1, v2)
+            llvmwrapshift(Val($(QuoteNode(op))), v1, v2)
         @inline Base.$op(v1::Vec{N,T},
                          v2::Vec{N,U}) where {N,T<:IntegerTypes,U<:IntegerTypes} =
-            llvmwrapshift(Val{$(QuoteNode(op))}, v1, v2)
+            llvmwrapshift(Val($(QuoteNode(op))), v1, v2)
         @inline Base.$op(x1::T, v2::Vec{N,T}) where {N,T<:IntegerTypes} =
             $op(Vec{N,T}(x1), v2)
     end
@@ -1045,7 +1049,7 @@ for op in (
         :round, :sin, :sqrt, :trunc)
     @eval begin
         @inline Base.$op(v1::Vec{N,T}) where {N,T<:FloatingTypes} =
-            llvmwrap(Val{$(QuoteNode(op))}, v1)
+            llvmwrap(Val($(QuoteNode(op))), v1)
     end
 end
 @inline Base.exp10(v1::Vec{N,T}) where {N,T<:FloatingTypes} = Vec{N,T}(10)^v1
@@ -1055,15 +1059,15 @@ end
 for op in (:+, :-, :*, :/, :^, :copysign, :max, :min, :rem)
     @eval begin
         @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:FloatingTypes} =
-            llvmwrap(Val{$(QuoteNode(op))}, v1, v2)
+            llvmwrap(Val($(QuoteNode(op))), v1, v2)
     end
 end
 # Using `IntegerTypes` here so that this definition "wins" against
 # `^(::ScalarTypes, v2::Vec)`.
 @inline Base.:^(v1::Vec{N,T}, x2::IntegerTypes) where {N,T<:FloatingTypes} =
-    llvmwrap(Val{:powi}, v1, Int(x2))
+    llvmwrap(Val(:powi), v1, Int(x2))
 @inline Base.:^(v1::Vec{N,T}, x2::Integer) where {N,T<:FloatingTypes} =
-    llvmwrap(Val{:powi}, v1, Int(x2))
+    llvmwrap(Val(:powi), v1, Int(x2))
 @inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:FloatingTypes} =
     vifelse(signbit(v2), -v1, v1)
 
@@ -1077,7 +1081,7 @@ for op in (:fma, :muladd)
     @eval begin
         @inline function Base.$op(v1::Vec{N,T},
                 v2::Vec{N,T}, v3::Vec{N,T}) where {N,T<:FloatingTypes}
-            llvmwrap(Val{$(QuoteNode(op))}, v1, v2, v3)
+            llvmwrap(Val($(QuoteNode(op))), v1, v2, v3)
         end
     end
 end
@@ -1177,9 +1181,9 @@ end
 for op in (:+, :-)
     @eval begin
         @inline Base.$op(v1::Vec{N,<:Ptr}, v2::Vec{N,<:IntegerTypes}) where {N} =
-            llvmwrap(Val{$(QuoteNode(op))}, v1, v2)
+            llvmwrap(Val($(QuoteNode(op))), v1, v2)
         @inline Base.$op(v1::Vec{N,<:IntegerTypes}, v2::Vec{N,<:Ptr}) where {N} =
-            llvmwrap(Val{$(QuoteNode(op))}, v1, v2)
+            llvmwrap(Val($(QuoteNode(op))), v1, v2)
         @inline Base.$op(s1::P, v2::Vec{N,<:IntegerTypes}) where {N,P<:Ptr} =
             $op(Vec{N,P}(s1), v2)
         @inline Base.$op(v1::Vec{N,<:IntegerTypes}, s2::P) where {N,P<:Ptr} =
@@ -1211,7 +1215,7 @@ end
 
 # We cannot pass in the neutral element via Val{}; if we try, Julia refuses to
 # inline this function, which is then disastrous for performance
-@generated function llvmwrapreduce(::Type{Val{Op}}, v::Vec{N,T}) where {Op,N,T}
+@generated function llvmwrapreduce(::Val{Op}, v::Vec{N,T}) where {Op,N,T}
     @assert isa(Op, Symbol)
     z = getneutral(Op, T)
     typ = llvmtype(T)
@@ -1230,7 +1234,7 @@ end
         nold,n = n, div(n, 2)
         namold,nam = nam,"%vec_$n"
         vtyp = "<$n x $typ>"
-        ins = llvmins(Val{Op}, n, T)
+        ins = llvmins(Val(Op), n, T)
         append!(instrs, subvector(namold, nold, typ, "$(nam)_1", n, 0))
         append!(instrs, subvector(namold, nold, typ, "$(nam)_2", n, n))
         if ins[1] == '@'
@@ -1250,16 +1254,16 @@ end
     end
 end
 
-@inline Base.all(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val{:&}, v)
-@inline Base.any(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val{:|}, v)
+@inline Base.all(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val(:&), v)
+@inline Base.any(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val(:|), v)
 @inline Base.maximum(v::Vec{N,T}) where {N,T<:FloatingTypes} =
-    llvmwrapreduce(Val{:max}, v)
+    llvmwrapreduce(Val(:max), v)
 @inline Base.minimum(v::Vec{N,T}) where {N,T<:FloatingTypes} =
-    llvmwrapreduce(Val{:min}, v)
-@inline Base.prod(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val{:*}, v)
-@inline Base.sum(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val{:+}, v)
+    llvmwrapreduce(Val(:min), v)
+@inline Base.prod(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val(:*), v)
+@inline Base.sum(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val(:+), v)
 
-@generated function Base.reduce(::Type{Val{Op}}, v::Vec{N,T}) where {Op,N,T}
+@generated function Base.reduce(::Val{Op}, v::Vec{N,T}) where {Op,N,T}
     @assert isa(Op, Symbol)
     z = getneutral(Op, T)
     stmts = []
@@ -1287,9 +1291,12 @@ end
     push!(stmts, :(v1[1]))
     Expr(:block, Expr(:meta, :inline), stmts...)
 end
+@inline function Base.reduce(::Type{Val{Op}}, v::Vec{N,T}) where {Op,N,T}  # accepts the type Val{Op}
+    Base.reduce(Val(Op), v)  # forward to the method that takes a Val(Op) instance
+end
 
-@inline Base.maximum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val{:max}, v)
-@inline Base.minimum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val{:min}, v)
+@inline Base.maximum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val(:max), v)
+@inline Base.minimum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val(:min), v)
 
 # Load and store functions
 
@@ -1322,8 +1329,8 @@ end
 
 export vload, vloada, vloadnt
 @generated function vload(::Type{Vec{N,T}}, ptr::Ptr{T},
-                          ::Type{Val{Aligned}} = Val{false},
-                          ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+                          ::Val{Aligned} = Val(false),
+                          ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal}
     @assert isa(Aligned, Bool)
     ptyp = llvmtype(Int)
     typ = llvmtype(T)
@@ -1355,39 +1362,54 @@ export vload, vloada, vloadnt
             NTuple{N,VE{T}}, Tuple{Ptr{T}}, ptr))
     end
 end
+@inline function vload(::Type{Vec{N,T}}, ptr::Ptr{T},  # pointer load taking Val{...} types
+                       ::Type{Val{Aligned}},
+                       ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+    vload(Vec{N, T}, ptr, Val(Aligned), Val(Nontemporal))  # forward to the Val-instance method
+end
 
 @inline vloada(::Type{Vec{N,T}}, ptr::Ptr{T}) where {N,T} =
-    vload(Vec{N,T}, ptr, Val{true})
+    vload(Vec{N,T}, ptr, Val(true))
 
 @inline vloadnt(::Type{Vec{N,T}}, ptr::Ptr{T}) where {N,T} =
-    vload(Vec{N,T}, ptr, Val{true}, Val{true})
+    vload(Vec{N,T}, ptr, Val(true), Val(true))
 
 @inline function vload(::Type{Vec{N,T}},
                        arr::FastContiguousArray{T,1},
                        i::Integer,
-                       ::Type{Val{Aligned}} = Val{false},
-                       ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+                       ::Val{Aligned} = Val(false),
+                       ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal}
     #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError())
-    vload(Vec{N,T}, pointer(arr, i), Val{Aligned}, Val{Nontemporal})
+    vload(Vec{N,T}, pointer(arr, i), Val(Aligned), Val(Nontemporal))
+end
+@inline function vload(::Type{Vec{N,T}},  # array load taking Val{...} types
+                       arr::FastContiguousArray{T,1},
+                       i::Integer,
+                       ::Type{Val{Aligned}},
+                       ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+    vload(Vec{N,T}, arr, i, Val(Aligned), Val(Nontemporal))  # forward to the Val-instance method
 end
 @inline function vloada(::Type{Vec{N,T}},
                         arr::FastContiguousArray{T,1},
                         i::Integer) where {N,T}
-    vload(Vec{N,T}, arr, i, Val{true})
+    vload(Vec{N,T}, arr, i, Val(true))
 end
 @inline function vloadnt(::Type{Vec{N,T}},
                         arr::Union{Array{T,1},SubArray{T,1}},
                         i::Integer) where {N,T}
-    vload(Vec{N,T}, arr, i, Val{true}, Val{true})
+    vload(Vec{N,T}, arr, i, Val(true), Val(true))
 end
 
 @inline vload(::Type{Vec{N,T}}, ptr::Ptr{T}, mask::Nothing,
-              ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned} =
-    vload(Vec{N,T}, ptr, Val{Aligned})
+              ::Val{Aligned} = Val(false)) where {N,T,Aligned} =
+    vload(Vec{N,T}, ptr, Val(Aligned))
+@inline vload(::Type{Vec{N,T}}, ptr::Ptr{T}, mask::Nothing,  # mask-less load taking the type Val{Aligned}
+              ::Type{Val{Aligned}}) where {N,T,Aligned} =
+    vload(Vec{N,T}, ptr, mask, Val(Aligned))  # forward to the Val-instance method
 
 @generated function vload(::Type{Vec{N,T}}, ptr::Ptr{T},
                           mask::Vec{N,Bool},
-                          ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned}
+                          ::Val{Aligned} = Val(false)) where {N,T,Aligned}
     @assert isa(Aligned, Bool)
     ptyp = llvmtype(Int)
     typ = llvmtype(T)
@@ -1421,28 +1443,39 @@ end
             NTuple{N,VE{T}}, Tuple{Ptr{T}, NTuple{N,VE{Bool}}}, ptr, mask.elts))
     end
 end
+@inline function vload(::Type{Vec{N,T}}, ptr::Ptr{T},
+                       mask::Vec{N,Bool},
+                       ::Type{Val{Aligned}}) where {N,T,Aligned}
+    vload(Vec{N,T}, ptr, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
+end
 
 @inline vloada(::Type{Vec{N,T}}, ptr::Ptr{T},
                mask::Union{Vec{N,Bool}, Nothing}) where {N,T} =
-    vload(Vec{N,T}, ptr, mask, Val{true})
+    vload(Vec{N,T}, ptr, mask, Val(true))  # masked pointer load with the `Aligned` flag set to true
 
 @inline function vload(::Type{Vec{N,T}},
                        arr::FastContiguousArray{T,1},
                        i::Integer, mask::Union{Vec{N,Bool}, Nothing},
-                       ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned}
+                       ::Val{Aligned} = Val(false)) where {N,T,Aligned}
     #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError())
-    vload(Vec{N,T}, pointer(arr, i), mask, Val{Aligned})
+    vload(Vec{N,T}, pointer(arr, i), mask, Val(Aligned))  # delegate to the pointer method, starting at element `i`
+end
+@inline function vload(::Type{Vec{N,T}},
+                       arr::FastContiguousArray{T,1},
+                       i::Integer, mask::Union{Vec{N,Bool}, Nothing},
+                       ::Type{Val{Aligned}}) where {N,T,Aligned}
+    vload(Vec{N,T}, arr, i, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
 end
 @inline function vloada(::Type{Vec{N,T}},
                         arr::FastContiguousArray{T,1}, i::Integer,
                         mask::Union{Vec{N,Bool}, Nothing}) where {N,T}
-    vload(Vec{N,T}, arr, i, mask, Val{true})
+    vload(Vec{N,T}, arr, i, mask, Val(true))  # masked array load with the `Aligned` flag set to true
 end
 
 export vstore, vstorea, vstorent
 @generated function vstore(v::Vec{N,T}, ptr::Ptr{T},
-                           ::Type{Val{Aligned}} = Val{false},
-                           ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+                           ::Val{Aligned} = Val(false),
+                           ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal}
     @assert isa(Aligned, Bool)
     @assert isa(Nontemporal, Bool)
     ptyp = llvmtype(Int)
@@ -1475,6 +1508,11 @@ export vstore, vstorea, vstorent
                       Cvoid, Tuple{NTuple{N,VE{T}}, Ptr{T}}, v.elts, ptr)
     end
 end
+@inline function vstore(v::Vec{N,T}, ptr::Ptr{T},
+                        ::Type{Val{Aligned}},
+                        ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+    vstore(v, ptr, Val(Aligned), Val(Nontemporal))  # compat: `Val{X}` types are forwarded as `Val(X)` instances
+end
 
 @inline vstorea(v::Vec{N,T}, ptr::Ptr{T}) where {N,T} = vstore(v, ptr, Val{true})
 
@@ -1483,11 +1521,18 @@ end
 @inline function vstore(v::Vec{N,T},
                         arr::FastContiguousArray{T,1},
                         i::Integer,
-                        ::Type{Val{Aligned}} = Val{false},
-                        ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+                        ::Val{Aligned} = Val(false),
+                        ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal}
     @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError())
-    vstore(v, pointer(arr, i), Val{Aligned}, Val{Nontemporal})
+    vstore(v, pointer(arr, i), Val(Aligned), Val(Nontemporal))  # call the `Val(X)` pointer method directly, starting at element `i`
 end
+@inline function vstore(v::Vec{N,T},
+                        arr::FastContiguousArray{T,1},
+                        i::Integer,
+                        ::Type{Val{Aligned}},
+                        ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+    vstore(v, arr, i, Val(Aligned), Val(Nontemporal))  # compat: `Val{X}` types are forwarded as `Val(X)` instances
+end
 @inline function vstorea(v::Vec{N,T}, arr::FastContiguousArray{T,1},
                          i::Integer) where {N,T}
     vstore(v, arr, i, Val{true})
@@ -1498,12 +1543,15 @@ end
 end
 
 @inline vstore(v::Vec{N,T}, ptr::Ptr{T}, mask::Nothing,
-               ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned} =
+               ::Val{Aligned} = Val(false)) where {N,T,Aligned} =
-    vstore(v, ptr, Val{Aligned})
+    vstore(v, ptr, Val(Aligned))  # `mask` is `nothing`: use the unmasked `Val(X)` pointer store directly
+@inline vstore(v::Vec{N,T}, ptr::Ptr{T}, mask::Nothing,
+               ::Type{Val{Aligned}}) where {N,T,Aligned} =
+    vstore(v, ptr, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
 
 @generated function vstore(v::Vec{N,T}, ptr::Ptr{T},
                            mask::Vec{N,Bool},
-                           ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned}
+                           ::Val{Aligned} = Val(false)) where {N,T,Aligned}
     @assert isa(Aligned, Bool)
     ptyp = llvmtype(Int)
     typ = llvmtype(T)
@@ -1537,6 +1585,11 @@ end
             v.elts, ptr, mask.elts)
     end
 end
+@inline function vstore(v::Vec{N,T}, ptr::Ptr{T},
+                        mask::Vec{N,Bool},
+                        ::Type{Val{Aligned}}) where {N,T,Aligned}
+    vstore(v, ptr, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
+end
 
 @inline vstorea(v::Vec{N,T}, ptr::Ptr{T},
                 mask::Union{Vec{N,Bool}, Nothing}) where {N,T} =
@@ -1546,11 +1599,19 @@ end
                         arr::FastContiguousArray{T,1},
                         i::Integer,
                         mask::Union{Vec{N,Bool}, Nothing},
-                        ::Type{Val{Aligned}} = Val{false},
-                        ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+                        ::Val{Aligned} = Val(false),
+                        ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal}
     #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError())
     vstore(v, pointer(arr, i), mask, Val{Aligned}, Val{Nontemporal})
 end
+@inline function vstore(v::Vec{N,T},
+                        arr::FastContiguousArray{T,1},
+                        i::Integer,
+                        mask::Union{Vec{N,Bool}, Nothing},
+                        ::Type{Val{Aligned}},
+                        ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
+    vstore(v, arr, i, mask, Val(Aligned), Val(Nontemporal))  # compat: `Val{X}` types are forwarded as `Val(X)` instances
+end
 @inline function vstorea(v::Vec{N,T},
                          arr::FastContiguousArray{T,1},
                          i::Integer,
@@ -1562,12 +1623,16 @@ export vgather, vgathera
 
 @inline vgather(
         ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Nothing,
-        ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned} =
-    vgather(Vec{N,T}, ptrs, Vec(ntuple(_ -> true, N)), Val{Aligned})
+        ::Val{Aligned} = Val(false)) where {N,T,Aligned} =
+    vgather(Vec{N,T}, ptrs, Vec(ntuple(_ -> true, N)), Val(Aligned))  # a `nothing` mask is replaced by an all-true mask
+@inline vgather(
+        ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Nothing,
+        ::Type{Val{Aligned}}) where {N,T,Aligned} =
+    vgather(Vec{N,T}, ptrs, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
 
 @generated function vgather(
         ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool},
-        ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned}
+        ::Val{Aligned} = Val(false)) where {N,T,Aligned}
     @assert isa(Aligned, Bool)
     ptyp = llvmtype(Int)
     typ = llvmtype(T)
@@ -1603,6 +1668,11 @@ export vgather, vgathera
             ptrs.elts, mask.elts))
     end
 end
+@inline function vgather(
+        ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool},
+        ::Type{Val{Aligned}}) where {N,T,Aligned}
+    vgather(Vec{N,T}, ptrs, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
+end
 
 @inline vgathera(::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}},
                  mask::Union{Vec{N,Bool}, Nothing}) where {N,T} =
@@ -1611,10 +1681,15 @@ end
 @inline vgather(arr::FastContiguousArray{T,1},
                 idx::Vec{N,<:Integer},
                 mask::Union{Vec{N,Bool}, Nothing} = nothing,
-                ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned} =
+                ::Val{Aligned} = Val(false)) where {N,T,Aligned} =
     vgather(Vec{N,T},
             pointer(arr) + sizeof(T) * (idx - 1),
-            mask, Val{Aligned})
+            mask, Val(Aligned))  # call the `Val(X)` pointer-vector method directly
+@inline vgather(arr::FastContiguousArray{T,1},
+                idx::Vec{N,<:Integer},
+                mask::Union{Vec{N,Bool}, Nothing},
+                ::Type{Val{Aligned}}) where {N,T,Aligned} =
+    vgather(arr, idx, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
 
 @inline vgathera(arr::FastContiguousArray{T,1},
                  idx::Vec{N,<:Integer},
@@ -1625,12 +1700,16 @@ export vscatter, vscattera
 
 @inline vscatter(
         v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Nothing,
-        ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned} =
-    vscatter(v, ptrs, Vec(ntuple(_ -> true, N)), Val{Aligned})
+        ::Val{Aligned} = Val(false)) where {N,T,Aligned} =
+    vscatter(v, ptrs, Vec(ntuple(_ -> true, N)), Val(Aligned))  # a `nothing` mask is replaced by an all-true mask
+@inline vscatter(
+        v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Nothing,
+        ::Type{Val{Aligned}}) where {N,T,Aligned} =
+    vscatter(v, ptrs, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
 
 @generated function vscatter(
         v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool},
-        ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned}
+        ::Val{Aligned} = Val(false)) where {N,T,Aligned}
     @assert isa(Aligned, Bool)
     ptyp = llvmtype(Int)
     typ = llvmtype(T)
@@ -1666,6 +1745,11 @@ export vscatter, vscattera
             v.elts, ptrs.elts, mask.elts)
     end
 end
+@inline function vscatter(
+        v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool},
+        ::Type{Val{Aligned}}) where {N,T,Aligned}
+    vscatter(v, ptrs, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
+end
 
 @inline vscattera(v::Vec{N,T}, ptrs::Vec{N,Ptr{T}},
                   mask::Union{Vec{N,Bool}, Nothing}) where {N,T} =
@@ -1674,8 +1758,13 @@ end
 @inline vscatter(v::Vec{N,T}, arr::FastContiguousArray{T,1},
                  idx::Vec{N,<:Integer},
                  mask::Union{Vec{N,Bool}, Nothing} = nothing,
-                 ::Type{Val{Aligned}} = Val{false}) where {N,T,Aligned} =
-    vscatter(v, pointer(arr) + sizeof(T) * (idx - 1), mask, Val{Aligned})
+                 ::Val{Aligned} = Val(false)) where {N,T,Aligned} =
+    vscatter(v, pointer(arr) + sizeof(T) * (idx - 1), mask, Val(Aligned))  # turn indices `idx` into addresses offset from `pointer(arr)`
+@inline vscatter(v::Vec{N,T}, arr::FastContiguousArray{T,1},
+                 idx::Vec{N,<:Integer},
+                 mask::Union{Vec{N,Bool}, Nothing},
+                 ::Type{Val{Aligned}}) where {N,T,Aligned} =
+    vscatter(v, arr, idx, mask, Val(Aligned))  # compat: `Val{Aligned}` type is forwarded as a `Val(Aligned)` instance
 
 @inline vscattera(v::Vec{N,T}, arr::FastContiguousArray{T,1},
                   idx::Vec{N,<:Integer},
@@ -1700,7 +1789,7 @@ end
 
 export shufflevector
 @generated function shufflevector(v1::Vec{N,T}, v2::Vec{N,T},
-                                  ::Type{Val{I}}) where {N,T,I}
+                                  ::Val{I}) where {N,T,I}
     M, decls, instrs = shufflevector_instrs(N, T, I, true)
     quote
         $(Expr(:meta, :inline))
@@ -1710,8 +1799,12 @@ export shufflevector
             v1.elts, v2.elts))
     end
 end
+@inline function shufflevector(v1::Vec{N,T}, v2::Vec{N,T},
+                               ::Type{Val{I}}) where {N,T,I}
+    shufflevector(v1, v2, Val(I))  # compat: `Val{I}` type is forwarded as a `Val(I)` instance
+end
 
-@generated function shufflevector(v1::Vec{N,T}, ::Type{Val{I}}) where {N,T,I}
+@generated function shufflevector(v1::Vec{N,T}, ::Val{I}) where {N,T,I}
     M, decls, instrs = shufflevector_instrs(N, T, I, false)
     quote
         $(Expr(:meta, :inline))
@@ -1721,6 +1814,9 @@ end
             v1.elts))
     end
 end
+@inline function shufflevector(v1::Vec{N,T}, ::Type{Val{I}}) where {N,T,I}
+    shufflevector(v1, Val(I))  # compat: `Val{I}` type is forwarded as a `Val(I)` instance
+end
 
 export VecRange
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 9808dfe..47f9b2b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -57,6 +57,8 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
     @testset "Element-wise access" begin
 
         for i in 1:L8
+            @test Tuple(setindex(V8I32(v8i32), 9.0, Val(i))) ===
+                ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8)
             @test Tuple(setindex(V8I32(v8i32), 9.0, Val{i})) ===
                 ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8)
             @test Tuple(setindex(V8I32(v8i32), 9.0, i)) ===
@@ -66,21 +68,28 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
             @test V8I32(v8i32)[i] === v8i32[i]
         end
 
+        @test_throws BoundsError setindex(V8I32(v8i32), 0, Val(0))
         @test_throws BoundsError setindex(V8I32(v8i32), 0, Val{0})
+        @test_throws BoundsError setindex(V8I32(v8i32), 0, Val(L8+1))
         @test_throws BoundsError setindex(V8I32(v8i32), 0, Val{L8+1})
         @test_throws BoundsError setindex(V8I32(v8i32), 0, 0)
         @test_throws BoundsError setindex(V8I32(v8i32), 0, L8+1)
+        @test_throws BoundsError V8I32(v8i32)[Val(0)]
         @test_throws BoundsError V8I32(v8i32)[Val{0}]
+        @test_throws BoundsError V8I32(v8i32)[Val(L8+1)]
         @test_throws BoundsError V8I32(v8i32)[Val{L8+1}]
         @test_throws BoundsError V8I32(v8i32)[0]
         @test_throws BoundsError V8I32(v8i32)[L8+1]
 
         for i in 1:L4
+            @test Tuple(setindex(V4F64(v4f64), 9, Val(i))) ===
+                ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4)
             @test Tuple(setindex(V4F64(v4f64), 9, Val{i})) ===
                 ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4)
             @test Tuple(setindex(V4F64(v4f64), 9, i)) ===
                 ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4)
 
+            @test V4F64(v4f64)[Val(i)] === v4f64[i]
             @test V4F64(v4f64)[Val{i}] === v4f64[i]
             @test V4F64(v4f64)[i] === v4f64[i]
         end
@@ -116,7 +125,9 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
         end
 
         for op in (<<, >>, >>>)
+            @test Tuple(op(V8I32(v8i32), Val(3))) === map(x->op(x,3), v8i32)
             @test Tuple(op(V8I32(v8i32), Val{3})) === map(x->op(x,3), v8i32)
+            @test Tuple(op(V8I32(v8i32), Val(-3))) === map(x->op(x,-3), v8i32)
             @test Tuple(op(V8I32(v8i32), Val{-3})) === map(x->op(x,-3), v8i32)
             @test Tuple(op(V8I32(v8i32), 3)) === map(x->op(x,3), v8i32)
             @test Tuple(op(V8I32(v8i32), -3)) === map(x->op(x,-3), v8i32)
@@ -630,6 +641,29 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
 
     @testset "Vector shuffles" begin
 
+        for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64)  # shuffles via the `Val(...)` instance form
+            a = Vec{4,T}((1,2,3,4))
+            b = Vec{4,T}((5,6,7,8))
+            @test shufflevector(a, b, Val((2,3,4,5))) === Vec{4,T}((3,4,5,6))
+            @test shufflevector(a, b, Val((1,7,5,5))) === Vec{4,T}((2,8,6,6))
+            @test shufflevector(a, b, Val(0:3)) === a
+            @test shufflevector(a, b, Val(4:7)) === b
+            @test shufflevector(a, Val((1,0,2,3))) === Vec{4,T}((2,1,3,4))
+            @test shufflevector(a, b, Val((0,1,4,5,2,3,6,7))) === Vec{8,T}((1,2,5,6,3,4,7,8))
+            @test shufflevector(shufflevector(a, b, Val((6,:undef,0,:undef))), Val((0,2))) === Vec{2,T}((7,1))
+            @test isa(shufflevector(a, Val((:undef,:undef,:undef,:undef))), Vec{4,T})  # only the result type is checked
+            c = Vec{8,T}((1:8...,))
+            d = Vec{8,T}((9:16...,))
+            @test shufflevector(c, d, Val((0,1,8,15))) === Vec{4,T}((1,2,9,16))
+            @test shufflevector(c, d, Val(1:2:15)) === Vec{8,T}((2:2:16...,))
+        end
+
+        let  # Bool vectors shuffled via the `Val(...)` instance form
+            a = Vec{4,Bool}((true,false,true,false))
+            b = Vec{4,Bool}((false,false,true,true))
+            @test shufflevector(a, b, Val((2,3,4,5))) === Vec{4,Bool}((true,false,false,false))
+        end
+
         for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64)
             a = Vec{4,T}((1,2,3,4))
             b = Vec{4,T}((5,6,7,8))