Skip to content

Commit

Permalink
Purge num_threads (#454)
Browse files Browse the repository at this point in the history
  • Loading branch information
chriselrod authored Jan 4, 2023
1 parent 35f8310 commit ac31711
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 105 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LoopVectorization"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
authors = ["Chris Elrod <[email protected]>"]
version = "0.12.143"
version = "0.12.144"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down Expand Up @@ -34,7 +34,7 @@ ArrayInterface = "6"
ArrayInterfaceCore = "0.1.5"
ArrayInterfaceOffsetArrays = "0.1.2"
ArrayInterfaceStaticArrays = "0.1.2"
CPUSummary = "0.1.3 - 0.1.8, 0.1.11"
CPUSummary = "0.1.3 - 0.1.8, 0.1.11, 0.2.1"
ChainRulesCore = "1"
CloseOpenIntervals = "0.1.10"
DocStringExtensions = "0.8, 0.9"
Expand All @@ -43,7 +43,7 @@ HostCPUFeatures = "0.1.10"
IfElse = "0.1"
LayoutPointers = "0.1.11"
OffsetArrays = "1.4.1"
PolyesterWeave = "0.1.10"
PolyesterWeave = "0.1.10, 0.2"
SIMDDualNumbers = "0.1"
SIMDTypes = "0.1"
SLEEFPirates = "0.6.23"
Expand Down
9 changes: 7 additions & 2 deletions src/LoopVectorization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,13 @@ using VectorizationBase:
maybestaticsize#,zero_mask

using HostCPUFeatures:
pick_vector_width, register_size, register_count, has_opmask_registers, unwrap, get_cpu_name
using CPUSummary: num_threads, num_cores, cache_linesize, cache_size
pick_vector_width,
register_size,
register_count,
has_opmask_registers,
unwrap,
get_cpu_name
using CPUSummary: num_cores, cache_linesize, cache_size


using IfElse: ifelse
Expand Down
2 changes: 1 addition & 1 deletion src/broadcast.jl
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ end
::Val{UNROLL},
::Val{dontbc},
) where {T<:NativeTypes,N,BC<:Union{Broadcasted,Product},Mod,UNROLL,dontbc}
2 + 1
# 2 + 1
# we have an N dimensional loop.
# need to construct the LoopSet
ls = LoopSet(Mod)
Expand Down
39 changes: 20 additions & 19 deletions src/codegen/lower_threads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,6 @@ struct StaticType{T} end
)
end

# function approx_cbrt(x)
# s = significand(x)
# e = exponent(x)

# # 40 + 0.00020833333333333335*(x-64000) -2.1701388888888896e-9*(x-64000)^2*0.5 + 5.6514033564814844e-14 * (x-64000)^3/6
# end
lv_max_num_threads() = ifelse(gt(num_threads(), num_cores()), num_cores(), num_threads())

@generated function calc_factors(::StaticInt{nc}) where {nc}
t = Expr(:tuple)
for i ∈ nc:-1:1
Expand Down Expand Up @@ -148,10 +140,10 @@ end

# if a threaded loop is vectorized, call
@inline function choose_num_blocks(M, ::StaticInt{U}, nt) where {U}
_choose_num_blocks(M % UInt, StaticInt{U}(), nt, lv_max_num_threads())
_choose_num_blocks(M % UInt, StaticInt{U}(), nt, num_cores())
end
# otherwise, call
@inline choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} =
@inline choose_num_blocks(nt, ::StaticInt{NC} = num_cores()) where {NC} =
@inbounds choose_num_block_table(StaticInt{NC}())[nt]

scale_cost(c) = @fastmath c * (Sys.ARCH === :x86_64 ? 0.0225 : 0.005625)
Expand All @@ -168,12 +160,15 @@ end
NT::UInt,
x::Base.BitInteger,
) where {T<:Union{Float32,Float64}}
min(
Base.fptoui(
UInt,
Base.ceil_llvm(Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x)))),
max(
min(
Base.fptoui(
UInt,
Base.ceil_llvm(Base.mul_float_fast(C, Base.sqrt_llvm_fast(Base.uitofp(T, x)))),
),
NT,
),
NT,
one(UInt),
)
end
function push_loop_length_expr!(q::Expr, ls::LoopSet)
Expand Down Expand Up @@ -431,9 +426,12 @@ function thread_one_loops_expr(
if all(isstaticloop, ls.loops)
_num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
_num_threads > 1 || return avx_body(ls, UNROLL)
choose_nthread = Expr(:(=), Symbol("#nthreads#"), _num_threads)
ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt)
choose_nthread =
Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads))
else
choose_nthread = :(_choose_num_threads($(Float32(c)), $ntmax))
choose_nthread =
:(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax)))
push_loop_length_expr!(choose_nthread, ls)
choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
end
Expand Down Expand Up @@ -623,9 +621,12 @@ function thread_two_loops_expr(
if all(isstaticloop, ls.loops)
_num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
_num_threads > 1 || return avx_body(ls, UNROLL)
choose_nthread = Expr(:(=), Symbol("#nthreads#"), _num_threads)
ntcallexpr = Expr(:call, %, Expr(:call, Threads.nthreads), UInt)
choose_nthread =
Expr(:(=), Symbol("#nthreads#"), Expr(:call, min, ntcallexpr, _num_threads))
else
choose_nthread = :(_choose_num_threads($(Float32(c)), $ntmax))
choose_nthread =
:(_choose_num_threads($(Float32(c)), min(Threads.nthreads() % UInt, $ntmax)))
push_loop_length_expr!(choose_nthread, ls)
choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
end
Expand Down
26 changes: 12 additions & 14 deletions src/condense_loopset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,7 @@ val(x) = Expr(:call, Expr(:curly, :Val, x))
ri = argmin(R)
quote
$(Expr(:meta, :inline))
p, li =
VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1), one($I)),), strides(x))
p, li = VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1), one($I)),), strides(x))
ptr = gep(p, li)
si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}(
(getfield(strides(x), $ri),),
Expand Down Expand Up @@ -572,7 +571,7 @@ end
StaticInt{W}(),
register_size(),
available_registers(),
lv_max_num_threads(),
num_cores(), #FIXME
cache_linesize(),
)
end
Expand Down Expand Up @@ -814,11 +813,12 @@ function generate_call_types(
add_external_functions!(extra_args, ls) # extract_external_functions!
add_outerreduct_types!(extra_args, ls) # extract_outerreduct_types!
argcestimate = length(extra_args.args) - 1
for ref = ls.refs_aliasing_syms
for ref in ls.refs_aliasing_syms
argcestimate += length(ref.loopedindex)
end
manyarg = !debug && (argcestimate > 16)
func = debug ? lv(:_turbo_loopset_debug) : (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!))
func =
debug ? lv(:_turbo_loopset_debug) : (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!))
q = Expr(
:call,
func,
Expand All @@ -835,18 +835,12 @@ function generate_call_types(
vargsym = gensym(:vargsym)
push!(
q.args,
Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym))
Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym)),
)
if manyarg
push!(
q.args,
Expr(:call, lv(:flatten_to_tuple), vargsym),
)
push!(q.args, Expr(:call, lv(:flatten_to_tuple), vargsym))
else
push!(
q.args,
Expr(:(...), Expr(:call, lv(:flatten_to_tuple), vargsym)),
)
push!(q.args, Expr(:(...), Expr(:call, lv(:flatten_to_tuple), vargsym)))
end
Expr(:block, Expr(:(=), vargsym, Expr(:tuple, lbarg, extra_args)))
end
Expand Down Expand Up @@ -943,6 +937,10 @@ for f ∈ (convert, reinterpret, trunc, unsafe_trunc, round, ceil, floor)
@eval can_turbo(::typeof($f), ::Val{2}) = true
end

# @inline function _can_turbo(f::F, t::Vararg{Any,K}) where {F,K}
# Base.promote_op(f, t...) !== Union{}
# end

"""
check_turbo_safe(ls::LoopSet)
Expand Down
1 change: 1 addition & 0 deletions test/.JuliaFormatter.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
indent = 2
1 change: 1 addition & 0 deletions test/precompile/LVUser/.JuliaFormatter.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
indent = 2
Loading

2 comments on commit ac31711

@chriselrod
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/75088

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.12.144 -m "<description of version>" ac317118c1eebf6332f4d4c222b0ef290399c879
git push origin v0.12.144

Please sign in to comment.