-
Notifications
You must be signed in to change notification settings - Fork 120
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Issue #64: added n_init to kmeans #78
Open
lbollar
wants to merge
8
commits into
JuliaStats:master
Choose a base branch
from
lbollar:n_init
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 6 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
775eefc
Issue #64: added n_init to kmeans
lbollar f85cce7
cleaned up for PR
lbollar 3ea7c39
removed tabs
lbollar fd296d4
removed line breaks to conform with style
lbollar 50f0654
fixed another whitespace conflict
lbollar a9cb7ab
used vim command to hopefully fix trailing whitespace issues
lbollar b8b381b
extraneous newline
lbollar 5ae2901
Merge branch 'master' into n_init
kmsquire File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,18 +9,19 @@ type KmeansResult{T<:AbstractFloat} <: ClusteringResult | |
counts::Vector{Int} # number of samples assigned to each cluster (k) | ||
cweights::Vector{Float64} # cluster weights (k) | ||
totalcost::Float64 # total cost (i.e. objective) (k) | ||
iterations::Int # number of elapsed iterations | ||
iterations::Int # number of elapsed iterations | ||
converged::Bool # whether the procedure converged | ||
end | ||
|
||
const _kmeans_default_init = :kmpp | ||
const _kmeans_default_maxiter = 100 | ||
const _kmeans_default_tol = 1.0e-6 | ||
const _kmeans_default_display = :none | ||
const _kmeans_default_n_init = 10 | ||
|
||
function kmeans!{T<:AbstractFloat}(X::Matrix{T}, centers::Matrix{T}; | ||
weights=nothing, | ||
maxiter::Integer=_kmeans_default_maxiter, | ||
maxiter::Integer=_kmeans_default_maxiter, | ||
tol::Real=_kmeans_default_tol, | ||
display::Symbol=_kmeans_default_display) | ||
|
||
|
@@ -34,27 +35,42 @@ function kmeans!{T<:AbstractFloat}(X::Matrix{T}, centers::Matrix{T}; | |
counts = Array(Int, k) | ||
cweights = Array(Float64, k) | ||
|
||
_kmeans!(X, conv_weights(T, n, weights), centers, | ||
assignments, costs, counts, cweights, | ||
_kmeans!(X, conv_weights(T, n, weights), centers, | ||
assignments, costs, counts, cweights, | ||
round(Int, maxiter), tol, display_level(display)) | ||
end | ||
|
||
function kmeans(X::Matrix, k::Int; | ||
function kmeans(X::Matrix, k::Int; | ||
weights=nothing, | ||
init=_kmeans_default_init, | ||
maxiter::Integer=_kmeans_default_maxiter, | ||
maxiter::Integer=_kmeans_default_maxiter, | ||
n_init::Integer=_kmeans_default_n_init, | ||
tol::Real=_kmeans_default_tol, | ||
display::Symbol=_kmeans_default_display) | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. One last remaining extraneous newline. |
||
m, n = size(X) | ||
(2 <= k < n) || error("k must have 2 <= k < n.") | ||
iseeds = initseeds(init, X, k) | ||
centers = copyseeds(X, iseeds) | ||
kmeans!(X, centers; | ||
weights=weights, | ||
maxiter=maxiter, | ||
tol=tol, | ||
display=display) | ||
n_init > 0 || throw(ArgumentError("n_init must be greater than 0")) | ||
|
||
lowestcost::Float64 = Inf | ||
local bestresult::KmeansResult | ||
|
||
for i = 1:n_init | ||
iseeds = initseeds(init, X, k) | ||
centers = copyseeds(X, iseeds) | ||
result = kmeans!(X, centers; | ||
weights=weights, | ||
maxiter=maxiter, | ||
tol=tol, | ||
display=display) | ||
|
||
if result.totalcost < lowestcost | ||
lowestcost = result.totalcost | ||
bestresult = result | ||
end | ||
end | ||
return bestresult | ||
end | ||
|
||
#### Core implementation | ||
|
@@ -68,8 +84,8 @@ function _kmeans!{T<:AbstractFloat}( | |
costs::Vector{T}, # out: costs of the resultant assignments (n) | ||
counts::Vector{Int}, # out: the number of samples assigned to each cluster (k) | ||
cweights::Vector{Float64}, # out: the weights of each cluster | ||
maxiter::Int, # in: maximum number of iterations | ||
tol::Real, # in: tolerance of change at convergence | ||
maxiter::Int, # in: maximum number of iterations | ||
tol::Real, # in: tolerance of change at convergence | ||
displevel::Int) # in: the level of display | ||
|
||
# initialize | ||
|
@@ -153,7 +169,7 @@ function _kmeans!{T<:AbstractFloat}( | |
end | ||
end | ||
|
||
return KmeansResult(centers, assignments, costs, counts, cweights, | ||
return KmeansResult(centers, assignments, costs, counts, cweights, | ||
@compat(Float64(objv)), t, converged) | ||
end | ||
|
||
|
@@ -245,7 +261,7 @@ function update_centers!{T<:AbstractFloat}( | |
n::Int = size(x, 2) | ||
k::Int = size(centers, 2) | ||
|
||
# initialize center weights | ||
# initialize center weights | ||
for i = 1 : k | ||
if to_update[i] | ||
cweights[i] = 0. | ||
|
@@ -299,7 +315,7 @@ function update_centers!{T<:AbstractFloat}( | |
n::Int = size(x, 2) | ||
k::Int = size(centers, 2) | ||
|
||
# initialize center weights | ||
# initialize center weights | ||
for i = 1 : k | ||
if to_update[i] | ||
cweights[i] = 0. | ||
|
@@ -314,7 +330,7 @@ function update_centers!{T<:AbstractFloat}( | |
if wj > 0 | ||
@inbounds cj = assignments[j] | ||
1 <= cj <= k || error("assignment out of boundary.") | ||
|
||
if to_update[cj] | ||
rj = view(centers, :, cj) | ||
xj = view(x, :, j) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand that
n_init
comes from Python's sklearn (#64), but it doesn't sound like the best choice to me. Maybe something like
n_tries
to reflect that the parameter defines how many times the algorithm, rather than some initialization procedure, is run? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
or
ntries
? And wouldn't it be overkill to run 10 times? I recommend a default value of 1, because usually a quick partitioning is required and not necessarily the best one. And, if one needs to find the best clustering, this parameter can be set to a larger value explicitly. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
10 is what sklearn does, and it sounds reasonable to me.
It isn't unusual to run it 1000s of times (that was done as the baseline for the affinity propagation paper).
If someone needs a quick partition, they can ask for it.
The default shouldn't be so sensitive to random factors.
I think 10 strikes the right balance.
Though I could see an argument for 3 or 30.