Skip to content

Commit

Permalink
pca
Browse files Browse the repository at this point in the history
  • Loading branch information
katarzynasornat committed Jul 24, 2018
0 parents commit f9523be
Showing 1 changed file with 87 additions and 0 deletions.
87 changes: 87 additions & 0 deletions iris_pca_essentials.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# PCA calculation

# data loading
data("iris")

# libraries loading
require(plotly)
require(dplyr)
require(factoextra)

# data vizualization using plotly library
p <- plot_ly(data = iris, x = ~Sepal.Length, y = ~Sepal.Width, z = ~Petal.Length, color = ~Petal.Width)

# calculating variances of variables
iris %>% summarise_all(var) # the highest variance in the direction of variable Assault

# own implementation

own.pca <- function(dataset, if.standarize = TRUE)
{
require(dplyr)
if(if.standarize){
dataset <- scale(dataset)
}
spectral_decomposition <- dataset %>% cov() %>% eigen()

transformed_data <- as.matrix(dataset) %*% spectral_decomposition$vectors

return(list(transf.data = transformed_data, loadings = spectral_decomposition$vectors, variances = spectral_decomposition$values))
}

# Bartlett Test
bart<-function(dat){ #dat is your raw data
R<-cor(dat)
p<-ncol(dat)
n<-nrow(dat)
chi2<- -((n-1)-((2*p)+5)/6 ) * log(det(R)) #this is the formula
df<-(p*(p-1)/2)
crit<-qchisq(.95,df) #critical value
p<-pchisq(chi2,df,lower.tail=F) #pvalue
cat("Bartlett's test of sphericity: X2(",
df,")=",chi2,", p=",
round(p,3),sep="" )
}

# we should do the PCA: null hypotesis rejected


pca1 <- own.pca(iris[,1:4])
pca2 <- prcomp(iris[,1:4], scale = TRUE)
pca3 <- princomp(~.,data=iris[,1:4])

# first look at the eigenvalues
pca1$variances

# pca1 and pca2 the same output

# three methods to determine proper number of components
# 1. elbow
# 2. 70% of variance explained
# 3. eigenvalues >1 - means that new component explains more variance than one standarized variable (then variance is 1)

# Rule 3. shows that even one component describes our data in good way. But if we look at the elbow plot then we suppose to pick two components.
fviz_eig(pca2, addlabels = TRUE, ylim = c(0, 90))

summary(pca2)
# We can see in the third row, that first two components explain about 96% of total variance.
var <- get_pca_var(pca2)
var$cos2

# Value of cos2 for the first component is high for almost every variable except 'Sepal.Width'. It means that if we use only first component then explanation of 'Sepal.Width' would be poor. We decide to use first two components.
pca2$rotation
var$coord
var$contrib

# Petal.Length is positively and highly correlated with the first component (var$coord), so the longer petal, the highest value of the first component.
# Furthermore coordinate which corresponds to Petal.Length for the first component is the highest and equals 0.5804131 (pca2$rotation). Also the contribution is pretty high (33.69%).
# We analyze the rest in the same way and can describe the first component as 'petal.size' although we should remember that Sepal.Length has also significant impact.
# In interpretation we should remember: the highest values of petal size, the highest values of pc1. The second we can call 'Sepal.Width'. And here we have: the highest values of pc2, the lower sepal width.

new_values <- pca2$x
fviz_pca_var(pca2, col.var = "black")

biplot(pca2)

# http://manuals.pqstat.pl/statpqpl:redpl:pcapl
# http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/

0 comments on commit f9523be

Please sign in to comment.