# March 5, 2007 BST 226 Discussion Cluster Analysis library(Biobase) library(affy) library(affydata) library(hgu95av2cdf) library(estrogen) library(hgu95av2) library(multtest) library(genefilter) setwd("c:/Program Files/R/R-2.4.0/library/estrogen/extdata") pd <- read.phenoData("estrogen.txt", header=TRUE, row.names=1) EstArrays <-ReadAffy(filenames=rownames(pData(pd)),phenoData=pd, verbose=TRUE) Processed <- expresso(EstArrays, bgcorrect.method="rma", normalize.method="constant", summary.method="avgdiff", pmcorrect.method="pmonly") # Multiple Testing # Eliminate genes with low expression levels and low variability # Eliminate genes with fluorescenes less than 100 in at least 25% of the samples # and with an IQR on the log base 2 scale less than 0.5 fun1 <- pOverA(0.25, log2(100)) fun2 <- function(x) (IQR(x)>0.5) funfun <- filterfun(fun1, fun2) Screened <- genefilter(log2(exprs(Processed)), funfun) ScreenGenes <- exprs(Processed[Screened,]) library(cluster) library(hopach) # Distance Measures # Pull out one sample from each group and Look at first 50 genes only for computational speed SmallSet <- ScreenGenes[1:50,c(1,3,5,7)] # Calculate Euclidean distances bewteen expression levels of first 50 genes in the ScreenGenes set # dist returns an object of class dist SmallDist <- dist(SmallSet, method="euclidean") # Calculate cosine angle distances bewteen expression levels of first 50 genes in the ScreenGenes set # distancematrix returns a matrix CosDist <- distancematrix(SmallSet, d="cosangle", na.rm=TRUE) # We're going to look using pam and hopach. For pam need to determine number if clusters # Determine Number of Clusters for each set # Note arguments must be matrices not dist objects msscheck(CosDist) silcheck(CosDist, diss=TRUE) msscheck(as.matrix(SmallDist)) silcheck(as.matrix(SmallDist), diss=TRUE) # Note number of clusters differs depending on the distance metric # Apply clustering pam and hopach algorithms and compare # Note pam can take a distance object while hopach cannot pamEuc <- pam(SmallDist, k=9) hopachEuc <- hopach(SmallSet, dmat=SmallDist, d="euclid") pamCos <- pam(CosDist, k=3) hopachCos <- hopach(SmallSet, dmat=CosDist, d="cosangle") # Explore output of pam function names(pamEuc) help(pam.object) # returns genes that are used as medioid pamEuc$medoids # returns corresponding row indices pamEuc$id.med # Returns vector indicating which cluster each gene is in pamEuc$clustering # Returns information about the clusters (e.g. distance within and between) pamEuc$clusinfo # Look at results par(mfrow=c(1,2)) plot(pamCos) # Get cluster assignments of each gene pamCos$clustering # Get names of all genes in cluster 1 dimnames(SmallSet[which(pamCos$clustering==1),])[[1]] # Explore output of hopach function names(hopachCos) hopachCos$clustering help(hopach) names(hopachCos$final) hopachCos$final$labels # Plot hopach clustering dplot(CosDist, hopachobj=hopachCos) # Hierarchical Clustering Algorithms plot(hclust(SmallDist)) diana(SmallSet) plot(diana(SmallSet))