# February 26, 2007 BST Discussion Multiple Testing and Annotation library(Biobase) library(affy) library(affydata) library(hgu95av2cdf) library(vsn) library(hgu95av2) # Multiple Testing # Eliminate genes with low expression levels and low variability # Eliminate genes with fluorescenes less than 100 in at least 25% of the samples # and with an IQR on the log base 2 scale less than 0.5 library(genefilter) fun1 <- pOverA(0.25, log2(100)) fun2 <- function(x) (IQR(x)>0.5) funfun <- filterfun(fun1, fun2) Screened <- genefilter(log2(exprs(Processed)), funfun) ScreenGenes <- exprs(Processed[Screened,]) # Now we have a matrix of the genes dim(ScreenGenes) names(ScreenGenes) EstLevels <- pd[[1]] # Testing for differences in mean expression levels between High and Low Estrogen # Combined time periods # Two sided tests # Using "parametric" marginals # t-test with unequal variances Ttests <- mt.teststat(ScreenGenes,classlabel=EstLevels,test="t") TRawP <- round(2*(1-pt(abs(Ttests), df=6)),4) # Wilcoxon test Wiltests <- mt.teststat(ScreenGenes,classlabel=EstLevels,test="wilcoxon") # Using permutation marginals to get raw p-values resT <- mt.maxT(ScreenGenes, test="t", classlabel=EstLevels, B=10000) # Note resT$teststat returns t statistics in descending order of absolute t # To get t statistics in order of original data rawTs <- resT$teststat[order(resT$index)] rawPs <- resT$rawp[order(resT$index)] # This is not a good data set to use permutation on # That's why we have few distinct p-values sort(resT$rawp) length(which(resT$rawp < 0.05)) # The mt.maxT function also gives us adjusted p-values based on maxT procedure # using permutation based method of Westfall and Young adjPs <- resT$rawp[order(resT$index)] length(which(resT$adjp < 0.05)) # If we increase or threshold to 0.1 length(which(resT$adjp < 0.1)) # We can also easily use a non-parametric test by specifying test="wilcoxon" mt.maxT(ScreenGenes, test="wilcoxon", classlabel=EstLevels, B=10000) # Can similarly use minP procedure resP <- mt.minP(ScreenGenes, test="t", classlabel=EstLevels, B=10000) sort(resP$rawp) length(which(resP$rawp < 0.05)) length(which(resP$adjp < 0.05)) # If we increase or threshold to 0.1 length(which(resP$adjp < 0.1)) # Much different answer. Permutation is not good approach for this data set # Can turn raw p-values from any procedure into adjusted p-values # for FDR, gFWER or TPPFP rates # Bonferroni FWER BonRates <- mt.rawp2adjp(TRawP, proc="Bonferroni")$adjp mt.rawp2adjp(resT$rawp[order(index)], proc="Bonferroni")$adjp # FDR rates mt.rawp2adjp(TRawP, proc="BH")$adjp # TPPFP rates fwer2tppfp(BonRates[,2], q = 0.05) # Start of Annotation Discussion #Annotation # Lists "databases" available for the array help(hgu95av2) # Get lits of probe sets affyID <- ls(env=hgu95av2) probeNames(hgu95av2) # Continuation from Feb 12 # Get the genes that differed at this level tGenes <- tGenes[tGenes] # Take first 5 Five <- names(tGenes[1:5]) First <- Five[1] # Let's find some information about this gene # What's its accession number? get(First, env=hgu95av2ACCNUM) # What are its PubMed number(s)? get(First, env=hgu95av2PMID # What's its name? get(First, env=hgu95av2GENENAME) # Which chromosome is it on? get(First, env=hgu95av2CHR) # Get gene ontology information ? get(First, env=hgu95av2GO) # Can do most of these tasks for multiple genes simultaneously # with built in functions # Get GenBank Accession number. Requires internet connection # Not working for some reason getGI(Five) # Get sequence information given GenBank Accession number getSEQ(125081) PMIDS <- getPMID(Five, data="hgu95av2") GG <- getGO(Five, data="hgu95av2") # This requires GO package getGOdesc(GG[[2]], "MF") # Find PMIDs that are shared by 2 genes PMIDS[[1]][(PMIDS[[1]] %in% PMIDS[[2]])] # Connect to PubMed and get references for first Gene browseURL("www.r-project.org") pubmed(PMIDS[1], disp="browser") # Need XML package to get abstracts # Can also query Genbank ACCN <- get(First, env=hgu95av2ACCNUM) genbank(ACCN, disp="browser") # Generate an HTML report llid <- getLL(Five, data="hgu95av2") symb <- getSYMBOL(Five, data="hgu95av2") map <- sapply(1:5, function(x) get(Five[x], env=hgu95av2MAP)) chr <- sapply(1:5, function(x) get(Five[x], env=hgu95av2CHR)) res <- data.frame(Five, cbind(unlist(symb), unlist(chr), unlist(map))) names(res) <- c("Locus Link ID", "Gene symbol", "Chromosome", "Chromosomal Location") htmlpage(llid, filename="ll.html", title="HTML report for 5 Genes", othernames=res, table.head=c("LocusID", names(res)), table.center=TRUE)