## CGRS - Control Genes Ranking System ## ## Copyright (c) 2008, Swiss Institute of Bioinformatics ## All rights reserved. ## ## The contents of this file are subject to the Mozilla Public License ## Version 1.1 (the "License"); you may not use this file except in ## compliance with the License. You may obtain a copy of the License at ## http://www.mozilla.org/MPL/ ## ## Software distributed under the License is distributed on an "AS IS" ## basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the ## License for the specific language governing rights and limitations ## under the License. ## ## Author: Vlad Popovici (vlad.popovici@isb-sib.ch) ## ## These functions are designed to accompany the paper ## Popovici V., et. al., "Selecting control genes for RT-QPCR using ## public microarray data". Please check the paper for details on formulas ## and on the general approach. ## Before using the functions from this mini-library, you have to prepare ## your data. The expression matrices used by the functions below are supposed ## to contain gene-level expression values, so you have to summarize (using ## your preferred approach) your probeset-level (or probe-level) data. We ## suggest selecting that probeset/probe that manifests the higher variability ## within a data set. But this remains an open question. ## ## Typical usage (usually placed in a loop): ## -load your data in a matrix, say X (one gene per row) ## -get the statistics: ## d = get.stats(X) ## -rank the genes in this data set: ## l = get.ranked.list(d) ## -add the list to a gloabal list: ## S = c(S, list(l)) ## At the end, aggregate the lists: ## final.list = names(rkprod.score(S)) require(ggplot2) ## SCORE.STABILITY - gene stability score ## ## score = alpha*log2(max{mu - beta, 0}) - sigma ## ## Parameters: ## mu - the vector of mean expressions ## sigma - the vector of standard deviations ## alpha, beta - user define parameters (default values as in the paper) ## rounding - the scores may be rounded to significant decimals ## ## Returns: ## a vector of scores score.stability = function(mu, sigma, alpha=.25, beta=quantile(mu,probs=c(0.25))[1], rounding=-1) { if (length(mu) != length(sigma)) stop("[score.stability2:] the 2 vectors must have the same length!") s = rep(NA, length(mu)) s = alpha*log2(pmax(mu-beta,0)) - sigma if (rounding >= 0) { s = round(s, rounding) } s[mu <= beta] = -Inf # just to be sure... if (!is.null(names(mu))) names(s) = names(mu) return (s) } ## GET.STATS - given an expression matrix (genes by rows), it returns the ## relevant statistics for each gene: mean, standard deviation and normalized ## sd, as well as the scores. ## ## Parameters: ## X - the expression matrix (genes by rows) ## score.function - the scoring function ## ... - other parameters (e.g. alpha, beta) for the scoring ## function ## ## Returns: ## a data frame with the columns containing the relevant statistics get.stats = function(X, score.function=score.stability, ...) { mu = apply(X, 1, function(z) (mean(z, na.rm=TRUE))) sg = apply(X, 1, function(z) (sd(z, na.rm=TRUE))) # normalize the SD m = loess(y ~ x, data=data.frame(x=mu,y=sg), degree=1, span=.5) sg.new = predict(m, data=data.frame(x=mu,y=sg)) delta = .1 sg.new = sg / (sg.new + delta) d = data.frame(Mean=mu, Std.Dev=sg.new, Score=score.function(mu, sg.new, ...), Std.Dev.Nonrm=sg) rownames(d) = rownames(X) return(d) } ## GET.RANKED.LIST - ranks the genes by their scores: sort decreasingly ## the genes from a data set, according to their scores. Break the ties ## by consindering the mean expressions. ## ## Parameters: ## d - a data frame containing the relevant statistics (see get.stats()) ## ## Returns: ## a ranked list of genes get.ranked.list = function(d) { p = order(d[,c("Score","Mean")], decreasing=TRUE) # sort by Score and then by Mean l = rownames(d)[p] return (l) } ## RKPROD.SCORE computes the (-log of the) rank product score over a number ## of ranked lists. ## ## Parameters: ## S - a list of ranked lists. Each individual list (i.e. S[[i]], i=1,2,...) ## must contain the names of the genes ordered decreasingly by ## their respective scores. ## ## Returns: ## a ranked list of product scores (aggregated list). To get the ## names of the genes, use names(the_returned_list) ## ## For details, see: ## Breitling, et al., "Rank products: a simple, yet powerful, new method ## to detect differentially regulated genes in replicated microarray ## experiments", FEBS Letters 573 (2004), p.83-92 rkprod.score = function(S) { m = length(S) # number of datasets # build the list of genes present in S glist = NULL v = rep(0, m) for (i in 1:m) { glist = union(glist, S[[i]]) } scores = rep(0, length(glist)) # final aggregated list names(scores) = glist ng = rep(0, length(glist)) # counts how many times each gene is present names(ng) = glist for (k in 1:m) { # the lists are already sorted, so the ranks are 1,2,... scores[ S[[k]] ] = scores[ S[[k]] ] + log10(1:length(S[[k]])) ng[ S[[k]] ] = ng[ S[[k]] ] + 1 } # we change the sign of the scores so that high scores correspond to # better choices (and lower ranks): scores = 5 - scores / ng # if we get div. by. 0, then something is very wrong! # we add 5 in log10 scale (corresponds to 100000) to get the rank # scores closer to 0. Not necessary, but it's nicer than # having all of them large negative values... scores = sort(scores, decreasing=TRUE) return (scores) } ######### Utility functions for making plots ## PLOT.TOPK - plots the top k control genes in a mean-SD plane, along ## with the other genes from a data frame. ## ## Parameters: ## d - a data frame containing the relevant statistics (see get.stats()) ## glist - the list of top genes ## title - a string to be used as title ## ## Returns: ## a ggplot2 object. Use print(ggplot2_object) for plotting. plot.topk = function(d, glist, title="") { opts = ggopt() ggopt(axis.colour = "black", background.colour = "black", background.fill = "white", border.colour = "grey90", grid.colour = "grey90", grid.minor.colour = "white", grid.fill="white") mn = min(d[is.finite(d[,"Score"]),"Score"]) mx = max(d[is.finite(d[,"Score"]),"Score"]) qp = ggplot(d) qp = qp + geom_point(aes(x=Mean, y=Std.Dev, colour=Score), size=1.5) + scale_colour_gradient(low="grey90",high="black", limits=c(mn-.1,mx+.1)) qp = qp + geom_point(data=data.frame(x=d[glist,"Mean"],y=d[glist,"Std.Dev"]), aes(x=x,y=y), colour="red3", size=2) qp = qp + geom_point(data=data.frame(x=d[!is.finite(d$Score),"Mean"], y=d[!is.finite(d$Score),"Std.Dev"]), aes(x=x,y=y), colour="grey80", size=1.5) ggopt(opts) return (qp) } ## PLOT.SCORES - plots the scores (grey scale) and the contour curves ## ## Parameters: ## dset - a data set (frame) with the relevant statistics ## score.function - the scoring function ## step - distance between contour levels ## ... - parameters for the scoring function ## ## Returns: ## a ggplot2 object plot.scores = function(dset, score.function=score.stability, step=1.5, ...) { s = as.matrix(dset[,c("Mean","Std.Dev")]) z = dset[,"Score"] d = data.frame(Mean=s[,1], Std.Dev=s[,2], Score=z) xmin = min(d[,"Mean"], na.rm=TRUE) - 0.1 xmax = max(d[,"Mean"], na.rm=TRUE) + 0.1 ymin = min(d[,"Std.Dev"], na.rm=TRUE) - 0.1 ymax = max(d[,"Std.Dev"], na.rm=TRUE) - 0.1 xt = seq(xmin, xmax, (xmax-xmin)/50) yt = seq(ymin, ymax, (ymax-ymin)/50) c = matrix(0, nrow=length(xt)*length(yt), ncol=3) for (i in 1:length(xt)) { for (j in 1:length(yt)) { k = (i-1)*length(yt)+j c[k,1] = xt[i] c[k,2] = yt[j] } } c[,3] = score.function(c[,1], c[,2], beta=quantile(s[,1],probs=c(0.25))[1],...) c = data.frame(c) colnames(c) = c("Mean", "Std.Dev", "Score") old.ggopt = ggopt() ggopt(axis.colour = "black", background.colour = "black", background.fill = "white", border.colour = "grey90", grid.colour = "grey90", grid.minor.colour = "white", grid.fill="white") idx = which(is.finite(d[,"Score"])) q = qplot(x=Mean, y=Std.Dev, z=Score, data=d[idx,], colour=Score) q = q + scale_colour_gradient(low="grey80",high="black", limits=c(min(z[idx])-.1,max(z[idx])+.1),name="Score") + stat_contour(data=c,colour="red1",size=2) + scale_z_continuous(breaks=seq(min(z[idx]),min(max(z[idx]),60),step)) if (sum(!is.finite(z)) > 0) { q = q + geom_point(data=data.frame(Mean=d[-idx,"Mean"], Std.Dev=d[-idx,"Std.Dev"], Score=-10), shape=1, colour="grey80") } ggopt(old.ggopt) return (q) }