#####################################################
# Differential Expression C57BL/6J-chrYA vs C57BL/6J
# ============ ========== ======= ====== == ========
# Ricardo A. Verdugo
# Gary Churchill
#
# JAX, Bar Harbor 2009
#
#
# Data description
# ---- -----------
#
# This is an expression profile experiment done in the
# Illumina Mouse-Ref8 platform.
#
# Objective: To assess the effect of genetic variation
# in chromosome Y on mice on the size of cardiomyocytes.
#
# Experimental design: Eight adult male mice from
# two strains were profiled, C57BL/6J and
# C57BL/6J-chrY, referred to as B and BY herein.
# From each strain (genotype), four animals were
# castrated and four were sham operated.
#
# Aims:
# 1) To determine differential expression between genotypes
# 2) To determina differential expression between treatments
# 3) To assess differences in the response to treatment between
# the two genotypes
#
# References:
#
# For more information about the experimental samples:
# Llamas, Bastien, Ricardo Verdugo, Gary Churchill, and Christian Deschepper. 2009.
# Chromosome Y variants from different inbred mouse strains are linked to differences in the
# morphologic and molecular responses of cardiac cells to postpubertal testosterone. BMC
# Genomics 10, no. 1 (April 7): 150. doi:10.1186/1471-2164-10-150.
#
# For informations about the analysis of this data: Verdugo, Ricardo A., Christian F.
# Deschepper, Gloria Munoz, Daniel Pomp, and Gary A. Churchill. 2009. Importance of
# randomization in microarray experimental designs with Illumina platforms. Nucl. Acids Res.
# 37, no. 17 (September): 5610-8. doi:10.1093/nar/gkp573.
#
# The full dataset is available at the GEO database, record GSE15354.
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE15354
# Preliminaries
# =============
## Define some variables (it is a good practice to declare them at the beginning of the scripts)
outdir <- "output"
annot_file <- file.path(outdir, "MouseRef-8_annot.txt") # file.path() is a robust way to define file paths
# Proportion of false discoveries that is acceptable
fdr_th <- 0.2
# Define some useful functions (it is a good idea to always document what the intend to do)
logdiff2FC=function(x, base=2) {
# Transforms x from a log difference (logdiff) to a fold change (FC) scale
# logdiff is the expression difference in log scale of base = 'base'
# FC is the ratio between two expression values in the original scale
# The absolute value of the returned FC is always greater than or equal to 1
# The sign is kept to indicate direction of change
# When the the log difference is 0, the FC is always 1.
signs = sign(x)
out = base^abs(x)
if(any(x==0)) {
out[x==0] = 1
}
return(out*signs)
} # logdiff2FC
# Create an output directory
if(!file.exists(outdir)) {
dir.create(outdir, mode = "0755", recursive=T)
}
# Readin raw data
# ====== === ====
Data.Raw=read.delim("rawdata.txt")
signal=grep("AVG_Signal", colnames(Data.Raw))
sdevs=grep("BEAD_STDERR", colnames(Data.Raw))
detection=grep("Detection.Pval", colnames(Data.Raw))
annot_cols=c(1,2,99:125)
# Read hybridization design
design=read.csv("YChrom_design.csv")
# Lets take a look
print(design)
# Save annotations in case they are needed for other applications
# In Illumina experiment, the data comes with several columns of probe annotation
write.table(Data.Raw[,annot_cols], file=annot_file, sep="\t", quote=F, col.names=T, row.names=F)
# Quality Control
# ======= =======
# Boxplots
palette(rainbow(4))
# Color by genotype
png(file.path(outdir,"boxplot_raw_genotype.png"), width=4, height=4, unit="in", res=150)
par(xpd=NA, mar= c(6.1, 4.1, 4.1, 2.1), cex=.7)
boxplot(as.data.frame(log2(Data.Raw[,signal])), horiz=T, main="Raw log2 values Boxplot", las=1, col=design$Genotype, names=design$Sentrix_Position, cex.axis=.9)
legend(8, 2.5, legend=levels(design$Genotype), fill=1:2, ncol=2, xjust=.5)
dev.off()
# Color by treatment
png(file.path(outdir,"boxplot_raw_treatment.png"), width=4, height=4, unit="in", res=150)
par(xpd=NA, mar= c(6.1, 4.1, 4.1, 2.1), cex=.7)
boxplot(as.data.frame(log2(Data.Raw[,signal])), horiz=T, main="Raw log2 values Boxplot", las=1, col=design$Treatment, names=design$Sentrix_Position, cex.axis=.9)
legend(8, 2.5, legend=levels(design$Treatment), fill=1:2, ncol=2, xjust=.5)
dev.off()
# Create matrix of raw data
rawdata=as.matrix(Data.Raw[,signal])
rownames(rawdata)=Data.Raw$PROBE_ID
colnames(rawdata)=design$Sample_Name
## Scatter plots of raw data
# Untransformed
png(file.path(outdir,"Pairs_scatter_raw.png"), width=8, height=8, unit="in", res=150)
par(cex=.2, mar=c(2.1,2.1,2.1,1.1))
pairs(rawdata, main="Raw Intensity Values", pch=".", gap=.5, cex.labels=.5)
dev.off()
# log2 transformed
png(file.path(outdir,"Pairs_scatter_log2.png"), width=8, height=8, unit="in", res=150)
par(cex=.2, mar=c(2.1,2.1,2.1,1.1))
pairs(log2(rawdata), main="Log2 Raw Intensity Values", pch=".", gap=.5, cex.labels=.5)
dev.off()
# Data Normalization
# ==== =============
# Load functions for normalization
library(affy)
library(preprocessCore)
normdata=normalize.quantiles(rawdata)
colnames(normdata)=colnames(rawdata)
rownames(normdata)=rownames(rawdata)
# Probe Filtering
# ===== =========
# This step aims to removing probes that did not detect a transcript
# in any of the experimental groups. Note that this step can be optional.
#
# Create a vector or P/A calls for each probe using detection probabilities calculate by BeadStudio
present=which(apply(apply(Data.Raw[,detection]<.04, 1, tapply, design$Group, mean)>=.5, 2, any))
# Testing for differential expression
# ======= === ============ ==========
# Load the MAanova package
library(maanova)
# Create a madata object which only includes present detected transcripts
madata=read.madata(normdata[present,], design, log.trans=T)
# Some basis statistics on each experimental group
Means = t(apply(madata$data, 1, tapply, design$Group, mean))
colnames(Means)=paste("Mean", colnames(Means), sep=":")
SEs = t(apply(madata$data, 1, tapply, design$Group, function(x) sqrt(var(x)/length(x))))
colnames(SEs)=paste("SE", colnames(SEs), sep=":")
# Fit the model
fit.fix=fitmaanova(madata, formula=~Group)
# Construct a matrix of contrastas of interest
# --------------------------------------------
# In factorial design like this one, one can ask different questions from the data. Each
# question can be tested by a comparison between some set of experimental groups. These
# comparisons are called contrasts. The matest function from MAanova can take a matrix
# of contrasts and test whether those comparisons explain a significant proportion of variance
# in the expression levels measured by each probe.
#
# But, before we create our matrix of contrasts, lets define some terms to simplify nomenclature.
# Let,
# I : intact (not castrated) treatment
# C : castrated treatment
# B : C57BL/6J genotype
# B.Y : C57BL/6J-chrY genotype (chromosome Y congenic strain on a C57BL/6J genomic background)
# Geno : genotype
# Trt : treatment
# Int : genotype x treatment interaction
# Geno_I : genotype effect in I animals
# Geno_C : genotype effect in C animals
# Trt_B : treatment effect in the B genotype
# Trt_BY : treatment effect in the B.Y genotype
#
# Then the four experimental groups can be denoted by B.C, B.I, BY.C, and BY.I.
#
# Contrasts are vectors of coefficients that when multiplied to the vector of means by
# experimental group, create comparisons that relevant and that can be tested statistically.
# To create the vector of contrast coefficients, assume the experimental groups are sorted
# alphabetically.
#
# Now, construct a matrix of contrasts where rows are contrasts and columns are experimental
# groups:
# B.C B.I BY.C BY.I
cmat = rbind( Geno = c( 1, 1, -1, -1 )*.5,
Trt = c( 1, -1, 1, -1 )*.5,
Int = c( 1, -1, -1, 1 ),
Geno_I = c( 0, 1, 0, -1 ),
Geno_C = c( 1, 0, -1, 0 ),
Trt_B = c( 1, -1, 0, 0 ),
Trt_BY = c( 0, 0, 1, -1 ),
B.C_BY.I = c( 1, 0, 0, -1 ),
B.I_BY.C = c( 0, 1, -1, 0 ))
# We can use these contrasts for calculate some ratios (fold changes)
# that can be of interest.
# Log differences for main effects and the interaction
Geno = apply(Means, 1, function(x) sum(x*cmat[1,]))
Trt = apply(Means, 1, function(x) sum(x*cmat[2,]))
Int = apply(Means, 1, function(x) sum(x*cmat[3,]))
# Log differences for comparisons within factor levels
Geno_I = apply(Means, 1, function(x) sum(x*cmat[4,]))
Geno_C = apply(Means, 1, function(x) sum(x*cmat[5,]))
Trt_B = apply(Means, 1, function(x) sum(x*cmat[6,]))
Trt_BY = apply(Means, 1, function(x) sum(x*cmat[7,]))
B.C_BY.I = apply(Means, 1, function(x) sum(x*cmat[8,]))
B.I_BY.C = apply(Means, 1, function(x) sum(x*cmat[9,]))
# Bind columns into a matrix
logDiffs=cbind(Geno, Trt, Int, Geno_I, Geno_C, Trt_B, Trt_BY, B.C_BY.I, B.I_BY.C)
# Transform log differences to FC scale
FC = apply(logDiffs, 2, logdiff2FC)
# Test test each contrasts using 1000 permutations of sample labels (takes a long time!!!)
test.cmat=matest(madata, fit.fix, term="Group", Contrast=cmat, n.perm=10, test.type = "ttest",
shuffle.method="sample", verbose=TRUE)
# Contrasts names are not kept in the matrix of permutation results, so lets
# copy them from the matrix of tabular p-values
colnames(test.cmat$Fs$Pvalperm) = colnames(test.cmat$Fs$Ptab)
# Plot p-values comparing different ways of calculating them (see ?matest)
png(file.path(outdir,"P-values Hist.png"), width=6, height=6, unit="in", res=150)
par(mfrow=c(2,2), oma=c(2,0,2,0), cex=.8, xpd=NA)
palette(rainbow(3))
plot(density(test.cmat$F1$Ptab[,1]), col=1, main="F1:Ptab", lwd=2)
lines(density(test.cmat$F1$Ptab[,2]), col=2, lwd=2)
lines(density(test.cmat$F1$Ptab[,3]), col=3, lwd=2)
plot(density(test.cmat$F1$Pvalperm[,1]), col=1, main="F1:Pvalperm", lwd=2)
lines(density(test.cmat$F1$Pvalperm[,2]), col=2, lwd=2)
lines(density(test.cmat$F1$Pvalperm[,3]), col=3, lwd=2)
plot(density(test.cmat$Fs$Ptab[,1]), col=1, main="Fs:Ptab", lwd=2)
lines(density(test.cmat$Fs$Ptab[,2]), col=2, lwd=2)
lines(density(test.cmat$Fs$Ptab[,3]), col=3, lwd=2)
plot(density(test.cmat$Fs$Pvalperm[,1]), col=1, main="Fs:Pvalperm", lwd=2)
lines(density(test.cmat$Fs$Pvalperm[,2]), col=2, lwd=2)
lines(density(test.cmat$Fs$Pvalperm[,3]), col=3, lwd=2)
legend(-.5, -1.6, legend=c("Geno", "Trt", "Int"), col=1:3, lwd=2, xjust=.5, ncol=3, xpd=NA)
dev.off()
# Multiple comparison control (FDR transformation, see ?adjPval)
test.cmat=adjPval(test.cmat, method="adaptive")
# Plot distribution of FDR values
png(file.path(outdir,"FDR Hist.png"), width=6, height=6, unit="in", res=150)
par(mfrow=c(2,2), oma=c(2,0,2,0), cex=.8, xpd=NA)
palette(rainbow(3))
plot(density(test.cmat$F1$adjPtab[,1]), col=1, main="F1:Ptab", lwd=2, xlim=c(-.1,1.1))
lines(density(test.cmat$F1$adjPtab[,2]), col=2, lwd=2)
lines(density(test.cmat$F1$adjPtab[,3]), col=3, lwd=2)
plot(density(test.cmat$F1$adjPvalperm[,1]), col=1, main="F1:Pvalperm", lwd=2, xlim=c(-.1,1.1))
lines(density(test.cmat$F1$adjPvalperm[,2]), col=2, lwd=2)
lines(density(test.cmat$F1$adjPvalperm[,3]), col=3, lwd=2)
plot(density(test.cmat$Fs$adjPtab[,1]), col=1, main="Fs:Ptab", lwd=2, xlim=c(-.1,1.1))
lines(density(test.cmat$Fs$adjPtab[,2]), col=2, lwd=2)
lines(density(test.cmat$Fs$adjPtab[,3]), col=3, lwd=2)
plot(density(test.cmat$Fs$adjPvalperm[,1]), col=1, main="Fs:Pvalperm", lwd=2, xlim=c(-.1,1.1))
lines(density(test.cmat$Fs$adjPvalperm[,2]), col=2, lwd=2)
lines(density(test.cmat$Fs$adjPvalperm[,3]), col=3, lwd=2)
legend(-.5, -1.5, legend=c("Geno", "Trt", "Int"), col=1:3, lwd=2, xjust=.5, ncol=3)
dev.off()
# Summarize into a table of results for all present transcripts
# Here we are exporting only the tests from the permutations on the F values with shrinkage variance estimates,
# but other are available (see ?matest)
annot.out = c("SYMBOL", "ACCESSION", "REFSEQ_ID", "ENTREZ_GENE_ID", "PROTEIN_PRODUCT", "ONTOLOGY_COMPONENT", "ONTOLOGY_PROCESS", "ONTOLOGY_FUNCTION")
out = data.frame(Probe=rownames(rawdata)[present], Data.Raw[present, annot.out], Means, SEs, F_val=test.cmat$Fs$Fobs, P_val=test.cmat$Fs$Pvalperm,
FDR=test.cmat$Fs$adjPvalperm, FC=FC)
# Select probes with FDR <= 0.2 for any of the 9 tests
selected = rownames(rawdata)[present][apply(test.cmat$Fs$adjPvalperm <= fdr_th, 1, any)]
# Read probe annotations
annot = read.csv("MouseRef8_genemap.csv")
# Merge to table of test results
out.annot = merge(out, annot, by.x="Probe", by.y="probe_name", all.x=T, all.y=F, sort=F)
# Export all results
write.table(out.annot , file=file.path(outdir,"YChrom_results_allpresent.csv"), sep=",", row.names=F)
# Export only selected
write.table(out[selected,], file=file.path(outdir, "YChrom_results_onlyselected.csv"), sep=",", row.names=F)
# Count genes differentially expressed
# ----- ----- -------------- ---------
# In this experiment, a question of interest was how many genes respond differently to the
# treatment of castration in the two genotypes. In other words, how important is the
# interaction between genotype and treatment. Secondly, it was interesting to assess the
# nature of the interaction. Are both genotypes responding to the treatment but in opposite
# directions? Or does the treatment have an effect in one of genotype and not in the other?
# To answer the first question, one could count the number of probes that show a significant
# effect for the Int contrast, i.e. how many probes have an FDR below a threshold for the
# Int.Pvalperm test? If you don;t know how to calculate this in R yet, open the exported
# YChrom_results_onlyselected.csv file in a spread sheet editor such as Calc (OpenOffice)
# and use filters to calculate this number. Then try to do this in R and compare the
# results.
#
# To answer the second question, we can use Venn Diagrams. We want to count the number if
# genes that are selected in each genotype among those that show a significant interaction
# effect.
#
# One caveat is that genes are represented by multiple probes in this microarray platform.
# It is a good idea to count genes only once, but there is a question of how to count a gene
# when different probes are giving different signals, i.e. one is saying that the gene is
# selected whereas the other is saying that is not. This platform was based RefSeq
# (http://www.ncbi.nlm.nih.gov/RefSeq/), which is a human-curated database of reference
# transcripts for known genes and it was designed to avoid redundancy
# (http://www.illumina.com/products/mouseref-8_expression_beadchip_kits_v2.ilmn). Therefore,
# we will assume that each probe is testing different biological signals, and not a repeated
# measure of the same transcript. We will count a gene as selected if any of it transcripts
# (probes) is selected. For other platforms where probes provide repeated measurements for
# the same transcript, one may want to use a voting or averaging approach to summarize
# results at the gene-level.
# First, eliminate probes without a gene annotation
out.annot=out.annot[!is.na(out.annot$gene_id),]
# Now count significant genes for each comparison. All comparison are done only for
# probes that are also significant for the Interaction term.
# Because were are interested in genes that show an interaction, we cannot the the Geno
# contrast to test for significance, since that contrast tests the marginal effect of
# the genotype across treatments. In other words, it tests for effects that are consistent in both
# treatments. These can be zero even if the genotype has an effect in both treatments but those effects
# have opposite signs. The same situation is true for testing treatment effects in probes with
# significant interaction. Therefore, we need to use the contrasts that tested genotype
# differences within each level of treatment, and viceversa.
Genes.Int_Geno_I = cbind(Gene=with(out.annot, unique(gene_id[FDR.Int <= fdr_th & FDR.Geno_I <= fdr_th])), I=1)
Genes.Int_Geno_C = cbind(Gene=with(out.annot, unique(gene_id[FDR.Int <= fdr_th & FDR.Geno_C <= fdr_th])), C=1)
Genes.Int_Trt_B = cbind(Gene=with(out.annot, unique(gene_id[FDR.Int <= fdr_th & FDR.Trt_B <= fdr_th])), B=1)
Genes.Int_Trt_BY = cbind(Gene=with(out.annot, unique(gene_id[FDR.Int <= fdr_th & FDR.Trt_BY <= fdr_th])), BY=1)
# Merge counts of genotype effects in both treatments into one table
Genes.Int_Geno = merge(Genes.Int_Geno_I, Genes.Int_Geno_C, by="Gene", all=T)
Genes.Int_Geno = cbind(Genes.Int_Geno$Gene,apply(Genes.Int_Geno[,2:3], 2, function(x) as.numeric(!is.na(x))))
# Merge counts of treatment effects in both genotypes into one table
Genes.Int_Trt = merge(Genes.Int_Trt_B, Genes.Int_Trt_BY, by="Gene", all=T)
Genes.Int_Trt = cbind(Genes.Int_Trt$Gene,apply(Genes.Int_Trt[,2:3], 2, function(x) as.numeric(!is.na(x))))
# Load the limma library for creating Venn diagrams
library(limma)
# Count genes for each compartment of the Venn diagram
Counts.Int_Geno=vennCounts(Genes.Int_Geno[,2:3])
print(Counts.Int_Geno)
Counts.Int_Trt=vennCounts(Genes.Int_Trt[,2:3])
print(Counts.Int_Trt)
# Plot genes responding to genotype in a treatment dependent manner
png(file.path(outdir, "vennDiagra_Int_Geno.png"), width=6, height=6, unit="in", res=150)
vennDiagram(Counts.Int_Geno, main="\n\n\nGenes Responding to Genotype\nin a Treatment Dependent Manner")
dev.off()
# Plot genes responding to treatment in a genotype dependent manner
png(file.path(outdir, "vennDiagra_Int_Trt.png"), width=6, height=6, unit="in", res=150)
vennDiagram(Counts.Int_Trt, main="\n\n\nGenes Responding to Treatment\nin a Genotype Dependent Manner")
dev.off()
# Interpretation of the Venn diagrams:
#
# In theory, both plots are showing the same test in two different ways, i.e. the number of
# genes with interaction effects, but partitioned either by treatment or by genotype.
# Because in practice we are showing results from four different (but related) tests, the
# total number of selected genes in each diagram is not exactly the same, but they should
# largely agree.
#
# Although the numbers here are small because we used only a small sample or probes, you
# will see that more genes are responding to the treatment in the BY genotype. Also, you
# should see more differences between genotypes in the castrated animals. This was the
# pattern observed in the full dataset. See Figure 4 of Llamas et al 2008 (reference above).
#