1. Load the example SNP data with the following code:
library(snpStats)
library(broom)
data(for.exercise)
use <- seq(1, ncol(snps.10), 10)
sub.10 <- snps.10[,use]
snpdata = sub.10@.Data
status = subject.support$cc
Fit a linear model and a logistic regression model to the data for the 3rd SNP. What are the coefficients for the SNP variable? How are they interpreted? (Hint: Don’t forget to recode the 0 values to NA for the SNP data)
# recode 0 values to NA
snp3 = as.numeric(snpdata[,3])
snp3[snp3==0] = NA

# fit a linear model
lm3 = lm(status ~ snp3)
tidy(lm3)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   0.544     0.0549     9.91  3.75e-22
## 2 snp3         -0.0394    0.0468    -0.842 4.00e- 1
# fit a logistic regression model
glm3 = glm(status ~ snp3,family="binomial")
tidy(glm3)
## # A tibble: 2 x 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    0.177     0.220     0.806   0.420
## 2 snp3          -0.158     0.188    -0.841   0.400

Both models are fit on the additive scale. In the linear model, the coefficient is the change in the probability of being a case associated with each additional copy of the minor allele (here a decrease of about 0.04). In the logistic regression model, it is the change in the log odds of being a case associated with each additional copy of the minor allele (here a decrease of about 0.16).
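
Since a per-allele logistic coefficient is a log odds ratio, exponentiating it gives the odds ratio itself; a quick check on the model fitted above:

# per-allele odds ratio implied by the logistic model: exp(-0.158), about 0.85
exp(coef(glm3)["snp3"])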

2. In the previous question why might the choice of logistic regression be better than the choice of linear regression?
par(mfrow=c(1,2))

plot(status ~ snp3, pch = 19)
abline(lm3, col = "darkgrey", lwd = 5)
plot(glm3$residuals)

The outcome is binary (case/control status), so logistic regression is the more natural choice: its fitted values are probabilities constrained to lie between 0 and 1, while the linear model can produce fitted values outside that range and assumes constant-variance residuals, which does not hold for a 0/1 outcome.
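
One way to see this with the two models already fit (a minimal sketch, no new data needed):

# the linear model's fitted values are unconstrained,
# while the logistic fitted values are probabilities
range(fitted(lm3))   # can fall outside [0, 1]
range(fitted(glm3))  # always strictly between 0 and 1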

3. Load the example SNP data with the following code:
library(snpStats)
library(broom)
data(for.exercise)
use <- seq(1, ncol(snps.10), 10)
sub.10 <- snps.10[,use]
snpdata = sub.10@.Data
status = subject.support$cc
Fit a logistic regression model on a recessive (need 2 copies of minor allele to confer risk) and additive scale for the 10th SNP. Make a table of the fitted values versus the case/control status. Does one model fit better than the other?
# fit an additive logistic regression model
snp10 = as.numeric(snpdata[,10])
snp10[snp10==0] = NA
glm10 = glm(status ~ snp10, family="binomial")
tidy(glm10)
## # A tibble: 2 x 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.00751    0.174    -0.0433   0.965
## 2 snp10        0.00201    0.0933    0.0215   0.983
# fit a logistic regression model with a genotype indicator
snp10_dom = (snp10 == 2)
glm10_dom = glm(status ~ snp10_dom, family="binomial")
tidy(glm10_dom)
## # A tibble: 2 x 5
##   term          estimate std.error statistic p.value
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    -0.0339    0.0868    -0.391   0.696
## 2 snp10_domTRUE   0.0643    0.127      0.505   0.614
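
Note that under the snpStats genotype coding (after the 0-to-NA recode: 1 = A/A, 2 = A/B, 3 = B/B), the indicator above flags heterozygotes. A strictly recessive coding, requiring two copies of the minor allele, plus the fitted-values-versus-status table the question asks for, would look like the sketch below (assuming B is the minor allele):

# recessive coding: risk only with two copies of the minor allele (code 3)
snp10_rec = (snp10 == 3)
glm10_rec = glm(status ~ snp10_rec, family = "binomial")

# table of fitted values versus case/control status for each model
# (fitted() drops the NA genotypes, so subset status to match)
table(round(fitted(glm10), 2), status[!is.na(snp10)])
table(round(fitted(glm10_rec), 2), status[!is.na(snp10)])

Judging from the coefficients shown above, neither coding fits convincingly better: the estimates are small and the fitted probabilities all sit near 0.5.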
4. Load the example SNP data with the following code:
library(snpStats)
library(broom)
data(for.exercise)
use <- seq(1, ncol(snps.10), 10)
sub.10 <- snps.10[,use]
snpdata = sub.10@.Data
status = subject.support$cc
Fit an additive logistic regression model to each SNP. What is the average effect size? What is the max? What is the minimum?
# fit an additive logistic regression model to each SNP
results = rep(NA, dim(snpdata)[2])
for (i in 1:ncol(snpdata)){
  snpdata_i = as.numeric(snpdata[,i])
  snpdata_i[snpdata_i == 0] = NA
  glm_i = glm(status ~ snpdata_i, family = "binomial")
  # store the z-statistic for the SNP term (treated here as the effect size)
  results[i] = tidy(glm_i)$statistic[2]
}

# average effect size
mean(results)
## [1] 0.007155377
# minimum effect size
min(results)
## [1] -4.251469
# maximum effect size
max(results)
## [1] 3.900891
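
Note that the loop stores the Wald z-statistic for each SNP rather than the coefficient itself. If you wanted the estimated per-allele log odds ratios instead, a hypothetical variant of the same loop would be:

# variant: store the estimated log odds ratios rather than their z-statistics
estimates = rep(NA, ncol(snpdata))
for (i in 1:ncol(snpdata)) {
  snp_i = as.numeric(snpdata[, i])
  snp_i[snp_i == 0] = NA
  estimates[i] = coef(glm(status ~ snp_i, family = "binomial"))[2]
}
summary(estimates)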
5. Load the example SNP data with the following code:
library(snpStats)
library(broom)
data(for.exercise)
use <- seq(1, ncol(snps.10), 10)
sub.10 <- snps.10[,use]
snpdata = sub.10@.Data
status = subject.support$cc
Fit an additive logistic regression model to each SNP and square the coefficients. What is the correlation with the results from using snp.rhs.tests and chi.squared? Why does this make sense?
# square the stored statistics (the Wald z-statistics for each SNP)
results_coeff_square = results^2

# correlation with the results from using snp.rhs.tests and chi.squared
glm_all = snp.rhs.tests(status ~ 1, snp.data = sub.10)
cor(results_coeff_square, chi.squared(glm_all))
## [1] 0.9992946

Both approaches fit the same additive model on the logistic scale and test the same association; the squared Wald z-statistic (from glm) and the score chi-squared statistic (from snp.rhs.tests) are asymptotically equivalent tests of that null hypothesis, so the correlation is nearly 1.
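
A quick visual check using the objects above (a sketch):

# squared Wald statistics against the score chi-squared statistics
plot(chi.squared(glm_all), results_coeff_square,
     xlab = "score chi-squared (snp.rhs.tests)",
     ylab = "squared Wald z-statistic (glm)")
abline(0, 1, col = "darkgrey")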

6. Load the Montgomery and Pickrell eSet:
library(ballgown)
library(Biobase)
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
load(file=con)
close(con)
mp = montpick.eset
pdata=pData(mp)
edata=as.data.frame(exprs(mp))
fdata = fData(mp)
Do the log2(data + 1) transform and calculate F-statistics for the difference between studies/populations using genefilter::rowFtests and genefilter::rowttests. Do you get the same statistic? Do you get the same p-value?
library(devtools)
library(Biobase)
library(limma)
library(edge)
library(genefilter)
edata = log2(as.matrix(edata) + 1)

# perform rowttests
tstats_obj = rowttests(edata, as.factor(pdata$population))
tidy(tstats_obj)
## # A tibble: 3 x 13
##   column     n   mean    sd   median trimmed     mad       min   max range
##   <chr>  <dbl>  <dbl> <dbl>    <dbl>   <dbl>   <dbl>     <dbl> <dbl> <dbl>
## 1 stati… 12984 -3.43  3.88  -2.84    -3.16   2.94    -2.31e+ 1  8.31 31.5 
## 2 dm     52580 -0.129 0.372  0       -0.0283 0       -4.37e+ 0  1.51  5.89
## 3 p.val… 12984  0.168 0.266  0.00441  0.109  0.00441  2.23e-47  1.00  1.00
## # … with 3 more variables: skew <dbl>, kurtosis <dbl>, se <dbl>
# perform rowFtests
fstats_obj = rowFtests(edata, as.factor(pdata$population))
tidy(fstats_obj)
## # A tibble: 2 x 13
##   column     n   mean     sd  median trimmed     mad      min    max  range
##   <chr>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>   <dbl>    <dbl>  <dbl>  <dbl>
## 1 stati… 12984 26.8   39.5   8.40     18.4   8.20    3.39e- 7 535.   535.  
## 2 p.val… 12984  0.168  0.266 0.00441   0.109 0.00441 2.23e-47   1.00   1.00
## # … with 3 more variables: skew <dbl>, kurtosis <dbl>, se <dbl>
par(mfrow=c(1,2))
hist(tstats_obj$statistic, col=2)
hist(fstats_obj$statistic, col=2)

The statistics differ but the p-values are identical: with only two groups, the F-statistic is the square of the t-statistic, so the two tests are equivalent (compare the p.value rows in the two summaries above).
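
A direct check of that relationship (a sketch using the objects above):

# with two groups F = t^2, so the p-values coincide
all.equal(fstats_obj$statistic, tstats_obj$statistic^2)
all.equal(fstats_obj$p.value, tstats_obj$p.value)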

7. Load the Montgomery and Pickrell eSet:
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
load(file=con)
close(con)
mp = montpick.eset
pdata=pData(mp)
edata=as.data.frame(exprs(mp))
edata = edata[rowMeans(edata) > 100,]
fdata = fData(mp)
First test for differences between the studies using the DESeq function from the DESeq2 package. Then do the log2(data + 1) transform and do the test for differences between studies using the limma package and the lmFit, eBayes, and topTable functions. What is the correlation in the statistics between the two analyses? Are there more differences for the large statistics or the small statistics (hint: make an MA-plot)?
library(DESeq2)
library(limma)
library(edge)
library(genefilter)
# using DESeq2 test the differences between the studies
de = DESeqDataSetFromMatrix(edata, pdata, ~study)
glm_de = DESeq(de)
result_de = results(glm_de)

# using limma test the differences
edata = log2(as.matrix(edata) + 1)
mod = model.matrix(~ as.factor(pdata$study))
fit_limma = lmFit(edata, mod)
ebayes_limma = eBayes(fit_limma) 
top = topTable(ebayes_limma,number=dim(edata)[1], sort.by="none")

# correlation in the statistics between two analyses
cor(result_de$stat, top$t)
## [1] 0.9278568
# make an MA-plot
y = cbind(result_de$stat, top$t)
limma::plotMA(y)

8. Apply the Benjamini-Hochberg correction to the P-values from the two previous analyses. How many results are statistically significant at an FDR of 0.05 in each analysis?
# DESeq analysis
fp_bh = p.adjust(result_de$pvalue, method="BH")
sum(fp_bh < 0.05)
## [1] 1995
# limma analysis
fp_bh = p.adjust(top$P.Value, method="BH")
sum(fp_bh < 0.05)
## [1] 2807
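
To see how far the two gene lists agree, a hypothetical follow-up (the rows of result_de and top line up because both analyses used the same filtered edata):

# cross-tabulate the significance calls from the two analyses
de_sig = p.adjust(result_de$pvalue, method = "BH") < 0.05
limma_sig = p.adjust(top$P.Value, method = "BH") < 0.05
table(de_sig, limma_sig)
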
9. Is the number of significant differences surprising for the analysis comparing studies from Question 8? Why or why not?

Yes and no. It is surprising that such a large fraction of the genes differ significantly between studies, but it is not that surprising once you recall that the two studies were measured in very different batches, so large batch-driven differences are expected.

10. Suppose you observed the following P-values from the comparison of differences between studies. Why might you be suspicious of the analysis?

The p-values should show a spike near zero (the significant results) and be roughly flat across the rest of the interval (the null results), so a distribution pushed toward one suggests an overly conservative p-value calculation.
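
For reference, a small simulation (an illustration, not part of the original analysis) of what null p-values should look like, with a flat histogram over [0, 1]:

# p-values are uniform under the null hypothesis
set.seed(1)
null_p = replicate(10000, t.test(rnorm(10), rnorm(10))$p.value)
hist(null_p, col = 2)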