library(IRanges)
library(GenomeInfoDb)
library(GenomicRanges)
library(dbplyr)
library(BiocFileCache)
library(rtracklayer)
library(AnnotationHub)
## retrieve the cpg
ah <- AnnotationHub()
ah_human <- subset(ah, species == "Homo sapiens")
ah_human_cpg <- query(ah_human, "CpG Islands")
ah_human_cpg_record <- ah_human_cpg[["AH5086"]]
## extract autosomes
filter <- c(paste("chr", 1:22, sep=""))
split_record <- split(ah_human_cpg_record, seqnames(ah_human_cpg_record))
autosomes <- split_record[filter]
## check the number of autosomes
unlist(autosomes)
## GRanges object with 26641 ranges and 1 metadata column:
## seqnames ranges strand | name
## <Rle> <IRanges> <Rle> | <character>
## chr1 chr1 28736-29810 * | CpG:_116
## chr1 chr1 135125-135563 * | CpG:_30
## chr1 chr1 327791-328229 * | CpG:_29
## chr1 chr1 437152-438164 * | CpG:_84
## chr1 chr1 449274-450544 * | CpG:_99
## ... ... ... ... . ...
## chr22 chr22 51135671-51136118 * | CpG:_44
## chr22 chr22 51142803-51143308 * | CpG:_38
## chr22 chr22 51158387-51160060 * | CpG:_167
## chr22 chr22 51169028-51170019 * | CpG:_81
## chr22 chr22 51221773-51222317 * | CpG:_63
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
autosomes[4]
## GRangesList object of length 1:
## $chr4
## GRanges object with 1031 ranges and 1 metadata column:
## seqnames ranges strand | name
## <Rle> <IRanges> <Rle> | <character>
## [1] chr4 53199-53672 * | CpG:_46
## [2] chr4 107147-107898 * | CpG:_89
## [3] chr4 124333-124841 * | CpG:_37
## [4] chr4 206378-206892 * | CpG:_49
## [5] chr4 298804-299312 * | CpG:_47
## ... ... ... ... . ...
## [1027] chr4 190939802-190940591 * | CpG:_69
## [1028] chr4 190942735-190944898 * | CpG:_196
## [1029] chr4 190959045-190960011 * | CpG:_72
## [1030] chr4 190962112-190962689 * | CpG:_59
## [1031] chr4 190986383-191013609 * | CpG:_2005
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
## retrieve the record
ah_H3K4me3 <- query(ah, c("H3K4me3", "narrowpeak", "E003"))
## select the narrowpeak
ah_H3K4me3_record <- ah_H3K4me3[["AH29884"]]
## extract autosomes and check the number of regions cover
split_H3K4me3 <- split(ah_H3K4me3_record, seqnames(ah_H3K4me3_record))
H3K4me3_autosomes <- split_H3K4me3[filter]
sum(width(unlist(H3K4me3_autosomes)))
## [1] 41135164
## retrieve the record
ah_H3K27me3 <- query(ah, c("H3K27me3", "narrowpeak", "E003"))
ah_H3K27me3_record <- ah_H3K27me3[["AH29892"]]
## extract autosomes
split_H3K27me3 <- split(ah_H3K27me3_record, seqnames(ah_H3K27me3_record))
H3K27me3_autosomes <- split_H3K27me3[filter]
## create a subset of extracted autosomes
ah_H3K27me3_autosomes <- subset(ah_H3K27me3_record, seqnames %in% filter)
## mean signalValue
mean_signalValue <- mean(ah_H3K27me3_autosomes$signalValue)
mean_signalValue
## [1] 4.770728
## intersect between two records
bivalent <- intersect(unlist(H3K4me3_autosomes), unlist(H3K27me3_autosomes))
sum(width(bivalent))
## [1] 10289096
# find bivalent regions overlap CpG Islands
cpg_autosomes <- autosomes
cpg_bivalent <- findOverlaps(bivalent, unlist(cpg_autosomes))
# calculate the fraction of the bivalent regions overlap CpG Islands
fraction_bivalent <- length(unique(queryHits(cpg_bivalent)))/length(bivalent)
fraction_bivalent
## [1] 0.5383644
cpg_bivalent_intersect <- intersect(bivalent, unlist(cpg_autosomes))
# calculate the fration of the bases intersected between CpG Islands and bivalent
fraction_bivalent_intersect <- sum(width(reduce(cpg_bivalent_intersect)))/sum(width(unlist(cpg_autosomes)))
fraction_bivalent_intersect
## [1] 0.241688
Tip: consider using the “resize()”" function.
# extract CpG Islands within 10kb
cpg_10kb <- resize(unlist(cpg_autosomes), width = 20000 + width(unlist(cpg_autosomes)), fix = "center")
cpg_10kb_bivalent <- intersect(cpg_10kb, bivalent)
sum(width(cpg_10kb_bivalent))
## [1] 9782086
Tip 1: the object returned by AnnotationHub contains “seqlengths”.
Tip 2: you may encounter an integer overflow. As described in the session on R Basic Types, you can address this by converting integers to numeric before summing them, “as.numeric()”.
# calculate human genome size
chr_list <- c(paste("chr", 1:22, sep=""))
genome <- keepSeqlevels(ah_human_cpg_record, chr_list, pruning.mode = "coarse")
genome_size <- sum(as.numeric(seqlengths(genome)))
# calculate the fraction of human genome which contained a CpG Island
cpg_autosomes_size <- sum(as.numeric(width(unlist(cpg_autosomes))))
cpg_autosomes_size / genome_size
## [1] 0.007047481
## calculate InOut matrix
overlapMat <- matrix(0,, ncol = 2, nrow = 2)
colnames(overlapMat) <- c("in", "out")
rownames(overlapMat) <- c("in", "out")
overlapMat[1,1] <- sum(width(cpg_bivalent_intersect))
overlapMat[1,2] <- sum(width(setdiff(bivalent, unlist(cpg_autosomes))))
overlapMat[2,1] <- sum(width(setdiff(unlist(cpg_autosomes), bivalent)))
overlapMat[2,2] <- genome_size - sum(overlapMat)
## calculate odds-ratio
oddsRatio <- overlapMat[1,1] * overlapMat[2,2] / (overlapMat[2,1] * overlapMat[1,2])
oddsRatio
## [1] 169.0962
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] BSgenome.Hsapiens.UCSC.hg19_1.4.0 BSgenome_1.54.0
## [3] Biostrings_2.54.0 XVector_0.26.0
## [5] AnnotationHub_2.18.0 rtracklayer_1.46.0
## [7] BiocFileCache_1.10.2 dbplyr_1.4.3
## [9] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [11] IRanges_2.20.2 S4Vectors_0.24.4
## [13] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.4.6 lattice_0.20-41
## [3] Rsamtools_2.2.3 assertthat_0.2.1
## [5] digest_0.6.25 mime_0.9
## [7] R6_2.4.1 RSQLite_2.2.0
## [9] evaluate_0.14 httr_1.4.1
## [11] pillar_1.4.3 zlibbioc_1.32.0
## [13] rlang_0.4.6 curl_4.3
## [15] blob_1.2.1 Matrix_1.2-18
## [17] rmarkdown_2.1 BiocParallel_1.20.1
## [19] stringr_1.4.0 RCurl_1.98-1.2
## [21] bit_1.1-15.2 shiny_1.4.0.2
## [23] DelayedArray_0.12.3 compiler_3.6.3
## [25] httpuv_1.5.2 xfun_0.13
## [27] pkgconfig_2.0.3 htmltools_0.4.0
## [29] tidyselect_1.0.0 SummarizedExperiment_1.16.1
## [31] tibble_3.0.1 GenomeInfoDbData_1.2.2
## [33] interactiveDisplayBase_1.24.0 matrixStats_0.56.0
## [35] XML_3.99-0.3 crayon_1.3.4
## [37] dplyr_0.8.5 later_1.0.0
## [39] GenomicAlignments_1.22.1 bitops_1.0-6
## [41] rappdirs_0.3.1 grid_3.6.3
## [43] xtable_1.8-4 lifecycle_0.2.0
## [45] DBI_1.1.0 magrittr_1.5
## [47] stringi_1.4.6 promises_1.1.0
## [49] ellipsis_0.3.0 vctrs_0.2.4
## [51] tools_3.6.3 bit64_0.9-7
## [53] Biobase_2.46.0 glue_1.4.0
## [55] purrr_0.3.4 BiocVersion_3.10.1
## [57] fastmap_1.0.1 yaml_2.2.1
## [59] AnnotationDbi_1.48.0 BiocManager_1.30.10
## [61] memoise_1.1.0 knitr_1.28