Tip: The reference genome includes “N” bases; you will need to exclude those.
library(AnnotationHub)
library(Biostrings)
library(BSgenome)
library(GenomicRanges)
library(rtracklayer)
library(Rsamtools)
library(GenomicFeatures)
library("BSgenome.Hsapiens.UCSC.hg19")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# count total bases on chr22
alphabet_frequency <- alphabetFrequency(Hsapiens$chr22)
total_bases <- sum(alphabet_frequency[c('A','G','C','T')])
# count GC bases on chr22
GC_bases <- sum(alphabet_frequency[c('G','C')])
#calculate GC ratio
GC_content <- GC_bases/total_bases
GC_content
## [1] 0.4798807
Clarification: Compute the GC content for each peak region as a percentage and then average those percentages to compute a number between 0 and 1.
# retrieve record
ah <- AnnotationHub()
H3K27me3_qh <- query(ah, c("H3K27me3", "E003", "narrowPeak"))
H3K27me3_record <- H3K27me3_qh[["AH29892"]]
# extract chr 22 and sequences
H3K27me3_chr22 <- subset(H3K27me3_record, seqnames == "chr22")
H3K27me3_chr22_views <- Views(Hsapiens, H3K27me3_chr22)
# calculate mean GC content
GC_contents <- letterFrequency(H3K27me3_chr22_views, "GC", as.prob = TRUE)
mean_GC <- mean(GC_contents)
mean_GC
## [1] 0.528866
signal_value <- mcols(H3K27me3_chr22_views)$signalValue
cor(signal_value, GC_contents)
## G|C
## [1,] 0.004467924
Clarification: First compute the average “fc.signal” for across each region, for example using “Views”; this yields a single number of each region. Next correlate these numbers with the “signalValue” of the “narrowPeaks”.
# retrieve record
H3K27me3_fc <- query(ah, c("H3K27me3", "E003", "fc.signal"))
H3K27me3_fc_record <- H3K27me3_fc[["AH32033"]]
# get subset data on chr22
gr22 <- GRanges(seqnames = "chr22", ranges = IRanges(start = start(Hsapiens$chr22), end = end(Hsapiens$chr22)))
H3K27me3_fc_gr <- import(H3K27me3_fc_record, which = gr22, as = "Rle")
H3K27me3_fc_gr22 <- H3K27me3_fc_gr$chr22
# view fc.signal data
fc.signal <- Views(H3K27me3_fc_gr22, start = start(H3K27me3_chr22), end = end(H3K27me3_chr22))
# calculate the correlation between the average of fc.signal and signalValue
fc.signal_mean <- mean(fc.signal)
cor(fc.signal_mean, signal_value)
## [1] 0.9149614
sum(H3K27me3_fc_gr22 >= 1)
## [1] 10914671
Tip: If you end up with having to intersect two different Views, note that you will need to convert the Views to IRanges or GRanges first with ir<-as(vi,“IRanges”).
H3K27me3_E055 <- query(ah, c("H3K27me3", "E055"))
H3K27me3_E055_record <- H3K27me3_E055[["AH32470"]]
# get subset data on chr22
gr_chr22 <- GRanges(seqnames = "chr22", ranges = IRanges(start = start(Hsapiens$chr22), end = end(Hsapiens$chr22)))
H3K27me3_fc_gr_E055 <- import(H3K27me3_E055_record, which = gr_chr22, as = "Rle")
H3K27me3_fc_gr22_E055 <- H3K27me3_fc_gr_E055$chr22
# identify region
region_E003 <- as(slice(H3K27me3_fc_gr22, upper = 0.5), "IRanges")
region_E055 <- as(slice(H3K27me3_fc_gr22_E055, lower = 2), "IRanges")
inter_region <- intersect(region_E003, region_E055)
sum(width(inter_region))
## [1] 1869937
## retrieve the cpg
ah_human <- subset(ah, species == "Homo sapiens")
ah_human_cpg <- query(ah_human, "CpG Islands")
ah_human_cpg_record <- ah_human_cpg[["AH5086"]]
# get subset data on chr22
ah_human_cpg_chr22 <- subset(ah_human_cpg_record, seqnames == "chr22")
ah_human_cpg_chr22_views <- Views(Hsapiens, ah_human_cpg_chr22)
# calculate observed GC bases
observed_GC <- dinucleotideFrequency(ah_human_cpg_chr22_views)[,7]/width(ah_human_cpg_chr22_views)
# calculate expected GC bases
freq_C <- letterFrequency(ah_human_cpg_chr22_views, "C")
freq_G <- letterFrequency(ah_human_cpg_chr22_views, "G")
expected_GC <- (freq_C/width(ah_human_cpg_chr22_views))*(freq_G/width(ah_human_cpg_chr22_views))
# calculate the average observed-to-expected ratio of CpG dinucleotides
mean(observed_GC/expected_GC)
## [1] 0.8340929
Clarification: You need to remember to search both forward and reverse strands.
TATA_boxes <- countPattern("TATAAA", Hsapiens$chr22) + countPattern("TATAAA", reverseComplement(Hsapiens$chr22))
TATA_boxes
## [1] 27263
Clarification: Use the TxDb.Hsapiens.UCSC.hg19.knownGene package to define transcripts and coding sequence. Here, we defined a promoter to be 900bp upstream and 100bp downstream of the transcription start site.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
gr <- GRanges(seqnames = "chr22", ranges = IRanges(start = start(Hsapiens$chr22), end = end(Hsapiens$chr22)))
# find promoters of transcripts on chr 22
gr_trans_chr22 <- subsetByOverlaps(transcripts(txdb), gr, ignore.strand = TRUE)
proms <- promoters(gr_trans_chr22, upstream = 900, downstream = 100)
# find coding sequences on chr 22
gr_cds_chr22 <- subsetByOverlaps(cds(txdb), gr, ignore.strand = TRUE)
# find overlaps between promoters of transcripts and coding sequences
proms_cds <- findOverlaps(proms, gr_cds_chr22)
# calculate TATA box on overlaps
count = 0
for (i in unique(queryHits(proms_cds))){
proms_cds_view <- Views(Hsapiens, proms[i])
count = count + vcountPattern("TATAAA", DNAStringSet(proms_cds_view))
}
count
## [1] 57
Clarification: Use the TxDb.Hsapiens.UCSC.hg19.knownGene package to define transcripts and coding sequence. Here, we define a promoter to be 900bp upstream and 100bp downstream of the transcription start site. In this case, ignore strand in the analysis.
# calculate transcript lengths
trans_len_chr22 <- transcriptLengths(txdb, with.cds_len = TRUE)
trans_len_chr22 <- trans_len_chr22[trans_len_chr22$cds_len > 0,]
# find promoters from different transcripts to overlap
trans_eval <- proms[mcols(proms)$tx_id %in% trans_len_chr22$tx_id]
result = sum(coverage(trans_eval) > 1)
result["chr22"]
## chr22
## 306920
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [2] BSgenome.Hsapiens.UCSC.hg19_1.4.0
## [3] GenomicFeatures_1.38.2
## [4] AnnotationDbi_1.48.0
## [5] Biobase_2.46.0
## [6] Rsamtools_2.2.3
## [7] BSgenome_1.54.0
## [8] rtracklayer_1.46.0
## [9] GenomicRanges_1.38.0
## [10] GenomeInfoDb_1.22.1
## [11] Biostrings_2.54.0
## [12] XVector_0.26.0
## [13] IRanges_2.20.2
## [14] S4Vectors_0.24.4
## [15] AnnotationHub_2.18.0
## [16] BiocFileCache_1.10.2
## [17] dbplyr_1.4.3
## [18] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.1 bit64_0.9-7
## [3] shiny_1.4.0.2 assertthat_0.2.1
## [5] interactiveDisplayBase_1.24.0 askpass_1.1
## [7] BiocManager_1.30.10 blob_1.2.1
## [9] GenomeInfoDbData_1.2.2 yaml_2.2.1
## [11] progress_1.2.2 BiocVersion_3.10.1
## [13] pillar_1.4.3 RSQLite_2.2.0
## [15] lattice_0.20-41 glue_1.4.0
## [17] digest_0.6.25 promises_1.1.0
## [19] htmltools_0.4.0 httpuv_1.5.2
## [21] Matrix_1.2-18 XML_3.99-0.3
## [23] pkgconfig_2.0.3 biomaRt_2.42.1
## [25] zlibbioc_1.32.0 purrr_0.3.4
## [27] xtable_1.8-4 later_1.0.0
## [29] BiocParallel_1.20.1 tibble_3.0.1
## [31] openssl_1.4.1 ellipsis_0.3.0
## [33] SummarizedExperiment_1.16.1 magrittr_1.5
## [35] crayon_1.3.4 mime_0.9
## [37] memoise_1.1.0 evaluate_0.14
## [39] tools_3.6.3 prettyunits_1.1.1
## [41] hms_0.5.3 lifecycle_0.2.0
## [43] matrixStats_0.56.0 stringr_1.4.0
## [45] DelayedArray_0.12.3 compiler_3.6.3
## [47] rlang_0.4.6 grid_3.6.3
## [49] RCurl_1.98-1.2 rappdirs_0.3.1
## [51] bitops_1.0-6 rmarkdown_2.1
## [53] DBI_1.1.0 curl_4.3
## [55] R6_2.4.1 GenomicAlignments_1.22.1
## [57] knitr_1.28 dplyr_0.8.5
## [59] fastmap_1.0.1 bit_1.1-15.2
## [61] stringi_1.4.6 Rcpp_1.0.4.6
## [63] vctrs_0.2.4 tidyselect_1.0.0
## [65] xfun_0.13