library(ALL)
data(ALL)
mean(exprs(ALL[,5]))
## [1] 5.629627
mart <- useMart(host=’feb2014.archive.ensembl.org’,biomart=“ENSEMBL_MART_ENSEMBL”).
library(biomaRt)
library("hgu95av2.db")
# list Marts
mart <- useMart(host='feb2014.archive.ensembl.org', biomart = "ENSEMBL_MART_ENSEMBL")
ensembl <- useDataset("hsapiens_gene_ensembl", mart)
# annotate each feature
feature_name <- featureNames(ALL)
annotation_ALL <- getBM(attributes=c("ensembl_gene_id","affy_hg_u95av2"), filters="affy_hg_u95av2", values=feature_name, mart=ensembl)
sum(table(annotation_ALL[,2])>1)
## [1] 1045
# list Attributes
attributes <- listAttributes(ensembl)
filters <- listFilters(ensembl)
# annotate autosomes
chrom <- c(1:22)
annotation_ALL_chr <- getBM(attributes=c("ensembl_gene_id", "affy_hg_u95av2", "chromosome_name"), filters=c("affy_hg_u95av2","chromosome_name"), values=list(feature_name, chrom), mart=ensembl)
sum(table(table(annotation_ALL_chr[,2])))
## [1] 11016
library(minfiData)
library(minfi)
mean(getMeth(MsetEx)[,2])
## [1] 7228.277
library(GEOquery)
eList <- getGEO("GSE788")
eData <- eList[[1]]
mean(exprs(eData)[,2])
## [1] 756.432
library(airway)
library(GenomicRanges)
data(airway)
mean(airway$avgLength)
## [1] 113.75
sum(assay(airway)[,3]>=1)
## [1] 25699
Clarification: A feature has to overlap the actual transcript, not the intron of a transcript. So you will need to make sure that the transcript representation does not contain introns.
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# exon data of txdb
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb_exons <- exons(TxDb.Hsapiens.UCSC.hg19.knownGene)
# transcripts on the autosome
autosome <- paste0("chr", c(1:22))
txdb_exons_autosome <- keepSeqlevels(txdb_exons, autosome, pruning.mode = "coarse")
# rename in NCBI format
txdb_ncbi <- mapSeqlevels(seqlevels(txdb_exons), "NCBI")
txdb_exons_ncbi <- renameSeqlevels(txdb_exons_autosome, txdb_ncbi)
dim(subsetByOverlaps(airway, txdb_exons_ncbi))[1]
## [1] 26276
sample_SRR1039508 <- airway[, 1]
sample_SRR1039508_autosome <- subsetByOverlaps(sample_SRR1039508, txdb_exons_ncbi)
autosome_reads <- sum(assay(sample_SRR1039508_autosome, "counts"))
total_reads <- sum(assay(sample_SRR1039508, "counts"))
# percentage of the total reads in airway dataset for SRR1039508 which overlaps autosome of txdb
autosome_reads/total_reads
## [1] 0.9004193
Clarification: We are using the standard 2.2kb default Bioconductor promotor setting.
Conclusion: Compare this to the median number of counts for features without a H3K4me3 peak. Note that this short analysis has not taken transcript lengths into account and it compares different genomic regions to each other; this is highly suscepticle to bias such as sequence bias.
library(AnnotationHub)
ah <- AnnotationHub()
ah_E096 <- query(ah, c("E096", "H3K4me3", "narrowPeak"))
ah_record <- ah_E096[["AH30596"]]
ah_record_autosome <- keepSeqlevels(ah_record, autosome, pruning.mode = "coarse")
ah_record_ncbi <- renameSeqlevels(ah_record_autosome, txdb_ncbi)
ncbi_group <- extractSeqlevelsByGroup(species = "Homo sapiens", style = "NCBI", group = "auto")
sample_ncbi <- keepSeqlevels(range(rowRanges(sample_SRR1039508_autosome)), ncbi_group)
ov <- subsetByOverlaps(promoters(sample_ncbi), ah_record_ncbi)
ov <- subsetByOverlaps(sample_SRR1039508, ov)
median(assay(ov, "counts"))
## [1] 205
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] BSgenome.Hsapiens.UCSC.hg19_1.4.0
## [2] BSgenome_1.54.0
## [3] rtracklayer_1.46.0
## [4] AnnotationHub_2.18.0
## [5] BiocFileCache_1.10.2
## [6] dbplyr_1.4.3
## [7] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [8] GenomicFeatures_1.38.2
## [9] airway_1.6.0
## [10] GEOquery_2.54.1
## [11] minfiData_0.32.0
## [12] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.0
## [13] IlluminaHumanMethylation450kmanifest_0.4.0
## [14] minfi_1.32.0
## [15] bumphunter_1.28.0
## [16] locfit_1.5-9.4
## [17] iterators_1.0.12
## [18] foreach_1.5.0
## [19] Biostrings_2.54.0
## [20] XVector_0.26.0
## [21] SummarizedExperiment_1.16.1
## [22] DelayedArray_0.12.3
## [23] BiocParallel_1.20.1
## [24] matrixStats_0.56.0
## [25] GenomicRanges_1.38.0
## [26] GenomeInfoDb_1.22.1
## [27] hgu95av2.db_3.2.3
## [28] org.Hs.eg.db_3.10.0
## [29] AnnotationDbi_1.48.0
## [30] IRanges_2.20.2
## [31] S4Vectors_0.24.4
## [32] biomaRt_2.42.1
## [33] ALL_1.28.0
## [34] Biobase_2.46.0
## [35] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] ellipsis_0.3.0 siggenes_1.60.0
## [3] mclust_5.4.6 base64_2.0
## [5] bit64_0.9-7 interactiveDisplayBase_1.24.0
## [7] xml2_1.3.2 codetools_0.2-16
## [9] splines_3.6.3 scrime_1.3.5
## [11] knitr_1.28 Rsamtools_2.2.3
## [13] annotate_1.64.0 shiny_1.4.0.2
## [15] HDF5Array_1.14.4 BiocManager_1.30.10
## [17] readr_1.3.1 compiler_3.6.3
## [19] httr_1.4.1 fastmap_1.0.1
## [21] assertthat_0.2.1 Matrix_1.2-18
## [23] limma_3.42.2 later_1.0.0
## [25] htmltools_0.4.0 prettyunits_1.1.1
## [27] tools_3.6.3 glue_1.4.0
## [29] GenomeInfoDbData_1.2.2 dplyr_0.8.5
## [31] rappdirs_0.3.1 doRNG_1.8.2
## [33] Rcpp_1.0.4.6 vctrs_0.2.4
## [35] multtest_2.42.0 preprocessCore_1.48.0
## [37] nlme_3.1-147 DelayedMatrixStats_1.8.0
## [39] xfun_0.13 stringr_1.4.0
## [41] mime_0.9 lifecycle_0.2.0
## [43] rngtools_1.5 XML_3.99-0.3
## [45] beanplot_1.2 zlibbioc_1.32.0
## [47] MASS_7.3-51.6 promises_1.1.0
## [49] hms_0.5.3 rhdf5_2.30.1
## [51] RColorBrewer_1.1-2 yaml_2.2.1
## [53] curl_4.3 memoise_1.1.0
## [55] reshape_0.8.8 stringi_1.4.6
## [57] RSQLite_2.2.0 BiocVersion_3.10.1
## [59] genefilter_1.68.0 rlang_0.4.6
## [61] pkgconfig_2.0.3 bitops_1.0-6
## [63] nor1mix_1.3-0 evaluate_0.14
## [65] lattice_0.20-41 purrr_0.3.4
## [67] Rhdf5lib_1.8.0 GenomicAlignments_1.22.1
## [69] bit_1.1-15.2 tidyselect_1.0.0
## [71] plyr_1.8.6 magrittr_1.5
## [73] R6_2.4.1 DBI_1.1.0
## [75] pillar_1.4.3 survival_3.1-12
## [77] RCurl_1.98-1.2 tibble_3.0.1
## [79] crayon_1.3.4 rmarkdown_2.1
## [81] progress_1.2.2 grid_3.6.3
## [83] data.table_1.12.8 blob_1.2.1
## [85] digest_0.6.25 xtable_1.8-4
## [87] httpuv_1.5.2 tidyr_1.0.2
## [89] illuminaio_0.28.0 openssl_1.4.1
## [91] askpass_1.1 quadprog_1.5-8