So far I have simulated data sets for 1000 randomly chosen 3' UTRs from the mouse annotation set we use for the real data sets.
I simulated data sets for all combinations of the following parameters (defined as vectors in the code below):

- T->C conversion rate per position: 0.024 and 0.07
- coverage: 25, 50, 75, 100, 150 and 200
- read length: 38, 88 and 138 bp
- replicates: 1
For all data sets I compared the simulated labeling rate to the recovered labeling rate by computing the absolute error (recovered minus simulated rate) and the relative error (the absolute error expressed as a percentage of the simulated rate), as sketched below.
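A minimal sketch of the two metrics; the argument names are illustrative, and how the evaluation handles a simulated rate of 0 is not shown here:

absErrorOf = function(recovered, simulated) recovered - simulated
relErrorOf = function(recovered, simulated) 100 * (recovered - simulated) / simulated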
Based on this, I tried to visualize the effect of read length, coverage, T->C rate and labeling rate on the performance of SlamSeq/SlamDunk.
library(dplyr)
library(tidyr)
library(ggplot2)
# Simulation parameters
tcs = c(0.024, 0.07)                 # T->C conversion rates per position
covs = c(25, 50, 75, 100, 150, 200)  # coverages
rls = c(38, 88, 138)                 # read lengths [bp]
reps = c(1)                          # replicates
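# Hedged sketch: the full simulation grid implied by the vectors above
# (2 T->C rates x 6 coverages x 3 read lengths x 1 replicate = 36 data sets).
# expand.grid is used here only for illustration; the actual simulation
# driver is not shown.
paramGrid = expand.grid(tc = tcs, cov = covs, rl = rls, rep = reps)
nrow(paramGrid)  # 36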
# Read evaluation tables: all reads vs. only uniquely mapping reads
data = read.table("~/Dropbox/Slamdunk/Data/mesc_1k_random_expressed_eval.tsv", sep = '\t', header = T)
dataUnique = read.table("~/Dropbox/Slamdunk/Data/mesc_1k_random_expressed_eval_unique.tsv", sep = '\t', header = T)
#data = read.table("~/Dropbox/Slamdunk/Data/golden_list.tsv", sep = '\t', header = T)
#dataUnique = read.table("~/Dropbox/Slamdunk/Data/golden_list_unique.tsv", sep = '\t', header = T)
# Back-of-envelope check: probability that a single read carries at least
# one T->C conversion, assuming roughly a quarter of read positions are Ts
readLength = 38
tcRatePerPosition = 0.024
1 - dbinom(0, round(readLength / 4), tcRatePerPosition)
## [1] 0.2156712
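The same back-of-envelope estimate extends to all simulated read lengths and T->C rates (still assuming that roughly a quarter of the positions in a read are Ts; this is a rough sketch, not a property of the simulator):

# P(a read carries at least one T->C conversion) for every rl/tc combination
pAtLeastOneTC = function(rl, tc) 1 - dbinom(0, round(rl / 4), tc)
outer(rls, tcs, pAtLeastOneTC)  # rows: read lengths, columns: T->C rates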
# Median relative error as a function of coverage, one plot per read length
for(r in rls) {
  p = data %>% filter(rl == r) %>% ggplot(., aes(x = cov, y = rel_error)) +
    stat_summary(fun.y = median, geom = "point") +
    coord_cartesian(ylim = c(0, 20)) +
    ylab("relative error [%]") + xlab("Coverage") + ggtitle(paste0(r, " bp reads"))
  print(p)
}
# Number of UTR observations within / outside 20 % relative error, per parameter combination
data %>% group_by(simulated, cov, rl) %>% count(rel_error < 20)
## Source: local data frame [756 x 5]
## Groups: simulated, cov, rl [?]
##
## simulated cov rl `rel_error < 20` n
## <dbl> <int> <int> <lgl> <int>
## 1 0 25 38 FALSE 1661
## 2 0 25 38 TRUE 339
## 3 0 25 88 FALSE 1853
## 4 0 25 88 TRUE 147
## 5 0 25 138 FALSE 1889
## 6 0 25 138 TRUE 111
## 7 0 50 38 FALSE 1371
## 8 0 50 38 TRUE 629
## 9 0 50 88 FALSE 1762
## 10 0 50 88 TRUE 238
## # ... with 746 more rows
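To compare these counts across parameter combinations more directly, the same grouping can be summarized as the fraction of UTRs within 20 % relative error (a sketch; na.rm guards against non-finite relative errors):

data %>% group_by(simulated, cov, rl) %>% summarise(frac_within_20 = mean(rel_error < 20, na.rm = TRUE))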
# Restricted to 88 bp reads at 100x coverage: absolute error within 0.05, by simulated labeling rate
data %>% filter(rl == 88, cov == 100) %>% group_by(simulated, cov, rl) %>% count(abs_error < 0.05)
## Source: local data frame [41 x 5]
## Groups: simulated, cov, rl [?]
##
## simulated cov rl `abs_error < 0.05` n
## <dbl> <int> <int> <lgl> <int>
## 1 0.00 100 88 TRUE 2000
## 2 0.05 100 88 FALSE 3
## 3 0.05 100 88 TRUE 1997
## 4 0.10 100 88 FALSE 15
## 5 0.10 100 88 TRUE 1985
## 6 0.15 100 88 FALSE 46
## 7 0.15 100 88 TRUE 1954
## 8 0.20 100 88 FALSE 120
## 9 0.20 100 88 TRUE 1880
## 10 0.25 100 88 FALSE 153
## # ... with 31 more rows
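The same table condenses to one fraction per simulated labeling rate, which makes the trend of the absolute error with increasing labeling rate easier to see (a sketch):

data %>% filter(rl == 88, cov == 100) %>% group_by(simulated) %>% summarise(frac_within_0.05 = mean(abs_error < 0.05, na.rm = TRUE))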
# Overall count within 0.05 absolute error (88 bp reads, 100x coverage)
data %>% filter(rl == 88, cov == 100) %>% count(abs_error < 0.05)
## # A tibble: 2 × 2
## `abs_error < 0.05` n
## <lgl> <int>
## 1 FALSE 7128
## 2 TRUE 34872
# Same, using only uniquely mapping reads
dataUnique %>% filter(rl == 88, cov == 100) %>% count(abs_error < 0.05)
## # A tibble: 2 × 2
## `abs_error < 0.05` n
## <lgl> <int>
## 1 FALSE 7211
## 2 TRUE 34789
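For a direct side-by-side comparison of the default and the unique-mappers-only evaluation, both fractions can be computed in one step (a sketch; it assumes both tables carry the same columns):

fracWithin = function(df) df %>% filter(rl == 88, cov == 100) %>% summarise(frac = mean(abs_error < 0.05, na.rm = TRUE))
bind_rows(all = fracWithin(data), unique = fracWithin(dataUnique), .id = "mapping")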
# Histograms of absolute errors for selected read length / coverage combinations.
# Note: the loop variables must not shadow the column names, otherwise
# filter(rl == rl) compares the column to itself and filters nothing.
for(r in c(38, 88, 138)) {
  for(cv in c(50, 100, 200)) {
    p = data %>% filter(rl == r, cov == cv) %>% ggplot(., aes(abs_error)) +
      xlim(-1, 1) + geom_histogram(binwidth = 0.01)
    print(p)
  }
}
# Faceted histograms of absolute and relative errors by read length and coverage
data %>% filter(cov %in% c(50, 100, 200)) %>% ggplot(., aes(x=abs_error)) + xlim(-0.25, 0.25) + geom_histogram(binwidth = 0.01) + facet_grid(rl ~ cov)
## Warning: Removed 2254 rows containing non-finite values (stat_bin).
data %>% filter(cov %in% c(50, 100, 200)) %>% ggplot(., aes(x=rel_error)) + xlim(-25, 25) + geom_histogram(binwidth = 1) + facet_grid(rl ~ cov)
## Warning: Removed 71746 rows containing non-finite values (stat_bin).
# Mapping statistics for the mESC full time-course run
stats = read.table("/project/ngs/philipp/slamdunk-analysis/paper/paper_runs/slamdunk_mESC_full_timecourse_1/summary.txt", header=T)
stats$mapped_perc = stats$Mapped / stats$Sequenced  # fraction of sequenced reads that mapped
# Put chase time points on the same time axis as the pulse (1440 min = 24 h)
stats[stats$SampleType == "chase",]$SampleTime = 1440 - stats[stats$SampleType == "chase",]$SampleTime
plot(stats$mapped_perc ~ stats$SampleTime)
cor(stats$mapped_perc, stats$SampleTime)
## [1] 0.3163251
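Whether this modest positive correlation between mapping rate and sample time is distinguishable from zero can be checked with a standard correlation test on the same two columns (a sketch):

cor.test(stats$mapped_perc, stats$SampleTime)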