Quick summary about SlamSeq:

Important terms:

Simulation and Evaluation

So far I have simulated data sets for a set of 1000 randomly chosen 3’ UTRs from the mouse annotation set we are using for the real data-sets.

I simulated data sets for all combinations of the following parameters:

For all data sets I compared the simulated labeling rate to the recovered labeling rates by computing:

Based on this I tried visualizing the effect of read length, coverage, T->C rate and labeling rates on the performance of SlamSeq/SlamDunk

Reading data from file

library(dplyr)
library(tidyr)
library(ggplot2)

tcs = c(0.024, 0.07)
covs = c(25, 50, 75, 100, 150, 200)
rls = c(38, 88, 138)
reps = c(1)

# Read data
data = read.table("~/Dropbox/Slamdunk/Data/mesc_1k_random_expressed_eval.tsv", sep = '\t')
dataUnique = read.table("~/Dropbox/Slamdunk/Data/mesc_1k_random_expressed_eval_unique.tsv", sep = '\t')

#data = read.table("~/Dropbox/Slamdunk/Data/golden_list.tsv", sep = '\t')
#dataUnique = read.table("~/Dropbox/Slamdunk/Data/golden_list_unique.tsv", sep = '\t')

Probability to see a T->C mutation in a “labeled” read of length R, given a T->C conversion rate of tcRatePerPosition

readLength = 38
tcRatePerPosition = 0.024
1 - dbinom(0, round(readLength / 4), tcRatePerPosition)
## [1] 0.2156712

Relative error for all combinations of read length and coverage

Absolut error for all combinations of read lengths and coverage

Log2 error for all combinations of read lengths and coverage

Relative error for labeling rate bins

Scatter plots: simulated vs recovered labeling rates

Median relative error

for(r in rls) {
  p = data %>% filter(rl==r) %>% ggplot(., aes(x=cov, y=rel_error)) + 
    stat_summary(fun.y = median, geom = "point") +
    coord_cartesian(ylim = c(0,20)) +
    ylab("relative error [%]") + xlab("Coverage") + ggtitle(paste0(r, " bp reads"))
  print(p)
}

data %>% group_by(simulated, cov, rl) %>% count(rel_error < 20)
## Source: local data frame [756 x 5]
## Groups: simulated, cov, rl [?]
## 
##    simulated   cov    rl `rel_error < 20`     n
##        <dbl> <int> <int>            <lgl> <int>
## 1          0    25    38            FALSE  1661
## 2          0    25    38             TRUE   339
## 3          0    25    88            FALSE  1853
## 4          0    25    88             TRUE   147
## 5          0    25   138            FALSE  1889
## 6          0    25   138             TRUE   111
## 7          0    50    38            FALSE  1371
## 8          0    50    38             TRUE   629
## 9          0    50    88            FALSE  1762
## 10         0    50    88             TRUE   238
## # ... with 746 more rows
data %>% filter(rl == 88, cov == 100) %>% group_by(simulated, cov, rl) %>% count(abs_error < 0.05)
## Source: local data frame [41 x 5]
## Groups: simulated, cov, rl [?]
## 
##    simulated   cov    rl `abs_error < 0.05`     n
##        <dbl> <int> <int>              <lgl> <int>
## 1       0.00   100    88               TRUE  2000
## 2       0.05   100    88              FALSE     3
## 3       0.05   100    88               TRUE  1997
## 4       0.10   100    88              FALSE    15
## 5       0.10   100    88               TRUE  1985
## 6       0.15   100    88              FALSE    46
## 7       0.15   100    88               TRUE  1954
## 8       0.20   100    88              FALSE   120
## 9       0.20   100    88               TRUE  1880
## 10      0.25   100    88              FALSE   153
## # ... with 31 more rows
data %>% filter(rl == 88, cov == 100) %>% count(abs_error < 0.05)
## # A tibble: 2 <U+00D7> 2
##   `abs_error < 0.05`     n
##                <lgl> <int>
## 1              FALSE  7128
## 2               TRUE 34872
dataUnique %>% filter(rl == 88, cov == 100) %>% count(abs_error < 0.05)
## # A tibble: 2 <U+00D7> 2
##   `abs_error < 0.05`     n
##                <lgl> <int>
## 1              FALSE  7211
## 2               TRUE 34789
for(rl in c(38, 88, 138)) {
  for(cov in c(50, 100, 200)) {

    data %>% filter(rl == rl, cov == cov) %>% ggplot(., aes(abs_error)) + xlim(-1, 1) + geom_histogram(binwidth = 0.01)
    
  }
}

# A histogram of bill sizes
data %>% filter(rl, cov == 50 | cov == 100 | cov == 200) %>% ggplot(., aes(x=abs_error)) + xlim(-0.25, 0.25) + geom_histogram(binwidth = 0.01) + facet_grid(rl ~ cov)
## Warning: Removed 2254 rows containing non-finite values (stat_bin).

data %>% filter(rl, cov == 50 | cov == 100 | cov == 200) %>% ggplot(., aes(x=rel_error)) + xlim(-25, 25) + geom_histogram(binwidth = 1) + facet_grid(rl ~ cov)
## Warning: Removed 71746 rows containing non-finite values (stat_bin).

stats = read.table("/project/ngs/philipp/slamdunk-analysis/paper/paper_runs/slamdunk_mESC_full_timecourse_1/summary.txt", header=T)

stats$mapped_perc = stats$Mapped / stats$Sequenced

stats[stats$SampleType == "chase",]$SampleTime = 1440 - stats[stats$SampleType == "chase",]$SampleTime

plot(stats$mapped_perc ~ stats$SampleTime)

cor(stats$mapped_perc, stats$SampleTime)
## [1] 0.3163251