[med-svn] [kineticstools] 01/04: Imported Upstream version 0.6.1+20161222

Afif Elghraoui Sun, 15 Jan 2017 14:28:57 -0800

This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository kineticstools.


commit 75c2f1c2be66860d6c3c8b14d4a1ebd99a606ad9
Author: Afif Elghraoui <a...@debian.org>
Date:   Sun Jan 15 14:20:29 2017 -0800

    Imported Upstream version 0.6.1+20161222
---
 doc/whitepaper/ecoli-roc.png          | Bin 0 -> 26253 bytes
 doc/whitepaper/kinetics.bib           |  92 +++++++++++++++++
 doc/whitepaper/kinetics.pdf           | Bin 0 -> 399628 bytes
 doc/whitepaper/kinetics.tex           | 183 ++++++++++++++++++++++++++++++++++
 doc/whitepaper/logo.jpg               | Bin 0 -> 22613 bytes
 doc/whitepaper/position-influence.png | Bin 0 -> 88200 bytes
 6 files changed, 275 insertions(+)

diff --git a/doc/whitepaper/ecoli-roc.png b/doc/whitepaper/ecoli-roc.png
new file mode 100644
index 0000000..f291508
Binary files /dev/null and b/doc/whitepaper/ecoli-roc.png differ
diff --git a/doc/whitepaper/kinetics.bib b/doc/whitepaper/kinetics.bib
new file mode 100644
index 0000000..b8d2744
--- /dev/null
+++ b/doc/whitepaper/kinetics.bib
@@ -0,0 +1,92 @@
+@article{ forney1973viterbi,
+  title={The viterbi algorithm},
+  author={Forney Jr, G.D.},
+  journal={Proceedings of the IEEE},
+  volume={61},
+  number={3},
+  pages={268--278},
+  year={1973},
+  publisher={IEEE}
+}
+
+@article{ flusberg2010direct,
+  title={Direct detection of DNA methylation during single-molecule, real-time 
sequencing},
+  author={Flusberg, B.A. and Webster, D.R. and Lee, J.H. and Travers, K.J. and 
Olivares, E.C. and Clark, T.A. and Korlach, J. and Turner, S.W.},
+  journal={Nature methods},
+  volume={7},
+  number={6},
+  pages={461--465},
+  year={2010},
+  publisher={Nature Publishing Group}
+}
+
+@article{ eid2009real,
+  title={Real-time DNA sequencing from single polymerase molecules},
+  author={Eid, J. and Fehr, A. and Gray, J. and Luong, K. and Lyle, J. and 
Otto, G. and Peluso, P. and Rank, D. and Baybayan, P. and Bettman, B. and 
others},
+  journal={Science},
+  volume={323},
+  number={5910},
+  pages={133--138},
+  year={2009},
+  publisher={American Association for the Advancement of Science}
+}
+
+@article{ clark2012characterization,
+  title={Characterization of DNA methyltransferase specificities using 
single-molecule, real-time DNA sequencing},
+  author={Clark, T.A. and Murray, I.A. and Morgan, R.D. and Kislyuk, A.O. and 
Spittle, K.E. and Boitano, M. and Fomenkov, A. and Roberts, R.J. and Korlach, 
J.},
+  journal={Nucleic Acids Research},
+  volume={40},
+  number={4},
+  pages={e29--e29},
+  year={2012},
+  publisher={Oxford Univ Press}
+}
+
+@Article{ blasr, 
+AUTHOR = {Chaisson, Mark and Tesler, Glenn},
+TITLE = {Mapping single molecule sequencing reads using Basic Local Alignment 
with Successive Refinement (BLASR): Theory and Application},
+JOURNAL = {BMC Bioinformatics},
+VOLUME = {13},
+YEAR = {2012},
+NUMBER = {1},
+PAGES = {238},
+URL = {http://www.biomedcentral.com/1471-2105/13/238},
+DOI = {10.1186/1471-2105-13-238},
+ISSN = {1471-2105},
+ABSTRACT = {We describe the method BLASR (Basic Local Alignment with 
Successive Refinement) for mapping Single Molecule Sequencing (SMS) reads that 
are thousands to tens of thousands of bases long with divergence between the 
read and genome dominated by insertion and deletion error. We also present a 
combinatorial model of sequencing error that motivates why our approach is 
effective. The results indicate that mapping SMS reads is both highly specific 
and rapid.},
+}
+
+
+@book{ cart,
+author = "L. Breiman and J. H. Friedman and R. Olshen and C. J. Stone",
+title = "Classification and Regression Trees",
+publisher = "Wadsworth",
+address = "Belmont, California",
+year = "1984"
+}
+
+@article{ gradientboost,
+   AUTHOR="Jerome H. Friedman",
+   TITLE="Greedy Function Approximation: A Gradient Boosting Machine",
+   JOURNAL="The Annals of Statistics",
+   YEAR="2001",
+  VOLUME="29",
+  PAGES="1189-1232"
+}
+
+@article{ crf-boost,
+  AUTHOR= "Thomas G. Dietterich, Adam Ashenfelter, and Yaroslav Bulatov",
+  TITLE="Training Conditional Random Fields via Gradient Tree Boosting",
+  JOURNAL="International Conference on Machine Learning",
+  YEAR="2004",
+  VOLUME="1",
+  PAGES="1-2"
+}
+
+@Manual{ gbm,
+  title = {gbm: Generalized Boosted Regression Models},
+  author = {Greg Ridgeway},
+  year = {2010},
+  note = {R package version 1.6-3.1},
+  url = {http://CRAN.R-project.org/package=gbm},
+}
diff --git a/doc/whitepaper/kinetics.pdf b/doc/whitepaper/kinetics.pdf
new file mode 100644
index 0000000..da899c6
Binary files /dev/null and b/doc/whitepaper/kinetics.pdf differ
diff --git a/doc/whitepaper/kinetics.tex b/doc/whitepaper/kinetics.tex
new file mode 100644
index 0000000..76943c0
--- /dev/null
+++ b/doc/whitepaper/kinetics.tex
@@ -0,0 +1,183 @@
+\documentclass[pdftex]{article}
+\usepackage{float}
+\usepackage{graphicx}
+\usepackage{hyperref}
+\usepackage{amsmath}
+
+\DeclareMathOperator*{\argmax}{argmax}
+
+\newcommand{\mC}[1]{$^\textrm{m#1}$C}
+\newcommand{\mA}{$^\textrm{m6}$A}
+
+\oddsidemargin  0.0in \evensidemargin 0.0in
+\textwidth      6in
+
+
+\begin{document}
+
+\title{\begin{center}\includegraphics[width=2in]{logo.jpg}\end{center}\vspace{0.5cm}
+Detection and Identification of Base Modifications with Single Molecule 
Real-Time Sequencing Data}
+\author{Patrick Marks, Onureena Banerjee, David Alexander}
+\maketitle
+
+
+% Will say the section an page
+\pagestyle{headings}
+
+\section{Introduction}
+Single Molecule Real-Time (SMRT\textsuperscript{\textregistered}) DNA 
sequencing\cite{eid2009real} data contains information about DNA base 
modifications imprinted in the kinetics of the polymerization 
reaction\cite{flusberg2010direct}.  This data enables single-base resolution 
detection of \mA, \mC{4}, and \mC{5}\cite{clark2012characterization}. We 
discuss the statistical approaches to the detection and identification of DNA 
modifications from SMRT sequencing data.  Our implementation i [...]
+
+
+\section{Preparation of IPD data}
+\label{sec:prep}
+
+This analysis assumes that we are interested in locus-specific DNA 
modifications - that is, samples where the same modification is present at a 
given locus, in some fraction of the sample molecules.  
+
+We compute a vector of IPDs that map confidently to a given genomic location. 
We use the alignment tool BLASR \cite{blasr} to generate alignments between 
reads and a reference template.   BLASR maps each read to the reference 
sequence. BLASR generates a Mapping QV representing the confidence that the 
read maps uniquely to the selected genomic interval.  Reads falling within 
repeats will have a low Mapping QV and are removed from the analysis to prevent 
incorrect modification calls inside [...]
+
+Sequencing errors in SMRT sequence data are predominately indels, which causes 
some ambiguity in the local placement of IPDs to genomic positions.  To avoid 
most of these errors we only use IPDs if the SMRT sequence matches exactly for 
$k$ bases around observed base. Currently we use $k=1$.
+
+These filtering steps yield a vector of IPDs confidently assigned to each 
genomic position.
+
+The distribution of IPDs observed at a genomic location is roughly 
exponential, with a long tail caused by polymerase pausing inherent to SMRT 
sequencing. Information about base modifications is contained in the main 
distribution, while the contaminating pauses just contribute noise.  We employ 
a simple capping procedure to reduce the influence of these outliers: 
+$I_i^{(l)} = \min(R_i^{(l)}, Q_{99})$ where $I_i^{(l)}$ is the $i$th capped 
IPD at genomic location $l$, $R_i^{(l)}$ is the $i$th raw IPD and $Q_{99}$ is 
the $99$th percentile IPD over all IPD observations.
+
+We summarize the capped IPDs at each positions with a sample mean and standard 
deviation of the mean:
+\begin{eqnarray} 
+\mu_l & = & \frac{1}{n_l} \sum_i R_i^{(l)} \\
+\sigma_l^2 & = & \frac{1}{n_l} \sqrt{\sum_i (R_i^{(l)} - m_l)^2 }
+\end{eqnarray}
+
+
+\section{Control IPDs}
+
+Modification detection proceeds by comparing the observed mean IPD $m_l$ with 
the expected mean IPD for unmodified DNA at that location. In case-control mode 
we sequence a sample of DNA from the same organism that has been \emph{whole 
genome amplified} (WGA) to preserve the DNA sequence while removing all base 
modifications. In this mode of operation we summarize the IPDs observed in the 
control sample in the same way as the case sample, then proceed to the 
modification detection step. I [...]
+
+\subsection{\emph{in-silico} Control}
+An alternative to the \emph{case-control} paradigm is to construct an 
\emph{in-silico} model to predict the mean IPD  unmodified DNA given the local 
sequence context. We use the Gradient Boosting Machines \cite{gradientboost} to 
construct a function $Q(\mathbf{c})$ that returns an estimate of the mean IPD 
given a DNA context $\mathbf{c} = \{c_{-8}, c_{-7}, ... ,c_0, c_1, .., c_3\}$ 
where $c_i \in \{A, C, G, T\}$ encodes the DNA sequence surrounding $c_0$. The 
IPD prediction $Q(\mathbf{c} [...]
+
+\begin{eqnarray}
+\mu_l^{(c)} & = & Q(\mathbf{Ctx}(l)) \\
+\sigma_l^{(c)} & = & f(\mu_l^{(c)})
+\end{eqnarray}
+
+\begin{figure}
+\begin{center}\includegraphics[width=5in]{position-influence.png}\end{center}
+\caption{ Influence of Base in Context Window }
+\label{fig:position-influence}
+\end{figure}
+
+
+\section{Modification Detection}
+We pose the modification detection problem as the detection of genome 
locations whose mean IPD differs significantly from the control mean. We use a 
Welch's $t$-test to test for differences in the means between the case sample 
and the control, derived from either a control sample or the \emph{in-silico} 
model above.
+
+The $t$-statistic is defined as:
+\begin{eqnarray}
+s_l & = & = \sqrt{ \sigma_l^2 + \sigma_l^{(c)2}} \\
+T_l & = &\frac{\mu_l - \mu_l^{(c)}}{s_l} 
+\end{eqnarray}
+
+and we compute the p-value of $T_l$ under the $t$-distribution, and report a 
Phred-transformed QV as well:
+\begin{eqnarray}
+p & = & \Pr(t > T_l) \\
+\mathrm{QV} & = & -10 \log_{10} p
+\end{eqnarray}
+
+
+\section{ Modification Identification } 
+
+\subsection { Positive Control Model }
+We extend the \emph{in-silico} model from an alphabet over DNA bases $c_i \in 
\{A,C,G,T\}$ to an alphabet that includes the \emph{modified} bases we aim to 
identify: $c_i \in \{A,C,G,T, ^\textrm{m6}A, ^\textrm{m5}C, ^\textrm{m4}C \}$. 
To train this model we require labeled examples of the modified bases in a 
reasonable diversity of background contexts.  Base modifications appearing in 
bacterial restriction-modification systems provide an excellent source of 
training data that is fairly s [...]
+
+Our positive control training set incorporates data from 11 bacteria, with the 
following number of unique methylated motifs for each modification type -- 
\mA:36, \mC{4}:7, \mC{5}:7.
+
+
+\subsection{Viterbi Decoding}
+
+Let $\mathbf{S} = s_1, s_2, s_3, ... s_n$ be the unmodified DNA sequence ($s_i 
\in \{A,C,G,T\}$). Let $\mathbf{M} = m_1, m_2, m_3, ..., m_n$ be a DNA sequence 
carrying modifications ($m_i \in \{A,C,G,T, ^\textrm{m6}A, ^\textrm{m5}C, 
^\textrm{m4}C \}$). A modification removal $R(m)$ operation maps a modification 
to it's unmodified base - so $R(^\textrm{m6}A) = A, R(^\textrm{m5}C) = C, 
R(^\textrm{m4}C) = C$.  The modified sequence $\mathbf{M}$ is constrained to 
maintain the base identity o [...]
+
+$$
+\log \Pr(\mathbf{O} \mid \mathbf{M}) = \sum_i \log \Pr(O_i \mid 
\mathcal{C}(\mathbf{M}, i))
+$$
+
+where $\mathcal{C}(\mathbf{M}, i)$ is the \emph{context} function that snips 
out the -8 to +3 bp context from sequence $\mathbf{M}$ around position $i$.
+IPD observations $O_i$ are assumed independent given the context 
$\mathcal{C}(\mathbf{M}, i)$.  We seek to find the modification sequence 
$\hat{\mathbf{M}}$ that maximizes the likelihood of IPD observations: 
+
+$$
+\hat{\mathbf{M}} = \argmax_{\mathbf{M}} \sum_i \log \Pr(O_i \mid 
\mathcal{C}(\mathbf{M}, i)) 
+$$
+
+Again we use the t-distribution to model the likelihood of the observed IPDs, 
given a mean prediction generated by the \emph{in-silico} IPD model:
+
+\begin{eqnarray}
+T_i & = & \frac{\mu_i - Q(\mathcal{C}(\mathbf{M}, i)}{s_i} \\
+Pr(O_i \mid \mathcal{C}(\mathbf{M}, i)) & = & f(T_i)
+\end{eqnarray}
+
+where $f(T)$ is the $t$-distribution PDF.
+
+We find the maximum-likelihood modification sequence by applying the Viterbi 
algorithm\cite{forney1973viterbi}. At each position $i$ we define 
$\mathbf{H}^{(i)}$, the set of all possible modification contexts centered at 
$i$ with a unmodified sequence that matches the reference sequence. In order to 
reduce the algorithm run time, we reduce the size of $\mathbf{H}^{(i)}$ by only 
considering alternatives that have some supporting evidence in the nearby 
single-site $p$-values. Here we show  [...]
+
+$$
+\mathbf{H}^{(i)} = \{ \mathcal{C}(\mathbf{M}, i) \mid \mathbf{M}, R(m_i) = s_i 
\} 
+$$
+
+The Viterbi forward matrix $\alpha(H^{(i)}_j, i)$ is defined recursively: The 
first argument is the current state $H^{(i)}_j$ drawn from the possible 
modification configurations $\mathbf{H}^{(i)}$; the second argument is the 
position $i$.
+
+$$
+\alpha(H^{(i)}_j, i) = \max_{K \in \mathbf{H}^{(i-1)}} \alpha(K, i-1) \Pr(O_i 
\mid H^{(i)}_j) \; \mathbf{SM}(K, H^{(i)}_j)
+$$
+
+$\mathbf{SM}(K, L) = \mathbf{1}\{K_1=L_2, ..., K_{11}=L_{12}\}$ is the 
\emph{context matching function}, where $K$ and $L$ are 12 base context 
strings. It returns $1$ if the last 11 bases of $K$ match the first 11 bases of 
$L$, and $0$ otherwise. This enforces the constraint that modification sequence 
contexts are self-consistent for all paths through the Viterbi matrix.
+
+The standard Viterbi traceback procedure yields the maximum-likelihood 
modification state at each genomic position. We compute a Modification QV for 
each modification in the ML configuration by comparing the likelihood of the 
best modification sequence to the likelihood with a given modification set back 
to the canonical base:
+
+\begin{eqnarray}
+p_i & = & \frac{\Pr(\mathbf{O} \mid R(\hat{\mathbf{M}}, i))} {\log 
\Pr(\mathbf{O} \mid R(\hat{\mathbf{M}}, i)) + \log \Pr(\mathbf{O} \mid 
\mathbf{M}') } \\
+\mathrm{QV}_i & = & -10 \log_{10} p_e
+\end{eqnarray}
+
+where $R(\hat{\mathbf{M}}, i))$ denotes the $\hat{\mathbf{M}}$ with base $m_i$ 
converted to the unmodified base $R(m_i)$.
+
+\section{Results}
+\subsection{Detection Performance}
+Figure \ref{fig:ecoli-roc} shows the ROC curve for single-site detection of 
$^\textrm{m6}$A in \emph{E.coli}, for data from varying numbers of SMRT Cells. 
In these data we get the following coverage levels: 2 cells: 32x per strand, 4 
cells: 65x per strand, 6 cells: 95x per strand. The \emph{E.coli} strain tested 
here has three methylated motifs: G\textbf{A}TC, GC\textbf{A}CNNNNNNGTT, 
A\textbf{A}CNNNNNNGTGC, with a total of 39430 sites matching one of those 
motifs.   Commonly, we observe  [...]
+
+\begin{figure}
+\begin{center}\includegraphics[width=5in]{ecoli-roc.png}\end{center}
+\caption{ ROC for $^\textrm{m6}$A detection in \emph{E.coli} genome }
+\label{fig:ecoli-roc}
+\end{figure}
+
+\subsection{Identification Performance}
+
+We tested the modification identification capability on the bacteria 
\emph{Desulfurobacterium thermolithotrophum}, which carries RM systems using 
\mA, \mC{5}, and \mC{4} modifications. The motif CA\textbf{C}C is modified with 
\mC{4}, GG\textbf{C}C with \mC{5}, and G\textbf{A}TC with \mA. Table 
\ref{table-confusion} shows that we can accurately separate \mC{4} and \mC{5} 
modifications, while accurately calling \mA. The TET treatment protocol that 
amplifies the \mC{5} signal appears to neg [...]
+
+\begin{table}
+\centering
+\begin{tabular}{c c c c c c}
+Motif & Not Detected & \mC{4} & \mC{5} & \mA & modified\_base \\
+\hline 
+CACC &   3830 &  \textbf{1222} &    10 &    0  &      114 \\
+GGCC &    235 &    0 & \textbf{1065} &    0  &       4  \\
+GATC &    66 &    0 &    0 & \textbf{4836}  &       0 \\
+None &     &   85 &   98 &   93  &       21799  \\
+\end{tabular}
+\caption{Modification Identification Confusion Matrix}
+\label{table-confusion}
+\end{table}
+
+
+\bibdata{kinetics}
+\bibliography{kinetics}
+\bibliographystyle{unsrt}
+
+\vspace{1in}
+
+\small{
+For Research Use Only. Not for use in diagnostic procedures. © Copyright 2012, 
Pacific Biosciences of California, Inc. All rights reserved. Information in 
this document is subject to change without notice. Pacific Biosciences assumes 
no responsibility for any errors or omissions in this document. Certain 
notices, terms, conditions and/or use restrictions may pertain to your use of 
Pacific Biosciences products and/or third party products. Please refer to the 
applicable Pacific Biosciences [...]
+Pacific Biosciences, the Pacific Biosciences logo, PacBio, SMRT and SMRTbell 
are trademarks of Pacific Biosciences in the United States and/or certain other 
countries. All other trademarks are the sole property of their respective 
owners.  }
+
+
+
+\end{document} 
\ No newline at end of file
diff --git a/doc/whitepaper/logo.jpg b/doc/whitepaper/logo.jpg
new file mode 100644
index 0000000..ddea572
Binary files /dev/null and b/doc/whitepaper/logo.jpg differ
diff --git a/doc/whitepaper/position-influence.png 
b/doc/whitepaper/position-influence.png
new file mode 100644
index 0000000..0e96887
Binary files /dev/null and b/doc/whitepaper/position-influence.png differ

-- 
Alioth's /usr/local/bin/git-commit-notice on 
/srv/git.debian.org/git/debian-med/kineticstools.git

_______________________________________________
debian-med-commit mailing list
debian-med-commit@lists.alioth.debian.org
http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit

[med-svn] [kineticstools] 01/04: Imported Upstream version 0.6.1+20161222

Reply via email to