\documentclass[landscape]{slides}
\usepackage{graphics}
\newcommand{\head}[1]{\begin{center}\textbf{#1}\end{center}}
\renewcommand{\bibitem}[1]{\item\label{#1}}
\renewcommand{\cite}[1]{[\ref{#1}]}
\newcommand{\keyw}[1]{{\textbf {#1}}}
%\pagestyle{empty}
% fullpage, landscape
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 6.5in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 8.9in
% Boolean operators, use regular latex symbols
\newcommand{\band}{\wedge}
\newcommand{\bor}{\vee}
\newcommand{\bnot}{\neg}
\begin{document}
\begin{slide}
\begin{center}
\head{A Survey of Bioinformatics}
Humberto Ortiz Zuazaga
\texttt{humberto@hpcf.upr.edu}
\texttt{http://www.hpcf.upr.edu/\~{}humberto/}
\end{center}
\end{slide}
%%%
\begin{slide}
\head{Bioinformatics}
The application of computers to the collection, analysis, and
presentation of biological information.
\end{slide}
%%%
\begin{slide}
\head{Taxonomy (of the talk)}
\begin{itemize}
\item Sequence analysis
\item Structure prediction
\item Genetic and physical mapping
\item Gene expression networks
%\item Biological databases
%\item Biological software components
\end{itemize}
\end{slide}
\begin{slide}
\head{Pairwise sequence alignments}
Given a pair of sequences over an alphabet $\Sigma$, and a cost
function that assigns a cost to an alignment, find an alignment of the
strings that minimizes the cost.
\begin{itemize}
\item This problem is central to many biological programs (homology
searches, multiple alignments, molecular phylogeny, exon prediction,
protein threading, ...)
\item Usually solved by Dynamic Programming.
\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{Alignment dynamic programming table}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto15.eps}}
\end{center}
\end{slide}
%%%
\begin{slide}
\head{The solved alignment}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto14.eps}}
\end{center}
\end{slide}
\begin{slide}
\head{References}
VSNS BCD Pairwise Alignments chapter.
\texttt{http://www.techfak.uni-bielefeld.de/bcd/Curric/PrwAli/prwali.html}
\end{slide}
%%%
\begin{slide}
\head{Multiple sequence alignments}
\begin{itemize}
\item Assign a cost to alignments of multiple sequences.
\item Sum of pairs metric, add all pairwise alignments together.
\item NP-complete for the SP metric.
\item Other heuristics must be used for practical programs (Divide
and Conquer, HMM's).
\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{A sample multiple alignment}
%\includegraphics{humberto13.eps}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto13.eps}}
\end{center}
\end{slide}
%%%
\begin{slide}
\head{References}
VSNS BCD Multiple Alignment chapter
\texttt{http://www.techfak.uni-bielefeld.de/bcd/Curric/MulAli/mulali.html}
\end{slide}
%%%
\begin{slide}
\head{The H-P model of globular proteins}
Simple model first postulated by Ken Dill~\cite{kD90}.
\begin{itemize}
\item Amino acids are hydrophobic (H, nonpolar) or hydrophilic (P,
polar).
\item H-H contacts contribute -1 to the energy of the protein.
\item all other contacts contribute 0.
\item Protein structures are constrained to self-avoiding paths on a
regular lattice.
\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{H-P proteins on a regular lattice}
% \includegraphics[5in,3in]{humberto12.eps}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto12.eps}}
\end{center}
\end{slide}
%%%
\begin{slide}
\head{The Protein Folding problem}
Given a sequence $s$ and an integer $E$, is there a fold that has $-E$
or lower energy?
\begin{itemize}
\item Has been shown to be NP-complete in 2D (HAMILTONIAN PATH)~\cite{CGPPY98}
\item MAXSNP-complete in 3D~\cite{CGPPY98}
\item Can be approximated to $3/8$ of optimal in linear time in 3D~\cite{wHsI95}\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{The Inverse Protein Folding (IFP) problem}
Given a target structure or conformation of a protein $G$, find a
sequence $s$ of length $n$ that:
\begin{itemize}
\item Has $G$ as is it's minimum energy state.
\item Has the lowest degeneracy (number of other conformations with
the same energy) of any possible sequence.
\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{The Heuristic Sequence Design (HSD) problem}
IFP is conjectured to be NP-complete, the best known algorithm must
search over all possible conformations of all possible sequences. HSD
problems try to simplify the computation by restricting the problem.
\begin{itemize}
\item The Canonical Method: find the sequence with at most $\lambda
n$ hydrophobic residues.
\item The Grand Canonical Method: Change the energy function so that
H-H contacts have -2, an exposed H residue has 1, and all other
interactions have 0.
\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{Computational Results}
\begin{itemize}
\item The Canonical Method is NP-complete (reduction from
SUBSET-SUM), but there are algorithms that can approximate the
energy of the optimal solution (OPT) to 1+OPT on 2D lattices, and 1/2
OPT on 3D lattices~\cite{wH97}.
\item The Grand Canonical Method can be solved in polynomial time.
\end{itemize}
\end{slide}
%%%
% Cite Hart and Istrail
\begin{slide}
\head{References}
\begin{enumerate}
\small
\bibitem{kD90} Ken Dill. Dominant forces in protein folding. {\em
Biochemistry} 29, pages 7133--7155, 1990.
\bibitem{CGPPY98} P. Crescenzi, D. Goldman, C. Papadimitriou, A. Piccolboni,
M. Yannakakis, On the Complexity of Protein Folding. In Proc. of the
Second Annual Conference of Computational Biology (RECOMB
'98).\\ \texttt{http://citeseer.nj.nec.com/31865.html}
\bibitem{wHsI95} Hart, W. and Istrail, S. Fast protein folding in the
hydrophobic-hydrophilic model within three eighths of optimal. In
Proceedings of the 27th Annual ACM Symposium on the Theory of
Computing. 1995.\\
\texttt{http://citeceer.nj.nec.com/hart95fast.html}
\bibitem{wH97} William Hart. On the computational complexity of sequence design
problems. In First Annual International Conference on Computational
Molecular Biology (RECOMB'97), pages 128--136,
1997.\\ \texttt{http://citeseer.nj.nec.com/hart97computational.html}
\end{enumerate}
\end{slide}
%%%
\begin{slide}
\head{Genetic Mapping}
\begin{itemize}
\item Goal: The determination of orders and distances among markers on
a chromosome based on the observed patterns of inheritance of the
alleles of the markers in three generation pedigrees.
\item Problem: For a variety of reasons the genotypic information
is not complete, and not all crosses in human pedigrees are
informative. In addition, the time required to order markers
grows exponentially with the number of markers.
\item Solution: Only use ``good'' markers to make maps. Biologists
already have a notion of a ``framework'' map, a map of a subset of
the markers which has very high odds against inversion of adjacent
markers.
\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{A genotyped pedigree}
% \includegraphics{humberto1.eps}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto1.eps}}
\end{center}
\end{slide}
%%% Minimum break method
\begin{slide}
\head{Counting Obligate Breaks as an Estimate of Genetic Distance}
A simple estimate of the genetic distance between two markers is the
number of observed recombinations between the markers in the data
set. For the first two markers in our sample pedigree we would
have:
\begin{center}
\setlength{\tabcolsep}{0.25ex}
\begin{tabular}{l|cccccccccccc}
UT851 &U&M&U&M&U&M&U&M&U&M&U&M \\
UT1398 &P&P&P&M&M&M&P&P&P&M&M&M \\
\hline
Breaks & &1& & & & & &1& & & &
\end{tabular}
\end{center}
for a total of 2 breaks.
This technique based on counting the number of recombinations is known
as meiotic breakpoint analysis (BPA).
\end{slide}
%3) wclique formulation as a way of formalizing biological intuition.
%%%
\begin{slide}
\head{Selecting Genetic Markers With \texttt{wclique}}
We have implemented an algorithm for screening markers for genetic
mapping by transforming the marker selection problem into a maximum
weighted clique (MWC) problem.
Each marker becomes a node of a graph. The weight of the node
corresponds to the frequency of known phases (\textit{i.e.}, the
total count of P and M phases) for this marker. This measure
directly reflects how informative each marker is for linkage
analysis.
Two nodes in the graph are connected by an edge whose weight is the
number of breaks between the corresponding markers. This is a
heuristic estimate for genetic distance, but has been shown to
result in correct marker orders as the number of gametes tends to
infinity.
\end{slide}
%show small sample pedigree and draw distance graph.
\begin{slide}
\head{A Small Distance Graph}
%\includegraphics[4in,4in]{humberto7.eps}
\begin{center}
\resizebox{!}{5.5in}{\includegraphics{humberto7.eps}}
\end{center}
\end{slide}
%5) NP-completeness result for weighted clique, explain why this is
%bad.
\begin{slide}
\head{The Maximal Weighted Clique problem is NP-Complete}
The MWC is a well known graph problem, extensively studied in
computer science. Unfortunately, it belongs to the class of
NP-complete problems, for which there is unlikely to be an efficient
algorithm.
Building a linear map by ordering genetic markers so as to minimize
the number of recombination events in a set of gametes can also be
cast as a graph problem, the traveling salesman problem (TSP), which
is also NP-complete.
\end{slide}
%6) Show algorithms for wclique, other possible avenues of attack.
\begin{slide}
\head{But I Still Need a Map}
\begin{itemize}
\item Exact algorithms can work on small sets of markers.
\item Local search techniques can find near optimal solutions for
some of these problems, at the cost of not knowing if an optimal
solution was ever found. The best heuristics for TSP can find a
solution with 1.05 times the optimal cost.
\item A change in the formulation of the problems can enable other
algorithms to be used. For example, if the data had no errors,
was complete, and no double recombination events occurred,
ordering genetic markers would be equivalent to the consecutive
ones problem (C1P) for which there are linear time algorithms.
\end{itemize}
\end{slide}
%%%
\begin{slide}
\head{Searls plot of unselected markers}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto2.eps}}
\end{center}
\end{slide}
%%%
\begin{slide}
\head{Searls plot of \texttt{wclique}-selected markers}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto3.eps}}
\end{center}
\end{slide}
%%%
\begin{slide}
\head{Comparison of MLA Maps of Hand-Selected
and \texttt{wclique}-Selected Markers}
% \includegraphics[3in,4in]{humberto4.eps}
\begin{center}
\resizebox{!}{5in}{\includegraphics{humberto4.eps}}
\end{center}
\end{slide}
%%%
\begin{slide}
\head{References}
\begin{enumerate}
\bibitem{wclique} H.
Ortiz-Zuazaga, and R. Plaetke. Screening
genetic markers with the maximum weighted clique method. Abstract
presented at Genome Mapping and Sequencing. Cold Spring Harbor, May
1997.
\bibitem{framework} S.L.
Naylor, R. Plaetke, H.
Ortiz-Zuazaga, P. O'Connell, B. Reus, X.
He, R. Linn, S. Wood, and R.J. Leach. Construction of Framework and
Radiation Hybrid Maps of Chromosomes 3 and 8. Abstract presented at
Genome Mapping and Sequencing. Cold Spring Harbor, NY, May
1997.
\end{enumerate}
\end{slide}
%%%
\begin{slide}
\head{Gene expression networks}
\begin{itemize}
\item Complete genomes available for several species.
\item 40,000 human genes, many already sequenced.
\item microarrays can measure expression levels for
ALL GENES in a single assay.
\end{itemize}
\end{slide}
\begin{slide}
\head{Boolean Genetic Network Model}
In~\cite{ISMB} we define Boolean genetic network model (BGNM):
\begin{itemize}
\item A {\em Boolean variable} takes the values 0, 1.
\item A {\em Boolean function} is a function of Boolean variables,
using the operations $\band$, $\bor$, $\bnot$.
\end{itemize}
A {\em Boolean genetic network model} (BGNM) is:
\begin{itemize}
\item An $n$-tuple of Boolean variables $(x_1,\dots,x_n)$ associated with
the genes
\item An $n$-tuple of Boolean control functions $(f_1,\dots,f_n)$,
describing how the genes are regulated
\end{itemize}
\end{slide}
\begin{slide}
\head{Results on Boolean Networks}
\begin{itemize}
\item Determining if a given assignment to all the variables is
consistent with a given gene network was shown to be NP-complete in~\cite{akutsu} (by reduction from 3-SAT).
\item In the worst case, $2^{(n-1)/2}$ experiments are needed
\item If the indegree of each node (the genes that affect our target
gene) is bound by a constant $D$, the cost is $O(n^{2D})$.
\item For low $D$,~\cite{ideker} and~\cite{liang} provide effective
procedures for reverse engineering, assuming any gene may be set to
any value.
\end{itemize}
\end{slide}
\begin{slide}
\head{Reverse Engineering Boolean Networks}
\begin{enumerate}
\bibitem{akutsu} Akutsu, S. Kuahara, T. Maruyama, O. Miyano, S. 1998. Identification of
gene regulatory networks by strategic gene disruptions and gene
overexpressions. Proceedings of the 9th ACM-SIAM Symposium on Discrete
Algorithms (SODA 98), H. Karloff, ed. ACM Press.
\bibitem{ideker} Ideker, T.E., Thorsson, V., and Karp, R.M. 2000. Discovery of
regulatory interactions through perturbation: inference and
experimental design. Pacific Symposium on Biocomputing 5:302-313.
\bibitem{liang} S. Liang, S. Fuhrman and R. Somogyi. 1998. REVEAL, A General Reverse
Engineering Algorithm for Inference of Genetic Network
Architectures. Pacific Symposium on Biocomputing 3:18-29.
\end{enumerate}
\end{slide}
\begin{slide}
\head{The World's Smallest Finite Field}
The integers $0$ and $1$, with integer addition and
multiplication modulo 2 form the finite field
$Z_2 = \{\{0,1\},+,\cdot\}$.
The operators $+$ and $\cdot$ are defined as follows:
\[\begin{array}{c|cc}
+ & 0 & 1 \\
\hline
0 & 0 & 1 \\
1 & 1 & 0
\end{array}
\hspace{3cm}
\begin{array}{c|cc}
\cdot & 0 & 1 \\
\hline
0 & 0 & 0 \\
1 & 0 & 1
\end{array}
\]
\end{slide}
\begin{slide}
\head{Finite field equivalents to the Boolean operators}
We can realize any Boolean function as an expression over $Z_2$:
\begin{eqnarray}
X \band Y&=& X\cdot Y \nonumber \\
X \bor Y &=& X+Y+X\cdot Y \nonumber \\
\bnot X &=& 1 + X \nonumber
\end{eqnarray}
\end{slide}
\begin{slide}
\head{Finite Field Genetic Networks}
Any BGNM can be converted into an equivalent model over $Z_2$ by
realizing the Boolean functions as sums-of-products and
products-of-sums, then converting the Booleans to $Z_2$. We now have a {\em finite field genetic network}
(FFGN):
\begin{itemize}
\item An $n$-tuple of variables over $Z_2$, $(x_1,\dots,x_n)$ associated with
the genes
\item An $n$-tuple of functions over $Z_2$, $(f_1,\dots,f_n)$, describing how
the genes are regulated
\end{itemize}
\end{slide}
\begin{slide}
\head{Publications}
\begin{enumerate}
\bibitem{RECOMB} Ortiz-Zuazaga, H., Avi\~{n}o-Diaz, M.~A.,
Laubenbacher, R., Moreno O. Finite fields are better
Booleans. Refereed abstract, poster to be presented at the Seventh
Annual Conference on Computational Molecular Biology (RECOMB 2003),
April 10--13, 2003, Germany.
\bibitem{ISMB}
Ortiz-Zuazaga, H., Avi\~{n}o-Diaz, M.~A., Corrada Bravo, C.~J.,
Laubenbacher, R., Pe\~{n}a-de-Ortiz, S., Moreno O. Applications of
finite fields to the study of microarray expression data. Submitted to
the 11th International Conference on
Intelligent Systems for Molecular Biology (ISMB 2003), June 29 to July
3, 2003, Brisbane, Australia.
\end{enumerate}
\end{slide}
\end{document}
%%%
\begin{slide}
\head{}
\end{slide}