%% ToC#231  v002/a012.tex  A. Deshpande, L. Rademacher, S. Vempala, G. Wang

\documentclass{toc}

\tocdetails{
   volume=2, number=12, year=2006, firstpage=225,
   received={August 11, 2005}, 
   revised={September 28, 2006},
   published={October 12, 2006}
}

\newcommand{\reals}{\ensuremath{\mathbb{R}}}
\newcommand{\norm}[1]{\|#1\|}
\newcommand{\fnorm}[1]{\norm{#1}_{F}}
\newcommand{\fnorms}[1]{\norm{#1}_{F}^2}
\newcommand{\suchthat}{\;:\;}
\newcommand{\card}[1]{\lvert#1\rvert}

\DeclareMathOperator{\vol}{vol}
\DeclareMathOperator{\sgn}{sgn}
\DeclareMathOperator{\rowspan}{span}
\DeclareMathOperator{\proj}{\pi}
\DeclareMathOperator{\E}{\mathsf{E}}

\newlength{\algboxwidth}
\setlength{\algboxwidth}{\textwidth}
\addtolength{\algboxwidth}{-8pt}

\newcommand\eps{\epsilon}
\newcommand\poly{\mbox{poly}}

\runningauthor{A. Deshpande, L. Rademacher, S. Vempala, G. Wang}

\copyrightauthor{Amit Deshpande and Luis Rademacher and Santosh Vempala and Grant Wang}
\runningtitle{Matrix Approximation and Projective Clustering via
  Volume Sampling}

\bibliographystyle{tocplain}

\begin{document}

\begin{frontmatter}
\title{Matrix Approximation and Projective Clustering via~Volume~Sampling}

\tocpdftitle{Matrix Approx. and Proj. Clustering via Vol. Sampling}
\tocpdfauthor{Amit Deshpande and Luis Rademacher and Santosh Vempala and Grant Wang}

\author[amit]{Amit Deshpande\thanks{Supported by NSF Award CCR-0312339.}}%
\author[luis]{Luis Rademacher\thanks{Supported by NSF Award CCR-0312339.}}%
\author[santosh]{Santosh Vempala\thanks{Supported by NSF Award CCR-0312339 and a Guggenheim Foundation Fellowship.}}%
\author[grant]{Grant Wang\thanks{Supported by NSF Award CCR-0312339.}}%

\tockeywords{algorithms, matrix approximation, projective clustering.}

\begin{abstract}
  Frieze, Kannan, and Vempala (JACM 2004) proved that a small sample
  of rows of a given matrix $A$ spans the rows of a low-rank
  approximation $D$ that minimizes $\fnorm{A-D}$ within a small additive
  error, and the sampling can be done efficiently using just two
  passes over the matrix. In this paper, we generalize this result in
  two ways. First, we prove that the additive error drops
  exponentially by iterating the sampling in an adaptive manner
  (\emph{adaptive sampling}).  Using this result, we give a
  pass-efficient algorithm for computing a low-rank approximation with
  reduced additive error.  Our second result is that there exist $k$
  rows of $A$ whose span contains the rows of a multiplicative
  $(k+1)$-approximation to the best rank-$k$ matrix; moreover, this
  subset can be found by sampling $k$-subsets of rows from a natural
  distribution (\emph{volume sampling}).  Combining \emph{volume
    sampling} with \emph{adaptive sampling} yields the existence of a
  set of $k+k(k+1)/\eps$ rows whose span contains the rows of a
  multiplicative $(1+\eps)$-approximation.  This leads to a PTAS for
  the following NP-hard projective clustering problem: Given a set $P$
  of points in $\reals^d$, and integers $j$ and $k$, find subspaces
  $F_1, \dotsc, F_j$, each of dimension at most $k$, that minimize
  $\sum_{p \in P} \min_{i} d(p,F_i)^2$.
\end{abstract}

\tocams{68W25, 65F15}
\tocacm{G.1.3, F.2.1}

\end{frontmatter}

\section{Introduction}

\subsection{Motivation}

Let the rows of a matrix be points in a high-dimensional space. It is
often of interest to find a low-dimensional representation. The
subspace spanned by the top $k$ right singular vectors of the matrix
is a good choice for many applications.  The problem of efficiently
finding an approximation to this subspace has received much attention
in the past decade~\cite{FKV, DFKVV, AM, DK, DKM}.  In this paper, we
give new algorithms for this problem and show existence of subspaces
lying in the span of a small set of rows with better additive
approximation as well as multiplicative approximation.  At the heart
of our analysis are generalizations of previous sampling
schemes~\cite{FKV}.  We apply these results to the general problem of
finding $j$ subspaces, each of dimension at most $k$, so as to
minimize the sum of squared distances of each point to its nearest
subspace, a measure of the ``error'' incurred by this representation;
as a result, we obtain the first polynomial-time approximation scheme
for this \emph{projective clustering} problem~\cite{AGGR,PJAM,APWYP,AgMu} when $j$ and $k$ are fixed.

The case of $j=1$, \ie, finding a single $k$-dimensional subspace, is
an important problem in itself and can be solved efficiently (for $j
\geq 2$, the problem is NP-hard~\cite{MT}, even for $k=1$~\cite{DFKVV}).  The optimal projection is given by the rank-$k$ matrix
$A_k = AYY^T$ where the columns of $Y$ are the top $k$ right singular
vectors of $A$ and can be computed using the Singular Value
Decomposition. Note that among all rank-$k$ matrices $D$, $A_k$ is the
one that minimizes $\fnorms{A-D} = \sum_{i,j} (A_{ij}-D_{ij})^2$. The
running time of this algorithm, dominated by the SVD computation of an
$m \times n$ matrix, is $O(\min\{mn^2, nm^2\})$. Although polynomial,
this is still too high for some applications.

For problems on data sets that are too large to store/process in their
entirety, one can view the data as a stream of data items arriving in
arbitrary order, and the goal is to process a subset chosen
judiciously on the fly and then extrapolate from this subset.
Motivated by the question of finding a faster algorithm, Frieze et
al.~\cite{FKV} showed that any matrix $A$ has $k/\eps$ rows whose span
contains the rows of a rank-$k$ approximation to $A$ within additive
error $\eps \fnorms{A}$.  In fact, the subset of rows can be obtained
as independent samples from a distribution that depends only on the
norms of the rows.  (In what follows, $A^{(i)}$ denotes the $i$th row
of $A$.)
\begin{theorem}[\cite{FKV}]\label{FKV}
Let $S$ be a sample of $s$ rows of an $m \times n$ matrix $A$,
each chosen independently from the
following distribution: row $i$ is picked with probability
\[
P_i = \frac{\norm{A^{(i)}}^2}{\fnorms{A}}\enspace.
\]
If $s \geq k/\eps$, then $\rowspan(S)$ contains the rows of a matrix
$\tilde{A}_k$ of rank at most $k$ for which
\[
\E_S(\fnorms{A-\tilde{A}_k}) \leq \fnorms{A-A_k} + \eps\fnorms{A}\enspace.
\]
\end{theorem}
This can be turned into an efficient algorithm based on
sampling~\cite{DFKVV}.\footnote{Frieze et al.~\cite{FKV} go further to
  show that there is an $s \times s$ submatrix for $s=\poly(k/\eps)$ from
  which the low-rank approximation can be computed in
  $\poly(k,1/\eps)$ time in an implicit form.}  The algorithm makes
one pass through $A$ to figure out the sampling distribution and
another pass to sample and compute the approximation. Its complexity
is $O(\min\{m,n\}k^2/\eps^4)$.  These results lead us to the following
questions: (1) Can the error be reduced significantly by using
multiple passes through the data?  (2) Can we get multiplicative
$(1+\eps)$-approximations? (3) Do these sampling algorithms have any
consequences for the general projective clustering problem?

\subsection{Our results}\label{sec:ourresults}
We begin with the observation that the additive error term drops
\emph{exponentially} with the number of passes when the sampling is
done adaptively.  Thus, low-rank approximation is a natural problem
for which multiple passes through the data are highly beneficial.

The idea behind the algorithm is quite simple. As an illustrative
example, suppose the data consists of points along a $1$-dimensional
subspace of $\reals^n$ except for one point. The best rank-$2$
subspace has zero error. However, one round of sampling will most
likely miss the point far from the line. So we use a two-round
approach. In the first pass, we get a sample from the squared norm
distribution.  Then we sample again, but adaptively---we sample with
probability proportional to the squared distance to the span of the
first sample.  We call this procedure \emph{adaptive sampling}.  If
the lone far-off point is missed in the first pass, it will have a
high probability of being chosen in the second pass.  The span of the
full sample now contains the rows of a good rank-$2$ approximation. In
the theorem below, for a set $S$ of rows of a matrix $A$, we denote by
$\proj_S(A)$ the matrix whose rows are the projection of the rows of
$A$ to the span of $S$.

\newcommand{\maintheorem}{
  Let $S = S_1 \cup \dotsb \cup S_t$ be a random sample of rows of an
  $m \times n$ matrix $A$ where, for $j=1,\dotsc,t$, each set $S_j$ is
  a sample of $s$ rows of $A$ chosen independently from the following
  distribution: row $i$ is picked with probability
  \[
  P_i^{(j)} = \frac{\norm{E_j^{(i)}}^2}{\fnorms{E_j}}
  \]
  where $E_1 =A$, $E_j = A - \proj_{S_1 \cup \dotsb \cup S_{j-1}}(A)$.
  Then for $s \geq k/\eps$, $\rowspan(S)$ contains the rows of a matrix
  $\tilde{A}_k$ of rank at most $k$ such that
  \[
  \E_{S} (\fnorms{A-\tilde{A}_k}) \leq \frac{1}{1-\eps}
  \fnorms{A-A_k} + \eps^t\fnorms{A}\enspace.
  \]
}
\begin{theorem}\label{MAIN}
  \maintheorem
\end{theorem}

The proof of \expref{Theorem}{MAIN} is given in
\expref{Section}{SEC:ADD}. The resulting algorithm, described in
\expref{Section}{SEC:ALGO} uses $2t$ passes through the data.  Although
the sampling distribution is modified $t$ times, the matrix itself is
not changed.  This is especially significant when $A$ is sparse as the
sparsity of the matrix is maintained.

\expref{Theorem}{MAIN} raises the question of whether we can get a
multiplicative approximation instead of an additive approximation.  To
answer this question, we generalize the sampling approach.  For a set
$S$ of $k$ points in $\reals^n$, let $\Delta(S)$ denote the
$k$-dimensional simplex formed by them along with the origin.  We pick
a random $k$-subset $S$ of rows of $A$ with probability proportional
to $\vol(\Delta(S))^2$. This procedure, which we call \emph{volume
  sampling}, is a generalization of the earlier sampling approach
which picks single rows according to their squared norms.

\begin{theorem}\label{KROWS}
  Let $S$ be a random $k$-subset of rows of a given matrix $A$ chosen
  with probability
  \[
  P_S = \frac{\vol(\Delta(S))^2}{\sum_{T:|T|=k} \vol(\Delta(T))^2}\enspace.
  \]
  Then $\tilde{A}_k$, the projection of $A$ to the span of $S$, satisfies
  \[
  \E_S(\fnorms{A-\tilde{A}_k}) \leq (k+1) \fnorms{A-A_k}\enspace.
  \]
\end{theorem}

We prove this theorem in \expref{Section}{RELATIVE}. Moreover, the
factor of $k+1$ is the best possible for a $k$-subset
(\expref{Proposition}{TIGHT}). By combining \expref{Theorem}{KROWS}
with the adaptive sampling idea from \expref{Theorem}{MAIN}, we show
that there exist $O(k^2/\eps)$ rows whose span contains the rows of a
multiplicative $(1+\eps)$-approximation.

\begin{theorem}\label{RELATIVE}
  For any $m \times n$ matrix $A$, there exist $k+k(k+1)/\eps$ rows
  whose span contains the rows of a rank-$k$ matrix $\tilde{A}_k$ such
  that
  \[
  \fnorms{A-\tilde{A}_k} \leq \left(1+\eps\right) \fnorms{A-A_k}\enspace.
  \]
\end{theorem}

The existence of a small number of rows containing a good multiplicative
approximation is the key ingredient in our last result---a
polynomial-time approximation scheme (PTAS) for the general projective
clustering problem.  This result makes a connection between matrix
approximation and projective clustering.  A key idea in the matrix
approximation work of~\cite{DK,DFKVV,FKV} is that, for any matrix,
there is a small subset of its rows whose span contains a good
approximation to the row space of the entire matrix.  This is similar
to the idea of core-sets~\cite{AHV}, which have been studied in
computing extent measures in computational geometry (and applied to a
variant of the projective clustering problem~\cite{HV}).  Roughly
speaking, a core-set is a subset of a point-set such that computing
the extent measure on the core-set provides an approximation to the
extent measure on the entire point set.  An extent measure is just a
statistic on the point set (for example, the diameter of a point set,
or the radius of the minimum enclosing cylinder); typically, it
measures the size of the point set or the minimum size of an object
that encloses the point set~\cite{AHV}.

We state the projective clustering problem using the notation from
computational geometry: Let $d(p,F)$ be the orthogonal distance of a
point $p$ to a subspace $F$.  Given a set $P$ of $n$ points in
$\reals^d$, find $j$ subspaces $F_1, \dotsc, F_j$, each of dimension
$k$, such that
\begin{equation}
\mathcal{C}(F_1,\dotsc,F_j) = \sum_{p \in P} \min_{i} d(p,F_i)^2
\end{equation}
is minimized. When subspaces are replaced by flats, the case $k=0$
corresponds to the ``$j$-means problem'' in computational geometry.

\expref{Theorem}{RELATIVE} suggests an enumerative algorithm. The
optimal set of $k$-dimensional subspaces induces a partition $P_1 \cup
\dotsb \cup P_j$ of the given point set. Each set $P_i$ contains a subset
of size $O(k^2/\eps)$ in whose span lies a multiplicative
$(1+\eps)$-approximation to the optimal $k$-dimensional subspace for
$P_i$. So we consider all possible combinations of $j$ subsets each of
size $O(k^2/\eps)$, and a $\delta$-net of $k$-dimensional subspaces in the
span of each subset.  The $\delta$-net depends on the points in each subset
and is not just a grid, as is often the case.  Each possible
combination of subspaces induces a partition and we output the best of
these. Since the subset size is bounded (and so is the size of the
net), this gives a PTAS for the problem (see
\expref{Section}{sec:CLUSTER}) when $j$ and $k$ are taken to be fixed
constants.
\begin{theorem}\label{CLUSTER}
  Given $n$ points in $\reals^d$ and parameters $B$ and $\eps$, in time
\[d \left( \frac{n}{\epsilon} \right)^{O(j k^3/\epsilon)}\]
we can find a solution to the projective clustering problem which is
of cost at most $(1+\epsilon) B$ provided there is a solution of cost
$B$.
\end{theorem}

\subsection{Related work}\label{RELATED}

The work of Frieze et al.~\cite{FKV} and Drineas et al.~\cite{DFKVV}
introduced matrix sampling for fast low-rank approximation.
Subsequently, an alternative sampling-based algorithm was given by
Achlioptas and McSherry~\cite{AM}.  That algorithm achieves somewhat
different bounds (see~\cite{AM} for a detailed comparison) using only
one pass. It does not seem amenable to the multipass improvements
presented here.  Bar-Yossef~\cite{Bar} has shown that the bounds of
these algorithms for one or two passes are optimal up to polynomial
factors in $1/\eps$.

These algorithms can also be viewed in the \emph{streaming} model of
computation~\cite{HRR}.  In this model, we do not have random access
to data; the data comes as a stream of data items in arbitrary order
and we are allowed one or a few sequential passes over the data.
Algorithms for the streaming model have been designed for computing
frequency moments~\cite{AMS}, histograms~\cite{GKS}, etc. and have
mainly focused on what can be done in one pass.  There has been some
recent work on what can be done in multiple passes~\cite{DK, FKMSZ}.
The ``pass-efficient'' model of computation was introduced
in~\cite{HRR}.  Our multipass algorithms fit this model and relate the
quality of approximation to the number of passes.  Feigenbaum et
al.~\cite{FKMSZ} show such a relationship for computing the maximum
unweighted matching in bipartite graphs.

The Lanczos method is an iterative algorithm that is used in practice
to compute the Singular Value Decomposition~\cite{GvL,KM}.  An
exponential decrease in an additive error term has also been proven
for the Lanczos method under a different notion of additive error
(\cite{GvL,KM}).  However, the exponential decrease in error depends
on the gap between singular values.  In particular, the following is
known for the Lanczos method: after $k$ iterations, each approximate
singular value $\theta_i^2$ obeys:
\begin{equation}\label{eq:lanc}
\theta_i^2 \geq \sigma_i^2 - c^kC
\end{equation}
where $\sigma_i$ is the $i$th singular value.  Both constants $c$ and
$C$ depend on the gap between singular values; in particular, $c$ is
proportional to $(\sigma_2^2 - \sigma_r^2)/(2 \sigma_1^2 - \sigma_2^2
- \sigma_r^2)$ and $C = \sigma_1^2 - \sigma_r^2$.  The guarantee
\eqref{eq:lanc} can be transformed into an inequality:
\begin{equation}
\fnorms{A-\tilde{A}_k} \leq \fnorms{A-A_k} + c^kC
\end{equation}
very similar to \expref{Theorem}{MAIN}, but without the multiplicative
error term for $\fnorms{A-A_k}$.  However, note that when $\sigma_1^2 =
\sigma_2^2$, successive rounds of the Lanczos method fail to converge to
$\sigma_i^2$.  In the Lanczos method, each iteration can be implemented in
one pass over $A$, whereas our algorithm requires two passes over $A$
in each iteration.  Kuczy\'{n}ski and Wo\'{z}niakowski~\cite{KW} prove
that the Lanczos method, with a randomly chosen starting vector,
outputs a vector $v$ with $\tilde{A}_1 = \proj_{\rowspan(v)}(A)$ such
that $\fnorms{\tilde{A}_1} \geq (1-\eps) \fnorms{A_1}$ in
$\log(n)/\sqrt{\epsilon}$ iterations.  However, this does not imply a
multiplicative $(1+\eps)$ error to $\fnorms{A-A_1}$.

While the idea of volume sampling appears to be new, in~\cite{GT} it
is proved that, using the rows and columns that correspond to the $k
\times k$ submatrix of maximal volume, one can compute a rank-$k$
approximation to the original matrix which differs in each entry by at
most $(k+1) \sigma_{k+1}$ (leading to much weaker $\sqrt{mn}$
approximations for the Frobenius norm).

The results of our paper connect two previously separate fields:
low-rank approximation and projective clustering.  Algorithms and
systems based on projective clustering have been applied to facial
recognition, data-mining, and synthetic data~\cite{AGGR,PJAM,APWYP},
motivated by the observation that no single subspace performs as well
as a few different subspaces. It should be noted that the advantage of
a low-dimensional representation is not merely in the computational
savings, but also the improved quality of retrieval.  In~\cite{AgMu},
Agarwal and Mustafa consider the same problem as in this paper, and
propose a variant of the $j$-means algorithm for it.  Their paper has
promising experimental results but does not provide any theoretical
guarantees.  There are theoretical results for special cases of
projective clustering, especially the $j$-means problem (find $j$
points).  Drineas et al.~\cite{DFKVV} gave a $2$-approximation to
$j$-means using SVD.  Subsequently, Ostrovsky and Rabani~\cite{OR}
gave the first randomized polynomial time approximation schemes for
$j$-means (and also the $j$-median problem).  Matou\v{s}ek~\cite{M} and
Effros and Schulman~\cite{ES} both gave deterministic PTAS's for
$j$-means.  Fernandez de la Vega et al.~\cite{FKKR} describe a
randomized algorithm with a running time of $n(\log n)^{O(1)}$.  Using
the idea of core-sets, Har-Peled and Mazumdar~\cite{HM} showed a
multiplicative $(1+\eps)$-approximation algorithm that runs in linear
time for fixed $j,\eps$.  Kumar et al.~\cite{KSS} give a linear-time
PTAS that uses random sampling.  There is a PTAS for $k=1$ (lines) as
well~\cite{APV}.  Other objective functions have also been studied,
\eg\ sum of distances ($j$-median when $k=0$,~\cite{OR, HM}) and
maximum distance ($j$-center when $k=0$,~\cite{BHI}).  For general
$k$, Har-Peled and Varadarajan~\cite{HV} give a multiplicative
$(1+\eps)$-approximation algorithm for the maximum distance objective
function.  Their algorithm runs in time
$dn^{O(jk^6\log(1/\eps)/\eps^5)}$ and is based on core-sets
(see~\cite{AHV} for a survey).

\subsection{Previous versions of this paper}

This paper is a journal version of the paper by the same set of
authors presented at the 17th Annual Symposium on Discrete Algorithms
(SODA 2006)~\cite{DRVW}.  This paper contains the same major results,
but additionally provides more intuition and more complete proofs.  A
previous version~\cite{RVW} of this paper by a subset of three of the
authors (L.  Rademacher, S. Vempala, G. Wang) appeared as an MIT CSAIL
technical report.  That version contained a subset of the results in
this paper.  In particular, the notion of volume sampling and related
results did not appear in the technical report.

\subsection{Notation and preliminaries}

Any $m \times n$ real matrix $A$ has a singular value decomposition,
that is, it can be written in the form
\[
    A = \sum_{i=1}^r \sigma_i u^{(i)} {v^{(i)}}^T
\]
where $r$ is the rank of $A$ and $\sigma_1 \geq \sigma_2 \geq
\dotsb \geq \sigma_r \geq 0$ are called the singular values;
$\{u^{(1)}, \dotsc, u^{(r)}\} \in \reals^m$, $\{v^{(1)}, \dotsc,
v^{(r)}\} \in \reals^n$ are sets of orthonormal vectors, called the
left and right singular vectors, respectively. It follows that
$A^T u^{(i)} = \sigma_i v^{(i)}$ and $A v^{(i)} = \sigma_i
u^{(i)}$ for $1 \leq i \leq r$.

The Frobenius norm of a matrix $A \in \reals^{m \times n}$ having
elements $(a_{ij})$ is denoted $\fnorm{A}$ and is given by
\[
\fnorm{A}^2 = \sum_{i=1}^m \sum_{j=1}^n a_{ij}^2\enspace.
\]
It satisfies $\fnorms{A} = \sum_{i=1}^r \sigma_i^2$.

For a subspace $V \subseteq \reals^n$, let $\proj_{V,k}(A)$ denote
the best rank-$k$ approximation (under the Frobenius norm) of $A$
with its rows in $V$.  Let
\[
\proj_k(A) = \proj_{\reals^n,k}(A) =
\sum_{i=1}^k \sigma_i u^{(i)} {v^{(i)}}^T\
\]
be the best rank-$k$
approximation of $A$.  Also $\proj_V(A) = \proj_{V,n}(A)$ is the
orthogonal projection of $A$ onto $V$. When we say ``a set (or
sample) of rows of $A$'' we mean a set of indices of rows, rather
than the actual rows. For a set $S$ of rows of $A$, let
$\rowspan(S) \subseteq \reals^n$ be the subspace generated by those
rows; we use the simplified notation $\proj_{S}(A)$ for
$\proj_{\rowspan(S)}(A)$ and $\proj_{S,k}(A)$ for
$\proj_{\rowspan(S),k}(A)$.

For subspaces $V, W \subseteq \reals^n$, their sum is denoted
$V+W$ and is given by
\[
    V+W = \{ x + y \in \reals^n \suchthat x \in V, y \in W\}\enspace.
\]

The following elementary properties of the operator $\proj_V$ will
be used:
\begin{itemize}
\item $\proj_V$ is linear, that is, $\proj_V(\lambda A + B) =
\lambda \proj_V(A) + \proj_V(B)$ for any $\lambda \in \reals$ and
matrices $A,B \in \reals^{m \times n}$.

\item If $V,W \in \reals^n$ are orthogonal linear subspaces, then
$\proj_{V+W}(A) = \proj_V(A) + \proj_W (A)$, for any matrix $A \in
\reals^{m \times n}$.
\end{itemize}

For a random vector $v$, its expectation, denoted
$\E(v)$, is the vector having as components the expected
values of the components of $v$.

\section{Improved approximation via adaptive sampling}

\subsection{Proof that adaptive sampling works}\label{SEC:ADD}

We will prove \expref{Theorem}{MAIN} in this section.
It will be convenient to formulate an intermediate theorem as follows,
whose proof is quite similar to one in~\cite{FKV}. 

\begin{theorem}\label{thm:tworounds}
  Let $A \in \reals^{m \times n}$, and $V \subseteq \reals^n$ be a vector
  subspace. Let $E = A - \proj_V(A)$ and let $S$ be a random sample of
  $s$ rows of $A$ from a distribution $D$ such that row $i$ is chosen
  with probability
  \begin{equation}\label{equ:hypothesisPi}
    P_i = \frac{\norm{E^{(i)}}^2}{\fnorms{E}}\enspace.
  \end{equation}
  Then, for any nonnegative integer $k$,
  \[
  \E_S(\fnorms{A-\proj_{V + \rowspan(S),k}(A)}) \leq
  \fnorms{A-\proj_k(A)} + \frac{k}{s} \fnorms{E}\enspace.
  \]
\end{theorem}

\begin{proof}
  We define vectors $w^{(1)}, \dotsc, w^{(k)} \in V + \rowspan(S)$
  such that $W = \rowspan\{w^{(1)}, \dotsc, w^{(k)}\}$ and show that
  $W$ is a good approximation to $\rowspan \{v^{(1)}, \dotsc, v^{(k)}
  \}$ in the sense that
  \begin{equation}
    \label{eq:goal}
    \E_S(\fnorms{A-\proj_{W}(A)}) \leq \fnorms{A-\proj_k(A)}
    + \frac{k}{s} \fnorms{E}\enspace.
  \end{equation}
  Recall that $\proj_k(A) = \proj_{\rowspan \{v^{(1)}, \dotsc,
    v^{(k)}\}}(A)$, \ie, $\rowspan \{v^{(1)}, \dotsc, v^{(k)} \}$ is
  the optimal subspace upon which to project.  Proving \eqref{eq:goal}
  proves the theorem, since $W \subseteq V + \rowspan(S)$.
  
  To this end, define $X_{l}^{(j)}$ to be a random variable such that
  for $i=1, \dotsc,m$ and $l=1, \dotsc, s$,
  $$
  X_{l}^{(j)} = \frac{u_i^{(j)}}{P_i}E^{(i)} =
  \frac{u_i^{(j)}}{P_i}\bigl(A^{(i)} - \proj_{V}(A^{(i)})\bigr)
  \text{ with probability } P_i\enspace.
  $$
  Note that $X_l^{(i)}$ is a linear function of a row of $A$ sampled
  from the distribution $D$.  Let $X^{(j)} = \frac{1}{s} \sum_{l=1}^s
  X_l^{(j)}$, and note that $\E_S(X^{(j)}) = E^T u^{(j)}$.
  
  For $1 \leq j \leq k$, define:
  \begin{equation}
    w^{(j)} = \proj_{V}(A)^Tu^{(j)} + X^{(j)}\enspace.
  \end{equation}
  Then we have that
  $\E_S(w^{(j)}) = \sigma_j v^{(j)}$.  We seek to bound the 
%%  expected
%%  squared norm of the difference between $w^{(j)}$ and its expectation
%%  $\sigma_j v^{(j)}$, 
   second central moment of  $w^{(j)}$,
   \ie,  $\E_S(\norm{w^{(j)} - \sigma_j v^{(j)}}^2)$.  
We have that 
  $$
  w^{(j)} - \sigma_j v^{(j)} = X^{(j)}
  - E^T u^{(j)}\enspace,
$$
 which gives us
  \begin{align}
    \E_S(\norm{w^{(j)} - \sigma_j v^{(j)}}^2)  &= 
    \E_S(\norm{X^{(j)} - E^Tu^{(j)}}^2) \nonumber\\
     &=  \E_S(\norm{X^{(j)}}^2) - 2 \E_S(X^{(j)}) \cdot
    E^Tu^{(j)} + \norm{E^T u^{(j)}}^2 \nonumber\\
     &= \E_S(\norm{X^{(j)}}^2) - \norm{E^T u^{(j)}}^2\enspace. \label{first}
  \end{align}
  
  We evaluate the first term in \eqref{first},
  \begin{align}
    \E_S(\norm{X^{(j)}}^2) & = \E_S\Bigl(\Bigl\| \frac{1}{s} \sum_{l=1}^s X^{(j)}_l\Bigr\|^2\Bigr) \nonumber\\
    & = \frac{1}{s^2} \sum_{l=1}^s \E_S(\norm{ X^{(j)}_l}^2) + 
    \frac{2}{s^2} \sum_{1 \leq l_1 < l_2 \leq s} \E_S(X^{(j)}_{l_1} \cdot X^{(j)}_{l_2}) \nonumber\\
    & = \frac{1}{s^2} \sum_{l=1}^s \E_S(\norm{ X^{(j)}_l}^2) + \frac{s-1}{s} \norm{E^Tu^{(j)}}^2\enspace. \label{second}
  \end{align}
  In \eqref{second} we used that $X^{(j)}_{l_1}$ and $X^{(j)}_{l_2}$
  are independent.  From \eqref{first} and \eqref{second} we have that
  \begin{equation}
    \E_S(\norm{w^{(j)} - \sigma_j v^{(j)}}^2) = 
    \frac{1}{s^2} \sum_{l=1}^s \E_S(\norm{ X^{(j)}_l}^2) - \frac{1}{s} \norm{E^Tu^{(j)}}^2\enspace.
  \end{equation}
  The definition of $P_i$ gives us
  \begin{equation}
    \E_S(\norm{ X^{(j)}_l}^2) = \sum_{i=1}^m P_i \frac{\norm{u_i^{(j)}
        E^{(i)}}}{P_i^2} \leq \fnorm{E}^2\enspace.
  \end{equation}
  Thus, we have obtained a bound on the 
%% expected squared distance of $w^{(j)}$ from its mean:
  second central moment of $w^{(j)}$:
  \begin{equation}
    \label{eq:errorterm}
    \E_S(\norm{w^{(j)} - \sigma_j v^{(j)}}^2)
         \leq \frac{1}{s} \fnorm{E}^2\enspace.
  \end{equation}
  
  With this bound in hand, we can complete the proof.  Let $y^{(j)} =
  w^{(j)}/\sigma_j$ for $1 \leq j \leq k$, and consider the matrix $F
  = A \sum_{i=1}^k v^{(i)} y^{(i)^T}$.  The row space of $F$ is
  contained in $W = \rowspan\{w^{(1)}, \dotsc, w^{(k)}\}$.  Therefore,
  $\fnorm{A-\proj_{W}(A)}^2 \leq \fnorm{A-F}^2$.  We will use $F$ to
  bound the error $\fnorm{A-\proj_W(A)}^2$.
  
  By decomposing $A-F$ along the left singular vectors $u^{(1)},
  \dotsc, u^{(r)}$, we can use the inequality \eqref{eq:errorterm} to bound
  $\fnorm{A-F}^2$:
%% ToC edit: Added ``the inequality'' above, for typesetting purposes only.
  \begin{align}
    \E_S(\fnorm{A-\proj_W(A)}^2) \leq 
    \E_S(\fnorm{A-F}^2) & = 
    \sum_{i=1}^r \E_S(\fnorm{(A-F)^T u^{(i)}}^2) \nonumber \\
    & = \sum_{i=1}^k \E_S(\norm{\sigma_i v^{(i)} - w^{(i)}}^2) + 
    \sum_{i=k+1}^r \sigma_i^2 \nonumber \\
    & \leq \frac{k}{s}\fnorm{E}^2 + \fnorm{A-\proj_{k}(A)}^2\enspace.
  \end{align}
\end{proof}

We can now prove \expref{Theorem}{MAIN} inductively using
\expref{Theorem}{thm:tworounds}.

\begin{proof}[Proof of \expref{Theorem}{MAIN}]
  We will prove the inequality by induction on $t$.  \expref{Theorem}{FKV}
  gives us the base case $t=1$.
  
  For the inductive step, let $E = A - \proj_{S_1 \cup \dotsb \cup
    S_{t-1}}(A)$. By means of \expref{Theorem}{thm:tworounds} with $s \geq
  k/\eps$ we have that
  \begin{align*}
    \E_{S_t} (\fnorms{A-\proj_{S_1 \cup \dotsb \cup
        S_t,k}(A)}) &\leq \fnorms{A-\proj_k(A)} + \eps \fnorms{E}\enspace.
  \end{align*}
  Combining this inequality with the fact that $\fnorms{E} \leq
  \fnorms{A-\proj_{S_1 \cup \dotsb \cup S_{t-1},k}(A)}$ we get
  \begin{align}
    \E_{S_t} (\fnorms{A-\proj_{S_1 \cup \dotsb \cup
        S_t,k}(A)}) &\leq \fnorms{A-\proj_k(A)} + \eps
    \fnorms{A-\proj_{S_1 \cup \dotsb \cup S_{t-1},k}(A)}\enspace.
  \end{align}
  Taking the expectation over $S_1, \dotsc, S_{t-1}$, and using the
  induction hypothesis for $t-1$ gives the result:
  \begin{align}
    \E_{S} (\fnorms{A-\proj_{S_1 \cup \dotsb \cup
        S_t,k}(A)}) & \leq \fnorms{A-\proj_k(A)} + \eps
    \E_{S_1, \dotsc, S_{t-1}}\left(\fnorms{A-\proj_{S_1 \cup
          \dotsb \cup S_{t-1},k}(A)}\right) \\
    & \leq \fnorms{A-\proj_k(A)} + \eps 
    \left( \frac{1}{1-\eps}\fnorms{A-\proj_k(A)} + \eps^{t-1} \fnorms{A}\right) \\
    & = \frac{1}{1-\eps}\fnorms{A-\proj_k(A)} + \eps^{t} \fnorms{A}\enspace.
  \end{align}
\end{proof}

\subsection{Algorithm}\label{SEC:ALGO}
In this section, we present the multipass algorithm for low-rank
approximation. We first describe it at a conceptual level and then
give the details of the implementation.

\begin{sloppypar}
Informally, the algorithm will find an approximation to the best
rank-$k$ subspace (the span of $v^{(1)}, \dotsc, v^{(k)}$) by first
choosing a sample $T$ of $s$ random rows with probabilities
proportional to the squared norms of the rows (as in
\expref{Theorem}{FKV}).  Then we focus on the space orthogonal to the
span of the chosen rows, that is, we consider the matrix $E = A -
\proj_{T}(A)$, which represents the error of our current
approximation, and we sample $s$ additional rows with probabilities
proportional to the squared norms of the rows of $E$. We consider the
union of this sample with our previous sample, and we continue adding
samples in this way, up to the number of passes that we have chosen.
\expref{Theorem}{MAIN} gives a bound on the error of this procedure.
\end{sloppypar}

\begin{center}
\fbox{\parbox{\algboxwidth}{
\begin{minipage}{\algboxwidth}
\begin{sf}
{\bf Fast Approximate SVD}\\

Input: $A \in \reals^{m \times n}$, integers $k \leq m$, $t$, error
parameter $\epsilon > 0$.

Output: A set of $k$ vectors in $\reals^n$.

\begin{enumerate}
\item Let $S = \emptyset$, $ s= k/\epsilon$.

\item\label{item:loop} Repeat $t$ times:

\begin{enumerate}
  
\item Compute the probabilities $P_i = \norm{E^{(i)}}^2/\fnorms{E}$
  for $i=1, \dotsc,m$. (One pass.)
  
\item Let $T$ be a sample of $s$ rows of $A$ according to the
  distribution that assigns probability $P_i$ to row $i$. (Another
  pass.)

\item\label{item:S} Let $S = S \cup T$.
\end{enumerate}

\item\label{item:svd} Let $h_1,\dotsc, h_k$ be the top $k$ right
singular vectors of $\proj_{S}(A)$.

\end{enumerate}
\end{sf}
\end{minipage}
}} \end{center}

In what follows, let $M$ be the number of non-zeros of $A$.

\begin{theorem}
  In $2t$ passes over the data, where in each pass the entries of the
  matrix arrive in arbitrary order, the algorithm finds vectors $h_1,
  \dotsc, h_k \in \reals^{n}$ such that with probability at least
  $3/4$ their span $V$ satisfies
\begin{equation}\label{equ:algorithm}
\fnorms{A - \proj_V(A)} \leq
\left(1+\frac{4\epsilon}{1-\epsilon}\right) \fnorms{A -
\proj_k(A)} + 4 \epsilon^t \fnorms{A}\enspace.
\end{equation}
The running time is $O\left(M
\frac{kt}{\epsilon} + (m+n)\frac{k^2 t^2}{\epsilon^2}\right)$.
\end{theorem}
\begin{proof}
For the correctness, observe that $\proj_{V}(A)$ is a random
variable with the same distribution as $\proj_{S,k}(A)$ as defined
in \expref{Theorem}{MAIN}. Also, $\fnorms{A - \proj_{S,k}(A)} -
\fnorms{A - \proj_k(A)}$ is a nonnegative random variable and
\expref{Theorem}{MAIN} gives a bound on its expectation:
\begin{align}
  \E_{S} (\fnorms{A-\proj_{S,k}(A)} - \fnorms{A -
    \proj_k(A)}) \leq \frac{\eps}{1-\eps} \fnorms{A-\proj_k (A)} +
  \eps^t\fnorms{A}\enspace.
\end{align}
Markov's inequality applied to this variable gives that with
probability at least $3/4$
\begin{align}
  \fnorms{A - \proj_V(A)} - \fnorms{A - \proj_k(A)} \leq
  \frac{4\epsilon}{1-\epsilon} \fnorms{A - \proj_k(A)} + 4
  \epsilon^t \fnorms{A}\enspace,
\end{align}
which implies inequality \eqref{equ:algorithm}.

We will now bound the running time. We maintain a basis of the rows
indexed by $S$. In each iteration, we extend this basis orthogonally
with a new set $Y$ of vectors, so that it spans the new sample $T$.
The residual squared norm of each row, $\norm{E^{(i)}}^2$, as well as
the total, $\fnorms{E}$, are computed by subtracting the contribution
of $\proj_{T}(A)$ from the values that they had during the previous
iteration. In each iteration, the projection onto $Y$ needed for
computing this contribution takes time $O(Ms)$.  In iteration $i$, the
computation of the orthonormal basis $Y$ takes time $O(ns^2i)$
(Gram-Schmidt orthonormalization of $s$ vectors in $\reals^n$ with
reference to an orthonormal basis of size at most $s(i+1)$).  Thus,
the total time in iteration $i$ is $O(M s + n s^2 i)$; with $t$
iterations, this is $O(M s t + ns^2 t^2)$. At the end of
\expref{Step}{item:loop} we have $\proj_{S}(A)$ in terms of our basis
(an $m \times st$ matrix).  Finding the top $k$ singular vectors in
\expref{Step}{item:svd} takes time $O(m s^2 t^2)$. Bringing them back
to the original basis takes time $O(nkst)$. Thus, the total running
time is $O(Mst + n s^2 t^2 + ms^2 t^2 + nkst)$ or, in other words,
$O\left(Mkt/\epsilon + (m+n) k^2 t^2/\epsilon^2\right)$.
\end{proof}

\section{Volume sampling and multiplicative approximation}\label{SEC:RELATIVE}

We begin with a proof of \expref{Theorem}{KROWS}, namely that volume
sampling leads to a multiplicative $(k+1)$-approximation (in expectation).

\begin{proof}[Proof of \expref{Theorem}{KROWS}]
  For every $S \subseteq \{1, \dotsc, m\}$, let $\Delta_S$ be the
  simplex formed by $\{ A^{(i)}\suchthat i \in S \}$ and the origin.  We wish
  to bound $\E_S(\|A-\tilde{A}_k\|_F^2)$ which can be written as follows:
  \begin{equation}
    \label{eq:expectation}
    \E_S(\|A-\tilde{A}_k\|_F^2) = \frac{1}{\sum_{S,|S|=k}
      \vol_k(\Delta_S)^2} \sum_{S, |S|=k} \vol_k(\Delta_S)^2
    \|A-\proj_{S}(A)\|_F^2\enspace.
  \end{equation}
  For any $(k+1)$-subset $S=\{i_1, \dotsc, i_{k+1}\}$ of rows of $A$, we 
can express $\vol_{k+1}(\Delta_S)^2$ in terms of $\vol_{k}(\Delta_{T})^{2}$
for $T = \{i_1, \dotsc, i_k\}$, along with the squared distance from 
$A^{(i_{k+1})}$ to $H_T$, where $H_{T}$ is the linear subspace spanned by 
$\{A^{(i)} \suchthat i \in T\}$:
\begin{equation}
  \vol_{k+1}(\Delta_S)^2 = \frac{1}{(k+1)^2} \vol_{k}(\Delta_T)^2
  d(A^{(i_{k+1})},H_T)^2\enspace.
\end{equation}
  Summing over all subsets $S$ of size $k+1$:
  \begin{align}
    \sum_{S, |S|=k+1} \vol_{k+1} (\Delta_S)^2 &= \frac{1}{k+1}
    \sum_{T, |T|=k} \sum_{j=1}^{m} \frac{1}{(k+1)^2}~ \vol_k
    (\Delta_T)^2~ d(A^{(j)}, H_T)^2 \nonumber \\
    &=  \frac{1}{(k+1)^3} \sum_{T, |T|=k} \vol_k (\Delta_T)^2
    \sum_{j=1}^{m} d(A^{(j)}, H_T)^2 \nonumber \\
    &= \frac{1}{(k+1)^3} \sum_{T, |T|=k} \vol_k (\Delta_T)^2
    \fnorms{A-\proj_T(A)}\enspace,
  \end{align}
  where in the last step we noted that $\sum_{j=1}^{m} d(A^{(j)},
  H_T)^2 = \fnorms{A-\proj_T(A)}$.  By substituting this equality in
  \eqref{eq:expectation} and applying \expref{Lemma}{VOLFORMULA} (proved
  next) twice, we have:
  \begin{align}
    \E_S(\fnorms{A-\tilde{A}_k}) & = \frac{1}{\sum_{T,|T|=k}
      \vol_k(\Delta_T)^2} \left((k+1)^3 \sum_{S, |S|=k+1}
      \vol_{k+1}(\Delta_S)^2\right) \nonumber \\
    & = \frac{1}{\sum_{T,|T|=k} \vol_k(\Delta_T)^2}
    \left( \frac{(k+1)}{(k!)^2} \sum_{1 \leq t_1 < \dotsb < t_{k+1}
        \leq n} \sigma_{t_1}^2 \dotsb \sigma_{t_{k+1}}^2\right) \nonumber\\
   & \leq \frac{1}{\sum_{T,|T|=k} \vol_k(\Delta_T)^2}
    \left( \frac{(k+1)}{(k!)^2}
      \sum_{1 \leq t_1 < \dotsb < t_k \leq r} \sigma_{t_1}^2 \dotsb
      \sigma_{t_k}^2 \sum_{j=k+1}^{r} \sigma_j^2\right)\\
    & = \frac{(k+1)}{\sum_{T,|T|=k} \vol_k(\Delta_T)^2}
    \left( \sum_{T: |T|=k} \vol_k(\Delta_T)^2
      \sum_{j=k+1}^{r} \sigma_j^2\right) \nonumber \\
    & = (k+1) \fnorms{A-A_k}\enspace. \nonumber
  \end{align}

\end{proof}

\begin{lemma}\label{VOLFORMULA}
\[ \sum_{S, |S|=k} \vol_k (\Delta_S)^2 = \frac{1}{(k!)^2} \sum_{1 \leq
  t_1 < t_2 < \dotsb < t_k \leq r} \sigma_{t_1}^2 \sigma_{t_2}^2
\dotsb \sigma_{t_k}^2 \] where $\sigma_1, \sigma_2, \dotsb, \sigma_r >
0 = \sigma_{r+1} = \dotsb = \sigma_n$ are the singular values of $A$.
\end{lemma}
\begin{proof}
  Let $A_S$ be the sub-matrix of $A$ formed by the rows $\{ A^{(i)}
  \suchthat i \in S \}$. Then we know that the volume of the
  $k$-simplex formed by these rows is given by \( \vol_k (\Delta_S) =
  \frac{1}{k!}~ \sqrt{\det(A_S A_S^T)}. \) Therefore,
  \begin{align}
    \sum_{S, |S|=k} \vol_k (\Delta_S)^2 &= \frac{1}{(k!)^2} \sum_{S,
      |S|=k} \det(A_S A_S^T)
    = \frac{1}{(k!)^2} \sum_{ \substack{ B \text{: principal}\\ \text{$k$-minor of $AA^T$}}} \det(B)\enspace.
  \end{align}
  Let $\det(AA^T - \lambda I) = (-1)^m \lambda^m + c_{m-1} \lambda^{m-1} +
  \dotsb + c_0$ be the characteristic polynomial of $AA^T$. From basic
  linear algebra we know that the roots of this polynomial are
  precisely the eigenvalues of $AA^T$, \ie, $\sigma_1^2, \sigma_2^2,
  \dotsc, \sigma_r^2$ and $0$ with multiplicity $(m-r)$.  Moreover the
  coefficient $c_{m-k}$ can be expressed in terms of these roots as:
  \begin{equation}
    c_{m-k} = (-1)^{m-k} \sum_{1 \leq t_1 < t_2 < \dotsb < t_k \leq
      r} \sigma_{t_1}^2 \sigma_{t_2}^2 \dotsb \sigma_{t_k}^2\enspace.
  \end{equation}
  But we also know that $c_{m-k}$ is the coefficient of
  $\lambda^{m-k}$ in $\det(AA^T - \lambda I)$, which means
  \begin{equation}
    c_{m-k} = (-1)^{m-k} \sum_{\substack{ B \text{: principal} \\ \text{$k$-minor
          of $AA^T$} }} \det(B)
  \end{equation}
  (see \eg,~\cite{Ar}; we prove it next
  as \expref{Proposition}{COEFF}).  
This gives us our desired result.
\end{proof}

\begin{proposition}\label{COEFF}
  Let the characteristic polynomial of $M \in \reals^{m \times m}$ be
  \[
  \det(M - \lambda I_m) = \lambda^m + c_{m-1} \lambda^{m-1} + \dotsb +
  c_0\enspace.
  \]
  Then 
\[ c_{m-k} = (-1)^{m-k} \sum_{\substack{ B \text{:
        principal} \\ \text{$k$-minor of $AA^T$} }} \det(B)
  \qquad \text{for}~ 1 \leq k \leq m\enspace. 
\]
\end{proposition}
\begin{proof}
  First, it is clear that $c_0 = \det(M)$.  Next, let $M' = M -
  \lambda I$, and $S_m$ be the set of permutations of $\{1, 2, \dotsc,
  m \}$. The sign of a permutation $\sgn(\tau)$, for $\tau \in
  \text{Perm}([m])$, is equal to $1$ if it is a product of an even number of
  transpositions and $-1$ otherwise.  For a subset $S$ of rows, we
  denote the submatrix of entries $(M_{i,j})_{i,j \in S}$ by $M_S$.
\begin{equation}
\det(M - \lambda I_m) = \det(M') = \sum_{\tau \in \text{Perm}([m])}
\sgn(\tau) M'_{1, \tau(1)} M'_{2, \tau(2)} \dotsb M'_{m, \tau(m)}\enspace.
\end{equation}
 The term $c_{m-k} \lambda^{m-k}$ is the sum over
$\tau$ which fix some set $S \subseteq [m]$ of size $(m-k)$, and
the elements $\prod_{i \in S} M'_{i,i}$ contribute $(-1)^{m-k}
\lambda^{m-k}$ and the coefficient comes from the constant term in\\
\[
\sum_{\tau \in \text{Perm}([m]-S)} \sgn(\tau) \prod_{i \notin S} M'_{i,
\tau(i)}\enspace.
\]
 Each term in this sum is the $c_0$ term of a principal minor of $M$
 and so the sum is equal to 
\[
\sum_{S,|S|=m-k} \det(M_{[m]-S})\enspace.
\]
Hence
\begin{equation}
c_{m-k} = (-1)^{m-k} \sum_{S, |S|=m-k} \det(M_{[m]-S}) = (-1)^{m-k}
 \sum_{\substack{ B \text{:
        principal} \\ \text{$k$-minor of $AA^T$} }} \det(B)\enspace.
\end{equation}
\end{proof}

The bound proved in \expref{Theorem}{KROWS} is in fact asymptotically tight:
\begin{proposition}\label{TIGHT}
  Given any $\epsilon > 0$, there exists a $(k+1) \times (k+1)$ matrix
  $A$ such that for any $k$-subset $S$ of rows of $A$,
  \[ \fnorms{A - \pi_{S,k} (A)} \geq (1-\epsilon)~(k+1)~ \fnorms{A -
    A_k}\enspace.
  \]
\end{proposition}

\begin{proof}
The tight example consists of a matrix with $k+1$ rows which are the
vertices of a regular $k$-dimensional simplex lying on 
the affine hyperplane $\{ X_{k+1} = \alpha \}$ in
$\reals^{k+1}$. Let $A^{(1)}$, $A^{(2)}$, \dots, $A^{(k+1)}$ be the 
vertices with the point $p = (0,0, \dotsc, 0, \alpha)$ as their 
centroid. For $\alpha$ small enough, the best $k$ dimensional subspace
for these points is given by $\{ X_{k+1}=0 \}$ and 
\begin{equation}
 \| A - A_k
\|_F^2 = (k+1) \alpha^2\enspace.
\end{equation}
Consider any $k$-subset of rows from
these, say $S = \{ A^{(1)}, A^{(2)}, \dotsc, A^{(k)} \}$, and let
$H_S$ be the linear subspace spanning them. Then,
\begin{equation} 
\| A - \pi_{S,k}(A) \|_F^2 = d(A^{(k+1)}, H_S)^2\enspace. 
\end{equation} 
We claim that for any $\eps > 0$, $\alpha$ can be chosen small enough
so that 
\begin{equation}
d(A^{(k+1)}, H_S) \geq \sqrt{(1-\eps)} (k+1)\alpha\enspace.
\end{equation}
Choose $\alpha$ small enough so that $d(p,H_S) \geq \sqrt{(1-\eps)}\alpha$. 
Now
\begin{equation}
\frac{d(A^{(k+1)},H_S)}{d(p,H_S)} = \frac{d(A^{(k+1)}, \mbox{conv}(A^{(1)},\dotsc,A^{(k)}))}{d(p, \mbox{conv}(A^{(1)},\dotsc,A^{(k)}))} = k+1
\end{equation}
since the points form a simplex and $p$ is their centroid. The claim follows.
Hence, 
\begin{equation} \| A - \pi_{S,k} (A) \|_F^2 = d(A^{(k+1)}, H_S)^2 \geq (1-\epsilon)~
(k+1)^2~ \alpha^2 = (1-\epsilon)~ (k+1)~ \| A - A_k \|_F^2\enspace.
\end{equation}
\end{proof}

Next, we prove \expref{Theorem}{RELATIVE}.  This theorem follows by
interpreting \expref{Theorem}{KROWS} as an existence theorem and applying
\expref{Theorem}{thm:tworounds}.

\begin{proof}[Proof of \expref{Theorem}{RELATIVE}]
  By \expref{Theorem}{KROWS}, there exists a $k$-subset $S_1$ of rows of
  $A$ such that
  \begin{equation}
    \fnorms{A - \proj_{S_1}(A)} \leq (k+1) \fnorms{A-A_k}\enspace.
  \end{equation}
  Now,
  applying \expref{Theorem}{thm:tworounds} with $V = \rowspan(S_1)$ and
  $s=k(k+1)/\eps$ we get that a random sample $S_2$ of the rows of $A$
  (according to the specified distribution) satisfies
  \begin{equation}
    \E_{S_2}(\fnorms{A-\proj_{V+\rowspan(S_2),k}(A)}) \leq (1+\eps)\fnorms{A-A_k}
  \end{equation}
  so there exists a subset of the rows achieving the expectation.
  Since $V+\rowspan(S_2) = \rowspan(S_1 \cup S_2)$, and $|S_1 \cup
  S_2| = k + k(k+1)/\eps$, we have the desired result.
\end{proof}

\section{Application: Projective clustering}\label{sec:CLUSTER}

In this section, we give a polynomial-time approximation scheme for
the projective clustering problem described in
\expref{Section}{sec:ourresults}.  We note that a simple
multiplicative $(k+1)$-approximation follows from \expref{Theorem}{KROWS}
with running time $O(dn^{jk})$.  Let $V_1, \dotsc, V_j$ be the optimal
subspaces partitioning the point set into $P_1 \cup \dotsb \cup P_j$, where
$P_i$ is the subset of points closest to $V_i$. \expref{Theorem}{KROWS}
tells us that each $P_i$ contains a subset $S_i$ of $k$ points whose
span $V_i'$ is a $(k+1)$-approximation, \ie,
\[
\sum_{p \in P_i} d(p, V_i')^2 \leq (k+1) \sum_{p \in P_i} d(p,V_i)^2\enspace.
\]
We can find the $S_i$'s by simply enumerating all possible subsets of
$k$ points, considering $j$ of them at a time, and taking the best of
these.  This leads to the complexity of $dn^{jk}$.

Getting a PTAS will be a bit more complicated. \expref{Theorem}{RELATIVE}
implies that there exists a set $\hat{P_i} \subseteq P_i$ of size $k +
k(k+1)/\eps$ in whose span lies an approximately optimal
$k$-dimensional subspace $W_i$.  We can enumerate over all
combinations of $j$ subsets, each of size $k + k(k+1)/\eps$ to find
the $\hat{P_i}$, but we cannot enumerate the infinitely many
$k$-dimensional subspaces lying in the span of $\hat{P_i}$.  One
natural approach to solve this problem would be to put a finite grid
down in a unit ball in the span of $\hat{P_i}$.  The hope would be
that there are $k$ grid points whose span $G$ is ``close'' to $W_i$,
since each basis vector for $W_i$ is close to a grid point.  However,
this will not work; consider a point $p$ very far from the origin.
Although the distance between a basis vector and a grid point might be
small, the error induced by projecting $p$ onto a grid point is
proportional to its distance to the origin, which could be too large.

The problem described above suggests that a grid construction must be
dependent on the point set $P_i$.  Our grid construction considers
grid points in the span of $\hat{P_i}$, but instead of a uniform grid
in a unit ball, we consider grid points at bounded distance from each
$p \in \proj_{\rowspan(\hat{P_i})}(P_i)$, \ie, the points in $P_i$
projected to the span of $\hat{P_i}$.  This avoids the problem of
points far from the origin, since there are grid points around each
point.  Note that we only put grid points around projected points.
This is because we seek a subspace ``close'' to $W_i$, which itself
lies in the span of $\hat{P_i}$; $W_i$ and any subspace lying in the
span of $\hat{P_i}$ incur the same error for the component of a point
orthogonal to the span of $\hat{P_i}$.  In
\expref{Lemma}{lem:deltamove}, we show that there exists a subspace
spanned by $k$ points in our grid that is not much worse than $W_i$.
The lemma is stated for a general point set, but we apply it to the
projected points in \expref{Theorem}{CLUSTER}.

The algorithm is given below.

\begin{center}
  \fbox{\parbox{\algboxwidth}{
    \begin{minipage}{\algboxwidth}
      \begin{sf}
        {\bf Algorithm Cluster}\\
        
        \textbf{Input:} $P \subseteq \reals^d$, error parameter $0 < \eps
        < 1$, and upper bound $B$ on the optimal cost.

\textbf{Output:} A set $\{F_1, \dotsc, F_j\}$ of $j$ $k$-dimensional
subspaces.
\begin{enumerate}
\item Set $\delta = \frac{\eps \sqrt{B}}{16jk
    \sqrt{(1+\frac{\eps}{2})n}}$, $R =
  \sqrt{(1+\frac{\eps}{2})B}+2\delta k$.\\
\item For each subset $T$ of $P$ of size $j(k+2k(k+1)/\eps)$:
  \label{subsetchoose}
  \begin{enumerate}
  \item For each equipartition $T = T_1 \cup \dotsb \cup T_j$:
    \label{partitionchoose}
    \begin{enumerate}
    \item For each $i$, construct a $\delta$-net $D_i$ with radius $R$
      for the projection of $P$ to the span of $T_i$. \label{dnetstep}
\item For each way of choosing $j$ subspaces $F_1, \dotsc, F_j$, where
  $F_i$ is the span of $k$ points from $D_i$, compute the cost
  $\mathcal{C}(F_1,\dotsc,F_j)$. \label{choosespace}
   \end{enumerate}
  \end{enumerate}
\item Report the subspaces $F_1, \dotsc, F_j$ of minimum cost
  $\mathcal{C}(F_1,\dotsc,F_j)$.
\end{enumerate}
\end{sf}
\end{minipage}}}
\end{center}

In \expref{Step}{dnetstep}, we construct a $\delta$-net $D_i$.  A
$\delta$-net $D$ with radius $R$ for $S$ is a set such that for any
point $q$ for which $d(q,p) \leq R$, for some $p \in S$, there exists
a $g \in D$ such that $d(q,g) \leq \delta$.  The size of a
$\delta$-net is exponential in the dimension of $S$.  This is why it
is crucial that we construct the $\delta$-net for $P$ projected to the
span of $T_i$.  By doing so, we reduce the dimension from $d$ to
$O(k^2/\eps)$.  The correctness of the algorithm relies crucially on
the next lemma.

\begin{lemma}\label{lem:deltamove}
  Let $\delta > 0$.  Let $P$ be a point set, with $|P|=n$ and $W$ be a
  subspace of dimension $k$. Let $D$ be a $\delta$-net with radius $R$ 
  for $P$, satisfying
  \[
  R \geq \sqrt{\sum_{p \in P} d(p,W)^2} + 2 \delta k\enspace.
  \]
  Then there exists a subspace $F$ spanned by $k$ points in $D$ such that:
  \begin{equation}\label{eq:promise}
    \sum_{p \in P} d(p,F)^2 \leq \sum_{p \in P} d(p,W)^2 + 4 k^2 n
    \delta^2 + 4 k \delta \sum_{p \in P} d(p,W)\enspace.
  \end{equation}
\end{lemma}

\begin{proof}
  We construct the subspace $F$ in $k$ steps.  Let $F_0 = W$.
  Inductively, in step $i$, we choose a point $p_i$ and rotate
  $F_{i-1}$ so that it includes a grid point $g_i$ around $p_i$.  The
  subspace resulting from the last rotation, $F_k$, is the subspace
  $F$ with the bound promised by the lemma.  To prove that
  \eqref{eq:promise} holds, we prove the following inequality for any
  point $p \in P$ going from $F_{i-1}$ to $F_i$
  \begin{equation}\label{eqn:deltamove}
    d(p,F_i) \leq d(p,F_{i-1}) + 2\delta\enspace.
  \end{equation}
  Summing over the $k$ steps, squaring, and summing over $n$ points,
  we have the desired result.

  Let $G_1 = \{\vec{0}\}$.  $G_i$ will be the span of the grid points
  $\{g_1, g_2, \dotsc, g_{i-1}\}$.  We describe how to construct the
  rotation $R_i$.  Let $p_i \in P$ maximize $\norm{\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i))}$
  and let $g_i \in D$
  minimize $d(\proj_{F_{i-1}} (p_i),g_i)$.
  The point $p_i$ is chosen as the
  furthest point from the origin in the subspace of $F_{i-1}$
  orthogonal to $G_i$.  The grid point $g_i$ is the point closest to
  $p_i$ in $F_{i-1}$.  Consider the plane $Z$ defined by
  $$
  \proj_{G_i^{\perp}}(g_i),\;  \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)),\;
  \text{and}\; \vec{0}\enspace.
  $$
  Let
  $\theta$ be the angle between $\proj_{G_i^{\perp}}(g_i)$ and
  $\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i))$.  Let $R_i$ be the
  rotation in the plane $Z$ by the angle $\theta$, and define $F_i =
  R_i F_{i-1}$.  Set $G_{i+1} = G_i + \rowspan\{g_i\}$.  By choosing
  the rotation dependent on $p_i$, we ensure that no point moves more
  than $p_i$.  This allows us to prove \eqref{eqn:deltamove} for all
  points $p$.

  Now we prove inequality \eqref{eqn:deltamove}.  We do so by proving
  the following inequality by induction on $i$ for any point $p$:
  \begin{equation}\label{eq:rotation}
    d(\proj_{F_{i-1}}(p), R_i \proj_{F_{i-1}}(p)) \leq 2 \delta\enspace.
  \end{equation}
  Note that this proves \eqref{eqn:deltamove} by applying the triangle
  inequality, since:
  \begin{align}
    d(p,F_i) & \leq d(p,F_{i-1}) + d(\proj_{F_{i-1}}(p), \proj_{F_{i}}(p))\\
    & \leq d(p,F_{i-1}) + d(\proj_{F_{i-1}}(p), R_i \proj_{F_{i-1}}(p))\enspace.
  \end{align}
  The base case of the inequality, $i=1$, is trivial.  Consider the
  inductive case; here, we are bounding the distance between
  $\proj_{F_{i-1}}(p)$ and $R_i \proj_{F_{i-1}}(p)$.  It suffices to
  bound the distance between these two points in the subspace
  orthogonal to $G_i$, since the rotation $R_i$ is chosen orthogonal
  to $G_i$.  That is,
  \begin{equation}
    d(\proj_{F_{i-1}}(p), R_i \proj_{F_{i-1}}(p)) \leq
    d(\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p)),R_i
    \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p)))\enspace.
  \end{equation}
  Now, consider the
  distance between a point $\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p))$
  and its rotation, $R_i \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p))$.
  This distance is maximized when
  $\norm{\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p))}$ is maximized, so we
  have, by construction, that the maximum value is achieved by $p_i$:
  \begin{equation}
    d(\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p)),R_i
    \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p))) \leq
    d(\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)),R_i
    \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)))\enspace.
  \end{equation}
  By the triangle
  inequality we have:
  $$d(\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)),R_i
    \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i))) \leq
    d(\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)),\proj_{G_i^\perp}(g_i))
    + d(\proj_{G_i^\perp}(g_i), R_i
    \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)))\enspace.$$
  To bound the first
  term, $
  d(\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)),\proj_{G_i^\perp}(g_i))$,
  note that
  \[
    d(\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)),\proj_{G_i^\perp}(g_i))
    \leq d(\proj_{F_{i-1}}(p_i) ,g_i)\enspace.
    \]
  We show that
  $\proj_{F_{i-1}}(p_i)$ is within a ball of radius $R$ around $p_i$;
  this implies
  \begin{equation}\label{eq:ft}
    d(\proj_{F_{i-1}}(p_i), g_i) \leq \delta
  \end{equation} by
  construction of the $\delta$-net around $p_i$.  We have:
  \begin{align*}
    d(p_i,\proj_{F_{i-1}}(p_i)) & \leq d(p_i,F_0) +
    \sum_{j=1}^{i-2} d(\proj_{F_j}(p_i),\proj_{F_{j+1}}(p_i))\\
    & \leq \sqrt{\sum_{p \in P} d(p,W)^2} + \sum_{j=1}^{i-2} d(\proj_{F_j}(p_i),R_{j+1} \proj_{F_{j}}(p_i))\\
    & \leq \sqrt{\sum_{p \in P} d(p,W)^2} + 2\delta k \leq R\enspace.
  \end{align*}
  The third line uses the induction hypothesis.

  Now we bound the second term, $d(\proj_{G_i^\perp}(g_i), R_i
  \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i)))$.  Note that $R_i
  \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i))$ is just a rescaling of
  $\proj_{G_i^\perp}(g_i)$ and that
  $\norm{\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p^*))} = \norm{R_i
  \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p^*))}$, since rotation
  preserves norms.  The bound on the first term implies that
  $\norm{\proj_{G_i^{\perp}}(g_i)} \geq
  \norm{\proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p^*))} - \delta$, so
  \begin{equation}\label{eq:st}
    d(\proj_{G_i^\perp}(g_i), R_i
    \proj_{G_i^{\perp}}(\proj_{F_{i-1}}(p_i))) \leq \delta\enspace.
  \end{equation}
  Combining \eqref{eq:ft} and \eqref{eq:st}, we have proved
  \eqref{eq:rotation}.
\end{proof}

Now, we are ready to prove \expref{Theorem}{CLUSTER}, which includes the
correctness of the algorithm.

\begin{proof}[Proof of \expref{Theorem}{CLUSTER}]
  Assume that the optimal solution is of value at most $B$.  Let $V_1,
  \dotsc ,V_j$ be the optimal subspaces, and let $P_1, \dotsc, P_j$ be
  the partition of $P$ such that $P_i$ is the subset of points closest
  to $V_i$.  \expref{Theorem}{RELATIVE} implies that there exists $S_i
  \subseteq P_i$ of size at most $k + 2k(k+1)/\eps$ such that there is
  a $k$-dimensional subspace $W_i$ in the span of $S_i$ with
  \begin{equation}\label{eqn:csetapp}
    \sum_{p \in P_i} d(p,W_i)^2 \leq \left(1+\frac{\eps}{2}\right) \sum_{p
      \in P_i} d(p,V_i)^2 \leq \left(1 + \frac{\eps}{2}\right) B\enspace.
  \end{equation}
  Consider $\proj_{\rowspan(S_i)}(P_i)$, the projection of $P_i$ to
  $\rowspan(S_i)$.  We want to apply \expref{Lemma}{lem:deltamove} to
  $\proj_{\rowspan(S_i)}(P_i)$ and $W_i$ with radius $R$ and $\delta$ as 
  in the algorithm. Note that the optimal solution is of value at most $B$, 
  so we have that:
  \begin{equation}
    \sqrt{\sum_{p \in \proj_{\rowspan(S_i)}(P_i)} d(p,W_i)^2} + 2 \delta k 
    \leq \sqrt{\sum_{p \in P_i} d(p,W_i)^2} + 2 \delta k 
    \leq \sqrt{\left(1+\frac{\eps}{2}\right)B} + 2 \delta k = R\enspace.
  \end{equation}
  Let $F_i$ be the subspace spanned by $k$ points from the $\delta$-net
  $D_i$ for $\proj_{\rowspan(S_i)}(P_i)$ promised by
  \expref{Lemma}{lem:deltamove}.  For every $i$, we have that:
  \begin{align}
    \sum_{p \in \proj_{\rowspan(S_i)}(P_i)} d(p,F_i)^{2} 
    &\leq \sum_{p \in \proj_{\rowspan(S_i)}(P_i)} d(p,W_i)^{2} 
    + 4k^{2} n \delta^{2} + 4 k \delta \sum_{p \in \proj_{\rowspan(S_i)}(P_i)} 
    d(p,W_i) \nonumber \\
    &\leq \sum_{p \in \proj_{\rowspan(S_i)}(P_i)} d(p,W_i)^{2} + \frac{\eps}{4j}B 
    + 4k \delta \biggl( n \sum_{p \in \proj_{\rowspan(S_i)}(P_i)} d(p,W_i)^{2}
    \biggr)^{1/2} \nonumber\\
    &\leq \sum_{p \in \proj_{\rowspan(S_i)}(P_i)} d(p,W_i)^{2} + \frac{\eps}{2j}B\enspace. \label{eq:appdelta}
  \end{align}
  Now, for any point $p \in P_i$, we can decompose the squared
  distance from $p$ to $F_i$ as follows:
  \[
    d(p,F_i)^2 =
    d(\proj_{\rowspan(S_i)}(p), F_i)^2 +
    d(\proj_{\rowspan(S_i)^{\perp}}(p), F_i)^2\enspace.
    \]
  The same decomposition
  can be done for $d(p,W_i)^2$.  Now, since $F_i$ and $W_i$ both lie
  in the span of $S_i$, we have the following for any point $p \in
  P_i$: $d(\proj_{\rowspan(S_i)^{\perp}}(p), F_i)^2 =
  d(\proj_{\rowspan(S_i)^{\perp}}(p), W_i)^2$.  Applying this to
  \eqref{eq:appdelta} and \eqref{eqn:csetapp}, we have:
  \[
    \sum_{p \in P_i} d(p,F_i)^2 \leq 
    \left(1+\frac{\eps}{2}\right) \sum_{p
      \in P_i} d(p,V_i)^2 + \frac{\eps}{2j}B\enspace.
    \]
  Let $S = \bigcup_i S_i$.  The algorithm will enumerate $S$ in
  \expref{Step}{partitionchoose}, and it will enumerate the partition
  $S_1 \cup \dotsb \cup S_j$ in \expref{Step}{partitionchoose}.  In
  \expref{Step}{dnetstep}, the algorithm will, for each $i$, construct
  a $\delta$-net $D_i$ for $\proj_{\rowspan(S_i)}(P)$.  Lastly, in
  \expref{Step}{choosespace} it will consider the subspaces $F_1,
  \dotsc, F_j$ whose existence is proven above.  The cost associated
  with this solution is:
  \[
    \mathcal{C}(F_1, \dotsc, F_j)
    \leq  \sum_{i=1}^{j} \sum_{p \in
      P_i} d(p,F_i)^2
    \leq  \left(1+ \frac{\eps}{2}\right) \sum_{i=1}^j \sum_{p \in P_i}
    d(p,V_i)^2 + \frac{\eps}{2}B = (1+\eps)B\enspace.
    \]
    
    The number of subsets of size $k + 2k(k+1)/\eps$ enumerated by the
    algorithm is at most $\binom{n}{2(k+1)^2/\eps}^j$. A $\delta$-net
    $D$ with radius $R$ for a point set $Q$ of dimension $d$ is
    implemented by putting a box with side length $2R$ of grid width
    $\delta/\sqrt{d}$ around each point in $Q$.  Let $X$ be the set of
    grid points in the box around a point $p$.  The number of
    subspaces in each $\delta$-net $D_i$ is therefore at most the
    number of $j$ subspaces that one can choose for a partition $T_1
    \cup \dotsb \cup T_j$ is $\left(n \card{X} \right)^{jk}$.  The
    computation for projecting points, finding a basis, and
    determining the cost of a candidate family of subspaces takes time
    $O(ndjk)$.  The cardinality of $X$ is (for $\eps <1$):
  \begin{align}
    \card{X} = \left( \frac{2 R}{\delta
        / \sqrt{2(k+1)^2/\eps}} \right)^{2(k+1)^2/\eps}
      \leq \left( O\Bigl(jk\sqrt{\frac{n}{\eps}}\Bigr)\right)^{2(k+1)^2/\eps}.
  \end{align}
  Therefore, the running time of the algorithm is at most
  $
  O(ndjk) \; n^{2j(k+1)^2/\eps}
  \left(n \card{X} \right)^{jk} = d
  \left( \frac{n}{\eps} \right)^{O(jk^3/\eps)}$.
\end{proof}

\section{Conclusions and subsequent work}

\expref{Theorem}{RELATIVE} was further improved in~\cite{DV} to show that
  for any real matrix, there exist $O(k/\eps + k \log k)$ rows whose
  span contains the rows of a multiplicative $(1+\eps)$-approximation to the
  best rank-$k$ matrix. 
Using this subset of $O(k/\eps + k \log k)$ rows, the exponent in the
  running time of the projective clustering algorithm decreases from
  $O(jk^{3}/\eps)$ to $O(jk^{2}/\eps)$.
It would be interesting to know if we can compute this set of
  $O(k/\eps + k \log k)$ rows efficiently in a small number of passes.
It would also be nice to see other applications of volume sampling.

In a recent result, Sarlos~\cite{S} proved that, using random linear
combinations of rows instead of a subset of rows, we can compute a
multiplicative $(1+\eps)$-approximation using only $2$ passes.

\bibliography{a012}

%\nocite{*}

\begin{tocauthors}
\begin{tocinfo}[amit]
Amit Deshpande \\
2-342 \\
77 Mass. Ave., \\
Cambridge, MA 02139, USA.\\
amitd\tocat{}mit\tocdot{}edu \\
\url{http://www.mit.edu/~amitd/}
\end{tocinfo}
\begin{tocinfo}[luis]
Luis Rademacher \\
2-331 \\ 
77 Mass. Ave. \\
Cambridge, MA 02139, USA. \\
lrademac\tocat math \tocdot mit\tocdot edu \\
\url{http://www-math.mit.edu/~lrademac/}
\end{tocinfo}
\begin{tocinfo}[santosh]
Santosh Vempala\\
College of Computing\\
Georgia Institute of Technology\\
Atlanta, GA 30332, USA.\\
vempala\tocat{}cc\tocdot{}gatech\tocdot{}edu\\
\url{http://www-math.mit.edu/~vempala/}
\end{tocinfo}
\begin{tocinfo}[grant]
  Grant Wang\\
  Yahoo!\\
  701 First Avenue\\
  Sunnyvale, CA 94089, USA.\\
  gjw\tocat{}alum\tocdot{}mit\tocdot{}edu\\
  \url{http://theory.csail.mit.edu/~gjw/}
\end{tocinfo}
\end{tocauthors}

\begin{tocaboutauthors}
\begin{tocabout}[amit]
{\sc Amit Deshpande} is a graduate student in 
\href{http://www-math.mit.edu}{Applied Math} at MIT. He did his undergraduate 
studies at \href{http://www.cmi.ac.in}{Chennai Mathematical Institute (CMI)} 
in India. Apart from math and theory, he enjoys north Indian classical music.
\end{tocabout}
\begin{tocabout}[luis]
\textsc{Luis Rademacher} is a \phd\ candidate in the
\href{http://www-math.mit.edu/}{Department of Mathematics} at
\href{http://www.mit.edu/}{MIT}, supervised by
\href{http://www-math.mit.edu/~vempala/}{Santosh Vempala}. His
research interests include game theory, matrix approximation,
computational lower bounds, and the intersection between geometry and
algorithms. He grew up in Chile and enjoys music as a hobby.
\end{tocabout}
\begin{tocabout}[santosh]
{\sc Santosh Vempala} is a professor in the 
\href{http://www.cc.gatech.edu}{College of Computing} and
director of the newly formed 
\href{http://www.arc.gatech.edu}{Algorithms and Randomness Center}
at Georgia Tech. 
His research interests, ironically, are in algorithms, randomness,
and geometry. He graduated from CMU in 1997 following the advice of
Avrim Blum and was at MIT till 2006 except for a year as a Miller fellow
at UC Berkeley. He gets unreasonably excited when a phenomenon that
appears complex from one perspective turns out to be simple from
another.
\end{tocabout}
\begin{tocabout}[grant]
{\sc Grant Wang} graduated from \href{http://www.mit.edu}{MIT} in
August 2006 with a \phd\ in computer science.  His advisor was Santosh
Vempala.  He graduated from  \href{http://www.cornell.edu}{Cornell
University} with a \bsc\ in Computer Science in 2001.  His research
interests are in algorithms, machine learning, and data mining.  As of
September 2006, he is working at \href{http://www.yahoo.com}{Yahoo!}
\end{tocabout}
\end{tocaboutauthors}

\end{document}
