%% ToC#238, Volume 2, article 11  v002/a011.tex  Charikar - Krauthgamer

\documentclass{toc}

\tocdetails{%
 volume=2, number=11, year=2006, firstpage=207,
   received={December 8, 2005}, 
   revised={September 18, 2006},
   published={September 29, 2006}
}


\newcommand{\etal}{et al.} 
\newcommand{\zo}{\{0,1\}}
\newcommand{\Pinv}{P^{-1}}
\newcommand{\x}{x^{-1}}
\newcommand{\y}{y^{-1}}
\newcommand{\lis}{longest increasing subsequence}
\newcommand{\lcs}{longest common subsequence}
\newcommand{\emptystring}{\epsilon}
\newcommand{\floor}[1]{\lfloor{#1}\rfloor}
\newcommand{\half}{{\frac12}}
\DeclareMathOperator{\EX}{\mathbb E}
\DeclareMathOperator{\ed}{ED}
\DeclareMathOperator{\ud}{UL}
\DeclareMathOperator{\LCS}{LCS}
\DeclareMathOperator{\LIS}{LIS}
\DeclareMathOperator{\bp}{b}

\runningauthor{M.~Charikar and R.~Krauthgamer}

\runningtitle{Embedding the Ulam metric into $\ell_1$}

\copyrightauthor{Moses Charikar and Robert Krauthgamer}


\begin{document}
\begin{frontmatter}[classification=text]

\title{Embedding the Ulam metric into $\ell_1$}
\tocpdftitle{Embedding the Ulam metric into $\ell_1$}
\tocpdfauthor{Moses Charikar and Robert Krauthgamer}

\author[moses]{Moses Charikar\thanks{Supported by NSF ITR grant CCR-0205594, DOE Early Career Principal
Investigator award DE-FG02-02ER25540, NSF CAREER award 
CCR-0237113, MSPA-MCS award 0528414, and an Alfred P.~Sloan Fellowship}}
\author[robi]{Robert Krauthgamer}

\begin{abstract}
Edit distance is a fundamental measure of distance between strings,
the extensive study of which has recently focused on computational problems
such as nearest neighbor search, sketching and fast approximation.
A very powerful paradigm is to map the metric space induced by the edit distance
into a normed space (\eg, $\ell_1$) with small distortion,
and then use the rich algorithmic toolkit known for normed spaces. 
Although the minimum distortion required to embed edit distance 
into $\ell_1$ has received a lot of attention lately,
there is a large gap between known upper and lower bounds.
We make progress on this question by considering 
large, well-structured submetrics of the edit distance metric space.

Our main technical result is that the Ulam metric,
namely, the edit distance on permutations of length at most $n$,
embeds into $\ell_1$ with distortion $O(\log n)$.
This immediately leads to sketching algorithms with constant size sketches,
and to efficient approximate nearest neighbor search algorithms, 
with approximation factor $O(\log n)$.
The embedding and its algorithmic consequences present a big improvement 
over those previously known for the Ulam metric, and they are significantly
better than the state of the art for edit distance in general.
Further, we extend these results for the Ulam metric 
to edit distance on strings that are (locally) non-repetitive, 
\ie, strings where (close by) substrings are distinct.
\end{abstract}

\tockeywords{edit distance, metric embedding, Ulam metric, 
low distortion, sketching, permutation edit distance}

\tocacm{F.2.2, G.2.1, G.3}

\tocams{68P05, 68W20, 68W25}

\end{frontmatter}

\section{Introduction}  \label{sec:intro}

The \emph{edit distance} (aka \emph{Levenshtein distance}) between two strings
is the least number of character insertions, deletions 
or substitutions required to transform one string to the other.
The edit distance arises naturally in several application areas, 
often involving large amounts of data, ranging from 
a moderate number of extremely long strings (as in computational
biology) to a large number of moderately long strings (as in text
processing and web search).
Therefore, efficient algorithms for edit distance, 
even with modest approximation guarantees, are highly desirable.

The edit distance is a fundamental measure of (dis)similarity,
because it is a very simple model that exhibits nontrivial alignment
(\ie, a single elementary modification may cause 
numerous characters to change their position in the string).
Popular metrics such as the Hamming distance do not adequately capture 
this phenomenon, and thus for data analysis purposes, edit distance often offers 
a much better model (up to minor variations such as weighting).

Let $\ed(x,y)$ denote the edit distance between strings $x$ and $y$.
Fixing an alphabet $\Sigma$, the edit distance function $\ed(\cdot,\cdot)$ 
defines a metric space, called the \emph{edit distance metric}, 
whose point set contains all the strings over $\Sigma$.
Every collection $X$ of strings over this alphabet (\eg, $X=\zo^n$) 
can therefore be associated with the submetric $(X,\ed)$.
A significant obstacle in dealing with the edit distance metric is that
it lacks many of the useful properties of normed spaces.
We restrict our attention to collections $X$ of strings 
that have limited repetitions.
While these collections of strings are rather large and 
give rise to submetrics $(X,\ed)$ of a rich structure,
we will be able to obtain results that are significantly better 
than those known for the general case $X=\Sigma^n$ (or even $\zo^n$).
Our main focus is the Ulam metric, which we define next.

\paragraph{The Ulam Metric}
Following~\cite{CMS01,Cormode:Thesis},
a string (over the alphabet $\Sigma$) is called a \emph{permutation} 
if its characters are all distinct.
(This notion, sometimes called a variation, extends the usual 
notion of permutation to cases where $|\Sigma|>n$,
but this slightly more general setting is more convenient for our purposes
and potentially more useful in applications.)
Throughout this paper, the \emph{Ulam metric} of dimension $n$ 
(called permutation edit distance in~\cite{CMS01})
is the metric space $(\mathcal P_n,\ed)$, 
where $\mathcal P_n$ contains all permutations of length $n$ over $\Sigma$.

Remark: The standard definition of the Ulam metric for permutations
(see \eg~\cite{AD99}) uses the distance function $\ud(x,y)$, defined
as the least number of character moves needed to transform $x$ into
$y$.  This distance function is obviously limited to the case $n=|\Sigma|$
(\ie\ to the usual notion of permutations), while it is easily seen to
be nearly equivalent to the edit distance, namely $\ud(x,y)\leq \ed(x,y)
\leq 2\ud(x,y)$.  Thus, our non-standard definition of the Ulam metric to
be $(\mathcal P_n,\ed)$ is merely a slight abuse of terminology to
gain additional generality.


\paragraph{Embeddings}
An \emph{embedding} of a metric space $(X,d_X)$ into a target
metric space $(Y,d_Y)$ is a mapping $f:X\to Y$.
The \emph{distortion} of the embedding $f$ is defined as
the smallest $K\geq 1$ for which there is (a scaling factor) $s>0$
such that 
\[
  d_X(x_1,x_2)\leq s\cdot d_Y(f(x_1),f(x_2)) \leq K\cdot d_X(x_1,x_2)
  \qquad \text{for all $x_1,x_2\in X$}\enspace.
\]
Low-distortion embeddings are a very powerful paradigm for 
reducing a host of problems 
(such as Nearest Neighbor Search, see \secref{sec:applications})
{from} a given metric space to a more structured or computationally easier 
metric space,
at the cost of a small loss in the approximation guarantee.
It is easy to see that the Ulam metric contains 
the $\floor{n/2}$-dimensional hypercube as a submetric (up to scaling),%
\footnote{Consider \eg\ the permutations $P$ for 
which $\{P(2i-1),P(2i)\}=\{2i-1,2i\}$ for all $i=1,\ldots,\floor{n/2}$.}
and hence, by Enflo's Theorem~\cite{Enflo69}, 
embedding the Ulam metric into $\ell_2$ requires distortion $\Omega(\sqrt n)$.
But an $\ell_1$ embedding is usually sufficient for algorithmic applications
such as sketching algorithms and Nearest Neighbor Search
(see \secref{sec:sketching} for definitions), 
which raises the question of embedding the Ulam metric, 
and more generally the edit distance metric, into $\ell_1$.

There is a very large gap between the known upper and lower bounds 
on the distortion required to embed the edit distance metric 
$(\Sigma^n,\ed)$ into $\ell_1$.
Ostrovsky and Rabani~\cite{OR05} showed an upper bound of
$2^{O(\sqrt{\log n \log \log n})}$ for embedding edit distance into 
$\ell_1$.
Very recently, Khot and Naor~\cite{KN05} obtained a lower bound of
$\Omega{(\sqrt{\log n/\log \log n})}$, and this was further improved
to $\Omega(\log n)$ by Krauthgamer and Rabani~\cite{KR06}.
For the Ulam metric, the same upper bound $2^{O(\sqrt{\log n \log \log n})}$ 
clearly holds and is still the state of the art.
On the other hand, Cormode~\cite[page 60]{Cormode:Thesis} shows a lower bound 
of $4/3$, using a $5$-point metric that is isomorphic to the $K_{2,3}$ graph
(up to scaling).
Our embedding results make progress towards resolving these intriguing
questions by proving an exponentially smaller upper bound
for the Ulam metric, and extending it to several related edit distance
submetrics.

\subsection{Results}

Our main result, appearing in \secref{sec:ulam}, is that the
$n$-dimensional Ulam metric embeds into $\ell_1$ with distortion 
$O(\log n)$.%
\footnote{This metric space contains more than $2^n$ points, and thus
  our distortion bound beats by far the one that follows {from}
  Bourgain's embedding theorem~\cite{Bourgain85} for general finite
  metrics.  Note that in the nearest neighbor search setting, one
  needs to embed not only $S$ into $\ell_1$, but also the (yet unknown)
  query point.}  The previously known upper bound is $2^{O(\sqrt{\log
    n\log \log n})}$, using the embedding of~\cite{OR05} for edit
distance in general.  Our embedding is surprisingly simple and easy to
describe.  In fact, this is the complete description: we have a
coordinate for every pair of symbols $a,b \in \Sigma$ and the value of this
coordinate in the embedding of a string is simply the inverse of the
distance between $a$ and $b$ in the string (or $0$ if one of $a,b$
does not occur).

\paragraph{Techniques}
Our methodology is inspired by the work of Cormode, Muthukrishnan and
Sahinalp~\cite{CMS01}, who designed a mapping {from} the Ulam metric
to Set-Intersection.%
\footnote{Namely, every permutation is mapped to a subset of 
some fixed ground set $U$, such that the edit distance between 
two permutations is approximately the size of the intersection 
between the two respective subsets.}
However, their mapping is not an embedding into
$\ell_1$ (in fact, Set-Intersection is not even a metric space)
and it does not yield a sketching algorithm for the Ulam metric.
The main difficulty in the analysis of our embedding is to prove that 
it is not too contractive.
To this end, we build on the framework of~\cite{CMS01},
in which a common subsequence for two permutations is constructed recursively
by partitioning each permutation into two subsequences.
The main novelty in our analysis is that we replace 
the deterministic recursive partitioning of~\cite{CMS01}
with a carefully-crafted stochastic partitioning,
resulting in a suitable averaging over all symbol pairs.

\paragraph{Applications}
The Ulam metric is interesting in its own right,
\eg\ when considering the Ulam metric as a measure of (dis)similarity 
between rankings for say aggregation purposes.
But it is not clear a priori that results on the Ulam metric would
lend themselves to broader classes of strings;
bit strings, for instance, cannot have distinct characters unless $n\leq 2$.
Nevertheless, we show in \secref{sec:applications} several applications 
of the above embedding result to edit distance on more general strings.

Let us mention one generalization of the above embedding here, 
leaving further discussion to \secref{sec:applications}.
Following~\cite{BJKK04}, we say that a string is \emph{$t$-non-repetitive},
if all its $t$-substrings are distinct. 
For example, a random bit string of length $n$ is, 
with high probability, $2\log n$-non-repetitive.
We can easily extend the above embedding to $t$-non-repetitive strings 
with a $2t$ factor loss in the distortion.
For instance, this embedding is applicable, with high probability,
for two random, but correlated, bit strings (\eg, $x$ is chosen uniformly 
at random and $y$ is derived {from} it by deleting certain positions).
Such scenarios often arise in computational biology contexts due to 
background distributions.

All our embeddings (\eg\ the one specified above) are efficiently computable, 
and are thus readily applicable to computational problems.
For instance, they immediately yield sketching algorithms and 
Nearest Neighbor Search schemes, as stated in \secref{sec:applications}.
The algorithms that we obtain for restricted families of strings all have 
significantly better approximation guarantees than the state of the art 
for edit distance in general~\cite{Indyk04,BJKK04,OR05}.
For one thing, restriction to strings with limited repetitions
may be reasonable in many specific scenarios,
and may serve as a rigorous starting point for domain-specific heuristics.

In addition, our results make partial progress on the general case;
they identify algorithmic tools that are provably useful (in certain cases),
and they pinpoint some difficult aspects (of the general case).
The recent embedding result of Ostrovsky and Rabani~\cite{OR05} 
relies on a recursive construction.
Our results on the Ulam metric and its extensions suggest that it 
may be possible to achieve a polylogarithmic distortion for embedding
general edit distance.
However, achieving this bound may require going beyond recursive 
constructions and using a ``magical'' direct embedding along the lines of the
one we construct.


\subsection{Related work}

Cormode, Muthukrishnan and Sahinalp~\cite{CMS01} were the first
to suggest embeddings of various distance functions on permutations.
For reversal distance and transposition distance they designed embeddings
into Hamming space with constant distortion.
However, as mentioned earlier, for edit distance (on permutations)
they designed a mapping into Set-Intersection,
which cannot be embedded into $\ell_1$ or yield a sketching algorithm.
They also use this mapping into Set-Intersection to obtain 
a fast approximate string matching for permutations (under edit distance).

Cormode and Muthukrishnan~\cite{CM02} 
show that a variant of edit distance called the \emph{block edit distance},
where a block of characters can be moved in a single edit operation,
embeds into $\ell_1$ with distortion $O(\log n\log^*n)$.
See also~\cite{CPSV00,MS00} for embeddings of similar distance function
on strings.

Batu \etal~\cite{BEKMRRS} developed a sub-linear time
algorithm that runs in $O(n^{\max(\alpha/2, 2\alpha-1)})$
time and solves the $O(n^{\alpha})$ vs.\ $\Omega(n)$ edit distance gap problem.
Their algorithm can be cast as a sketching algorithm,
although it would use a sketch whose size is far more than constant.


\subsection{Notation} \label{sec:notation}

As usual, define $[k]=\{1,\ldots,k\}$.  A \emph{string} is a sequence of
characters (\ie, symbols) taken {from} an alphabet $\Sigma$.  The $j$th
character in a string $s$ is denoted by $s[j]$.  We write $a\in s$ to
denote that a character $a\in\Sigma$ appears in the string $s$, and $a\notin s$
otherwise.  A \emph{$t$-substring} (aka \emph{$t$-gram})
of a string $s$ is a string consisting of $t$ consecutive characters
in $s$, \eg\ $s[i],s[i+1],\ldots,s[i+t-1]$.  In contrast, a
\emph{subsequence} of $s$ need not be contiguous in $s$.

A \emph{permutation} is string $P$ whose characters are all distinct.
For permutations, it will sometimes be more convenient to work with $\Pinv$,
\ie, $\Pinv(a)$ is the position at which $a\in\Sigma$ 
appears in the string $P$ (if at all).


\paragraph{Longest common subsequence and edit distance}

For two strings $x,y$, let $\LCS(x,y)$ be the length of 
the \lcs\ of $x$ and $y$ 
(\ie, the maximum length of a string $z$ that is both
a subsequence of $x$ and a subsequence of $y$).
It is well-known and easy to verify that for every two strings 
(and in particular permutations) $x,y$ of length $n$, we have
$n-\LCS(x,y) \leq \ed(x,y) \leq 2(n - \LCS(x,y))$.


\section{Embedding the Ulam metric}  \label{sec:ulam}

In this section we present a low-distortion embedding 
of the Ulam metric (\ie, the edit distance metric on permutations)
into $\ell_1$.

\begin{theorem} \label{thm:embed_ulam}
For every $n$, the Ulam metric of dimension $n$ can be embedded into
$\ell_1^{O(|\Sigma|^2)}$ with distortion $O(\log n)$.
\end{theorem}

Fix an integer $n$; we may assume that $n$ is a power of $2$,
\eg, by padding all strings using additional characters. 
Let $m=|\Sigma|$, and assume without loss of generality
that $\Sigma=\{1,\ldots,m\}$.

Define an embedding $f:\mathcal P_n\to\ell_1^{\binom{m}{2}}$ as follows.
First, associate every coordinate of the target space 
with a distinct pair $\{a,b\}$ where $a,b\in\Sigma$ and $a\neq b$.
Now for every permutation $P\in\mathcal P_n$, 
the coordinates of $f(P)$ are given by 
\begin{equation} \label{eq:embedding}
  f(P)_{\{a,b\}}:= 
  \begin{cases}
    1/(\Pinv(b)-\Pinv(a)) & \text{if $a,b\in P$, $a<b$;} \\
    0                     & \text{otherwise (\ie, $a\notin P$ or $b\notin P$).}\\
  \end{cases}
\end{equation}
The proof of \thmref{thm:embed_ulam} is completed below 
in \lemref{lem:UB} and \lemref{lem:LB}, which analyze 
the expansion and contraction of this embedding $f$, respectively.


\begin{lemma}[Expansion] \label{lem:UB}
Let $P$ and $Q$ be permutations of length $n$.
Then
\[
\|f(P)-f(Q)\|_1 \leq O(\log n)\cdot \ed(P,Q)\enspace.
\]
\end{lemma}
\begin{proof}
Extend $f$ to permutations of length at most $n$
by using the same definition \eqref{eq:embedding}.
Observe now that it suffices to prove the claimed inequality 
$\|f(P)-f(Q)\|_1 \leq O(\log n)\cdot \ed(P,Q)$
for the case where $\ed(P,Q) = 1$,
the length of $P$ is $n$ and the length of $Q$ is $n-1$.
Indeed, the general case then follows by the triangle inequality in $\ell_1$.
(In the process, we simulate a character substitution by
a deletion followed by an insertion,
which increases the number of character operations at most by a factor of $2$.)

Suppose then that $Q$ is obtained {from} $P$ by deleting some 
character $P[s]$.
Thus, we have (i) $Q[i]=P[i]$ for all $i<s$ 
and (ii) $Q[i]=P[i+1]$ for all $i\geq s$.
Clearly, 
\[
  \|f(P)-f(Q)\|_1 
  = \sum_{a,b\in\Sigma} \left|f(P)_{\{a,b\}} - f(Q)_{\{a,b\}}\right|\enspace.
\]
It suffices to consider only the terms in which $a\in P$ and $b\in P$,
as every other term is $0$.
So suppose $a=P[i]$ and $b=P[j]$ for $i<j$.
In the case where $i=s$, clearly $f(Q)_{\{a,b\}}=0$;
thus, the total contribution of this case is 
$\sum_{j=s+1}^n \frac{1}{j-s} \leq H(n)$
where $H(k)=\sum_{z=1}^k \frac1z$ is the $k$th harmonic number.
The case where $j=s$ is similar, with $f(Q)_{\{a,b\}}=0$
and thus total contribution 
$\sum_{i=1}^{s-1} (\frac{1}{s-i}) \leq H(n)$.
The case where both $i$ and $j$ are smaller than $s$,
as well as the case where both $i$ and $j$ are larger than $s$,
contribtes zero since $f(P)_{\{a,b\}} = f(Q)_{\{a,b\}} = \frac{1}{j-i}$.
The last case where $i<s<j$ has total contribution at most
\[
\sum_{i=1}^{s-1} \sum_{j=s+1}^\infty \left|\frac{1}{j-i} - \frac{1}{j-i-1}\right|\enspace;
\]
for each $i$, the summation over $j$ is a telescopic sum bounded above
by $\frac{1}{s-i}$, implying that the total contribution of this case
is at most $H(n)$.
Hence, $\|f(P)-f(Q)\|_1 \leq 3H(n) \leq 3(1+\ln n) = O(\log n)$.
\end{proof}

Our proof method of the next lemma, which bounds the contraction of $f$,
is inspired by the work of Cormode, Muthukrishnan and Sahinalp~\cite{CMS01}.
At a high level, we recursively construct a common subsequence 
by first partitioning the alphabet, thereby partitioning each string
into two subsequences, and then merging the two common subsequences
obtained by recursion.
Our analysis is more involved than that of~\cite{CMS01}.
In particular, we employ a carefully-crafted stochastic partitioning 
that ``smooths'' the effect of any single pair of characters.

\begin{lemma}[Contraction] \label{lem:LB}
Let $P$ and $Q$ be permutations of length $n$,
and assume $n$ is a power of $2$.
Then $\|f(P)-f(Q)\|_1 \geq \frac18 \ed(P,Q)$.
\end{lemma}

Before proving this lemma, we introduce some definitions and
a technical proposition.
Let $P$ be a permutation of length $k$.
Denote by $\LIS(P)$ the length of a longest increasing subsequence of $P$.
Define a \emph{breakpoint} in $P$ to be a position $i\in[k-1]$ 
where $P[i] > P[i+1]$, and denote by $\bp(P)$ the number of breakpoints in $P$.
Two subsequences of $P$ are called a \emph{partition} of $P$
if each character of $P$ appears in exactly one of the two subsequences.
A \emph{block} is a pair of positions $\{2i-1,2i\}$ where $i\in[\floor{k/2}]$.
A partition of $P$ into two subsequences $P_0$, $P_1$ is called 
\emph{block-balanced} if, at every block $\{2i-1,2i\}$, 
exactly one of the two characters belongs to $P_0$ 
(hence also exactly one of them belongs to $P_1$).
Note that if the length of $P$ is even and a partition of $P$ is 
block-balanced, then the two corresponding subsequences have equal length.

\begin{proposition} \label{prop:block-bal}
Let $P$ be a permutation of length $k$, and suppose $k$ is even.
Then for every block-balanced partition of $P$ into subsequences 
$P_0$ and, $P_1$,
\[
\LIS(P) \geq \LIS(P_0) + \LIS(P_1) - 2 \bp(P)\enspace.
 \]
\end{proposition}
\begin{proof}
We will actually prove that 
\begin{equation} \label{eq:LIS}
  \LIS(P) \geq 2\LIS(P_0) - 2\bp(P)\enspace.
\end{equation}
A symmetric argument for $P_1$ will imply similarly that 
$\LIS(P) \geq 2\LIS(P_1) - 2\bp(P)$,
and the lemma will follow by averaging the last two inequalities.

Let us then prove \eqref{eq:LIS}.
Fix a longest increasing subsequence of $P_0$,
and, with a slight abuse of notation, 
let $\LIS(P_0)$ denote both this subsequence and its length.
We construct an increasing subsequence of $P$,
by augmenting $\LIS(P_0)$ with certain characters {from} $P_1$.
For each position $j\in[k]$ such that $P[j]$ is in $P_0$,
let $j'\in\{j-1,j+1\}$ be the position such that $\{j,j'\}$ forms a block.
Notice that $P[j']$ is in $P_1$, and call it a \emph{candidate}
if the corresponding $P[j]$ is in $\LIS(P_0)$.
The number of candidates is thus exactly $\LIS(P_0)$.
In particular, if augmenting $\LIS(P_0)$ with all the candidates 
forms an increasing subsequence of $P$, then the length of this increasing
subsequence would be $2\LIS(P_0)$, and it would prove \eqref{eq:LIS}.
We show next that $\LIS(P_0)$ can be always be augmented with 
$\LIS(P_0)-2\bp(P)$ candidates.

Consider two consecutive characters of $\LIS(P_0)$, 
say $P[j_1]$ and $P[j_2]$ with $j_1<j_2$.
(Essentially the same argument works in the two extremal cases
where $j_2$ is the first character of $\LIS(P_0)$
and where $j_1$ is the last character of $\LIS(P_0)$.)
Let $t$ be the number of candidates among $P[j_1+1],\ldots,P[j_2-1]$.
We claim that $t\leq 2$; 
indeed, only $j_1+1$ and $j_2-1$ can possibly be candidates,
because if $P[j']$ is a candidate for $j'\in \{j_1+1,\ldots,j_2-1\}$ 
then for the corresponding $P[j]$ in $\LIS(P_0)$ we have
$j\in\{j'-1,j'+1\} \subseteq \{j_1,\ldots,j_2\}$ and thus $j\in\{j_1,j_2\}$.
If the $t$ candidates are themselves in increasing order 
and they are all between $P[j_1]$ and $P[j_2]$,
then augment $\LIS(P_0)$ with these $t$ candidates;
clearly, the result is still an increasing subsequence of $P$.
Otherwise, $P$ must contain some breakpoint $\hat j\in\{j_1,\ldots,j_2-1\}$,
and this breakpoint can be blamed for not augmenting $\LIS(P_0)$ 
with these $t\leq 2$ candidates.
Applying this augmentation for every two consecutive characters $P[j_1]$ and 
$P[j_2]$ of $\LIS(P_0)$,
we see that every breakpoint is blamed only for candidates that 
are in the same interval $\{j_1,\ldots,j_2-1\}$ as the breakpoint,
and thus every breakpoint is blamed for a total of at most $2$ candidates.
It follows that we have augmented $\LIS(P_0)$ with at least
$\LIS(P_0)-2\bp(P)$ candidates.
It is easy to see that this results in an increasing subsequence of $P$
(because augmenting at one interval does not prevent augmenting 
at another interval)
and this proves \eqref{eq:LIS}.
\end{proof}



\begin{proof}[Proof of \lemref{lem:LB}]
Start with the two length $n$ permutations $P$ and $Q$,
and recall that we used padding to make $n$ be a power of $2$.
We may assume that $P$ and $Q$ contain exactly the same characters,
because every character $a\in\Sigma$ that appears in exactly one of 
the two strings contributes $1$ to $\ed(P,Q)$ 
and at least $2$ (actually $\Omega(\log n)$) to its ``own'' coordinates 
(all $\{a,b\}$ where $b\in \Sigma\setminus a$) 
in the difference vector $f(P)-f(Q)$.
We can further assume (by renaming characters in $\Sigma$)
that $Q$ is the identity permutation, \ie, $Q[i]=i$.
Hence, 
\[
\ed(P,Q) \leq 2(n-\LCS(P,Q)) = 2(n-\LIS(P))\enspace.
\]

Pick a random block-balanced partition of $P$ into subsequences $P_0$ and $P_1$,
\ie, independently assign each $P[2i]$ to either $P_0$ or $P_1$ 
uniformly at random, and assign $P[2i-1]$ to the other subsequence.
By \propref{prop:block-bal}, we have
\[
\LIS(P) \geq \EX[\LIS(P_0) + \LIS(P_1)] - 2 \bp(P)\enspace.
\]
Now apply a similar partitioning to each subsequence using independent coins,
\eg, $P_0$ is split into $P_{00}$ and $P_{01}$.
Continue recursively in a similar fashion until we get a singleton subsequence
$P_\sigma$ for every $\sigma\in\zo^{\log n}$.
For convenience, let $\emptystring$ denote the empty string,
and define $P_\emptystring=P$ and $\zo^0=\{\emptystring\}$.

Applying \propref{prop:block-bal} recursively, we have
\[
 \LIS(P) 
    \geq \sum_{\sigma\in\zo^{\log n}} \EX[\LIS(P_\sigma)]
        - 2 \sum_{k=1}^{\log n} \sum_{\sigma\in\zo^{k-1}} \EX[\bp(P_\sigma)]\enspace.
\]
We rearrange this equation using the fact that 
if $P_\sigma$ is a singleton sequence then $\LIS(P_\sigma)=1$, and obtain
\[
 n-\LIS(P)
   \leq 2 \EX\left[\sum_{k=1}^{\log n} \sum_{\sigma\in\zo^{k-1}} \bp(P_\sigma)\right]\enspace.
\]

Notice that the sum $\sum_k \sum_\sigma \bp(P_\sigma)$ in the right-hand side 
gets a contribution of $1$ every time two characters $P[i],P[j]$ 
for which $i<j$ and $P[i]>P[j]$
become consecutive characters in a subsequence $P_\sigma$. 
Formally, for positions $i$ and $j$ such that $i<j$ and $P[i]>P[j]$,
let the random variable $Z_{ij}$ be the number of subsequences $P_\sigma$ 
which contain $P[i],P[j]$ as consecutive characters.
By linearity of expectation we get that 
\begin{equation}  \label{eq:edZ}
  \half\ed(P,Q) \leq n-\LIS(P) \leq 2 \sum_{i<j; P[i]>P[j]} \EX[Z_{ij}]\enspace.
\end{equation}


We claim that $\EX[Z_{ij}] \leq 4/(j-i)$.
To prove the claim, notice that 
$\EX[Z_{ij}] = \sum_{z=1}^{\log n} \Pr[Z_{ij} \geq z]$
because $Z_{ij}$ takes only integral values between $0$ and $\log n$
(as $P[i]$ appears in $\log n$ subsequences).
Therefore, it suffices to show for every integer $z\geq 1$ the upper bound
\[
\Pr[Z_{ij} \geq z] \leq \frac{2^{2-z}}{j-i}\enspace.
\]
Let us first show that $P[i],P[j]$ can become consecutive characters 
only after $\floor{\log (j-i)}$ iterations (partitions of $P$).
Indeed, if $j-i\geq 2$, then at the first iteration these two characters 
must belong to different blocks;
thus, with probability $1/2$ the random partition of $P$ 
sends them to different subsequences, 
in which case they will never form a consecutive pair of no $P_\sigma$,
and with probability $1/2$ they are sent to the same subsequence,
in which case the difference between their positions in that common subsequence
is at least $\floor{(j-i)/2}$. 
Continuing similarly we see that if $2^l\leq j-i<2^{l+1}$ 
then even after $l-1$ partitions, 
the difference between the positions of the two characters is at least $2$,
\ie, they can become consecutive only after $l$ iterations.
Since the different sequence partitions are independent, we get that 
\[
\Pr[Z_{ij} \geq 1] \leq \left(\frac{1}{2}\right)^{\floor{\log (j-i)}} \leq \frac{2}{j-i}\enspace.
\]
Similarly, if $P[i],P[j]$ are consecutive in a subsequence $P_\sigma$,
they can be consecutive in a later iteration only if
they are sent to the same subsequence of $P_\sigma$,
which happens with proability $1/2$ or less.
Fixing an integer $z\geq 1$ we can apply this argument $z-1$ times 
to get that $\Pr[Z_{ij} \geq z \mid Z_{ij}\geq 1] \leq (1/2)^{z-1}$.
We thus conclude that $\Pr[Z_{ij} \geq z] \leq 2^{2-z}/(j-i)$, 
which completes the proof of the claim.

Using \eqref{eq:edZ} together with the above claim regarding $\EX[Z_{ij}]$,
we get that
\[
\half\ed(P,Q)
    \leq 4 \sum_{i<j; P[i]>P[j]} \frac{1}{j-i}\enspace.
\]
Notice that for every $i<j$ with $P[i]>P[j]$, 
the respective coordinates of $f(P)$ and $f(Q)$ are
\[
f(P)_{\{P[i],P[j]\}}=\frac{1}{i-j} < 0 \qquad\text{and}\qquad 
f(Q)_{\{P[i],P[j]\}} = \frac{1}{P[i]-P[j]} > 0\enspace,
\]
and hence
$\left|f(P)_{\{P[i],P[j]\}}-f(Q)_{\{P[i],P[j]\}}\right| > 1/(j-i)$.
We conclude that
\[
\half\ed(P,Q) 
   \leq 4 \|f(P)-f(Q)\|_1\enspace,
\]
which completes the proof of \lemref{lem:LB}.
\end{proof}

This completes the proof of \thmref{thm:embed_ulam}.
We note that the bounds in \lemref{lem:UB} and \lemref{lem:LB}
are existentially tight, up to constant factors (for our embedding $f$).

\section{Applications}  \label{sec:applications}

We present several applications of our $\ell_1$-embedding 
of the Ulam metric {from} \secref{sec:ulam}.
Following~\cite{BJKK04}, we say that a string is \emph{$t$-non-repetitive},
if all its $t$-substrings are distinct. 
We first extend the embedding to strings that are $t$-non-repetitive 
(\secref{sec:embed_nonrep}).
We also extend the above embedding 
to strings in which every character appears a bounded number of times
(\secref{sec:embed_bdd}).
Both extensions follow by showing that the corresponding strings 
can be embedded in the Ulam metric with low distortion.
We then discuss the immediate applications of these embeddings
to sketching algorithms (\secref{sec:sketching}) and to
Nearest Neighbor Search (\secref{sec:NN}).

A technically more involved application of the above embedding 
is an improvement to a result of Bar-Yossef \etal~\cite{BJKK04}.
Call a string \emph{$(t,r)$-non-repetitive}
if every $r$ successive $t$-substrings of it are distinct.
We improve over the sketching algorithm of~\cite{BJKK04} for 
locally non-repetitive strings in two aspects: 
(i) we achieve an embedding result, which is stronger than a sketching
algorithm (namely, a sketching algorithm follows quite easily); and 
(ii) we improve the approximation factor.
See \secref{sec:embed_local_nonrep} for more details.


\subsection{Embedding non-repetitive strings}
\label{sec:embed_nonrep}

Recall that a string is \emph{$t$-non-repetitive},
if all its $t$-substrings are distinct. 
Let $X_{n,t}$ contain all the $t$-non-repetitive strings 
of length $n$ over $\Sigma$.

\begin{theorem} \label{thm:embed_nonrep}
The metric space $(X_{n,t},\ed)$ embeds with distortion $2t$
into the Ulam metric of dimension $n-t+1$ and alphabet size $2^t$.
Consequently, it embeds into $\ell_1$ with distortion $O(t\log n)$.
\end{theorem}

The proof of the first part of the theorem is based on a simple
observation, and that of the second part is an immediate consequence
of \thmref{thm:embed_ulam}.

\begin{proof}
We define an embedding $f$ of $(X_{n,t},\ed)$ into the aforementioned Ulam metric.
First, we identify the Ulam metric alphabet $[2^t]$ with $\zo^t$
(using an arbitrary bijection).
Now for a $t$-non-repetitive string $x\in\zo^n$,
define $f(x)$ to be a length $n-t+1$ string (over $[2^t]$),
whose $j$th coordinate is given by $f(x)_j=b^{-1}(x[j]\ldots x[j+t-1])$.
To complete the proof, we will show that for every two strings 
$x,y\in\Sigma^n$, 
\begin{equation} \label{eq:bit}
  \ed(x,y) \leq \ed(f(x),f(y)) \leq t \ed(x,y)\enspace.
\end{equation}

To prove the first inequality, consider a \lcs\ between $f(x)$ and $f(y)$.
Each character in it corresponds to a $t$-substring in $x$ and in $y$.
Taking the first character {from} each of these $t$-substrings,
except for the last such $t$-substring, which is taken in its entirety,
yields a subsequence that is common to $x$ and $y$,
and hence $\LCS(x,y) \geq \LCS(f(x),f(y))+ t-1$.
We obtain that 
\[
\half \ed(x,y) \leq n-\LCS(x,y) \leq n-t+1-\LCS(f(x),f(y)) \leq \ed(f(x),f(y))\enspace.
\]

To prove the second inequality, fix a \lcs\ of $x$ and $y$,
and with a slight abuse of notation let $\LCS(x,y)$ denote
both this sequence and its length.
Consider all the $t$-substrings of $x$ and of $y$
that are entirely contained in $\LCS(x,y)$.
Observe that the number of such $t$-substrings 
is at least $n-t+1- t(n-\LCS(x,y))$,
because each of the $n-\LCS(x,y)$ characters that do not belong 
to $\LCS(x,y)$ participates in $t$ or fewer $t$-substrings.
These $t$-substrings in $x$ and in $y$ give rise to a subsequence 
that is common to $f(x)$ and $f(y)$.
Therefore, $\LCS(f(x),f(y)) \geq n-t+1- t(n-\LCS(x,y))$, implying that 
\[
\ed(f(x),f(y)) \leq 2(n-t+1 - \LCS(f(x),f(y))) \leq t(n-\LCS(x,y)) 
 \leq t \ed(x,y)\enspace.
 \]
\end{proof}

\subsection{Embedding bounded-occurrence strings}
\label{sec:embed_bdd}

We say that a string $P$ has \emph{$t$-bounded-occurrence}
if every character $a\in\Sigma$ appears at most $t$ times in $P$.
Let $B_{n,t}\subseteq\Sigma^n$ contain all the $t$-bounded-occurrence 
strings of length $n$ over $\Sigma$.

\begin{theorem} \label{thm:embed_bdd}
The metric space $(B_{n,t},\ed)$ embeds with distortion $t$
into the Ulam metric of dimension $n$ over an extended alphabet of size 
$t|\Sigma|$.
Consequently, it embeds into $\ell_1$ with distortion $O(t\log n)$.
\end{theorem}

The proof the first part of the theorem is based on a simple
observation, and that of the second part follows immediately {from}
\thmref{thm:embed_ulam}.

\begin{proof}
Let $\Sigma'$ be an alphabet of size $t|\Sigma|$, 
and associate every character $a\in\Sigma$ with $t$ 
distinct characters $a_1,\ldots,a_t\in\Sigma'$.
Given a string $x$, let $f(x)$ be the string obtained {from} $x$ 
by replacing, for every $j\in[t]$ and every character $a\in\Sigma$, 
the $j$th occurrence of $a$ in $x$ with the character $a_j\in\Sigma'$.
To complete the proof of the first part,
it would suffice to prove that for every two strings 
$x,y\in\Sigma^n$,
\begin{equation} \label{eq:bit2}
  \half\ed(x,y) \leq \ed(f(x),f(y)) \leq t \ed(x,y)\enspace,
\end{equation}
and it is indeed straightforward to verify this inequality.
\end{proof}


\subsection{Sketching algorithms}
\label{sec:sketching}

A sketching algorithm for edit distance consists of
two procedures that work in concert as follows.
The first procedures produce a fingerprint, called \emph{sketch}, 
from each of the input strings, and
the second procedure uses solely the sketches
to approximate the edit distance between the two strings.
In the $k$ vs.\ $m$ gap version of the problem,
there is a promise that the edit distance between the two strings
is either at most $k$ or more than $m$,
and we wish to decide which of the two cases holds. 
The key feature is that the sketch of each string is constructed
without knowledge of the other string. 
The procedures are randomized and are allowed to share random coins, and
the main measure of complexity is the size of the sketches produced. 
For actual applications it is also desirable that both procedures are efficient
(say run in time that is polynomial in their input size).


In contrast to Hamming distance, whose sketching complexity is
well-understood~\cite{IM98, KOR00, FIMNSW}, 
relatively little is known about sketching of edit distance. 
The result of Ostrovsky and Rabani~\cite{OR05} gives a sketching
algorithm that, for every $k=k(n)$, 
distinguishes between pairs of strings at edit distance
at most $k$ and at least $k\cdot 2^{O(\sqrt{\log n \log \log n})}$
using sketches of size $O(1)$.

For strings that are $t$-non-repetitive (including \eg\ permutations), 
Bar-Yossef \etal~\cite{BJKK04} give a sketching algorithm that solves, 
for every $k=k(n)$,
the $k$ vs.\ $\Omega(tk^2)$ gap edit distance problem
using sketches of size $O(1)$.
We improve over their algorithm as follows.

\begin{theorem}[Sketching $t$-non-repetitive strings]
\label{thm:sketch_nonrep}
For every $k=k(n)$ there exists a polynomial-time sketching 
algorithm that solves the $k$ vs.\ $\Omega(kt\log n)$ gap 
edit distance problem on $t$-non-repetitive strings of length $n$ 
using sketches of size $O(1)$.
\end{theorem}

The proof of the theorem is a simple consequence of the 
$\ell_1$-embedding {from} \thmref{thm:embed_nonrep},
First, convert the $\ell_1$-metric can be into a scaled Hamming metric.
Observe that a scaling factor of $O(n^2)$ suffices:
rounding each coordinate in the $\ell_1$-embedding to multiples of 
$1/Cn^2$, for a sufficiently large constant $C>0$,
increases the distortion by a factor $2$,
because $f(P)_{\{a,b\}} - f(Q)_{\{a,b\}}$ for $f$ in \eqref{eq:embedding}
is either zero or has absolute value $\Omega(1/n^2)$.
(A different argument is that $f(P)$ has at most $n^2$ nonzero coordinates,
and thus the total error due to rounding is at most additive $1/C$.)
Then, use the sketching algorithm stated below for Hamming spaces,
which is implicit in~\cite{KOR00}, and can also be derived {from} 
the locality-sensitive hashing algorithms of~\cite{IM98,Indyk00}
(using the fact that for binary strings there is a direct correspondence 
between Hamming distance and $\ell_2$ distance).

\begin{theorem}[Sketching Hamming metric~\cite{KOR00}]
\label{thm:sketch_l1}
For every $\varepsilon > 0$ and $k=k(n)$,
there is a polynomial-time sketching algorithm that solves
the $k$ vs.\ $(1 + \varepsilon)k$ gap Hamming distance problem
in binary strings of length $n$,
with a sketch of size $O(1/\varepsilon^2)$.
\end{theorem}

We note that besides being a very basic computational primitive
for massive data sets (see \eg~\cite[Section 4.6]{Cormode:Thesis}),
sketching is also related to (i) Nearest Neighbor Search (see below), 
(ii) protocols that are secure (\ie, leak no information),
cf.~\cite{FIMNSW}, and (iii) the simultaneous messages communication model with
public coins~\cite{Yao79}.

\subsection{Nearest Neighbor Search}
\label{sec:NN}

One of the most extensively studied computational problems is 
\emph{Nearest Neighbor Search} (NN): 
Given a set $S$ of points in a metric space $X$, 
preprocess $S$ so as to efficiently answer queries for finding 
the point in $S$ that is closest to a query point $q\in X$.
The last decade has seen the advent of data structures for 
approximate NN in (high) $n$-dimensional normed spaces.
In particular, algorithms with preprocessing space (storage) 
polynomial in $n$ and $|S|$
and query time that is polynomial in $n$ and $\log |S|$,
were designed in~\cite{IM98,KOR00} for $\ell_1^n$ and $\ell_2^n$ 
(achieving approximation factor $1+\varepsilon$ for every constant 
$\varepsilon>0$)
and in~\cite{Indyk98} for $\ell_\infty^n$ 
(achieving approximation factor $O(\log\log n)$).
Other algorithms for $\ell_1^n$ and $\ell_2^n$, due to~\cite{IM98,GIM99}, 
achieve $1+\varepsilon$ approximation
require preprocessing space that is more moderate (subquadratic in $|S|$) 
and have query time that is sublinear in $|S|$ 
(roughly $O(|S|^{1/(1+\varepsilon)})$).

In contrast, the known algorithmic guarantees for approximate NN 
in edit distance metrics $(X,\ed)$ are much weaker.
For the general case $(\Sigma^n,\ed)$, 
Indyk~\cite{Indyk04} designed, for every fixed $\alpha>0$, a constant factor 
approximation (exponential in $1/\alpha$)
with space requirement that is exponential in $n^\alpha$.
The recent embedding result of Ostrovsky and Rabani~\cite{OR05} leads to a 
nearest neighbor data structure with approximation factor 
$2^{O(\sqrt{\log n \log \log n})}$ and polynomial space requirement.

Combining our $\ell_1$-embedding {from} \thmref{thm:embed_bdd}
with these NN algorithms for $\ell_1$ 
(or Hamming space or $\ell_2$, as discussed in \secref{sec:sketching})
immediately yields a nearest neighbor algorithm for $t$-non-repetitive strings 
(including \eg\ permutations) achieving approximation factor $O(t\log n)$, 
which improves over the bound obtain in~\cite{BJKK04} for this case.
In particular, using the algorithms of~\cite{IM98,KOR00}
results in query time $(n+\log |S|)^{O(1)}$,
and space requirement that $(n+|S|)^{O(1)}$.
Similarly, using the algorithms of~\cite{IM98,GIM99}
results in query time that is sublinear in $|S|$ 
and space requirement that is subquadratic in $|S|$.

The sketching algorithm can alternatively be used to speed up 
the naive algorithm that computes the edit distance between the query $q$ 
and every data point $x\in S$. 
Simply replace each edit distance computation with an estimate derived 
{from} a sketch of $x$ (computed at the preprocessing stage)
and a sketch of $q$.
The number of iterations would still be $O(|S|)$,
but each iteration will be much faster--about $O(\log |S|+\log\log n)$ 
time.


\subsection{Embedding locally non-repetitive strings}
\label{sec:embed_local_nonrep}

We can further generalize \thmref{thm:embed_nonrep} 
and obtain an embedding of strings that are locally non-repetitive 
(see the definition below).
The guarantee of this embedding is slightly weaker than a low distortion,
since it approximates well only distances that are sufficiently small.
An interesting aspect of our embedding is that it uses the sketching
algorithm of~\cite{KOR00} to obtain an embedding; this is opposite to 
the usual direction, where an embedding is used to obtain a sketching 
algorithm.

Our results improve over~\cite{BJKK04} in several respects:
(i) We give an embedding, which consequently leads to a sketching algorithm,
while~\cite{BJKK04} only give a sketching algorithm.
An embedding result is stronger (unless it is computationally inefficient) 
since it has to simultaneously handle exponentially many strings,
including pairs $x,y$ with rather different distance $\ed(x,y)$, 
while a sketching algorithm is only required to have 
a high success probability for every pair.
(ii) Our approximation factor is smaller since we rely on the Ulam metric
embedding.

\begin{definition}[Locally non-repetitive string]
A string is called \emph{$(t,r)$-non-repetitive},
or in short \emph{locally non-repetitive},
if every $r$ successive $t$-substrings are distinct,
\ie, for each interval $\{i,\ldots,i+r-1\}$, the $r$ substrings 
of length $t$ each that start in this interval are distinct.
\end{definition}

Let $X_{n,t,r}$ contain all the $(t,r)$-non-repetitive strings of 
length $n$ over $\Sigma$.
Notice that this family contains $X_{n,t}$ since
every $t$-non-repetitive string is also $(t,r)$-non-repetitive.


\begin{theorem}[Embedding] \label{thm:embed_local_nonrep}
For every $t=t(n)$ and every $k=k(n)$, there
exists an embedding $f$ of the $(t,180tk)$-non-repetitive strings 
into $\ell_1$, such that for every two such strings $x,y$,
\[
  \Omega(\min\{k, \ed(x,y)/t\log(tk)\})
  \leq \|f(x)-f(y)\| 
  \leq \ed(x,y)\enspace.
\]
\end{theorem}

This embedding builds on the iterative anchors technique used in the 
sketching algorithm of Bar-Yossef \etal~\cite[Section 2]{BJKK04}. 
The basic idea is to pick a set of non-overlapping 
substrings of length $t$ in a coordinated fashion.
These substrings are referred to as anchors and 
partition the string into disjoint substrings.
The substrings between anchors are embedded into $\ell_1$
and the embedding for the original string is obtained 
by combining these embeddings in a suitable way.
The key idea from~\cite{BJKK04} is a method to pick these
anchors in such a way that if $x$ and $y$ are two strings
with small edit distance, then the anchor selection 
process picks the same anchors for both $x$ and $y$.
One technical difference is that, between successive anchors, 
we employ the $\ell_1$-embedding {from} \secref{sec:ulam}. 
Another difference is that as a final step we apply the $\ell_1$
sketching algorithm of~\cite{KOR00}, which effectively ``thresholds'' 
the $\ell_1$ distance between images.
For clarity, we make no attempt to optimize constants.

\begin{proof}
We describe the embedding of a $(t,180tk)$-non-repetitive strings $x$ 
using a randomized procedure that generates a bit $f'(x)$.
The embedding $f$ into $\ell_1$ will then be the concatenation of $f'(x)$, 
ranging over all possible outcomes for the coin tosses, with suitable scaling.

Fix $W := 56 tk$. Augment the alphabet $\Sigma$ with $2W+t$ new characters
$a_1,\ldots,a_{2W+t}$ and append to $x$ the fixed string $a_1\ldots a_{2W+t}$.
Select a sequence of disjoint substrings
$\alpha_1,\ldots,\alpha_{r_x}$ of $x$, called ``anchors,''
iteratively as follows. 
Maintain a sliding window of length $2W+t$ over the string $x$. 
The left endpoint of the sliding window 
is denoted by $c$; initially, $c$ is set to $1$.
At each step, say step $i$, consider the $W$ substrings of length $t$ 
whose starting position lies in the interval $[c+W\ldots c+2W-1]$,
and denote the $j$th such substring, for $j\in[W]$,
by
\[
s_{i,j}=x[c+W+j-1\ldots c+W+j+t-2]\enspace.
\]
Select at random a permutation $\Pi_i$ of $\Sigma^t$,
and set the anchor $\alpha_i$ to be a substring $s_{i,l}$ 
that is minimal according to $\Pi_i$ (breaking ties arbitrarily), \ie,
$$
  \Pi_i(s_{i,l}) = \min \bigl\{\Pi_i(s_{i,1})),\ldots,\Pi_i(s_{i,W})\bigr\}\enspace.
$$
Then slide the window by setting $c$ to the position immediately
following the anchor, \ie, 
\[
c\gets c+W+l+t-1\enspace.
\]
If this new value of $c$ is at most $n$
start a new iteration.
Otherwise, stop, letting $r_x\leq O(n/tk)$ be the number of anchors collected.

For $i \in [r_x]$, let $\phi^i=\phi^i(x)$ be the substring of $x$ starting at 
the position immediately after the last character of anchor $\alpha_{i-1}$
and ending at the last character of $\alpha_i$.
By convention, $\phi^1$ starts at position $1$.

Now embed each $\phi^i$ into $\ell_1^{O(tk)}$ using 
\thmref{thm:embed_nonrep};
notice that $\phi^i$ is a substring of $x$ of length at most $2W+t \leq 180tk$, 
and is thus $t$-non-repetitive.
Next concatenate the resulting $r_x = O(n/tk)$ images into a vector 
$\varphi(x)\in \ell_1^{O(n)}$ (appending $0$'s as necessary).
Finally, choose a random bit string $r\in\zo^{O(n)}$ of the same length, 
such that for all $i$, $r_i=1$ independently with probability $1/kt\log(kt)$,
and let 
\[
f'(x)= \sum_i r_i\cdot\varphi_i(x) \mod 2\enspace.
\]
(This step is similar to~\cite{KOR00}, though the purpose of using it here
is very different.)

The embedding's correctness follows immediately from the next two
lemmas, which we shall prove shortly.

\begin{lemma} \label{lem:local_nonrep1}
If $x$ and $y$ are $(t,180tk)$-non-repetitive strings then
\[
\Pr[f'(x)\neq f'(y)] \leq O(\ed(x,y) / k)\enspace.
\]
\end{lemma}

\begin{lemma} \label{lem:local_nonrep2}
If $x$ and $y$ are $(t,180tk)$-non-repetitive strings then
\[
\Pr[f'(x)\neq f'(y)] \geq \Omega(\min\{\ed(x,y)/kt\log(kt) ,1\})\enspace.
\]
\end{lemma}

The proof of \thmref{thm:embed_local_nonrep} is completed
by observing that the concatenation of bits $f'$ with suitable scaling
yields an embedding $f$ which satisfies
\[
\|f(x)-f(y)\|_1 = k \EX|f'(x)-f'(y)| = k \Pr[f'(x)\neq f'(y)]\enspace.
\]
\end{proof}

\begin{proof} [Proof of \lemref{lem:local_nonrep1}]
The preliminary step of appending $x$ and $y$ 
with the same string of length $2W+t$ clearly does not change $\ed(x,y)$.
Now fix a sequence of edit operations $\tau$ that transforms this new $x$ 
into the new $y$ and uses $\ed(x,y)$ edit operations.
Let $\alpha_i$ be the $i$th anchor chosen for $x$ and let $\beta_i$
be the $i$th anchor chosen for $y$.
Let $r=\min\{r_x,r_y\}$. 
As in the proof of Lemma 2.6 in~\cite{BJKK04},
with probability at least $1-\ed(x,y)/7k$, the following event happens:
for all $i\in[r]$,
the tranformation $\tau$ (sequence of edit operations)
maps $\alpha_i$ to $\beta_i$
with no edit operations inside $\alpha_i$ or $\beta_i$.%
\footnote{The argument in~\cite{BJKK04} goes roughly as follows:
at a single iteration, the probability that the anchor selection goes wrong 
is at most $t$ times the number of edit operations inside the current window 
divided by $W$ (since there are $W$ choices for the anchor).
It can be verified that an edit operation can only affect one iteration,
and a union bound over all iterations 
gives an upper bound of $\ed(x,y)/7k$.
}
If this happens, we say that the \emph{anchors match}.

Assume for the moment that the anchors match.
Using the fact that $\{\phi^i(x)\}_{i\in [r]}$ are disjoint substrings 
of $x$ and similarly $\{\phi^i(y)\}_{i\in [r]}$ for $y$, we get that 
\[
\ed(x,y) = \sum_{i\in[r]} \ed(\phi^i(x),\phi^i(y))\enspace.
\]
Furthermore, at least one of the anchors $\alpha_r$ and $\beta_r$ 
is the last anchor in its string and thus contains one of the unique
$2W+t$ characters that were appended to $x$ and $y$.
But since the anchors $\alpha_r = \beta_r$, 
both must be the last anchor in their string, and thus $r_x=r_y$.
Using the guarantees of \thmref{thm:embed_nonrep}, we get that
\[
\|\varphi(x)-\varphi(y)\|_1 
  \leq \sum_{i\in[r]} O(t\log(2W+tk)) \ed(\phi^i(x),\phi^i(y))
  \leq O(t\log(tk)) \ed(x,y)\enspace.
\]

Finally, the analysis in~\cite{KOR00} of a random inner product modulo $2$
shows that 
\begin{equation} \label{eq:inner}
  \Pr\bigl[f'(x)\neq f'(y) \mid \varphi(x), \varphi(y)\bigr] 
  = \Theta(\min\{\|\varphi(x)-\varphi(y)\|/kt\log(kt), 1\})\enspace.
\end{equation}
We can then bound $\Pr[f'(x)\neq f'(y)]$ {from} above by conditioning on
whether the anchors match.
If they do match, $f'(x)\neq f'(y)$ with probability $O(\ed(x,y)/k)$.
Otherwise (which happens with probability at most $\ed(x,y)/7k$),
$f'(x)\neq f'(y)$ with probability at most $1$. We thus conclude that
\[
\Pr[f'(x)\neq f'(y)] \leq O(\ed(x,y)/k)\enspace.
\]
\end{proof}

\begin{proof} [Proof of \lemref{lem:local_nonrep2}]
The preliminary step of appending $x$ and $y$ 
with the same string of length $2W+t$ clearly does not change $\ed(x,y)$.
Now similar to the proof of Lemma 2.8 in~\cite{BJKK04},
let $r=\max\{r_x,r_y\}$ and 
for $i = r_x + 1,\ldots,r$ let $\phi^i(x) = \epsilon$ be the empty string
and similarly for $i = r_y +1,\ldots,r$ let $\phi^i(y) = \epsilon$.
Let $g$ be the $\ell_1$-embedding {from} \thmref{thm:embed_nonrep}.
Then for all $i\in[r]$ we have 
\[
\|g(\phi^i(x))-g(\phi^i(y))\| \geq \Omega(\ed(\phi^i(x),\phi^i(y)))\enspace.
\]
Since the substrings $\{\phi^i(x)\}_{i\in[r]}$ induce a partition 
of the string $x$ and similarly $\{\phi^i(y)\}_{i\in[r]}$ for $y$,
we get that
\[
\ed(x,y) 
  \leq \sum_{i\in[r]} \ed(\phi^i(x),\phi^i(y)) 
  \leq O(\|\varphi(x)-\varphi(y)\|)\enspace.
\]
The lemma follows by applying the analysis of
a random inner product modulo $2$, as stated in \eqref{eq:inner}.
\end{proof}



\paragraph{Improved sketching algorithm}
The embedding of \thmref{thm:embed_local_nonrep} leads 
to the following sketching result.

\begin{theorem}[Sketching] \label{thm:sketch_local_nonrep}
For every $t=t(n)$ and every $k=k(n)$, there
exists a polynomial-time efficient sketching algorithm that solves
the $k$ vs.\ $\Omega(t k\log k)$ gap edit distance problem for
$(t,180tk)$-non-repetitive strings using sketches of size $O(1)$.
\end{theorem}

The proof follows in a straightforward way
by ``concatenating'' the embedding of \thmref{thm:embed_local_nonrep}
with a sketching algorithm for the $k'$ vs.\ $k'(1+\epsilon)$ gap 
Hamming distance (or $\ell_1$) problem that is implicit in~\cite{IM98,KOR00}.
We note that the embedding uses many dimensions (coordinates),
but for the purpose of sketching it suffices to 
generate only $O(kt\log(kt))$ coordinates $f'$ at random,
which can be done efficiently using the shared random coins.
It is also easy to verify that the random permutation $\Pi$ can be replaced
by an almost min-wise hash family that is efficiently computable using shared
randomness, similar to~\cite{BJKK04}.

Note that a permutation is $(1,r)$-non-repetitive for every $r\geq 1$,
and so this theorem offers a somewhat unexpected small improvement 
for the Ulam metric (over \thmref{thm:sketch_nonrep}), 
reducing the gap {from} $O(\log n)$ to $O(\log k)$.


\paragraph{Acknowledgements}

We thank the anonymous reviewers for helpful comments.

\bibliographystyle{tocplain}
\bibliography{a011}

\begin{tocauthors}
\begin{tocinfo}[moses]%
Moses Charikar\\
Dept.\ of Computer Science\\
Princeton University\\
35 Olden Street\\
Princeton, NJ 08540, USA\\
\texttt{moses\tocat cs\tocdot princeton\tocdot edu}\\
\url{http://www.cs.princeton.edu/~moses/}
\end{tocinfo}

\begin{tocinfo}[robi]%
Robert Krauthgamer\\
IBM Almaden Research Center\\
Department K53/B2\\
650 Harry Road\\
San Jose, CA 95120, USA\\
\texttt{robi\tocat almaden\tocdot ibm\tocdot com}\\
\url{http://www.almaden.ibm.com/cs/people/robi/}
\end{tocinfo}
\end{tocauthors}

\begin{tocaboutauthors}
\begin{tocabout}[moses]
\textsc{Moses Charikar} is an Assistant Professor in the 
\href{http://www.cs.princeton.edu/}{Computer Science department} at
\href{http://www.princeton.edu/}{Princeton University}. 
He received his \phd\ in 2000 from
\href{http://www.stanford.edu/}{Stanford University}
under the supervision of
\href{http://theory.stanford.edu/~rajeev/}{Rajeev Motwani}.
Before that, he obtained his undergraduate degree 
from the \href{http://www.cse.iitb.ac.in/}{Indian 
Institute of Technology, Bombay}.
His research interests are in approximation algorithms,
metric embeddings, and algorithmic techniques for
large data sets.
His work on dimension reduction in $\ell_1$ 
won the Best Paper award at FOCS 2003.
A one year stint in the research group
at \href{http://www.google.com/}{Google}
gave him an opportunity to apply his theoretical ideas 
in the real world.
He still reaps the benefits of that experience
-- he has successfully managed to retain the top spot
for a Google search on his last name, but has wisely
given up trying to compete with his well-known 
namesake for searches on his first name.

\end{tocabout}
\begin{tocabout}[robi]
\textsc{Robert Krauthgamer} is a Research Staff Member in the
\href{http://www.almaden.ibm.com/software/disciplines/pm/}{theory group} 
at the \href{http://www.almaden.ibm.com/}{IBM Almaden Research Center}
in San Jose, CA.  He received his \phd\ in 2001 from the 
\href{http://www.weizmann.ac.il/}{Weizmann Institute of Science} in Israel.
A paper, coauthored (as part of his thesis) with his advisor,
\href{http://www.wisdom.weizmann.ac.il/~feige/}{Uri Feige},
was awarded the 2005 SIAM Outstanding Paper Prize.
Subsequently he spent two years as a postdoc in the
\href{http://www.eecs.berkeley.edu/Research/Areas/CS/THY/}{theory 
group at Berkeley}.
His research interests include combinatorial algorithms, 
finite metric spaces, high-dimensional geometry, data analysis, 
and related areas.
His favorite sport since youth has been swimming;
once he swam across the 
\href{http://en.wikipedia.org/wiki/Sea\_of\_Galilee}{Sea of Galilee} 
in a 10km competitive race,
and was the last one to arrive at the finish line.
\end{tocabout}

\end{tocaboutauthors}

\end{document}
