%% a008.tex   10-05-2005

\documentclass{toc}

\def\nev#1{{}}
\newtheorem{lemmaa}{Lemma}
\newtheorem{prop}{Proposition}
\newtheorem{cond}{}
\newtheorem*{remarks}{Remarks}
\newtheorem*{definitions}{Definitions}
\newenvironment{definitionlist}{%
  \begin{definitions}\hfill%
    \begin{enumerate}\setlength{\itemsep}{0pt}}%
    {\end{enumerate}\end{definitions}}
\renewcommand{\thecond}{\rm(\arabic{cond})}
\renewcommand{\thelemmaa}{{A{{\arabic{lemmaa}}}}}

\newcounter{bean}
\def\calq{{\cal Q}}
\def\cala{{\cal A}}
\def\calb{{\cal B}}
\def\cald{{\cal D}}
\def\calh{{\cal H}}
\def\cali{{\cal I}}
\def\calu{{\cal U}}
\def\calm{{\cal M}}
\def\caln{{\cal N}}
\def\calv{{\cal V}}
\def\calg{{\cal G}}
\def\calc{{\cal C}}
\def\calk{{\cal K}}
\def\calp{{\cal R}}
\def\calr{{\cal R}}
\def\cals{{\cal S}}
\def\calf{{\cal F}}
\def\call{{\cal L}}
\def\calt{{\cal T}}
\def\boldk{{\bf K}}
\def\tilda{{\tilde A}}
\def\tildc{{\tilde C}}
\def\tildd{{\tilde D}}
\def\phi{{\varphi}}
\def\range{{\tt range}}
\def\domain{{\tt domain}}
\def\ti{{\rm A}}
\def\pii{{\rm P2}}
\def\piii{{\rm P3}}
\def\la{{\langle}}
\def\ra{{\rangle}}
\def\lb{{\lbrace}}
\def\rb{{\rbrace}}
\def\zee{{\bf Z}}
\def\rac{{\bf Q}}
\def\init{{\tt init}}
\def\out{{\tt out}}
\def\state{{\tt state}}
\def\rstate{{\tt rstate}}
\def\register{{\tt register}}
\def\cont{{\tt cont}}
\def\rightborder{{\tt right}}
\def\border{{\tt border}}
\def\set{{\tt set}}
\def\link{{\tt link}}
\def\core{{\tt core}}
\def\acc{{\tt acc}}
\def\fan{{\tt fan}}
\def\stem{{\tt stem}}
\def\diag{{\rm diag}}
\def\func{{\rm func}}
\def\sub{{\rm sub}}
\def\rank{{\rm rank}}
\def\bl{{\rm bl}}
\def\Func{{\rm Func}}
\def\start{{\tt start}}
\def\sink{{\tt sink}}
\def\variable{{\tt var}}
\def\vvalue{{\tt val}}
\def\prob{{\tt prob}}
\def\bcks{{\backslash}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\bibliographystyle{tocplain}

\tocdetails{%
  volume=1, number=8, year=2005, firstpage=149,
  received={October 6, 2004},
  revised={May 5, 2005},
  published={October 5, 2005}}

\runningtitle{A Non-linear Time Lower Bound for Boolean Branching
  Programs}
\runningauthor{Mikl\'{o}s~Ajtai}
\copyrightauthor{Mikl\'{o}s~Ajtai}

\begin{document}

\begin{frontmatter}
  \title{A Non-linear Time Lower Bound for Boolean Branching Programs}
  \author[miklos]{Mikl\'os Ajtai}
 
  \begin{abstract}
    We give an exponential lower bound for the size of any linear-time
    Boolean branching program computing an explicitly given function.
    More precisely, we prove that for all positive integers $k$ and
    for all sufficiently small $\epsilon >0$, if $n$ is sufficiently
    large then there is no Boolean (or $2$-way) branching program of
    size less than $2^{\epsilon n}$ which, for all inputs $X\subseteq
    \lbrace 0,1,{\ldots},n-1\rbrace $, computes in time $kn$ the
    parity of the number of elements of the set of all pairs $\langle
    x,y\rangle $ with the property $x\in X$, $y\in X $, $x<y$, $x+y\in
    X$.  \nev{201}
 
    For the proof of this fact we show that if
    $A=(a_{i,j})_{i=0,j=0}^{n}$ is a random $n$ by $n$ matrix over the
    field with $2$ elements with the condition that ``$A$ is constant
    on each minor diagonal,'' then with high probability the rank of
    each $\delta n$ by $\delta n$ submatrix of $A$ is at least
    $c\delta |\log \delta |^{-2}n$, where $c>0$ is an absolute
    constant and $n$ is sufficiently large with respect to $\delta$.
 
    (A preliminary version of this paper has appeared in the
    Proceedings of the 40th IEEE Symposium on Foundations of Computer
    Science.) 
  \end{abstract}

\tocams{68Q17, 68Q15}

\tocacm{F.2.2, F.2.3}

\tockeywords{complexity theory, lower bounds, space complexity,
  branching programs, Hankel matrices, matrix rigidity}
\end{frontmatter}

\section{Introduction}

\subsection{The statement of the main result} A Boolean (or $2$-way)
branching program is a finite directed acyclic graph (which may
contain parallel edges) with a unique source node, so that each
non-sink node is labeled by one of the input variables
$x_{0},{\ldots},x_{n-1}$, each non-sink node has outdegree two, each edge
is labeled by an element of $\lbrace 0,1\rbrace $ so that the two
outgoing edges of a non-sink node always get different labels, and
each sink-node is labeled by an element of $\lbrace 0,1\rbrace $.  If
an input is given we start from the unique source node and go along a
path according to the following rule.  If we are at node $v$ and the
label of $v$ is the variable $x_{i}$ then we leave $v$ on the unique
outgoing edge whose label is the value of $x_{i}$.  This path will end
in a sink node; the label of the sink-node is the output of the
program at the given input, the length of the path is the
computational time at the given output, the maximal length of a path
in the graph that we may get from an input this way is the length (or
depth) of the branching program.  The number of nodes in the graph is
the size of the branching program. \nev{203}

This model describes a very general way of computing where the
computational time measures the number of accesses to the individual
bits of the input and the size measures the number of different states of
the machine performing the computations.  We do not measure the
computational time needed to determine the next state of our machine
(that is, the next node in the graph along the path).  We may also
think about this model as a random access machine whose input
registers contain a single input bit, with a working memory containing
$\log_{2} M$ bits where $M$ is the size of the branching program.
\nev{204}

Our goal is to give an explicit function which cannot be computed with
a Boolean branching program in linear time if the size of the
branching program is $2^{\epsilon n}$. The function has to assign to
each $\{0,1\}$-sequence
of length $n$ a single $\{0,1\}$-value.  We identify
the set of all $\{0,1\}$-sequences of length $n$ with the set of all
subsets of $\lbrace 0,1,{\ldots},n-1\rbrace $, that is, our function
$f$ will assign to each subset $X$ of $\lbrace 0,1,{\ldots},n-1\rbrace$ 
a $\{0,1\}$-value.
For such a set $X$ let $f(X) $ be the parity of the
number of elements of the set of all pairs $\langle x,y\rangle $ with
the property $x\in X$, $y\in Y$, $x<y$, and $x+y \in X$. We will say
that $f(X)$ is the \emph{parity of interior sums}
for the set $X$, where the
expression ``interior" refers to the fact that both the terms in the
sum and the sum itself must be in $X$.  \nev{206}

Our main result is that the parity of interior sums for a set of $n$
elements cannot be computed with a Boolean branching program in linear
time if the size of the branching program is $2^{\epsilon n}$
(see \thmref{T1}).
\nev{207}

\subsection{The history of the problem and related results}
\subsubsection{Boolean and R-way branching programs} One of the main
goals of complexity theory is to describe explicitly given functions
which cannot be computed in certain computational models with
specified amount of resources.  Branching programs form one of the
most general models of computation, e.g. random access machines with
memory $M$ and running time $t$ can be described by branching programs
of size $2^{M}$ and time (depth) $t$.  Because of the generality and
simplicity of the model and its mentioned close connection to the
practical random access models of computation, finding lower bounds
for explicitly given functions in the branching program model in
general, and with the given values of parameters in particular (linear
time and sublinear memory) was always considered a question of great
importance. \nev{208}

A generalization of the Boolean branching programs is the
computational model of $R$-way branching programs. Since most of the
results that we use in our proofs were established for $R$-way
branching programs, we describe briefly its definition.  The
computation is done in the same way, along a path of a directed graph
as in the Boolean case,
with the following differences.  A set $\Gamma$ with $R$ elements is 
given,
the elements of $\Gamma$ are the possible values of the input variables.
Each non-sink node has outdegree $R$, and the outgoing edges are
labeled by the elements $\Gamma$ so that the $R$ outgoing edges have $R$
different labels.  Each sink node is labeled by an element of $\Gamma$. If
an input is given, that is, we assign an element of $\Gamma$ to each of 
the
input variables $x_{1},{\ldots},x_{n}$, then we follow a path of the graph
in the following way. We start at the unique source node. If we are at
node $v$ and the label of node $v$ is the variable $x_{i}$ then we
leave $v$ on the unique outgoing edge whose label is the element of
$\Gamma$ assigned to the variable $x_{i}$.  This path will end in a sink
node. The element of $\Gamma$ which is the label of this sink node is the
output of the $R$-way branching program. The value of $R$, in the
results most interesting for us, is $n^{c}$ where $c$ is a constant.
(This corresponds to the random access machines where each register
can contain $c \log_{2} n$ bits.)  \nev{209}


\subsubsection{Branching programs with many
output bits, and the time segmentation method} 

The computational model of $R$-way branching programs was introduced
by Borodin and Cook \cite{BC}, 
who proved a time-space trade-off for
sorting $n$ integers.  This work also introduced a method for proving
lower bounds about $R$-way branching programs in the special case
where the number of output bits is relatively large compared to the
time allowed for the computation. 
Several other lower bounds and time-space trade-offs of similar 
nature were 
given, see e.\,g. Abrahamson~\cite{Abrahamson86,Abrahamson91},
Beame~\cite{Bea}, Karchmer~\cite{Karchmer86}, Reisch and
Schnitger~\cite{Reisch-Schnitger/82}, and Yesha~\cite{ Yesha84}.
These lower bound proofs have a common high-level
structure, namely the time is cut into short intervals and we use the
fact that during such an interval any information that we can use
about the past must be contained in the limited memory at the
beginning of the interval. In particular if many output bits are
provided in a single time interval, then these may depend only on
those input values which are accessed during this time interval and
the the content of the memory at the the beginning of the interval.
\nev{210}

\subsubsection{Lower bounds for explicit
functions and decision problems}
Using the same high level proof structure,
and other new ideas, Beame, Saks and Jayram\footnote{T.S.  Jayram, 
formerly Jayram S. Thathachar} \cite{BST}
gave a lower bound on the computational time for an explicitly given
function with a Boolean branching program of size $2^{o(n)}$.  Namely
they proved that there is an $\epsilon>0$ so that the question whether
the quadratic form $\sigma^{T}Q\sigma$ is zero, (where $\sigma$ is the
input, a $\{0,1\}$-vector of length $n$, and $Q$ is the $n\times n$
Sylvester matrix over the field with three elements) cannot be decided
with a branching program of length $(1+\epsilon)n$ and of size
$2^{o(n)}$.  (The proof shows that the theorem holds for
$\epsilon=.0178$.)  This is the best previously known lower bound in
the direction of our main result.  In the same paper they gave a
nonlinear lower bound on the length of an $R$-way branching program
computing an explicitly defined function, (similar to the function
they used in the Boolean case.)  More precisely they prove that for
all $k$ there is an $r_{k}$ so that for all sufficiently large $n$
there is an (explicitly given) $0$-$1$ valued function
$g(x_{1},{\ldots},x_{n})$ of $n$ variables such that: (a) each
variable is takes its values from a set of size $r_{k}$ and (b) there
is no $r_{k}$-way and size $n^{c}$ branching program which computes
$g(x_{1},{\ldots},x_{n})$ in depth $kn$.  \nev{211}

The author of the present paper proved \cite{A1}
that the element distinctness problem (where each ``element" is the
value of a variable) cannot be decided with an $R$-way branching
program, for $R=c\log_{2} n$, in length linear in $n$ if the size of
the program is at most $2^{\epsilon n}$, provided that $c\geq 2$.  (If the
problem is to find two elements whose Hamming distance is smaller than
$\frac{1}{4}c\log_{2} n$ then for a similar lower bound on the length
the necessary restriction on the size is only $2^{\epsilon n\log_{2} n}$.)
These proofs are based on the analysis of certain combinatorial
properties of the input, which are very similar to the combinatorial
properties used in \cite{BST}.  The high level structure of the proof
still follows the time segmentation idea described earlier.  Since the
present proof uses some of the technical lemmata of \cite{A1}, we will
give later a more detailed description of its techniques.  At this
point we sketch only some of the basic ideas of the proof in
\cite{A1}.  Since these are also related to the proof methods of
\cite{BST}, this will show the additional ideas that are needed to make
the time segmentation method work
when the number of output bits is small.  \nev{212}

\subsubsection{Lower bounds for binary
functions and relations}
It is shown in \cite{A1} that if a function $f$ can be computed in
linear time with the given restrictions on the size then there are two
large disjoint subsets, $W_{1},W_{2}$, of the set of the input
variables and an input $\chi$ with the following properties. For each
$i=1,2$ we may change the input $\chi$ in many different ways by changing
the values of the variables in $W_{i}$ only, so that the output does
not change.  Moreover, for $i=1,2$ we can select a large set of
changes $Y_{i}$ so that even if we perform a change from $Y_{1}$ (on
the values of the variables in $W_{1}$) and another one from $Y_{2}$
(on the values of the variables in $W_{2}$) simultaneously, the output
remains unchanged.

In the case of the element distinctness problem we are able to chose
an input $\chi$ which meets these requirements with the additional
property that $\chi$ (which is a list of $n$ integers) consists of
pairwise distinct integers.  Therefore, if our branching program
solves the element distinctness program, its output is, say, $1$.
However we can prove the for a fixed $i=1,2$ the inputs that we get
from $\chi$ through the changes in $Y_{i}$, take more than $\frac{n}{2}$
different values on the set of variables $W_{i}$.  Therefore there
will be an integer $x$ so that for both $i=1,2$, $x$ will be a value
of a variable from $W_{i}$ if we perform suitable change on $\chi$ from
$Y_{i}$. Consequently performing both suitable changes simultaneously
we get an input which contains $x$ twice and still the unchanged
output is $1$. This contradicts the assumption that the program solves
the element distinctness problem.  \nev{213}

Similar ideas are used for the other relations, or functions in the
mentioned lower bounds.  In each case we need a function $F(x,y)$ or a
relation $R(x,y)$ with two variables so that if we can separately
change $x$ and $y$ in many different ways then among these changes
there will be two so that performing them simultaneously we are able
to change the value of $F(x,y)$ or $R(x,y)$.  If $R$ is the equality
predicate then, as we have seen, this can be guaranteed if both sets
of changes produces at least $\frac{n}{2}$ different input values.  In
the case of the Hamming distance problem described above the situation
is even better since $\frac{n}{2}$ can be replaced by $n^{1-\epsilon'}$ 
for
some small constant $\epsilon'>0$ (see \cite{A1}). (This is the reason 
that
the proof of the lower bound for the Hamming distance problem is much
simpler and gives a stronger result than the proof for the element
distinctness problem.) \nev{214}



\subsubsection{Quadratic forms and
rigidity}
The binary function $F(x,y)$ can be also defined by a quadratic form
$x^{T} By$.  Assume that, as before, $x$ and $y$ independently run
over large sets and now we want to guarantee that the $x ^{T}By$ is
not constant.  Motivated by similar considerations, quadratic forms
were studied by Borodin, Razborov, and Smolensky \cite{BRS}, Jayram
\cite{Tha}, and Beame, Saks, and Thatachar \cite{BST}. The result in
this direction that we will use in our paper is the following. (This
was proved in more general forms in \cite{BRS}, \cite{Tha},
and \cite{BST}, and also follows from the results of \cite{CG85}.)
Suppose that the rank of the matrix $B$ is $r$ and $x$ resp. $y$ are
taking values independently from $m_{1}$ resp. $m_{2}$ dimensional
subspaces of an $m$ dimensional vectorspace. If $m_{1}+m_{2}+r> 2m$
then the quadratic form $x^{T}By$ is not constant. \nev{501}

As we described earlier, in the lower bound proofs we can usually
guarantee only that $x$ and $y$ can take values independently only in
some limited sense, namely we can apply independent changes to two
disjoint sets of variables. In \cite{BST} the mentioned property of
the quadratic forms is applied in the following way. The lower bound
is proved for a function of the form $x^{T}Ay$ where $A$ is a suitably
chosen, explicitly given, matrix over a finite field. This matrix has
the property that each $\delta n \times \delta n $ submatrix which does 
not contain
elements from the main diagonal is of rank at least $\delta^{2} n$ where
$\delta>0$ is a small constant. (This may be considered as a rigidity
property of the matrix $A$.) The input variables of the branching
program take values from the field $F$.  We pick two large sets of
independent changes on two disjoint sets of variables of at least $\delta
n$ elements. The submatrix of $A$ formed from the corresponding rows
and columns will be the matrix $B$ in the mentioned property of
quadratic forms. This way it is possible to guarantee that, roughly
speaking, under independent changes on these sets of variables, the
quadratic form cannot remain constant, which makes the lower bound
proof possible.  \nev{502}

In the present paper we will follow the same strategy for our proof
with the following improvements. We use two-way branching programs
with a single output bit which creates three new problems. (1) It is
more difficult to prove the existence of the two disjoint sets of
variables 
which admit many independent changes that leave the output of the
branching program unchanged. For this we use the machinery worked out
in \cite{A1}. (2) The explicitly given matrix of \cite{BST} is a
Sylvester matrix
over the field $F$ and so the size of the field must be
at least $n$. We need something similar for $F_{2}$, the field with
two elements. We do not give an explicit construction for such a
matrix, but a random construction which depends only on a linear
number of random bits which can be included in the input.  (3) it is
not enough for us if the rank of the submatrices of sizes 
$\delta n \times \delta n$
are $\delta^{2} n$, we need much larger ranks; what we prove will be 
$\delta |\log \delta |^{-2}n$.  \nev{215}

\subsubsection{Summary of the history of the problem} Summarizing the
historical developments about the lower bound techniques for branching
programs, we can say that there were two parallel developments.  The
first is the time segmentation method which later was supplemented by
the technique of considering changes of the values of variables on two
disjoint sets: \cite{BC},\cite{BST},\cite{A1}.  The second is the
development of the algebraic techniques about quadratic forms based on
matrices with rigidity properties, providing explicitly defined
functions which were suitable for the lower bound proof techniques
mentioned in the first direction of developments:
\cite{BRS},\cite{Tha},\cite{BST}. The present paper
uses the techniques of both of these directions.  \nev{216}

\subsection{Subsequent developments} A
preliminary version of this paper was
published in \cite{A2} containing all of
the essential elements of the proofs
presented here.  Since then, the main
result of this paper was further improved
by Beame, Saks, Sun, and Vee in \cite{BSSV}
by making the time/space lower bounds
sharper and generalizing the theorem for
the case of probabilistic branching
programs.  Their proofs
use the results and techniques of the present paper (together with
methods of different nature).  \nev{217}


\section{Overview of the proof}
\subsection{A lower bound for a nonexplicit function} 

Our proof in the present paper uses a technical lemma of the element
distinctness result.  As we have mentioned already in the
introduction, it is shown in \cite{A1} that if a function $f$ can be
computed in linear time with the given restrictions on the size then
there are two large disjoint subsets, $W_{1},W_{2}$, of the set of the
input variables and an input $\chi$ so that for each $i=1,2$ we may
change the input $\chi$ in many different ways by changing only the
values of the variables in $W_{i}$ so that the output does not change;
moreover these changes can be performed simultaneously on $W_{1}$ and
$W_{2}$ so that the output still does not change.  The ratio between
the sizes of the sets $W_{i}$ and the logarithm of the number of
changes has a crucial importance in the proofs of the present paper.
(A precise statement of this result is given in \lemref{L1} below.)
\nev{218}

We use this result to show that a quadratic form (which is \emph{not}
given explicitly) cannot be computed in linear time.  The algebraic
part of this proof (\lemref{L5}) is a theorem proved by Borodin,
Razborov, and Smolensky \cite{BRS} (and in more general forms by
Jayram \cite{Tha} and Beame, Saks, and Jayram \cite{BST}).  We
reduce the problem of giving a quadratic form with the required
properties to
a question about the ranks of the submatrices (or
minors) of the matrix generating the quadratic form in a similar way
as is done in \cite{BST}.  In both cases the goal is to get a
matrix $A$ so that each ${\lfloor}\delta n{\rfloor}$ by ${\lfloor}\delta 
n{\rfloor}$ submatrix of the
matrix $A$ has rank at least $\psi(\delta)n$, for each $\delta>0$, 
provided that
$n$ is sufficiently large with respect to $\delta$, where the function 
$\psi$
should be as large as possible.  The Sylvester matrices used in
\cite{BST} are explicitly given examples of such matrices with
$\psi(\delta)=\delta^{2}$, provided that we consider only submatrices that 
do not
contain any elements of the main diagonal.  (This restriction does not
affect the applicability of the matrix to the lower bound proof.)
\nev{219}

\subsection{Decreasing the randomness needed}

\begin{definition}
  We will call an $n\times n$ matrix $A=(a_{i,j})$ a \emph{Hankel} matrix
  if $\forall i,j,k,l\in \lbrace 0,1,{\ldots},n-1\rbrace , i+j=k+l$ 
implies
  $a_{i,j}=a_{k,l}$.  In other words $A$ is a Hankel matrix iff it is
  constant across minor diagonals.  \nev{220}
\end{definition}

\begin{remark}
  A Hankel matrix is determined by only $2n-1$ suitably chosen
  entries, e.g.  by entries of the first row and last column.  If a
  matrix is constant along all diagonals it is called a
  \emph{Toeplitz} matrix.  Reversing the ordering of the rows creates
  a one-to-one correspondence between Toeplitz matrices and Hankel
  matrices.  Therefore all of our results concerning the ranks of
  Hankel matrices remain valid for Toeplitz matrices as well.
  \nev{221}
\end{remark}

We show that if $A$ is a random $n$ by $n$ Hankel matrix over the
field with $2$ elements, with uniform distribution on the set of all
such matrices, then with high probability the described property about
the ranks of the submatrices holds with 
$\psi(\delta)=c\delta|\log \delta|^{-2}$ for an
absolute constant $c>0$.  As a consequence, using also the mentioned
lemma from \cite{A1}, we are able to show that if $\tilda$ is the
matrix that we get from $A$ by replacing each entry in the main
diagonal and above by $0$, then the quadratic form $\langle \tilda x, 
x\rangle $,
where $x$ is the input vector, cannot be computed with a branching
program of linear length and size at most $2^{\epsilon n}$.

\subsection{From a non-explicit function
to an explicit function}
Of course this is not an explicitly given function; we only know that
the lower bound holds for almost all matrices.  However, we got the
matrix by randomizing only $2n-1$ bits.  Therefore if we include these
bits in the input, then we get an explicitly given problem (with
$3n-1$ input variables, where the described tradeoff holds between the
length and size of any branching program computing the quadratic
form).  In other words, if $A(y)$, $y=\langle 
y_{0},{\ldots},y_{2n-2}\rangle $ denotes
the Hankel matrix with $a_{i,j}=y_{i+j}$, then $\langle \tilde 
A(y)x,x\rangle $
cannot be computed in the given length and size from the input $\langle 
x,y\rangle
$.

\subsection{Obtaining a lower bound for
the parity of interior sums problem}
Assume now that $A=(a_{i,j})$ is a fixed Hankel matrix so that $\langle
\tilda x,x\rangle $ cannot be computed with a branching program with the
given restrictions.  Suppose that $x= \langle 
x_{0},{\ldots},x_{n-1}\rangle $ and
$X=\lbrace i \mid x_{i}=1\rbrace $, and 
$$
D=\lbrace i+j \mid a_{i,j}=1, i,j\in
\lbrace 0,{\ldots},n-1\rbrace \rbrace \enspace.
$$
It is easy to see that $\langle \tilda x,x\rangle $ is the parity of the 
number of
all pairs $\langle i,j\rangle $, $i\in X, j\in X$ with the property $i<j$ 
and $i+j\in
D$.

This will already imply that if two subsets, $X,Y$, of the set
$\lbrace 1,2,...,2n\rbrace $ are given, then the problem of computing the
parity of the number of elements of the set of all pairs $\langle 
i,j\rangle$
with the property $i\in X$, $j\in X $, $i<j$, $i+j\in Y$ cannot be
solved by a branching program of linear length and of size at most
$2^{\epsilon n}$.
(The set $D$ defined above will play the role of $Y$.)  It will not be
difficult to make a single set from the two sets $X,Y$, by taking into
account the sizes of their elements, and so we will get that the task
of computing, given as input an $X\subseteq \lbrace 1,2...,n\rbrace $, the
parity of the number of elements of the set of all pairs $\langle 
i,j\rangle $
with the property $i\in X$, $j\in X $, $i<j$, $i+j\in X$ cannot be
accomplished by a branching program of linear length and of size at
most $2^{\epsilon n}$.   \nev{222}

  Finally we note that our results about random Hankel matrices remain
  true over any field with appropriate modifications.  (See the
  remarks after \lemref{L7}, \lemref{L8} and \lemref{L9}. See also the
  comment about the
  applicability of these modified versions to
  generalizations of \thmref{T1} in the proof of \lemref{L5}.)
  \nev{223}

\section{The reduction of the lower bound
to a problem about Hankel matrices}
\subsection{The statement of \thmref{T1}}

 In this section we reduce
the problem of giving a lower bound for the
time needed to solve the problem described
in the introduction to the existence of a
matrix $A$ which can be constructed from
$n$ bits with the property that each large
submatrix of $A$ has also relatively large
rank. \nev{224} \vskip 3 pt

\begin{definition}  If $X,Y$ are sets then
$\Func(X,Y)$ will denote the set of all
functions, defined on $X$, taking values in $Y$.
 \nev{225}
\end{definition}


A branching program as we will define below will be what is usually
called a (deterministic) Boolean or 2-way branching program indicating
that the input variables take their values from a set of size $2$.
\nev{226}

\begin{definition}
  A \emph{branching program} $\calb$ with $n$ input variables
  $x_{0},{\ldots},x_{n-1}$ is a five tuple
  $$
  \langle \calg, \start, \sink, \variable, \vvalue \rangle\enspace,
  $$
  with the following properties \nev{227}
  \begin{enumerate}
    \renewcommand{\theenumi}{(\alph{enumi})}
    \setlength{\itemsep}{0pt}
  \item $\calg$ is a finite directed acyclic graph, which may
    contain parallel edges
  \item $\start$ is the unique source node of $\calg$,
  \item $\variable$ is a function defined on the non-sink nodes
    of $\calg$ with values in the set  $\lbrace
    x_{0},{\ldots},x_{n-1}\rbrace $ of variables,
  \item $\out$ is a function defined on the set of sink nodes of
    $\calg$ with values in $\lbrace 0,1\rbrace $,
  \item $\vvalue$ is a function defined on the set of edges with
    values in $\lbrace 0,1 \rbrace $,
  \item each non-sink node has out-degree $2$, and the function
    $\vvalue$ takes different values on the two outgoing edges.
    \nev{228}
  \end{enumerate}
\end{definition}

An input for the branching program $\calb$ is a $\{0,1\}$-assignment of
the variables $x_{i}$.  (Instead of such an assignment we usually will
think about an input as a $\{0,1\}$-valued function $\eta$ defined on
$\lbrace 0,1,{\ldots},n-1\rbrace $ where $\eta(i)$ is the value of 
$x_{i}$.)
If an input is given, then starting from $\start$ we go along a path
in the graph in the following way.  When we are at a non-sink node $v$
then we look at the value of the variable $\variable(v)$ and leave the
node along the edge $e$ where the value of $\vvalue(e)$ is the same as
the value of this variable.  Since the graph is acyclic and finite,
this way we will reach a sink-node $w$. $\out(w)$ will be the output
of the branching program at the given input.  The number of edges
along the path determined this way by the input is the computational
time of the branching program at the given input.  The maximal
computational time for the set of all inputs (that is, the maximal
length of all paths arising from an input in the given way) is the
\emph{length} of the branching program.  The \emph{size} of the branching
program is the number of nodes of $\calg$.
 \nev{229}

\begin{definition}
  Assume that $X$ is a subset of $\lbrace 0,{\ldots},n-1\rbrace $.
  $N_{+}(X)$ will denote the number of all pairs $x,y\in X$, $x<y$ so
  that $x+y\in X$.
\end{definition}
The following theorem is the main result of the present paper.  It
states that the parity of the interior sums of a subset of
$0,1,{\ldots},n-1$ cannot be determined by a branching program of size
$2^{\epsilon n}$ in linear time.

\begin{theorem} \label{T1} For all positive integers $k$, 
 if $\epsilon>0$ is
  sufficiently small and $n$ is sufficiently large then there is no
  branching program $\calb$ with $n$ inputs, of length at most $kn$
  and of size at most $2^{\epsilon n}$, which for all inputs $\eta$ 
computes the
  parity of $N_{+}(X_{\eta})$ where $X_{\eta}=\lbrace i\in \lbrace
  0,1,{\ldots},n-1\rbrace \mid \eta(i)=1\rbrace \rbrace$
\end{theorem}

\subsection{Results from earlier works}
In the proof we will use the following lemma, \lemref{L1}, which is a
consequence of \lemref{AA1} proved in \cite{A1} (called Lemma~9 in
that paper).  The proof of \lemref{L1} from \lemref{AA1} is almost
identical to the proof of Theorem 4 from Lemma 9 in~\cite{A1} and does
not require any new ideas. We describe this proof (of \lemref{L1} from
\lemref{AA1}) in the last section.  \nev{230}


The remaining part of the paper, starting with \lemref{L3}, is
self-contained.  We begin with
the definitions needed to understand the statement of \lemref{L1}.

\begin{definitionlist}
\item An \emph{input} (of a branching problem with $n$ input
  variables) is a function $\chi$ defined on $\lbrace 
0,1,{\ldots},n-1\rbrace
  $ with values in $\lbrace 0,1\rbrace $.  A \emph{partial input} is a
  function $\eta$ defined on a subset of $\lbrace 0,1,{\ldots},n-1\rbrace 
$
  with values in $\lbrace 0,1\rbrace $.
  \item Assume that $\chi$ is an input and $\eta$ is a partial input. Then
    $\chi\wr \eta$ will denote the input which is identical to $\eta$ on
    $\domain(\eta)$ and identical to $\chi$ on $\domain(\chi)\backslash 
\domain(\eta)$.
    \nev{231}
  \item If $\delta\in \lbrace 0,1\rbrace $ and $\calb$ is a branching
    program, then $\calb^{-1}(\delta)$ will denote the set of all inputs
    $\eta$ so that the output of $\calb$ at input $\eta$ is $\delta$.
\end{definitionlist}

\begin{lemma} \label{L1} For all positive integers $k$, if $\sigma_{1}>0$
  is sufficiently small with respect to $k$, $\sigma_{2}>0$ is 
sufficiently
  small with respect to $\sigma_{1}$, $\epsilon>0$ is sufficiently small 
with
  respect to $\sigma_{2}$, $n$ is sufficiently large with respect to 
$\epsilon$,
  $\calb$ is a branching program with $n$ inputs of length at most
  $kn$ and of size at most $2^{\epsilon n}$, and $\delta\in \lbrace 
0,1\rbrace$ so that
  $|\calb^{-1}(\delta)|\geq 2^{n-1}$, then there exist a $\chi\in 
\calb^{-1}(\delta)$,
  $\lambda\in (\sigma_{2},\sigma_{1})$, $\mu\in (\sigma_{2},\sigma_{1})$, 
$W_{i}\subseteq \lbrace
  0,1,{\ldots},n-1\rbrace $, $i=1,2$, and sets of partial inputs $Y_{i}$,
  $i=1,2$ defined on $W_{i}$
satisfying the following conditions:

  \begin{cond} \label{G1} for all $i\in W_{1}$ and $j\in W_{2}$ we have
    $i<j$, \end{cond}

  \begin{cond} \label{G2} $|W_{1}|=|W_{2}|= \mu n$, \end{cond}

  \begin{cond} \label{G3} $|Y_{1}|, |Y_{2}|\geq 2^{\mu n-\lambda n}$, 
\end{cond}
 
  \begin{cond} \label{G4} $\mu^{1+\frac{1}{100k}}\geq 2\lambda$, 
and\end{cond}

  \begin{cond} \label{G5} for all $\eta_{1}\in Y_{1}$, $\eta_{2}\in 
Y_{2}$, we
    have $(\chi\wr \eta_{1})\wr \eta_{2}\in \calb^{-1}(\delta)$. 
\end{cond}
\end{lemma}

\subsection{Branching programs and matrix rigidity}

\begin{definition} Assume that $A$ is an $n$ by $n$ matrix over the
  field $F$ and $f$ is a real-valued function defined on $(0,1]$. We
  say that the matrix $A$ is \emph{$f$-rigid} if for each $q=1,{\ldots},n$
  and for each $q$ by $q$ submatrix $B$ of $A$ we have that the rank
  of $B$ is at least $f(\frac{q}{n})n$.  \nev{232}
\end{definition}

The proof of \thmref{T1} is based on \lemref{L3} and \lemref{L4}
described below.

\begin{lemma} \label{L3} There is a $\delta>0$ so that, for all 
$\gamma>0$, if the
  function $g(x)$ is defined by $g(x)={\delta x|\log x|^{-2} }$ if $x\in 
(\gamma,
  \frac{1}{2})$ and $g(x)=0$ otherwise, then for each sufficiently
  large positive integer $n$ there is an $n$ by $n$ Hankel matrix $A$
  over $F_{2}$, so that $A$ is $g$-rigid.
\end{lemma}

\begin{remark} It would be much easier to prove the lemma with
  $g(x)=\delta^{2}x$, though this is not enough for the present
  application.  \nev{233}
\end{remark}

We will prove \lemref{L3} in the next section;
 more precisely, we will
prove (\thmref{T2}) that a random matrix $A$ taken with uniform
distribution on the set of all Hankel matrices meets the requirements
of the Lemma with high probability.

\begin{definitionlist}
\item Assume that $\eta$ is a function with values in $\lbrace
  0,1\rbrace $ defined on $\lbrace 0,1,{\ldots},n-1\rbrace $.  $u_{\eta}$
  will denote the $n$-dimensional vector $\langle 
\eta(0),{\ldots},\eta(n-1)\rangle $
\item The inner product of the $n$-dimensional vectors $u,v$ will be
  denoted by $ u\cdot v $.  \nev{235}
\item Assume that $A=\lbrace a_{i,j}\rbrace_{i=0,j=0}^{n-1}$ is an
  $n$ by $n$ matrix. $\tilde{A}$ will denote the $n$ by $n$ matrix
  that we get from $A$ by keeping every entry of $A$ below the main
  diagonal and replacing all other entries by $0$.  In other
  words $\tilde{A}=\lbrace b_{i,j}\rbrace_{i=0,j=0}^{n-1}$, where
  $b_{i,j}=a_{i,j}$ for all $i>j$ and $b_{i,j}=0$ for all $i\leq j$,
  $i=1,{\ldots},n$, $j=1,{\ldots},n$.
\end{definitionlist}
 
\begin{lemma} \label{L4} For all positive integers $k$, if $\sigma_{1}>0$
  is sufficiently small with respect to $k$, $\sigma_{2}>0$ is 
sufficiently
  small with respect to $\sigma_{1}$, $\epsilon>0$ is sufficiently small 
with
  respect to $\sigma_{2} $, and $n$ is sufficiently large with respect to
  $\epsilon$, then the following holds.  Assume that the function $f $ is
  defined on $(0,1]$ by $f(x)=x^{1+\frac{1}{100k}}$ if $x\in (\sigma_{1},
  \sigma_{2}) $ and $f(x)=0$ otherwise.  If $A$ is an $f$-rigid $n$ by $n$
  matrix $A$ over $F_{2}$ then there is no branching program $\calb$
  with $n $ inputs of length at most $kn$ and of size at most $2^{\epsilon
    n}$ which, for all inputs $\eta$, computes $ \tilde{A} u_{\eta}\cdot 
u_{\eta} $.
\end{lemma}

\begin{remark}
  We use the matrix $\tilde{A}$ instead of $A$ in the expression $
  \tilde{A}u_{\eta} \cdot u_{\eta} $ at the conclusion of the lemma, since 
over
  a field of characteristic $2$ and for a symmetric matrix $A$, almost
  all of the terms of $ A u_{\eta}\cdot u_{\eta}$ will have $0$ 
coefficients.
  \nev{236} \vskip 5pt
\end{remark}

\begin{proof}[Proof of \lemref{L4}] Assume that, contrary to our
  statement, there is a branching program $\calb$ with the given
  properties which computes $\tilda u_{\eta} \cdot u_{\eta}$. We apply
  \lemref{L1} with the given values of $k,\sigma_{1}.
  \sigma_{2},\epsilon,n$ and with the given $\calb$.  According to
  \lemref{L1} there exist $\delta\in \lbrace 0,1\rbrace $, $\chi\in
  \calb^{-1}(\delta)$, $\lambda, \mu \in (\sigma_{2},\sigma_{1}) $, and
  $W_{i},Y_{i}$, $i=1,2$ with the properties listed in \lemref{L1}.
  Let $v=\langle v_{0},{\ldots},v_{n-1}\rangle $ be an $n$ dimensional
  vector over $F_{2}$ defined in the following way.  For all $i\notin
  W_{1} \cup W_{2}$ let $v_{i}=\chi(i)$ and for all $i\in W_{1}\cup
  W_{2}$ let $v_{i}=0$. Recall that for $i=1,2$, $Y_{i}$ is a set of
  functions from $W_{i}$ to $\lbrace 0,1\rbrace $. We define a vector
  $w^{(\xi)}=\langle w_{0}^{(\xi)},{\ldots},w_{n-1}^{(\xi)}\rangle $
  for all $\xi$ in $Y_{1}\cup Y_{2}$.  If $i\in \domain(\xi)$ then
  $w_{i}^{(\xi)}=\xi(i)$, if $i\notin \domain(\xi)$ then
  $w_{i}^{(\xi)}=0$.  Let $g_{i}$ be the following function defined on
  $Y_{i}$: for all $\xi\in Y_{i}$, $g_{i}(\xi)= \tilda (v+w^{(\xi)})
  \cdot (v+w^{(\xi)}) $.  Since the functions $g_{i}$ take at most two
  different values there are $Y_{i}'\subseteq Y_{i}$ so that
  $|Y_{i}'|\geq \frac{1}{2}|Y_{i}|$ and $g_{i}$ is constant on $Y_{i}'$
  for $i=1,2$.  Assume now that $\xi_{1}\in Y_{1}'$, $\xi_{2}\in
  Y_{2}'$ and let $\eta=(\chi\wr \xi_{1})\wr \xi_{2}$.  By
  \lemref{L1}, $\eta\in H$ and therefore
  \begin{align*}
    \tilda u_{\chi} \cdot u_{\chi} &= \tilda u_{\eta} \cdot u_{\eta} = 
\tilda
    (v+w^{(\xi_{1})}+w^{(\xi_{2})}) \cdot
    (v+w^{(\xi_{1})}+w^{(\xi_{2})})\\
    &= - \tilda v \cdot v +g_{1}(\xi_{1})+g_{2}(\xi_{2})+ \tilda 
w^{(\xi_{1})} \cdot
    w^{(\xi_{2})} + \tilda w^{(\xi_{2})} \cdot w^{(\xi_{1})}\enspace.
  \end{align*}
  $\tilda u_{\chi} \cdot u_{\chi}$ and $ \tilda v \cdot v$ do not depend 
on the
  choices of $\xi_{1},\xi_{2}$.  By the definition of $Y_{1}'$ and
  $Y_{2}'$, $g_{1}(\xi_{1})+g_{2}(\xi_{2})$ is constant on $Y_{1}'\times
  Y_{2}'$.  These facts imply that $ \tilda w^{(\xi_{1})} \cdot 
w^{(\xi_{2})}
  + \tilda w^{(\xi_{2})} \cdot w^{(\xi_{1})} $, as a function of
  $\xi_{1},\xi_{2}$, is also constant on $Y_{1}'\times Y_{2}'$. 
\condref{G1}
  and the definition of $\tilda$ implies that $ \tilda w^{(\xi_{2})} \cdot
  w^{(\xi_{1})} $ is identically $0$ on $Y_{1}'\times Y_{2}'$; therefore
  $\tilda w^{(\xi_{1})} \cdot w^{(\xi_{2})}$ is constant on $Y_{1}'\times 
Y_{2}'$.
  Let $V_{0}$ be the vectorspace all $F_{2}$-valued functions defined
  on $\lbrace 0,{\ldots},n-1\rbrace $, and let $V_{i}$, $i=1,2$, be
  the subspace of functions that vanish outside $W_{i}$. The dimension
  of $V_{i}$ is $\mu n$. We may assume that $Y_{i}, Y_{i}'\subseteq 
V_{i}$.  Let
  $\iota_{1}$ be the natural embedding of $V_{1}$ into $V_{0}$ and let
  $\pi_{2}$ be the orthogonal projection of $V_{0}$ onto $V_{2}$. $B$
  will be the linear map of $V_{1}$ into $V_{2}$ defined by
  $Bx=\pi_{2}\tilda \iota_{1}x$.  For all $\xi_{1}\in V_{1}$, $\xi_{2}\in 
V_{2}$ we
  have $ \tilda w^{(\xi_{1})} \cdot w^{(\xi_{2})} = B\xi_{1} \cdot \xi_{2} 
$. If we
  fix the bases in both $V_{1}$ and $V_{2}$ which consist of those
  functions which take the value $1$ at exactly one point and $0$
  everywhere else, then the matrix of $B$ is a submatrix of $\tilda$
  consisting of those entries whose column numbers are in $W_{1}$ and
  row numbers are in $W_{2}$.  By \condref{G1} this submatrix of
  $\tilda$ is identical to the corresponding submatrix of $A$.
  Therefore by the $f$-rigidity of $A$, the rank of $B$ is at least
  $\mu^{1+\frac{1}{100k}}n$.  We apply \lemref{L5} (below) with $V_{1}$,
  $V_{2}$, $m\to \mu n$, $X\to Y_{1}'$, $Y\to Y_{2}'$ and $B$. 
\condref{G3}
  implies that
  $$
  |Y_{i}'|\geq \frac{1}{2}|Y_{i}|\geq 2^{\mu n -\lambda n -1}\enspace.
  $$
  Therefore, according to \lemref{L5}, the fact that $ Bx \cdot y $ is
  constant on $\gamma_{1}(Y_{1})\times \gamma_{2}(Y_{2})$ implies that 
$2(\mu n-\lambda n) +
  \mu^{1+\frac{1}{100 k}}n\leq 2 \mu n$.  This is however impossible 
since, by
  \condref{G4}, $\mu^{1+\frac{1}{100k}}>2\lambda$.  \nev{237}
\end{proof}

The following lemma, in more general forms, is proved in \cite{BRS},
\cite{Tha}, \cite{BST}, and also follows from the results of
\cite{CG85}. To make the paper more self contained we provide a
proof.

\begin{lemma} \label{L5} Assume that $V_{1}$, $V_{2}$ are
  $m$-dimensional vectorspaces over the field $F_{2}$, $X\subseteq 
V_{1},Y\subseteq
  V_{2}$, $|X|\geq 2^{m_{1}}$, $|Y|\geq 2^{m_{2}}$ and $B$ is a linear map
  of $V_{1}$ into $V_{2}$ so that the rank of $B$ is at least $r$.  If
  $m_{1}+m_{2}+r>2m$ then the function $ Bx \cdot y $, $x\in X$, $y\in Y$ 
is
  not constant on $X\times Y$. \end{lemma}

\begin{proof}[Proof of \lemref{L5}]
  Let $x_{0}$ be an arbitrary but fixed element of $X$ and let
  $X'=\lbrace x-x_{0} \mid x\in X\rbrace $.  Clearly $|X|=|X'|$ and if $ 
Bx
  \cdot y $ is constant on $X\times Y$ then $ Bx \cdot y $ is
  identically $0$ on $X'\times Y$. Therefore it is enough to prove
  that the assumptions of the lemma imply that $ Bx \cdot y $ is not
  identically $0$ on $X\times Y$. Assume that, contrary to our
  assertion, it is identically $0$.  Let $H$ be the subspace in $V_{1}$
  generated by $X$ and $G$ be the subspace in $V_{2}$ generated by
  $Y$.  We have $BH \cdot G =0 $, that is, the subspaces $BH$ and $G$
  are orthogonal.  Therefore $\dim(BH)+\dim(G)\leq m$, where $\dim(W)$
  denotes the dimension of the subspace $W$.  Since the rank of $B$ is
  at least $r$ we have that $\dim(BH)\geq \dim(H)-(m-r)$.  We have
  $\dim(H)-(m-r)+\dim(G)\leq m$.  The lower bound on the sizes of the
  sets $X,Y$ imply the following lower bound on the dimensions of the
  subspaces generated by them: $\dim(H)\geq m_{1} $, $\dim(G)\geq
  m_{2}$.  This simply follows from the fact that a $d$-dimensional
  subspace has $2^{d}$ elements.  We have $m_{1}-(m-r)+m_{2}\leq m$,
  that is, $m_{1}+m_{2}-r\leq 2m$ in contradiction to our assumption.
\end{proof}
  \nev{238}

  \begin{remark} The lower bounds on $\dim(H)$ and $\dim(G)$ remain
    true even if the field has characteristic different from $2$, but
    we assume that the elements of $X$ and $Y$ have only 
    $\{0,1\}$
    coefficients in a suitably chosen basis of $V_{1}$ and $V_{2}$.  See
    Lemma 7 of \cite{Tha}.  This is important for the generalization of
    \thmref{T1} for fields with other characteristics.
  \end{remark}
 

\subsection{The proof of the main result}

\begin{proof}[Proof of \thmref{T1}]
  Assume that, contrary to our assertion, there is a branching program
  $\calb$ with the given parameters which computes the parity of
  $N_{+}(X)$.  Let $m={\lfloor}\frac{n}{10}\rfloor$.  We apply \lemref{L4} 
with
  $n\to m$, $k\to ck $, where $c$ is a sufficiently large absolute 
constant
  and $\epsilon \to \frac{\epsilon}{2}$. Assume that $\sigma_{1}$, 
$\sigma_{2}$ are picked with
  the properties described in the lemma.

  Let $g$ be the function defined in \lemref{L3}.  Applying
  \lemref{L3} with $n\to m$, $\gamma\to \sigma_{2}$ we
  get that there is an $m$ by $m$ $g$-rigid matrix $A=(a_{i,j})$ over
  $F_{2}$.  If $\sigma_{1}$ is sufficiently small with respect to
  $\delta$, then $A$ will be $f$-rigid as well.  Therefore by
  \lemref{L4} there is no branching program of size at most
  $2^{\frac{\epsilon}{2}n}$ which computes $ u_{\zeta}\tilda \cdot
  u_{\zeta} $ in time $ckn$ for all $\zeta$, where $\zeta$ is an
  $F_{2}$-valued function defined on $\lbrace 0,1,{\ldots},m-1\rbrace
  $.  Let $D=\lbrace i+j \mid \ a_{i,j}=1 \rbrace $ and
  $$
  X_{\zeta}=\lbrace i\in \lbrace 0,1,{\ldots},m-1\rbrace \mid \zeta(i)=1 
\rbrace\enspace.
  $$
  For any pair of sets of integers $X,Z$ let $N_{+}(X,Z)$ be the
  number of pairs $x,y$, $x<y$ so that $x\in X$, $y\in X$ and $x+y\in Z$.
  The statement of \lemref{L4} in our case is that the parity of
  $N_{+}(X_{\zeta},D)$ cannot be decided by a branching program with the
  given restrictions on its parameters.  We show that this problem can
  be reduced to the problem of determining the parity of
  $N_{+}(X_{\eta})$ for a suitably chosen $\eta\in \Func(n,2)$ in a way 
which
  can be implemented by a linear-time branching program.  Therefore
  our indirect hypothesis will contradict to \lemref{L4}.  $\eta$ is
  defined in the following way.

  We define first two sets $U_{1},U_{2}$.  $U_{1}=2m+X_{\zeta}$,
  $U_{2}=4m+D$.  Let $\eta$ be the unique element of $\Func(\lbrace
  0,1,{\ldots},n-1\rbrace,\lbrace 0,1\rbrace )$ so that
  $X_{\eta}=U_{1}\cup U_{2}$.  Clearly, if $x,y\in X_{\zeta}$, $x<y$, and
  $x+y\in D$ then $2m+x\in X_{\eta},2m+y\in X_{\eta}$,
  $2m+x<2m+y$, and $(2m+x)+(2m+y)\in X_{\eta}$.
  Conversely, assume
  that $z,w\in X_{\eta}$, $z<w$ and $z+w\in X_{\eta}$. It is easy to
  see that this implies $z,w\in \lbrace 2m,{\ldots},3m-1\rbrace $ and
  therefore $z-2m,w-2m\in X_{\zeta}$, $z-2m<w-2m$, and
  $(z-2m)+w(-2m)\in X_{\zeta}$. Therefore
  $N_{+}(X_{\zeta},D)=N_{+}(X_{\eta})$.  We claim that each value of
  $\eta$ can be computed in constant time by a branching program, and
  to do this the size of our program must be increased only by a
  factor of two since the extra memory needed for this step is only
  one bit.  Indeed, assume that we want to determine the value of
  $\eta(i)$ for some $i\in \lbrace 0,1, \ldots ,n-1\rbrace $. First
  the program decides whether $i\in U_{1}$ by checking whether
  $\zeta(i-2m)=1$. If not, then $\eta(i)=0$. If $\zeta(i-2m)=1$, then
  it has to decide whether $i\in U_{2}$.  Since $D$ is part of the
  input this can be decided by checking whether $i-4m\in D$.  If the
  anser is no then $\eta(i)=0$, if the answer is yes then $\eta(i)=1$.
  Therefore we have reduced the problem of determining the parity of
  $N_{+}(X_{\zeta},D)$ to the problem of determining the parity of
  $N_{+}(X_{\eta})$.
\end{proof}


\section{Random Hankel matrices}
\subsection{The statement of the result}
In this section we show that with a positive probability all large
submatrices of a random Hankel matrix have relatively large ranks.
\nev{240}

\begin{definition}
  The field with $q$ elements will be denoted by $F_{q}$.
\end{definition}

\begin{theorem} \label{T2} There exists a $c_{1}>0$ so that, for all
  $c_{2}>0$, if $n$ is sufficiently large then the following holds:
  Assume that $A=\lbrace a_{i,j}\rbrace$, $i=0,{\ldots},n-1$, $ 
j=0,{\ldots},n-1
  $ is a random $n$ by $n$ Hankel matrix over $F_{2}$, taken with
  uniform distribution on the set of all such matrices.  Then with a
  probability greater than $\frac{1}{2}$, $A$ has the following
  property:

  \begin{cond} \label{hegy} Suppose $S=\lbrace
    s_{0},{\ldots},s_{q-1}\rbrace$, $T=\lbrace 
t_{0},{\ldots},t_{q-1}\rbrace $
    are subsets of $\lbrace 0,{\ldots},n-1\rbrace $ with $q$ elements,
    where $c_{2}n<q<\frac{n}{2}$, and $B_{S,T}=(a_{s_{i},t_{j}})$, $
    i=0,{\ldots},q-1,\ j=0,{\ldots},q-1$ is the submatrix of $A$ 
consisting of
    those entries whose row numbers are in $S$ and column numbers are
    in $T$.  Then the rank of $B_{S,T}$ is at least
    $c_{1} |\log(\frac{q}{n})|^{-2}q$.  \nev{241} \end{cond}
\end{theorem}

\subsection{Sketch of the proof}
\subsubsection{A natural but
unsuccessful attempt} The
most
natural way to prove the statement of the
theorem would be the following.  Assume
that the sets $S,T$ are fixed.  We give an
upper bound $M$ on the probability
$p_{S,T}$
of the event that, for the randomization of
$A$, the matrix $B_{S,T}$ defined for the
fixed
sets $S$ and $T$ has rank smaller than
$c_{1}|\log (\frac{q}{n})|^{-2}$.  If $M$
multiplied by the number of choices for the
pair $\langle S,T \rangle $ is smaller than
$\frac{1}{2}$ then the assertion of the theorem
clearly holds.

Unfortunately a proof of this type cannot work. Indeed if $q=cn$ then
the number of pairs $\langle S,T\rangle $ is about $2^{2c(\log 
\frac{1}{c})n}$. On
the other hand for a fixed pair $S,T$, in the worst case, the number
of minor diagonals of $A$ intersected by $S \times T$ can be as small as
$2cn-1$.  Each of the choices of $0$s and $1$s in $A$ on these
diagonals are equally probable so the probability that we get rank
smaller than $c_{1}|\log (\frac{q}{n})|^{-2}$ is at least
$2^{-2cn+1}$. (It is not $0$ since, e.g. the $0$ matrix has such a
small rank.) Since the absolute value of the exponent in the number of
pairs is greater by a factor of $|\log \frac{1}{c}|$ than in the upper
bound $M$, the product cannot be smaller than $\frac{1}{2}$ if $c$ is
a small constant. \nev{242}

\subsubsection{Reducing the number of
relevant submatrices}

The main problem with the argument described above was that the number
of pairs $\langle S,T\rangle $ is too large compared to the number of 
relevant
minor diagonals. Let $\cals $ be the set of these pairs, that is, the
set of all pairs $\langle S,T\rangle $ so that $S,T\subseteq \lbrace 
0,1,{\ldots},n-1\rbrace $
and $|S|=|T|=q$. We will be able to avoid the mentioned difficulty in
the following way. Instead of working with the elements of $\cals$, we
will consider a smaller set $\cals'$ consisting of pairs $\langle 
S',T'\rangle $
so that $|S'|\leq q$, $|T'|\leq q$ and with the property that for all 
$\langle
S,T\rangle \in \cals$ there is a $\langle S',T'\rangle \in \cals'$ so that 
$S'\subseteq S$ and $T'\subseteq
T$. (We will refer to this property by saying that $\cals'$ is
\emph{dense} in $\cals$.)

It is enough to show that for all $\langle S', T' \rangle \in \cals'$ the 
rank of
$B_{S',T'}$ is at least $c_{1}|\log(\frac{q}{n})|^{-2}q$.  Indeed,
since $\cals'$ is dense in $\cals$, each matrix $B_{S,T}$ with $\langle 
S,T\rangle
\in \cals$ has a submatrix $B_{S',T'}$ with $\langle S',T'\rangle \in 
\cals $ and so
$\rank(B_{S,T})$ $\geq \rank(B_{S',T'}) \geq $ $
c_{1}|\log(\frac{q}{n})|^{-2}q$.  \nev{243}

We will define the set $\cals'$ by constructing a function $\calf$
defined on $\cals$ so that for each $\langle S,T\rangle \in \cals$, 
$\calf(\langle S,T\rangle
)=\langle S',T'\rangle $ with $S'\subseteq S $, $T'\subseteq T$. Clearly 
if such an $\calf$ is
given and $\cals' = \lbrace \calf(\langle S,T\rangle)\ |\ \langle 
S,T\rangle\in \cals \rbrace $,
then $\cals'$ is dense in $\cals$.  We also have to make sure that
$|\cals'|$ is small and that we are able to give a good upper bound,
for each fixed $\langle S',T'\rangle \in \cals' $, on the probability that 
the rank
of the matrix $B_{S',T'}$ is smaller than $c_{1}|\log
(\frac{q}{n})|^{-2}$.

\subsubsection{The rank of an enlarged
submatrix}
First we describe our method of estimating the probability that the
rank of a submatrix $B_{S,T}$ of $A$ is small for a fixed pair $\langle 
S,T\rangle
$. This will be based on the following observation. Assume that a pair
$\langle S,T \rangle$, $S,T\subseteq \lbrace 0,1,{\ldots},n-1\rbrace $, is 
fixed and $s> \max\
S=\max_{x\in S}x$, $t> \max\ T$, $S_{1}=S\cup \lbrace s\rbrace $, and
$T_{1}=T\cup \lbrace t\rbrace $. Then with probability at least
$\frac{1}{2}$ for the randomization of $A$ we have that the rank of
$B_{S_{1}, T_{1}}$ is strictly greater than the rank of $B_{S,T}$.  We
will prove this statement in the following way. For all
$k=0,1,{\ldots},n-1$, let $D_{k}$ be the minor diagonal of $A$ containing
the entries $a_{i,j}$ with $i+j=k$.  We show that if the values of the
entries of $A$ are fixed on all minor diagonals $D_{k}$ with $k<s+t$,
then out of the two possible definitions of $A$ on the minor diagonal
$D_{s+t}$, at least one will yield a matrix $B_{S_{1},T_{1}}$ with the
property that $\rank (B_{S_{1},T_{1}})> \rank (B_{S,T})$. The proof of
this fact is a simple argument in linear algebra as described in the
proof of \lemref{L7}. \nev{244}


\lemref{L7} itself is a slight generalization of this assertion,
stating that if we add not a single new element to $S$ and another
single element to $T$, but a set of new elements $\tilde S$ to $S$ so
that $\max\ S<\min\ \tilde S$, and a set of new elements $\tilde T$ to
$T$ so that $\max\ T < \min\ \tilde T$ then the resulting enlarged
sets $S_{1}=S\cup \tilde S$, $T _{1}=T\cup \tilde T $ have the following
property.  If the values of the entries of $A$ are fixed on all minor
diagonals $D_{k}$ with $k\leq \max\ S + \max\ T $, then for the
randomization of $A$ on the remaining minor diagonals we have that,
with probability at least $1-2^{-|\tilde S +\tilde T|}$, $\rank
(B_{S_{1},T_{1}})> \rank (B_{S,T})$.  Indeed, there are $|\tilde S +
\tilde T|$ minor diagonals which contain an entry $a_{s,t}$ of $A$
with $s\in \tilde S$ and $t\in \tilde T$. According to the already
described special case the randomization of the values of the entries
on each of these diagonals will lead to the required increase of the
rank with a probability of at least $\frac{1}{2}$. Since these
randomizations are independent we get that the rank increases with
probability at least $1-2^{-|\tilde S +\tilde T|}$.  \nev{245}

\subsubsection{Partitioning the rows and
columns}
What we have done so far is only good for estimating the probability
of $\rank(B_{S_{1},T_{1}})>\rank(B_{S,T})$ for some pairs $\langle 
S,T\rangle $,
$\langle S_{1},T_{1}\rangle $ where $S\subseteq S_{1}$, $T\subseteq 
T_{1}$.  To get a lower bound
on the probability of $\rank(B_{S,T})>R$ for some integer $R$, we will
partition $S$ into subsets $S_{1},{\ldots},S_{l}$ and $T$ into subsets
$T_{1},{\ldots},T_{l}$ so that $\max\ S_{i}<\min\ S_{i+1}$ and $\max\ 
T_{i}
< \min\ T_{i+1} $ for all $i=1,{\ldots},l-1$. If $\rank (B_{S,T})\leq 
R=l-r$
then there must be at least $r$ distinct elements $i$ of $ \lbrace
1,{\ldots},l\rbrace $ with the property 
$\rank(B_{\Gamma_{i},\Lambda_{i}})= \rank
(B_{\Gamma_{i+1},\Lambda_{i+1}})$, where $\Gamma_{j}=S_{1}\cup 
{\ldots}\cup S_{j}$ and $\Lambda
_{j}=T_{1}\cup {\ldots}\cup T_{j}$ for $j=1,{\ldots},l$. We will denote by 
$E$ the set
of all integers $i\in \lbrace 1,{\ldots},l\rbrace $ with this property. 
Then

\begin{cond} \label{E2} the probability of the event that we get
  equalities for every element of this set is at most
  $$
  2^{-\sum_{i\in  E}(|S_{i}+T_{i}|)}\enspace.
  $$
\end{cond}

If we add these upper bounds for all of the possible choices for $E$,
that is for all subsets of $\lbrace 1,{\ldots},l\rbrace $ with $r$
elements, then we get an upper bound on the probability of
$\rank(B_{S,T})\leq l-r$.  (This upper bound is formulated in a
slightly more general form in \lemref{L8}.)  We will use this estimate
for each fixed choice for $\langle S,T\rangle \in \cals' $ with
suitable choices of the partitions $S_{1},{\ldots},S_{l}$,
$T_{1},{\ldots},T_{l}$.  \nev{246}

\subsubsection{The choice of the partitions and the submatrices} Our
remaining task is to define the function $\calf$ so that
$$
\cals'= \lbrace \calf(\langle X_{1},X_{2}\rangle ) \mid \langle 
X_{1},X_{2}\rangle \in \cals
\rbrace
$$
is dense in $\cals$, select a pair of partitions for each $\langle 
S,T\rangle\in
\cals' $, and then add the corresponding upper bounds (with
$R=c_{1}q|\log (\frac{q}{n}) |^{-2}$) for each $\langle S,T\rangle \in 
\cals'$.  The
upper bounds will not depend on the choice of $\langle S,T\rangle \in 
\cals'$, so we
will have to prove that the common upper bound multiplied by
$|\cals'|$ is at most $\frac{1}{2}$.

When we define $\calf(\langle X_{1},X_{2}\rangle ) $ for some $\langle 
X_{1},X_{2}\rangle \in
\cals$, we will have already in mind the task of choosing suitable
partitions of $S$ and $T$, where $\langle S,T\rangle =\calf(X_{1},X_{2})$. 
 We
give here a somewhat simplified definition of $\calf$, the final
definition will be provided in the proof of \lemref{L9}. Let $t$ be a
positive integer which is a large constant. We assume now, for the sake
of simplicity, that $t^{2}|q$.  For $j=1,2$ we partition $X_{j}$ into
$\frac{q}{t^{2}}$ subsets $K^{(j)}_{1},{\ldots},K^{(j)}_{q/t^{2}}$ each
containing exactly $t^{2}$ elements so that $\max\ K^{(j)}_{i} <\min\
K^{(j)}_{i+1}$ for $i=1,{\ldots},\frac{q}{t^{2}}-1$. Clearly these
properties uniquely determine both partitions.  \nev{247}

For each fixed $i=1,{\ldots}, q/t^{2}$ we pick sets
$J^{(j)}_{i}\subseteq K^{(j)}_{i}$, $j=1,2$, with exactly $t$ elements
so that $|J^{(1)}_{i}+ J^{(2)}_{i}|$ is maximal. Since the sets
$J^{(j)}_{i}$ have $t$ elements this maximum is at most $t^{2}$. We
will show (\lemref{L6}) that, since we pick the sets $J^{(j)}_{i}$
from sets of size $t^{2}$, this upper bound can be can be attained,
and so $|J^{(1)}_{i}+ J^{(2)}_{i}|=t^{2}$ for all
$i=1,2,{\ldots},\frac{q}{t^{2}}$.  Let 
$$
Z_{j}=\bigcup_{i=1}^{q/ t^{2}}
J^{(j)}_{i}
$$
for $j=1,2$. Now we define $\calf$ by $\calf(\lbrace X_{1},X_{2}
\rbrace )= \langle Z_{1},Z_{2}\rangle $.  For $j=1,2$ we will use the 
partition
$J^{(j)}_{1},{\ldots}, J^{(j)}_{q/t^{2}} $ of the set $Z_{j}$ when
estimating $\prob(\rank (B_{Z_{1},Z_{2}}) \leq c_{1}|\log
(\frac{q}{n})|^{-2}q )$.  The inequality of \condref{E2} gives a good
upper bound which can be easily evaluated (as a function of $q,t$ and
$n$) since in the exponents the value of the expression
$-(|J^{(1)}_{i}+J^{(2)}_{i}|)$ is $t^{2}$, and the number of
exceptional sets $E$ can be also estimated without any problems.
Finally the number of possible pairs $\langle Z_{1},Z_{2}\rangle$ is at 
most
$\binom{n}{q/t^{2}}^{2}$.  These estimates lead to the conclusion of
the theorem.  \nev{248}

\subsubsection{Why did it work?}  {From} the description of the
necessary estimates at the end of the last paragraph it is not clear
what made it possible to get a good enough upper bound on the
probabilities $\prob(\rank(B_{Z_{1},Z_{2}})\leq R)$, where $R= c_{1}|\log
(\frac{q}{n})|^{-2}q $, compared to the number of pairs $\langle 
Z_{1},Z_{2}\rangle
$. It is true that the number of pairs became smaller, since the sizes
of the sets $Z_{j}$ are smaller by a factor of $t$ then the sizes of
the sets $X_{j}$, but for smaller sets the upper bounds on the
probabilities can be larger. Why is it that we gained more on the
number of sets then lost on the upper bounds on the probabilities?

The answer is that the upper bound did not depend on the sizes of the
sets $Z_{i}$ but depended only on the common size of the sets
$J^{(1)}_{i}+J^{(2)}_{i}$ which was $t^{2}$. The corresponding
quantity for the pair $\langle X_{1},X_{2}\rangle $ is
$|K^{(1)}_{i}+K^{(2)}_{i}|$.  Since we do not have any assumption
about the sets $K^{(1)}_{i},K^{(2)}_{i} $ other than that their sizes
are $t^{2}$, in the worst case $|K^{(1)}_{i}+K^{(2)}_{i}|$ can be as
small as $2t^{2}$. Therefore, although the sizes of the sets went down
by a factor of $t$, the critical quantity in the upper bounds remained
essentially unchanged. This guaranteed that we won more on decreasing
the number of pairs then we lost on increasing the upper bound of the
probabilities. To formulate the same phenomenon in the language of
minor diagonals we may say that: although the ratio of the sizes of
the sets $X_{j}$ and $Z_{j}$ is $t$, if we consider the number of
minor diagonals intersecting the subsets $K^{(1)}_{i}\times K^{(2)}_{i}$
resp.  $J^{(1)}_{i}\times J^{(2)}_{i}$ the ratio, at least in certain
cases, is at most $2$.  \nev{249}

This completes the sketch of the proof of the theorem. In the
remaining part of the section we gave a detailed proof of the
mentioned lemmata and the theorem.

\subsection{The proof of \thmref{T2}}

\begin{lemma} \label{L6} Assume that $t$ is a positive integer and
  $U,V$ are sets of integers with $|U|=|V|=t^{2}$.  Then there are
  $U'\subseteq U$, $V'\subseteq V$, so that $|U'|=|V'|=t$ and
  $|U'+V'|= t^{2}$. \end{lemma}

\begin{remark} As the proof will show, the
lemma remains true if we replace
 $|U|=|V|=t^{2}$ by the weaker
 assumption $|U|=|V|= t^{2}-t+1$.
\end{remark}

\begin{proof}[Proof of \lemref{L6}] We have to select the subsets
$U',V'$ of $U$ and $V$ so that each has exactly $t$ elements and all
of the $t^{2}$ sums $u+v$, $u\in U'$, $v\in V'$ are different.
Suppose that this does not hold for some selection of $U',V'$, that is
$u+v=\bar u+ \bar v$ for some suitably chosen $u, \bar u\in U'$, $v,
\bar v\in V'$. This would imply that $u- \bar u=\bar v - v$ and so the
sets $(U'-U')_{+}$ and $(V'-V')_{+}$ are not disjoint, where for a set
of integers $X$, $(X)_{+}=\lbrace x\in X \ |\ x>0\rbrace $.  Therefore
it is sufficient (and also necessary) to prove that there exist
$U'\subseteq U$, $V'\subseteq V$ so that $|U'|=|V'|=t$ and
$(U'-U')_{+}\cap (V'-V')_{+}=\emptyset$.  \nev{250}

Let $U=\lbrace u_{0},{\ldots},u_{t^{2}-1}\rbrace $, $V=\lbrace
v_{0},{\ldots},v_{t^{2}-1}\rbrace $ so that $u_{0}<{\ldots}<u_{t^{2}-1}$ 
and
$v_{0}<{\ldots}<v_{t^{2}-1}$. We define the integers $m_{u},m_{v}$ by
$$m_{u}=\min\ \lbrace u_{i+t-1}-u_{i}\ |\ i=0,1,..,t^{2}-t\rbrace
\qquad \text{and}\qquad 
m_{v}=\min\ \lbrace v_{i+t-1}-v_{i}\ |\
i=0,1,..,t^{2}-t\rbrace\enspace.
$$
Suppose that, e.g. $m_{u}\leq m_{v}$, and let $s$ be an integer with
$m_{u}= u_{s+t-1}-u_{s}$. We claim that the choice $U'=\lbrace
u_{s},u_{s+1},{\ldots},u_{s+t-1}\rbrace $, $V'=\lbrace v_{jt} \ |\
j=0,1,{\ldots},t-1\rbrace $ meets our requirements. Indeed, if
$v_{jt},v_{kt}\in V'$ and $j<k$ then $v_{kt}-v_{jt}\geq $ $ v_{(j+1)t}
-v_{jt}> $ $v_{jt+t-1}-v_{jt}\geq m_{v}\geq m_{u}$, and therefore each
element of $(V'-V')_{+}$ is strictly greater than $m_{u}$. On the
other hand $U'\subseteq [u_{s},u_{s+t-1}]=$ $[u_{s},u_{s}+m_{u}] $; 
therefore
$(U'-U')_{+}$ contains only integers not greater than $m_{u}$.
Consequently $(U'-U')_{+} \cap (V'-V')_{+} =\emptyset$.
\end{proof}

\begin{definitionlist}
\item $\func(n,2)$ will denote the set of all functions defined on
  $\lbrace 0, {\ldots}, n-1\rbrace $ with values in $F_{2}$.
  Similarly, 
$\func([l,n),2)$ will denote the set of all functions defined on the
  interval $[l,n)=\lbrace l,{\ldots}, n-1\rbrace $ with values in $F_{2}$.
  \nev{251}
\item Assume that $n_{1},n_{2}$ are positive integers and $f\in
  \func(n_{1}+n_{2}-1,2)$. Then $\diag(f,n_{1},n_{2})$ will be the
  $n_{1}$ by $n_{2}$ matrix $(d_{i,j})$, $i=0,{\ldots},n_{1}-1$,
  $j=0,1,{\ldots},n_{2}-1$, where $d_{i,j}=f(i+j)$.
\item Assume that $n_{1},n_{2},k_{1},k_{2}$, are positive integers,
  $n_{1}>k_{1}$, $n_{2}>k_{2}$, $f\in \func(k_{1}+k_{2}-1,2)$, and $g$ is
  taken with uniform distribution from the set
  $\func([k_{1}+k_{2},n_{1}+n_{2}-1),2)$.  $\Phi(n_{1},n_{2},f)$ will be
  a random variable whose value is $\diag(f\cup g,n_{1},n_{2})$ (where
  $f\cup g$ is the unique common extension of $f$ and $g$ to
  $[0,n_{1}+n_{2}-1)$). $\Phi(n_{1},n_{2})$ will denote the random
  variable whose value is $\diag(h,n_{1},n_{2})$ where $h$ is taken
  with uniform distribution from the set $\func(n_{1}+n_{2}-1,2)$.
\item Suppose $A=(a_{i,j})$, $i=0,{\ldots},n_{1}-1$, 
$j=0,1,{\ldots},n_{2}-1$,
  is an $n_{1}$ by $n_{2}$ matrix and $S\subseteq \lbrace
  0,1,{\ldots},n_{1}-1\rbrace $, $T\subseteq \lbrace 
0,1,{\ldots},n_{2}-1\rbrace $.
  Then $\sub(A,S,T)$ will denote the $|S|$ by $|T|$ matrix consisting
  of those entries of $A$ which have row numbers in $S$ and in column
  numbers in $T$.  \nev{252}
\end{definitionlist}

\begin{lemma} \label{L7} Assume that $n_{1}$, $n_{2}$, $k_{1}$, and 
$k_{2}$ are
  positive integers, $k_{1}<n_{1}$, $k_{2}<n_{2}$, $f$ is a function
  on $\lbrace 0,1,{\ldots},k_{1}+k_{2}-1\rbrace $ with values in $F_{2}$,
  $S\subseteq \lbrace 0,1,{\ldots},n_{1}-1\rbrace $, $T\subseteq \lbrace
  0,1,{\ldots},n_{2}-1\rbrace $, and 
  $$
  |(S\cap \lbrace k_{1},{\ldots},n_{1}-1\rbrace)
  +(T\cap \lbrace k_{2},{\ldots},n_{2}-1\rbrace)|\geq m \enspace.
  $$
  Then with probability at least $1-2^{-m}$ the following holds: the
  rank of the matrix $\sub(\Phi(n_{1},n_{2},f),S,T)$ is greater than the
  rank of the matrix
  $$
  \sub(\Phi(n_{1},n_{2},f),S\cap \lbrace 0,1,{\ldots},k_{1}-1 \rbrace
  ,T\cap \lbrace 0,1,{\ldots},k_{2}-1 \rbrace )\enspace.
  $$
\end{lemma}

\begin{remark}\label{remark:L7-generalization}
  If we define random Hankel matrices over an arbitrary field $F$ so
  that the random entries of the Hankel matrices are picked from a
  finite subset $D$ of $F$ with uniform distribution, then our Lemma
  remains true if we substitute $1-|D|^{-m}$ for the probability
  $1-2^{-m}$.  (Naturally we also have to modify the definition on
  $\Phi(n_{1},n_{2},f)$, since in this case $f$ is a function whose
  values are in the set $D$.) \nev{253}
\end{remark}

\begin{proof}[Proof of \lemref{L7}]
  Let $\Phi(n_{1},n_{2},f)= (\phi_{i,j})$, $i=0,{\ldots},n_{1}-1$,
  $j=0,{\ldots},n_{2}-1$.  For each $\ell=0,1,2,{\ldots}$ let 
$S_{\ell}=S\cap \lbrace
  0,1,{\ldots},\ell\rbrace $, $T_{j}=T\cap \lbrace 
0,1,{\ldots},\ell\rbrace $. For each
  $i\in S$, $j\in T$, $w_{i,j}$ will be a function defined on $T_{j}$ by
  $w_{i,j}(x)=\phi_{i,x}$ for all $x\in T_{j}$. Let $r$ be rank of the
  matrix $\sub(\Phi(n_{1},n_{2},f),S_{k_{1}-1},T_{k_{2}-1})$.  $r$ is the
  dimension of the vectorspace generated by the functions
  $w_{i,k_{2}-1}$, $i\in S_{k_{1}-1}$.  Suppose that $\bar{S}\subseteq
  S_{k_{1}-1}$, $|\bar{S}|=r$ so that the set of functions $W=\lbrace
  w_{i,k_{2}-1} \mid i\in \bar{S}\rbrace$ are linearly independent.

  According to the definition of $\Phi(n_{1},n_{2},f)$, we have to
  randomize a function $g$ with values in $F_{2}$ which is defined on
  the interval $[k_{1}+k_{2},n_{1}+n_{2}-1)$.  We randomize the values
  of $g$ sequentially for each $x\in [k_{1}+k_{2},n_{1}+n_{2}-1)\cap
  (S+T)$.  Assume that $x\in [k_{1}+k_{2},n_{1}+n_{2}-1)] $ and $g(y)$
  has been randomized already for all $y<x$.  Suppose that for a
  suitably chosen $i\in S\cap \lbrace k_{1},{\ldots},n_{1}-1\rbrace$
  and $j\in T\cap\lbrace k_{2},{\ldots},n_{2}-1\rbrace $ we have
  $i+j=x$.  By the assumption of the lemma this will happen for at
  least $m$ different values of $x$.  Therefore, it is enough to show
  that for such an $x$ the following holds with a probability at
  least $\frac{1}{2}$: the function $w_{i,j}$ is linearly independent
  from the set of functions $H=\lbrace w_{l,j}|l\in \bar{S}\rbrace $.
  (Such an independence obviously implies that the rank of the matrix
  $\sub(\Phi(n_{1},n_{2},f),S,T)$ is greater than $|\bar{S}|=r$.)
  Before the randomization of $g(x)$ the function $w_{i,j}$ is known
  in every point of $T_{j}$ with the exception of $j$.  Since there
  are two possibilities for the value of $w_{i,j}$ at $j$, we have two
  functions $u,v$; hence for the randomization of $g(x)$ we have
  $P(w_{i,j}=u)=P(w_{i,j}=v)=\frac{1}{2}$.
  Consequently, it is enough to show that at least one of the two
  vectors $u,v$ is linearly independent from the set $H$.  Indeed, if
  both are linearly dependent, then their difference is also linearly
  dependent on them, that is, $u-v=\sum_{s\in \bar{S}}\gamma_{s}w_{s,j}$ 
where
  $\gamma_{s}\not=0$ for at least one $s\in \bar{S}$.  We show that this 
is
  impossible.  Indeed, $u-v$ is a function on $T_{j}$ which is zero
  everywhere but at $j$ and $(u-v)(j)=1$.  Consequently $j\geq k_{2}$
  implies that the restriction of $u-v$ to $T_{k_{2}-1}$ is $0$.
  Therefore we get that $\sum_{s\in \bar{S}\gamma_{s}}\gamma_{s} 
w_{s,k_{2}}=0$.  The
  functions $w_{s,k_{2}}$ are linearly independent so we have
  $\gamma_{s}=0$ for all $s\in \bar{S}$, in contradiction to our 
assumption.
  \nev{254}
\end{proof}


\begin{lemma} \label{L8} Assume that for each $j=1,2$,
  $I^{(j)}_{1},{\ldots},I^{(j)}_{l}$, is a partition of the interval
  $[0,{\ldots},n)$ into pairwise disjoint subintervals, 
$S^{(1)},S^{(2)}\subseteq
  \lbrace 0,1,{\ldots},n-1\rbrace $, and $|(S^{(1)}\cap 
I^{(1)}_{i})+(S^{(2)}\cap
  I^{(2)}_{i})|\geq m_{i}$ for all $i=1,{\ldots},l$.  Then, for any 
positive
  integer $r$, the probability that the rank of
  $\sub(\Phi(n,n),S^{(1)},S^{(2)})$ is not greater than $l-r$ is at most
%%  $$
%%  \sum \lbrace 2^{-m_{i_{1}}-{\ldots}-m_{i_{r}}} |1\leq
%%  i_{1}<{\ldots}<i_{r}\leq l\rbrace\enspace.
%%  $$
%% ToC edit: AUTHOR QUESTION. May I replace this with:
  $$
  \sum_{1\leq i_{1}< \cdots < i_{r}\leq l} 
2^{-m_{i_{1}}-{\ldots}-m_{i_{r}}}\enspace.
  $$
\end{lemma}  %%% Lemma 4.7

\begin{remark}
  If we define the random Hankel matrix $\Phi(n,n)$ over an arbitrary
  field $F$ in the way described in \remref{remark:L7-generalization}
  (just after \lemref{L7}), then our lemma remains true if we
  substitute $|D|^{-m_{i_{1}}-{\ldots}-m_{i_{r}}}$ for
  $2^{-m_{i_{1}}-{\ldots}-m_{i_{r}}}$ in the last expression of the lemma.
  \nev{255}
\end{remark}

\begin{proof}[Proof of \lemref{L8}]
  Assume that for all $j=1,2$, $x\in I_{i}^{(j)}$, $y\in
  I_{i+1}^{(j)}$ implies $x<y$, and assume further that for all
  $j=1,2$, $i=1,{\ldots},l$, $I_{i}^{(j)}=[b_{j,i},b_{j,i+1})$.  Let
  $$
  S_{i}^{(j)}= S^{(j)}\cap \bigcup_{k=1}^{i}I_{k}^{(1)}=S^{(j)}\cap
  [0,b_{j,i+1})\enspace,
  $$
  and let $\Phi_{i}= \sub(\Phi(n,n), S_{i}^{(1)},S_{i}^{(2)})$. If the 
rank
  of $X=\sub(\Phi(n,n),S^{(1)},S^{(2)})$ is not greater than $l-r$ then
  there are $r$ integers $1\leq i_{1}<{\ldots}<i_{r}\leq l$ so that the 
rank of
  $\Phi_{i_{t}}$ and $\Phi_{i_{t}+1}$ is the same for $t=1,{\ldots},r$. We 
show
  that for each fixed $i_{1},{\ldots},i_{r}$ the probability of this event
  is at most $2^{-m_{i_{1}}-{\ldots}-m_{i_{r}}}$ which clearly implies the
  statement of the lemma. Suppose that $i_{1},{\ldots},i_{r}$ are fixed.
  According to the definition of $\Phi(n,n)$ we randomize an $h\in
  \func(2n-1,2)$. We pick the values of $h$ on $[2n-1,2)$
  sequentially. Assume that for some $t\in \lbrace 1,{\ldots},r\rbrace $ 
the
  values of $h(0),{\ldots},h(b_{i_{t},1}+ b_{i_{t},2} )-1$ have been 
already
  fixed. We define a function $f$ on the set $\lbrace 0,{\ldots},
  h(b_{i_{t},1}+ b_{i_{t},2} )-1\rbrace$ by $f(y)=h(y)$ for all
  $y=0,{\ldots}, h(b_{i_{t},1}+ b_{i_{t},2} )-1 $. Now we randomize the
  values of $h(x)$ for all $x=b_{i_{t,1}}+b_{i_{t,2}},{\ldots},
  b_{i_{t+1,1}}+b_{i_{t+1,2}}-1 $. We apply Lemma \ref{L7} for this
  part of the randomization with $n_{j}\to b_{t+1,j}$, $k_{j}\to
  b_{i_{t},j}$ for $j=1,2$, $S\to S_{t}^{(1)}$, $T\to S_{t}^{(2)}$, $m\to
  m_{i_{t}}$, and for the function $f$ defined above. We get that the
  probability of the event $\rank(\Phi_{i_{t-1}})=\rank(\Phi_{i_{t}})$ is
  less than $2^{-m_{t}}$. This implies that the probability that 
  $\rank(\Phi_{i_{t-1}})=\rank(\Phi_{i_{t}})$ for all $t=1,{\ldots},r$ is 
at
  most $2^{-m_{1}-{\ldots}-m_{r}}$. \nev{256}
\end{proof}

The following lemma will be used to give an estimate on the
probability that the rank of a matrix $\sub(A,S,T)$ is at least $R$
where the sets $S,T\subseteq \lbrace 0,1,{\ldots},n-1\rbrace $ are fixed 
and $A$ is
a random Hankel matrix over $F_{2}$.  Since the statement of the lemma
depends on many parameters we restate their roles as described in the
``sketch of the proof" of \thmref{T2}.  The sizes of the sets $S,T$
are the same: $|S|=|T|=q$.  We may think of $t$ as a large constant,
although the lemma requires only $t^{2}<q$.  In the ``sketch of the
proof" (using the notation $S\to X_{1}, T\to X_{2}$) we made the
simplifying assumption that $t^{2}|q$, and partitioned both $S$ and $T$
into $\frac{q}{t^{2}}$ subsets each of size $t^{2}$.  Without the
assumption $q^{2}|t$, we will take first subsets of both $S$ and $T$
with $t^{2}\lfloor \frac{q}{t^{2}} \rfloor $ elements and partition these 
subsets
into classes each of size $t^{2}$.  Therefore $Q= \lfloor \frac{q}{t^{2}} 
\rfloor
$ of the lemma refers to the number of classes in these partitions.
In the estimate given in the lemma the factor $2^{-(Q-R+1)t^{2}}$ is
an upper bound on the probability that if we select $R-1$ pairs of
corresponding classes from the two partitions, then the remaining ones
do not increase the rank of $\sub(A,S,T)$ in the sense explained in
the ``sketch".  The factor $\binom{Q}{Q-R+1}$ is the number of ways
that we can select these $R-1$ pairs from the $Q$ pairs of classes.
The factor $\binom{n}{Qt}^{2}$ has the following meaning.  In the
proof we consider all of the pairs of subsets $S'\subseteq S$, $T' 
\subseteq S$ and
estimate the probability that for at least one of them the rank of
$\sub(A,S',T')$ will be $R$ or greater.  Then we multiply this
estimate by the number of possible pairs of sets $S',T'$.  Since we
select $S',T'$ with $|S'|=|T'|=Qt$ (they contain exactly $t$ elements
from each class) the number of possible selections is at most
$\binom{n}{Qt}^{2}$.  \nev{257}


\begin{lemma} \label{L9} Assume that $n,q,R,t$ are positive integers,
  $t^{2}<q<n$ and $R<\lfloor \frac{q}{t^{2}} \rfloor$.  Suppose further
  that $A=\lbrace a_{i,j}\rbrace$, $i=0,{\ldots},n-1$, $
  j=0,{\ldots},n-1 $ is a random $n$ by $n$ Hankel matrix over
  $F_{2}$, taken with uniform distribution on the set of all such
  matrices.  Let $p$ be the probability of the following event:
 
  \begin{cond} \label{hketto} for all $S\subseteq [0,n), T\subseteq
    [0,n) $, $|S|=|T|=q$ the rank of the matrix $\sub(A,S,T)$ is at
    least $R$.
 
    Then
    $$
    p\geq 1-\binom{n}{Qt}^{2} \binom{Q}{Q-R+1}2^{-(Q-R+1)t^{2}}
    $$
    where $Q=\lfloor \frac{q}{t^{2}} \rfloor$.
  \end{cond}
\end{lemma}

\begin{remark} This lemma also remains true with some modifications
  over an arbitrary field if we randomize the Hankel matrix $A$
  according to the distribution described in the remark after
  \lemref{L7}.  Namely we have to substitute $|D|^{- (Q-R+1)t^{2}}$
  for $2^{- (Q-R+1)t^{2}}$ in the last expression of the lemma.
  \nev{258}
\end{remark}

\begin{proof}[Proof of \lemref{L9}] We will define a function
  $\calf$ on the set of all ordered pairs $\langle X_{1},X_{2}\rangle
  $ with $X_{j}\subseteq \lbrace 0,{\ldots},n-1\rbrace $, for $j=1,2$,
  $|X_{1}|=|X_{2}|=q$.  Before getting into the details of this
  definition, recall from the ``sketch of proof''of \thmref{T2}
  that, roughly speaking, we get $\calf(\langle X_{1},X_{2}\rangle )$
  in the following way. First we partition both $X_{1}$ and $X_{2}$
  into classes of sizes $t^{2}$.  Then we select subsets
  $J_{i}^{(1)}$, $J_{i}^{(2)}$ from each pairs of corresponding
  classes with the property that $|J_{i}^{(1)}|=|J_{i}^{(2)}|=t$,
  $|J_{i}^{(1)}+J_{i}^{(2)}|=t^{2}$. The pair $\langle \bigcup_{i
  }J_{i}^{(1)}, \bigcup_{i} J_{i}^{(2)} \rangle$ is the value of
  $\calf$.

  Each value of the function will be a pair $ \langle
  Z_{1},Z_{2}\rangle $ so that $Z_{1},Z_{2} \subseteq \lbrace
  0,{\ldots},n-1\rbrace$ and $|Z_{1}|=|Z_{2}|=\lfloor \frac{q}{t^{2}}
  \rfloor t$. The definition is the following. Assume that the pair
  $\langle X_{1},X_{2}\rangle $ is given with the described
  properties. For each $j=1,2$ we pick pairwise disjoint subsets
  $K_{1}^{(j)},{\ldots},K_{Q}^{(j)}$ of $X_{j}$, where $Q=\lfloor
  \frac{q}{t^{2}} \rfloor$, so that $|K_{i}^{(j)}|=t^{2}$ for all
  $j=1,2$ $i=1,{\ldots},Q$ and $x\in K_{i}^{(j)}$, $y\in K_{i'}^{(j)}$
  implies $x<y$ for all $j=1,2$, $1\leq i<i'\leq Q$. (By the definition
  of $Q$ this is possible.)  \nev{259}

  Assume now that an $i=1,{\ldots},Q$ is fixed. We apply \lemref{L6} with
  $U\to K_{i}^{(1)}$, $V\to K_{i}^{(2)}$. Let $U'$, $V'$ be the sets whose
  existence is stated in \lemref{L6} and let $J_{i}^{(1)}=U'$,
  $J_{i}^{(2)}=V'$.  Finally let $Z_{j}=\bigcup_{i=1}^{Q}J_{i}^{(j)}$ for
  $j=1,2$ and let $\calf(\langle X_{1},X_{2}\rangle )=\langle 
Z_{1},Z_{2}\rangle $.  Clearly
  the pair $\langle Z_{1},Z_{2} \rangle$ satisfies the conditions 
described above
  as well as the following additional properties:
  \begin{enumerate}
    \renewcommand{\theenumi}{(\alph{enumi})}\setlength{\itemsep}{0pt}
  \item for all $j=1,2$ $J_{1}^{(j)},{\ldots},J_{Q}^{(j)}$ is a
    partition of $Z_{j}$, $|J_{i}|=t$ for all $i=1,{\ldots},Q$,
  \item for all $j=1,2$, $1\leq i<i'\leq Q$ $x\in J_{i}^{(j)}$, $y\in
    J_{i'}^{(j)}$ implies $x<y$,
  \item for all $j=1,2$ and $i=1,{\ldots},q$ we have $Z_{j}\subseteq
    X_{j}$ and $|J_{i}^{(1)}|+|J_{i}^{(2)}|= t^{2}$.
  \end{enumerate}
  Assume now that $\langle Z_{1},Z_{2}\rangle \in \range(\calf) $.
  We estimate the probability $\rho_{Z_{1},Z_{2}}$ of the
  event that the rank of the matrix $\sub(A,Z_{1},Z_{2})$ is smaller than
  $R$.
 \nev{260}

  We apply \lemref{L8} with $l\to Q$, $I_{i}^{(j)}\to J_{i}^{(j)}$,
  $S^{(1)}\to Z_{1}$, $S^{(2)}\to Z_{2}$, $m_{i}\to t^{2}$, $r\to Q-R+1$. 
We
  get that $\rho_{Z_{1},Z_{2}}$ is at most ${Q}{Q-R+1}2^{-(Q-R+1)
    t^{2}}$. Therefore, using that $|Z_{j}|=Qt$, we get that the
  probability that the rank of the matrix $\sub(A,Z_{1},Z_{2})$ is
  smaller than $R$ for at least one $\langle Z_{1},Z_{2}\rangle \in \range 
(\calf)$
  is at most
  $$
  |\range(\calf)| \binom{Q}{Q-R+1}2^{-(Q-R+1) t^{2}}\leq
  \binom{n}{Qt}^{2} \binom{Q}{Q-R+1}2^{-(Q-R+1) t^{2}}\enspace.
  $$
  For each pair $S,T$, with the properties given in the lemma, if
  $\calf(\langle S,T\rangle )=\langle Z_{1},Z_{2}\rangle $, then 
$Z_{1}\subseteq S$, $Z_{2}\subseteq T$ and
  this implies that $\rank(\sub(A,S,T))\geq \rank(\sub(A,Z_{1},Z_{2}))$
  so we have the same upper bound on the probability that the rank of
  $\sub(A,S,T)$ is smaller than $R$.
\end{proof}

\begin{proof}[Proof of \thmref{T2}]
  Assume that $\theta>0$ is sufficiently small, $c_{1}>0$ is
  sufficiently small with respect to $\theta$, and $c_{2}>0$.  Suppose
  further that $n$ is sufficiently large and $c_{2}n<q\leq n$.  We apply
  \lemref{L9} with $n$, $q$, $R=c_{1}|\log(\frac{q}{n})|^{-1}q$, and 
$t=\lfloor
  \theta^{-1} |\log(\frac{q}{n})| \rfloor $.  We get that the probability 
that
  rank of $\sub(A,S,T)$ is at least $R$ is at least
  $$
  p\geq 1-\binom{n}{Qt}^{2} \binom{Q}{Q-R+1}2^{-(Q-R+1)t^{2}}
  $$
  where $Q=\lfloor \frac{q}{t^{2}} \rfloor$.  We show that 
  $$
  \binom{n}{Qt}^{2}\binom{Q}{Q-R+1}2^{-(Q-R+1)t^{2}}
  $$
  is at most $\frac{1}{2}$ by giving upper bounds in its factors. As we
  have remarked already at the end the of sketch of the proof, the
  crucial fact that leads to the desired result is that in the
  exponent of $2$ we have the factor $t^{2}$ and not only $t$.  We
  will see that in the actual estimates this $t^{2}$ makes it possible
  to get the strong upper bound we need. \nev{261}

  We will use that if $0<\alpha < \frac{1}{2}$, $n$ is sufficiently
  large, and $x<\alpha n$ then 
  $$
  \binom{x}{\alpha n } \leq e^{2 \alpha n
    \log \frac{1}{\alpha}}\enspace.
  $$
  Let $\gamma=\frac{q}{n}$, and $\lambda=\frac{Qt^{2}}{q }$.  Clearly 
$c_{2}<\gamma<1$
  and $\frac{1}{2}<\lambda \leq 1$.
  Hence
  $$
  \binom{n}{Qt} = \binom{n}{\gamma\lambda t^{-1} n}\leq e^{2\gamma\lambda
    t^{-1}n\log(\gamma^{-1}\lambda^{-1}t)}= e^{2\gamma\lambda 
t^{-1}n(\log\gamma^{-1}+\log\lambda^{-1}+
    \log t)}\enspace.
  $$
  Using that $t^{-1}\log \gamma^{-1}=\theta$, $t^{-1}\log \lambda^{-1}\leq 
t^{-1}\log 2 \leq
  t^{-1}\leq \theta$, and $t^{-1}\log t\leq t^{-\frac{1}{2}}\leq 
\theta^{\frac{1}{2}}$ we
  get that
  $$
  \binom{n}{Qt}^{2} \leq 
e^{4\gamma\lambda(\theta+\theta+\theta^{\frac{1}{2}}) n} \leq
  2^{\frac{1}{20}}\gamma\lambda n
  $$
  %% ToC edit (ar28) removed:
  %if $\theta$ is sufficiently small.  
  %% and added:
  and
  $$
  \binom{Q}{Q-R+1}\leq 2^{Q}=2^{\gamma\lambda t^{-2}n}\leq 
2^{\frac{1}{20}\gamma\lambda n}
  $$
  if $\theta$ is sufficiently small.  \nev{262}
  Moreover,
  $$
  2^{-(Q-R+1)t^{2}}\leq 
2^{-\frac{1}{8}Qt^{2}}=2^{-\frac{1}{8}\gamma\lambda
    t^{-2}t^{2}n}=2^{-\frac{1}{8}\gamma\lambda n}\enspace.
  $$
  These inequalities imply that
  $$
  \binom{n}{Qt}^{2}\binom{Q}{Q-R+1}2^{-(Q-R+1)t^{2}}\leq
  2^{\frac{1}{20}\gamma\lambda n+ \frac{1}{20}\gamma\lambda n 
-\frac{1}{8}\gamma\lambda n }\leq
  2^{-(\frac{1}{8}-\frac{1}{10})\gamma\lambda n}<\frac{1}{2}
  $$
  if $n$ is sufficiently large.
(Here we use that $c_{2}<\gamma$ and
  $\frac{1}{2}<\lambda $.) \nev{263}
\end{proof}

\section{The proof of \lemref{L1}}
\subsection{A lemma about disjoint sets of variables}
In this section we prove \lemref{L1} using Lemma 9 of \cite{A1}.
(This is the most important technical lemma of that paper with a long
proof.)  We reformulate below this result from \cite{A1} as
\lemref{AA1} to make it consistent with the terminology of the present
paper.  In the proof we will also use other lemmata from \cite{A1}; we
will formulate them as \lemref{AA2}, \lemref{AA3}, and \lemref{AA4} in
the present paper.  These latter three lemmata have short proofs
(given in \cite{A1}) using only the definitions of the concepts
contained in their statements.  The following definitions are needed
for the statement of \lemref{AA1}. \nev{100}

\begin{definitionlist}
\item Assume that $\calb$ is a branching program with $n$ input
  variables and $\eta$ is an input for $\calb$.  (Recall that an input is
  a $\{0,1\}$-valued function defined on $\lbrace 0,1,{\ldots},n-1\rbrace 
$
  with the meaning that the value $\eta(i)$ is assigned to the variable
  $x_{i}$.)  At input $\eta$ the branching program follows a path in the
  directed graph $\calg$ as described in the definition of a branching
  program.  We associate a time (a nonnegative integer) with each node
  of this path.  If the path is $v_{0},v_{1},{\ldots},v_{i}$, where 
$v_{0}$
  is the source node and $v_{i}$ is a sink node, then for all integers
  $t\in [0,1]$, we will say that the program is \emph{at node $v_{t}$ at 
time
  $t$} with respect to input $\eta$.  We will use the notation
  $\state(t,\eta)=v_{t}$. \nev{101}

\item Assume that $\state(t,\eta)=v_{t}$, and $\variable(v_{t})=x_{i}$.
  In this case we will say that the program \emph{accesses} the variable
  $x_{i}$ at time $t$.  \nev{102}

\item An input $\eta$ is \emph{visible} if each variable $x_{i}$,
  $i=0,1,{\ldots},n-1$ is accessed at some time during the computation
  performed at input $\eta$.
 
\item Assume that $\calb$ is a branching program so that the path
  associated with each input is of length $l$. In this case we will
  say that for each input the \emph{length} of the program is $l$.
\end{definitionlist}

\textsl{Additional assumptions about $\calb$.}  Without loss of
generality we will make the following two additional assumptions about
the branching program $\calb$ in the proof of \lemref{L1}. \nev{150}

(a) We assume that every input is visible.  Indeed, we can modify the
branching program $\calb $ so that the new branching program $\calb'$
first reads the value of each variable and then continues with the
original computation of $\calb$.  The length and the size of
the program are thus increased only by $n$.
Moreover, if \lemref{L1} holds for $\calb'$ then clearly it also holds
for $\calb$ since, apart from the size and the depth of the program,
\lemref{L1} treats the program as a black box, it speaks only about
the function defined by the branching program.  \nev{104}

(b) We assume that, independently of the input, the length of the
branching program is exactly $kn$, that is, for each input $\eta$ the
program reaches a sink node at time $kn$. This is not an essential
restriction because there is another program $\calb'$, whose size is
larger than the size of $\calb $ by only a factor of at most $n^{2}$,
so that program $\calb'$ works exactly the same way as program $\calb$
but also counts the time and when $\calb$ reaches a sink node $v$ then
it works further till time $kn$ when it gives the same output as the
output of $\calb$ at node $v$. (We may assume that at each time $t$ in
this new additional time interval, the branching program $\calb'$
accesses, e.g., the variable $x_{0}$.)  \nev{151}

As a consequence of this second assumption, for each fixed input $\eta$,
the function $\state(t,\eta)$ is defined for all $t=0,1,{\ldots},nk$ and 
the
branching program accesses a variable at each time $t$ for
$t=0,1,{\ldots},kn-1$.  \nev{152}

\begin{definitionlist}
\item Suppose that $\sigma$ is a real number with $\sigma\in 
(0,\frac{1}{2})$.  We
  assume that a partition of the set $\lbrace 0,1,{\ldots},kn-1\rbrace $
  into intervals is fixed so that the length of each interval is
  between $\sigma n$ and $2\sigma n$.  $\cali^{(\sigma)}$ will denote the 
set of
  these intervals.  If the choice of $\sigma$ is clear from the context, 
we
  will omit the superscript $\sigma$.  \nev{105}

\item Assume that $T\subseteq \lbrace 0,1,{\ldots},kn-1\rbrace $ is a set 
of
  integers.  The set of all integers $i\in \lbrace 0,1,{\ldots},n-1\rbrace 
$,
  so that the input variable $x_{i}$ is accessed by the branching
  program at some $t\in T$, at input $\eta$, will be denoted by
  $\register(T,\eta)$.  The set of all integers $j$ in $\register(T,\eta)$
  so that the value of variable $x_{j}$ is not accessed at any time
  outside $T$ at input $\eta$ will be denoted $\core(T,\eta)$.  Clearly
  $\core(T,\eta)\subseteq \register(T,\eta)$. \nev{106}
\end{definitionlist}

\begin{remark}
  The notation $\register(\eta)$ was motivated by the fact that, in
  \cite{A1}, instead of branching programs we work with random access
  machines, and so instead of reading the values of variables the
  machine reads the content of registers.  To make the two papers more
  compatible we did not change this notation.  \nev{107}
\end{remark}

\begin{definitionlist}
\item If a $\sigma>0$ is given, $F\subseteq \cali^{(\sigma)}$, and $\chi$ 
is an input, then
  $\stem(F,\chi)$ will denote the restriction of $\chi $ onto $\lbrace
  0,1,{\ldots},n-1 \rbrace \bcks \core(F,\chi) $. \nev{108}
\item Suppose that $T\subseteq \lbrace 0,{\ldots},kn-1\rbrace $.  We say 
that $x$
  is at the \emph{right border} of $T$ if $x\notin T$ and $x-1 \in T$. The 
set of
  those integers which are at the right border of $T$ will be denoted
  by $\rightborder(T)$. \nev{109}

\item Suppose that $T\subseteq \lbrace 0,{\ldots},kn-1\rbrace $ and $\chi$ 
is an
  input.  Let $f$ be a function defined on the set $\rightborder(T)$, so
that for all $t\in \rightborder(T) $ we have $f(t)=\state(t,\chi) $.  We 
will
call $f$ the \emph{right-state} function of the set $T$ at input $\chi$
and will denote it by $\rstate_{T,\chi}$.  \nev{110}
\end{definitionlist}

\begin{remark}
  The significance of the set $\core(T,\chi)$, the right border of $T$,
  and the function $\rightborder_{T,\chi}$ is the following. Assume that
starting from the input $\chi$ we change the value of some of the
variables in $\core(T,\chi)$ in a way that for the new input $\chi'$ we 
have
$\rightborder_{T,\chi}= \rightborder_{T,\chi'}$. Then the output of the 
program remains
unchanged.
\end{remark}

\begin{lemmaa} \label{AA1} For all positive integer $k$, if $\sigma>0 $ is
  sufficiently small with respect to $k$, $\epsilon>0$ is sufficiently
  small with respect to $\sigma$, $n$ is sufficiently large with respect 
to
  $\epsilon$, $\calb$ is a branching program with $n$ input variables,
  $\calb$ is of size at most $2^{\epsilon n}$, for each input the length 
of
  the program is $kn$, and $G$ is a set of visible inputs, then the
  following holds.  There exist $\kappa>\sigma$, $F_{1},F_{2}$, $f_{1}$,
  $f_{2}$, $H$ satisfying the following conditions: \nev{111}
 
\begin{cond} \label{V1}
$H\subseteq G$
and $|H|\geq 2^{-\kappa n}|G| $  \end{cond}

\begin{cond} \label{V2}
$F_{1},F_{2}$ are disjoint subsets of
$\cali^{(\sigma)}$
\end{cond}


\begin{cond} \label{V3}
for all
$i=1,2$ and $j=3-i$ if $\chi,\xi \in H$,
and $\stem(F_{i},\chi)= \stem(F_{i}, \xi)$,
then $\core(F_{j},\chi) = \core(F_{j},\xi)
$  \nev{117}
\end{cond}


\begin{cond} \label{V4}
$|\core(F_{i},\chi)|\geq \kappa^{\tau}n$ 
for all $\chi\in H$ and $i=1,2$,
where $\tau=1-\frac{1}{50 k}$,   \end{cond}


\begin{cond} \label{V5}
$\rstate_{\chi,
\bigcup F_{i}}=f_{i}$ for all $\chi\in H$,
$i=1,2$.  \nev{120}
\end{cond}


\begin{cond} \label{V6}
$\kappa<2^{-|\log \sigma|^{\frac{1}{4}}}$
\nev{121} \end{cond}

\end{lemmaa}


\paragraph{Motivation.} The intuitive meaning of \lemref{AA1} is the
following. Suppose that a branching program works in linear time.
Then, if we segment the time into intervals of length about $\sigma n$, it
is possible to select two disjoint sets of intervals $F_{1}$ and
$F_{2}$ so that in each of them, for a large number of inputs $\chi$, we
access many variables (the ones in $\core(F_{3-i},\chi)$) that are not
accessed anywhere else. Moreover the sets $F_{1}$ and $F_{2}$ can be
selected with the additional property described below. If the state of
the branching program is fixed at the right borders of $F_{1}$ and
$F_{2}$ (\condref{V5}), then $\core(F_{1},\chi)$ and $\core(F_{2},\chi)$ 
are
independent from each other in the following sense. In order to know
what is $\core(F_{i},\chi)$, we do not have to know the values of the
variables in $\core(F_{3-i},\chi)$ (\condref{V3}). This last condition is
the crucial part of the lemma, everything else in it can be proved by
a simple counting argument. \vskip 5pt

\begin{remarks} \hfill
  \begin{enumerate}
  \item \condref{V6} was not included in the original statement of
    the lemma in \cite{A1} but its proof clearly implies it.  The
    exact form of the upper bound on $\kappa$ is not important for us, any
    upper bound of the type $\kappa <g(\sigma)$ where $\lim_{x\to \infty} 
g(x)=0$ would
    be sufficient for the proof of \lemref{L1}.  \nev{122}
  \item We have changed the notation of the original lemma (by
    substituting $\kappa$ for $\lambda$) to make it more compatible to the
    notation of \lemref{L1}.  \nev{123}
  \item The lemma in \cite{A1} was originally formulated for random
    access machines, however in the case when the possible contents of
    the input registers form a set with two elements, the notion of
    the random access machines used there is identical to the notion
    of ($2$-way) branching programs.  Therefore \lemref{AA1} is a
    special case of Lemma 9 of \cite{A1}.  \nev{124}
  \item There is a slight difference between the notation of the two
    papers: in \cite{A1} an input is a function defined on the set
    $\lbrace 1,{\ldots},n\rbrace $ while in the present paper it is 
defined
    on $\lbrace 0,1,{\ldots},n-1\rbrace $.  \nev{125}
  \item The proof of \lemref{L1} from \lemref{AA1} is almost identical
    to the proof of Theorem 4 of \cite{A1} from Lemma 9 of the same
    paper.  \vskip 5pt \nev{126}
\end{enumerate}
\end{remarks}

\subsection{Reducing \lemref{L1} to \lemref{AA1}}

As we pick the values of the various parameters in \lemref{L1} we will
describe
the values of the parameters of \lemref{AA1} when we
use it in our proof.  \nev{127}

Assume that $k$ is given (we will apply \lemref{AA1} with the same
value of $k$).  Now we pick $\sigma_{1}$ and $\sigma_{2}$ so that
$\sigma_{1} $ is sufficiently small with respect to $k$ and
$\sigma_{2}$ is sufficiently small with respect to $\sigma_{1}$.  Let
$\sigma=3\sigma_{2}$.  Let $\epsilon>0$ be sufficiently small with
respect to $\sigma_{2}$, let $n$ be sufficiently large with respect
to $\epsilon$, and let $\calb$ be a branching program of length $kn$
(for each input)and size at most $2^{\epsilon n}$.  ($\epsilon$,
$\calb$ and $n$ are the same in the two lemmata.)  We pick $\delta\in
\lbrace 0,1\rbrace $ so that $|\calb^{-1}(\delta)|\geq 2^{n-1}$.  Let
$G= \calb^{-1}(\delta) $ in \lemref{AA1}.  (As we noted earlier we may
assume that every input of $\calb $ is visible, so $G$ meets this
requirement of \lemref{AA1}.)  Now we pick $\kappa$,
$F_{1},F_{2},f_{1},f_{2},H$ with the properties listed in
\lemref{AA1}.  \nev{129}



As a first step in the proof of \lemref{L1} we prove that there is a
$\bar \chi \in H$ so that for each $i=1,2$ the following 
condition is satisfied:
\nev{130}
\begin{cond} \label{Z2} assume that $s_{i}=|\core(F_{i},\bar\chi)|$ and
  $\bar Y_{i}$ is the set of all partial inputs $\eta $ defined on
  $\core( F_{i},\bar\chi)$ so that $\bar\chi \wr\eta \in H $;  then $|\bar
  Y_{i}|\geq \frac{1}{6} 2^{-\kappa n}2^{s_{i}}$. \nev{131} \end{cond}

%%%(This.is.the.analogue.of.(28).in
%%%\cite{A1}.)

For the proof we use the following two lemmata from \cite{A1}. The
first one, \lemref{AA2} is Lemma 10 in \cite{A1}, the second one
\lemref{AA2}, is Proposition 3 in that paper.


\begin{lemmaa} \label{AA2} Suppose that $F\subseteq \cali$, $\chi$, $\xi$ 
are
  inputs with $\stem(F,\chi)\not= \stem(F,\xi)$ and $\rstate_{\chi,\bigcup
    F}=\rstate_{\xi,\bigcup F}$. Then there is an $x\in
  \domain(\stem(F,\chi))\cap\domain(\stem(F,\xi))$ so that $\chi(x)\not= 
\xi(x)$.
\end{lemmaa}

\begin{lemmaa} \label{AA3} Assume that $A\subseteq A'$ are finite sets, 
$P$ is
  a partition of $A$, $P'$ is a partition of $A'$, each class of $P$
  is contained in a single class of $P'$, and $d=|A||A'|^{-1}$.  Then
  for all $\lambda >0$, there are at most $\lambda|A|$ elements $x$ of $A$ 
so
  that if $C,C'$ are the unique $P,P'$ classes containing $x$ then
  $|C||C'|^{-1}\leq \lambda d$.
\end{lemmaa}

As a first step in proving the existence of a $\bar \chi\in H$ so that
for all $i=1,2$ \condref{Z2} is satisfied, we fix an $i\in \lbrace
1,2\rbrace $ and give a lower bound on the number of inputs $\bar
\chi\in H$ with \condref{Z2} (with this fixed $i$).  We define a
partition $\calt_{i}$ of $H$ in the following way.  $\forall
\chi,\xi\in H $, $\xi,\chi$ belong to the same class iff
$\stem(F_{i},\chi)=\stem(F_{i},\xi)$. It is a consequence of this
definition that if $\chi$ and $\xi$ do not belong to the same class of
$\calt_{i}$, then the functions $\stem(F_{i},\chi)$ and
$\stem(F_{i},\xi)$ must be different (for the fixed value of $i$).
Since the domains of these two functions, that is, $\lbrace
0,1,{\ldots},n-1\rbrace\bcks \core(F_{i},\chi)$ and $\lbrace
0,1,{\ldots},n-1\rbrace \bcks \core(F_{i},\xi)$ are not necessarily
identical, in principle it would be possible for the functions
$\stem(F_{i},\chi)$ and $=\stem(F_{i},\xi)$ to be compatible, that is,
to be identical on the intersection of their domains. However
\condref{V5} of \lemref{AA1} and \lemref{AA2} imply that
this can never happen, that is, \nev{140}


\begin{cond} \label{Z3} functions that belong to different classes of
  $\calt_{i}$ are not compatible. \nev{141}
\end{cond}

We will denote by $H'$ the set of all inputs $\zeta$ so that there is
a $\chi\in H$ with the property that $\zeta$ is an extension of
$\stem(\chi,H)$.  For each fixed $\chi\in H$ let $W_{\chi}$ be the set
of all $\zeta\in H'$ so that $\zeta$ is an extension
$\stem(F_{i},\chi)$.  \condref{Z3} implies that the sets $W_{\chi}$,
$\chi\in H $ (we take each of them only once) form a partition
$\calt_{i}'$ of $H'$.  Clearly each class of $\calt_{i}$ is contained
in exactly one class of $\calt_{i}'$. \nev{142}

We want to apply \lemref{AA3} with $A\to H$, $A'\to
H'$, $P\to \calt_{i}$, $P'\to \calt_{i}'$, and $\lambda
\to \frac{1}{3}$.  Since, by the definition of $G$, we have
$|G|\geq 2^{n-1}$, \condref{V1} of \lemref{AA1} implies that $|H|\geq
2^{-\kappa n}2^{n-1}$.  Obviously $|H'|\leq 2^{n}$ and so $d= $
$|H||H'|^{-1}\geq \frac{1}{2}2^{-\kappa n} $.  Therefore, according to
\lemref{AA3}, for at least $\frac{1}{3}|H|$ inputs $\chi$ from the
set $ H $, the following condition is satisfied: $\chi$ belongs to a
class in $\calt_{i}$ whose density in the unique class of $\calt_{i}'$
containing it is at most $ \frac{1}{3}d \geq \frac{1}{6}2^{-\kappa
  n}$. Let $ X_{i}$ be the set of all inputs $\chi \in H$ with this
property.  Since $|X_{i}|\leq \frac{1}{3}|H|$ for both $i=1$ and $i=2$
we have that $|H\bcks(X_{1}\cup X_{2})|\geq \frac{1}{3}|H|$.  Let $\bar
\chi \in H\bcks (X_{1}\cup X_{2})$. The definition of $X_{i}$ implies
that for all $i=1,2$, $\bar \chi$ belongs to a class of $\calt_{i}$
whose density in the corresponding class of $\calt_{i}'$ is greater
than $\frac{1}{6} 2^{-\kappa n}$.  Since each class of $\calt_{i}'$
contains exactly $2^{s_{i}}$ elements this implies that $\bar \chi$
meets the requirements of \condref{Z2}.\nev{143}

Assume that $\bar \chi$ is fixed with
\condref{Z2}  and $\bar Y_{i}$,
$i=1,2$ are
the sets defined in the description
of that property. We will prove the
following:


\begin{cond} \label{G6} for all $\eta_{i}\in \bar Y_{i}$, $i=1,2$ we have
  $(\bar \chi\wr\eta_{1})\wr \eta_{2}\in \calb^{-1}(\delta) $.\nev{133} 
\end{cond}

For the proof of this fact we use the following lemma which is Lemma 2
in \cite{A1}:

\begin{lemmaa} \label{AA4} Assume that $\chi$ is an input, $\eta_{1}$,
  $\eta_{2}$ are partial inputs, $T_{1},T_{2}\subseteq \lbrace
  0,1,{\ldots},nk-1\rbrace $.  If $\chi$, $\eta_{1}$, $\eta_{2}$, $T_{1}$, 
and
  $T_{2}$ satisfy the following conditions, then $\calb(\chi)=$$\calb(( 
\chi
  \wr \eta_{1}) \wr \eta_{2})$.
 
  \begin{cond} \label{Y1}
    $\domain(\eta_{1})$ and
    $\domain(\eta_{2})$ are disjoint.
  \end{cond}
 
  \begin{cond} \label{Y2}
    $T_{1}$ and $T_{2}$ are
    disjoint.
\end{cond}

\begin{cond} \label{Y3}
  for all $i=1,2$ we have
$\domain(\eta_{i})\subseteq
\core(T_{i},\chi)$
\end{cond}

\begin{cond} \label{Y4}
 for all $i=1,2$
we have $\rstate_{T_{i},\chi}=\rstate_
{T_{i},\chi \wr \eta_{i}}$
\end{cond}

\begin{cond} \label{Y5}
 for all $i,j\in \lbrace
1,2\rbrace $, $i\not= j$ we have
$\domain(\eta_{i})\cap
\register(T_{j},\chi\wr
\eta_{j})=\emptyset $.
\end{cond}
\end{lemmaa}

To prove that \condref{G6} is
satisfied by $\bar \chi $,  we show that
the
assumptions of \lemref{AA4}  hold with
$\chi \to \bar \chi$, $\eta_{1}$,
$\eta_{2}$, $T_{1}\to \bigcup
F_{1}$, and $T_{2}\to \bigcup
F_{2}$.  \nev{160}

\condref{Y1}. By the definitions of $\eta_{i}$ and the function $\core$
we have $\domain(\eta_{i})=$ \nev{161} $ \core(F_{i},\chi) \subseteq $ $ 
F_{i}$ for
$i=1,2$. \condref{V2} of \lemref{AA1} implies that $F_{1}\cap 
F_{2}=\emptyset$,
so $\domain(\eta_{1})$ and $\domain(\eta_{2})$ are disjoint.

\condref{Y2}. This is a consequence of
\propref{V2}  of Lemma \ref{AA1}.

\condref{Y3}. By the definition of
$\eta_{i}$  this holds with equality.

\condref{Y4}. This follows from
$\chi, \bar \chi \wr \eta_{i}\in H$  and
\condref{V5}  of Lemma \ref{AA1}.

\condref{Y5}. Assume that $i,j\in
\lbrace 1,2\rbrace  $, $i \not= j$ are
fixed. We have
$\domain(\eta_{i})=\core(F_{i},\bar \chi)$.
\condref{V3}  of \lemref{AA1}
and the fact $\bar \chi \wr \eta_{j} \in H
 $ together imply that
$\core(F_{i},\bar
\chi)=\core(F_{i}, \bar \chi\wr\eta_{j})$.
Therefore $\domain(\eta
_{i})=\core(F_{i}, \bar\chi \wr \eta_{j})$.
$F_{1}\cap F_{2}= \emptyset$ so at input
$\bar \chi \wr \eta_{j}$ and  at
times belonging to the set $\bigcup F_{j}$
we can never access a variable in
$\core(F_{i},\bar \chi \wr \eta_{j})$, and
consequently
$$
\domain(\eta_{i}) \cap
\register (T_{j}, \bar\chi\wr \eta_{j}))=
\emptyset\enspace.
$$  \nev{165}

Since all of the assumptions of \lemref{AA4} hold, its conclusion must
hold as well and so $\bar \chi$ satisfies \propref{G6}. \nev{166}


We will need the following observation to
conclude the proof.  Let
$\core(F_{i},\bar{\chi})=S_{i}$.  For any
$i=1,2$ and for any $X\subseteq S_{i}$,
there is an $\bar{Y}_{i}(X)\subseteq
\bar{Y}_{i}$ so that $\eta(x)=\zeta(x)$ for
all $\eta,\zeta\in \bar{Y}_{i}(X)$, $x\in
S_{i}\backslash X$, and
$|\bar{Y}_{i}(X)|\geq \frac{1}{6}2^{-\kappa
n}2^{|X|}$.  Indeed, we may partition the
elements of $\bar{Y}_{i}$ into disjoint
classes according to the values of its
elements on the set $S_{i}\backslash X$.
Since there are at most $2^{s_{i}-|X|}$
classes, at least one class must contain at
least $2^{-s_{i}+|X|}|\bar{Y}_{i}|$
elements. $\bar{Y}_{i}(X)$ will be such a
class. The stated lower bound on $|\bar
{Y}_{i}(X)|$  and the lower bound  on
$|\bar Y_{i}|$ formulated in
\condref{Z2}  imply
$|\bar{Y}_{i}(X)|\geq \frac{1}{6}2^{-\kappa
n}2^{|X|}$.
  \nev{134}


By \condref{V4} of \lemref{AA1}
we have $|S_{i}|\geq \kappa^{\tau}n$ for
$i=1,2$.  Let $[ \frac{1}{2}\kappa^{\tau}n]=r$.  Let $z_{i}$ be the
$r$th smallest element of $S_{i}$ and
assume that e.g. $z_{1}\leq z_{2}$.  Let
$W_{1}$ be the set of the $r$ smallest
elements of $S_{1}$ and let $W_{2}$ be the
set of the $r$ largest elements of $S_{2}$.
Let $Y_{i}=\bar Y_{i}(W_{i})$ for $i=1,2$.
According to our previous observation we
have

\begin{cond} \label{E5} for all $i=1,2$,
 $|Y_{i}|\geq \frac{1}{6}2^{|W_{i}|-\kappa n}$.
\end{cond}


By the definitions of $r$, $z_{i}$, and $W_{i}$, \condref{G1} is
satisfied by $W_{1}$ and $W_{2}$.  We claim that the other
requirements of the lemma are also met by the following choice of the
various parameters.  We pick two partial inputs $\zeta_{1}\in Y_{1}$,
$\zeta_{2}\in Y_{2}$ in an arbitrary way.  Let $\chi=(\bar \chi\wr 
\zeta_{1})\wr \zeta_{2}$,
$\lambda={2\kappa }$, and $\mu=|W_{1}|n^{-1}=|W_{2}|n^{-1}$. ($W_{i}$, 
$Y_{i}$ have
already been defined.) \nev{135}


The definitions of $\sigma_{1},\sigma_{2},\epsilon$, and $\delta$ at the 
beginning of the
proof of \lemref{L1} show that the requirements of the lemma, stated
before the conditions $\lambda \in (\sigma_{2},\sigma_{1})$, $\mu\in 
(\sigma_{2},\sigma_{1}) $, are
met.  $\lambda \in (\sigma_{2},\sigma_{1})$ is an immediate consequence of 
$\lambda= 2\kappa$, the
inequalities $\sigma <\kappa$, $\kappa \leq 2^{-|\log 
\sigma|^{\frac{1}{4}}}$, and the fact
that we choose $\sigma_{2}=\frac{1}{3}\sigma$ so that it is sufficiently 
small with respect to $\sigma_{1}$.

The fact that
$\mu \in (\sigma_{2},\sigma_{1})$ is a consequence of the following facts:
$\tau=1-\frac{1}{50 k}$ (cf. \condref{V4} of \lemref{AA1}),
$\mu=|W_{i}|n^{-1}$, $|W_{i}|=[\frac{1}{2}\kappa^{\tau}n]$, $\sigma< 
\kappa\leq 2^{-|\log
  \sigma|^{\frac{1}{4}}} $, $\sigma=3\sigma_{2}$, and $\sigma_{2}$ is 
sufficiently small
with respect to $\sigma_{1}$.  Indeed $\sigma=3\sigma _{2}$ is 
sufficiently small
with respect to $\sigma_{1}$ (for a fixed $k$), and so 
$$
\mu = \left[\frac{1}{2}\kappa^{\tau}n\right]n^{-1}\leq 
\frac{1}{2}\kappa^{\tau}\leq 2^{-|\log
  \sigma|^{\frac{1}{4}}(1-{\frac{1}{50k})}} < \sigma_{1} \enspace.
$$
On the other hand
$$
\mu =\left[\frac{1}{2}\kappa^{\tau}n\right]n^{-1}> 
\frac{1}{3}\kappa^{\tau}\geq \frac{1}{3}
\sigma^{1-\frac{1}{50k}}\geq \frac{1}{3}\sigma = \sigma_{2}
$$
and so $\mu \in (\sigma_{2}, \sigma_{1})$.

We have already seen that \condref{G1} of \lemref{L1} is satisfied.
\nev{167}

\condref{G2} of \lemref{L1} is a
consequence of the definition of $\mu$.
 \nev{168}

 \condref{G3}. Using the inequality of \condref{E5} we get $|Y_{i}|\geq
 \frac{1}{6}2^{|W_{i}|-\kappa n}\geq 2^{|W_{i}|-\lambda n}=2^{\mu 
n-\lambda n}$.  \nev{169}
 
 \condref{G4}.  By the definition of $r=|W_{i}|=\mu n$ we have $\mu n=
 [\frac{1}{2}\kappa^{\tau}n] $ and so
 $$
 \mu\geq \frac{1}{3}\kappa^{\tau}= \frac{1}{3}(\frac{\lambda}{2} )^{\tau} 
= \frac{1}{3}
 (\frac{\lambda}{2})^{1-{\frac{1}{50k}}}\enspace.
 $$
 Therefore 
 $$
 \mu^{1+\frac{1}{100k}} \geq (\frac{1}{3})^{ 1+\frac{1}{100k}
 }(\frac{\lambda}{2})^{( 1-\frac{1}{50k} )( 1+\frac{1}{100k} ) }\geq 2
 \lambda\enspace .
 $$
 (Here we used that by \condref{G6}, both $\kappa $ and $\lambda>0$ are
 sufficiently small with respect to $k$.)  \nev{170}

\condref{G5} of \lemref{L1} is a
consequence of \condref{G6} and the
definitions of $\chi$ and $Y_{i}$.  These
definitions imply that
$(\chi\wr\eta_{1})\wr \eta_{2}=(\bar \chi
\wr \eta_{1}') \wr \eta_{2}'$ where
$\eta_{i}'=\eta_{i}\cup
\zeta_{i}|_{S_{i}-W_{i}}\in \bar Y_{i}$.

\bibliography{a008}

\begin{tocauthors}
\begin{tocinfo}[miklos]
  Mikl\'{o}s Ajtai\\
  IBM Almaden Research Center\\
  ajtai\tocat almaden\tocdot ibm\tocdot com
  %% \url{home page}
\end{tocinfo}
\end{tocauthors}

\begin{tocaboutauthors}
  \begin{tocabout}[miklos]
    {\sc Mikl\'{o}s Ajtai} received his Ph.\,D. from the Hungarian
    Academy of Sciences in 1975.  His advisor was Andr\'as Hajnal. He
    worked in the following areas: axiomatic set theory (independence
    proofs), lattice theory (posets with meet and join),
    combinatorics, the theory of random graphs, complexity theory,
    sorting networks, the theory of lattices ($n$-dimensional 
    grids) and their applications to complexity theory
    and cryptography. He is a member of the Hungarian Academy of
    Sciences and was an invited speaker at ICM in 1998.  He received
    the Knuth prize in 2003, and the IBM Corporate Award in 2000.
\end{tocabout}
\end{tocaboutauthors}

\end{document}

