For ease of reading, the reader is provided with this overview of commonly
encountered symbols and their usage.
\begin{center}
\begin{tabular}{|r|l|}
\hline
Symbols & Usage \\
\hline
$a,i,j,l,k,m,n$ & integer values \\
$e$ & polynomial expressions \\
$s, \beta, \sigma$ & assignments \\
$p$ & probability \\
$x,y,z$ & program variables \\
$D$ & distributions \\
$d$ & temporary variable replacing a distribution \\
$\psi $ & constraints \\
$\varphi $ & sets or conjunctions of constraints\\
$\ell$ & locations \\
$t$ & transitions \\
$g$ & general transitions \\
$\tau$ & guards of transitions \\
$u$ & non-probabilistic update functions\\
$\eta$ & probabilistic update functions\\
$\vartheta$ & probabilistic polynomials \\
$\mu$ & probability distributions \\
$\varf$ & finite prefixes \\
$\scheduler$ & scheduler \\
$G$ & graphs \\
$L$ & sets of locations that form a loop \\
$T$ & sets of transitions \\
$\B$ & set of bounds \\
$\D$ & set of distributions \\
$\G$ & set of graphs \\
$\H$ & set of polyhedra \\
$\GT$ & set of general transitions \\
$\C$ & set of linear constraints\\
$\PC$ & set of polynomial constraints\\
$\PV$ & set of program variables \\
$\T$ & set of transitions \\
$\V$ & set of variables \\
\hline
\end{tabular}
\end{center}
First, some basic notation is introduced. Let $\overline{\N} = \N \union
\{\infty\}$. $A^*$ is the set of finite words and $A^\omega$ is the set of
infinite words over an alphabet $A$. The set of all integer polynomials over
some indeterminates $A$ is denoted by $\Z[A]$. A syntactical substitution where
one symbol is replaced by another is denoted with $[\cdot/\cdot]$; for example
$(x<y)[x/1] = (1 < y)$. For a finite set $A = \{x_1,\dots,x_n\}$, the $n$
consecutive substitutions $[x_i/y_i]$, where the replacement expression $y_i$
varies only in the index $i$, are denoted using a shortened notation:
$[x_i/y_i]_{x_i \in A} = (\dots([x_1/y_1]) \dots) [x_n/y_n]$. For example,
$[x_i/i]_{x_i\in A} = (\dots([x_1/1])\dots)[x_n/n]$ is the substitution where
every variable is replaced by its index in $A = \{x_1, \dots, x_n\}$. The
restriction of a function $f: A \rightarrow B$ to a smaller domain $A' \subseteq
A$ is denoted by $f|_{A'} : A' \rightarrow B$.
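The notation above can be illustrated with a small Python sketch. All names here are illustrative and not part of the formal development; the substitution is applied simultaneously, which coincides with the consecutive application as long as the replacement expressions contain no variables from $A$:

```python
# Sketch: an expression is a list of tokens, an assignment is a dict,
# [x_i/y_i]_{x_i in A} replaces every variable token by its image,
# and f|_A' restricts a function (dict) to a smaller domain.

def substitute_all(tokens, mapping):
    """Apply [x_i/y_i] for every x_i in mapping to a tokenized expression."""
    return [mapping.get(tok, tok) for tok in tokens]

def restrict(f, domain):
    """f|_A': keep only the part of f defined on the smaller domain A'."""
    return {v: f[v] for v in domain}

constraint = "x < y".split()
print(" ".join(substitute_all(constraint, {"x": "1"})))  # (x < y)[x/1]

s = {"x": 1, "y": 2, "z": 3}
print(restrict(s, {"x", "y"}))
```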
\section{Constraints}\label{sec:constraints}
This section defines constraints as a special case of first-order formulas.
In the following, $\V$ denotes a finite set of variables.
\begin{definition}[Constraints \cite{bagnara2005convexpolyhedron,
meyer2021tacas,Domenech19}\label{def:constraints}]
For a given finite set of variables $A \subseteq \V$, the set of
\textit{linear non-strict inequality constraints} with variables $A$ is
defined as the set
\begin{align*}
\C_A=\set{a_0 + \sum_{i=1}^{n} a_i v_i \leq 0}{a_i \in \R, v_i \in A}.
\end{align*}
The set of \textit{non-strict polynomial inequality constraints} with
variables $A$ is defined as the set
\begin{equation*}
\PC_A = \set{e \leq 0}{e \in \Z[A]}.
\end{equation*}
Let $\C$ and $\PC$ denote the sets of linear and polynomial constraints with
variables $A = \V$.
\end{definition}
\begin{rem}
The set of non-strict linear constraints is a subset of the set of
non-strict polynomial constraints. For the most part we will just refer to
\enquote{constraints} and mean linear constraints. Whenever we encounter
polynomial constraints we will mention it explicitly.
\end{rem}
For foundations on \gls{fo}-logic we refer the reader to \cite{barwise1977}.
Constraints are a subset of formulas in \gls{fo} logic; thus, their semantics
shall be defined accordingly using the $\models$ and $\equiv$ relation from
\gls{fo} logic \cite{barwise1977}. In this thesis, we will only consider integer
arithmetic. Let $\Zs := (\Z, +, \cdot, 0, 1, \leq)$ be the structure of standard
integer arithmetic. For an assignment $\beta : \V \rightarrow \Z$ and a formula
$\varphi$ we write $\beta \models \varphi$ instead of $\Zs, \beta
\models \varphi$. Finally, we define two trivial constraints $\texttt{true} = (0 =
0)$ and $\texttt{false} = (1 < 0)$.
\begin{definition}[Satisfying assignments]\label{def:solutions}
Let $\varphi = \psi_1 \Land \dots \Land \psi_n$ be a conjunction of constraints
$\psi_1, \dots,\psi_n \in \C$. $\llbracket \varphi \rrbracket$ is defined as
the set of all satisfying assignments of the formula $\varphi$.
\end{definition}
A finite set of constraints $C \subseteq \C$ is satisfiable when the
conjunction of its elements is satisfiable, and we write $\beta \models C$
instead of $\beta \models \LAnd_{\psi_i \in C} \psi_i$.
Let $\psi_1,\psi_2\in \C$ and $C \subseteq \C$. The following properties hold,
hence, conjunctions and sets of constraints can be used interchangeably.
\begin{align*}
\llbracket \psi_1 \Land \psi_2 \rrbracket = \llbracket \psi_1 \rrbracket
\intersect \llbracket \psi_2 \rrbracket \\
\llbracket C \rrbracket = \llbracket \LAnd_{\psi_i \in C} \psi_i \rrbracket
= \Intersect_{\psi_i \in C} \llbracket \psi_i \rrbracket
\end{align*}
\begin{definition}[Equivalence \cite{Domenech19}]\label{def:equivalence}
Two constraints, or sets of constraints, $\varphi_1$ and $\varphi_2$ are
considered \textit{equivalent}, denoted as $\varphi_1 \equiv \varphi_2$, when
$\llbracket \varphi_1 \rrbracket = \llbracket \varphi_2 \rrbracket$.
\end{definition}
Note that the constraints are restricted to inequalities against zero. In
practice, constraints are often given in a more general form. Restricting
constraints to this specific form does not limit their applicability, since the
general forms can easily be converted into it. Constraints of the form $e_1
\diamond e_2$ with operators $\diamond \in \{=, \leq, \geq, <, >\}$ and
$e_1, e_2 \in \Z[\V]$ can be transformed as follows:
\begin{align*}
e_1 \leq e_2 &\equiv e_1 - e_2 \leq 0 \\
e_1 \geq e_2 &\equiv e_2 - e_1 \leq 0 \\
e_1 > e_2 &\equiv e_2 - e_1 < 0 \\
e_1 < e_2 &\equiv e_1 - e_2 < 0 \\
e_1 = e_2 &\equiv e_1 \leq e_2 \land e_2 \leq e_1 \\
&\equiv e_1 - e_2 \leq 0 \land e_2 - e_1 \leq 0
\end{align*}
For real arithmetic it is important to distinguish between strict and non-strict
constraints; for integer arithmetic, however, it suffices to consider only one
kind, as each can be transformed into the other:
\begin{align*}
e_1 \geq e_2 &\equiv e_2 - e_1 - 1 < 0 \\
e_1 \leq e_2 &\equiv e_1 - e_2 - 1< 0
\end{align*}
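The transformation rules above can be sketched in a few lines of Python. This is an illustrative encoding, not part of the formal development: a polynomial is a dict from monomials (tuples of variable names) to integer coefficients, and every constraint $e_1 \diamond e_2$ is brought into the form $e \leq 0$, turning strict inequalities into non-strict ones by the same integer argument:

```python
def poly_sub(a, b):
    """a - b on coefficient dicts (monomial tuple -> int coefficient)."""
    r = dict(a)
    for mono, c in b.items():
        r[mono] = r.get(mono, 0) - c
        if r[mono] == 0:
            del r[mono]
    return r

def poly_add_const(a, c):
    """a + c for a constant c, stored under the empty monomial ()."""
    r = dict(a)
    r[()] = r.get((), 0) + c
    if r[()] == 0:
        del r[()]
    return r

def normalize(e1, e2, op):
    """Return a list of polynomials e, each meaning e <= 0."""
    if op == "<=":
        return [poly_sub(e1, e2)]
    if op == ">=":
        return [poly_sub(e2, e1)]
    if op == "<":   # e1 < e2  is  e1 - e2 + 1 <= 0  over the integers
        return [poly_add_const(poly_sub(e1, e2), 1)]
    if op == ">":   # e1 > e2  is  e2 - e1 + 1 <= 0  over the integers
        return [poly_add_const(poly_sub(e2, e1), 1)]
    if op == "=":   # both directions
        return [poly_sub(e1, e2), poly_sub(e2, e1)]
    raise ValueError(op)

x, y = {("x",): 1}, {("y",): 1}
print(normalize(x, y, "<"))   # x - y + 1 <= 0
```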
\begin{definition}[Entailment\cite{Domenech19}]\label{def:entailment}
Let $\varphi_1, \varphi_2 \in 2^\PC$. If $\llbracket \varphi_1 \rrbracket
\subseteq \llbracket \varphi_2 \rrbracket$; we say that $\varphi_1$
\textit{entails} $\varphi_2$.
\end{definition}
\begin{lemma}\label{lem:entailment}
$\varphi_1$ entails $\varphi_2$ if and only if $\varphi_1 \Land
\neg\varphi_2$ is unsatisfiable.
\proof{see Lemma \ref{apdx:lem:entailment} in the Appendix on page
\pageref{apdx:lem:entailment}.}
\end{lemma}
\begin{example}
Let us take a look at some examples of entailment:
\begin{itemize}
\item $x > 1 \land y \geq x$ entails $y \geq 0$.
\item $1 < 0$ entails all constraints, because $\llbracket 1 < 0
\rrbracket = \emptyset$ and the empty set is a subset of any other
set.
\item $x > 0$ does not entail $x > 1$, since the assignment $\sigma(x) =
1$ satisfies $x > 0$ but not $x > 1$.
\end{itemize}
\end{example}
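The entailment checks of this example can be replayed by a brute-force search for a counterexample, i.e., an assignment satisfying $\varphi_1 \Land \neg\varphi_2$. This is only a sketch: a counterexample found on the grid is conclusive, a pass on a finite grid is not (actual tools pose the unsatisfiability query to an SMT solver instead):

```python
from itertools import product

def entails(phi1, phi2, var_range=range(-5, 6)):
    """Search a finite grid of integer assignments for a counterexample
    beta with beta |= phi1 and beta |/= phi2."""
    for x, y in product(var_range, repeat=2):
        beta = {"x": x, "y": y}
        if phi1(beta) and not phi2(beta):
            return False   # beta |= phi1 /\ ~phi2: no entailment
    return True

# x > 1 /\ y >= x entails y >= 0: no counterexample exists ...
print(entails(lambda b: b["x"] > 1 and b["y"] >= b["x"],
              lambda b: b["y"] >= 0))
# ... while x > 0 does not entail x > 1: beta(x) = 1 is a counterexample.
print(entails(lambda b: b["x"] > 0, lambda b: b["x"] > 1))
```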
The problem of determining whether a set of constraints is satisfiable or
finding valid variable assignments for a given formula is part of \gls{smt} and
is a well studied field in computer science \cite{walsh2009smt,abraham17smt}.
The satisfiability of sets of polynomial constraints over the integers is known
to be undecidable \cite{kremer2016lncs}.
Thus, whenever we encounter polynomial constraints, they need to be
over-approximated by linear constraints. The theory relevant for this thesis is
quantifier-free linear integer arithmetic (usually called \texttt{QF\_LIA}).
We define a function $\texttt{lin} : 2^\PC \rightarrow 2^\C$ that linearly
over-approximates polynomial constraints, that is, $\llbracket \varphi\rrbracket
\subseteq \llbracket\texttt{lin}(\varphi) \rrbracket$. It is not important how
the over-approximation works in detail. An easy way of over-approximating a
set of polynomial constraints is to simply drop all non-linear constraints.
However, one might take minima and maxima of the polynomials into account and
get arbitrarily precise.
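The simple dropping strategy can be sketched directly; this is just one admissible choice of $\texttt{lin}$, with the same illustrative dict encoding of constraints as before (a monomial is a tuple of variable names, and a constraint is linear iff every monomial has degree at most one):

```python
def lin(constraints):
    """Over-approximate a set of polynomial constraints by dropping every
    non-linear one. Removing constraints can only enlarge the solution
    set, so every solution of the input survives."""
    return [e for e in constraints if all(len(mono) <= 1 for mono in e)]

phi = [
    {("x",): 1, (): -3},          # x - 3 <= 0     (linear, kept)
    {("x", "x"): 1, ("y",): -1},  # x^2 - y <= 0   (non-linear, dropped)
]
print(lin(phi))
```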
Sometimes one is not interested in the solutions for all variables, but only in
those for a subset of variables. This will later become relevant when handling
temporary variables that are only relevant in the scope of a transition. In
\gls{fo} logic one would simply quantify the variable existentially in the
formula. However, the resulting \gls{fo} formula would no longer be
quantifier-free. Instead, a new formula is sought that has exactly the same
solutions when ignoring the values of the irrelevant variables and that uses
only the reduced variable set.
\begin{definition}[Projection]
Let $A \subseteq \V$ and $\varphi \in 2^\C$. The projection of $\varphi$ onto
$A$, denoted $\varphi|_A$, is a set of constraints $\varphi|_A\in
2^{\C_A}$ with $\llbracket \varphi|_A \rrbracket = \set{\beta|_A}{\beta \in
\llbracket \varphi \rrbracket}$.
\end{definition}
\section{Updates}\label{sec:update}
An update assigns new values to the program variables, taking into account the
\textit{former} values of all variables. Constraints and assignments
have no notion of time, whereas for updates there exists a \textit{before} and
an \textit{after}.
Let $\Sigma = \{s : \V \rightarrow \Z\}$ be the set of full integer assignments,
which are assignments that assign a value to every variable $v\in\V$. In
contrast, partial assignments over a variable set $A \subseteq \V$ assign values
only to a subset of variables, and the set of partial assignments over the
variables $A\subseteq\V$ is denoted by $\Sigma_A = \{s: A \rightarrow \Z\}$.
For an integer polynomial $f\in\Z[A]$ over a variable set $A \subseteq \V$ and
an assignment $s \in \Sigma_A$ we write $\varf(s)$ for the value obtained by
evaluating the polynomial $\varf[x_i/s(x_i)]_{x_i\in A}$.
\begin{definition}[(Non-probabilistic) polynomial update\label{def:update}]
Let $A,B \subseteq \V$. An update is described by an update function $u : A
\rightarrow \Z[B]$ that maps every updated variable to a polynomial
expression over $B$. The value of a variable $x\in A$ \textit{after} the
update for an assignment $s\in\Sigma_B$ is determined by
$\bigl(u(x)\bigr)(s)$. Note that $u(x) \in \Z[B]$ is a polynomial and
$\bigl(u(x)\bigr)(s) \in \Z$ is the evaluation of that polynomial.
\end{definition}
For an update described by the update function $u : A \rightarrow \Z[\V]$ we
define a function $\rho_u: \Sigma \rightarrow
\Sigma_A$. It computes the assignment after the update $u$, given an
assignment $s \in \Sigma$ before the update:
\begin{equation*}
\bigl(\rho_u (s)\bigr)(x) = \bigl(u(x)\bigr)(s) \text{ for all } x\in A.
\end{equation*}
\begin{example}
Consider the update $u : \{x\} \rightarrow \Z[\{x,y\}]$ with $u(x) = x\cdot
y+1$. Let $s\in\Sigma_{\{x,y\}}$ assign $s(x) = 2$ and $s(y)=3$
\textit{before} the update. \textit{After} the update,
$\bigl(\rho_u(s)\bigr)(x) = 2 \cdot 3 + 1 = 7$ holds.
\end{example}
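The example can be replayed with a minimal Python sketch of $\rho_u$, where the update polynomials $u(x)$ are stood in for by Python functions of the pre-state (illustrative encoding, not part of the formal development):

```python
def apply_update(update, s):
    """rho_u(s): evaluate every update polynomial under the pre-state s,
    yielding the post-state on dom(u); s itself is left unchanged."""
    return {x: poly(s) for x, poly in update.items()}

u = {"x": lambda s: s["x"] * s["y"] + 1}   # u(x) = x * y + 1
s = {"x": 2, "y": 3}
print(apply_update(u, s))                  # {'x': 7}
```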
\begin{definition}[Updating a constraint]
Let $A, B \subseteq \V$, let $\varphi \in 2^\C$ be a set of
constraints and let $u: A \rightarrow \Z[B]$ be an update. Without loss of
generality, $A' = \set{x'}{x \in A} \subset \V$ is a set of
fresh variables with $A' \cap (A \union B) = \emptyset$. Slightly abusing
notation, we write
\begin{equation*}
u(\varphi) = ((\varphi \Land \LAnd_{x\in A} x' =
u(x))|_{A'})[x'/x]_{x\in A}.
\end{equation*}
\end{definition}
\begin{lemma}\label{lem:nonprop_update}
Let $A, B \subseteq \V$. Given an assignment $s\in\Sigma$, a set of
constraints $\varphi \in 2^\C$ and an update $u: A \rightarrow \Z[B]$, let
$s' = \rho_u(s)$ be the partial assignment after the update. If $s \models
\varphi$, then $s' \models u(\varphi)$.
\proof{see Lemma \ref{apdx:lem:nonprop_update} in the Appendix on \pref{apdx:lem:nonprop_update}.}
\end{lemma}
\section{Abstract Domains}\label{sec:abstract_domains}
Another way of reasoning about satisfying assignments is to see them as points
in an $m$-dimensional space, where $A \subseteq \V$ is the set of variables and
$|A| = m$. Every constraint partitions this space into an included and an
excluded part. In the case of integer arithmetic, the solutions are discrete
points in this space.
An abstract domain is a representation of those sets of points. Usually a value
over-approximates the set of points in order to facilitate operations. Commonly
used abstract domains are Polyhedra \cite{bagnara2005convexpolyhedron},
Octagons \cite{mine2001wcre, mine2006hosc,truchet2010synacs} and
Boxes \cite{gurfinkel2010}.
\subsection{Operations}\label{ssec:ad_operations}
All abstract domains support the following operations with varying computational
and memory requirements: emptiness test, intersection, inclusion test,
projection, upper and lower bounds. There are many more but we will focus only
on the ones important for this thesis.
The test for emptiness is equivalent to asking whether a formula is satisfiable,
as every point represents a solution to the formula. Intersecting two or more
abstract values is equivalent to the conjunction of the corresponding formulas.
\begin{example}\label{ex:intersection}
\begin{figure}[h]
\centering
\begin{subcaptionblock}[t]{0.3\textwidth}
\input{figures/ch2_example1_phi1}
\caption{$\varphi_1$}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{0.3\textwidth}
\input{figures/ch2_example1_phi2}
\caption{$\varphi_2$}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{0.3\textwidth}
\input{figures/ch2_example1_phi3}
\caption{$\varphi_3 = \varphi_1 \land \varphi_2$}
\end{subcaptionblock}
\caption{Graphical representation of the set of constraints $\varphi_1$,
$\varphi_2$ and $\varphi_3$ from Example \ref{ex:intersection}. Every constraint is
represented by a line and the excluded side is marked by
hatchings.\label{fig:ex_intersection} }
\end{figure}
Consider the following two constraints over the variables $\V = \{x, y\}$:
\begin{align*}
\varphi_1 &= 2y \leq x \\
\varphi_2 &= 2y > x^2 - 4
\end{align*}
They are graphically represented in \fref{fig:ex_intersection} using a
2D-coordinate system. Since the space is two-dimensional, the boundary of the
linear constraint $\varphi_1$ is a straight line, and that of $\varphi_2$,
which is a polynomial constraint, is a parabola.
The solutions are marked as circles. For example, the assignment $\sigma: x
\mapsto 0, y \mapsto -2$ (colored red) satisfies $\varphi_1$ but not
$\varphi_2$. The conjunction $\varphi_3 = \varphi_1 \land \varphi_2$ is
equal to the intersection of both sets of solutions. While $\varphi_1$ and
$\varphi_2$ have an infinite number of solutions, the conjunction
$\varphi_3$ does not.
\end{example}
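The finiteness claim at the end of the example can be checked exhaustively. Since $2y > x^2 - 4$ and $2y \leq x$ together force $x^2 - 4 < x$, only $x \in \{-1, 0, 1, 2\}$ can occur, so a small grid provably covers all integer solutions (a sketch, using plain enumeration in place of an abstract domain):

```python
# Integer solutions of phi3 = (2y <= x) /\ (2y > x^2 - 4).
solutions = [(x, y)
             for x in range(-10, 11)
             for y in range(-10, 11)
             if 2 * y <= x and 2 * y > x * x - 4]
print(len(solutions), sorted({x for x, _ in solutions}))
```

The red assignment from the figure, $x \mapsto 0, y \mapsto -2$, is correctly excluded: it satisfies $\varphi_1$ but not $\varphi_2$.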
Geometrically, the projection is the set of assignments found by orthogonally
projecting every satisfying assignment onto the remaining dimensions with
respect to the canonical basis of $\R^m$.
\begin{example}
\begin{figure}
\centering
\input{figures/ch2_projection}
\caption{Projecting a 2D-formula onto the $x$-axis\label{fig:orthogonal_projection}}
\end{figure}
Take the formula $\varphi = y < x + 4 \land y < -x + 4 \land y > x + 1
\land y > -x + 1 $ over the variables $\V = \{x, y\}$, for example. The
solutions to the formula are points in a two-dimensional space. Projecting
the formula $\varphi$ onto the variables $X = \{x\}$ gives the following
formula $\varphi|_X = x \geq -1 \land x \leq 1$.
The projection is equal to casting the shadow of the 2D-polyhedron onto the
$x$-axis as shown in \fref{fig:orthogonal_projection}.
\end{example}
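The shadow from this example can be approximated numerically. In the sketch below, the $y$-axis is sampled on a quarter-integer grid (standing in for the reals, since the polyhedral projection is computed over $\R$); an integer $x$ belongs to the projection iff some sampled $y$ satisfies $\varphi$:

```python
def phi(x, y):
    return y < x + 4 and y < -x + 4 and y > x + 1 and y > -x + 1

ys = [k / 4 for k in range(-40, 41)]   # quarter-integer sample of the y-axis
proj_x = sorted({x for x in range(-10, 11) if any(phi(x, y) for y in ys)})
print(proj_x)   # [-1, 0, 1]
```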
In the following we will give a short introduction to the abstract domains. A
visual example of the three abstract domains is given in
\fref{fig:abstract_domains}.
\begin{figure}[h]
\centering
\begin{subcaptionblock}[t]{0.3\textwidth}
\input{figures/ch2_abstract_domains_box}
\caption{Box domain\label{fig:abstract_domains_box}}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{0.3\textwidth}
\input{figures/ch2_abstract_domains_oct}
\caption{Octagon domain\label{fig:abstract_domains_oct}}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{0.3\textwidth}
\input{figures/ch2_abstract_domains_poly}
\caption{Polyhedra domain\label{fig:abstract_domains_poly}}
\end{subcaptionblock}
\caption{Visual comparison of the Box domain, Octagons and
Polyhedra\label{fig:abstract_domains}}
\end{figure}
\subsection{Convex Polyhedra}\label{ssec:polyhedra}
Every linear non-strict inequality constraint describes a closed affine
half-space and every linear strict inequality constraint describes an open
affine half-space in the Euclidean space $\R^n$. As discussed in
\Sref{sec:constraints}, the strictness of linear constraints, and in turn the
openness of half-spaces, is not relevant in the integer domain.
\begin{definition}[Convex Polyhedra \cite{bagnara2005convexpolyhedron}\label{def:polyhedra}]
$H \subseteq \R^n$ is a \emph{convex polyhedron} if and only if it can be
expressed by an intersection of a finite number of affine half-spaces. Let
$\mathcal{H}$ be the set of all convex polyhedra.
\end{definition}
Polyhedra can represent linear constraints exactly. Non-linear polynomial
constraints are usually just removed, possibly leaving dimensions unbounded in
the process. There are certainly better linear over-approximations to polynomial
constraints, but finding these would go beyond the scope of this thesis.
Polyhedra will be used to compute the projection of sets of constraints.
\subsection{Octagons}\label{ssec:octagons}
The octagon domain was constructed to bridge the performance gap between
intervals and polyhedra \cite{mine2006hosc}.
\begin{definition}[Octagons \cite{mine2006hosc,mine2001wcre}\label{def:octagons}]
$\mathcal{O} \subseteq \R^n$ is an octagon if and only if it can be
expressed by a set of constraints of the form $\pm v_i \pm v_j \leq b$.
\end{definition}
Obviously, such constraints are linear constraints, and hence octagons are a
special case of polyhedra. Octagons are used in \gls{koat} during the generation
of invariants. They are faster to work with than polyhedra but less precise,
since they cannot represent all linear constraints and are forced to
over-approximate some of them.
\subsection{Boxes}\label{ssec:box}
For the sake of completeness, let us introduce a third abstract domain.
A closed interval $[a,b]$ with $a, b \in \R$ for a variable $v \in \V$ can be
expressed as a pair of linear constraints of the form $a \leq v$ and $v \leq b$.
\begin{definition}[Box \cite{gurfinkel2010}\label{def:box}]
    $B \subseteq \R^n$ is a \emph{box} if it can be expressed as a Cartesian
    product $I_1 \times \dots \times I_n$ of intervals $I_1,\dots,I_n$.
\end{definition}
By construction, every box is an octagon and in turn a polyhedron. For every
polyhedron $H \in \mathcal{H}$ one can find a box $B$ that contains the
polyhedron $H$. Boxes are computationally very efficient, although rather
imprecise for most applications.
\section{Probability Theory}\label{sec:probability}
The field of probability and random variables is well studied. Only the core
concepts needed to understand this thesis will be introduced here, following the
definitions from \citeauthor{billingsley2011} \cite{billingsley2011}. For
additional information we refer the reader to the literature
\cite{kraaikamp2005modern, billingsley2011}.
Imagine a probabilistic experiment with several possible outcomes $\Omega$. For
example, a coin toss can have many arbitrary outcomes when taking the position
on the table into account, the fact that the coin might get lost in the process,
etc. A probability measure describes the probabilities of sets of random
outcomes. For example, the coin toss on the table has a subset of outcomes
where the coin's head is uppermost. The probability measure would assign a
probability close to 50\% for a somewhat normal coin toss.
Formally, a probability measure is defined as a function on a $\sigma$-algebra
that asserts the intuitive property of a random event: if one can measure the
probability of two subsets of outcomes (e.g. $A = \{\omega \in \Omega\mid\text{
coin shows heads}\}$ and $B = \{\omega \in \Omega\mid\text{coin shows
tails}\}$), then one can also measure the probability of the union of both
subsets ($A \union B = \set{\omega \in \Omega} {\text{coin shows heads or
tails}}$), and the probability of the combined outcomes is equal to the sum of
the individual probabilities if $A$ and $B$ are disjoint.
\begin{definition}[$\sigma$-Algebra \cite{billingsley2011}]
Let $\Omega$ be an arbitrary set. A set $A \subseteq 2^\Omega$ is called a
$\sigma$-algebra (also known as $\sigma$-field) when
\begin{enumerate}
\item $\Omega \in A$,
\item $A$ is closed under complement, i.e., $B \in A$ implies $B^C \in
A$,
\item $A$ is closed under countable unions, i.e., $A_1, A_2 \dots \in A$
implies $\Union_{n\in\N}A_n \in A$.
\end{enumerate}
\end{definition}
\begin{definition}[Probability Measure \cite{billingsley2011}]\label{def:measure}
A function $\mu: \F \rightarrow [0,1]$ on a $\sigma$-algebra $\F$ is a
probability measure if it satisfies the following conditions:
\begin{enumerate}
\item $\mu(A)\in [0,1]$ for all $A \in \F$,
\item if $A_1, A_2, \dots$ is a disjoint sequence of $\F$-sets, then
\begin{equation*}
\mu\left(\Union_{k=1}^\infty A_k \right) = \sum_{k=1}^\infty
\mu(A_k)
\end{equation*}
\item $\mu(\Omega) = 1$
\end{enumerate}
\end{definition}
A probability space describes the set of all possible outcomes $\Omega$, the
measurable subsets of outcomes $\F$ and the probability $P$ of those measurable
outcomes.
\begin{definition}[Probability space \cite{billingsley2011}]\label{def:probability_space}
A probability space is a triple $(\Omega,\F,P)$ where the
following holds:
\begin{enumerate}
\item $\Omega$ is a non-empty set of all possible outcomes and $\omega
\in \Omega$ denotes an arbitrary possible outcome.
\item $\F \subseteq 2^\Omega$ is a $\sigma$-algebra containing $\Omega$.
\item $P$ assigns a probability to every set in $\F$ and is a
probability measure.
\end{enumerate}
\end{definition}
The concept of random variables is not to be confused with program variables or
variables in formulas. Informally, a random variable maps the outcome $\omega$
of a probabilistic event to a numeric value; that is, the value of the random
variable depends on the outcome of a probabilistic event.
In the following, we will encounter discrete random variables $X : \Omega
\rightarrow \overline{\N}$ with countably many (and possibly infinitely many)
possible values.
\begin{definition}[Random variable \cite{billingsley2011}]\label{def:random_variable}
A random variable on a probability space $(\Omega, \F, P)$ is a function $X
: \Omega \rightarrow \overline{\N}$ with $\set{\omega}{X(\omega) = x} \in \F$ for
every $x \in \overline{\N}$.
\end{definition}
The distribution of a random variable is a way of describing a random variable
and the underlying probability space. In the case of discrete random variables,
the distribution is sufficient to fully describe the random variable. The
underlying probability space becomes irrelevant at this point, as the specific
events and their probabilities are not important to the outcome of the random
variable, as long as the probability of the outcome is fully defined.
\begin{definition}[Distribution of a random variable \cite{billingsley2011}]\label{def:distribution}
Let $X$ be a discrete random variable in a probability space $(\Omega, \F,
P)$. The probability of observing a specific value is defined by the
distribution $\mu: \overline{\N} \rightarrow [0,1]$ as follows:
\begin{align*}
\mu(r) = P(X=r) = P(\set{\omega \in \Omega}{X(\omega) = r})
\end{align*}
\end{definition}
\begin{definition}[Support \cite{billingsley2011}]
For a random variable with distribution $\mu$, the support is the smallest
set $S$ with $\sum_{x \in S} \mu(x) = 1$. Discrete random variables have a
countable support:
\begin{align*}
    \text{supp}(\mu) = \set{x \in \overline{\N}}{\mu(x) > 0}
\end{align*}
\end{definition}
When considering a random variable, one is usually interested in the value one
can expect on average. For example, most possible outcomes of a binomial
distribution are very improbable. This is captured by the expected value.
\begin{definition}[Expected value \cite{billingsley2011,
meyer2021tacas}]\label{def:expected_value}
The expected value of a discrete random variable $X$ on a probability space
$(\Omega, \F, P)$ is the weighted sum of all values of $X$ with respect to the
measure $P$:
\begin{align*}
    \E(X) &= \sum_{r \in \overline{\N}} r \cdot P(X = r)
    = \sum_{r=1}^{\infty} P(X \geq r)
\end{align*}
If the sum diverges, we set $\E(X) = \infty$.
\end{definition}
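That both sums agree can be checked numerically, here for a geometric distribution with $p = 0.4$ (a sketch with the infinite sums truncated at $N$ terms; the truncation error is negligible, and both sums approach $1/p = 2.5$):

```python
# E(X) = sum_k k * P(X = k) = sum_{r >= 1} P(X >= r) for a geometric
# random variable; P(X >= r) = (1 - p)^(r - 1).
p, N = 0.4, 5000
pmf = lambda k: (1 - p) ** (k - 1) * p       # P(X = k), k = 1, 2, ...
e_direct = sum(k * pmf(k) for k in range(1, N))
e_tail = sum((1 - p) ** (r - 1) for r in range(1, N))
print(e_direct, e_tail)
```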
There are many common probability distributions. \gls{koat} uses five of them:
the Bernoulli, uniform, geometric, hypergeometric, and binomial distributions.
They will be briefly introduced in this section, following the definitions from
\citetitle{kraaikamp2005modern} \cite{kraaikamp2005modern}. Examples of the
various probability distributions are displayed in \fref{fig:distributions}.
\subsection{Bernoulli Distribution \cite{kraaikamp2005modern}}\label{ssec:bernoulli}
The Bernoulli distribution is a distribution with two possible outcomes: $1$ and
$0$, for example decided by a coin toss.
\begin{definition}[Bernoulli distribution]\label{def:bernoulli}
A discrete random variable $X$ follows a \textit{Bernoulli distribution}
parameterized by $p \in [0,1]$ if its probability is given by $\mu(1) =
P(X=1) = p$ and $\mu(0) = P(X=0) = 1-p$.
\end{definition}
The support of a Bernoulli Distribution is as follows:
\begin{align*}
\text{supp}(\textsc{Bern}(p)) = \begin{cases}
\{0\} & p = 0 \\
\{1\} & p = 1 \\
\{0, 1\} & \text{otherwise}
\end{cases}
\end{align*}
\subsection{Uniform Distribution \cite{kraaikamp2005modern}}\label{ssec:uniform}
The uniform distribution is a distribution where every value in a certain range
has the same probability.
\begin{definition}[Uniform distribution]\label{def:uniform}
A discrete random variable $X$ follows a \textit{discrete uniform
distribution} on the interval $[a, b]$, if its probability is given by
\begin{align*}
    P(X=k) &= \begin{cases}
        \frac{1}{b-a+1} & \text{if } a \leq k \leq b \\
        0 & \text{otherwise}
    \end{cases} & \text{for } k\in\N
\end{align*}
\end{definition}
The support of a uniform distribution is as follows:
\begin{align*}
\text{supp}(\textsc{Unif}(a, b)) = \{a, a+1, \dots, b-1, b\}
\end{align*}
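A quick sanity check of the pmf (a sketch using exact rational arithmetic): the $b - a + 1$ values $a, \dots, b$ each get probability $\frac{1}{b-a+1}$, so the probabilities sum to exactly $1$ and the support matches the set above:

```python
from fractions import Fraction

def unif_pmf(a, b, k):
    """P(X = k) for the discrete uniform distribution on [a, b]."""
    return Fraction(1, b - a + 1) if a <= k <= b else Fraction(0)

a, b = 2, 6
support = [k for k in range(a - 2, b + 3) if unif_pmf(a, b, k) > 0]
total = sum(unif_pmf(a, b, k) for k in range(a - 2, b + 3))
print(support, total)
```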
\subsection{Geometric Distribution \cite{kraaikamp2005modern}}\label{ssec:geom}
The geometric distribution is related to the Bernoulli distribution. An
independent Bernoulli experiment is repeated until the first success is
encountered. The number of trials needed follows a geometric distribution.
\begin{definition}[Geometric distribution]\label{def:geom}
A discrete random variable $X$ follows a \textit{geometric distribution}
parameterized by $p$ (the success probability of the repeated Bernoulli
event) with $0 < p \leq 1$, if its probability is given by
\begin{align*}
\mu(k) = P(X=k)&=(1-p)^{k-1}p & \text{for } k = 1, 2, \dots
\end{align*}
\end{definition}
The support of a geometric distribution is as follows:
\begin{align*}
\text{supp}(\textsc{Geom}(p)) = \N^+
\end{align*}
\subsection{Hypergeometric Distribution \cite{kraaikamp2005modern}}\label{def:hyper}
The hypergeometric distribution describes the number of successes when sampling
without replacement from a population containing a certain number of successes.
\begin{definition}[Hypergeometric distribution]
A discrete random variable $X$ follows a \textit{hypergeometric
distribution} with a population of size $N$, $K < N$ successes therein, and
$n < N$ samples, if its probability is given by
\begin{align*}
    \mu(k) = P(X=k) &= \frac{\binom{K}{k} \binom{N-K}{n-k}}{\binom{N}{n}} & \text{for
    } k = 0, \dots, n
\end{align*}
\end{definition}
The support of a hypergeometric distribution is as follows:
\begin{align*}
    \text{supp}(\textsc{Hyper}(N, K, n)) = \{\max(0, n+K-N), \dots, \min (n, K)\}
\end{align*}
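The pmf and its support can be checked with a short sketch, using the same parameters as the example in \fref{fig:distributions} ($N=12$, $K=8$, $n=5$): the probabilities sum to $1$ and the support indeed starts at $\max(0, n+K-N)$:

```python
from math import comb

def hyper_pmf(N, K, n, k):
    """P(X = k): n draws without replacement from a population of N
    containing K successes."""
    if k < max(0, n + K - N) or k > min(n, K):
        return 0.0
    return comb(K, k) * comb(N - K, n - k) / comb(N, n)

N, K, n = 12, 8, 5
support = [k for k in range(n + 1) if hyper_pmf(N, K, n, k) > 0]
total = sum(hyper_pmf(N, K, n, k) for k in range(n + 1))
print(support, total)
```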
\subsection{Binomial Distribution}
The binomial distribution is similar to the geometric distribution, except that
it counts the number of successes among a fixed number of independent Bernoulli
events.
\begin{definition}[Binomial distribution \cite{kraaikamp2005modern}]\label{def:bino}
A discrete random variable $X$ follows a \textit{binomial distribution} with
$n$ Bernoulli events of a probability $p$, with $0 < p < 1$, if its
probability is given by
\begin{align*}
    \mu(k) = P(X=k)&=\binom{n}{k} p^k (1-p)^{n-k} & \text{for } k = 0,\dots,n
\end{align*}
\end{definition}
The support of a binomial distribution is as follows:
\begin{align*}
    \text{supp}(\textsc{Binom}(n, p))=\{0,\dots,n\}
\end{align*}
\begin{figure}
\centering
\begin{subcaptionblock}[t]{0.3\textwidth}
\centering
\input{figures/ch2_distributions_bern}
\caption{Bernoulli $p = 0.25$}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{.3\textwidth}
\centering
\input{figures/ch2_distributions_unif}
\caption{Uniform $a = 2, b=6$}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{.3\textwidth}
\centering
\input{figures/ch2_distributions_geom}
\caption{Geometric $p = 0.4$}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{.35\textwidth}
\input{figures/ch2_distributions_hyper}
\caption{Hypergeometric $N=12,$ \\$K=8, n=5$}
\end{subcaptionblock}
\begin{subcaptionblock}[t]{.35\textwidth}
\centering
\input{figures/ch2_distributions_bino}
\caption{Binomial $p=0.4, n=8$}
\end{subcaptionblock}
\caption{Examples of common probability distributions. \label{fig:distributions}}
\end{figure}
\subsection{Polynomials over Distributions}\label{ssec:prob_update}
Let $\D$ be the set of probability distributions. $\Z[\V\union \D]$ is
the set of integer polynomials over variables from $\V$ and probability
distributions from $\D$. They behave exactly like normal polynomials, except
that the values of the distributions are sampled probabilistically instead of
being given as an argument. As such, we can define a function $[\cdot]:
\Z[\V\union \D] \rightarrow \Sigma \rightarrow \Z \rightarrow [0,1]$, where
$[\varf{}](s)(k)$ is the probability that the polynomial $f \in \Z[\V \union
\D]$ evaluates to the value $k\in\Z$ under the assignment $s \in \Sigma$.
The values of the variables are fixed by a syntactical substitution where every
variable $x_i\in\V$ is replaced by the value $s(x_i)$ fixed in the assignment;
we write $[x_i/s(x_i)]$. The resulting polynomial
$\varf{}[x_i/s(x_i)]_{x_i\in\V}$ describes a random variable $[\varf{}](s)$ that
takes a value at random depending only on the distributions $D_1, \dots, D_m$
occurring in $f$. The random variable has the following probability distribution:
\begin{align*}
    [\varf{}](s)(k) &= \sum_{d \in \Z^m} D_1(d_1)
    \cdot D_2(d_2) \cdots D_m(d_m) \cdot \begin{cases}
        1, & \text{if } \varf{}[x_i /s(x_i)]_{x_i\in\V}[D_i/d_i]_{1\leq i\leq m} = k \\
        0, & \text{otherwise}
    \end{cases}\\
    & = P([\varf{}](s) = k)
\end{align*}
\begin{example}\label{ex:prob_polynomial}
Consider the polynomial $f = x^2 + \textsc{Unif}(3,5)$ and the assignment
$s(x) = 2$. $[\varf{}](s)$ is the random variable described by the
polynomial $\varf[x/2]
=2^2 + \textsc{Unif}(3,5)$.
It can take three different values with the following probability
\begin{equation*}
[\varf{}](s)(k) = \begin{cases}
\frac{1}{3}, & \text{if } k \in \{7,8,9\}\\
0, & \text{otherwise}
\end{cases}
\end{equation*}
\end{example}
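The distribution in this example can be recomputed exhaustively; the sketch below sums over the values the single distribution in $f = x^2 + \textsc{Unif}(3,5)$ can take, mirroring the defining sum with exact rational probabilities:

```python
from fractions import Fraction

# Distribution of [f](s) for f = x^2 + Unif(3, 5) and s(x) = 2.
unif = {3: Fraction(1, 3), 4: Fraction(1, 3), 5: Fraction(1, 3)}
s = {"x": 2}

dist = {}
for d, prob in unif.items():
    k = s["x"] ** 2 + d          # f[x/2][Unif(3,5)/d]
    dist[k] = dist.get(k, Fraction(0)) + prob
print(dist)
```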
\begin{definition}[Probabilistic update]\label{def:prob_update}
Similar to Definition \ref{def:update}, a probabilistic update is described
by a function $\eta: A \rightarrow \Z[\V\union\D]$ that maps every variable
$x\in A$ to a probabilistic polynomial over $\V$ and $\D$, and is undefined
otherwise. The value after the update is determined by evaluating the
probabilistic polynomial.
\end{definition}
The support of the update $\eta : A \rightarrow \Z[\V\union \D]$ on an assignment
$s \in \Sigma$ is the set $\support_s(\eta)=\set{s' \in \Sigma}
{[\eta(x)](s)(s'(x)) > 0 \text{ for all } x\in A }$.
\subsection{Non-probabilistic over-approximation}
The value of a random variable can be approximated by a constraint for the
bounds of the support of the probability distribution. For any probability
distribution $D \in \D$ and a variable $d \in \V$, we define
\begin{equation*}
    \lfloor D \rceil_d = \begin{cases}
        \min (\support(D)) \leq d \leq \max (\support(D)) & \begin{aligned}&\text{if }
            \min(\support(D)) \text{ and } \\&\max (\support(D)) \text{ are
            known},\end{aligned} \\
        \min (\support(D)) \leq d & \text{if } \min(\support(D))
        \text{ is known}, \\
        d \leq \max(\support(D)) & \text{if } \max(\support(D)) \text{ is known},\\
        \texttt{true} & \text{otherwise}.
    \end{cases}
\end{equation*}
\begin{rem}
\gls{koat} supports only the distributions presented in the previous
section. Hence, those constraints are known and easy to compute for all
distributions which we encounter in practice.
\end{rem}
A probabilistic polynomial describes a random variable and thus evaluates to a
random value. This value can be over-approximated by a non-probabilistic
polynomial with fresh variables, together with a constraint on those newly
introduced variables.
\begin{definition}[Non-probabilistic over-approximation of probabilistic
polynomials]
    Let $A\subset\V$ and $f \in \Z[A\union\D]$ be a probabilistic polynomial
    with probability distributions $D_1,\dots,D_m$ used within $f$. Then the
    non-probabilistic over-approximation of a probabilistic polynomial is
    defined as the tuple
    $(\lfloor\varf\rceil,\tilde{\varf}) \in \C \times \Z[A \uplus \{d_1,\dots,d_m\}]$ with
    \begin{align*}
        \lfloor \varf \rceil = \texttt{lin}(\LAnd_{i=1,\dots,m} \lfloor D_i
        \rceil_{d_i}) \text{ and } \tilde{\varf} = \varf{}[D_i/d_i]_{1\leq i\leq m}
    \end{align*}
    with fresh variables $d_1,\dots,d_m \in (\V \backslash A)$.
\end{definition}
\begin{example}
Consider the same probabilistic polynomial as in Example
\ref{ex:prob_polynomial}: $f = x^2 + \textsc{Unif}(3,5)$. The distribution
support of the uniform distribution is bounded in both directions.
\begin{align*}
\min(\support(\textsc{Unif}(3,5))) &= 3 \\
\max(\support(\textsc{Unif}(3,5))) &= 5 \\
\end{align*}
Hence, a variable $d$ assigned from the distribution satisfies the constraint
$\lfloor \varf\rceil = \lfloor \textsc{Unif}(3,5)\rceil_d = 3 \leq d \leq
5$. The distribution in the probabilistic polynomial is replaced by the
variable $d$. We obtain the new approximated polynomial $\tilde{\varf} = x^2 +
d$.
\end{example}
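The construction of $(\lfloor\varf\rceil, \tilde{\varf})$ is mechanical: each distribution is renamed to a fresh variable, and its known support bounds become constraints. The following Python sketch illustrates this under the assumption that a polynomial is encoded as a list of summand strings (this encoding is an assumption of the illustration, not \gls{koat}'s representation):

```python
def over_approximate(poly_terms, dist_bounds):
    """Replace each distribution by a fresh variable d_i and collect the
    constraint floor(D_i)_{d_i} from the known support bounds.
    `dist_bounds` maps a distribution name to (min, max); None marks an
    unknown bound, which yields no constraint in that direction."""
    constraints, renaming = [], {}
    for i, (name, (lo, hi)) in enumerate(dist_bounds.items(), start=1):
        d = f"d{i}"          # fresh variable for the i-th distribution
        renaming[name] = d
        if lo is not None:
            constraints.append(f"{lo} <= {d}")
        if hi is not None:
            constraints.append(f"{d} <= {hi}")
    # the approximated polynomial: distributions substituted by fresh vars
    approx = [renaming.get(term, term) for term in poly_terms]
    return constraints, approx

# f = x^2 + Unif(3,5), written here as a list of summands
constraints, approx = over_approximate(["x^2", "Unif(3,5)"],
                                       {"Unif(3,5)": (3, 5)})
# constraints == ["3 <= d1", "d1 <= 5"], approx == ["x^2", "d1"]
```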
A probabilistic update can be over-approximated by using the non-probabilistic
over-approximation of all the probabilistic polynomials used in the update.
\begin{definition}[Non-probabilistic over-approximation\label{def:nonprobapprox}]
    Let $A,B\subseteq\V$ be sets of variables and $\eta: A \rightarrow \Z[B\union
    \D]$ be a probabilistic update function. A \textit{non-probabilistic
over-approximation} of the update $\eta$ is a pair $(\lfloor\eta\rceil,
\tilde{\eta}) \in \C \times (A \rightarrow \Z[\V])$ where
\begin{align*}
\lfloor\eta\rceil = \LAnd_{x\in A} x' = \tilde{\eta}(x) \Land
\lfloor\eta(x)\rceil \\
\tilde{\eta}(x) = x' \hspace{1cm} \text{for all } x\in A
\end{align*}
    Without loss of generality, the fresh variables introduced by the
    over-approximation of each probabilistic polynomial can be chosen such that
    they do not appear in the other over-approximations.
\end{definition}
\begin{lemma}\label{lem:prob_update}
    For any assignments $s,s'\in\Sigma$, $A \subset \V$, probabilistic update
    $\eta: A \rightarrow \Z[\V \union \D]$, and formula $\varphi \in \C$, if $s'
\in \support_s(\eta)$ and $s \models \varphi$ then $s' \models
\tilde{\eta}(\varphi \Land \lfloor\eta\rceil)$.
\begin{proof}
Assume $s' \in \support_s(\eta)$ and $s \models \varphi$.
        We define a new assignment $\tilde{s} = s$ that will be expanded later
        on. For all $x \in A$, $[\eta(x)](s)(s'(x)) > 0$ holds. Let
        $D_1,\dots,D_m \in \D$ be the distributions within $\eta(x)$.
\begin{align*}
0 &< [\eta(x)](s)(s'(x)) \\
&= \sum_{a \in \Z^m} D_1(a_1) \cdot D_2(a_2) \cdots D_m(a_m) \cdot
\begin{cases}
1, & \text{if } \eta(x)[x_i/s(x_i)][D_i/a_i] =
s'(x) \\
0, & \text{otherwise}
\end{cases}
\end{align*}
There must exist $a_1,\dots, a_m\in \Z$ such that
\begin{equation*}
0 < D_1(a_1) \cdot D_2(a_2) \cdots D_m(a_m)\text{ and }
\eta(x)[x_i/s(x_i)][D_i/a_i] = s'(x)
\end{equation*}
Without loss of generality, let $d_i$ for $i=1,\dots,m$ be new variables
that are undefined by $\tilde{s}$. We expand the assignment $\tilde{s}$
as follows:
        \begin{equation*}
            \tilde{s}(d_i) = a_i \text{ for } i=1,\dots,m
        \end{equation*}
        By construction, $\tilde{s}$ coincides with $s$ on all variables except
        the fresh variables $d_1,\dots,d_m$. Since $d_1,\dots,d_m$ do not occur
        in $\varphi$ and $s \models \varphi$, also $\tilde{s} \models \varphi$.
        Moreover, the following holds:
        \begin{align*}
            &D_i(a_i) > 0 \text{ for all } i=1,\dots,m \\
            \Rightarrow& \tilde{s} \models \lfloor D_i \rceil_{d_i} \text{ for all } i
            =1,\dots,m \\
            \Leftrightarrow& \tilde{s}\models \LAnd_{i=1,\dots,m} \lfloor D_i \rceil_{d_i} \\
            \Leftrightarrow& \tilde{s}\models \lfloor \eta(x) \rceil
        \end{align*}
Since the above holds for all $x \in A$,
\begin{equation*}
\tilde{s} \models \LAnd_{x \in A} \lfloor \eta(x) \rceil = \lfloor
\eta \rceil
\end{equation*}
        Lemma \ref{lem:nonprop_update} then yields $s' \models
        \tilde{\eta}(\varphi \Land \lfloor\eta\rceil)$.
\end{proof}
\end{lemma}
\section{Probabilistic Integer Programs}\label{sec:pip}
In general, an \textit{integer program} is a program that only uses variables
with integer values. Many similar mathematical structures have been developed
over the years to formalise integer programs. The one used in this thesis is the
integer transition system. Before giving a formal definition of probabilistic
integer programs, the various concepts of integer programs are introduced
intuitively in a less formal way.
An integer transition system represents a program as locations and transitions
between those locations. A run of a program is a sequence of configurations
where every configuration contains the location, which represents the program
counter, and a state that holds the current values of the program's
variables. A transition has a condition $\tau$, called its \textit{guard}, which
must be satisfied by the current state for the transition to be taken. Whenever a transition is taken
it modifies the state with some \textit{update} $\eta$. An update assigns a new
value to each program variable depending on the previous state. Every integer
program starts at a unique location usually called $\ell_0$. Whenever no
transition guard is satisfied the program terminates. The arguments to the
program are given by some starting state $s_0$, assigning a value for every
program variable.
We use a graphical representation of programs, as shown in \fref{fig:ex_pip_nd}.
Updates assign a value to every program variable; however, for better
readability, updates that preserve the value of a program variable are
omitted.
Non-deterministic programs contain mutually non-exclusive transitions leaving the
same location. Hence it is unclear which transition is taken at runtime.
\begin{example}\label{ex:nd_branching}
\begin{figure}[h]
\centering
\input{figures/ch2_ex_pip_nd.tex}
\caption{A non-deterministic integer program over the variables $\V =
\{x\}$.\label{fig:ex_pip_nd}}
\end{figure}
Take the program displayed in \fref{fig:ex_pip_nd} for example. At location
$\ell_1$ with the state $s(x) = 3$, both transitions $t_2$ and $t_3$ are
    possible, since both of their guards $x > 0$ and $x > 1$ are satisfied. At
    runtime, the decision of which transition to take is arbitrary
    (non-deterministic).
\end{example}
A concept similar to non-determinism (where the decision is fully arbitrary) is
to assign a probability to each transition. Instead of picking a transition
fully arbitrarily, the transitions are picked with their respective probability.
Although similar, non-deterministic and probabilistic branching are treated
separately. In order to allow both concepts in a program, one introduces the
concept of \textit{general transitions}: sets of (conventional) transitions
whose probabilities add up to 1, acting as a single transition for the
resolution of non-determinism. Non-determinism can only occur between general
transitions and is resolved before the probabilities are; moreover, every
transition in a general transition must have the same guard.
\begin{example}\label{ex:prob_branching}
\begin{figure}[h]
\centering
\input{figures/ch2_ex_pip_prob.tex}
\caption{A probabilistic (and non-deterministic) integer program over
the variables $\V= \{x\}$. \label{fig:ex_pip_prob}}
\end{figure}
Take the program displayed in \fref{fig:ex_pip_prob} as an example. The
transitions $t_2$ and $t_3$ are part of a general transition (marked in
red). For simplicity, all the other transitions are unique members of their
own general transition and the trivial probability of 1 is omitted. At
location $\ell_1$ with a variable assignment $s(x) = 3$, three transitions
    $t_2$, $t_3$ and $t_4$ are possible. The non-determinism is resolved first.
    Then, if the general transition containing $t_2$ and $t_3$ is selected, a
    coin is tossed to decide whether to take $t_2$ or $t_3$.
\end{example}
Besides probabilistic branching, variable assignments can be non-deterministic
or probabilistic, which is called non-deterministic or probabilistic sampling.
Non-deterministic sampling occurs when on a transition a variable is updated to
an expression that contains unbound variables (variables that are not part of
the set of program variables). The unbound variables are sampled arbitrarily
from all the integers, but can be constrained by the guard, in which case the
transition is only taken if the non-deterministic sampling fulfils the guard. If
the sampled value fulfils no transition's guard, the program terminates.
Non-deterministic sampling is resolved before non-deterministic branching.
\begin{example}\label{ex:nd_sampling}
\begin{figure}[h]
\centering
\input{figures/ch2_ex_pip_nds.tex}
        \caption{An integer program with non-deterministic sampling over
the variables $\V= \{x, u\}$ with program variables $\PV=\{x\}$.
\label{fig:ex_pip_nds}}
\end{figure}
    Take the program displayed in \fref{fig:ex_pip_nds} as an example. The
    transition $t_2$ contains an update with a temporary variable $u$, which is
    constrained to $1 \leq u \leq 3$ by the guard. The value of the temporary
    variable $u$ in the current state is irrelevant; it can take an arbitrary
    value for the purpose of fulfilling the guard and for use in the update.
    Whenever the temporary variable is sampled to a value that does not satisfy
    the guard of $t_2$ while $x$ is larger than zero, the program terminates.
\end{example}
Probabilistic sampling is done using probability distributions (recall
\Sref{sec:probability}). Updates are expanded to integer polynomials with
variables and distributions. The value of the indeterminate in the polynomial is
equal to the (possibly temporary) variable or is sampled from the distribution.
\begin{example}\label{ex:prob_sampling}
\begin{figure}[h]
\centering
\input{figures/ch2_ex_pip_probs.tex}
        \caption{An integer program with probabilistic sampling over
the variables $\V= \{x\}$ with program variables $\PV=\{x\}$.
\label{fig:ex_pip_probs}}
\end{figure}
    Take the program depicted in \fref{fig:ex_pip_probs} as an example. In
    transition $t_2$, the variable $x$ is updated with the polynomial $x -
    \textsc{Bern}(0.5)$. The Bernoulli distribution takes the value 0 or 1, each
    with a probability of 0.5. If the distribution is sampled to 1, the variable
    $x$ is decremented; otherwise it stays the same.
\end{example}
Deterministic non-probabilistic programs are a special case of probabilistic
programs, where every general transition consists of a single transition, all
general transitions are mutually exclusive, and no non-deterministic or
probabilistic sampling is done in the transitions. Similarly, non-deterministic
non-probabilistic programs are a special case of probabilistic programs where
every general transition contains only a single transition and no probabilistic
sampling is allowed.
This thesis will closely follow the formal definitions of E. Meyer
\cite{meyer2021tacas}. First the formal definition for a \acrfull{pip} is given,
then after a small excursion to Markov decision processes, the semantics are
defined.
Let $\PV \subset \V$ be a finite set of program variables. Let $\Loc$ be a set
of locations, and $\T = \Loc \times [0,1] \times \C \times (\PV \rightarrow
\Z[\V\union\D]) \times \Loc$ be the set of transitions over locations
$\Loc$. Let $\GT = 2^{\T}$ be the set of general transitions over $\Loc$.
\begin{definition}[Probabilistic Integer Program \cite{meyer2021tacas}]\label{def:pip}
$(\PV, \Loc_\Prog, \GT_\Prog, \ell_0)$ is a probabilistic integer transition
system where:
\begin{enumerate}
\item $\PV$ is a finite set of program variables,
\item $\Loc_\Prog \subseteq \Loc$ is a finite non-empty set of locations,
\item $\GT_\Prog \subseteq \GT$ is a finite set of general transitions,
            where a general transition $g \in \GT_\Prog$ is a finite non-empty
            set of transitions $t = (\ell, p, \tau, \eta, \ell') \in \T_\Prog
            \subseteq \T$ with:
\begin{enumerate}
\item a source and target location $\ell,\ell' \in \Loc$,
\item a probability $p \geq 0$ that the transition is taken
when the corresponding general transition $g$ is executed,
                \item a guard $\tau \in \C$,
                \item an update function $\eta : \PV \rightarrow \Z[\V \cup \D]$,
                    mapping every program variable to an integer polynomial
                    over variables and distribution functions.
            \end{enumerate}
            All transitions $t \in g$ must share the same start location and
            the same guard; we write $\ell_g$ for the common start location
            and $\tau_g$ for the common guard of $g$. Moreover, the
            probabilities of the transitions in $g$ must add up to $1$.
\item $\ell_0 \in \Loc$ is the start location of the program.
\end{enumerate}
\end{definition}
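The well-formedness conditions on a general transition (shared start location, shared guard, probabilities summing to 1) can be checked mechanically. A small Python sketch, with transitions encoded as tuples $(\ell, p, \tau, \eta, \ell')$ purely for illustration:

```python
def check_general_transition(g):
    """Well-formedness of a general transition: all member transitions
    share the same source location and the same guard, and their
    probabilities sum to 1. A transition is (source, p, guard, update,
    target)."""
    sources = {t[0] for t in g}
    guards = {t[2] for t in g}
    total = sum(t[1] for t in g)
    return len(sources) == 1 and len(guards) == 1 and abs(total - 1.0) < 1e-9

# a fair coin between decrementing x and keeping it unchanged
coin = [("l1", 0.5, "x > 0", "x := x - 1", "l1"),
        ("l1", 0.5, "x > 0", "x := x", "l1")]
# probabilities sum to 0.5 only, so this set is not a general transition
bad = [("l1", 0.5, "x > 0", "x := x - 1", "l1")]
```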
Before defining the semantics of a \gls{pip}, a new special location $\ell_\bot$
is introduced that will be used to indicate the termination of a program as well
as a virtual transition $t_\bot$ and general transition $g_\bot = \{t_\bot\}$
with the same purpose. Instead of just terminating, a program takes the
transition $g_\bot$ to the location $\ell_\bot$ if and only if no other general
transition's guard is satisfied. Since no normal transition leaves the location
$\ell_\bot$, the run loops indefinitely on the location $\ell_\bot$. In addition, a
new transition $t_\text{in}$ is added for the start of the program.
A state $s$ is a full variable assignment for the variables of a program $s: \V
\rightarrow \Z$. $\Sigma$ is the set of all states. The set of configurations is
defined as $\confs_\Prog = (\Loc_\Prog \union \{\ell_\bot\}) \times (\T_\Prog
\union \{t_{\text{in}}, t_\bot\}) \times \Sigma$. Note that in addition to the
location and state, the last transition taken to reach the current location is
remembered as well. This will not be relevant for the partial evaluation, but it
is important for defining the probabilistic properties of a \gls{pip}.
A run of the program is an \textit{infinite} sequence of configurations, and the
set of all runs is defined as $\runs_\Prog = \confs_\Prog^\omega$. In contrast,
a finite prefix is a finite sequence of configurations. The set of all finite
prefixes is defined as $\fpath_\Prog = \confs_\Prog^*$.
\begin{rem}
The set of configurations does not exclude invalid combinations of
locations and transitions, that is, a configuration with a location that is
not the target of the incoming transition, or a state that is impossible to
    reach via the incoming transition. Similarly, the set of runs and finite
    prefixes does not exclude invalid paths, e.g., two consecutive configurations
    that cannot be connected by any transition. The validity of a path
will be described by a probability measure where invalid paths have a zero
probability.
\end{rem}
\subsection{Probabilistic Integer Program as a Markov Decision Process}
We will first introduce the Markov decision process in general and then apply
the concept to a run of a \gls{pip}. In general, a Markov decision process
contains five parts \cite{puterman1994markov}:
\begin{enumerate}
\item \textit{decision epochs}, which are points in time at which a decision is
made;
\item \textit{states} the decision process is in at a decision epoch;
\item \textit{actions} that are selected by the decision process;
    \item \textit{transition probabilities} that describe the probabilities of the
        states reached after a certain action is taken; and finally
\item \textit{rewards} that are accumulated over the whole decision process
with every action.\end{enumerate}
One can easily see the similarities to runs in a \gls{pip}. The steps of a
program are the decision epochs of the Markov decision process. The states of
the program are the states of the decision process. Non-deterministic sampling
and non-deterministic branching are similar to choosing an action, and the
transition probabilities describe the probabilities of the probabilistic
branching and sampling.
Maximising the reward is the optimisation goal when taking decisions in a Markov
decision process. One looks for a decision policy that yields the highest
accumulated reward over the whole process. In the literature
\cite{puterman1994markov, puterman1990markov}, the properties of a given Markov
decision process with regard to the highest achievable rewards have been
extensively researched.
The goal of termination analysis is to find upper bounds for the runtime
complexity. If one defines runtime as a reward, one can picture the policy for a
Markov decision process as an adversary. We call the policy the
\enquote{scheduler}.
An execution of a program corresponds to a Markov decision process in the
following way:
\begin{enumerate}
    \item The decision epochs correspond to the steps of the program. There are
        infinitely many of them; hence this is an infinite-horizon Markov
        decision process.
\item The states are the configurations $\confs_\Prog$ of the program.
    \item The actions are the selection of a general transition together with a
        full assignment that satisfies the guard of the selected general
        transition, or the terminating transition.
    \item The transition probabilities, which will be defined later.
\item The reward is 1 for every non-terminating transition and 0 for the
terminating transition $t_\bot$.
\end{enumerate}
\begin{rem}
In general, at every decision epoch one can use a different decision rule,
described by a policy. However, we will only consider so-called stationary
policies, where the policy uses the same decision rule at every decision
epoch.
\end{rem}
We differentiate between two types of schedulers: the first is a scheduler that
at a given state always takes the same decision -- it has no memory of its past
decisions and is not influenced by them. It is called a \emph{Markovian
Scheduler}.
\begin{definition}[Markovian scheduler \cite{meyer2021tacas}\label{def:mscheduler}]
A function $\scheduler : \confs_\Prog \rightarrow (\GT_\Prog \uplus \{g_\bot\}) \times \Sigma$ is a
Markovian (memoryless) scheduler if for every configuration $c = (\ell,t,s) \in
\confs_\Prog$, $\scheduler(c) = (g, s')$ implies:
\begin{enumerate}
\item \label{itm:md1} $s'(x) = s(x)$ for all $x \in \PV$
\item $\ell$ is the start location $\ell_g$ of the general transition $g$.
\item \label{itm:md3} $s' \models \tau_g$ for the guard $\tau_g$ of the
general transition $g$.
    \item $g = g_\bot$ and $s' = s$, if $\ell=\ell_\bot$, $t=t_\bot$, or no $g'
        \in \GT$ and $s'\in \Sigma$ satisfy \cref{itm:md1} to \cref{itm:md3}.
        \label{itm:md4}
\end{enumerate}
The set of all Markovian schedulers for a program $\Prog$ is denoted by
$\MDS_\Prog$, where \enquote{MD} stands for \enquote{\underline{M}arkovian
\underline{d}eterministic}.
\end{definition}
\begin{rem}
Although the decision rule selecting the action (non-deterministic sampling
and branching) is deterministic, the new state is not. After selecting the
action, the new state is determined by the probability distribution
described by the \textit{transition probabilities}.
\end{rem}
The second type is a scheduler that remembers its past decisions. It is called
a history dependent scheduler.
\begin{definition}[History dependent scheduler]
    A function $\scheduler : \fpath_\Prog \rightarrow (\GT_\Prog \uplus
    \{g_\bot\}) \times \Sigma$ is a history dependent scheduler if for every
    finite prefix $f\in\fpath_\Prog$ and configuration $c=(\ell,t,s) \in \confs_\Prog$,
    $\scheduler(\varf\!c) = (g, s')$ implies items \ref{itm:md1} to \ref{itm:md4} of
    Definition \ref{def:mscheduler} for the last configuration $c$.
The set of all history dependent schedulers for a program $\Prog$ is denoted
by $\HDS_\Prog$, where \enquote{HD} stands for \enquote{\underline{h}istory
dependent \underline{d}eterministic}.
\end{definition}
\subsection{Probability Space of a Program\label{ssec:probabilityspace}}
For a \gls{pip} one can define a probability space $(\runs_\Prog, \F, \PrSs)$,
where the outcomes of the probabilistic experiment are the runs on the program.
Every distinct run is measurable; hence the $\sigma$-algebra $\F$ is defined to
contain every singleton set $\{r\} \in \F$ for $r \in \runs_\Prog$. The
probability measure $\PrSs$ describes the probability of a given run for a
scheduler $\scheduler$ and the input given to the program as the initial state $s_0$.
Formally the probability space is defined by a cylindrical construction
expanding the length of finite prefixes up to infinity. For the detailed
construction we refer the reader to \citeauthor{meyer2021tacas} \cite{meyer2021tacas}.
On any given configuration, a Markovian scheduler resolves the non-determinism
first. The probability measure will use the result of the scheduler instead of
the previous configuration. The scheduler keeps the values of the program
variables from the previous configuration. It selects a general transition $g$
and a variable assignment $\tilde{s}$ satisfying the guard of the general
transition. If no such transition is found or the program is already in the
terminating state $\ell_\bot$ it takes the special transition $g_\bot$,
indicating termination.
Let $\scheduler \in \MDS_\Prog$ be a scheduler and $s_0 \in \Sigma$ be an
initial state. First, the probability for a program to start at a configuration
$c$ is 1 only for the configuration that has the initial state $s_0$ at the
start location $\ell_0$ coming from the initial transition $t_\text{in}$. All
other configurations are invalid starting configurations and they are assigned
zero probability by the probability measure $\prSs : \confs_\Prog \rightarrow
[0,1]$:
\begin{equation*}
\prSs(c) = \begin{cases}
        1 & \text{ if } c = (\ell_0, t_\text{in}, s_0) \\
0 & \text{ otherwise}
\end{cases}
\end{equation*}
Let us say that the program ran up to a configuration $c=(\ell,t,s)$. The
scheduler resolves the non-deterministic sampling and branching to
$\scheduler(c) = (g, \tilde{s})$. The probability for the event $c \rightarrow
c'$ that $c'$ follows the configuration $c$ with $c' = (\ell', t', s')$ and $t'
= (\ell, p, \tau, \eta, \ell') \in g$ depends on three things: first, the
transition $t' \in g$ is taken with the probability $p$; second, the probability
that each program variable $x\in\PV$ is updated to the value $s'(x)$ by the
probabilistic update -- recall Definition \ref{def:prob_update}; third, that the
temporary variables are sampled by the scheduler equal to the target state. The
result is the probability measure $\prSs: \confs_\Prog^2 \rightarrow [0,1]$.
\begin{align*}
    \prSs(c \rightarrow c') = p \cdot \prod_{x\in\PV} [\eta(x)](\tilde{s})(s'(x))
    \cdot \prod_{u\in\V\backslash\PV} \delta_{\tilde{s}(u),s'(u)},
\end{align*}
where for any $z_1, z_2 \in \Z$:
\begin{equation*}
\delta_{z_1,z_2} = \begin{cases}
1, & \text{if } z_1 = z_2 \\
0, & \text{if } z_1 \neq z_2.
\end{cases}
\end{equation*}
Naturally, for all other configurations $c'$, such as configurations that are
not reachable by the given transition, or whose transitions were not selected by
the scheduler, we define $\prSs(c \rightarrow c') = 0$. The probability of a
finite prefix $f = c_0 c_1 \dots c_n \in \fpath_\Prog$ is then
\begin{align*}
    \prSs(\varf) = \prSs(c_0) \cdot \prod_{i=1}^n \prSs(c_{i-1} \rightarrow c_i)
\end{align*}
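For the program of Example \ref{ex:prob_sampling}, the one-step measure can be evaluated by hand. The following Python sketch (a hypothetical encoding, using exact fractions) computes $\prSs(c \rightarrow c')$ for the transition $t_2$ with update $x - \textsc{Bern}(0.5)$; here $\PV = \{x\}$ and no temporary variables occur, so the delta product equals 1:

```python
from fractions import Fraction

BERN = {0: Fraction(1, 2), 1: Fraction(1, 2)}  # Bern(0.5)

def update_prob(x_old, x_new):
    """Probability that the update x := x - Bern(0.5) maps x_old to x_new:
    sum the probabilities of all sampled values d with x_old - d == x_new."""
    return sum((p for d, p in BERN.items() if x_old - d == x_new), Fraction(0))

def step_prob(p_transition, x_old, x_new):
    """pr(c -> c') = p * prod_x [eta(x)](s~)(s'(x)); with PV = {x} and no
    temporary variables, the product over deltas is 1."""
    return p_transition * update_prob(x_old, x_new)

# taking t2 (probability 1 within its general transition) from x = 3:
decrement = step_prob(Fraction(1), 3, 2)   # x sampled down to 2
unchanged = step_prob(Fraction(1), 3, 3)   # Bern sampled to 0
impossible = step_prob(Fraction(1), 3, 1)  # not reachable in one step
```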
By a cylindrical construction one arrives at the final probability measure for
the infinite runs $\PrSs : \runs_\Prog \rightarrow [0,1]$ and the probability
space $(\runs_\Prog, \F, \PrSs)$.
\begin{definition}[Admissible run]\label{def:admissible}
A finite prefix $f \in \fpath_\Prog$ is \textit{admissible} if and only if
$\prSs(\varf{}) > 0$. A run $c_0c_1\dots \in \runs_\Prog$ is \textit{admissible}
if and only if every finite prefix $c_0\dots c_n \in\fpath_\Prog$ is
admissible.
\end{definition}
\begin{rem}
    An admissible run can still have a zero probability if the probabilities
    of its prefixes approach zero for $n \rightarrow \infty$.
\end{rem}
\begin{definition}[Terminating Run]\label{def:termination}
A run is terminating if it is admissible and it has a finite prefix
ending in the location $\ell_\bot$.
\end{definition}
By definition, only the terminating transition $t_\bot$ is admissible when the
program is in the terminal location $\ell_\bot$, and the following configuration
must again be in the terminal location $\ell_\bot$. Hence, a terminating run ends
with infinitely many repetitions of a terminating configuration
$c_\bot=(\ell_\bot,t_\bot,s)$ with some assignment $s\in\Sigma$.
\subsection{Runtime Complexity}
The runtime complexity is the number of non-terminating transitions visited
during a finite prefix or run. For runs, it can be seen as a random variable in
the probability space of a program.
\begin{definition}[Runtime complexity]\label{def:runtime_complexity}
For a finite prefix $f = c_0\dots{}c_n \in \fpath_\Prog$ the runtime complexity $\Rt_\Prog
: \fpath_\Prog \rightarrow \N$ is defined as
\begin{equation*}
        \Rt_\Prog(\varf) = |\set{i}{ c_i = (\ell_i, t_i, s_i) \text{ and } t_i \neq
t_\bot}|
\end{equation*}
    For a run $c_0c_1\dots{}\in \runs_\Prog$ the runtime complexity
    $\Rt_\Prog : \runs_\Prog \rightarrow \overline{\N}$ is
    a random variable with:
\begin{equation*}
\Rt_\Prog(c_0c_1\dots) = |\set{i}{c_i = (\ell_i, t_i, s_i) \text{ and } t_i \neq t_\bot}|
\end{equation*}
\end{definition}
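Operationally, $\Rt_\Prog$ simply counts the configurations whose incoming transition differs from $t_\bot$. A minimal Python sketch, with configurations encoded as triples (an assumption of this illustration only):

```python
def runtime_complexity(prefix):
    """Rt(f): number of configurations whose incoming transition is not
    the terminating transition t_bot. A configuration is encoded as the
    triple (location, incoming transition, state)."""
    return sum(1 for (_, t, _) in prefix if t != "t_bot")

# once a run terminates it loops on (l_bot, t_bot, s);
# only t_in, t1, t2 and t3 contribute to the runtime below
prefix = [("l0", "t_in", {"x": 2}), ("l1", "t1", {"x": 2}),
          ("l1", "t2", {"x": 1}), ("l2", "t3", {"x": 1}),
          ("l_bot", "t_bot", {"x": 1}), ("l_bot", "t_bot", {"x": 1})]
# runtime_complexity(prefix) == 4
```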
\begin{definition}[Expected runtime complexity, \acrshort{past}]\label{def:past}
    For a scheduler $\scheduler \in \MDS_\Prog$ and an initial state $s_0 \in
    \Sigma$ the expected runtime complexity $\Rt_{\scheduler,s_0}(\Prog)$ of a
\gls{pip} $\Prog$ is defined as the expected value of the runtime
complexity.
\begin{equation*}
\Rt_{\scheduler,s_0}(\Prog) = \ESs(\Rt_\Prog)
\end{equation*}
If $\Prog$'s expected runtime complexity is finite for every scheduler
$\scheduler \in \MDS_\Prog$ and every initial state $s_0 \in \Sigma$, then
$\Prog$ is called \gls{past}.
\end{definition}
\begin{example}
    Consider the program from Example \ref{ex:prob_sampling} displayed in
    \fref{fig:ex_pip_probs}. Every run must take at least the transitions $t_1$
    and $t_3$; hence any runtime less than $2$ is impossible. The variable $x$
    is decremented by $1$ with a probability of $0.5$ on every execution of
    transition $t_2$. So for a given starting state $s_0 \in \Sigma$ and
    $\scheduler \in \MDS_\Prog$, the expected runtime depends on the initial
    value of $x$. The number of executions of $t_2$ needed to decrement $x$
    once follows a geometric distribution with expected value $\frac{1}{p}$.
    The expected number of executions of $t_2$ for decrementing $x$ down to
    zero ($s_0(x)$ times) is
    \begin{equation*}
        s_0(x) \cdot \frac{1}{p} = s_0(x) \cdot \frac{1}{0.5}
        = 2 \cdot s_0(x),
    \end{equation*}
    and the expected runtime $\ESs(\Rt_\Prog)$ exceeds this value only by the
    constant number of remaining transitions.
The expected runtime does not depend on the scheduler and is finite for all
values of $x$, hence the program is \gls{past}.
\end{example}
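The calculation can be checked empirically. The following Monte Carlo sketch (a hypothetical encoding of the program from \fref{fig:ex_pip_probs}, not \gls{koat} code) estimates the expected number of executions of $t_2$, which approaches $2 \cdot s_0(x)$:

```python
import random

def t2_steps(x0, rng):
    """Number of times t2 fires before x reaches 0, for the update
    x := x - Bern(0.5)."""
    x, steps = x0, 0
    while x > 0:
        x -= rng.randint(0, 1)  # Bern(0.5): decrement with probability 0.5
        steps += 1
    return steps

rng = random.Random(0)  # fixed seed for reproducibility
samples = [t2_steps(5, rng) for _ in range(20_000)]
mean = sum(samples) / len(samples)
# mean is close to 2 * x0 = 10, matching the geometric-distribution argument
```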
\begin{lemma}\label{lem:hdsvsmds}
For an arbitrary \gls{pip} $\Prog$ and an arbitrary initial state
$s_0\in\Sigma$ the following holds
\begin{align*}
\sup_{\scheduler\in\HDS}\ESs(\Rt_\Prog) =
\sup_{\scheduler\in\MDS}\ESs(\Rt_\Prog)
\end{align*}
    This means that it does not matter whether we use Markovian or history
    dependent deterministic schedulers when considering the worst-case expected
    runtime complexity.
\begin{proof}
We define $\nu^\scheduler$ to be the expected runtime for any
deterministic scheduler $\scheduler$ -- Markovian or history dependent
-- on a program $\Prog$ with an initial state $s_0\in\Sigma$.
\begin{equation*}
\nu^\scheduler = \ESs(\Rt_\Prog) = \lim_{n \rightarrow \infty}
\sum_{\substack{|\varf| = n \\ \varf \in
\fpath}}\Rt_\Prog(\varf)\cdot\prSs(\varf)
\end{equation*}
Let $\scheduler \in \HDS$ be a \emph{history dependent} scheduler with
an expected reward $\nu^\scheduler$. We argue that $\sup_{\scheduler'
\in \MDS} \nu^{\scheduler'} \geq \nu^\scheduler$.
        Let $N \in \N$ and $\fpath_N = \set{f \in \fpath_\Prog}{ |\varf| = N }$
        be the set of finite prefixes of length $N$. This set can contain
        infinitely many prefixes, since there can be infinitely many assignments,
        caused by non-deterministic sampling and by probabilistic sampling from
        distributions with infinite support. We approach the set $\fpath_N$
        with a series of finite sets $F_1, F_2, \dots \subseteq\fpath$ such that
\begin{equation*}
\lim_{i \rightarrow \infty} \prSs(F_i) = \lim_{i \rightarrow \infty}
\sum_{\varf \in F_i} \prSs(\varf) = 1.
\end{equation*}
Next, we define a set of Markovian schedulers that are restricted to
select general transitions and assignments (actions) only for prefixes
in $F_i$ which we call $\MDS_{F_i}$. As $F_i$ is finite, a Markovian
scheduler $\scheduler' \in \MDS_{F_i}$ has only finitely many possible
actions to select from.
By Lemma 7.1.9 in \cite{puterman1994markov} an \emph{optimal} scheduler
$\scheduler_\text{opt} \in \MDS_{F_i} \subset \MDS$ exists that
maximises the expected reward when limited to the finite set of
configurations. For all remaining configurations $\scheduler_\text{opt}$
may behave arbitrarily. Let $\nu_i$ be the reward of this optimal
scheduler.
In the limit when $i$ approaches infinity, the optimal scheduler must
yield an expected reward no less than the history dependent scheduler
$\scheduler$. We get the following inequality.
\begin{equation*}
            \lim_{i\rightarrow\infty}\nu_i \geq \sum_{\varf\in\fpath_N}
            \Rt_\Prog(\varf) \cdot \prSs(\varf)
\end{equation*}
Finally, we let $N$ grow to infinity where $\nu_{i,N}$ is now the
expected reward of the optimal Markovian scheduler as described above.
\begin{equation*}
\sup_{\scheduler' \in \MDS_\Prog} \nu^{\scheduler'}\geq
\lim_{N\rightarrow\infty}\lim_{i\rightarrow\infty}\nu_{i,N} \geq
\lim_{N\rightarrow\infty}\sum_{\varf \in
\fpath_N}\Rt_\Prog(\varf)\cdot\prSs(\varf) = \nu^\scheduler
\end{equation*}
Since the above holds for any history dependent scheduler, it also holds
for the supremum and we get
\begin{equation*}
\sup_{\scheduler \in \MDS_\Prog} \ESs(\Rt_\Prog)\geq \sup_{\scheduler \in \HDS_\Prog}\ESs(\Rt_\Prog)
\end{equation*}
        The converse inequality holds because the class of history dependent
        schedulers subsumes its Markovian counterpart. One concludes
        \begin{equation*}
            \sup_{\scheduler \in \MDS_\Prog}\ESs(\Rt_\Prog) = \sup_{\scheduler \in
            \HDS_\Prog}\ESs(\Rt_\Prog),
        \end{equation*}
        completing the proof.
\end{proof}
\end{lemma}
\subsection{Bounds}
The primary goal of \gls{koat}'s analysis is to find bounds for the runtime
complexity.
\begin{definition}[Bounds\cite{meyer2021tacas}]\label{def:bounds}
The set of bounds $\B$ is the smallest set with $\PV \union \R_{\geq 0}
\subseteq \B$, and where $b_1,b_2 \in \B$ and $v\in \R_{\geq 1}$ imply $b_1 +
b_2, b_1 \cdot b_2 \in \B$ and $v^{b_1} \in \B$.
\end{definition}
During the analysis of probabilistic integer programs, one is especially
interested in expected runtime complexity bounds.
\begin{definition}[Expected runtime complexity bounds]
For $s_0\in\Sigma$ and a \gls{pip} $\Prog$, $\RB \in \B$ is an expected
runtime bound if
    \begin{equation*}
        \RB \geq \sup_{\scheduler\in\MDS_\Prog}\Rt_{\scheduler, s_0}(\Prog) =
        \sup_{\scheduler\in\MDS_\Prog} \ESs(\Rt_\Prog).
    \end{equation*}
\end{definition}
\subsection{Non-probabilistic Integer Programs}
A non-probabilistic program is a probabilistic program where every general
transition contains only a single transition with probability $p = 1$. In that
case, for a scheduler $\scheduler$ and an initial state $s_0 \in \Sigma$, there
is only a single admissible run $c_0c_1\dots \in \runs_\Prog$, which has
probability $\PrSs(c_0c_1\dots) = 1$, and the expected runtime coincides with
the runtime of this run.
\begin{equation*}
\ESs(\Rt_\Prog) = \Rt(c_0c_1\dots)
\end{equation*}
\section{Graph Operations}\label{sec:graph_operations}
A \gls{pip} $\Prog$ induces a graph which we call $G_\Prog$ with vertices
$V(G_\Prog) = \Loc_\Prog$ and edges $E(G_\Prog) = \T_\Prog$. We define some
graph operations that will be useful throughout this thesis. Let $\G$ be the
set of graphs with vertices from $\Loc$ and edges from $\T$.
Adding a set of transitions, together with any new vertices (locations)
occurring therein, to a graph is denoted by the plus operator $+: \G \times
2^\T \rightarrow \G$. Removing a set of transitions from a graph, while
preserving its vertices, is denoted by the minus operator $- : \G \times 2^\T
\rightarrow \G$. Moreover, we write $G_1
\union G_2$ instead of $(V(G_1) \union V(G_2), E(G_1) \union E(G_2))$.
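The three operations can be sketched as follows; this is a hypothetical illustration (not \gls{koat}'s implementation), assuming a graph is a pair of a vertex set and an edge set, and a transition is a tuple whose first and last components are its source and target locations.

```python
# Hypothetical model: a graph is (vertices, edges); a transition is a tuple
# (source, ..., target), so t[0] is the source and t[-1] the target location.
def add(graph, ts):
    """G + T: add transitions and any new locations occurring in them."""
    v, e = graph
    v = v | {t[0] for t in ts} | {t[-1] for t in ts}
    return (v, e | set(ts))

def remove(graph, ts):
    """G - T: remove transitions while preserving all locations."""
    v, e = graph
    return (v, e - set(ts))

def union(g1, g2):
    """G1 ∪ G2 = (V(G1) ∪ V(G2), E(G1) ∪ E(G2))."""
    return (g1[0] | g2[0], g1[1] | g2[1])
```

Note that `remove` leaves the vertex set untouched, mirroring the requirement that the minus operator preserves locations.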
For a \gls{pip} $\Prog$, the incoming transitions of a location are given by
the function $\TPin : \Loc \rightarrow 2^{\TP}$, whereas the outgoing
transitions of a location are given by the function $\TPout : \Loc \rightarrow
2^{\TP}$. We define a similar function $\GTPout: \Loc \rightarrow 2^{\GTP}$
for the outgoing general transitions.
\begin{align*}
\TPout(\ell) &= \set{t}{t = (\ell, \_, \_, \_, \_) \in \TP}\\
\TPin(\ell) &= \set{t}{t = (\_, \_, \_, \_, \ell) \in \TP}\\
\GTPout(\ell) &= \set{g}{g \in \GTP\text{ and } \ell_g = \ell}
\end{align*}
All locations in a subset of transitions are denoted by a function $\Loc :
2^{\TP} \rightarrow 2^{\Loc_\Prog}$ with
\begin{equation*}
\Loc(T) = \set{\ell, \ell'}{(\ell, \_, \_, \_, \ell') \in T}
\end{equation*}
For a subset of transitions $T \subseteq \T_\Prog$, the entry transitions are
given by overloading $\TPin : 2^{\TP} \rightarrow 2^{\TP}$ and the exit
transitions by overloading $\TPout : 2^{\TP} \rightarrow 2^{\TP}$ with
\begin{align*}
\TPin(T) &= \left(\Union_{\ell \in \Loc(T)} \TPin(\ell)\right) \backslash T \\
\TPout(T) &= \left(\Union_{\ell \in \Loc(T)} \TPout(\ell)\right) \backslash T
\end{align*}
For a \gls{pip} $\Prog$, a loop (or cycle) is a finite sequence of locations
$L = \ell_0\dots\ell_n \in \Loc^{n+1}$ for some $n \in \N$ for which there
exist transitions $t_i = (\ell_i, \_, \_, \_, \ell_{i+1}) \in \T_\Prog$ for all
$0 \leq i < n$ and a closing transition $t_n = (\ell_n, \_, \_, \_, \ell_0) \in
\T_\Prog$. Additionally, we require the locations of a loop to be pairwise
distinct. The first location $\ell_0$ of the loop is called the \emph{loop
head}. A loop $L = \ell_0\dots\ell_n \in \Loc_\Prog^{n+1}$ induces a set of
subsets of transitions $T_L$, which we call the \emph{transition loops} of $L$,
with
\begin{equation*}
\begin{gathered}
T_L = \set{\{t_0,\dots,t_n\}}{t_i = (\ell_i, \_, \_, \_, \ell_{i+1}) \in
\T_\Prog \text{ for } 0 \leq i < n, \text{ and } t_n = (\ell_n, \_, \_, \_,
\ell_0) \in \T_\Prog}.
\end{gathered}
\end{equation*}
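As a small hypothetical example, consider locations $\ell_0, \ell_1$ with two parallel transitions $t_a = (\ell_0, \_, \_, \_, \ell_1)$ and $t_b = (\ell_0, \_, \_, \_, \ell_1)$, and a single transition $t_c = (\ell_1, \_, \_, \_, \ell_0)$. The loop $L = \ell_0\ell_1$ with loop head $\ell_0$ then induces the transition loops
\begin{equation*}
T_L = \{\{t_a, t_c\}, \{t_b, t_c\}\}.
\end{equation*}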
A subset of transitions $T \subseteq \T_\Prog$ is called \emph{strongly
connected} when for every two locations $\ell, \ell' \in \Loc(T)$ there exists
a path from $\ell$ to $\ell'$ using only transitions from $T$.
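This condition can be checked by a plain reachability search; the following is a hypothetical sketch (quadratic, not an optimized SCC algorithm such as Tarjan's), again assuming transitions are tuples with source first and target last.

```python
# Hypothetical check: T is strongly connected iff every location of T
# reaches every other location of T using only transitions from T.
def strongly_connected(ts):
    locs = {l for t in ts for l in (t[0], t[-1])}
    succ = {l: set() for l in locs}
    for t in ts:
        succ[t[0]].add(t[-1])

    def reach(start):
        # Locations reachable from `start` via a depth-first search.
        seen, todo = {start}, [start]
        while todo:
            for m in succ[todo.pop()]:
                if m not in seen:
                    seen.add(m)
                    todo.append(m)
        return seen

    return all(reach(l) == locs for l in locs)
```

A two-location cycle is strongly connected, while a single one-way transition is not, since its target cannot reach its source.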