Chapter 4 The Forward and Backward Algorithm

The forward and backward probabilities are used for obtaining MLEs via the EM algorithm, for state decoding, and for state prediction. We establish several properties of HMMs, which are then used to show that

  1. the forward probabilities are joint probabilities of \(\boldsymbol{X}^{(t)}\) and \(C_t\);

  2. the backward probabilities are conditional probabilities of \(\boldsymbol{X}_{t+1}^T\) given \(C_t\);

  3. their product gives the likelihood \(L_T = \Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)})\).

4.1 Forward and Backward Probabilities

For \(t = 1, 2, \dots, T\)

\[\begin{align} \boldsymbol{\alpha}_t &= \boldsymbol{\delta P} (x_1) \boldsymbol{\Gamma P} (x_2) \boldsymbol{\Gamma P} (x_3) \cdots \boldsymbol{\Gamma P} (x_t)\\ &= \boldsymbol{\delta P} (x_1) \prod_{s=2}^t \boldsymbol{\Gamma P} (x_s) \end{align}\]

and

\[\begin{align} \boldsymbol{\beta}_t' &= \boldsymbol{\Gamma P} (x_{t+1}) \boldsymbol{\Gamma P} (x_{t+2}) \cdots \boldsymbol{\Gamma P} (x_T) \boldsymbol{1}'\\ &= \left(\prod_{s=t+1}^T \boldsymbol{\Gamma P} (x_s) \right) \boldsymbol{1}' \end{align}\]

with the convention that an empty product is the identity matrix.

In recursive form,

for \(t = 1, 2, \dots, T-1\),

\[\begin{align} \boldsymbol{\alpha}_{t+1} = \boldsymbol{\alpha}_t \boldsymbol{\Gamma P} (x_{t+1}) \tag{4.1} \end{align}\]

and

\[\begin{align} \boldsymbol{\beta}_t' = \boldsymbol{\Gamma P} (x_{t+1}) \boldsymbol{\beta}_{t+1}' \tag{4.2} \end{align}\]
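To make the recursions concrete, here is a minimal NumPy sketch. All parameter values (`delta`, `Gamma`, `Emit`) and the observed sequence `x` are hypothetical toy values chosen for illustration, not taken from the text.

```python
import numpy as np

# Hypothetical two-state HMM with two observable symbols (all values made up)
delta = np.array([0.6, 0.4])                   # initial distribution delta
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])     # transition probability matrix Gamma
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])     # Emit[i, x] = Pr(X = x | C = i)
x, T = (0, 1, 1, 0), 4                         # an observed sequence of length T

def P(xt):
    # Diagonal matrix P(x_t) of state-dependent probabilities p_j(x_t)
    return np.diag(Emit[:, xt])

# Forward pass: row t-1 of `alpha` holds the row vector alpha_t (Equation (4.1))
alpha = np.zeros((T, 2))
alpha[0] = delta @ P(x[0])
for t in range(1, T):
    alpha[t] = alpha[t - 1] @ Gamma @ P(x[t])

# Backward pass: beta_T = 1' by the empty-product convention (Equation (4.2))
beta = np.zeros((T, 2))
beta[T - 1] = 1.0
for t in range(T - 2, -1, -1):
    beta[t] = Gamma @ P(x[t + 1]) @ beta[t + 1]

print(alpha[-1].sum())                         # L_T = alpha_T 1'
```

For sequences of realistic length these matrix products underflow, so in practice the recursions are run with scaling or on the log scale.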

4.2 Properties of HMMs

4.2.1 Property

For positive integers t,

\[\begin{equation} \Pr(\boldsymbol{X}^{(t+1)}, C_t, C_{t+1}) = \Pr(\boldsymbol{X}^{(t)}, C_t) \Pr(C_{t+1}|C_t) \Pr(X_{t+1}|C_{t+1}) \tag{4.3} \end{equation}\]

Proof

By Equation (9.2), the joint distribution \(\Pr(\boldsymbol{X}^{(t+1)}, \boldsymbol{C}^{(t+1)})\) factorizes as

\[\Pr(\boldsymbol{X}^{(t+1)}, \boldsymbol{C}^{(t+1)}) = \Pr(\boldsymbol{X}^{(t)}, \boldsymbol{C}^{(t)}) \Pr(C_{t+1}|C_t) \Pr(X_{t+1}|C_{t+1})\]

Then summing over \(\boldsymbol{C}^{(t-1)}\), it follows that

\[\begin{align} \sum_{\boldsymbol{C}^{(t-1)}} \Pr(\boldsymbol{X}^{(t+1)}, \boldsymbol{C}^{(t+1)}) &= \sum_{\boldsymbol{C}^{(t-1)}} \Pr(\boldsymbol{X}^{(t)}, \boldsymbol{C}^{(t)}) \Pr(C_{t+1}|C_t) \Pr(X_{t+1}|C_{t+1})\\ \stackrel{\text{by LOTP}}{\Rightarrow} \Pr(\boldsymbol{X}^{(t+1)}, C_t, C_{t+1}) &= \Pr(\boldsymbol{X}^{(t)}, C_t) \Pr(C_{t+1}|C_t) \Pr(X_{t+1}|C_{t+1}) \end{align}\]
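Property (4.3) can also be verified numerically by brute-force enumeration of state paths. Below is a small sketch using the same hypothetical toy HMM as above; `joint` implements the factorization cited as Equation (9.2).

```python
import numpy as np
from itertools import product

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])     # Emit[i, x] = Pr(X = x | C = i)
x, m = (0, 1, 1, 0), 2

def joint(xs, c):
    # Pr(X_1^n = xs, C_1^n = c), the HMM factorization of Equation (9.2)
    p = delta[c[0]] * Emit[c[0], xs[0]]
    for k in range(1, len(xs)):
        p *= Gamma[c[k - 1], c[k]] * Emit[c[k], xs[k]]
    return p

t = 2                                          # a fixed time point (1-based)
for i, j in product(range(m), repeat=2):
    # LHS of (4.3): marginalize the joint over C_1, ..., C_{t-1}
    lhs = sum(joint(x[:t + 1], c) for c in product(range(m), repeat=t + 1)
              if c[t - 1] == i and c[t] == j)
    # RHS of (4.3): Pr(X^(t), C_t = i) Pr(C_{t+1} = j | C_t = i) Pr(X_{t+1} | C_{t+1} = j)
    p_xt_ct = sum(joint(x[:t], c) for c in product(range(m), repeat=t)
                  if c[t - 1] == i)
    assert np.isclose(lhs, p_xt_ct * Gamma[i, j] * Emit[j, x[t]])
```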

4.2.2 Property

A generalization of (4.3) is the following:

For any integer \(T \geq t+1\),

\[\begin{equation} \Pr(\boldsymbol{X}_1^{(T)}, C_t, C_{t+1}) = \Pr(\boldsymbol{X}_1^{(t)}, C_t) \Pr(C_{t+1}|C_t) \Pr(\boldsymbol{X}_{t+1}^T|C_{t+1}) \tag{4.4} \end{equation}\]

Proof

By (9.2), \(\Pr(\boldsymbol{X}_1^T, \boldsymbol{C}_1^T) = \Pr(C_1) \prod_{k=2}^T \Pr(C_k|C_{k-1}) \prod_{k=1}^T \Pr(X_k|C_k)\), which can be rewritten as

\[\Pr(\boldsymbol{X}_1^T, \boldsymbol{C}_1^T) = \Pr(C_1) \prod_{k=2}^t \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(C_k|C_{k-1}) \prod_{k=1}^t \Pr(X_k|C_k) \prod_{k=t+1}^T \Pr(X_k|C_k)\]

Then summing over \(\boldsymbol{C}_{t+2}^T\) and \(\boldsymbol{C}_1^{t-1}\), it follows that

\[\begin{align} \Pr(\boldsymbol{X}_1^T, C_t, C_{t+1}) &= \sum_{\boldsymbol{C}_{t+2}^T} \sum_{\boldsymbol{C}_{1}^{t-1}} \Pr(\boldsymbol{X}_1^T, \boldsymbol{C}_1^T) \qquad{\text{by LOTP}}\\ &= \sum_{\boldsymbol{C}_{t+2}^T} \sum_{\boldsymbol{C}_{1}^{t-1}} \Pr(C_1) \prod_{k=2}^t \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(C_k|C_{k-1}) \prod_{k=1}^t \Pr(X_k|C_k) \prod_{k=t+1}^T \Pr(X_k|C_k)\\ &= \sum_{\boldsymbol{C}_1^{t-1}} \Pr(C_1) \prod_{k=2}^t \Pr(C_k|C_{k-1}) \prod_{k=1}^t \Pr(X_k|C_k) \sum_{\boldsymbol{C}_{t+2}^T} \prod_{k=t+1}^T \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(X_k|C_k)\\ &= \sum_{\boldsymbol{C}_1^{t-1}} \Pr(\boldsymbol{X}_1^t, \boldsymbol{C}_1^t) \Pr(C_{t+1}|C_t) \sum_{\boldsymbol{C}_{t+2}^{T}} \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(X_k|C_k) \qquad{\text{by Equation (9.2)}}\\ &= \Pr(\boldsymbol{X}_1^t, C_t) \Pr(C_{t+1}|C_t) \Pr(\boldsymbol{X}_{t+1}^T|C_{t+1}) \qquad{\text{by LOTP}} \end{align}\]
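The generalized identity (4.4) can be checked the same way; the only change is that the left-hand side now carries all \(T\) observations. A sketch under the same hypothetical toy HMM, using a table of the full joint distribution:

```python
import numpy as np
from itertools import product

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

def joint(xs, c):
    # Pr(X = xs, C = c) from the factorization of Equation (9.2)
    p = delta[c[0]] * Emit[c[0], xs[0]]
    for k in range(1, len(xs)):
        p *= Gamma[c[k - 1], c[k]] * Emit[c[k], xs[k]]
    return p

# Pr(X = a, C = c) for every observation sequence a and state path c
full = {(a, c): joint(a, c)
        for a in product(range(2), repeat=T)
        for c in product(range(m), repeat=T)}

t = 2                                          # a fixed time point (1-based)
for i, j in product(range(m), repeat=2):
    lhs = sum(p for (a, c), p in full.items()
              if a == x and c[t - 1] == i and c[t] == j)
    p_past = sum(joint(x[:t], c) for c in product(range(m), repeat=t)
                 if c[t - 1] == i)             # Pr(X_1^t, C_t = i)
    p_future = (sum(p for (a, c), p in full.items()
                    if a[t:] == x[t:] and c[t] == j)
                / sum(p for (a, c), p in full.items() if c[t] == j))
    assert np.isclose(lhs, p_past * Gamma[i, j] * p_future)
```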

4.2.3 Property

For \(t=0, 1, \dots, T-1\),

\[\begin{equation} \Pr(\boldsymbol{X}_{t+1}^T|C_{t+1}) = \Pr(X_{t+1}|C_{t+1}) \Pr(\boldsymbol{X}_{t+2}^T|C_{t+1}) \tag{4.5} \end{equation}\]

Proof

From (9.2), it follows that

\[\begin{align} \Pr(\boldsymbol{X}_{t+1}^T, \boldsymbol{C}_{t+1}^T) &= \Pr(C_{t+1}) \Pr(X_{t+1}|C_{t+1}) \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+2}^T \Pr(X_k|C_k)\\ &= \Pr(X_{t+1}|C_{t+1}) \left( \Pr(C_{t+1}) \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+2}^T \Pr(X_k|C_k) \right)\\ &= \Pr(X_{t+1}|C_{t+1}) \Pr(\boldsymbol{X}_{t+2}^T, \boldsymbol{C}_{t+1}^T) \end{align}\]

Summing over \(\boldsymbol{C}_{t+2}^T\),

\[\begin{align} \sum_{\boldsymbol{C}_{t+2}^T} \Pr(\boldsymbol{X}_{t+1}^T, \boldsymbol{C}_{t+1}^T) &= \sum_{\boldsymbol{C}_{t+2}^T} \Pr(X_{t+1}|C_{t+1}) \Pr(\boldsymbol{X}_{t+2}^T, \boldsymbol{C}_{t+1}^T) \\ \stackrel{\text{by LOTP}}{\Rightarrow} \Pr(\boldsymbol{X}_{t+1}^T, C_{t+1}) &= \Pr(X_{t+1}|C_{t+1}) \Pr(\boldsymbol{X}_{t+2}^T, C_{t+1}) \end{align}\]

Dividing both sides by \(\Pr(C_{t+1})\),

\[\begin{align} \frac{1}{\Pr(C_{t+1})} \Pr(\boldsymbol{X}_{t+1}^T, C_{t+1}) &= \frac{1}{\Pr(C_{t+1})} \Pr(X_{t+1}|C_{t+1}) \Pr(\boldsymbol{X}_{t+2}^T, C_{t+1})\\ \stackrel{\text{by Bayes Rule}}{\Rightarrow} \Pr(\boldsymbol{X}_{t+1}^T|C_{t+1}) &= \Pr(X_{t+1}|C_{t+1}) \Pr(\boldsymbol{X}_{t+2}^T|C_{t+1}) \end{align}\]
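As a sanity check of (4.5), one can again sum the full joint table of a small hypothetical HMM (the setup mirrors the earlier sketches; all values illustrative):

```python
import numpy as np
from itertools import product

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

def joint(xs, c):
    p = delta[c[0]] * Emit[c[0], xs[0]]
    for k in range(1, len(xs)):
        p *= Gamma[c[k - 1], c[k]] * Emit[c[k], xs[k]]
    return p

full = {(a, c): joint(a, c)
        for a in product(range(2), repeat=T)
        for c in product(range(m), repeat=T)}

t = 1                                          # 1-based, so C_{t+1} is c[t] below
for j in range(m):
    pc = sum(p for (a, c), p in full.items() if c[t] == j)    # Pr(C_{t+1} = j)
    lhs = sum(p for (a, c), p in full.items()
              if a[t:] == x[t:] and c[t] == j) / pc           # Pr(X_{t+1}^T | C_{t+1} = j)
    tail = sum(p for (a, c), p in full.items()
               if a[t + 1:] == x[t + 1:] and c[t] == j) / pc  # Pr(X_{t+2}^T | C_{t+1} = j)
    assert np.isclose(lhs, Emit[j, x[t]] * tail)              # Equation (4.5)
```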

4.2.4 Property

For \(t = 1, 2, \dots, T-1\),

\[\begin{equation} \Pr(\boldsymbol{X}_{t+1}^T|C_{t+1}) = \Pr(\boldsymbol{X}_{t+1}^T|C_t, C_{t+1}) \tag{4.6} \end{equation}\]

Proof

On the left-hand side:

\[\begin{align} \Pr(\boldsymbol{X}_{t+1}^T|C_{t+1}) &= \frac{1}{\Pr(C_{t+1})} \Pr(\boldsymbol{X}_{t+1}^T,C_{t+1}) \qquad{\text{by Bayes Rule}}\\ &= \frac{1}{\Pr(C_{t+1})} \sum_{\boldsymbol{C}_{t+2}^T} \Pr(\boldsymbol{X}_{t+1}^T,\boldsymbol{C}_{t+1}^T) \qquad{\text{by LOTP}}\\ &= \frac{1}{\Pr(C_{t+1})} \sum_{\boldsymbol{C}_{t+2}^T} \Pr(C_{t+1}) \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(X_k|C_k) \qquad{\text{by Equation (9.2)}} \\ &= \sum_{\boldsymbol{C}_{t+2}^T} \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(X_k|C_k) \end{align}\]

On the right-hand side:

\[\begin{align} \Pr(\boldsymbol{X}_{t+1}^T|C_t, C_{t+1}) &= \frac{1}{\Pr(C_t, C_{t+1})} \Pr(\boldsymbol{X}_{t+1}^T, C_t, C_{t+1}) \qquad{\text{by Bayes Rule}}\\ &= \frac{1}{\Pr(C_t, C_{t+1})} \sum_{\boldsymbol{C}_{t+2}^T} \Pr(\boldsymbol{X}_{t+1}^T, \boldsymbol{C}_t^T) \qquad{\text{by LOTP}}\\ &= \frac{1}{\Pr(C_t, C_{t+1})} \sum_{\boldsymbol{C}_{t+2}^T} \Pr(C_t) \Pr(C_{t+1}|C_t) \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(X_k|C_k) \qquad{\text{by Equation (9.2)}}\\ &= \frac{1}{ \Pr(C_t) \Pr(C_{t+1}|C_t)} \sum_{\boldsymbol{C}_{t+2}^T} \Pr(C_t) \Pr(C_{t+1}|C_t) \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(X_k|C_k) \qquad{\text{by Chain Rule}}\\ &= \sum_{\boldsymbol{C}_{t+2}^T} \prod_{k=t+2}^T \Pr(C_k|C_{k-1}) \prod_{k=t+1}^T \Pr(X_k|C_k) \end{align}\]

The two sides agree, which proves (4.6).
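Property (4.6) admits the same kind of numerical check: conditioning on \(C_t\) in addition to \(C_{t+1}\) does not change the distribution of the future observations. A sketch with the same hypothetical toy HMM:

```python
import numpy as np
from itertools import product

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

def joint(xs, c):
    p = delta[c[0]] * Emit[c[0], xs[0]]
    for k in range(1, len(xs)):
        p *= Gamma[c[k - 1], c[k]] * Emit[c[k], xs[k]]
    return p

full = {(a, c): joint(a, c)
        for a in product(range(2), repeat=T)
        for c in product(range(m), repeat=T)}

t = 2                                          # a fixed time point (1-based)
for i, j in product(range(m), repeat=2):
    den = sum(p for (a, c), p in full.items() if c[t - 1] == i and c[t] == j)
    both = sum(p for (a, c), p in full.items()
               if a[t:] == x[t:] and c[t - 1] == i and c[t] == j) / den
    pc = sum(p for (a, c), p in full.items() if c[t] == j)
    one = sum(p for (a, c), p in full.items()
              if a[t:] == x[t:] and c[t] == j) / pc
    assert np.isclose(both, one)               # Equation (4.6)
```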

4.2.5 Property

The following shows the conditional independence of \(\boldsymbol{X}_1^t\) and \(\boldsymbol{X}_{t+1}^T\) given \(C_t\).

For \(t = 1, 2 \dots, T-1\),

\[\begin{equation} \Pr(\boldsymbol{X}_1^T|C_t) = \Pr(\boldsymbol{X}_1^t|C_t) \Pr(\boldsymbol{X}_{t+1}^T|C_t) \tag{4.7} \end{equation}\]

Proof

By Equation (9.2), the joint distribution splits at time \(t\) as

\[\begin{align} \Pr(\boldsymbol{X}_1^T, \boldsymbol{C}_1^T) &= \Pr(\boldsymbol{X}_1^t, \boldsymbol{C}_1^t) \frac{1}{\Pr(C_t)} \Pr(\boldsymbol{X}_{t+1}^T, \boldsymbol{C}_t^T) \end{align}\]

Then summing over \(\boldsymbol{C}_1^{t-1}\) and \(\boldsymbol{C}_{t+1}^{T}\),

\[\begin{align} \sum_{\boldsymbol{C}_1^{t-1}} \sum_{\boldsymbol{C}_{t+1}^{T}} \Pr(\boldsymbol{X}_1^T, \boldsymbol{C}_1^T) &= \sum_{\boldsymbol{C}_1^{t-1}} \sum_{\boldsymbol{C}_{t+1}^{T}} \Pr(\boldsymbol{X}_1^t, \boldsymbol{C}_1^t) \frac{1}{\Pr(C_t)} \Pr(\boldsymbol{X}_{t+1}^T, \boldsymbol{C}_t^T)\\ \stackrel{\text{by LOTP}}{\Rightarrow} \Pr(\boldsymbol{X}_1^T, C_t) &= \Pr(\boldsymbol{X}_1^t, C_t) \frac{1}{\Pr(C_t)} \Pr(\boldsymbol{X}_{t+1}^T, C_t)\\ &= \Pr(\boldsymbol{X}_1^t, C_t) \Pr(\boldsymbol{X}_{t+1}^T|C_t) \end{align}\]

Dividing both sides by \(\Pr(C_t)\) yields (4.7).
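The conditional independence (4.7) can be confirmed numerically as well, reusing the full-joint-table construction from the earlier sketches (all HMM values hypothetical):

```python
import numpy as np
from itertools import product

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

def joint(xs, c):
    p = delta[c[0]] * Emit[c[0], xs[0]]
    for k in range(1, len(xs)):
        p *= Gamma[c[k - 1], c[k]] * Emit[c[k], xs[k]]
    return p

full = {(a, c): joint(a, c)
        for a in product(range(2), repeat=T)
        for c in product(range(m), repeat=T)}

t = 2                                          # a fixed time point (1-based)
for i in range(m):
    pc = sum(p for (a, c), p in full.items() if c[t - 1] == i)  # Pr(C_t = i)
    whole = sum(p for (a, c), p in full.items()
                if a == x and c[t - 1] == i) / pc               # Pr(X_1^T | C_t = i)
    past = sum(p for (a, c), p in full.items()
               if a[:t] == x[:t] and c[t - 1] == i) / pc        # Pr(X_1^t | C_t = i)
    future = sum(p for (a, c), p in full.items()
                 if a[t:] == x[t:] and c[t - 1] == i) / pc      # Pr(X_{t+1}^T | C_t = i)
    assert np.isclose(whole, past * future)    # Equation (4.7)
```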

4.3 Forward Probabilities as Joint Probabilities

Proposition 4.1 For t = 1, 2, …, T and j = 1, 2, …, m,

\[\begin{align} \alpha_t (j) = \Pr(\boldsymbol{X}^{(t)} = \boldsymbol{x}^{(t)}, C_t = j) \tag{4.8} \end{align}\]

Proof

We prove the above by induction on \(t\).

Recall from Equation (2.6) that \(\alpha_{t+1}(j) = \left(\sum_{i=1}^m \alpha_t (i) \gamma_{ij} \right) p_j (x_{t+1})\), the scalar form of Equation (4.1).

Base Case:

If \(t = 1\), it follows from the definition \(\boldsymbol{\alpha}_1 = \boldsymbol{\delta P}(x_1)\) that

\[\begin{align} \alpha_1 (j) &= \delta_j p_j(x_1)\\ &= \Pr(C_1 = j) \Pr(X_1 = x_1|C_1=j)\\ &= \Pr(X_1 = x_1, C_1 = j) \end{align}\]

Inductive Step:

Let \(t \in \{1, 2, \dots, T-1\}\). Suppose \(\alpha_t (j) = \Pr(\boldsymbol{X}^{(t)} = \boldsymbol{x}^{(t)}, C_t = j)\). Then

\[\begin{align} \alpha_{t+1} (j) &= \left(\sum_{i=1}^m \alpha_t (i) \gamma_{ij}\right) p_j (x_{t+1}) \qquad{\text{by the above}}\\ &= \sum_{i=1}^m \Pr(\boldsymbol{X}^{(t)} = \boldsymbol{x}^{(t)}, C_t = i) \Pr(C_{t+1}=j|C_t = i) \Pr(X_{t+1} = x_{t+1}|C_{t+1} = j)\\ &= \sum_{i = 1}^m \Pr(\boldsymbol{X}^{(t+1)} = \boldsymbol{x}^{(t+1)}, C_t = i, C_{t+1} = j) \qquad{\text{by Equation (4.3)}}\\ &= \Pr(\boldsymbol{X}^{(t+1)} = \boldsymbol{x}^{(t+1)}, C_{t+1} = j) \qquad{\text{by LOTP}} \end{align}\]
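Proposition 4.1 suggests a direct unit test: the forward recursion must reproduce the brute-force joint probabilities. A sketch with the hypothetical toy HMM used throughout:

```python
import numpy as np
from itertools import product

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

def joint(xs, c):
    p = delta[c[0]] * Emit[c[0], xs[0]]
    for k in range(1, len(xs)):
        p *= Gamma[c[k - 1], c[k]] * Emit[c[k], xs[k]]
    return p

# Forward recursion (4.1)
alpha = np.zeros((T, m))
alpha[0] = delta @ np.diag(Emit[:, x[0]])
for t in range(1, T):
    alpha[t] = alpha[t - 1] @ Gamma @ np.diag(Emit[:, x[t]])

# Proposition 4.1: alpha_t(j) = Pr(X^(t) = x^(t), C_t = j)
for t in range(1, T + 1):                      # 1-based time
    for j in range(m):
        brute = sum(joint(x[:t], c) for c in product(range(m), repeat=t)
                    if c[t - 1] == j)
        assert np.isclose(alpha[t - 1][j], brute)
```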

4.4 Backward Probabilities as Conditional Probabilities

Proposition 4.2 For t = 1, 2, …, T and i = 1, 2, …, m,

\[\begin{align} \beta_t (i) = \Pr(\boldsymbol{X}_{t+1}^T = \boldsymbol{x}_{t+1}^T|C_t = i) \tag{4.9} \end{align}\]

provided that \(\Pr(C_t = i) > 0\).

Proof

We prove the above by backward induction on \(t\).

Base Case:

If \(t = T-1\), it follows from Equation (4.2) that

\[\begin{align} \beta_{T-1} (i) &= \sum_{j} \Pr(C_T = j|C_{T-1} = i) \Pr(X_T=x_T|C_T=j)\\ &= \sum_{j} \Pr(C_T = j|C_{T-1} = i) \Pr(X_T = x_T|C_T = j, C_{T-1} = i) \qquad{\text{by Equation (4.6)}}\\ &= \sum_{j} \frac{1}{\Pr(C_{T-1} = i)} \Pr(X_T = x_T, C_T = j, C_{T-1} = i) \qquad{\text{by Chain Rule}}\\ &= \frac{1}{\Pr(C_{T-1} = i)} \sum_{j} \Pr(X_T = x_T, C_T = j, C_{T-1} = i)\\ &= \frac{1}{\Pr(C_{T-1} = i)} \Pr(X_T = x_T, C_{T-1} = i) \qquad{\text{by LOTP}}\\ &= \Pr(X_T = x_T|C_{T-1} = i) \qquad{\text{by Bayes Rule}} \end{align}\]

Inductive Step:

Let \(t \in \{1, 2, \dots, T-1\}\). Suppose \(\beta_{t+1} (j) = \Pr(\boldsymbol{X}_{t+2}^T = \boldsymbol{x}_{t+2}^T|C_{t+1} = j)\) for each \(j\). Then

\[\begin{align} \beta_t (i) &= \sum_{j} \gamma_{ij} \Pr(X_{t+1} = x_{t+1}|C_{t+1} = j) \Pr(\boldsymbol{X}_{t+2}^T = \boldsymbol{x}_{t+2}^T|C_{t+1} = j) \qquad{\text{by (4.2) and the above}}\\ &= \sum_{j} \Pr(C_{t+1} = j|C_t = i) \Pr(X_{t+1} = x_{t+1}|C_{t+1} = j) \Pr(\boldsymbol{X}_{t+2}^T = \boldsymbol{x}_{t+2}^T|C_{t+1} = j)\\ &= \sum_{j} \Pr(C_{t+1} = j|C_t = i) \Pr(\boldsymbol{X}_{t+1}^T= \boldsymbol{x}_{t+1}^T|C_{t+1} = j) \qquad{\text{by Equation (4.5)}}\\ &= \sum_{j} \Pr(C_{t+1} = j|C_t = i) \Pr(\boldsymbol{X}_{t+1}^T= \boldsymbol{x}_{t+1}^T|C_{t+1} = j, C_t = i) \qquad{\text{by Equation (4.6)}}\\ &= \frac{1}{\Pr(C_t = i)} \sum_{j} \Pr(\boldsymbol{X}_{t+1}^T= \boldsymbol{x}_{t+1}^T, C_t = i, C_{t+1} = j) \qquad{\text{by Chain Rule}}\\ &= \frac{1}{\Pr(C_t = i)} \Pr(\boldsymbol{X}_{t+1}^T= \boldsymbol{x}_{t+1}^T, C_t = i) \qquad{\text{by LOTP}}\\ &= \Pr(\boldsymbol{X}_{t+1}^T= \boldsymbol{x}_{t+1}^T|C_t = i) \qquad{\text{by Bayes Rule}} \end{align}\]
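Proposition 4.2 yields the matching test for the backward recursion: each \(\beta_t(i)\) must equal the brute-force conditional probability of the future observations. A sketch, same hypothetical setup:

```python
import numpy as np
from itertools import product

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

def joint(xs, c):
    p = delta[c[0]] * Emit[c[0], xs[0]]
    for k in range(1, len(xs)):
        p *= Gamma[c[k - 1], c[k]] * Emit[c[k], xs[k]]
    return p

full = {(a, c): joint(a, c)
        for a in product(range(2), repeat=T)
        for c in product(range(m), repeat=T)}

# Backward recursion (4.2), with beta_T = 1'
beta = np.zeros((T, m))
beta[T - 1] = 1.0
for t in range(T - 2, -1, -1):
    beta[t] = Gamma @ np.diag(Emit[:, x[t + 1]]) @ beta[t + 1]

# Proposition 4.2: beta_t(i) = Pr(X_{t+1}^T = x_{t+1}^T | C_t = i)
for t in range(1, T):                          # 1-based; t = T is trivially 1
    for i in range(m):
        num = sum(p for (a, c), p in full.items()
                  if a[t:] == x[t:] and c[t - 1] == i)
        den = sum(p for (a, c), p in full.items() if c[t - 1] == i)
        assert np.isclose(beta[t - 1][i], num / den)
```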

Proposition 4.3 For t = 1, 2, …, T and i = 1, 2, …, m,

\[\begin{equation} \alpha_t (i) \beta_t (i) = \Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}, C_t = i) \tag{4.10} \end{equation}\]

and consequently \(\boldsymbol{\alpha}_t \boldsymbol{\beta}_t' = \Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}) = L_T\), for each such \(t\).

Proof

\[\begin{align} \alpha_t (i) \beta_t (i) &= \Pr(\boldsymbol{X}_1^t, C_t = i) \Pr(\boldsymbol{X}_{t+1}^T|C_t = i) \qquad{\text{by Propositions 4.1 and 4.2}}\\ &= \Pr(C_t = i) \Pr(\boldsymbol{X}_1^t|C_t = i) \Pr(\boldsymbol{X}_{t+1}^T|C_t = i) \qquad{\text{by Bayes Rule}}\\ &= \Pr(C_t = i) \Pr(\boldsymbol{X}_1^t, \boldsymbol{X}_{t+1}^T|C_t = i) \qquad{\text{by Equation (4.7)} }\\ &= \Pr(\boldsymbol{X}^{(T)}, C_t = i) \qquad{\text{by Bayes Rule}} \end{align}\]

The second part follows by summing over \(i\),

\[\begin{align} \boldsymbol{\alpha}_t \boldsymbol{\beta}_t' &= \sum_{i} \alpha_t (i) \beta_t (i)\\ &= \sum_{i} \Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}, C_t = i) \qquad{\text{by the above}}\\ &= \Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}) \qquad{\text{by LOTP}}\\ &= L_T \end{align}\]
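The invariance \(\boldsymbol{\alpha}_t \boldsymbol{\beta}_t' = L_T\) for every \(t\) makes a convenient correctness check for any forward-backward implementation. A sketch with the hypothetical toy HMM:

```python
import numpy as np

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

alpha = np.zeros((T, m))
beta = np.zeros((T, m))
beta[T - 1] = 1.0
alpha[0] = delta @ np.diag(Emit[:, x[0]])
for t in range(1, T):
    alpha[t] = alpha[t - 1] @ Gamma @ np.diag(Emit[:, x[t]])
for t in range(T - 2, -1, -1):
    beta[t] = Gamma @ np.diag(Emit[:, x[t + 1]]) @ beta[t + 1]

# Proposition 4.3: alpha_t beta_t' gives the same likelihood L_T at every t
L = [alpha[t] @ beta[t] for t in range(T)]
assert np.allclose(L, L[0])
print(L[0])                                    # L_T = Pr(X^(T) = x^(T))
```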

Proposition 4.4 For t = 1, 2, …, T,

\[\begin{equation} \Pr(C_t = j|\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}) = \frac{1}{L_T} \alpha_t (j) \beta_t (j) \tag{4.11} \end{equation}\]

and for \(t = 2, 3, ..., T\),

\[\begin{equation} \Pr(C_{t-1} = j, C_t = k|\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}) = \frac{1}{L_T} \alpha_{t-1} (j) \gamma_{jk} p_k (x_t) \beta_t (k) \tag{4.12} \end{equation}\]

Proof

For (4.11), it follows from Equation (4.10) that

\[\begin{align} \alpha_t (j) \beta_t (j) \frac{1}{L_T} &= \Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}, C_t = j) \frac{1}{\Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)})}\\ &= \Pr(C_t = j|\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}) \qquad{\text{by Bayes Rule}}\\ \end{align}\]

For (4.12),

\[\begin{align} \Pr(C_{t-1} = j, C_t = k|\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}) &= \Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)}, C_{t-1} = j, C_t = k) \frac{1}{\Pr(\boldsymbol{X}^{(T)} = \boldsymbol{x}^{(T)})} \qquad{\text{by Bayes Rule}}\\ &= \frac{1}{L_T} \Pr(\boldsymbol{X}^{(t-1)} = \boldsymbol{x}^{(t-1)}, C_{t-1} = j) \Pr(C_t = k|C_{t-1} = j) \Pr(\boldsymbol{X}_t^T = \boldsymbol{x}_t^T|C_t = k) \qquad{\text{by Equation (4.4)}}\\ &= \frac{1}{L_T} \alpha_{t-1} (j) \gamma_{jk} p_k (x_t) \beta_t (k) \qquad{\text{by Equation (4.5) and Propositions 4.1 and 4.2}} \end{align}\]
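Equations (4.11) and (4.12) are exactly the quantities needed in the E step of the EM (Baum-Welch) algorithm. A final sketch computing both from the forward and backward matrices of the hypothetical toy HMM:

```python
import numpy as np

delta = np.array([0.6, 0.4])                   # hypothetical toy HMM
Gamma = np.array([[0.7, 0.3], [0.2, 0.8]])
Emit  = np.array([[0.9, 0.1], [0.3, 0.7]])
x, T, m = (0, 1, 1, 0), 4, 2

alpha = np.zeros((T, m))
beta = np.zeros((T, m))
beta[T - 1] = 1.0
alpha[0] = delta @ np.diag(Emit[:, x[0]])
for t in range(1, T):
    alpha[t] = alpha[t - 1] @ Gamma @ np.diag(Emit[:, x[t]])
for t in range(T - 2, -1, -1):
    beta[t] = Gamma @ np.diag(Emit[:, x[t + 1]]) @ beta[t + 1]
L_T = alpha[-1].sum()

# Equation (4.11): smoothed state probabilities Pr(C_t = j | X^(T) = x^(T))
post = alpha * beta / L_T
assert np.allclose(post.sum(axis=1), 1.0)      # each row is a distribution

# Equation (4.12): pairwise posteriors Pr(C_{t-1} = j, C_t = k | X^(T) = x^(T))
for t in range(1, T):                          # 0-based t stands for 1-based t+1
    pair = alpha[t - 1][:, None] * Gamma * (Emit[:, x[t]] * beta[t])[None, :] / L_T
    assert np.isclose(pair.sum(), 1.0)
    assert np.allclose(pair.sum(axis=0), post[t])  # marginalizing recovers (4.11)
```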