\documentclass[12pt,a4paper]{article}
\setcounter{secnumdepth}{0}
\usepackage{gensymb}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{sansmath}
\usepackage{pst-eucl}
\usepackage[UKenglish]{isodate}
\usepackage[UKenglish]{babel}
\usepackage{float}
\usepackage[numbered,framed]{matlab-prettifier}
\usepackage[T1]{fontenc}
\usepackage{setspace}
\usepackage{sectsty}
\usepackage[colorlinks=true,linkcolor=blue,urlcolor=black,bookmarksopen=true]{hyperref}
\newcommand{\E}{\mathbb{E}}
\newcommand{\eqn}[1]{Equation \ref{#1}}
\newcommand{\ovY}{\overline{Y}}
\newcommand{\wmu}{\widehat{\mu}}
\newcommand{\wst}[1]{\widehat{\sigma^2_{#1}}}
\newcommand{\wstq}[1]{\widehat{\sigma^4_{#1}}}
\newcommand{\B}{\mathbb{B}}
\newcommand{\RR}{\mathrm{RR}}
\newcommand{\var}{\mathrm{var}}
\newcommand{\MSE}{\mathrm{MSE}}
\newcommand{\SST}{\mathrm{SST}}
\newcommand{\MST}{\mathrm{MST}}
\newcommand{\SSE}{\mathrm{SSE}}
\newcommand{\SSS}{\mathrm{SS}}
\newcommand{\SSTotal}{\mathrm{Total\hspace{0.1cm}SS}}
\newcommand{\cov}{\mathrm{cov}}
\newcommand{\eff}{\mathrm{eff}}
\newcommand{\CM}{\mathrm{CM}}
\newcommand{\corr}{\mathrm{corr}}
\newcommand{\Poisson}{\mathrm{Poisson}}
\newcommand{\Binomial}{\mathrm{Binomial}}
\setlength{\parindent}{0pt}
\renewcommand{\baselinestretch}{2.0}
\usepackage[margin=0.1in]{geometry}
\title{Derivation of a test for equality of means with unequal variances}
\author{Brenton Horne}
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Hypotheses}
Let $Y_{ij}$ denote the $j$th observation of the $i$th treatment group, where $i=1, 2, 3, \ldots, m$ and $j=1, 2, 3, \ldots, n_i$. Under the null hypothesis, $Y_{ij} \sim \mathrm{N}(\mu, \sigma_i^2)$. Under the alternative hypothesis, $Y_{ij} \sim \mathrm{N}(\mu_i, \sigma_i^2)$, where $\mu_i \neq \mu_k$ for at least one pair of indices $i$ and $k$.
\section{Definitions}
\begin{align*}
n &= \sum_{i=1}^m n_i \\
\overline{Y} &= \dfrac{1}{n} \sum_{i=1}^m \sum_{j=1}^{n_i} Y_{ij} \\
\overline{Y}_i &= \dfrac{1}{n_i} \sum_{j=1}^{n_i} Y_{ij}
\end{align*}
We will later use the Kronecker delta $\delta_{ik}$, which equals $1$ if $i = k$ and $0$ otherwise.
\section{Derivation of the maximum likelihood under the null}
Let $\Omega_0$ denote the parameter space under the null hypothesis: $\Omega_0 = \left\{(\mu, \sigma^2_i): \hspace{0.1cm}-\infty < \mu < \infty, \hspace{0.1cm}\sigma^2_i > 0\right\}$. \\
Similarly, $\Omega_a = \left\{(\mu_i, \sigma^2_i): \hspace{0.1cm}\mu_i \neq \mu_j \text{ for at least one pair of } i \text{ and } j \text{ values}, \hspace{0.1cm}-\infty < \mu_i < \infty, \hspace{0.1cm}\sigma^2_i > 0\right\}$. The unrestricted parameter space is thus $\Omega = \Omega_0 \cup \Omega_a$.
\begin{align}
L(\Omega_0) &= \prod_{i=1}^m \prod_{j=1}^{n_i} \dfrac{1}{\sqrt{2\pi} \sigma_i} \exp\left(-\dfrac{1}{2\sigma^2_i}(Y_{ij}-\mu)^2\right) \nonumber\\
&= (2\pi)^{-n/2} \left(\prod_{i=1}^m \sigma_i^{-n_i}\right)\exp\left(-\dfrac{1}{2} \sum_{i=1}^m \dfrac{1}{\sigma_i^2}\sum_{j=1}^{n_i}(Y_{ij}-\mu)^2 \right). \label{LH0}
\end{align}
Taking the natural logarithm yields:
\begin{align*}
\ln{L(\Omega_0)} &= -\dfrac{n}{2}\ln{2\pi} - \dfrac{1}{2}\sum_{i=1}^m n_i \ln{\sigma^2_i} - \dfrac{1}{2} \sum_{i=1}^m \dfrac{1}{\sigma^2_i}\sum_{j=1}^{n_i} (Y_{ij}-\mu)^2.
\end{align*}
Differentiating the log-likelihood with respect to $\mu$ and setting to zero to maximize the likelihood:
\begin{align}
\dfrac{\partial \ln{L(\Omega_0)}}{\partial \mu} \Bigm\lvert_{\mu = \wmu, \hspace{0.1cm}\sigma^2_i = \wst{i}} &= -\dfrac{1}{2} \sum_{i=1}^m \dfrac{1}{\wst{i}}\sum_{j=1}^{n_i} 2(-1)(Y_{ij}-\wmu) \nonumber\\
&= 0 \nonumber\\
\sum_{i=1}^m \dfrac{1}{\wst{i}}\sum_{j=1}^{n_i} (Y_{ij}-\wmu) &= 0 \nonumber\\
\sum_{i=1}^m \dfrac{1}{\wst{i}} (n_i\ovY_i - n_i \wmu) &= 0 \nonumber\\
\left(\sum_{i=1}^m \dfrac{n_i \ovY_i}{\wst{i}}\right) - \left(\sum_{i=1}^m \dfrac{n_i}{\wst{i}}\right)\wmu &= 0 \nonumber \\
\wmu &= \dfrac{\sum_{i=1}^m \dfrac{n_i \ovY_i}{\wst{i}}}{\sum_{i=1}^m \dfrac{n_i}{\wst{i}}}. \label{wmuH0}
\end{align}
Differentiating the log-likelihood with respect to $\sigma^2_k$ and setting to zero to maximize the likelihood:
\begin{align*}
\dfrac{\partial \ln{L(\Omega_0)}}{\partial \sigma^2_k} \Bigm\lvert_{\mu = \wmu, \hspace{0.1cm} \sigma^2_i = \wst{i}} &= -\dfrac{1}{2} \sum_{i=1}^m \dfrac{n_i}{\wst{i}}\delta_{ik} - \dfrac{1}{2} \sum_{i=1}^m -\dfrac{1}{\wstq{i}} \delta_{ik}\sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2 \\
&= -\dfrac{1}{2} \dfrac{n_k}{\wst{k}} + \dfrac{1}{2\wstq{k}} \sum_{j=1}^{n_k} (Y_{kj}-\wmu)^2 \\
&= 0.
\end{align*}
Multiplying by $2\wstq{k}$ yields:
\begin{align}
-n_k\wst{k} + \sum_{j=1}^{n_k} (Y_{kj}-\wmu)^2 &= 0 \nonumber\\
\wst{k} &= \dfrac{1}{n_k} \sum_{j=1}^{n_k} (Y_{kj}-\wmu)^2. \label{wstH0}
\end{align}
Some simplification of our likelihood can be done now using our maximum likelihood estimators (MLEs). Unfortunately, Equations \ref{wmuH0} and \ref{wstH0} cannot be solved analytically; they must be solved numerically to yield values for $\wmu$ and $\wst{k}$.
\begin{align*}
L(\widehat{\Omega_0}) &= (2\pi)^{-n/2} \left(\prod_{i=1}^m \wst{i}^{-n_i/2}\right)\exp\left(-\dfrac{1}{2} \sum_{i=1}^m \dfrac{1}{\wst{i}}\sum_{j=1}^{n_i}(Y_{ij}-\wmu)^2 \right) \\
&= (2\pi)^{-n/2} \left(\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2\right)^{-n_i/2}\right)\exp\left(-\dfrac{1}{2} \sum_{i=1}^m \dfrac{1}{\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2}\sum_{j=1}^{n_i}(Y_{ij}-\wmu)^2 \right) \\
&= (2\pi)^{-n/2} \left(\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2\right)^{-n_i/2}\right)\exp\left(-\dfrac{1}{2} \sum_{i=1}^m n_i \right) \\
&= (2\pi)^{-n/2}\left(\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2\right)^{-n_i/2}\right)\exp\left(-\dfrac{n}{2}\right) \\
&= (2\pi e)^{-n/2}\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2\right)^{-n_i/2}.
\end{align*}
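Before developing Newton's method in the next subsection, it is worth noting that Equations \ref{wmuH0} and \ref{wstH0} also suggest a simple fixed-point iteration: alternately update $\wmu$ and the $\wst{k}$ from the two equations until the estimates stop changing. A minimal MATLAB sketch follows (the cell-array input format, where \texttt{Y\{i\}} holds group $i$'s observations, is our own convention, and convergence is not guaranteed for every data set):
\begin{lstlisting}[style=Matlab-editor]
% Fixed-point iteration for the null-hypothesis MLEs; a sketch only.
% Y is a cell array: Y{i} is a row vector of group i's observations.
ni   = cellfun(@numel, Y);           % group sizes n_i
Ybar = cellfun(@mean, Y);            % group means
mu   = mean([Y{:}]);                 % start at the overall mean
s2   = cellfun(@(y) var(y, 1), Y);   % start at the 1/n_i sample variances
for iter = 1:200
    muNew = sum(ni .* Ybar ./ s2) / sum(ni ./ s2);  % widehat-mu update
    s2    = cellfun(@(y) mean((y - muNew).^2), Y);  % variance updates
    if abs(muNew - mu) < 1e-12, mu = muNew; break, end
    mu = muNew;
end
\end{lstlisting}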
\subsection{Numerical approximation of maximum likelihood estimators}
We must therefore numerically approximate $\wmu$ and $\wst{i}$ using a technique like Newton's method, and then substitute these estimates into our expression for $L(\Omega_0)$ to get the maximum likelihood under the null. If we use Newton's method, we need functions whose zeros we are finding. Let:
\begin{align*}
f(\wmu, \wst{k}) &= \wmu - \dfrac{\sum_{k=1}^m \dfrac{n_k \ovY_k}{\wst{k}}}{\sum_{k=1}^m \dfrac{n_k}{\wst{k}}} \\
g_i(\wmu, \wst{k}) &= \wst{i} - \dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2.
\end{align*}
Therefore:
\begin{align*}
\dfrac{\partial f}{\partial \wmu} &= 1\\
\dfrac{\partial f}{\partial \wst{i}} &= -\dfrac{\sum_{k=1}^m -\dfrac{n_k \ovY_k}{\wstq{k}}\delta_{ik}}{\sum_{k=1}^m \dfrac{n_k}{\wst{k}}} - \dfrac{\sum_{k=1}^m \dfrac{n_k \ovY_k}{\wst{k}}}{-\left(\sum_{k=1}^m \dfrac{n_k}{\wst{k}}\right)^2} \sum_{k=1}^m -\dfrac{n_k}{\wstq{k}}\delta_{ik} \\
&= \dfrac{\dfrac{n_i \ovY_i}{\wstq{i}}}{\sum_{k=1}^m \dfrac{n_k}{\wst{k}}} - \dfrac{\sum_{k=1}^m \dfrac{n_k \ovY_k}{\wst{k}}}{\left(\sum_{k=1}^m \dfrac{n_k}{\wst{k}}\right)^2} \dfrac{n_i}{\wstq{i}} \\
&= \dfrac{n_i}{\wstq{i}\sum_{k=1}^m \dfrac{n_k}{\wst{k}}} \left(\ovY_i - \dfrac{\sum_{k=1}^m \dfrac{n_k \ovY_k}{\wst{k}}}{\sum_{k=1}^m \dfrac{n_k}{\wst{k}}}\right).
\end{align*}
As for the derivatives of $g_i(\wmu, \wst{k})$:
\begin{align*}
\dfrac{\partial g_i(\wmu, \wst{k})}{\partial \wmu} &= -\dfrac{1}{n_i} \sum_{j=1}^{n_i} 2(-1)(Y_{ij}-\wmu) \\
&= \dfrac{2}{n_i} (n_i\ovY_i - n_i\wmu) \\
&= 2(\ovY_i-\wmu) \\
\dfrac{\partial g_i(\wmu, \wst{k})}{\partial \wst{k}} &= \delta_{ik}.
\end{align*}
This gives us the Jacobian:
\begin{align*}
\mathrm{J} = \left[\begin{matrix}
\dfrac{\partial f}{\partial \wmu} & \dfrac{\partial f}{\partial \wst{1}} & \dfrac{\partial f}{\partial \wst{2}} & \dfrac{\partial f}{\partial \wst{3}} & \cdots & \dfrac{\partial f}{\partial \wst{m}} \\
\dfrac{\partial g_1}{\partial \wmu} & \dfrac{\partial g_1}{\partial \wst{1}} & \dfrac{\partial g_1}{\partial \wst{2}} & \dfrac{\partial g_1}{\partial \wst{3}} & \cdots & \dfrac{\partial g_1}{\partial \wst{m}} \\
\dfrac{\partial g_2}{\partial \wmu} & \dfrac{\partial g_2}{\partial \wst{1}} & \dfrac{\partial g_2}{\partial \wst{2}} & \dfrac{\partial g_2}{\partial \wst{3}} & \cdots & \dfrac{\partial g_2}{\partial \wst{m}} \\
\dfrac{\partial g_3}{\partial \wmu} & \dfrac{\partial g_3}{\partial \wst{1}} & \dfrac{\partial g_3}{\partial \wst{2}} & \dfrac{\partial g_3}{\partial \wst{3}} & \cdots & \dfrac{\partial g_3}{\partial \wst{m}} \\
\hdotsfor{6} \\
\dfrac{\partial g_m}{\partial \wmu} & \dfrac{\partial g_m}{\partial \wst{1}} & \dfrac{\partial g_m}{\partial \wst{2}} & \dfrac{\partial g_m}{\partial \wst{3}} & \cdots & \dfrac{\partial g_m}{\partial \wst{m}}
\end{matrix}\right].
\end{align*}
Interestingly, this Jacobian's main diagonal consists entirely of 1s, and the only non-zero off-diagonal elements lie in the first row and first column. We also have this function vector:
\begin{align*}
\mathbf{F} = \left[
\begin{matrix}
f(\wmu, \wst{k}) \\
g_1(\wmu, \wst{k}) \\
g_2(\wmu, \wst{k}) \\
g_3(\wmu, \wst{k}) \\
\hdotsfor{1} \\
g_m(\wmu, \wst{k})
\end{matrix}
\right].
\end{align*}
And we use the algorithm:
\begin{align*}
\left[\begin{matrix}
\wmu \\
\wst{1} \\
\wst{2} \\
\wst{3} \\
\hdotsfor{1} \\
\wst{m}
\end{matrix}\right]_{\mathrm{New}} &= \left[\begin{matrix}
\wmu \\
\wst{1} \\
\wst{2} \\
\wst{3} \\
\hdotsfor{1} \\
\wst{m}
\end{matrix}\right]_{\mathrm{Old}} - \mathrm{J}^{-1}\mathbf{F}
\end{align*}
to numerically approximate our MLEs under the null. As an initial guess for this vector, a natural choice is the overall mean $\ovY$ for $\wmu$ and the $k$th sample variance for $\wst{k}$, as in the sketch below.
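As a concrete illustration, here is a minimal MATLAB sketch of this Newton iteration. The function name \texttt{nullMLE} and the cell-array input format are our own conventions; the Jacobian is assembled exactly as derived above, with 1s on the main diagonal and non-zero off-diagonal entries only in the first row and first column.
\begin{lstlisting}[style=Matlab-editor]
function [mu, s2] = nullMLE(Y)
% Newton's method for the null-hypothesis MLEs; a minimal sketch.
% Y is a cell array: Y{i} is a row vector of group i's observations.
m    = numel(Y);
ni   = cellfun(@numel, Y);           % group sizes n_i
Ybar = cellfun(@mean, Y);            % group means
mu   = mean([Y{:}]);                 % initial guess: overall mean
s2   = cellfun(@(y) var(y, 1), Y);   % initial guess: 1/n_i sample variances
for iter = 1:100
    B    = sum(ni ./ s2);
    wmu0 = sum(ni .* Ybar ./ s2) / B;   % weighted mean from the mu equation
    F    = zeros(m + 1, 1);             % function vector [f; g_1; ...; g_m]
    F(1) = mu - wmu0;
    for i = 1:m
        F(i + 1) = s2(i) - mean((Y{i} - mu).^2);
    end
    J           = eye(m + 1);                          % 1s on the diagonal
    J(1, 2:end) = ni ./ (s2.^2 * B) .* (Ybar - wmu0);  % first row
    J(2:end, 1) = 2 * (Ybar(:) - mu);                  % first column
    step = J \ F;                       % Newton step: solve J*step = F
    mu   = mu - step(1);
    s2   = s2 - step(2:end).';
    if norm(step) < 1e-10, break, end
end
end
\end{lstlisting}
Because of this arrowhead structure, the linear solve could in principle be done in $O(m)$ operations; the backslash solve above is a simpler stand-in.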
\section{Derivation of the unrestricted maximum likelihood}
We will use $L(\Omega)$ to denote the unrestricted likelihood function.
\begin{align*}
L(\Omega) &= \prod_{i=1}^m \prod_{j=1}^{n_i} \dfrac{1}{\sqrt{2\pi} \sigma_i} \exp{\left(-\dfrac{1}{2\sigma^2_i} (Y_{ij}-\mu_i)^2\right)} \\
&= (2\pi)^{-n/2} \left(\prod_{i=1}^m \sigma_i^{-n_i}\right)\exp{\left(-\dfrac{1}{2}\sum_{i=1}^m \dfrac{1}{\sigma^2_i}\sum_{j=1}^{n_i} (Y_{ij}-\mu_i)^2\right)}.
\end{align*}
Hence the unrestricted log-likelihood is:
\begin{align*}
\ln{L(\Omega)} &= -\dfrac{n}{2} \ln{2\pi} - \dfrac{1}{2}\sum_{i=1}^m n_i \ln{\sigma^2_i} - \dfrac{1}{2}\sum_{i=1}^m \dfrac{1}{\sigma^2_i}\sum_{j=1}^{n_i} (Y_{ij}-\mu_i)^2.
\end{align*}
Setting the derivative with respect to $\mu_l$ to zero to find the MLE for $\mu_l$:
\begin{align*}
\dfrac{\partial \ln{L(\Omega)}}{\partial \mu_l} \Bigm\lvert_{\mu_l=\wmu_l, \hspace{0.1cm}\sigma^2_k = \wst{k}} &= -\dfrac{1}{2}\sum_{i=1}^m \dfrac{1}{\wst{i}}\sum_{j=1}^{n_i} 2(-1)(Y_{ij}-\wmu_i)\delta_{il} \\
&= \dfrac{1}{\wst{l}} \sum_{j=1}^{n_l} (Y_{lj}-\wmu_l) \\
&= \dfrac{1}{\wst{l}} (n_l \ovY_l -n_l \wmu_l) \\
&= 0. \\
\implies \wmu_l &= \ovY_l.
\end{align*}
Setting the derivative with respect to $\sigma^2_k$ to zero to find the MLE for $\sigma^2_k$:
\begin{align*}
\dfrac{\partial \ln{L(\Omega)}}{\partial \sigma^2_k}\Bigm\lvert_{\mu_l=\wmu_l, \hspace{0.1cm}\sigma^2_k = \wst{k}} &= -\dfrac{1}{2}\sum_{i=1}^m \dfrac{n_i}{\wst{i}} \delta_{ik} - \dfrac{1}{2}\sum_{i=1}^m -\dfrac{1}{\wstq{i}} \delta_{ik}\sum_{j=1}^{n_i} (Y_{ij}-\wmu_i)^2 \\
&= -\dfrac{n_k}{2\wst{k}} + \dfrac{1}{2\wstq{k}}\sum_{j=1}^{n_k} (Y_{kj}-\wmu_k)^2 \\
&= 0. \\
\implies \wst{k} &= \dfrac{1}{n_k} \sum_{j=1}^{n_k} (Y_{kj}-\wmu_k)^2 \\
&= \dfrac{1}{n_k} \sum_{j=1}^{n_k} (Y_{kj}-\ovY_k)^2.
\end{align*}
Substituting these MLEs into our expression for $L(\Omega)$ to get the unrestricted maximum likelihood:
\begin{align*}
L(\widehat{\Omega}) &= (2\pi)^{-n/2} \left(\prod_{i=1}^m (\wst{i})^{-n_i/2}\right)\exp{\left(-\dfrac{1}{2}\sum_{i=1}^m \dfrac{1}{\wst{i}}\sum_{j=1}^{n_i} (Y_{ij}-\wmu_i)^2\right)} \\
&= (2\pi)^{-n/2}\left(\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2\right)^{-n_i/2}\right)\exp{\left(-\dfrac{1}{2}\sum_{i=1}^m \dfrac{1}{\dfrac{1}{n_i}\sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2}\sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2\right)} \\
&= (2\pi)^{-n/2}\left(\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2\right)^{-n_i/2}\right)\exp{\left(-\dfrac{1}{2}\sum_{i=1}^m n_i\right)} \\
&= (2\pi)^{-n/2}\left(\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2\right)^{-n_i/2}\right) \exp\left(-\dfrac{n}{2}\right) \\
&= (2\pi e)^{-n/2}\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2\right)^{-n_i/2}.
\end{align*}
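Unlike the null case, these MLEs are available in closed form, so no iteration is needed. In MATLAB (same cell-array format as in the earlier sketches), they are one-liners:
\begin{lstlisting}[style=Matlab-editor]
muHat = cellfun(@mean, Y);           % widehat-mu_i = Ybar_i
s2Hat = cellfun(@(y) var(y, 1), Y);  % widehat-sigma_i^2, 1/n_i normalised
\end{lstlisting}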
\section{Likelihood ratio}
Hence our likelihood ratio is:
\begin{align*}
\lambda &= \dfrac{L(\widehat{\Omega_0})}{L(\widehat{\Omega})} \\
&= \dfrac{(2\pi e)^{-n/2}\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2\right)^{-n_i/2}}{(2\pi e)^{-n/2}\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2\right)^{-n_i/2}} \\
&= \dfrac{\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2\right)^{-n_i/2}}{\prod_{i=1}^m \left(\dfrac{1}{n_i} \sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2\right)^{-n_i/2}} \\
&= \prod_{i=1}^m \left(\dfrac{\sum_{j=1}^{n_i} (Y_{ij}-\ovY_i)^2}{\sum_{j=1}^{n_i} (Y_{ij}-\wmu)^2}\right)^{n_i/2}.
\end{align*}
Our test statistic is therefore
\begin{align*}
-2\ln{\lambda} \sim \chi^2_{m-1}
\end{align*}
asymptotically, since the unrestricted parameter space has $m-1$ more free parameters than the parameter space under the null ($2m$ versus $m+1$).
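Putting the pieces together, here is a minimal MATLAB sketch of the complete test, assuming the \texttt{nullMLE} function sketched earlier; \texttt{chi2cdf} requires the Statistics and Machine Learning Toolbox.
\begin{lstlisting}[style=Matlab-editor]
function [stat, pval] = lrTest(Y)
% Likelihood-ratio test of equal means with unequal variances; a sketch.
% Assumes the nullMLE function sketched earlier is on the path.
m     = numel(Y);
mu    = nullMLE(Y);                        % common-mean MLE under the null
lnlam = 0;
for i = 1:m
    SSfull = sum((Y{i} - mean(Y{i})).^2);  % squares about the group mean
    SSnull = sum((Y{i} - mu).^2);          % squares about widehat-mu
    lnlam  = lnlam + numel(Y{i}) / 2 * log(SSfull / SSnull);
end
stat = -2 * lnlam;                         % asymptotically chi^2, m-1 d.f.
pval = 1 - chi2cdf(stat, m - 1);
end
\end{lstlisting}
We reject the null hypothesis of a common mean when the statistic is large, that is, when the $p$-value is small.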
\end{document}