#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass article
\begin_preamble
\def\changemargin#1#2{\list{}{\rightmargin#2\leftmargin#1}\item[]}
\let\endchangemargin=\endlist
\pagenumbering{gobble}
\usepackage{pxfonts}
\usepackage{color}
\definecolor{commentgreen}{RGB}{0,94,11}
\definecolor{darkblue}{rgb}{0,0,0.75}
\definecolor{darkred}{rgb}{0.6,0,0}
\end_preamble
\use_default_options true
\begin_modules
customHeadersFooters
minimalistic
todonotes
\end_modules
\maintain_unincluded_children false
\language british
\language_package default
\inputencoding utf8
\fontencoding global
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\use_microtype true
\use_dash_ligatures true
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command biber
\index_command default
\paperfontsize default
\spacing other 1.2
\use_hyperref true
\pdf_title "Training Neural Networks With Backpropagation"
\pdf_author "Andy Pack"
\pdf_subject "EEEM005"
\pdf_keywords "EEEM005"
\pdf_bookmarks true
\pdf_bookmarksnumbered false
\pdf_bookmarksopen false
\pdf_bookmarksopenlevel 1
\pdf_breaklinks false
\pdf_pdfborder true
\pdf_colorlinks false
\pdf_backref false
\pdf_pdfusetitle true
\papersize default
\use_geometry true
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine biblatex
\cite_engine_type authoryear
\biblio_style plain
\biblio_options urldate=long
\biblatex_bibstyle ieee
\biblatex_citestyle ieee
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date true
\justification true
\use_refstyle 1
\use_minted 0
\index Index
\shortcut idx
\color #008000
\end_index
\leftmargin 1.8cm
\topmargin 2cm
\rightmargin 1.8cm
\bottommargin 2cm
\secnumdepth 3
\tocdepth 3
\paragraph_separation skip
\defskip medskip
\is_math_indent 0
\math_numbering_side default
\quotes_style british
\dynamic_quotes 0
\papercolumns 1
\papersides 1
\paperpagestyle fancy
\listings_params "language=Python,breaklines=true,frame=tb,otherkeywords={self},emph={State},emphstyle={\ttb\color{darkred}},basicstyle={\ttfamily},commentstyle={\bfseries\color{commentgreen}\itshape},keywordstyle={\color{darkblue}},emphstyle={\color{red}},stringstyle={\color{red}}"
\bullet 1 0 9 -1
\bullet 2 0 24 -1
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Title
\size giant
Training Neural Networks with Backpropagation
\end_layout
\begin_layout Author
Andy Pack
\end_layout
\begin_layout Standard
\begin_inset VSpace 15pheight%
\end_inset
\end_layout
\begin_layout Standard
\align center
\begin_inset Graphics
filename surrey.png
lyxscale 15
width 40col%
\end_inset
\end_layout
\begin_layout Standard
\begin_inset VSpace vfill
\end_inset
\end_layout
\begin_layout Standard
\noindent
\align center
EEEM005
\begin_inset Newline newline
\end_inset
May 2021
\size large
\begin_inset Newline newline
\end_inset
Department of Electrical and Electronic Engineering
\begin_inset Newline newline
\end_inset
Faculty of Engineering and Physical Sciences
\begin_inset Newline newline
\end_inset
University of Surrey
\end_layout
\begin_layout Standard
\begin_inset Newpage newpage
\end_inset
\end_layout
\begin_layout Section*
Executive Summary
\end_layout
\begin_layout Standard
Investigations into the accuracy of a shallow multi-layer perceptron at
classifying breast tumours as either benign or malignant are presented.
\noun on
Python
\noun default
and the
\noun on
TensorFlow
\noun default
platform were used to construct and evaluate networks of varied architectures
and training periods.
\end_layout
\begin_layout Standard
For experiment 1, the effect of varying the number of hidden nodes was contrasted
 with that of varying the number of training epochs.
 As the number of hidden nodes was increased from 1 to 64, the speed of
 convergence increased.
 The final converged performance, however, was not significantly affected
 by the size of the hidden layer, settling at around a 4% error rate.
Different learning rates and momentum were selected in order to visualise
the effect of these parameters on error rate.
A larger learning rate was shown to increase the speed of convergence while
also increasing the variance in the results.
\end_layout
\begin_layout Standard
Experiment 2 builds on the previous work by creating a horizontal ensemble
classifier of models that use majority vote for consensus.
In order to make the group classification more robust, variation was introduced
 in the parameters of the constituent models.
These variations could be made either linearly, with equally spaced parameter
values within a range, or by randomly selecting values from the range.
For uniform ensembles, the combined classification error rate was fairly
consistent throughout the tested group sizes with a variance in results
that decreased as the size of the meta-classifier was increased.
For random ensembles, groups of fewer than 5 models showed both slightly
higher error rates and significantly larger variance.
By increasing the size of the ensemble, better accuracy with tighter variance
was achieved.
A random ensemble was recommended in order to better sample the allowed
parameter combinations assuming a larger ensemble is employed.
\end_layout
\begin_layout Standard
Experiment 3 repeats the test apparatus of experiment 2 with variations
in the employed optimisation algorithm.
The RMSprop and Adam algorithms are described in order to present the differenc
es to the previously used stochastic gradient descent.
Throughout the evaluated ensemble sizes and algorithms, the group error
 rate was comparable, indicating that the use of a meta-classifier was able
 to overcome the drawbacks of each.
 Due to the similar accuracy, a recommendation for RMSprop was made as a
 result of its reduced variance in the reported results, indicating a higher
 robustness for the parameters employed.
 For the healthcare domain of the presented classification problem, this
 robustness was prioritised.
\end_layout
\begin_layout Standard
\begin_inset Newpage newpage
\end_inset
\end_layout
\begin_layout Standard
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
pagenumbering{roman}
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset CommandInset toc
LatexCommand tableofcontents
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Newpage pagebreak
\end_inset
\end_layout
\begin_layout Standard
\begin_inset FloatList figure
\end_inset
\end_layout
\begin_layout Standard
\begin_inset FloatList table
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Newpage pagebreak
\end_inset
\end_layout
\begin_layout Right Footer
Andy Pack / 6420013
\end_layout
\begin_layout Left Footer
May 2021
\end_layout
\begin_layout Standard
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
pagenumbering{arabic}
\end_layout
\begin_layout Plain Layout
\backslash
setcounter{page}{1}
\end_layout
\end_inset
\end_layout
\begin_layout Section
Introduction
\end_layout
\begin_layout Standard
Artificial neural networks have been the object of research and investigation
since the 1940s with
\noun on
McCulloch
\noun default
and
\noun on
Pitts
\noun default
' model of the artificial neuron
\begin_inset CommandInset citation
LatexCommand cite
key "McCulloch1943"
literal "false"
\end_inset
or
\emph on
Threshold Logic Unit
\emph default
.
Throughout the century, the development of the single and multi-layer perceptro
ns (SLP/MLP) alongside the backpropagation algorithm
\begin_inset CommandInset citation
LatexCommand cite
key "Rumelhart1986"
literal "false"
\end_inset
advanced the study of artificial intelligence.
Throughout the 2010s, convolutional neural networks have proved critical
in the field of computer vision and image recognition
\begin_inset CommandInset citation
LatexCommand cite
key "alexnet"
literal "false"
\end_inset
.
\end_layout
\begin_layout Standard
This work investigates the ability of a shallow multi-layer perceptron to
classify breast tumours as either benign or malignant.
The architecture and parameters were varied before exploring how the combinatio
n of classifiers can affect performance.
\end_layout
\begin_layout Standard
Investigations were carried out in
\noun on
Python
\noun default
using the
\noun on
TensorFlow
\noun default
package to construct, train and evaluate neural networks.
A
\noun on
Jupyter
\noun default
notebook containing the experiments and the evaluated parameters can be
seen formatted as a single script in appendix
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Source-Code"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
The networks were trained using a supervised learning curriculum of labelled
data taken from a standard
\noun on
MatLab
\noun default
dataset
\begin_inset CommandInset citation
LatexCommand cite
key "matlab-dataset"
literal "false"
\end_inset
from the
\noun on
Deep Learning Toolbox
\noun default
.
For this binary-classification problem there are two possible output formats
 for the network: a single output node (with a threshold of 0.5 to differentiate
 the classes) or two output nodes forming a one-hot vector.
As the labels were formatted as one-hot vectors, two output nodes with
a softmax activation function were used.
The number of parameters associated with the employed architectures of
varying hidden nodes can be seen in appendix
\begin_inset CommandInset ref
LatexCommand ref
reference "app:Network-Parameter-Counts"
plural "false"
caps "false"
noprefix "false"
\end_inset
while a graph of the constructed network can be seen in appendix
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Network-Graph"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
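\begin_layout Standard
As an illustrative sketch, a network of this form can be constructed with the tf.keras API as below; the 9-dimensional input and the sigmoid hidden activation are assumptions consistent with the parameter counts reported in the appendix rather than details stated in the text.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import tensorflow as tf
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
# Minimal sketch of the shallow MLP described above: 9 input features,
\end_layout
\begin_layout Plain Layout
# a single hidden layer and a 2-node softmax output for one-hot labels.
\end_layout
\begin_layout Plain Layout
# The sigmoid hidden activation is an assumption, not taken from the text.
\end_layout
\begin_layout Plain Layout
def build_mlp(n_hidden):
\end_layout
\begin_layout Plain Layout
    return tf.keras.Sequential([
\end_layout
\begin_layout Plain Layout
        tf.keras.layers.Dense(n_hidden, activation='sigmoid', input_shape=(9,)),
\end_layout
\begin_layout Plain Layout
        tf.keras.layers.Dense(2, activation='softmax'),
\end_layout
\begin_layout Plain Layout
    ])
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
model = build_mlp(64)
\end_layout
\begin_layout Plain Layout
model.summary()  # 770 trainable parameters for 64 hidden nodes
\end_layout
\end_inset
\end_layout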
\begin_layout Standard
Section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:exp1"
plural "false"
caps "false"
noprefix "false"
\end_inset
investigates the effect of varying the number of hidden nodes on test accuracy
along with the number of epochs that the MLPs are trained for.
Section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:exp2"
plural "false"
caps "false"
noprefix "false"
\end_inset
builds on the previous experiment by using reasonable parameter values
to investigate performance when using an ensemble of models to classify
in conjunction.
The effect of varying the number of nodes and epochs throughout the ensemble
was considered in order to determine whether combining multiple models
could produce a better accuracy than any individual model.
Section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:exp3"
plural "false"
caps "false"
noprefix "false"
\end_inset
investigates the effect of altering how the networks learn by changing
the optimisation algorithm.
Two additional algorithms to the previously used are considered and compared
using the same test apparatus of section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:exp2"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Section
Hidden Nodes & Epochs
\begin_inset CommandInset label
LatexCommand label
name "sec:exp1"
\end_inset
\end_layout
\begin_layout Standard
This section investigates the effect of varying the number of hidden nodes,
\begin_inset Formula $n_{h}$
\end_inset
, in the single hidden layer of a shallow multi-layer perceptron.
This is compared to the effect of training the model with different numbers
of epochs.
Throughout the experiment, stochastic gradient descent with momentum is
 used as the optimiser; variations in both momentum and learning rate are
 presented.
The learning rate and momentum coefficient used during training are denoted
\begin_inset Formula $\eta$
\end_inset
and
\begin_inset Formula $\beta$
\end_inset
respectively.
\end_layout
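\begin_layout Standard
A minimal sketch of how such a sweep could be run with tf.keras is shown below; the data arrays are placeholders standing in for the breast cancer dataset, and the architecture assumptions match the sketch given in the introduction.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import numpy as np
\end_layout
\begin_layout Plain Layout
import tensorflow as tf
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
# Placeholder arrays standing in for the breast cancer data (9 features,
\end_layout
\begin_layout Plain Layout
# one-hot labels); sizes and values here are arbitrary.
\end_layout
\begin_layout Plain Layout
x_train, x_test = np.random.rand(500, 9), np.random.rand(200, 9)
\end_layout
\begin_layout Plain Layout
y_train = tf.keras.utils.to_categorical(np.random.randint(2, size=500), 2)
\end_layout
\begin_layout Plain Layout
y_test = tf.keras.utils.to_categorical(np.random.randint(2, size=200), 2)
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
eta, beta = 0.1, 0.0  # learning rate and momentum coefficient under test
\end_layout
\begin_layout Plain Layout
for n_hidden in [1, 2, 4, 8, 16, 32, 64]:
\end_layout
\begin_layout Plain Layout
    model = tf.keras.Sequential([
\end_layout
\begin_layout Plain Layout
        tf.keras.layers.Dense(n_hidden, activation='sigmoid', input_shape=(9,)),
\end_layout
\begin_layout Plain Layout
        tf.keras.layers.Dense(2, activation='softmax')])
\end_layout
\begin_layout Plain Layout
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=eta, momentum=beta),
\end_layout
\begin_layout Plain Layout
                  loss='categorical_crossentropy', metrics=['accuracy'])
\end_layout
\begin_layout Plain Layout
    model.fit(x_train, y_train, epochs=100, batch_size=35, verbose=0)
\end_layout
\begin_layout Plain Layout
    _, accuracy = model.evaluate(x_test, y_test, verbose=0)
\end_layout
\begin_layout Plain Layout
    print(n_hidden, 1 - accuracy)  # test error rate for this architecture
\end_layout
\end_inset
\end_layout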
\begin_layout Subsection
Results
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-14-error-rate-curves.png
lyxscale 50
width 33col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.05$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-14"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-12-error-rate-curves.png
lyxscale 50
width 33col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.1$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-12"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-13-error-rate-curves.png
lyxscale 50
width 33col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.5$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-13"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Varied hidden node performance results over varied training lengths for
\begin_inset Formula $\eta=0.05,0.1,0.5$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-12,14"
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-test2-12,14"
plural "false"
caps "false"
noprefix "false"
\end_inset
visualises the test performance of hidden nodes up to
\begin_inset Formula $n_{h}=128$
\end_inset
over training periods up to 100 epochs in length.
In general, the error rate can be seen to decrease when the models are
trained for longer.
Increasing
\begin_inset Formula $n_{h}$
\end_inset
decreases the error rate and increases the gradient with which it falls
to a minimum limit of ~4%.
As the learning rate increases, the speed with which the network converges
increases.
For
\begin_inset Formula $\eta=0.05$
\end_inset
, networks with large
\begin_inset Formula $n_{h}$
\end_inset
begin converging after 30 epochs.
This is after only 15 epochs for
\begin_inset Formula $\eta=0.1$
\end_inset
and almost immediately for
\begin_inset Formula $\eta=0.5$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-14-error-rate-std.png
lyxscale 50
width 33col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.05$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-14-std"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-12-error-rate-std.png
lyxscale 50
width 33col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.1$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-12-std"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-13-error-rate-std.png
lyxscale 50
width 33col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.5$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-13-std"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Varied hidden node performance standard deviation results over varied training
lengths for
\begin_inset Formula $\eta=0.05,0.1,0.5$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
, note the larger
\begin_inset Formula $y$
\end_inset
scale for
\begin_inset Formula $\eta=0.5$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-12,14-std"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The standard deviations for the above discussed results of figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-test2-12,14"
plural "false"
caps "false"
noprefix "false"
\end_inset
can be seen in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-test2-12,14-std"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
In general, prior to the networks beginning to converge the standard deviation
is close to 0.
As previously described, this takes place at lower epochs for higher learning
rates.
Once the networks start converging, the standard deviation of the test
error rate increases.
Increasing the learning rate also increases the variance in test error
 rates; the maximum value for
\begin_inset Formula $\eta=0.5$
\end_inset
is double that of the lower
\begin_inset Formula $\eta$
\end_inset
experiments within the first 20 epochs.
\end_layout
\begin_layout Standard
The results from figures
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-test2-13"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-test2-13-std"
plural "false"
caps "false"
noprefix "false"
\end_inset
can be seen repeated in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-test2-13-individual"
plural "false"
caps "false"
noprefix "false"
\end_inset
separately to include train error rates along with the previously reported
test performance.
In general, the test and train accuracies can be seen to be very similar.
 While at 10 epochs the train and test accuracies are effectively equal,
 by 100 epochs the test error rate has increased slightly above the training
 value.
 Additionally, at low epochs and high node counts, the variance in the test
 error rate is significantly higher than that of the training error rate.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-13-test-train-error-rate.png
lyxscale 20
width 50col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Error rates across nodes over epochs
\end_layout
\end_inset
\end_layout
\end_inset
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-13-test-train-error-rate-std.png
lyxscale 20
width 50col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Standard deviations across nodes over epochs
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Test and train results for
\begin_inset Formula $\eta=0.5$
\end_inset
,
\begin_inset Formula $\beta=0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-13-individual"
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The effect of varying momentum can be seen in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-momentums"
plural "false"
caps "false"
noprefix "false"
\end_inset
; a fixed learning rate of
\begin_inset Formula $\eta=0.01$
\end_inset
was maintained throughout.
The meaning of momentum and its effect on training are discussed in section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Stochastic-Gradient-Descent"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Without momentum (
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-test2-11"
plural "false"
caps "false"
noprefix "false"
\end_inset
), it can be seen that the networks do not begin to converge within 100
 epochs.
This is also the case for
\begin_inset Formula $\beta=0.3$
\end_inset
; it is only by
\begin_inset Formula $\beta=0.5$
\end_inset
that some of the evaluated architectures begin to converge.
The test error rates for the 32 and 64 node series begin to decrease after
64 epochs with
\begin_inset Formula $n_{h}=64$
\end_inset
nodes descending faster.
With
\begin_inset Formula $\beta=0.7$
\end_inset
, the 32 and 64-node networks begin to converge earlier, after 32 epochs
while the remaining architectures down to 2 nodes begin to converge after
64 epochs.
Finally, with
\begin_inset Formula $\beta=0.9$
\end_inset
, all of the evaluated architectures have converged by 64 epochs.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-7-error-rate-curves.png
lyxscale 50
width 45col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.01$
\end_inset
,
\begin_inset Formula $\beta=0.9$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-7"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-10-error-rate-curves.png
lyxscale 50
width 45col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.01$
\end_inset
,
\begin_inset Formula $\beta=0.7$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-10"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-8-error-rate-curves.png
lyxscale 50
width 45col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.01$
\end_inset
,
\begin_inset Formula $\beta=0.5$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-8-1"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-9-error-rate-curves.png
lyxscale 50
width 45col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.01$
\end_inset
,
\begin_inset Formula $\beta=0.3$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-9"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp1-test2-11-error-rate-curves.png
lyxscale 50
width 45col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset Formula $\eta=0.01$
\end_inset
,
\begin_inset Formula $\beta=0.0$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-test2-11"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Varied hidden node performance results over varied training length with
different momentum coefficients
\begin_inset CommandInset label
LatexCommand label
name "fig:exp1-momentums"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Discussion
\end_layout
\begin_layout Standard
From the presented results, it can be seen that, generally, increasing either
learning rate or momentum increases the speed of convergence.
Increasing the learning rate makes convergence faster as the steps taken
 across the error surface by the weight set are larger.
\end_layout
\begin_layout Standard
Increasing the number of hidden nodes also increases the speed of convergence.
However, it is worth noting that a large number of nodes is not required
 to achieve high accuracy.
 A single hidden node, for a total of 14 parameters, was able with enough
 training to achieve similar results to a 64-node network of 770 parameters,
 around 55 times as many.
\end_layout
\begin_layout Standard
From the test/train comparisons, the slight divergence of test error rates
from training error would suggest that the network is overfitting to the
training data, reducing its ability to generalise.
\end_layout
\begin_layout Section
Ensemble Classification
\begin_inset CommandInset label
LatexCommand label
name "sec:exp2"
\end_inset
\end_layout
\begin_layout Standard
A horizontal ensemble of
\begin_inset Formula $m$
\end_inset
models was constructed with majority vote in order to investigate whether
this could improve performance over that of any single model.
In order to introduce variation between models of the ensemble, a range
of hidden nodes, learning rate and/or epochs could be defined.
When selecting parameters throughout the ensemble, either
\begin_inset Formula $m$
\end_inset
equally spaced values
\begin_inset Foot
status open
\begin_layout Plain Layout
For
\begin_inset Formula $m=1$
\end_inset
, the average of the range is taken
\end_layout
\end_inset
or random values are selected within the range.
The former equally spaced selections are referred to as
\emph on
uniform
\emph default
ensembles in this work to distinguish them from random ensembles.
\end_layout
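\begin_layout Standard
A small sketch of the two selection schemes is given below; the function name and the example ranges are illustrative only.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import numpy as np
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
def parameter_values(low, high, m, random_selection=False):
\end_layout
\begin_layout Plain Layout
    # m values drawn from [low, high]: equally spaced for a uniform
\end_layout
\begin_layout Plain Layout
    # ensemble, or drawn at random for a random ensemble.
\end_layout
\begin_layout Plain Layout
    if random_selection:
\end_layout
\begin_layout Plain Layout
        return np.random.uniform(low, high, m)
\end_layout
\begin_layout Plain Layout
    if m == 1:
\end_layout
\begin_layout Plain Layout
        return np.array([(low + high) / 2])  # mean of the range for m = 1
\end_layout
\begin_layout Plain Layout
    return np.linspace(low, high, m)
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
print(parameter_values(1, 400, 5))                            # uniform node counts
\end_layout
\begin_layout Plain Layout
print(parameter_values(0.01, 0.1, 5, random_selection=True))  # random learning rates
\end_layout
\end_inset
\end_layout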
\begin_layout Standard
The statistic
\emph on
agreement
\emph default
,
\begin_inset Formula $a$
\end_inset
, is defined as the proportion of models under the meta-classifier that
correctly predict a sample's class when the ensemble correctly classifies.
It could also be considered the confidence of the meta-classifier; for
 a single horizontal model
\begin_inset Formula $a_{m=1}\equiv1$
\end_inset
.
As error rates are presented as opposed to accuracy, this is inverted by
\begin_inset Formula $d=1-a$
\end_inset
to
\emph on
disagreement
\emph default
, the proportion of incorrect models when the group classifies correctly.
Alongside the disagreement and ensemble test accuracy, the average individual
 accuracy on both test and training data across all models within the meta-classifiers
 is also presented.
\end_layout
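\begin_layout Standard
A sketch of how the majority vote and the disagreement statistic could be computed from per-model predictions is shown below; the prediction matrix and labels are illustrative stand-ins.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import numpy as np
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
# One row per model, one column per test sample (predicted class 0 or 1);
\end_layout
\begin_layout Plain Layout
# these values are illustrative only.
\end_layout
\begin_layout Plain Layout
predictions = np.array([[1, 0, 1, 1],
\end_layout
\begin_layout Plain Layout
                        [1, 1, 1, 0],
\end_layout
\begin_layout Plain Layout
                        [0, 0, 1, 1]])
\end_layout
\begin_layout Plain Layout
labels = np.array([1, 0, 1, 1])
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
votes = predictions.mean(axis=0) >= 0.5    # majority vote across the m models
\end_layout
\begin_layout Plain Layout
ensemble_error = np.mean(votes != labels)  # ensemble error rate
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
correct = votes == labels                  # samples the ensemble gets right
\end_layout
\begin_layout Plain Layout
disagreement = np.mean(predictions[:, correct] != labels[correct])
\end_layout
\begin_layout Plain Layout
print(ensemble_error, disagreement)
\end_layout
\end_inset
\end_layout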
\begin_layout Subsection
Results
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp2-test8-error-rate-curves.png
lyxscale 50
width 50col%
\end_inset
\begin_inset Graphics
filename ../graphs/exp2-test8-error-rate-std.png
lyxscale 50
width 50col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Uniform ensemble classifier performance results for
\begin_inset Formula $\eta=0.03$
\end_inset
,
\begin_inset Formula $\beta=0.01$
\end_inset
, nodes =
\begin_inset Formula $1-400$
\end_inset
, epochs =
\begin_inset Formula $5-100$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp2-test8"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
An investigation into a uniform ensemble of variable nodes and epochs can
be seen in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp2-test8"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
The ensemble error rate varies between 16% and 18%.
The individual test and train error rates begin at ~17% and increase to
20% as the number of models increases.
As
\begin_inset Formula $m$
\end_inset
increases, the variance in individual and ensemble error rates decreases;
 the individual rates decrease sharply while the ensemble error rate decreases
more gradually.
The disagreement increases from
\begin_inset Formula $m=1$
\end_inset
to
\begin_inset Formula $m=9$
\end_inset
and appears to be fairly constant beyond this.
\end_layout
\begin_layout Standard
An experiment with a fixed epoch value throughout the uniform ensemble is
presented in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp2-test10"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Nodes between 1 and 400 were selected for the classifiers with a learning
rate,
\begin_inset Formula $\eta=0.15$
\end_inset
and momentum,
\begin_inset Formula $\beta=0.01$
\end_inset
.
The ensemble error rate can be seen to be fairly constant at around 5% across
 the evaluated numbers of horizontal models; 3 models was the least accurate
 configuration, with a higher standard deviation.
The 3-model configuration also shows a significant spike in disagreement and
 individual error rates, which gradually decreases as the number of models
 increases.
The ensemble accuracy was higher than that of the individual models, significantly
 so for 3 models (~10%) and by a single percent after
\begin_inset Formula $m=25$
\end_inset
.
The variance in ensemble accuracy decreases as the number of models increases.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp2-test10-error-rate-curves.png
lyxscale 50
width 50col%
\end_inset
\begin_inset Graphics
filename ../graphs/exp2-test10-error-rate-std.png
lyxscale 50
width 50col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Uniform ensemble classifier performance results for
\begin_inset Formula $\eta=0.15$
\end_inset
,
\begin_inset Formula $\beta=0.01$
\end_inset
, nodes =
\begin_inset Formula $1-400$
\end_inset
, epochs = 20
\begin_inset CommandInset label
LatexCommand label
name "fig:exp2-test10"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Results for a random ensemble can be seen in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp2-test19"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
The ensemble error rate can be seen to be lower than the individual values
with roughly a 2% improvement.
The ensemble error rate is also fairly constant for
\begin_inset Formula $m>3$
\end_inset
; for
\begin_inset Formula $m=3$
\end_inset
the ensemble error rate is higher in line with the higher individual error
rates.
For
\begin_inset Formula $m>1$
\end_inset
, the disagreement can be seen to be fairly constant at around 3%.
Looking at the variance, for all series the variance reduces as
\begin_inset Formula $m$
\end_inset
increases.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp2-test19-error-rate-curves.png
lyxscale 50
width 50col%
\end_inset
\begin_inset Graphics
filename ../graphs/exp2-test19-error-rate-std.png
lyxscale 50
width 50col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Random ensemble classifier performance results for
\begin_inset Formula $\eta=0.01-0.1$
\end_inset
,
\begin_inset Formula $\beta=0.0$
\end_inset
, nodes =
\begin_inset Formula $1-100$
\end_inset
, epochs =
\begin_inset Formula $10-70$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp2-test19"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Discussion
\end_layout
\begin_layout Standard
In general, the use of an ensemble classifier slightly increased the accuracy
 over that of the individual models used within it.
When using a random ensemble, the accuracy for
\begin_inset Formula $m=1$
\end_inset
was worse than for a uniform ensemble.
This is because for a uniform ensemble, the parameter values were taken
 as the mean of the provided ranges, which could be expected to provide reasonable
 values for each.
With a random ensemble, an ineffective combination of parameters could
be selected and there are no other models in the group to balance this
out.
\end_layout
\begin_layout Standard
Throughout the presented experiments, 3 horizontal models was shown to be
the worst-performing configuration with lower ensemble accuracy and higher
disagreement.
This is likely due to the larger proportion of the group that a single
model constitutes.
When correct, three models may only have a disagreement of 1/3 or 0 and
thus the final value will lie somewhere between these two.
As the number of horizontal models increases, the number of possible
disagreement values increases.
As such, a value of
\begin_inset Formula $m>5$
\end_inset
allows more granularity in making ensemble decisions.
\end_layout
\begin_layout Standard
A random ensemble would be recommended in order to broadly sample parameter
combinations for the provided ranges.
For a uniform ensemble, each model will have parameters selected from the
 same relative point in each range, which may lead to consistently ineffective
 combinations depending on the selected ranges
\begin_inset Foot
status open
\begin_layout Plain Layout
For a range of epochs from 1 to 100 and a range of hidden nodes from 1 to
 100, each model in a uniform ensemble will have the same value for both.
 As a result, the particular ranges specified matter more for a uniform ensemble
 than they would if the values were selected randomly.
\end_layout
\end_inset
.
When using a random ensemble it is important to use a large enough group
of models in order to achieve robust group classification.
A value for
\begin_inset Formula $m>10$
\end_inset
is recommended to achieve this.
\end_layout
\begin_layout Standard
In general, increasing the size of the ensemble did not guarantee an increase
 in performance; however, it did tend to reduce the variance in accuracy.
 This is a desirable result where robustness is valued, such as in the healthcare
 domain of this work.
\end_layout
\begin_layout Section
Optimiser Comparisons
\begin_inset CommandInset label
LatexCommand label
name "sec:exp3"
\end_inset
\end_layout
\begin_layout Standard
Throughout the previous experiments, the stochastic gradient descent optimiser
 was used to update the weights of the networks, but there are many other
 optimisation algorithms.
This section will present investigations into two other optimisation algorithms
and discuss the differences between them using the horizontal ensemble
classification of the previous section.
\end_layout
\begin_layout Standard
Prior to these investigations, however, stochastic gradient descent and
the two other subject algorithms will be described.
\end_layout
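\begin_layout Standard
As a reference for the descriptions that follow, the three optimisers under comparison can be constructed through the tf.keras API as sketched below; the hyperparameter values shown are illustrative defaults rather than the exact values used in every experiment.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import tensorflow as tf
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
sgd = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.0)
\end_layout
\begin_layout Plain Layout
rmsprop = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, epsilon=1e-7)
\end_layout
\begin_layout Plain Layout
adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9,
\end_layout
\begin_layout Plain Layout
                                beta_2=0.999, epsilon=1e-7)
\end_layout
\begin_layout Plain Layout
# Each can then be passed to model.compile(optimizer=..., ...) as before.
\end_layout
\end_inset
\end_layout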
\begin_layout Subsection
Optimisers
\end_layout
\begin_layout Subsubsection
Stochastic Gradient Descent
\begin_inset CommandInset label
LatexCommand label
name "subsec:Stochastic-Gradient-Descent"
\end_inset
\end_layout
\begin_layout Standard
Gradient descent and the closely related stochastic and mini-batch gradient
descent are popular optimisation algorithms in the machine learning space.
\end_layout
\begin_layout Standard
The aim of the neural networks in question is to make correct classifications
 on sample data being fed forward; ideally, the network's classification would
 be equal to the provided label.
A loss function,
\begin_inset Formula $J$
\end_inset
, is defined as the difference between the predicted output and the target
labelled output
\begin_inset Foot
status open
\begin_layout Plain Layout
There are many different options for the loss function including mean squared
error and categorical cross-entropy.
Although they have significant differences, this coverage of optimisation
algorithms does not rely on a specific loss function.
\end_layout
\end_inset
; it follows that the aim is to minimise this value as far as possible.
In order to improve the network, the values of the parameters,
\begin_inset Formula $\theta$
\end_inset
, must be changed with the intention of reducing the loss value.
From a set of starting weights,
\begin_inset Formula $\theta_{0}$
\end_inset
, this could be completed by finding the gradient of
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\xout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula $J$
\end_inset
w.r.t
\family default
\series default
\shape default
\size default
\emph default
\bar default
\strikeout default
\xout default
\uuline default
\uwave default
\noun default
\color inherit
\begin_inset Formula $\theta_{0}$
\end_inset
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\xout off
\uuline off
\uwave off
\noun off
\color none
.
Formally this would be
\begin_inset Formula $\nabla_{\theta_{0}}J\left(\theta_{0}\right)$
\end_inset
, the first derivative of the loss function with respect to the current
weights.
In order to reduce the loss, the gradient should be subtracted from the
 current weights; a scale factor,
\begin_inset Formula $\eta$
\end_inset
, known as the learning rate, is defined to apply a tuneable proportion of
 the gradient to the starting values.
\end_layout
\begin_layout Standard
In order to iteratively apply this algorithm, the form below is used for
time steps,
\begin_inset Formula $t$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
\theta_{t+1}=\theta_{t}-\eta\cdot\nabla_{\theta_{t}}J\left(\theta_{t}\right)
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
The differences between standard or batch gradient descent and the previously
mentioned variants is how many samples are fed-forward as part of the optimisat
ion algorithm.
Standard gradient descent propagates and calculates weight changes for
the entire training dataset in a single iteration of the algorithm.
Stochastic gradient descent, instead, processes only one sample during
an iteration.
Mini-batch strikes a balance between the two, the speed of stochastic gradient
descent is retained as more weight updates are made, however the path through
the error surface can be noisier than vanilla gradient descent.
Therefore, although the algorithm is colloquially referred to as gradient
descent or SGD, more strictly as a batch size of 35 was used for this work,
mini-batch gradient descent is being used.
\end_layout
\begin_layout Standard
\noun on
TensorFlow's
\noun default
implementation of SGD also includes a momentum parameter.
Momentum aims to help a network increase the speed of convergence and reduce
oscillations by reinforcing dimensions (weights) that are changing in a
consistent direction while slowing dimensions that are changing direction
rapidly
\begin_inset CommandInset citation
LatexCommand cite
key "paperspace-mom-rmsprop-adam"
literal "false"
\end_inset
.
Momentum introduces a memory element to the descent by including a portion,
\begin_inset Formula $\beta$
\end_inset
, of the previous step's weight delta or
\emph on
velocity
\emph default
in subsequent iterations.
\end_layout
\begin_layout Standard
The introduction of momentum can be described as below
\begin_inset CommandInset citation
LatexCommand cite
key "tf.keras.optimizers.SGD"
literal "false"
\end_inset
,
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
v_{t}=\beta\cdot v_{t-1}-\eta\cdot\nabla_{\theta_{t}}J\left(\theta_{t}\right)\label{eq:sgd-momentum}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
\theta_{t+1}=\theta_{t}+v_{t}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
As previously presented (figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp1-momentums"
plural "false"
caps "false"
noprefix "false"
\end_inset
), momentum can significantly increase convergence speed.
\end_layout
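\begin_layout Standard
The momentum update above can also be expressed directly in code; the sketch below is a minimal NumPy illustration of the two equations rather than the TensorFlow implementation, with an illustrative quadratic loss.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import numpy as np
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
def sgd_momentum_step(theta, velocity, grad, eta=0.01, beta=0.9):
\end_layout
\begin_layout Plain Layout
    velocity = beta * velocity - eta * grad  # v_t from the previous velocity
\end_layout
\begin_layout Plain Layout
    return theta + velocity, velocity        # theta_{t+1} = theta_t + v_t
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
# Toy usage on J(theta) = theta^2, whose gradient is 2 * theta
\end_layout
\begin_layout Plain Layout
theta, velocity = np.array([1.0]), np.zeros(1)
\end_layout
\begin_layout Plain Layout
for _ in range(100):
\end_layout
\begin_layout Plain Layout
    theta, velocity = sgd_momentum_step(theta, velocity, 2 * theta)
\end_layout
\begin_layout Plain Layout
print(theta)
\end_layout
\end_inset
\end_layout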
\begin_layout Subsubsection
RMSprop
\end_layout
\begin_layout Standard
Although gradient descent is a powerful optimisation algorithm, there are
drawbacks.
One limitation is that the learning rate,
\begin_inset Formula $\eta$
\end_inset
, is a scalar applied to all gradients.
As a result, parameters with smaller gradients, such as those found near
 saddle points, move slowly.
An alternative would be to expand the single scalar to a learning rate
per parameter that could move dynamically throughout the training process,
known as adaptive learning rate optimisation.
\end_layout
\begin_layout Standard
One such algorithm is RMSprop
\begin_inset CommandInset citation
LatexCommand cite
key "rmsprop-hinton"
literal "false"
\end_inset
or
\emph on
root mean square propagation
\emph default
, an unpublished algorithm that builds on previous adaptive algorithms such
as Rprop and Adagrad.
These aimed to overcome the shortcomings of SGD by using just the sign
of the calculated gradients and allowing the learning rate alone to define
the size of the step.
Instead of a constant or defined learning rate schedule, each learning
rate
\emph on
floats
\emph default
and is scaled up or down based on whether the corresponding gradient consistently
 points in the same direction across iterations.
\end_layout
\begin_layout Standard
Equations for RMSprop can be seen below
\begin_inset CommandInset citation
LatexCommand cite
key "understanding-rmsprop"
literal "false"
\end_inset
.
For conciseness, the previously defined derivative of the loss function
w.r.t to the current parameters is shortened,
\begin_inset Formula $g_{t}=\nabla_{\theta_{t}}J\left(\theta_{t}\right)$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
E\left[g^{2}\right]_{t}=\alpha\cdot E\left[g^{2}\right]_{t-1}+\left(1-\alpha\right)\cdot g_{t}^{2}\label{eq:rmsprop-expected-value}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
\theta_{t+1}=\theta_{t}-\frac{\eta}{\sqrt{E\left[g^{2}\right]_{t}+\epsilon}}g_{t}\label{eq:rmsprop-update}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
As previously mentioned, using only the sign of the gradient can
 be achieved by dividing
\begin_inset Formula $g$
\end_inset
by the magnitude
\begin_inset Formula $|g|$
\end_inset
.
RMSprop extends this by instead dividing by the exponential average of
squared gradients, equation
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:rmsprop-expected-value"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
In this equation,
\begin_inset Formula $\alpha$
\end_inset
constitutes the gradient decay rate, a value of 0.9 is suggested
\begin_inset CommandInset citation
LatexCommand cite
key "understanding-rmsprop"
literal "false"
\end_inset
.
\begin_inset Formula $\epsilon$
\end_inset
is a small constant on the order of
\begin_inset Formula $1\times10^{-7}$
\end_inset
that stops the algorithm from dividing by 0.
\end_layout
\begin_layout Subsubsection
Adam
\end_layout
\begin_layout Standard
Adam or
\emph on
adaptive moment estimation
\emph default
is an optimisation algorithm that combines the adaptive learning rates
of RMSprop with the previously described momentum
\begin_inset CommandInset citation
LatexCommand cite
key "adam-paper"
literal "false"
\end_inset
.
Like RMSprop, the exponential average of squared gradients is maintained,
compare equations
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:rmsprop-expected-value"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:adam-squared-grad-accum"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
In addition to this, however, the exponential average of gradients is maintaine
d with a similar function to momentum, compare equations
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:sgd-momentum"
plural "false"
caps "false"
noprefix "false"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:adam-momentum"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
m_{t}=\beta_{1}\cdot m_{t-1}+\left(1-\beta_{1}\right)g_{t}\label{eq:adam-momentum}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
v_{t}=\beta_{2}\cdot v_{t-1}+\left(1-\beta_{2}\right)g_{t}^{2}\label{eq:adam-squared-grad-accum}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
These two equations constitute the eponymous moments:
\begin_inset Formula $m_{t}$
\end_inset
is the first moment or mean while
\begin_inset Formula $v_{t}$
\end_inset
is the second moment or the uncentered variance of the gradients.
As the moments are initialised at zero, the estimates are biased towards 0
 during the early iterations.
The original authors correct the bias using the below
\begin_inset CommandInset citation
LatexCommand cite
key "adam-paper"
literal "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
\hat{m}_{t}=\frac{m_{t}}{1-\beta_{1}^{t}}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
\hat{v}_{t}=\frac{v_{t}}{1-\beta_{2}^{t}}
\end{equation}
\end_inset
\end_layout
\begin_layout Standard
This leaves the update step itself, described below.
Similarities can be seen between the previous RMSprop update step (equation
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:rmsprop-update"
plural "false"
caps "false"
noprefix "false"
\end_inset
) and that of Adam.
The RMSprop squared-gradient average
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\xout off
\uuline off
\uwave off
\noun off
\color none
\begin_inset Formula $E\left[g^{2}\right]_{t}$
\end_inset
has been replaced by the equivalent
\begin_inset Formula $v_{t}$
\end_inset
while the calculated gradient,
\begin_inset Formula $g_{t}$
\end_inset
has been replaced by the exponentially decaying average gradient,
\begin_inset Formula $m_{t}$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Formula
\begin{equation}
\theta_{t+1}=\theta_{t}-\frac{\eta}{\sqrt{\hat{v}_{t}+\epsilon}}\hat{m}_{t}\label{eq:adam-update}
\end{equation}
\end_inset
\end_layout
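\begin_layout Standard
The complete update can be sketched in NumPy as below, combining the moment estimates, bias correction and update step; this is an illustration of the equations rather than the library implementation.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import numpy as np
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
def adam_step(theta, m, v, grad, t, eta=0.001, beta1=0.9, beta2=0.999, eps=1e-7):
\end_layout
\begin_layout Plain Layout
    m = beta1 * m + (1 - beta1) * grad        # first moment (mean)
\end_layout
\begin_layout Plain Layout
    v = beta2 * v + (1 - beta2) * grad ** 2   # second moment (uncentred variance)
\end_layout
\begin_layout Plain Layout
    m_hat = m / (1 - beta1 ** t)              # bias correction, t counted from 1
\end_layout
\begin_layout Plain Layout
    v_hat = v / (1 - beta2 ** t)
\end_layout
\begin_layout Plain Layout
    theta = theta - eta / np.sqrt(v_hat + eps) * m_hat
\end_layout
\begin_layout Plain Layout
    return theta, m, v
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
theta, m, v = np.array([1.0]), np.zeros(1), np.zeros(1)
\end_layout
\begin_layout Plain Layout
for t in range(1, 101):
\end_layout
\begin_layout Plain Layout
    theta, m, v = adam_step(theta, m, v, 2 * theta, t)
\end_layout
\begin_layout Plain Layout
print(theta)
\end_layout
\end_inset
\end_layout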
\begin_layout Subsection
Results
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp3-test1-error-rate-curves.png
lyxscale 30
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Uniform ensemble classifier performance results for SGD, RMSprop and Adam
optimisation with
\begin_inset Formula $\eta=0.1$
\end_inset
,
\begin_inset Formula $\beta=0.0$
\end_inset
, nodes = 16, epochs =
\begin_inset Formula $1-100$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp3-test1"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
A uniform ensemble classifier of fixed
\begin_inset Formula $\eta$
\end_inset
,
\begin_inset Formula $\beta$
\end_inset
and
\begin_inset Formula $n_{h}$
\end_inset
was evaluated; the results can be seen in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp3-test1"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Across the values of
\begin_inset Formula $m$
\end_inset
and the three evaluated optimisers, the ensemble error rates did not change
 significantly, with ~1% difference throughout.
 The individual error rates and disagreement were more varied; Adam and RMSprop
 reported ~10% lower errors for
\begin_inset Formula $m>5$
\end_inset
.
The previously described spike in individual error rates and disagreement
for
\begin_inset Formula $m=3$
\end_inset
can be seen.
Figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp3-test1-std"
plural "false"
caps "false"
noprefix "false"
\end_inset
presents the variance in these results.
The Adam optimiser showed significantly higher variance for individual
test error and disagreement while the SGD algorithm showed the lowest variance
throughout.
RMSprop demonstrated a higher initial variance for disagreement than SGD
and then comparable results for
\begin_inset Formula $m>3$
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp3-test1-errors-rate-std.png
lyxscale 30
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Uniform ensemble classifier test variance for SGD, RMSprop and Adam optimisation
with
\begin_inset Formula $\eta=0.1$
\end_inset
,
\begin_inset Formula $\beta=0.0$
\end_inset
, nodes = 16, epochs =
\begin_inset Formula $1-100$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp3-test1-std"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Note Comment
status open
\begin_layout Plain Layout
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp3-test7-error-rate-curves.png
lyxscale 30
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Uniform ensemble classifier performance results for SGD, RMSprop and Adam
optimisation with
\begin_inset Formula $\eta=0.1$
\end_inset
,
\begin_inset Formula $\beta=0.9$
\end_inset
, nodes =
\begin_inset Formula $1-400$
\end_inset
, epochs =
\begin_inset Formula $50-100$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp3-test7"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp3-test7-errors-rate-std.png
lyxscale 30
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Uniform ensemble classifier test variance for SGD, RMSprop and Adam optimisation
with
\begin_inset Formula $\eta=0.1$
\end_inset
,
\begin_inset Formula $\beta=0.9$
\end_inset
, nodes =
\begin_inset Formula $1-400$
\end_inset
, epochs =
\begin_inset Formula $50-100$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp3-test7-std"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The effect of random parameter selection from ranges of learning
 rates, hidden nodes and epochs was evaluated; the results can be seen in
 figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp3-test9"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
Similar to the previously presented random ensemble, the ensemble error
rate is higher for
\begin_inset Formula $m=1$
\end_inset
.
Past this, the ensemble error rate is fairly constant throughout the range
for
\begin_inset Formula $m$
\end_inset
over the three optimisers at ~4%.
Figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:exp3-test9-std"
plural "false"
caps "false"
noprefix "false"
\end_inset
presents the variance in these results; RMSprop demonstrated the lowest
standard deviation throughout the range of
\begin_inset Formula $m$
\end_inset
for all series.
SGD and Adam showed a higher variance for
\begin_inset Formula $m\leq5$
\end_inset
before descending to similar values as RMSprop.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp3-test9-error-rate-curves.png
lyxscale 30
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Random ensemble classifier performance results for SGD, RMSprop and Adam
optimisation with
\begin_inset Formula $\eta=0.01-0.1$
\end_inset
,
\begin_inset Formula $\beta=0.0$
\end_inset
, nodes =
\begin_inset Formula $1-100$
\end_inset
, epochs =
\begin_inset Formula $1-70$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp3-test9"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/exp3-test9-errors-rate-std.png
lyxscale 30
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Random ensemble classifier test variance for SGD, RMSprop and Adam optimisation
with
\begin_inset Formula $\eta=0.01-0.1$
\end_inset
,
\begin_inset Formula $\beta=0.0$
\end_inset
, nodes =
\begin_inset Formula $1-100$
\end_inset
, epochs =
\begin_inset Formula $1-70$
\end_inset
\begin_inset CommandInset label
LatexCommand label
name "fig:exp3-test9-std"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Discussion
\end_layout
\begin_layout Standard
The similarity in group classification despite variations in individual
error rates across the employed algorithms suggests that the use of a meta-classifier
 was able to largely overcome the differences in training regimens.
Similarly to the previous discussion, a larger ensemble reduced the variation
in results, providing a more robust classification.
\end_layout
\begin_layout Standard
In suggesting an optimal algorithm, it is worth considering the intended domains
 of RMSprop and Adam.
 As newer algorithms, the focus tends to be on deep convolutional networks,
 which implies a somewhat different set of requirements such as scaling to
 large numbers of parameters.
 This is not to say that the algorithms are inappropriate for the presented
 application; as demonstrated, these more complex algorithms were able
 to achieve lower individual error rates than the employed gradient descent
 with optional momentum.
\end_layout
\begin_layout Standard
As the ensemble accuracies throughout the ensemble sizes and algorithms
were comparable, a recommendation for an algorithm can instead be made
primarily on the robustness of the meta-classifier, an important factor
in the subject field of healthcare.
RMSprop showed the lowest disagreement in the group and the tightest variation
 in results; for this reason it is recommended based on the presented data.
Although a random ensemble has previously been recommended with the caveat
of using a larger ensemble, RMSprop showed lower variation throughout the
smaller group sizes where Adam showed similar variance to SGD.
\end_layout
\begin_layout Section
Conclusions
\end_layout
\begin_layout Standard
Within this work, shallow multi-layer perceptrons were shown to achieve
 an error rate of approximately 4% when classifying breast tumours as either
 benign or malignant.
Investigations were made into how the number of hidden nodes and the length
of the training period affected the performance.
 In general, increasing the number of hidden nodes increased the speed of
 convergence, while training the network for longer also improved performance.
A learning rate of 0.5 allowed all of the evaluated network architectures
to train within 40 epochs.
\end_layout
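\begin_layout Standard
As a concrete illustration of such a network, the sketch below constructs
 and trains a single-hidden-layer model; the activations, loss function
 and nine-feature input shape are assumptions for illustration rather than
 an excerpt from the notebook of Appendix 
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Source-Code"
plural "false"
caps "false"
noprefix "false"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
from tensorflow import keras
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
# Minimal sketch of the single-hidden-layer architecture; the activations,
\end_layout
\begin_layout Plain Layout
# loss and nine-feature input shape are illustrative assumptions.
\end_layout
\begin_layout Plain Layout
def build_mlp(hidden_nodes, learning_rate=0.5):
\end_layout
\begin_layout Plain Layout
    model = keras.Sequential([
\end_layout
\begin_layout Plain Layout
        keras.layers.Dense(hidden_nodes, activation="sigmoid", input_shape=(9,)),
\end_layout
\begin_layout Plain Layout
        keras.layers.Dense(2, activation="softmax"),
\end_layout
\begin_layout Plain Layout
    ])
\end_layout
\begin_layout Plain Layout
    model.compile(optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
\end_layout
\begin_layout Plain Layout
                  loss="categorical_crossentropy",
\end_layout
\begin_layout Plain Layout
                  metrics=["accuracy"])
\end_layout
\begin_layout Plain Layout
    return model
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
# At a learning rate of 0.5, 40 epochs sufficed for all evaluated sizes, e.g.
\end_layout
\begin_layout Plain Layout
# build_mlp(hidden_nodes=16).fit(x_train, y_train, epochs=40)
\end_layout
\end_inset
\end_layout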
\begin_layout Standard
The effect of creating an ensemble meta-classifier on performance was evaluated
to investigate whether combining models could produce a lower error rate.
While the ensemble accuracy was not significantly affected, the variance
 of the results was reduced, indicating that robustness was increased
when classifying in conjunction.
\end_layout
\begin_layout Standard
Two additional optimisation algorithms were evaluated, RMSprop and Adam.
 Although both reported lower individual test and train error rates than
 gradient descent, the ensemble accuracy was not significantly affected.
 As such, a recommendation for RMSprop was made as it reported the tightest
 variance; for the healthcare domain, a more robust network was prioritised.
\end_layout
\begin_layout Standard
\begin_inset Newpage newpage
\end_inset
\end_layout
\begin_layout Standard
\begin_inset CommandInset label
LatexCommand label
name "sec:bibliography"
\end_inset
\begin_inset CommandInset bibtex
LatexCommand bibtex
btprint "btPrintCited"
bibfiles "references"
options "bibtotoc"
\end_inset
\end_layout
\begin_layout Section
\start_of_appendix
Network Parameter Counts
\begin_inset CommandInset label
LatexCommand label
name "app:Network-Parameter-Counts"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Float table
placement H
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Tabular
<lyxtabular version="3" rows="9" columns="2">
<features tabularvalignment="middle">
<column alignment="center" valignment="top">
<column alignment="center" valignment="top">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Hidden Nodes
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Trainable Parameters
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
1
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
14
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
26
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
4
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
50
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
98
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
194
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
386
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
64
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
770
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
128
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
1,538
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Number of trainable parameters for architectures of varying numbers of hidden
nodes
\begin_inset CommandInset label
LatexCommand label
name "tab:trainable-params"
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\end_layout
\end_inset
\end_layout
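\begin_layout Standard
For reference, the counts in Table 
\begin_inset CommandInset ref
LatexCommand ref
reference "tab:trainable-params"
plural "false"
caps "false"
noprefix "false"
\end_inset
 are consistent with a single hidden layer of 
\begin_inset Formula $m$
\end_inset
 nodes acting on nine input features with two output classes, each layer
 including one bias term per node:
\begin_inset Formula 
\[
\left(9+1\right)m+\left(m+1\right)\times2=12m+2
\]
\end_inset
for example, 
\begin_inset Formula $m=128$
\end_inset
 gives 
\begin_inset Formula $12\times128+2=1538$
\end_inset
 trainable parameters.
\end_layout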
\begin_layout Section
Network Graph
\begin_inset CommandInset label
LatexCommand label
name "sec:Network-Graph"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Float figure
placement H
wide false
sideways false
status open
\begin_layout Plain Layout
\noindent
\align center
\begin_inset Graphics
filename ../graphs/tensorboard-graph.png
lyxscale 50
width 100col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
Single hidden layer neural network as graphed by
\noun on
Tensorboard
\noun default
\begin_inset CommandInset label
LatexCommand label
name "fig:tensorboard"
\end_inset
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Section
Source Code
\begin_inset CommandInset label
LatexCommand label
name "sec:Source-Code"
\end_inset
\end_layout
\begin_layout Standard
\begin_inset CommandInset include
LatexCommand lstinputlisting
filename "../nncw.py"
lstparams "caption={Formatted Jupyter notebook containing experiment code},label={notebook-code}"
\end_inset
\end_layout
\end_body
\end_document