#LyX 2.3 created this file. For more info see http://www.lyx.org/
|
|
\lyxformat 544
|
|
\begin_document
|
|
\begin_header
|
|
\save_transient_properties true
|
|
\origin unavailable
|
|
\textclass article
|
|
\begin_preamble
|
|
\def\changemargin#1#2{\list{}{\rightmargin#2\leftmargin#1}\item[]}
|
|
\let\endchangemargin=\endlist
|
|
\pagenumbering{gobble}
|
|
|
|
\usepackage{pxfonts}
|
|
\usepackage{color}
|
|
|
|
\definecolor{commentgreen}{RGB}{0,94,11}
|
|
\definecolor{darkblue}{rgb}{0,0,0.75}
|
|
\definecolor{darkred}{rgb}{0.6,0,0}
|
|
\end_preamble
|
|
\use_default_options true
|
|
\begin_modules
|
|
customHeadersFooters
|
|
minimalistic
|
|
todonotes
|
|
\end_modules
|
|
\maintain_unincluded_children false
|
|
\language british
|
|
\language_package default
|
|
\inputencoding utf8
|
|
\fontencoding global
|
|
\font_roman "default" "default"
|
|
\font_sans "default" "default"
|
|
\font_typewriter "default" "default"
|
|
\font_math "auto" "auto"
|
|
\font_default_family default
|
|
\use_non_tex_fonts false
|
|
\font_sc false
|
|
\font_osf false
|
|
\font_sf_scale 100 100
|
|
\font_tt_scale 100 100
|
|
\use_microtype true
|
|
\use_dash_ligatures true
|
|
\graphics default
|
|
\default_output_format default
|
|
\output_sync 0
|
|
\bibtex_command biber
|
|
\index_command default
|
|
\paperfontsize default
|
|
\spacing other 1.2
|
|
\use_hyperref true
|
|
\pdf_title "Training Neural Networks With Backpropagation"
|
|
\pdf_author "Andy Pack"
|
|
\pdf_subject "EEEM005"
|
|
\pdf_keywords "EEEM005"
|
|
\pdf_bookmarks true
|
|
\pdf_bookmarksnumbered false
|
|
\pdf_bookmarksopen false
|
|
\pdf_bookmarksopenlevel 1
|
|
\pdf_breaklinks false
|
|
\pdf_pdfborder true
|
|
\pdf_colorlinks false
|
|
\pdf_backref false
|
|
\pdf_pdfusetitle true
|
|
\papersize default
|
|
\use_geometry true
|
|
\use_package amsmath 1
|
|
\use_package amssymb 1
|
|
\use_package cancel 1
|
|
\use_package esint 1
|
|
\use_package mathdots 1
|
|
\use_package mathtools 1
|
|
\use_package mhchem 1
|
|
\use_package stackrel 1
|
|
\use_package stmaryrd 1
|
|
\use_package undertilde 1
|
|
\cite_engine biblatex
|
|
\cite_engine_type authoryear
|
|
\biblio_style plain
|
|
\biblio_options urldate=long
|
|
\biblatex_bibstyle ieee
|
|
\biblatex_citestyle ieee
|
|
\use_bibtopic false
|
|
\use_indices false
|
|
\paperorientation portrait
|
|
\suppress_date true
|
|
\justification true
|
|
\use_refstyle 1
|
|
\use_minted 0
|
|
\index Index
|
|
\shortcut idx
|
|
\color #008000
|
|
\end_index
|
|
\leftmargin 1.8cm
|
|
\topmargin 2cm
|
|
\rightmargin 1.8cm
|
|
\bottommargin 2cm
|
|
\secnumdepth 3
|
|
\tocdepth 3
|
|
\paragraph_separation skip
|
|
\defskip medskip
|
|
\is_math_indent 0
|
|
\math_numbering_side default
|
|
\quotes_style british
|
|
\dynamic_quotes 0
|
|
\papercolumns 1
|
|
\papersides 1
|
|
\paperpagestyle fancy
|
|
\listings_params "language=Python,breaklines=true,frame=tb,otherkeywords={self},emph={State},emphstyle={\ttb\color{darkred}},basicstyle={\ttfamily},commentstyle={\bfseries\color{commentgreen}\itshape},keywordstyle={\color{darkblue}},emphstyle={\color{red}},stringstyle={\color{red}}"
|
|
\bullet 1 0 9 -1
|
|
\bullet 2 0 24 -1
|
|
\tracking_changes false
|
|
\output_changes false
|
|
\html_math_output 0
|
|
\html_css_as_file 0
|
|
\html_be_strict false
|
|
\end_header
|
|
|
|
\begin_body
|
|
|
|
\begin_layout Title
|
|
|
|
\size giant
|
|
Training Neural Networks with Backpropagation
|
|
\end_layout
|
|
|
|
\begin_layout Author
|
|
Andy Pack
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset VSpace 15pheight%
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename surrey.png
|
|
lyxscale 15
|
|
width 40col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset VSpace vfill
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\align center
|
|
EEEM005
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
May 2021
|
|
\size large
|
|
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
Department of Electrical and Electronic Engineering
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
Faculty of Engineering and Physical Sciences
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
University of Surrey
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage newpage
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section*
|
|
Executive Summary
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Investigations into the accuracy of a shallow multi-layer perceptron at
|
|
classifying breast tumours as either benign or malignant are presented.
|
|
|
|
\noun on
|
|
Python
|
|
\noun default
|
|
and the
|
|
\noun on
|
|
TensorFlow
|
|
\noun default
|
|
platform were used to construct and evaluate networks of varied architectures
|
|
and training periods.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
For experiment 1, the effect of varying the number of hidden nodes was contraste
|
|
d with a varied number of training epochs.
|
|
As the number of hidden nodes was increased from 1 to 64, the speed of
|
|
convergence increased.
|
|
The final performance once converged, however, was not significantly affected
|
|
by the size of the hidden layer, remaining at around a 4% error rate.
|
|
Different learning rates and momentum were selected in order to visualise
|
|
the effect of these parameters on error rate.
|
|
A larger learning rate was shown to increase the speed of convergence while
|
|
also increasing the variance in the results.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Experiment 2 builds on the previous work by creating a horizontal ensemble
|
|
classifier of models that use majority vote for consensus.
|
|
In order to vary the models so as to make a more robust classification,
|
|
variations were introduced to the parameters of the models.
|
|
These variations could be made either linearly, with equally spaced parameter
|
|
values within a range, or by randomly selecting values from the range.
|
|
For uniform ensembles, the combined classification error rate was fairly
|
|
consistent throughout the tested group sizes with a variance in results
|
|
that decreased as the size of the meta-classifier was increased.
|
|
For random ensembles, groups of fewer than 5 models showed both slightly
|
|
higher error rates and significantly larger variance.
|
|
By increasing the size of the ensemble, better accuracy with tighter variance
|
|
was achieved.
|
|
A random ensemble was recommended in order to better sample the allowed
|
|
parameter combinations assuming a larger ensemble is employed.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Experiment 3 repeats the test apparatus of experiment 2 with variations
|
|
in the employed optimisation algorithm.
|
|
The RMSprop and Adam algorithms are described in order to present the differenc
|
|
es from the previously used stochastic gradient descent.
|
|
Throughout the evaluated ensemble sizes and algorithms, the group error
|
|
rate was comparable, indicating that the use of a meta-classifier was able
|
|
to overcome the drawbacks of each.
|
|
Due to the similar accuracy, a recommendation for RMSprop was made as a
|
|
result of its reduced variance in the reported results, indicating a higher
|
|
robustness for the parameters employed.
|
|
For the healthcare domain of this work, where robust classification is valued, this reduced variance is a particularly desirable property.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage newpage
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset ERT
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
pagenumbering{roman}
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset toc
|
|
LatexCommand tableofcontents
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage pagebreak
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset FloatList figure
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset FloatList table
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage pagebreak
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Right Footer
|
|
Andy Pack / 6420013
|
|
\end_layout
|
|
|
|
\begin_layout Left Footer
|
|
May 2021
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset ERT
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
pagenumbering{arabic}
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
setcounter{page}{1}
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Introduction
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Artificial neural networks have been the object of research and investigation
|
|
since the 1940s with
|
|
\noun on
|
|
McCulloch
|
|
\noun default
|
|
and
|
|
\noun on
|
|
Pitts
|
|
\noun default
|
|
' model of the artificial neuron
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "McCulloch1943"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
or
|
|
\emph on
|
|
Threshold Logic Unit
|
|
\emph default
|
|
.
|
|
Throughout the century, the development of the single and multi-layer perceptro
|
|
ns (SLP/MLP) alongside the backpropagation algorithm
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "Rumelhart1986"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
advanced the study of artificial intelligence.
|
|
Throughout the 2010s, convolutional neural networks have proved critical
|
|
in the field of computer vision and image recognition
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "alexnet"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
This work investigates the ability of a shallow multi-layer perceptron to
|
|
classify breast tumours as either benign or malignant.
|
|
The architecture and parameters were varied before exploring how the combinatio
|
|
n of classifiers can affect performance.
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Investigations were carried out in
|
|
\noun on
|
|
Python
|
|
\noun default
|
|
using the
|
|
\noun on
|
|
TensorFlow
|
|
\noun default
|
|
package to construct, train and evaluate neural networks.
|
|
A
|
|
\noun on
|
|
Jupyter
|
|
\noun default
|
|
notebook containing the experiments and the evaluated parameters can be
|
|
seen formatted as a single script in appendix
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:Source-Code"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
The networks were trained using a supervised learning curriculum of labelled
|
|
data taken from a standard
|
|
\noun on
|
|
MatLab
|
|
\noun default
|
|
dataset
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "matlab-dataset"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
from the
|
|
\noun on
|
|
Deep Learning Toolbox
|
|
\noun default
|
|
.
|
|
For this binary-classification problem there are two output formats for the network:
|
|
a single output node (threshold of 0.5 to differentiate classes) or two
|
|
output nodes to create a one-hot vector.
|
|
As the labels were formatted as one-hot vectors, two output nodes with
|
|
a softmax activation function were used.
|
|
The number of parameters associated with the employed architectures of
|
|
varying hidden nodes can be seen in appendix
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "app:Network-Parameter-Counts"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
while a graph of the constructed network can be seen in appendix
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:Network-Graph"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
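\begin_layout Standard
As a minimal sketch of how such a network can be constructed and compiled (an illustration assuming the Keras Sequential API, the nine input features of the dataset and a sigmoid hidden activation, rather than a reproduction of the notebook in the appendix), the model-building step may look as follows.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

# Hedged sketch: layer sizes, activations and argument values are assumptions
\end_layout

\begin_layout Plain Layout

import tensorflow as tf
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

def build_mlp(n_hidden, learning_rate=0.1, momentum=0.0):
\end_layout

\begin_layout Plain Layout

    model = tf.keras.Sequential([
\end_layout

\begin_layout Plain Layout

        tf.keras.layers.Dense(n_hidden, activation="sigmoid", input_shape=(9,)),
\end_layout

\begin_layout Plain Layout

        tf.keras.layers.Dense(2, activation="softmax"),  # one-hot output
\end_layout

\begin_layout Plain Layout

    ])
\end_layout

\begin_layout Plain Layout

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate, momentum=momentum),
\end_layout

\begin_layout Plain Layout

                  loss="categorical_crossentropy", metrics=["accuracy"])
\end_layout

\begin_layout Plain Layout

    return model
\end_layout

\end_inset


\end_layout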
|
|
|
|
\begin_layout Standard
|
|
Section
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:exp1"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
investigates the effect of varying the number of hidden nodes on test accuracy
|
|
along with the number of epochs that the MLPs are trained for.
|
|
Section
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:exp2"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
builds on the previous experiment by using reasonable parameter values
|
|
to investigate performance when using an ensemble of models to classify
|
|
in conjunction.
|
|
The effect of varying the number of nodes and epochs throughout the ensemble
|
|
was considered in order to determine whether combining multiple models
|
|
could produce a better accuracy than any individual model.
|
|
Section
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:exp3"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
investigates the effect of altering how the networks learn by changing
|
|
the optimisation algorithm.
|
|
Two additional algorithms to the previously used are considered and compared
|
|
using the same test apparatus of section
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "sec:exp2"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Hidden Nodes & Epochs
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:exp1"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
This section investigates the effect of varying the number of hidden nodes,
|
|
|
|
\begin_inset Formula $n_{h}$
|
|
\end_inset
|
|
|
|
, in the single hidden layer of a shallow multi-layer perceptron.
|
|
This is compared to the effect of training the model with different numbers
|
|
of epochs.
|
|
Throughout the experiment, stochastic gradient descent with momentum is
|
|
used as the optimiser; variations in both momentum and learning rate are
|
|
presented.
|
|
The learning rate and momentum coefficient used during training are denoted
|
|
|
|
\begin_inset Formula $\eta$
|
|
\end_inset
|
|
|
|
and
|
|
\begin_inset Formula $\beta$
|
|
\end_inset
|
|
|
|
respectively.
|
|
\end_layout
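\begin_layout Standard
A hedged sketch of the sweep performed in this section is given below; the names are illustrative only, with build_mlp being the constructor sketched in the introduction and x_train, y_train, x_test, y_test standing in for the split dataset.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

# Illustrative sweep over hidden nodes and training lengths (assumed helper names)
\end_layout

\begin_layout Plain Layout

hidden_nodes = [1, 2, 4, 8, 16, 32, 64, 128]
\end_layout

\begin_layout Plain Layout

training_epochs = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
\end_layout

\begin_layout Plain Layout

error_rates = {}
\end_layout

\begin_layout Plain Layout

for n_h in hidden_nodes:
\end_layout

\begin_layout Plain Layout

    for n_epochs in training_epochs:
\end_layout

\begin_layout Plain Layout

        model = build_mlp(n_h, learning_rate=0.1, momentum=0.0)
\end_layout

\begin_layout Plain Layout

        model.fit(x_train, y_train, epochs=n_epochs, batch_size=35, verbose=0)
\end_layout

\begin_layout Plain Layout

        _, test_acc = model.evaluate(x_test, y_test, verbose=0)
\end_layout

\begin_layout Plain Layout

        error_rates[(n_h, n_epochs)] = 1.0 - test_acc
\end_layout

\end_inset


\end_layout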
|
|
|
|
\begin_layout Subsection
|
|
Results
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-14-error-rate-curves.png
|
|
lyxscale 50
|
|
width 33col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.05$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-14"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-12-error-rate-curves.png
|
|
lyxscale 50
|
|
width 33col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-12"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-13-error-rate-curves.png
|
|
lyxscale 50
|
|
width 33col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.5$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-13"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Varied hidden node performance results over varied training lengths for
|
|
|
|
\begin_inset Formula $\eta=0.05,0.1,0.5$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-12,14"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-test2-12,14"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
visualises the test performance of hidden nodes up to
|
|
\begin_inset Formula $n_{h}=128$
|
|
\end_inset
|
|
|
|
over training periods up to 100 epochs in length.
|
|
In general, the error rate can be seen to decrease when the models are
|
|
trained for longer.
|
|
Increasing
|
|
\begin_inset Formula $n_{h}$
|
|
\end_inset
|
|
|
|
decreases the error rate and increases the gradient with which it falls
|
|
to a minimum limit of ~4%.
|
|
As the learning rate increases, the speed with which the network converges
|
|
increases.
|
|
For
|
|
\begin_inset Formula $\eta=0.05$
|
|
\end_inset
|
|
|
|
, networks with large
|
|
\begin_inset Formula $n_{h}$
|
|
\end_inset
|
|
|
|
begin converging after 30 epochs.
|
|
This is after only 15 epochs for
|
|
\begin_inset Formula $\eta=0.1$
|
|
\end_inset
|
|
|
|
and almost immediately for
|
|
\begin_inset Formula $\eta=0.5$
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-14-error-rate-std.png
|
|
lyxscale 50
|
|
width 33col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.05$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-14-std"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-12-error-rate-std.png
|
|
lyxscale 50
|
|
width 33col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-12-std"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-13-error-rate-std.png
|
|
lyxscale 50
|
|
width 33col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.5$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-13-std"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Varied hidden node performance standard deviation results over varied training
|
|
lengths for
|
|
\begin_inset Formula $\eta=0.05,0.1,0.5$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
, note the larger
|
|
\begin_inset Formula $y$
|
|
\end_inset
|
|
|
|
scale for
|
|
\begin_inset Formula $\eta=0.5$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-12,14-std"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The standard deviations for the previously discussed results of figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-test2-12,14"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
can be seen in figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-test2-12,14-std"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
In general, prior to the networks beginning to converge, the standard deviation
|
|
is close to 0.
|
|
As previously described, this takes place at lower epochs for higher learning
|
|
rates.
|
|
Once the networks start converging, the standard deviation of the test
|
|
error rate increases.
|
|
Increasing the learning rate also increases the variance in test error
|
|
rates; the maximum value for
|
|
\begin_inset Formula $\eta=0.5$
|
|
\end_inset
|
|
|
|
is double that of the lower
|
|
\begin_inset Formula $\eta$
|
|
\end_inset
|
|
|
|
experiments within the first 20 epochs.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The results from figures
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-test2-13"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
and
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-test2-13-std"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
can be seen repeated in figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-test2-13-individual"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
with train error rates included alongside the previously reported
|
|
test performance.
|
|
In general, the test and train accuracies can be seen to be very similar, with
|
|
little difference.
|
|
While at 10 epochs the train and test accuracies are effectively equal,
|
|
by 100 epochs the test error rate has slightly increased from the training
|
|
value.
|
|
Additionally, at low epochs and high node counts, the variance in test
|
|
error rate is significantly higher than that of the training error rate.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-13-test-train-error-rate.png
|
|
lyxscale 20
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Error rates across nodes over epochs
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-13-test-train-error-rate-std.png
|
|
lyxscale 20
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Standard deviations across nodes over epochs
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Test and train results for
|
|
\begin_inset Formula $\eta=0.5$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-13-individual"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The effect of varying momentum can be seen in figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-momentums"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
, a fixed learning rate of
|
|
\begin_inset Formula $\eta=0.01$
|
|
\end_inset
|
|
|
|
was maintained throughout.
|
|
The meaning of momentum and its effect on training is discussed in section
|
|
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "subsec:Stochastic-Gradient-Descent"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
Without momentum (
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-test2-11"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
), it can be seen that the network does not begin to converge within 100
|
|
epochs.
|
|
This is also the case for
|
|
\begin_inset Formula $\beta=0.3$
|
|
\end_inset
|
|
|
|
; it is only by
|
|
\begin_inset Formula $\beta=0.5$
|
|
\end_inset
|
|
|
|
that some of the evaluated architectures begin to converge.
|
|
The test error rates for the 32 and 64-node series begin to decrease after
|
|
64 epochs with
|
|
\begin_inset Formula $n_{h}=64$
|
|
\end_inset
|
|
|
|
nodes descending faster.
|
|
With
|
|
\begin_inset Formula $\beta=0.7$
|
|
\end_inset
|
|
|
|
, the 32 and 64-node networks begin to converge earlier, after 32 epochs
|
|
while the remaining architectures down to 2 nodes begin to converge after
|
|
64 epochs.
|
|
Finally, with
|
|
\begin_inset Formula $\beta=0.9$
|
|
\end_inset
|
|
|
|
, all of the evaluated architectures have converged by 64 epochs.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-7-error-rate-curves.png
|
|
lyxscale 50
|
|
width 45col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.01$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.9$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-7"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-10-error-rate-curves.png
|
|
lyxscale 50
|
|
width 45col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.01$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.7$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-10"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-8-error-rate-curves.png
|
|
lyxscale 50
|
|
width 45col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.01$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.5$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-8-1"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-9-error-rate-curves.png
|
|
lyxscale 50
|
|
width 45col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.01$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.3$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-9"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp1-test2-11-error-rate-curves.png
|
|
lyxscale 50
|
|
width 45col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Formula $\eta=0.01$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.0$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-test2-11"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Varied hidden node performance results over varied training length with
|
|
different momentum coefficients
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp1-momentums"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Discussion
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
From the presented results, it can be seen that, generally, increasing either
|
|
learning rate or momentum increases the speed of convergence.
|
|
Increasing the learning rate makes convergence faster as the steps taken
|
|
by the weight set across the error surface are larger.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Increasing the number of hidden nodes also increases the speed of convergence.
|
|
However, it is worth noting that a large number of nodes is not required
|
|
to achieve high accuracy.
|
|
A single hidden node for a total of 14 parameters, with enough training,
|
|
was able to achieve similar results to a 64-node network of 770, around 55 times
|
|
as many parameters.
|
|
\end_layout
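\begin_layout Standard
For reference, assuming the nine-feature input and two-node softmax output described in the introduction, the parameter count for a single hidden layer of 
\begin_inset Formula $n_{h}$
\end_inset

 nodes is
\begin_inset Formula 
\begin{equation}
\left(9+1\right)n_{h}+\left(n_{h}+1\right)\times2=12n_{h}+2,
\end{equation}

\end_inset

which gives the 14 parameters quoted above for 
\begin_inset Formula $n_{h}=1$
\end_inset

 and 770 for 
\begin_inset Formula $n_{h}=64$
\end_inset

.
\end_layout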
|
|
|
|
\begin_layout Standard
|
|
From the test/train comparisons, the slight divergence of test error rates
|
|
from training error would suggest that the network is overfitting to the
|
|
training data, reducing its ability to generalise.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Ensemble Classification
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:exp2"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
A horizontal ensemble of
|
|
\begin_inset Formula $m$
|
|
\end_inset
|
|
|
|
models was constructed with majority vote in order to investigate whether
|
|
this could improve performance over that of any single model.
|
|
In order to introduce variation between models of the ensemble, a range
|
|
of hidden nodes, learning rate and/or epochs could be defined.
|
|
When selecting parameters throughout the ensemble, either
|
|
\begin_inset Formula $m$
|
|
\end_inset
|
|
|
|
equally spaced values
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
For
|
|
\begin_inset Formula $m=1$
|
|
\end_inset
|
|
|
|
, the average of the range is taken
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
or random values are selected within the range.
|
|
The prior equally spaced ranges are referred to as
|
|
\emph on
|
|
uniform
|
|
\emph default
|
|
ensembles in this work to distinguish them from random ensembles.
|
|
\end_layout
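\begin_layout Standard
The two selection schemes can be sketched as below; this is an illustrative NumPy version for a single numeric range, not the notebook's exact helper.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

def uniform_values(low, high, m):
\end_layout

\begin_layout Plain Layout

    # m equally spaced values; for m = 1 the mean of the range is used
\end_layout

\begin_layout Plain Layout

    if m == 1:
\end_layout

\begin_layout Plain Layout

        return np.array([(low + high) / 2.0])
\end_layout

\begin_layout Plain Layout

    return np.linspace(low, high, m)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

def random_values(low, high, m, rng=None):
\end_layout

\begin_layout Plain Layout

    # m values drawn uniformly at random from the same range
\end_layout

\begin_layout Plain Layout

    if rng is None:
\end_layout

\begin_layout Plain Layout

        rng = np.random.default_rng()
\end_layout

\begin_layout Plain Layout

    return rng.uniform(low, high, size=m)
\end_layout

\end_inset


\end_layout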
|
|
|
|
\begin_layout Standard
|
|
The statistic
|
|
\emph on
|
|
agreement
|
|
\emph default
|
|
,
|
|
\begin_inset Formula $a$
|
|
\end_inset
|
|
|
|
, is defined as the proportion of models under the meta-classifier that
|
|
correctly predict a sample's class when the ensemble correctly classifies.
|
|
It could also be considered the confidence of the meta-classifier; for
|
|
one horizontal model
|
|
\begin_inset Formula $a_{m=1}\equiv1$
|
|
\end_inset
|
|
|
|
.
|
|
As error rates are presented as opposed to accuracy, this is inverted by
|
|
|
|
\begin_inset Formula $d=1-a$
|
|
\end_inset
|
|
|
|
to
|
|
\emph on
|
|
disagreement
|
|
\emph default
|
|
, the proportion of incorrect models when the group classifies correctly.
|
|
Alongside the disagreement and ensemble test accuracy, the average individual
|
|
accuracy for both test and training data for all models within the meta-classif
|
|
iers is also presented.
|
|
\end_layout
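\begin_layout Standard
A hedged sketch of the majority vote and the disagreement statistic described above is given below, assuming each member's predictions are available as integer class labels in a NumPy array.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

def ensemble_predict(member_preds):
\end_layout

\begin_layout Plain Layout

    # member_preds: (m, n_samples) integer predictions; majority vote per sample
\end_layout

\begin_layout Plain Layout

    return np.apply_along_axis(lambda votes: np.bincount(votes).argmax(), 0, member_preds)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

def disagreement(member_preds, y_true):
\end_layout

\begin_layout Plain Layout

    # d = 1 - a: incorrect members on samples the ensemble classifies correctly
\end_layout

\begin_layout Plain Layout

    group = ensemble_predict(member_preds)
\end_layout

\begin_layout Plain Layout

    correct = group == y_true
\end_layout

\begin_layout Plain Layout

    if not correct.any():
\end_layout

\begin_layout Plain Layout

        return 0.0
\end_layout

\begin_layout Plain Layout

    agreement = (member_preds[:, correct] == y_true[correct]).mean()
\end_layout

\begin_layout Plain Layout

    return 1.0 - agreement
\end_layout

\end_inset


\end_layout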
|
|
|
|
\begin_layout Subsection
|
|
Results
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp2-test8-error-rate-curves.png
|
|
lyxscale 50
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp2-test8-error-rate-std.png
|
|
lyxscale 50
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Uniform ensemble classifier performance results for
|
|
\begin_inset Formula $\eta=0.03$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.01$
|
|
\end_inset
|
|
|
|
, nodes =
|
|
\begin_inset Formula $1-400$
|
|
\end_inset
|
|
|
|
, epochs =
|
|
\begin_inset Formula $5-100$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp2-test8"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
An investigation into a uniform ensemble of variable nodes and epochs can
|
|
be seen in figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp2-test8"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
The ensemble error rate varies between 16% and 18%.
|
|
The individual test and train error rates begin at ~17% and increase to
|
|
20% as the number of models increases.
|
|
As
|
|
\begin_inset Formula $m$
|
|
\end_inset
|
|
|
|
increases, the variance in individual and ensemble error rates decreases;
|
|
the individual rates decrease sharply while the ensemble error rate decreases
|
|
more gradually.
|
|
The disagreement increases from
|
|
\begin_inset Formula $m=1$
|
|
\end_inset
|
|
|
|
to
|
|
\begin_inset Formula $m=9$
|
|
\end_inset
|
|
|
|
and appears to be fairly constant beyond this.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
An experiment with a fixed epoch value throughout the uniform ensemble is
|
|
presented in figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp2-test10"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
Nodes between 1 and 400 were selected for the classifiers with a learning
|
|
rate,
|
|
\begin_inset Formula $\eta=0.15$
|
|
\end_inset
|
|
|
|
and momentum,
|
|
\begin_inset Formula $\beta=0.01$
|
|
\end_inset
|
|
|
|
.
|
|
The ensemble error rate can be seen to be fairly constant at around 5% across
|
|
the number of horizontal models; 3 models was the least accurate with a
|
|
higher standard deviation.
|
|
3 horizontal models also shows a significant spike in disagreement and
|
|
individual error rates which gradually decreases as the number of models
|
|
increases.
|
|
The ensemble accuracy was higher than that of the individual models: significantly
|
|
for 3 models (~10%) and by a single percent after
|
|
\begin_inset Formula $m=25$
|
|
\end_inset
|
|
|
|
.
|
|
The variance in ensemble accuracy decreases as the number of models increases.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp2-test10-error-rate-curves.png
|
|
lyxscale 50
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp2-test10-error-rate-std.png
|
|
lyxscale 50
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Uniform ensemble classifier performance results for
|
|
\begin_inset Formula $\eta=0.15$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.01$
|
|
\end_inset
|
|
|
|
, nodes =
|
|
\begin_inset Formula $1-400$
|
|
\end_inset
|
|
|
|
, epochs = 20
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp2-test10"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Results for a random ensemble can be seen in figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp2-test19"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
The ensemble error rate can be seen to be lower than the individual values
|
|
with roughly a 2% improvement.
|
|
The ensemble error rate is also fairly constant for
|
|
\begin_inset Formula $m>3$
|
|
\end_inset
|
|
|
|
, for
|
|
\begin_inset Formula $m=3$
|
|
\end_inset
|
|
|
|
the ensemble error rate is higher in line with the higher individual error
|
|
rates.
|
|
For
|
|
\begin_inset Formula $m>1$
|
|
\end_inset
|
|
|
|
, the disagreement can be seen to be fairly constant at around 3%.
|
|
Looking to the variance, for all series the variance reduces as
|
|
\begin_inset Formula $m$
|
|
\end_inset
|
|
|
|
increases.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp2-test19-error-rate-curves.png
|
|
lyxscale 50
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp2-test19-error-rate-std.png
|
|
lyxscale 50
|
|
width 50col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Random ensemble classifier performance results for
|
|
\begin_inset Formula $\eta=0.01-0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.0$
|
|
\end_inset
|
|
|
|
, nodes =
|
|
\begin_inset Formula $1-100$
|
|
\end_inset
|
|
|
|
, epochs =
|
|
\begin_inset Formula $10-70$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp2-test19"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Discussion
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In general, the use of an ensemble classifier slightly increased the accuracy
|
|
over the individual models used within.
|
|
When using a random ensemble, the accuracy for
|
|
\begin_inset Formula $m=1$
|
|
\end_inset
|
|
|
|
was worse than for a uniform ensemble.
|
|
This is because for a uniform ensemble, the parameter values were taken
|
|
as the mean of the provided ranges which could be expected to provide reasonabl
|
|
e values for each.
|
|
With a random ensemble, an ineffective combination of parameters could
|
|
be selected and there are no other models in the group to balance this
|
|
out.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Throughout the presented experiments, 3 horizontal models was shown to be
|
|
the worst-performing configuration with lower ensemble accuracy and higher
|
|
disagreement.
|
|
This is likely due to the larger proportion of the group that a single
|
|
model constitutes.
|
|
When correct, three models may only have a disagreement of 1/3 or 0 and
|
|
thus the final value will lie somewhere between these two.
|
|
As the number of horizontal models increases, the number of possible
|
|
disagreement values increases.
|
|
As such, a value for
|
|
\begin_inset Formula $m>5$
|
|
\end_inset
|
|
|
|
allows more granularity in making ensemble decisions.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
A random ensemble would be recommended in order to broadly sample parameter
|
|
combinations for the provided ranges.
|
|
For a uniform ensemble, each model will have parameters that are selected
|
|
from the same relative point in each range, which may lead to consistently ineffectiv
|
|
e combinations depending on the selected range
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
For a range of epochs from 1 to 100 and a range of hidden nodes from 1 to
|
|
100, each model will have the same value for both using a uniform ensemble.
|
|
It can be seen that the particular ranges specified are thus more important
|
|
than if the values were selected randomly.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
.
|
|
When using a random ensemble it is important to use a large enough group
|
|
of models in order to achieve robust group classification.
|
|
A value for
|
|
\begin_inset Formula $m>10$
|
|
\end_inset
|
|
|
|
is recommended to achieve this.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In general, increasing the size of the ensemble did not guarantee an increase
|
|
in performance; however, it did tend to reduce the variance in accuracy.
|
|
This is a desirable property in domains where robustness is valued, such as
|
|
the healthcare setting of this work.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Optimiser Comparisons
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:exp3"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Throughout the previous experiments the stochastic gradient descent optimiser
|
|
was used to update the weights of the networks, but there are many different
|
|
optimisation algorithms.
|
|
This section will present investigations into two other optimisation algorithms
|
|
and discuss the differences between them using the horizontal ensemble
|
|
classification of the previous section.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Prior to these investigations, however, stochastic gradient descent and
|
|
the two other subject algorithms will be described.
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Optimisers
|
|
\end_layout
|
|
|
|
\begin_layout Subsubsection
|
|
Stochastic Gradient Descent
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "subsec:Stochastic-Gradient-Descent"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Gradient descent and the closely related stochastic and mini-batch gradient
|
|
descent are popular optimisation algorithms in the machine learning space.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The aim of the neural networks in question is to make correct classifications
|
|
on sample data being fed forward; ideally, the network's classification would
|
|
be equal to the provided label.
|
|
A loss function,
|
|
\begin_inset Formula $J$
|
|
\end_inset
|
|
|
|
, is defined as the difference between the predicted output and the target
|
|
labelled output
|
|
\begin_inset Foot
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
There are many different options for the loss function including mean squared
|
|
error and categorical cross-entropy.
|
|
Although they have significant differences, this coverage of optimisation
|
|
algorithms does not rely on a specific loss function.
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
, it follows that we are aiming to minimise this as much as possible.
|
|
In order to improve the network, the values of the parameters,
|
|
\begin_inset Formula $\theta$
|
|
\end_inset
|
|
|
|
, must be changed with the intention of reducing the loss value.
|
|
From a set of starting weights,
|
|
\begin_inset Formula $\theta_{0}$
|
|
\end_inset
|
|
|
|
, this can be achieved by finding the gradient of
|
|
\family roman
|
|
\series medium
|
|
\shape up
|
|
\size normal
|
|
\emph off
|
|
\bar no
|
|
\strikeout off
|
|
\xout off
|
|
\uuline off
|
|
\uwave off
|
|
\noun off
|
|
\color none
|
|
|
|
\begin_inset Formula $J$
|
|
\end_inset
|
|
|
|
w.r.t
|
|
\family default
|
|
\series default
|
|
\shape default
|
|
\size default
|
|
\emph default
|
|
\bar default
|
|
\strikeout default
|
|
\xout default
|
|
\uuline default
|
|
\uwave default
|
|
\noun default
|
|
\color inherit
|
|
|
|
\begin_inset Formula $\theta_{0}$
|
|
\end_inset
|
|
|
|
|
|
\family roman
|
|
\series medium
|
|
\shape up
|
|
\size normal
|
|
\emph off
|
|
\bar no
|
|
\strikeout off
|
|
\xout off
|
|
\uuline off
|
|
\uwave off
|
|
\noun off
|
|
\color none
|
|
.
|
|
Formally this would be
|
|
\begin_inset Formula $\nabla_{\theta_{0}}J\left(\theta_{0}\right)$
|
|
\end_inset
|
|
|
|
, the first derivative of the loss function with respect to the current
|
|
weights.
|
|
In order to reduce the loss, the gradient should be subtracted from the
|
|
current weights; a scale factor,
|
|
\begin_inset Formula $\eta$
|
|
\end_inset
|
|
|
|
, termed the learning rate, is defined to apply a tuneable proportion of the
|
|
gradient to the starting values.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In order to iteratively apply this algorithm, the form below is used for
|
|
time steps,
|
|
\begin_inset Formula $t$
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
\theta_{t+1}=\theta_{t}-\eta\cdot\nabla_{\theta_{t}}J\left(\theta_{t}\right)
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The difference between standard or batch gradient descent and the previously
|
|
mentioned variants is how many samples are fed-forward as part of the optimisat
|
|
ion algorithm.
|
|
Standard gradient descent propagates and calculates weight changes for
|
|
the entire training dataset in a single iteration of the algorithm.
|
|
Stochastic gradient descent, instead, processes only one sample during
|
|
an iteration.
|
|
Mini-batch gradient descent strikes a balance between the two: the speed of stochastic gradient
|
|
descent is retained as more weight updates are made, although the path through
|
|
the error surface can be noisier than vanilla gradient descent.
|
|
Therefore, although the algorithm is colloquially referred to as gradient
|
|
descent or SGD, a batch size of 35 was used for this work, meaning that
|
|
mini-batch gradient descent is being used.
|
|
\end_layout
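\begin_layout Standard
In the Keras API used here, this distinction reduces to the batch_size argument of fit; a hedged illustration with the training arrays assumed earlier is shown below.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

# batch_size=35 gives the mini-batch behaviour described above;
\end_layout

\begin_layout Plain Layout

# batch_size=1 would be strictly stochastic, batch_size=len(x_train) full-batch
\end_layout

\begin_layout Plain Layout

model.fit(x_train, y_train, epochs=50, batch_size=35, verbose=0)
\end_layout

\end_inset


\end_layout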
|
|
|
|
\begin_layout Standard
|
|
|
|
\noun on
|
|
Tensorflow's
|
|
\noun default
|
|
implementation of SGD also includes a momentum parameter.
|
|
Momentum aims to help a network increase the speed of convergence and reduce
|
|
oscillations by reinforcing dimensions (weights) that are changing in a
|
|
consistent direction while slowing dimensions that are changing direction
|
|
rapidly
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "paperspace-mom-rmsprop-adam"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
Momentum introduces a memory element to the descent by including a portion,
|
|
|
|
\begin_inset Formula $\beta$
|
|
\end_inset
|
|
|
|
, of the previous step's weight delta or
|
|
\emph on
|
|
velocity
|
|
\emph default
|
|
in subsequent iterations.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The introduction of momentum can be described as below
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "tf.keras.optimizers.SGD"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
,
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
v_{t}=\beta\cdot v_{t-1}-\eta\cdot\nabla_{\theta_{t}}J\left(\theta_{t}\right)\label{eq:sgd-momentum}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
\theta_{t+1}=\theta_{t}+v_{t}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
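\begin_layout Standard
The two momentum equations above translate directly into a few lines of NumPy; the following is a sketch of the arithmetic only, with grad_fn standing in for the backpropagated gradient of the loss.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

def sgd_momentum_step(theta, velocity, grad_fn, eta=0.01, beta=0.9):
\end_layout

\begin_layout Plain Layout

    # v_t = beta * v_{t-1} - eta * grad J(theta_t)
\end_layout

\begin_layout Plain Layout

    velocity = beta * velocity - eta * grad_fn(theta)
\end_layout

\begin_layout Plain Layout

    # theta_{t+1} = theta_t + v_t
\end_layout

\begin_layout Plain Layout

    return theta + velocity, velocity
\end_layout

\end_inset


\end_layout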
|
|
|
|
\begin_layout Standard
|
|
As previously presented (figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp1-momentums"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
), momentum can significantly increase convergence speed.
|
|
\end_layout
|
|
|
|
\begin_layout Subsubsection
|
|
RMSprop
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Although gradient descent is a powerful optimisation algorithm, there are
|
|
drawbacks.
|
|
One limitation is that the learning rate,
|
|
\begin_inset Formula $\eta$
|
|
\end_inset
|
|
|
|
, is a scalar applied to all gradients.
|
|
As a result, parameters with small gradients, as would be found at saddle points, move
|
|
slowly.
|
|
An alternative would be to expand the single scalar to a learning rate
|
|
per parameter that could move dynamically throughout the training process,
|
|
known as adaptive learning rate optimisation.
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
One such algorithm is RMSprop
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "rmsprop-hinton"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
or
|
|
\emph on
|
|
root mean square propagation
|
|
\emph default
|
|
, an unpublished algorithm that builds on previous adaptive algorithms such
|
|
as Rprop and Adagrad.
|
|
These aimed to overcome the shortcomings of SGD by using just the sign
|
|
of the calculated gradients and allowing the learning rate alone to define
|
|
the size of the step.
|
|
Instead of a constant or defined learning rate schedule, each learning
|
|
rate
|
|
\emph on
|
|
floats
|
|
\emph default
|
|
and is scaled up or down based on whether the corresponding gradient is consistently changing in
|
|
the same direction each iteration.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Equations for RMSprop can be seen below
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "understanding-rmsprop"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
For conciseness, the previously defined derivative of the loss function
|
|
w.r.t to the current parameters is shortened,
|
|
\begin_inset Formula $g_{t}=\nabla_{\theta_{t}}J\left(\theta_{t}\right)$
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
E\left[g^{2}\right]_{t}=\alpha\cdot E\left[g^{2}\right]_{t-1}+\left(1-\alpha\right)\cdot g_{t}^{2}\label{eq:rmsprop-expected-value}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
\theta_{t+1}=\theta_{t}-\frac{\eta}{\sqrt{E\left[g^{2}\right]_{t}+\epsilon}}g_{t}\label{eq:rmsprop-update}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
As previously mentioned, only the sign of the gradient is used; this can
|
|
be achieved by dividing
|
|
\begin_inset Formula $g$
|
|
\end_inset
|
|
|
|
by the magnitude
|
|
\begin_inset Formula $|g|$
|
|
\end_inset
|
|
|
|
.
|
|
RMSprop extends this by instead dividing by the root of the exponential average of
|
|
squared gradients, equation
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "eq:rmsprop-expected-value"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
In this equation,
|
|
\begin_inset Formula $\alpha$
|
|
\end_inset
|
|
|
|
constitutes the gradient decay rate; a value of 0.9 is suggested
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "understanding-rmsprop"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
|
|
\begin_inset Formula $\epsilon$
|
|
\end_inset
|
|
|
|
is a small constant on the order of
|
|
\begin_inset Formula $1\times10^{-7}$
|
|
\end_inset
|
|
|
|
that stops the algorithm from dividing by 0.
|
|
\end_layout
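\begin_layout Standard
The two RMSprop equations can likewise be sketched as a single NumPy update step; eta, alpha and eps follow the symbols above, grad_fn again stands in for the backpropagated gradient, and numpy is assumed imported as np as in the previous sketch.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

def rmsprop_step(theta, avg_sq_grad, grad_fn, eta=0.001, alpha=0.9, eps=1e-7):
\end_layout

\begin_layout Plain Layout

    g = grad_fn(theta)
\end_layout

\begin_layout Plain Layout

    # E[g^2]_t = alpha * E[g^2]_{t-1} + (1 - alpha) * g_t^2
\end_layout

\begin_layout Plain Layout

    avg_sq_grad = alpha * avg_sq_grad + (1 - alpha) * g ** 2
\end_layout

\begin_layout Plain Layout

    # theta_{t+1} = theta_t - eta / sqrt(E[g^2]_t + eps) * g_t
\end_layout

\begin_layout Plain Layout

    return theta - eta / np.sqrt(avg_sq_grad + eps) * g, avg_sq_grad
\end_layout

\end_inset


\end_layout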
|
|
|
|
\begin_layout Subsubsection
|
|
Adam
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Adam or
|
|
\emph on
|
|
adaptive moment estimation
|
|
\emph default
|
|
is an optimisation algorithm that combines the adaptive learning rates
|
|
of RMSprop with the previously described momentum
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "adam-paper"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
Like RMSprop, the exponential average of squared gradients is maintained,
|
|
compare equations
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "eq:rmsprop-expected-value"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
and
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "eq:adam-squared-grad-accum"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
In addition to this, however, the exponential average of gradients is maintaine
|
|
d with a similar function to momentum, compare equations
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "eq:sgd-momentum"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
and
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "eq:adam-momentum"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
m_{t}=\beta_{1}\cdot m_{t-1}+\left(1-\beta_{1}\right)g_{t}\label{eq:adam-momentum}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
v_{t}=\beta_{2}\cdot v_{t-1}+\left(1-\beta_{2}\right)g_{t}^{2}\label{eq:adam-squared-grad-accum}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
These two equations constitute the eponymous moments,
|
|
\begin_inset Formula $m_{t}$
|
|
\end_inset
|
|
|
|
is the first moment or mean while
|
|
\begin_inset Formula $v_{t}$
|
|
\end_inset
|
|
|
|
is the second moment or the uncentered variance of the gradients.
|
|
As these moments are initialised at zero, the estimates are biased
|
|
towards 0.
|
|
The original authors correct the bias using the below
|
|
\begin_inset CommandInset citation
|
|
LatexCommand cite
|
|
key "adam-paper"
|
|
literal "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
\hat{m}_{t}=\frac{m_{t}}{1-\beta_{1}^{t}}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
\hat{v}_{t}=\frac{v_{t}}{1-\beta_{2}^{t}}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
This leaves the update step itself, described below.
|
|
Similarities can be seen between the previous RMSprop update step (equation
|
|
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "eq:rmsprop-update"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
) and that of Adam.
|
|
 The RMSprop squared-gradient average 
\begin_inset Formula $E\left[g^{2}\right]_{t}$
\end_inset

 has been replaced by the bias-corrected second moment 
\begin_inset Formula $\hat{v}_{t}$
\end_inset

, while the calculated gradient 
\begin_inset Formula $g_{t}$
\end_inset

 has been replaced by the bias-corrected first moment 
\begin_inset Formula $\hat{m}_{t}$
\end_inset

.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Formula
|
|
\begin{equation}
|
|
\theta_{t+1}=\theta_{t}-\frac{\eta}{\sqrt{\hat{v}_{t}+\epsilon}}\hat{m}_{t}\label{eq:adam-update}
|
|
\end{equation}
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
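\begin_layout Standard
As a minimal sketch, a single Adam step combining the moment estimates, bias correction and update above can be written as below.
 The function name and default values are illustrative only and are not taken from the accompanying source code.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

# Illustrative sketch of a single Adam step (not from the notebook)
\end_layout

\begin_layout Plain Layout

def adam_step(theta, g, m, v, t, eta=0.001, beta1=0.9, beta2=0.999, eps=1e-7):
\end_layout

\begin_layout Plain Layout

    # t is the 1-based step count used for bias correction
\end_layout

\begin_layout Plain Layout

    m = beta1 * m + (1 - beta1) * g          # first moment (mean)
\end_layout

\begin_layout Plain Layout

    v = beta2 * v + (1 - beta2) * g ** 2     # second moment (uncentred variance)
\end_layout

\begin_layout Plain Layout

    m_hat = m / (1 - beta1 ** t)             # bias-corrected moments
\end_layout

\begin_layout Plain Layout

    v_hat = v / (1 - beta2 ** t)
\end_layout

\begin_layout Plain Layout

    theta = theta - eta / np.sqrt(v_hat + eps) * m_hat
\end_layout

\begin_layout Plain Layout

    return theta, m, v
\end_layout

\end_inset


\end_layout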
|
|
|
|
\begin_layout Subsection
|
|
Results
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp3-test1-error-rate-curves.png
|
|
lyxscale 30
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Uniform ensemble classifier performance results for SGD, RMSprop and Adam
|
|
optimisation with
|
|
\begin_inset Formula $\eta=0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.0$
|
|
\end_inset
|
|
|
|
, nodes = 16, epochs =
|
|
\begin_inset Formula $1-100$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp3-test1"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
A uniform ensemble classifier of fixed
|
|
\begin_inset Formula $\eta$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta$
|
|
\end_inset
|
|
|
|
and
|
|
\begin_inset Formula $n_{h}$
|
|
\end_inset
|
|
|
|
 was evaluated; the results can be seen in figure 
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp3-test1"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
Across the values of
|
|
\begin_inset Formula $m$
|
|
\end_inset
|
|
|
|
 and the three evaluated optimisers, the ensemble error rates did not change significantly, differing by approximately 1% throughout.
|
|
 The individual error rates and disagreement were more varied; Adam and RMSprop reported approximately 10% lower errors for 
|
|
\begin_inset Formula $m>5$
|
|
\end_inset
|
|
|
|
.
|
|
The previously described spike in individual error rates and disagreement
|
|
for
|
|
\begin_inset Formula $m=3$
|
|
\end_inset
|
|
|
|
can be seen.
|
|
Figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp3-test1-std"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
presents the variance in these results.
|
|
The Adam optimiser showed significantly higher variance for individual
|
|
test error and disagreement while the SGD algorithm showed the lowest variance
|
|
throughout.
|
|
RMSprop demonstrated a higher initial variance for disagreement than SGD
|
|
and then comparable results for
|
|
\begin_inset Formula $m>3$
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp3-test1-errors-rate-std.png
|
|
lyxscale 30
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Uniform ensemble classifier test variance for SGD, RMSprop and Adam optimisation
|
|
with
|
|
\begin_inset Formula $\eta=0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.0$
|
|
\end_inset
|
|
|
|
, nodes = 16, epochs =
|
|
\begin_inset Formula $1-100$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp3-test1-std"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Note Comment
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp3-test7-error-rate-curves.png
|
|
lyxscale 30
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Uniform ensemble classifier performance results for SGD, RMSprop and Adam
|
|
optimisation with
|
|
\begin_inset Formula $\eta=0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.9$
|
|
\end_inset
|
|
|
|
, nodes =
|
|
\begin_inset Formula $1-400$
|
|
\end_inset
|
|
|
|
, epochs =
|
|
\begin_inset Formula $50-100$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp3-test7"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp3-test7-errors-rate-std.png
|
|
lyxscale 30
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Uniform ensemble classifier test variance for SGD, RMSprop and Adam optimisation
|
|
with
|
|
\begin_inset Formula $\eta=0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.9$
|
|
\end_inset
|
|
|
|
, nodes =
|
|
\begin_inset Formula $1-400$
|
|
\end_inset
|
|
|
|
, epochs =
|
|
\begin_inset Formula $50-100$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp3-test7-std"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The effect of including random parameter selection from a range of learning
|
|
 rates, hidden nodes and epochs was evaluated; the results can be seen in 
|
|
figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp3-test9"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
.
|
|
Similar to the previously presented random ensemble, the ensemble error
|
|
rate is higher for
|
|
\begin_inset Formula $m=1$
|
|
\end_inset
|
|
|
|
.
|
|
Past this, the ensemble accuracy is fairly constant throughout the range
|
|
for
|
|
\begin_inset Formula $m$
|
|
\end_inset
|
|
|
|
 over the three optimisers at approximately 4%.
|
|
Figure
|
|
\begin_inset CommandInset ref
|
|
LatexCommand ref
|
|
reference "fig:exp3-test9-std"
|
|
plural "false"
|
|
caps "false"
|
|
noprefix "false"
|
|
|
|
\end_inset
|
|
|
|
 presents the variance in these results; RMSprop demonstrated the lowest standard deviation throughout the range of 
|
|
\begin_inset Formula $m$
|
|
\end_inset
|
|
|
|
for all series.
|
|
SGD and Adam showed a higher variance for
|
|
\begin_inset Formula $m\leq5$
|
|
\end_inset
|
|
|
|
 before falling to values similar to those of RMSprop.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp3-test9-error-rate-curves.png
|
|
lyxscale 30
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Random ensemble classifier performance results for SGD, RMSprop and Adam
|
|
optimisation with
|
|
\begin_inset Formula $\eta=0.01-0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.0$
|
|
\end_inset
|
|
|
|
, nodes =
|
|
\begin_inset Formula $1-100$
|
|
\end_inset
|
|
|
|
, epochs =
|
|
\begin_inset Formula $1-70$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp3-test9"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/exp3-test9-errors-rate-std.png
|
|
lyxscale 30
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Random ensemble classifier test variance for SGD, RMSprop and Adam optimisation
|
|
with
|
|
\begin_inset Formula $\eta=0.01-0.1$
|
|
\end_inset
|
|
|
|
,
|
|
\begin_inset Formula $\beta=0.0$
|
|
\end_inset
|
|
|
|
, nodes =
|
|
\begin_inset Formula $1-100$
|
|
\end_inset
|
|
|
|
, epochs =
|
|
\begin_inset Formula $1-70$
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:exp3-test9-std"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Discussion
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The similarity in group classification despite variations in individual
|
|
 error rates across the employed algorithms suggests that the use of a meta-classifier was able to largely overcome the differences in training regimens.
 As in the previous discussion, a larger ensemble reduced the variation in the results, providing a more robust classification.
|
|
\end_layout
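\begin_layout Standard
For reference, the sketch below illustrates one way such a meta-classifier can combine its members, assuming a simple majority vote over member predictions and taking disagreement as the fraction of member predictions that differ from the group decision.
 It is a hypothetical illustration rather than the code used to generate the presented results.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

# Hypothetical majority-vote meta-classifier (not from the notebook)
\end_layout

\begin_layout Plain Layout

def ensemble_vote(votes, n_classes=2):
\end_layout

\begin_layout Plain Layout

    # votes: integer class labels of shape (n_members, n_samples)
\end_layout

\begin_layout Plain Layout

    counts = np.stack([(votes == c).sum(axis=0) for c in range(n_classes)])
\end_layout

\begin_layout Plain Layout

    decision = counts.argmax(axis=0)           # class with the most votes
\end_layout

\begin_layout Plain Layout

    disagreement = (votes != decision).mean()  # members differing from the group
\end_layout

\begin_layout Plain Layout

    return decision, disagreement
\end_layout

\end_inset


\end_layout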
|
|
|
|
\begin_layout Standard
|
|
In suggesting an optimal algorithm, it is worth considering the intended domains for RMSprop and Adam.
 As newer algorithms, there tends to be a focus on deep convolutional networks, which implies a somewhat different set of requirements such as being able to scale to large numbers of parameters.
 This is not to say that the algorithms are inappropriate for the presented applications; as demonstrated, these more complex algorithms were able to outperform the employed gradient descent with optional momentum.
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
As the ensemble accuracies were comparable across ensemble sizes and algorithms, a recommendation for an algorithm can instead be made primarily on the robustness of the meta-classifier, an important factor in the subject field of healthcare.
 RMSprop showed the lowest disagreement in the group and the tightest variation in results; for this reason it is recommended based on the presented data.
 Although a random ensemble has previously been recommended with the caveat of using a larger ensemble, RMSprop showed lower variation throughout the smaller group sizes, where Adam showed variance similar to SGD.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Conclusions
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Within this work, shallow multi-layer perceptrons were shown to be able
|
|
to achieve an error rate of 4% when classifying breast tumours as either
|
|
benign or malignant.
|
|
Investigations were made into how the number of hidden nodes and the length
|
|
of the training period affected the performance.
|
|
 In general, increasing the number of nodes increased the speed of convergence, while training the network for longer also improved performance.
|
|
A learning rate of 0.5 allowed all of the evaluated network architectures
|
|
to train within 40 epochs.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The effect of creating an ensemble meta-classifier on performance was evaluated
|
|
to investigate whether combining models could produce a lower error rate.
|
|
 While the ensemble accuracy was not significantly affected, the variance of the results was reduced, indicating increased robustness when classifying in conjunction.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
Two additional optimisation algorithms were evaluated, RMSprop and Adam.
|
|
 Although RMSprop and Adam reported lower individual test and train error rates, the ensemble accuracy was not significantly affected.
|
|
As such, a recommendation for RMSprop was made as it reported tighter variance.
|
|
For the healthcare domain, a more robust network was prioritised.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage newpage
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:bibliography"
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset bibtex
|
|
LatexCommand bibtex
|
|
btprint "btPrintCited"
|
|
bibfiles "references"
|
|
options "bibtotoc"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
\start_of_appendix
|
|
Network Parameter Counts
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "app:Network-Parameter-Counts"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float table
|
|
placement H
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Tabular
|
|
<lyxtabular version="3" rows="9" columns="2">
|
|
<features tabularvalignment="middle">
|
|
<column alignment="center" valignment="top">
|
|
<column alignment="center" valignment="top">
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
Hidden Nodes
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
Trainable Parameters
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
1
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
14
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
2
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
26
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
4
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
50
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
8
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
98
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
16
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
194
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
32
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
386
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
64
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
770
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
<row>
|
|
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
128
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
|
|
\begin_inset Text
|
|
|
|
\begin_layout Plain Layout
|
|
1,538
|
|
\end_layout
|
|
|
|
\end_inset
|
|
</cell>
|
|
</row>
|
|
</lyxtabular>
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Number of trainable parameters for architectures of varying numbers of hidden
|
|
nodes
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "tab:trainable-params"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
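\begin_layout Standard
The counts in table 
\begin_inset CommandInset ref
LatexCommand ref
reference "tab:trainable-params"
plural "false"
caps "false"
noprefix "false"

\end_inset

 follow from a single hidden layer with bias terms; assuming nine input features and two output classes, which the tabulated values imply, they can be reproduced as in the sketch below.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout

# Assumes 9 input features and 2 output classes, as implied by the table
\end_layout

\begin_layout Plain Layout

def trainable_params(n_hidden, n_inputs=9, n_outputs=2):
\end_layout

\begin_layout Plain Layout

    # weights and biases of the input-to-hidden and hidden-to-output layers
\end_layout

\begin_layout Plain Layout

    return (n_inputs + 1) * n_hidden + (n_hidden + 1) * n_outputs
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

# e.g. trainable_params(16) == 194
\end_layout

\end_inset


\end_layout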
|
|
|
|
\begin_layout Section
|
|
Network Graph
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:Network-Graph"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Float figure
|
|
placement H
|
|
wide false
|
|
sideways false
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
\noindent
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename ../graphs/tensorboard-graph.png
|
|
lyxscale 50
|
|
width 100col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
\begin_inset Caption Standard
|
|
|
|
\begin_layout Plain Layout
|
|
Single hidden layer neural network as graphed by
|
|
\noun on
|
|
Tensorboard
|
|
\noun default
|
|
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "fig:tensorboard"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Source Code
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:Source-Code"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../nncw.py"
|
|
lstparams "caption={Formatted Jupyter notebook containing experiment code},label={notebook-code}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_body
|
|
\end_document
|