#LyX 2.3 created this file. For more info see http://www.lyx.org/ \lyxformat 544 \begin_document \begin_header \save_transient_properties true \origin unavailable \textclass article \begin_preamble \def\changemargin#1#2{\list{}{\rightmargin#2\leftmargin#1}\item[]} \let\endchangemargin=\endlist \pagenumbering{gobble} \usepackage{pxfonts} \usepackage{color} \definecolor{commentgreen}{RGB}{0,94,11} \definecolor{darkblue}{rgb}{0,0,0.75} \definecolor{darkred}{rgb}{0.6,0,0} \end_preamble \use_default_options true \begin_modules customHeadersFooters minimalistic todonotes \end_modules \maintain_unincluded_children false \language british \language_package default \inputencoding utf8 \fontencoding global \font_roman "default" "default" \font_sans "default" "default" \font_typewriter "default" "default" \font_math "auto" "auto" \font_default_family default \use_non_tex_fonts false \font_sc false \font_osf false \font_sf_scale 100 100 \font_tt_scale 100 100 \use_microtype true \use_dash_ligatures true \graphics default \default_output_format default \output_sync 0 \bibtex_command biber \index_command default \paperfontsize default \spacing onehalf \use_hyperref true \pdf_title "Training Neural Networks With Backpropagation" \pdf_author "Andy Pack" \pdf_subject "EEEM005" \pdf_keywords "EEEM005" \pdf_bookmarks true \pdf_bookmarksnumbered false \pdf_bookmarksopen false \pdf_bookmarksopenlevel 1 \pdf_breaklinks false \pdf_pdfborder true \pdf_colorlinks false \pdf_backref false \pdf_pdfusetitle true \papersize default \use_geometry true \use_package amsmath 1 \use_package amssymb 1 \use_package cancel 1 \use_package esint 1 \use_package mathdots 1 \use_package mathtools 1 \use_package mhchem 1 \use_package stackrel 1 \use_package stmaryrd 1 \use_package undertilde 1 \cite_engine biblatex \cite_engine_type authoryear \biblio_style plain \biblio_options urldate=long \biblatex_bibstyle ieee \biblatex_citestyle ieee \use_bibtopic false \use_indices false \paperorientation portrait \suppress_date true \justification true \use_refstyle 1 \use_minted 0 \index Index \shortcut idx \color #008000 \end_index \leftmargin 1.8cm \topmargin 2cm \rightmargin 1.8cm \bottommargin 2cm \secnumdepth 3 \tocdepth 3 \paragraph_separation skip \defskip medskip \is_math_indent 0 \math_numbering_side default \quotes_style british \dynamic_quotes 0 \papercolumns 1 \papersides 1 \paperpagestyle fancy \listings_params "language=Python,breaklines=true,frame=tb,otherkeywords={self},emph={State},emphstyle={\ttb\color{darkred}},basicstyle={\ttfamily},commentstyle={\bfseries\color{commentgreen}\itshape},keywordstyle={\color{darkblue}},emphstyle={\color{red}},stringstyle={\color{red}}" \bullet 1 0 9 -1 \bullet 2 0 24 -1 \tracking_changes false \output_changes false \html_math_output 0 \html_css_as_file 0 \html_be_strict false \end_header \begin_body \begin_layout Title \size giant Training Neural Networks with Backpropagation \end_layout \begin_layout Author Andy Pack \end_layout \begin_layout Standard \begin_inset VSpace 15pheight% \end_inset \end_layout \begin_layout Standard \align center \begin_inset Graphics filename surrey.png lyxscale 15 width 40col% \end_inset \end_layout \begin_layout Standard \begin_inset VSpace vfill \end_inset \end_layout \begin_layout Standard \noindent \align center EEEM005 \begin_inset Newline newline \end_inset May 2021 \size large \begin_inset Newline newline \end_inset Department of Electrical and Electronic Engineering \begin_inset Newline newline \end_inset Faculty of Engineering and Physical Sciences 
\begin_inset Newline newline \end_inset University of Surrey \end_layout \begin_layout Standard \begin_inset Newpage newpage \end_inset \end_layout \begin_layout Section* Executive Summary \end_layout \begin_layout Standard Summary here \end_layout \begin_layout Standard \begin_inset Newpage newpage \end_inset \end_layout \begin_layout Standard \begin_inset ERT status open \begin_layout Plain Layout \backslash pagenumbering{roman} \end_layout \end_inset \end_layout \begin_layout Abstract abstract \end_layout \begin_layout Standard \begin_inset CommandInset toc LatexCommand tableofcontents \end_inset \end_layout \begin_layout Standard \begin_inset Newpage pagebreak \end_inset \end_layout \begin_layout Standard \begin_inset FloatList figure \end_inset \end_layout \begin_layout Standard \begin_inset FloatList table \end_inset \end_layout \begin_layout Standard \begin_inset Newpage pagebreak \end_inset \end_layout \begin_layout Right Footer Andy Pack / 6420013 \end_layout \begin_layout Left Footer May 2021 \end_layout \begin_layout Standard \begin_inset ERT status open \begin_layout Plain Layout \backslash pagenumbering{arabic} \end_layout \begin_layout Plain Layout \backslash setcounter{page}{1} \end_layout \end_inset \end_layout \begin_layout Section Introduction \end_layout \begin_layout Standard Artificial neural networks have been the object of research and investigation since the 1940s with \noun on McCulloch \noun default and \noun on Pitts \noun default ' model of the artificial neuron \begin_inset CommandInset citation LatexCommand cite key "McCulloch1943" literal "false" \end_inset or \emph on Threshold Logic Unit \emph default . Over the remainder of the century, the development of the single and multi-layer perceptrons (SLP/MLP) alongside the backpropagation algorithm \begin_inset CommandInset citation LatexCommand cite key "Rumelhart1986" literal "false" \end_inset advanced the study of artificial intelligence. Throughout the 2010s, convolutional neural networks have proved critical in the field of computer vision and image recognition \begin_inset CommandInset citation LatexCommand cite key "alexnet" literal "false" \end_inset . \end_layout \begin_layout Standard This work investigates the ability of a shallow multi-layer perceptron to classify breast tumours as either benign or malignant. The architecture and parameters were varied before exploring how the combination of classifiers can affect performance. \end_layout \begin_layout Standard Investigations were carried out in \noun on Python \noun default using the \noun on TensorFlow \noun default package to construct, train and evaluate neural networks. A \noun on Jupyter \noun default notebook containing the experiments and the evaluated parameters can be seen formatted as a single script in appendix \begin_inset CommandInset ref LatexCommand ref reference "sec:Source-Code" plural "false" caps "false" noprefix "false" \end_inset . The networks were trained using a supervised learning curriculum of labelled data taken from a standard \noun on MatLab \noun default dataset \begin_inset CommandInset citation LatexCommand cite key "matlab-dataset" literal "false" \end_inset from the \noun on Deep Learning Toolbox \noun default . For this binary-classification problem there are two possible output formats for the network: a single output node (with a threshold of 0.5 to differentiate the classes) or two output nodes forming a one-hot vector. As the labels were formatted as one-hot vectors, two output nodes with a softmax activation function were used.
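\end_layout

\begin_layout Standard
As an illustration of the data handling, the sketch below loads such an exported dataset and holds out a test split.
 The file name, variable names and 80/20 split are assumptions for illustration only rather than the exact procedure of the notebook in appendix 
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Source-Code"
plural "false"
caps "false"
noprefix "false"

\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout
# Minimal sketch: load the cancer dataset after exporting it from MATLAB.
\end_layout

\begin_layout Plain Layout
# The file and variable names are hypothetical placeholders.
\end_layout

\begin_layout Plain Layout
import numpy as np
\end_layout

\begin_layout Plain Layout
from scipy.io import loadmat
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
data = loadmat("cancer_dataset.mat")
\end_layout

\begin_layout Plain Layout
X = data["cancerInputs"].T   # (samples, 9 features)
\end_layout

\begin_layout Plain Layout
y = data["cancerTargets"].T  # (samples, 2) one-hot labels
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
# Shuffle and hold out a test set (80/20 split assumed for illustration)
\end_layout

\begin_layout Plain Layout
rng = np.random.default_rng(seed=0)
\end_layout

\begin_layout Plain Layout
idx = rng.permutation(len(X))
\end_layout

\begin_layout Plain Layout
split = int(0.8 * len(X))
\end_layout

\begin_layout Plain Layout
X_train, X_test = X[idx[:split]], X[idx[split:]]
\end_layout

\begin_layout Plain Layout
y_train, y_test = y[idx[:split]], y[idx[split:]]
\end_layout

\end_inset

\end_layout

\begin_layout Standard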
The number of parameters associated with the employed architectures of varying hidden nodes can be seen in appendix \begin_inset CommandInset ref LatexCommand ref reference "app:Network-Parameter-Counts" plural "false" caps "false" noprefix "false" \end_inset while a graph of the constructed network can be seen in appendix \begin_inset CommandInset ref LatexCommand ref reference "sec:Network-Graph" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard Section \begin_inset CommandInset ref LatexCommand ref reference "sec:exp1" plural "false" caps "false" noprefix "false" \end_inset investigates the effect of varying the number of hidden nodes on test accuracy along with the number of epochs that the MLPs are trained for. Section \begin_inset CommandInset ref LatexCommand ref reference "sec:exp2" plural "false" caps "false" noprefix "false" \end_inset builds on the previous experiment by using reasonable parameter values to investigate performance when using an ensemble of models to classify in conjunction. The effect of varying the number of nodes and epochs throughout the ensemble was considered in order to determine whether combining multiple models could produce better accuracy than any individual model. Section \begin_inset CommandInset ref LatexCommand ref reference "sec:exp3" plural "false" caps "false" noprefix "false" \end_inset investigates the effect of altering how the networks learn by changing the optimisation algorithm. Two algorithms in addition to the previously used optimiser are considered and compared using the same test apparatus of section \begin_inset CommandInset ref LatexCommand ref reference "sec:exp2" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Section Hidden Nodes & Epochs \begin_inset CommandInset label LatexCommand label name "sec:exp1" \end_inset \end_layout \begin_layout Standard This section investigates the effect of varying the number of hidden nodes, \begin_inset Formula $n_{h}$ \end_inset , in the single hidden layer of a shallow multi-layer perceptron. This is compared to the effect of training the model with different numbers of epochs. Throughout the experiment, stochastic gradient descent with momentum is used as the optimiser; variations in both momentum and learning rate are presented. The learning rate and momentum coefficient used during training are denoted \begin_inset Formula $\eta$ \end_inset and \begin_inset Formula $\beta$ \end_inset respectively.
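\end_layout

\begin_layout Standard
A minimal sketch of the network and optimiser configuration used in this section is given below, using TensorFlow's Keras API.
 The input dimension of 9 is implied by the parameter counts in the appendix; the sigmoid hidden activation and categorical cross-entropy loss are assumptions for illustration, as the results do not rely on a particular choice.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout
# Sketch of the shallow MLP with a single hidden layer of n_hidden nodes,
\end_layout

\begin_layout Plain Layout
# trained with SGD plus momentum. Hidden activation and loss are assumed.
\end_layout

\begin_layout Plain Layout
import tensorflow as tf
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
def build_model(n_hidden, learning_rate, momentum):
\end_layout

\begin_layout Plain Layout
    model = tf.keras.Sequential([
\end_layout

\begin_layout Plain Layout
        tf.keras.layers.Dense(n_hidden, activation="sigmoid", input_shape=(9,)),
\end_layout

\begin_layout Plain Layout
        tf.keras.layers.Dense(2, activation="softmax"),  # one-hot output
\end_layout

\begin_layout Plain Layout
    ])
\end_layout

\begin_layout Plain Layout
    model.compile(
\end_layout

\begin_layout Plain Layout
        optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum),
\end_layout

\begin_layout Plain Layout
        loss="categorical_crossentropy",
\end_layout

\begin_layout Plain Layout
        metrics=["accuracy"],
\end_layout

\begin_layout Plain Layout
    )
\end_layout

\begin_layout Plain Layout
    return model
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
# X_train, y_train as prepared in the loading sketch above
\end_layout

\begin_layout Plain Layout
model = build_model(n_hidden=16, learning_rate=0.05, momentum=0.0)
\end_layout

\begin_layout Plain Layout
model.fit(X_train, y_train, epochs=50, batch_size=35, verbose=0)
\end_layout

\end_inset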
\end_layout \begin_layout Subsection Results \end_layout \begin_layout Standard \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-14-error-rate-curves.png lyxscale 50 width 33col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.05$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-14" \end_inset \end_layout \end_inset \end_layout \end_inset \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-12-error-rate-curves.png lyxscale 50 width 33col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.1$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-12" \end_inset \end_layout \end_inset \end_layout \end_inset \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-13-error-rate-curves.png lyxscale 50 width 33col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.5$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-13" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Varied hidden node performance results over varied training lengths for \begin_inset Formula $\eta=0.05,0.1,0.5$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-12,14" \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \end_layout \end_inset \end_layout \begin_layout Standard Figure \begin_inset CommandInset ref LatexCommand ref reference "fig:exp1-test2-12,14" plural "false" caps "false" noprefix "false" \end_inset visualises the test performance of hidden nodes up to \begin_inset Formula $n_{h}=128$ \end_inset over training periods up to 100 epochs in length. In general, the error rate can be seen to decrease when the models are trained for longer. Increasing \begin_inset Formula $n_{h}$ \end_inset decreases the error rate and increases the gradient with which it falls to a minimum limit. As the learning rate increases, the speed with which the network converges increases. For \begin_inset Formula $\eta=0.05$ \end_inset , networks with large \begin_inset Formula $n_{h}$ \end_inset begin converging after 30 epochs. This is after only 15 epochs for \begin_inset Formula $\eta=0.1$ \end_inset and almost immediately for \begin_inset Formula $\eta=0.5$ \end_inset . 
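\end_layout

\begin_layout Standard
Curves of this kind can be produced by sweeping the number of hidden nodes and the training length while recording the test error rate over repeated runs; a sketch is given below, reusing the build_model helper and the train/test split from the earlier sketches.
 The epoch grid and the ten repeats per configuration are assumptions for illustration rather than the exact experimental settings.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout
# Sweep hidden nodes and training length, recording the test error rate.
\end_layout

\begin_layout Plain Layout
# Assumes build_model, X_train/y_train and X_test/y_test from earlier sketches;
\end_layout

\begin_layout Plain Layout
# the epoch grid and 10 repeats per configuration are illustrative only.
\end_layout

\begin_layout Plain Layout
import numpy as np
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
hidden_nodes = [1, 2, 4, 8, 16, 32, 64, 128]
\end_layout

\begin_layout Plain Layout
epoch_counts = [5, 10, 20, 30, 50, 75, 100]
\end_layout

\begin_layout Plain Layout
repeats = 10
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
mean_err = np.zeros((len(hidden_nodes), len(epoch_counts)))
\end_layout

\begin_layout Plain Layout
std_err = np.zeros_like(mean_err)
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
for i, n_h in enumerate(hidden_nodes):
\end_layout

\begin_layout Plain Layout
    for j, epochs in enumerate(epoch_counts):
\end_layout

\begin_layout Plain Layout
        errors = []
\end_layout

\begin_layout Plain Layout
        for _ in range(repeats):
\end_layout

\begin_layout Plain Layout
            model = build_model(n_h, learning_rate=0.05, momentum=0.0)
\end_layout

\begin_layout Plain Layout
            model.fit(X_train, y_train, epochs=epochs, batch_size=35, verbose=0)
\end_layout

\begin_layout Plain Layout
            _, acc = model.evaluate(X_test, y_test, verbose=0)
\end_layout

\begin_layout Plain Layout
            errors.append(1.0 - acc)      # error rate = 1 - accuracy
\end_layout

\begin_layout Plain Layout
        mean_err[i, j] = np.mean(errors)  # mean error-rate curves
\end_layout

\begin_layout Plain Layout
        std_err[i, j] = np.std(errors)    # standard deviation curves
\end_layout

\end_inset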
\end_layout \begin_layout Standard \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-14-error-rate-std.png lyxscale 50 width 33col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.05$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-14-std" \end_inset \end_layout \end_inset \end_layout \end_inset \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-12-error-rate-std.png lyxscale 50 width 33col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.1$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-12-std" \end_inset \end_layout \end_inset \end_layout \end_inset \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-13-error-rate-std.png lyxscale 50 width 33col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.5$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-13-std" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Varied hidden node performance standard deviation results over varied training lengths for \begin_inset Formula $\eta=0.05,0.1,0.5$ \end_inset , \begin_inset Formula $\beta=0$ \end_inset , note the larger \begin_inset Formula $y$ \end_inset scale for \begin_inset Formula $\eta=0.5$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-12,14-std" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Standard The standard deviations for the results of figure \begin_inset CommandInset ref LatexCommand ref reference "fig:exp1-test2-12,14" plural "false" caps "false" noprefix "false" \end_inset discussed above can be seen in figure \begin_inset CommandInset ref LatexCommand ref reference "fig:exp1-test2-12,14-std" plural "false" caps "false" noprefix "false" \end_inset . In general, prior to the networks beginning to converge, the standard deviation is close to 0. As previously described, this takes place at lower epochs for higher learning rates. Once the networks start converging, the standard deviation of the test error rate increases. Increasing the learning rate also increases the variance in test error rates; the maximum value for \begin_inset Formula $\eta=0.5$ \end_inset is double that of the lower \begin_inset Formula $\eta$ \end_inset experiments within the first 20 epochs.
\end_layout \begin_layout Standard \begin_inset Flex TODO Note (inline) status open \begin_layout Plain Layout more std stuff and test/train splits \end_layout \end_inset \end_layout \begin_layout Standard The effect of varying momentum can be seen in figure \begin_inset CommandInset ref LatexCommand ref reference "fig:exp1-momentums" plural "false" caps "false" noprefix "false" \end_inset ; a fixed learning rate of \begin_inset Formula $\eta=0.01$ \end_inset was maintained throughout. The meaning of momentum and its effect on training is discussed in section \begin_inset CommandInset ref LatexCommand ref reference "subsec:Stochastic-Gradient-Descent" plural "false" caps "false" noprefix "false" \end_inset . Without momentum ( \begin_inset CommandInset ref LatexCommand ref reference "fig:exp1-test2-11" plural "false" caps "false" noprefix "false" \end_inset ), it can be seen that the network does not begin to converge within 100 epochs. This is also the case for \begin_inset Formula $\beta=0.3$ \end_inset ; it is only by \begin_inset Formula $\beta=0.5$ \end_inset that some of the evaluated architectures begin to converge. The test error rates for the 32 and 64-node series begin to decrease after 64 epochs with the \begin_inset Formula $n_{h}=64$ \end_inset network descending faster. With \begin_inset Formula $\beta=0.7$ \end_inset , the 32 and 64-node networks begin to converge earlier, after 32 epochs, while the remaining architectures down to 2 nodes begin to converge after 64 epochs. Finally, with \begin_inset Formula $\beta=0.9$ \end_inset , all of the evaluated architectures have converged by 64 epochs. \end_layout \begin_layout Standard \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-7-error-rate-curves.png lyxscale 50 width 45col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.01$ \end_inset , \begin_inset Formula $\beta=0.9$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-7" \end_inset \end_layout \end_inset \end_layout \end_inset \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-10-error-rate-curves.png lyxscale 50 width 45col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.01$ \end_inset , \begin_inset Formula $\beta=0.7$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-10" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \noindent \align center \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-8-error-rate-curves.png lyxscale 50 width 45col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.01$ \end_inset , \begin_inset Formula $\beta=0.5$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-8-1" \end_inset \end_layout \end_inset \end_layout \end_inset \begin_inset Float figure wide false sideways false status open
\begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-9-error-rate-curves.png lyxscale 50 width 45col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.01$ \end_inset , \begin_inset Formula $\beta=0.3$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-9" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \noindent \align center \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp1-test2-11-error-rate-curves.png lyxscale 50 width 45col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout \begin_inset Formula $\eta=0.01$ \end_inset , \begin_inset Formula $\beta=0.0$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp1-test2-11" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Varied hidden node performance results over varied training length with different momentum coefficients \begin_inset CommandInset label LatexCommand label name "fig:exp1-momentums" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Subsection Discussion \end_layout \begin_layout Standard From the presented results, it can be seen that, generally, increasing either learning rate or momentum increases the speed of convergence. \end_layout \begin_layout Standard Increasing the number of hidden nodes also increases the speed of convergence. However, it is worth noting that a large number of nodes is not required to achieve high accuracy. A single hidden node, for a total of 14 parameters, was able with enough training to achieve similar results to a 64-node network with 770 parameters, around 55 times as many. \end_layout \begin_layout Section Ensemble Classification \begin_inset CommandInset label LatexCommand label name "sec:exp2" \end_inset \end_layout \begin_layout Standard A horizontal ensemble of \begin_inset Formula $m$ \end_inset models was constructed with majority vote in order to investigate whether this could improve performance over that of any single model. In order to introduce variation between models of the ensemble, a range of hidden nodes and/or epochs could be defined. When selecting parameters throughout the ensemble, \begin_inset Formula $m$ \end_inset equally spaced values within the range are selected \begin_inset Foot status open \begin_layout Plain Layout For \begin_inset Formula $m=1$ \end_inset , the average of the range is taken \end_layout \end_inset . \end_layout \begin_layout Standard The statistic \emph on agreement \emph default , \begin_inset Formula $a$ \end_inset , is defined as the proportion of models under the meta-classifier that correctly predict a sample's class when the ensemble correctly classifies. It could also be considered the confidence of the meta-classifier; for a single horizontal model, \begin_inset Formula $a_{m=1}\equiv1$ \end_inset . As error rates are presented as opposed to accuracy, this is inverted by \begin_inset Formula $d=1-a$ \end_inset to \emph on disagreement \emph default , the proportion of incorrect models when the group classifies correctly.
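\end_layout

\begin_layout Standard
A short sketch of the majority-vote combination and the disagreement statistic is given below; it operates on the per-model predictions of a list of trained models and illustrates the definitions above rather than the exact notebook implementation.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout
# Sketch of majority-vote ensemble classification and the disagreement
\end_layout

\begin_layout Plain Layout
# statistic d = 1 - a. Assumes `models` is a list of trained Keras models
\end_layout

\begin_layout Plain Layout
# and X_test/y_test come from the earlier split. Illustrative only.
\end_layout

\begin_layout Plain Layout
import numpy as np
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
def ensemble_predict(models, X):
\end_layout

\begin_layout Plain Layout
    # class predicted by each model: shape (m, samples)
\end_layout

\begin_layout Plain Layout
    votes = np.stack([np.argmax(m.predict(X, verbose=0), axis=1) for m in models])
\end_layout

\begin_layout Plain Layout
    # majority vote across the m models (ties resolved towards class 0 here)
\end_layout

\begin_layout Plain Layout
    ensemble = (votes.mean(axis=0) > 0.5).astype(int)
\end_layout

\begin_layout Plain Layout
    return votes, ensemble
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
def disagreement(votes, ensemble, y_true):
\end_layout

\begin_layout Plain Layout
    correct = ensemble == y_true  # samples the ensemble classified correctly
\end_layout

\begin_layout Plain Layout
    agree = (votes[:, correct] == ensemble[correct]).mean()  # agreement a
\end_layout

\begin_layout Plain Layout
    return 1.0 - agree            # disagreement d = 1 - a
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
votes, pred = ensemble_predict(models, X_test)
\end_layout

\begin_layout Plain Layout
d = disagreement(votes, pred, np.argmax(y_test, axis=1))
\end_layout

\end_inset

\end_layout

\begin_layout Standard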
Alongside the disagreement and ensemble test accuracy, the average individual accuracy for both test and training data is also presented. \end_layout \begin_layout Subsection Results \end_layout \begin_layout Standard \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp2-test8-error-rate-curves.png lyxscale 50 width 50col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Ensemble classifier performance results for \begin_inset Formula $\eta=0.03$ \end_inset , \begin_inset Formula $\beta=0.01$ \end_inset , nodes = \begin_inset Formula $1-400$ \end_inset , epochs = \begin_inset Formula $5-100$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp2-test8" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Standard An experiment with a fixed epoch value throughout the ensemble is presented in figure \begin_inset CommandInset ref LatexCommand ref reference "fig:exp2-test10" plural "false" caps "false" noprefix "false" \end_inset . Nodes between 1 and 400 were selected for the classifiers with a learning rate of \begin_inset Formula $\eta=0.15$ \end_inset and momentum of \begin_inset Formula $\beta=0.01$ \end_inset . The ensemble accuracy can be seen to be fairly constant across the number of horizontal models, with 3 models being the least accurate and having a higher standard deviation. The 3-model ensemble also shows a significant spike in disagreement and individual error rates which gradually decreases as the number of models increases. \end_layout \begin_layout Standard \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/exp2-test10-error-rate-curves.png lyxscale 50 width 50col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Ensemble classifier performance results for \begin_inset Formula $\eta=0.15$ \end_inset , \begin_inset Formula $\beta=0.01$ \end_inset , nodes = \begin_inset Formula $1-400$ \end_inset , epochs = 20 \begin_inset CommandInset label LatexCommand label name "fig:exp2-test10" \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \end_layout \end_inset \end_layout \begin_layout Subsection Discussion \end_layout \begin_layout Standard From the data of figure \begin_inset CommandInset ref LatexCommand ref reference "fig:exp2-test10" plural "false" caps "false" noprefix "false" \end_inset , 3 horizontal models was shown to be the worst-performing configuration with lower ensemble accuracy and higher disagreement. This is likely due to the larger proportion of the vote that a single model constitutes. When the ensemble is correct, three models can only have a disagreement of 1/3 or 0, and thus the final value will lie somewhere between these two. As the number of horizontal models increases, the number of possible disagreement values increases. \end_layout \begin_layout Section Optimiser Comparisons \begin_inset CommandInset label LatexCommand label name "sec:exp3" \end_inset \end_layout \begin_layout Standard Throughout the previous experiments the stochastic gradient descent optimiser was used to update the network's weights, but there are many different optimisation algorithms.
This section will present investigations into two other optimisation algorithms and discuss the differences between them using the horizontal ensemble classification of the previous section. \end_layout \begin_layout Standard Prior to these investigations, however, stochastic gradient descent and the two other subject algorithms will be described. \end_layout \begin_layout Subsection Optimisers \end_layout \begin_layout Subsubsection Stochastic Gradient Descent \begin_inset CommandInset label LatexCommand label name "subsec:Stochastic-Gradient-Descent" \end_inset \end_layout \begin_layout Standard Gradient descent and the closely related stochastic and mini-batch gradient descent are popular optimisation algorithms in the machine learning space. \end_layout \begin_layout Standard The aim of the neural networks in question is to make correct classifications on sample data being fed forward; ideally the network's classification would be equal to the provided label. A loss function, \begin_inset Formula $J$ \end_inset , is defined as the difference between the predicted output and the target labelled output \begin_inset Foot status open \begin_layout Plain Layout There are many different options for the loss function including mean squared error and categorical cross-entropy. Although they have significant differences, this coverage of optimisation algorithms does not rely on a specific loss function. \end_layout \end_inset ; it follows that we are aiming to minimise this as much as possible. In order to improve the network, the values of the parameters, \begin_inset Formula $\theta$ \end_inset , must be changed with the intention of reducing the loss value. From a set of starting weights, \begin_inset Formula $\theta_{0}$ \end_inset , this could be completed by finding the gradient of \family roman \series medium \shape up \size normal \emph off \bar no \strikeout off \xout off \uuline off \uwave off \noun off \color none \begin_inset Formula $J$ \end_inset w.r.t \family default \series default \shape default \size default \emph default \bar default \strikeout default \xout default \uuline default \uwave default \noun default \color inherit \begin_inset Formula $\theta_{0}$ \end_inset \family roman \series medium \shape up \size normal \emph off \bar no \strikeout off \xout off \uuline off \uwave off \noun off \color none . Formally this would be \begin_inset Formula $\nabla_{\theta_{0}}J\left(\theta_{0}\right)$ \end_inset , the first derivative of the loss function with respect to the current weights. In order to reduce the loss, the gradient should be subtracted from the current weights; a scale factor, \begin_inset Formula $\eta$ \end_inset , known as the learning rate, is defined to apply a tuneable proportion of the gradient to the starting values. \end_layout \begin_layout Standard In order to iteratively apply this algorithm, the form below is used for time steps, \begin_inset Formula $t$ \end_inset . \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} \theta_{t+1}=\theta_{t}-\eta\cdot\nabla_{\theta_{t}}J\left(\theta_{t}\right) \end{equation} \end_inset \end_layout \begin_layout Standard The difference between standard or batch gradient descent and the previously mentioned variants is how many samples are fed forward as part of the optimisation algorithm. Standard gradient descent propagates and calculates weight changes for the entire training dataset in a single iteration of the algorithm.
Stochastic gradient descent, instead, processes only one sample during an iteration. Mini-batch gradient descent strikes a balance between the two: the speed of stochastic gradient descent is largely retained, as more weight updates are made per pass, although the path through the error surface can be noisier than with vanilla gradient descent. Therefore, although the algorithm is colloquially referred to as gradient descent or SGD, strictly speaking mini-batch gradient descent is being used, as a batch size of 35 was used for this work. \end_layout \begin_layout Standard \noun on TensorFlow's \noun default implementation of SGD also includes a momentum parameter. Momentum aims to help a network increase the speed of convergence and reduce oscillations by reinforcing dimensions (weights) that are changing in a consistent direction while slowing dimensions that are changing direction rapidly \begin_inset CommandInset citation LatexCommand cite key "paperspace-mom-rmsprop-adam" literal "false" \end_inset . Momentum introduces a memory element to the descent by including a portion, \begin_inset Formula $\beta$ \end_inset , of the previous step's weight delta or \emph on velocity \emph default in subsequent iterations. \end_layout \begin_layout Standard The introduction of momentum can be described as below \begin_inset CommandInset citation LatexCommand cite key "tf.keras.optimizers.SGD" literal "false" \end_inset , \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} v_{t}=\beta\cdot v_{t-1}-\eta\cdot\nabla_{\theta_{t}}J\left(\theta_{t}\right)\label{eq:sgd-momentum} \end{equation} \end_inset \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} \theta_{t+1}=\theta_{t}+v_{t} \end{equation} \end_inset \end_layout \begin_layout Standard As previously presented (figure \begin_inset CommandInset ref LatexCommand ref reference "fig:exp1-momentums" plural "false" caps "false" noprefix "false" \end_inset ), momentum can significantly increase convergence speed. \end_layout \begin_layout Subsubsection RMSprop \end_layout \begin_layout Standard Although gradient descent is a powerful optimisation algorithm, there are drawbacks. One limitation is that the learning rate, \begin_inset Formula $\eta$ \end_inset , is a scalar applied to all gradients. As a result, parameters with smaller gradients, such as those found at saddle points, move slowly. An alternative would be to expand the single scalar to a learning rate per parameter that could move dynamically throughout the training process, known as adaptive learning rate optimisation. \end_layout \begin_layout Standard One such algorithm is RMSprop \begin_inset CommandInset citation LatexCommand cite key "rmsprop-hinton" literal "false" \end_inset or \emph on root mean square propagation \emph default , an unpublished algorithm that builds on previous adaptive algorithms such as Rprop and Adagrad. Rprop, for example, aimed to overcome the shortcomings of SGD by using just the sign of the calculated gradients and allowing the learning rate alone to define the size of the step. Instead of a constant or defined learning rate schedule, each learning rate \emph on floats \emph default and is scaled up or down based on whether the corresponding gradient consistently points in the same direction from iteration to iteration. \end_layout \begin_layout Standard Equations for RMSprop can be seen below \begin_inset CommandInset citation LatexCommand cite key "understanding-rmsprop" literal "false" \end_inset .
For conciseness, the previously defined derivative of the loss function w.r.t. the current parameters is shortened to \begin_inset Formula $g_{t}=\nabla_{\theta_{t}}J\left(\theta_{t}\right)$ \end_inset . \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} E\left[g^{2}\right]_{t}=\alpha\cdot E\left[g^{2}\right]_{t-1}+\left(1-\alpha\right)\cdot g_{t}^{2}\label{eq:rmsprop-expected-value} \end{equation} \end_inset \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} \theta_{t+1}=\theta_{t}-\frac{\eta}{\sqrt{E\left[g^{2}\right]_{t}+\epsilon}}g_{t}\label{eq:rmsprop-update} \end{equation} \end_inset \end_layout \begin_layout Standard As previously mentioned, using only the sign of the gradient can be achieved by dividing \begin_inset Formula $g$ \end_inset by its magnitude, \begin_inset Formula $|g|$ \end_inset . RMSprop extends this by instead dividing by the root of the exponentially decaying average of squared gradients, equation \begin_inset CommandInset ref LatexCommand ref reference "eq:rmsprop-expected-value" plural "false" caps "false" noprefix "false" \end_inset . In this equation, \begin_inset Formula $\alpha$ \end_inset constitutes the gradient decay rate; a value of 0.9 is suggested \begin_inset CommandInset citation LatexCommand cite key "understanding-rmsprop" literal "false" \end_inset . \begin_inset Formula $\epsilon$ \end_inset is a small constant on the order of \begin_inset Formula $1\times10^{-7}$ \end_inset that stops the algorithm from dividing by 0. \end_layout \begin_layout Subsubsection Adam \end_layout \begin_layout Standard Adam or \emph on adaptive moment estimation \emph default is an optimisation algorithm that combines the adaptive learning rates of RMSprop with the previously described momentum \begin_inset CommandInset citation LatexCommand cite key "adam-paper" literal "false" \end_inset . Like RMSprop, the exponential average of squared gradients is maintained, compare equations \begin_inset CommandInset ref LatexCommand ref reference "eq:rmsprop-expected-value" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand ref reference "eq:adam-squared-grad-accum" plural "false" caps "false" noprefix "false" \end_inset . In addition to this, however, the exponential average of gradients is maintained with a similar function to momentum, compare equations \begin_inset CommandInset ref LatexCommand ref reference "eq:sgd-momentum" plural "false" caps "false" noprefix "false" \end_inset and \begin_inset CommandInset ref LatexCommand ref reference "eq:adam-momentum" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} m_{t}=\beta_{1}\cdot m_{t-1}+\left(1-\beta_{1}\right)g_{t}\label{eq:adam-momentum} \end{equation} \end_inset \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} v_{t}=\beta_{2}\cdot v_{t-1}+\left(1-\beta_{2}\right)g_{t}^{2}\label{eq:adam-squared-grad-accum} \end{equation} \end_inset \end_layout \begin_layout Standard These two equations constitute the eponymous moments: \begin_inset Formula $m_{t}$ \end_inset is the first moment, or mean, while \begin_inset Formula $v_{t}$ \end_inset is the second moment, or uncentred variance, of the gradients. As these moments are initialised at zero, the estimates are biased towards 0. The original authors correct for this bias as below \begin_inset CommandInset citation LatexCommand cite key "adam-paper" literal "false" \end_inset .
\end_layout \begin_layout Standard \begin_inset Formula \begin{equation} \hat{m}_{t}=\frac{m_{t}}{1-\beta_{1}^{t}} \end{equation} \end_inset \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} \hat{v}_{t}=\frac{v_{t}}{1-\beta_{2}^{t}} \end{equation} \end_inset \end_layout \begin_layout Standard This leaves the update step itself, described below. Similarities can be seen between the previous RMSprop update step (equation \begin_inset CommandInset ref LatexCommand ref reference "eq:rmsprop-update" plural "false" caps "false" noprefix "false" \end_inset ) and that of Adam. The RMSprop squared-gradient average \family roman \series medium \shape up \size normal \emph off \bar no \strikeout off \xout off \uuline off \uwave off \noun off \color none \begin_inset Formula $E\left[g^{2}\right]_{t}$ \end_inset has been replaced by the equivalent bias-corrected \begin_inset Formula $\hat{v}_{t}$ \end_inset , while the calculated gradient, \begin_inset Formula $g_{t}$ \end_inset , has been replaced by the bias-corrected exponentially decaying average gradient, \begin_inset Formula $\hat{m}_{t}$ \end_inset . \end_layout \begin_layout Standard \begin_inset Formula \begin{equation} \theta_{t+1}=\theta_{t}-\frac{\eta}{\sqrt{\hat{v}_{t}+\epsilon}}\hat{m}_{t}\label{eq:adam-update} \end{equation} \end_inset \end_layout \begin_layout Subsection Results \end_layout \begin_layout Standard \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename /home/andy/dev/py/shallow-training/graphs/exp3-test1-error-rate-curves.png lyxscale 30 width 100col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Ensemble classifier performance results for SGD, RMSprop and Adam optimisation with \begin_inset Formula $\eta=0.1$ \end_inset , \begin_inset Formula $\beta=0.0$ \end_inset , nodes = 16, epochs = \begin_inset Formula $1-100$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp3-test1" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Standard \begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename /home/andy/dev/py/shallow-training/graphs/exp3-test7-error-rate-curves.png lyxscale 30 width 100col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Ensemble classifier performance results for SGD, RMSprop and Adam optimisation with \begin_inset Formula $\eta=0.1$ \end_inset , \begin_inset Formula $\beta=0.9$ \end_inset , nodes = \begin_inset Formula $1-400$ \end_inset , epochs = \begin_inset Formula $50-100$ \end_inset \begin_inset CommandInset label LatexCommand label name "fig:exp3-test7" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \begin_layout Subsection Discussion \end_layout \begin_layout Standard In suggesting an optimal algorithm, it is worth considering the intended domains of RMSprop and Adam. As newer algorithms, they tend to be focused on deep convolutional networks, which implies a somewhat different set of requirements. This is not to say that the algorithms are inappropriate for the presented application; as demonstrated, these more complex algorithms were able to outperform the employed gradient descent with optional momentum.
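\end_layout

\begin_layout Standard
To summarise the three optimisers compared above, the sketch below applies the SGD with momentum, RMSprop and Adam update rules described in this section to a toy one-dimensional quadratic loss; the hyperparameter values are common defaults rather than those used in the experiments.
\end_layout

\begin_layout Standard
\begin_inset listings
inline false
status open

\begin_layout Plain Layout
# Toy illustration of the SGD-with-momentum, RMSprop and Adam update rules
\end_layout

\begin_layout Plain Layout
# on a 1-D quadratic loss J(theta) = theta**2, so grad = 2*theta.
\end_layout

\begin_layout Plain Layout
# Hyperparameters are common defaults, not the experimental values.
\end_layout

\begin_layout Plain Layout
import numpy as np
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
def grad(theta):
\end_layout

\begin_layout Plain Layout
    return 2.0 * theta
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
def sgd_momentum(theta, eta=0.1, beta=0.9, steps=50):
\end_layout

\begin_layout Plain Layout
    v = 0.0
\end_layout

\begin_layout Plain Layout
    for _ in range(steps):
\end_layout

\begin_layout Plain Layout
        v = beta * v - eta * grad(theta)   # velocity update
\end_layout

\begin_layout Plain Layout
        theta = theta + v
\end_layout

\begin_layout Plain Layout
    return theta
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
def rmsprop(theta, eta=0.01, alpha=0.9, eps=1e-7, steps=50):
\end_layout

\begin_layout Plain Layout
    eg2 = 0.0
\end_layout

\begin_layout Plain Layout
    for _ in range(steps):
\end_layout

\begin_layout Plain Layout
        g = grad(theta)
\end_layout

\begin_layout Plain Layout
        eg2 = alpha * eg2 + (1 - alpha) * g**2        # squared-gradient average
\end_layout

\begin_layout Plain Layout
        theta = theta - eta / np.sqrt(eg2 + eps) * g  # RMSprop update
\end_layout

\begin_layout Plain Layout
    return theta
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
def adam(theta, eta=0.01, beta1=0.9, beta2=0.999, eps=1e-7, steps=50):
\end_layout

\begin_layout Plain Layout
    m = v = 0.0
\end_layout

\begin_layout Plain Layout
    for t in range(1, steps + 1):
\end_layout

\begin_layout Plain Layout
        g = grad(theta)
\end_layout

\begin_layout Plain Layout
        m = beta1 * m + (1 - beta1) * g       # first moment
\end_layout

\begin_layout Plain Layout
        v = beta2 * v + (1 - beta2) * g**2    # second moment
\end_layout

\begin_layout Plain Layout
        m_hat = m / (1 - beta1**t)            # bias correction
\end_layout

\begin_layout Plain Layout
        v_hat = v / (1 - beta2**t)
\end_layout

\begin_layout Plain Layout
        theta = theta - eta / np.sqrt(v_hat + eps) * m_hat
\end_layout

\begin_layout Plain Layout
    return theta
\end_layout

\begin_layout Plain Layout
\end_layout

\begin_layout Plain Layout
print(sgd_momentum(5.0), rmsprop(5.0), adam(5.0))
\end_layout

\end_inset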
\end_layout \begin_layout Section Conclusions \end_layout \begin_layout Standard \begin_inset Newpage newpage \end_inset \end_layout \begin_layout Standard \begin_inset CommandInset label LatexCommand label name "sec:bibliography" \end_inset \begin_inset CommandInset bibtex LatexCommand bibtex btprint "btPrintCited" bibfiles "references" options "bibtotoc" \end_inset \end_layout \begin_layout Section \start_of_appendix Network Parameter Counts \begin_inset CommandInset label LatexCommand label name "app:Network-Parameter-Counts" \end_inset \end_layout \begin_layout Standard \begin_inset Float table placement H wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout Hidden Nodes \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Trainable Parameters \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 1 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 14 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 2 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 26 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 4 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 50 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 8 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 98 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 16 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 194 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 32 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 386 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 64 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 770 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 128 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 1,538 \end_layout \end_inset \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Number of trainable parameters for architectures of varying numbers of hidden nodes \begin_inset CommandInset label LatexCommand label name "tab:trainable-params" \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \end_layout \end_inset \end_layout \begin_layout Section Source Code \begin_inset CommandInset label LatexCommand label name "sec:Source-Code" \end_inset \end_layout \begin_layout Standard \begin_inset CommandInset include LatexCommand lstinputlisting filename "../nncw.py" lstparams "caption={Formatted Jupyter notebook containing experiment code},label={notebook-code}" \end_inset \end_layout \begin_layout Section Network Graph \begin_inset CommandInset label LatexCommand label name "sec:Network-Graph" \end_inset \end_layout \begin_layout Standard \begin_inset Float figure placement H wide false sideways false status open \begin_layout Plain Layout \noindent \align center \begin_inset Graphics filename ../graphs/tensorboard-graph.png lyxscale 50 width 100col% \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout Single hidden layer neural network as graphed by \noun on Tensorboard \noun default \begin_inset CommandInset label LatexCommand label name "fig:tensorboard" \end_inset \end_layout \end_inset \end_layout \end_inset \end_layout \end_body \end_document