#LyX 2.3 created this file. For more info see http://www.lyx.org/ \lyxformat 544 \begin_document \begin_header \save_transient_properties true \origin unavailable \textclass article \begin_preamble \rfoot{6420013} \end_preamble \use_default_options true \maintain_unincluded_children false \language english \language_package default \inputencoding auto \fontencoding global \font_roman "default" "default" \font_sans "default" "default" \font_typewriter "default" "default" \font_math "auto" "auto" \font_default_family default \use_non_tex_fonts false \font_sc false \font_osf false \font_sf_scale 100 100 \font_tt_scale 100 100 \use_microtype false \use_dash_ligatures true \graphics default \default_output_format default \output_sync 0 \bibtex_command default \index_command default \paperfontsize default \spacing single \use_hyperref false \papersize default \use_geometry true \use_package amsmath 1 \use_package amssymb 1 \use_package cancel 1 \use_package esint 1 \use_package mathdots 1 \use_package mathtools 1 \use_package mhchem 1 \use_package stackrel 1 \use_package stmaryrd 1 \use_package undertilde 1 \cite_engine biblatex \cite_engine_type authoryear \biblio_style plain \biblatex_bibstyle ieee \biblatex_citestyle ieee \use_bibtopic false \use_indices false \paperorientation portrait \suppress_date true \justification true \use_refstyle 1 \use_minted 0 \index Index \shortcut idx \color #008000 \end_index \leftmargin 2cm \topmargin 2cm \rightmargin 2cm \bottommargin 2cm \secnumdepth 3 \tocdepth 3 \paragraph_separation indent \paragraph_indentation default \is_math_indent 0 \math_numbering_side default \quotes_style english \dynamic_quotes 0 \papercolumns 1 \papersides 1 \paperpagestyle fancy \tracking_changes false \output_changes false \html_math_output 0 \html_css_as_file 0 \html_be_strict false \end_header \begin_body \begin_layout Title Visual Search Coursework \end_layout \begin_layout Author Andy Pack (6420013) \end_layout \begin_layout LyX-Code \begin_inset 
Newpage pagebreak \end_inset \end_layout \begin_layout Section* Abstract \end_layout \begin_layout Standard abstract \end_layout \begin_layout LyX-Code \begin_inset CommandInset toc LatexCommand tableofcontents \end_inset \end_layout \begin_layout Quotation \begin_inset Newpage pagebreak \end_inset \end_layout \begin_layout Section Introduction \end_layout \begin_layout Standard An application of computer vision and visual media processing is that of visual search, the ability to quantitatively identify features of an image such that other images can be compared and ranked based on similarity. \end_layout \begin_layout Standard These measured features can be arranged as a data structure or descriptor and a visual search system can be composed of the extraction and comparison of these descriptors. It is an example of content-based image retrieval or CBIR. \end_layout \begin_layout Standard Visual search is used in consumer products to generate powerful results such as Google Lens and Google reverse image search. It also has applicability as smaller features of products such as 'related products' results. \end_layout \begin_layout Subsection Extraction \end_layout \begin_layout Standard When arranged as three 2D arrays of intensity for each colour channel, an image can be manipulated and measured to identify features using colour and shape information. The methods for doing so have varying applicability and efficacy to a visual search system, many also have variables which can be tuned to improve performance. \end_layout \begin_layout Subsection Comparison \end_layout \begin_layout Standard Typically a descriptor is a single column vector of numbers calculated about an image. This vector allows an image descriptor to be plotted as a point in a feature space of the same dimensionality as the vector. Images that are close together in this feature space will indicate that they have similar descriptors. 
Methods for calculating the distance will determine how images are ranked. \end_layout \begin_layout Section Descriptors \end_layout \begin_layout Subsection Average Colour \end_layout \begin_layout Standard Average colour represents one of the most basic descriptors capable of being calculated about an image, an array of three numbers for the average red, green and blue intensity values found in the image. \end_layout \begin_layout Standard These three numbers hold no information about the distribution of colour throughout the image and no information based on edge and shape information. The lack of either hinders its applicability to any real-world problems. The only advantage would be the speed of calculation. \end_layout \begin_layout Subsection Global Colour Histogram \end_layout \begin_layout Standard A global colour histogram extracts colour distribution information from an image which can be used as a descriptor. \end_layout \begin_layout Standard Each pixel in an image can be plotted as a point in its 3D colour space with the axes being red, green and blue intensity values for each pixel. Visually inspecting this colour space will provide information about colour scattering found throughout the image. As different resolutions of images will produce datasets of different sizes in the feature space, a descriptor must be devised that transforms this data into a resolution-agnostic form which can be compared. \end_layout \begin_layout Standard Each axis is partitioned into \begin_inset Formula $q$ \end_inset divisions so that a histogram can be calculated for each colour channel. Each channel's intensity value, \begin_inset Formula $val$ \end_inset , can be converted into an integer bin value using equation \begin_inset CommandInset ref LatexCommand ref reference "eq:integer-bin-calc" plural "false" caps "false" noprefix "false" \end_inset , where floor strips a float value into an integer by truncating all values past the decimal point. 
\end_layout \begin_layout Standard \begin_inset Formula \begin{equation} bin\:val=floor\left(q\cdotp\frac{val}{256}\right)\label{eq:integer-bin-calc} \end{equation} \end_inset \end_layout \begin_layout Standard This allows each pixel to now be represented as a 3D point of three 'binned' values, a full RGB colour space has been reduced to three colour histograms, one for each channel. In order to arrange this as a descriptor each point should be further reduced to a single number so that a global histogram can be formed of these values. This is done by taking decimal bin integers and concatenating them into a single number in base \begin_inset Formula $q$ \end_inset . For an RGB colour space, each pixel can be augmented as shown in equation \begin_inset CommandInset ref LatexCommand ref reference "eq:base-conversion" plural "false" caps "false" noprefix "false" \end_inset . \begin_inset Formula \begin{equation} pixel\:bin=red\:bin\cdotp q^{2}+green\:bin\cdotp q^{1}+blue\:bin\cdotp q^{0}\label{eq:base-conversion} \end{equation} \end_inset \end_layout \begin_layout Standard Calculating a histogram of each pixel's bin value will function as a descriptor for the image once normalised by count. This normalisation will remove the effect of changing resolutions of image. \end_layout \begin_layout Standard Each descriptor plots an image as a point in a \begin_inset Formula $q^{3}$ \end_inset -dimensional feature space where similarity can be computed using a suitable distance measure (L1 norm for example). \end_layout \begin_layout Subsubsection Efficacy \end_layout \begin_layout Standard The advantage of global colour histogram over the average RGB descriptor is that amounts of colours are now represented in the descriptor. Clusters of similar colours representing objects or backgrounds will be captured and can be compared. 
\end_layout \begin_layout Standard A global histogram, however, holds no spatial colour information, this is lost by plotting the pixels in their colour space. \end_layout \begin_layout Standard This suggests that performing a pixel shuffling operation on the image will not affect the extracted descriptor which has implications on the adequacy of the methodology for a visual search system. \end_layout \begin_layout Subsection Spatial Colour \end_layout \begin_layout Standard Spatial techniques involve calculating descriptors that are discriminative between colour and shape information in different regions of the image. This is done by dividing the image into a grid of cells and then calculating individual 'sub-descriptors' which are concatenated into the global image descriptor. \end_layout \begin_layout Standard These sub-descriptors can be calculated using any appropriate method however a main consideration should be the dimensionality of the final descriptor. This can be calculated using the following equation, \end_layout \begin_layout Standard \begin_inset Formula \[ D_{total}=W\cdotp H\cdotp D_{sub-descriptor} \] \end_inset \end_layout \begin_layout Standard Where \begin_inset Formula $W$ \end_inset and \begin_inset Formula $H$ \end_inset refer to the number of columns and rows of the determined grid respectively. \end_layout \begin_layout Standard It would be feasible to calculate a colour histogram however this already generates a descriptor of \begin_inset Formula $q^{3}$ \end_inset dimensionality, where \begin_inset Formula $q$ \end_inset is the number of divisions. \end_layout \begin_layout Standard For example using a \begin_inset Formula $q$ \end_inset value of 4 and a spatial grid of 6 x 4 would produce a descriptor in 1536 dimensions, while a \begin_inset Formula $q$ \end_inset of 6 with a grid of 10 x 6 is 12,960 dimensional. 
\end_layout \begin_layout Standard This is an extremely high value and will increase the time taken to calculate and compare descriptors. \end_layout \begin_layout Standard For a spatial colour descriptor the average RGB values for each cell can be used as these sub-descriptors will be three-dimensional, reducing the total value. \end_layout \begin_layout Subsubsection Efficacy \end_layout \begin_layout Standard Computing a spatial descriptor can increase performance when highlighting the difference to a colour histogram. While a colour histogram will describe how many of each colour is present in an image, spatial colour techniques of the type described above will indicate the colours found in each area of the image. Considering an image of a cow in a field, the colour histogram will identify and count the brown pixels of the cow and the green pixels of the field, spatial colour techniques will identify an area of brown in the middle of an image surrounded by an area of green. \end_layout \begin_layout Subsection Spatial Texture \end_layout \begin_layout Standard Spatial texture replaces the colour sub-descriptor from before with a descriptor that reflects the texture found in the image as described by the edges that can be detected. \end_layout \begin_layout Subsubsection Edge Detection \end_layout \begin_layout Standard Edges can be detected in an image by finding areas where neighbouring pixels have significantly different intensities. \end_layout \begin_layout Standard Mathematically this can be seen as taking the first derivative of the image by convolving it with a Sobel filter. The Sobel filters are a pair of 3x3 kernels, one for each axis (see figure \begin_inset CommandInset ref LatexCommand ref reference "fig:3x3-Sobel-filter" plural "false" caps "false" noprefix "false" \end_inset ), which approximate the gradient of the greyscale intensity of an image. 
\begin_inset Float figure wide false sideways false status open \begin_layout Plain Layout \align center \begin_inset Formula $S_{x}=\begin{bmatrix}-1 & 0 & +1\\ -2 & 0 & +2\\ -1 & 0 & +1 \end{bmatrix}$ \end_inset \begin_inset space \qquad{} \end_inset \begin_inset Formula $S_{y}=\begin{bmatrix}+1 & +2 & +1\\ 0 & 0 & 0\\ -1 & -2 & -1 \end{bmatrix}$ \end_inset \end_layout \begin_layout Plain Layout \begin_inset Caption Standard \begin_layout Plain Layout 3x3 Sobel filter kernels for \begin_inset Formula $x$ \end_inset and \begin_inset Formula $y$ \end_inset axes \begin_inset CommandInset label LatexCommand label name "fig:3x3-Sobel-filter" \end_inset \end_layout \end_inset \end_layout \begin_layout Plain Layout \end_layout \end_inset \end_layout \begin_layout Standard The results of convolving each filter with the image are two images that express the intensity of edges in that axis. \end_layout \begin_layout Standard From here a composite edge magnitude image of the two can be calculated as shown, \end_layout \begin_layout Standard \begin_inset Formula \[ G_{composite}=\sqrt{G_{x}^{2}+G_{y}^{2}} \] \end_inset \end_layout \begin_layout Standard With the angles of the edges calculated as follows, \end_layout \begin_layout Standard \begin_inset Formula \[ \Theta=\arctan\left(\frac{G_{y}}{G_{x}}\right) \] \end_inset \end_layout \begin_layout Subsubsection Application \end_layout \begin_layout Standard To create a descriptor, both the angle and magnitude information will be used, the descriptor itself will reflect information about the angle of the edges found. \end_layout \begin_layout Standard First the image grid cells will be thresholded using the magnitude values. Magnitude values can be seen to represent the confidence with which edges can be found and so here a decision is effectively being made as to what are and are not edges, this threshold value can be tuned to best match the application. 
\end_layout \begin_layout Standard Once a thresholded edge magnitude image has been found, a normalised histogram will be calculated for the angles of these edges. The histograms of each grid cell will act as the descriptor when concatenated into a vector of dimensionality, \begin_inset Formula $D_{total}$ \end_inset , \end_layout \begin_layout Standard \begin_inset Formula \[ D_{total}=W\cdotp H\cdotp q \] \end_inset \end_layout \begin_layout Standard Where \begin_inset Formula $q$ \end_inset refers to the number of edge histogram bins. \end_layout \begin_layout Subsection Principal Component Analysis \end_layout \begin_layout Section Distance Measures \end_layout \begin_layout Standard Once image descriptors are plotted in a feature space a visual search system compares descriptors by measuring the distance between them. The method for doing so will affect the ranking of descriptors. \end_layout \begin_layout Subsection L1 Norm \end_layout \begin_layout Subsection L2 Norm \end_layout \begin_layout Standard The L2 norm, or Euclidean distance, is the shortest distance between two points in space, it is also referred to as the magnitude of a vector. In a three-dimensional Euclidean space the magnitude of a vector, \begin_inset Formula $x=\left(i,j,k\right)$ \end_inset , is given by, \end_layout \begin_layout Standard \begin_inset Formula \[ \left\Vert x\right\Vert _{2}=\sqrt{i^{2}+j^{2}+k^{2}} \] \end_inset \end_layout \begin_layout Standard Its intuitive distance measurement makes it the most commonly used norm in Euclidean space. \end_layout \begin_layout Subsection Mahalanobis Distance \end_layout \begin_layout Section Test Methods \end_layout \begin_layout Subsection Dataset \end_layout \begin_layout Standard For the purposes of these experiments the Microsoft MSRC \begin_inset CommandInset citation LatexCommand cite key "microsoft_msrc" literal "false" \end_inset version 2 dataset was used. 
The set is made up of 591 images across 20 categories, the classifications for which can be seen in appendix \begin_inset CommandInset ref LatexCommand ref reference "sec:MSRC-Dataset-Classifications" plural "false" caps "false" noprefix "false" \end_inset . \end_layout \begin_layout Standard Worth noting about the dataset is that there are some similarities and overlap between categories which has implications on the results which can be calculated when using it. \end_layout \begin_layout Standard For example category 1 is a collection of images of cows, sheep and horses on grass however cows and sheep each have their own distinct categories. Category 18 also has many similarities to category 20 with both being mainly shots of bodies of water and boats in water of varying sizes. \end_layout \begin_layout Standard During the evaluation of implemented visual search techniques the classification of each image is done by referencing the group index they are named with. As such, occurrences of false negatives may increase as images that do in fact look similar as they are both, say, images of cows will be marked as not similar and measure negatively for the performance of the method. \end_layout \begin_layout Subsection Precision and Recall \end_layout \begin_layout Standard When comparing the effectiveness of different descriptors the main measurements are those of precision and recall. \end_layout \begin_layout Standard Once the visual search system has ranked a dataset on similarity to a query image, the precision and recall can be calculated up to \begin_inset Formula $n$ \end_inset images through the ranked list. \end_layout \begin_layout Standard At each \begin_inset Formula $n$ \end_inset the precision is defined as the fraction of images up to \begin_inset Formula $n$ \end_inset that are classed as relevant. 
Higher precision values indicate better system accuracy and an ideal system response as \begin_inset Formula $n$ \end_inset increases would be a precision of 1 until all relevant documents have been returned at which point it would reduce to a minimum value of the fraction of relevant documents in the dataset. This would indicate that the system is able to select a relevant image every time one is available. \end_layout \begin_layout Standard The recall is defined at \begin_inset Formula $n$ \end_inset as the fraction of the available relevant results that have been returned up to \begin_inset Formula $n$ \end_inset . Higher recall values at \begin_inset Formula $n$ \end_inset indicate that the system can recall relevant documents faster with fewer false positives. Recall begins at 0 before increasing to a maximum of 1 as \begin_inset Formula $n$ \end_inset increases when all have been returned. \end_layout \begin_layout Standard While both measurements appear to reflect similar concepts there is a difference. Precision is a measure of how accurately a system can decide whether a document is relevant while recall can be thought of as a measure of a system's repeated accuracy and measures how long it takes to retrieve all relevant documents. \end_layout \begin_layout Standard A system with high recall but low precision will indicate that the system is effectively able to retrieve all relevant documents eventually however there will be false positives within the results. Results of this quality would be advantageous when it is important to obtain all relevant results however not when the relevance of each and every one is valued. \end_layout \begin_layout Standard A system with high precision but low recall would indicate that the system is able to be very confident in its selection of relevant documents but may indicate an increase in false negatives. 
\end_layout \begin_layout Subsection Precision Recall Curve \end_layout \begin_layout Standard A way to visualise the response of a visual search system is to calculate both precision and recall for all values of \begin_inset Formula $n$ \end_inset and plot each pair against each other for what is known as a precision-recall curve or PR curve. \end_layout \begin_layout Subsection Methods \end_layout \begin_layout Standard In order to evaluate the performance of each descriptor two different tests were conducted. \end_layout \begin_layout Subsubsection Category Response \end_layout \begin_layout Standard The category response aims to control for a descriptor's varying performance at each of the dataset's categories by looping through each category and randomly selecting an image from each as the query image. Each category iteration has precision and recall values calculated for all \begin_inset Formula $n$ \end_inset to allow the mean average precision to be calculated. This mean value is calculated from 20 iterations for the MSRCv2 dataset. \end_layout \begin_layout Standard Completing one iteration for each category also allows a confusion matrix to be constructed. For each iteration the top 20 results were evaluated, this number was chosen as this is approximately the mean number of images in each category. \end_layout \begin_layout Standard The completed confusion matrix allows the main category confusions to be identified and discussions to be made. \end_layout \begin_layout Subsubsection Random Response \end_layout \begin_layout Standard The random response places emphasis on iteration over controlling for inter-category response. Here query images are selected at random from the entire dataset and many iterations are run in order to identify a mean response. 
\end_layout \begin_layout Section Results \end_layout \begin_layout Subsection Average RGB \end_layout \begin_layout Subsection Global Colour Histogram \end_layout \begin_layout Subsection Spatial Colour \end_layout \begin_layout Subsection Spatial Colour and Texture \end_layout \begin_layout Section Discussion \end_layout \begin_layout Section Conclusions \end_layout \begin_layout Standard \begin_inset Newpage pagebreak \end_inset \end_layout \begin_layout Standard \start_of_appendix \begin_inset CommandInset bibtex LatexCommand bibtex btprint "btPrintCited" bibfiles "references" options "plain" \end_inset \end_layout \begin_layout Section MSRCv2 Dataset Classifications \begin_inset CommandInset label LatexCommand label name "sec:MSRC-Dataset-Classifications" \end_inset \end_layout \begin_layout Standard \align center \begin_inset Tabular \begin_inset Text \begin_layout Plain Layout Category Index \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Category Classification \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 1 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Farm Animal \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 2 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Tree \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 3 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Building \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 4 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Plane \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 5 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Cow \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 6 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Face \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 7 \end_layout \end_inset \begin_inset Text \begin_layout 
Plain Layout Car \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 8 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Bike \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 9 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Sheep \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 10 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Flower \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 11 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Sign \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 12 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Bird \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 13 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Books \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 14 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Bench \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 15 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Cat \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 16 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Dog \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 17 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Road \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 18 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Water Features \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 19 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Human Figures \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout 20 \end_layout \end_inset \begin_inset Text \begin_layout Plain Layout Coast \end_layout \end_inset \end_inset \end_layout \end_body \end_document