564 lines
13 KiB
Plaintext
564 lines
13 KiB
Plaintext
#LyX 2.3 created this file. For more info see http://www.lyx.org/
|
|
\lyxformat 544
|
|
\begin_document
|
|
\begin_header
|
|
\save_transient_properties true
|
|
\origin unavailable
|
|
\textclass article
|
|
\begin_preamble
|
|
\def\changemargin#1#2{\list{}{\rightmargin#2\leftmargin#1}\item[]}
|
|
\let\endchangemargin=\endlist
|
|
\pagenumbering{roman}
|
|
|
|
\usepackage{color}
|
|
|
|
\definecolor{commentgreen}{RGB}{0,94,11}
|
|
\end_preamble
|
|
\use_default_options true
|
|
\begin_modules
|
|
customHeadersFooters
|
|
minimalistic
|
|
todonotes
|
|
figs-within-sections
|
|
\end_modules
|
|
\maintain_unincluded_children false
|
|
\language english
|
|
\language_package default
|
|
\inputencoding auto
|
|
\fontencoding global
|
|
\font_roman "default" "default"
|
|
\font_sans "default" "default"
|
|
\font_typewriter "default" "default"
|
|
\font_math "auto" "auto"
|
|
\font_default_family default
|
|
\use_non_tex_fonts false
|
|
\font_sc false
|
|
\font_osf false
|
|
\font_sf_scale 100 100
|
|
\font_tt_scale 100 100
|
|
\use_microtype true
|
|
\use_dash_ligatures true
|
|
\graphics default
|
|
\default_output_format default
|
|
\output_sync 0
|
|
\bibtex_command biber
|
|
\index_command default
|
|
\paperfontsize 11
|
|
\spacing onehalf
|
|
\use_hyperref true
|
|
\pdf_title "Linear Predictive Speech Synthesizer"
|
|
\pdf_author "Andy Pack"
|
|
\pdf_subject "EEEM030 Speech & Audio Processing & Recognition"
|
|
\pdf_keywords "EEEM030"
|
|
\pdf_bookmarks true
|
|
\pdf_bookmarksnumbered false
|
|
\pdf_bookmarksopen false
|
|
\pdf_bookmarksopenlevel 1
|
|
\pdf_breaklinks false
|
|
\pdf_pdfborder true
|
|
\pdf_colorlinks false
|
|
\pdf_backref false
|
|
\pdf_pdfusetitle true
|
|
\papersize default
|
|
\use_geometry true
|
|
\use_package amsmath 1
|
|
\use_package amssymb 1
|
|
\use_package cancel 1
|
|
\use_package esint 1
|
|
\use_package mathdots 1
|
|
\use_package mathtools 1
|
|
\use_package mhchem 1
|
|
\use_package stackrel 1
|
|
\use_package stmaryrd 1
|
|
\use_package undertilde 1
|
|
\cite_engine biblatex
|
|
\cite_engine_type authoryear
|
|
\biblio_style plain
|
|
\biblio_options urldate=long
|
|
\biblatex_bibstyle ieee
|
|
\biblatex_citestyle ieee
|
|
\use_bibtopic false
|
|
\use_indices false
|
|
\paperorientation portrait
|
|
\suppress_date true
|
|
\justification true
|
|
\use_refstyle 1
|
|
\use_minted 0
|
|
\index Index
|
|
\shortcut idx
|
|
\color #008000
|
|
\end_index
|
|
\leftmargin 1.8cm
|
|
\topmargin 2cm
|
|
\rightmargin 1.8cm
|
|
\bottommargin 2cm
|
|
\secnumdepth 3
|
|
\tocdepth 3
|
|
\paragraph_separation skip
|
|
\defskip medskip
|
|
\is_math_indent 0
|
|
\math_numbering_side default
|
|
\quotes_style english
|
|
\dynamic_quotes 0
|
|
\papercolumns 1
|
|
\papersides 1
|
|
\paperpagestyle fancy
|
|
\bullet 1 0 9 -1
|
|
\bullet 2 0 24 -1
|
|
\tracking_changes false
|
|
\output_changes false
|
|
\html_math_output 0
|
|
\html_css_as_file 0
|
|
\html_be_strict false
|
|
\end_header
|
|
|
|
\begin_body
|
|
|
|
\begin_layout Title
|
|
|
|
\size giant
|
|
Linear Predictive Speech Synthesizer
|
|
\end_layout
|
|
|
|
\begin_layout Author
|
|
Andy Pack
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset VSpace 15pheight%
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\align center
|
|
\begin_inset Graphics
|
|
filename surrey.png
|
|
lyxscale 15
|
|
width 40col%
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset VSpace vfill
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\noindent
|
|
\align center
|
|
EEEM030
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
November 2020
|
|
\size large
|
|
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
Department of Electrical and Electronic Engineering
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
Faculty of Engineering and Physical Sciences
|
|
\begin_inset Newline newline
|
|
\end_inset
|
|
|
|
University of Surrey
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage newpage
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Abstract
|
|
Abstract
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset toc
|
|
LatexCommand tableofcontents
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage newpage
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset FloatList figure
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset toc
|
|
LatexCommand lstlistoflistings
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage newpage
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Right Footer
|
|
Andy Pack / 6420013
|
|
\end_layout
|
|
|
|
\begin_layout Left Footer
|
|
November 2020
|
|
\end_layout
|
|
|
|
\begin_layout Left Header
|
|
EEEM030 Coursework 1
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset ERT
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
pagenumbering{arabic}
|
|
\end_layout
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
|
|
\backslash
|
|
setcounter{page}{1}
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Introduction
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Brief
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The aim of this report is to demonstrate how digital signal processing technique
|
|
s can be used to analyse, model and synthesise speech.
|
|
The task will be considered as two areas of concern: modelling
|
|
and synthesis.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The modelling stage will utilise Linear Predictive Coding and the source-filter
|
|
model of speech to construct a filter that acts similarly to the vocal
|
|
tract's effect on sound produced by the vocal cords.
|
|
Comparisons of the frequency response for both the estimated filter and
|
|
the original sound will be presented; the effect of different filter orders
|
|
will also be demonstrated.
|
|
Relevant parameters of the original vowel speech segment will be presented
|
|
including the fundamental frequency and formant frequencies.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The synthesis stage will complete the source-filter model of speech by creating
|
|
a suitable periodic sound source to be modulated by the previous filter.
|
|
With a complete source-filter model, artificial vowel sounds will be synthesise
|
|
d and analysed.
|
|
Subjective assessments will be made as to the differences between the original
|
|
sound and the final product of the model when system parameters are varied.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Implementation
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
The implementation of this system was completed using
|
|
\noun on
|
|
Matlab
|
|
\noun default
|
|
with aid from functions in the digital signal processing toolbox among
|
|
others.
|
|
Following loading a vowel sample, a segment of variable length (100 ms was
|
|
standard) was clipped for processing.
|
|
The clip optionally also underwent pre-emphasis using a high pass filter.
|
|
As speech spectra can tend to have higher energy at lower frequencies,
|
|
the use of pre-emphasis can balance the magnitude across the spectrum.
|
|
A first-order filter was used and the coefficient varied; over-use could
|
|
prove excessive for higher frequencies including fricative sounds.
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Modelling
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In order to estimate the filter state of the vocal tract, the linear predictive
|
|
coding coefficients of varying orders were calculated using the
|
|
\begin_inset listings
|
|
lstparams "language=Matlab,basicstyle={\ttfamily},tabsize=4"
|
|
inline true
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
lpc(signal, order)
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
function.
|
|
In order to compare the frequency response of the LPC filter with the original
|
|
signal, the Fourier transform of the signal was calculated.
|
|
The frequency domain representation of the LPC filter was found using the
|
|
|
|
\begin_inset listings
|
|
lstparams "language=Matlab,basicstyle={\ttfamily},tabsize=4"
|
|
inline true
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
freqz(b, a, n, f)
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
function and co-plotted with the original signal.
|
|
This frequency plot of the LPC filter constitutes the spectral envelope
|
|
of the signal and the vowel formant frequencies can be found at the maxima
|
|
of the spectrum.
|
|
Due to the smooth profile of the LPC spectrum, formant frequencies were
|
|
estimated by identifying the local maxima of the function.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In order to find the fundamental frequency of the signal, the cepstrum was
|
|
used.
|
|
The use of a low pass filter was investigated in order to smooth the cepstrum
|
|
before programmatically finding pitch period candidates by applying
|
|
\begin_inset Formula $x$
|
|
\end_inset
|
|
|
|
and
|
|
\begin_inset Formula $y$
|
|
\end_inset
|
|
|
|
thresholds.
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Synthesis
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
In order to synthesise speech, a periodic impulse train at the identified
|
|
fundamental frequency of the original vowel was generated.
|
|
The impulse train was sampled at the same frequency as the original sound.
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Results
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
LPC Filter
|
|
\end_layout
|
|
|
|
\begin_layout Subsubsection
|
|
Order Variation
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Spectral Analysis
|
|
\end_layout
|
|
|
|
\begin_layout Subsubsection
|
|
Fundamental Frequency
|
|
\end_layout
|
|
|
|
\begin_layout Subsubsection
|
|
Formant Frequencies
|
|
\end_layout
|
|
|
|
\begin_layout Subsubsection
|
|
Cepstrum Smoothing
|
|
\end_layout
|
|
|
|
\begin_layout Subsection
|
|
Synthesis
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Discussion
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
Conclusion
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset Newpage newpage
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:bibliography"
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset CommandInset bibtex
|
|
LatexCommand bibtex
|
|
btprint "btPrintCited"
|
|
bibfiles "references"
|
|
options "bibtotoc"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Section
|
|
\start_of_appendix
|
|
Source Code
|
|
\begin_inset CommandInset label
|
|
LatexCommand label
|
|
name "sec:Code"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
While much of the code was developed in individual scripts in order to experimen
|
|
t with separate aspects of the system, for collecting results a script which
|
|
constitutes the entire system was written,
|
|
\begin_inset listings
|
|
lstparams "basicstyle={\ttfamily}"
|
|
inline true
|
|
status open
|
|
|
|
\begin_layout Plain Layout
|
|
|
|
lpss.m
|
|
\end_layout
|
|
|
|
\end_inset
|
|
|
|
.
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../lpss.m"
|
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, mfcc, spectro, fft_, autocorr, clip_segment, islocalmax, ms_to_samples, rceps, cceps, ones, audioplayer, play, get_impulse_train, lpc},caption={Main script including source-filter model and spectral analysis},label={main_script}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\begin_inset Newpage pagebreak
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../func/spectro.m"
|
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples},caption={Spectrogram plotting wrapper function},label={spectrogram_function}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../func/fft_.m"
|
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram},caption={Fast Fourier transform wrapper function},label={fft_function}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../func/autocorr.m"
|
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram},caption={Autocorrelation plotting wrapper function},label={autocorr_function}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../func/clip_segment.m"
|
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples},caption={Retrieve a segment of the original speech signal},label={clip_segment_function}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../func/ms_to_samples.m"
|
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram},caption={Transform time in milliseconds into the respective number of samples},label={ms_to_samples_function}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\begin_layout Standard
|
|
\begin_inset CommandInset include
|
|
LatexCommand lstinputlisting
|
|
filename "../func/get_impulse_train.m"
|
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples, repmat},caption={Generate an impulse train of given fundamental frequency at a provided sampling frequency for a given length of time},label={get_impulse_train_function}"
|
|
|
|
\end_inset
|
|
|
|
|
|
\end_layout
|
|
|
|
\end_body
|
|
\end_document
|