first draft? added output sounds, referencing
This commit is contained in:
parent
4910c2c20d
commit
b2d3bccb29
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,4 @@
|
|||||||
*~
|
*~*
|
||||||
*#
|
*#
|
||||||
*.pdf
|
*.pdf
|
||||||
samples
|
samples
|
||||||
|
21
lpss.m
21
lpss.m
@ -4,10 +4,13 @@
|
|||||||
|
|
||||||
close all;clear all;clc;
|
close all;clear all;clc;
|
||||||
|
|
||||||
|
NAME = 'hood_m';
|
||||||
|
% NAME = 'head_f';
|
||||||
|
|
||||||
SEGMENT_LENGTH = 100; % ms
|
SEGMENT_LENGTH = 100; % ms
|
||||||
SEGMENT_OFFSET = 20; % ms from start
|
SEGMENT_OFFSET = 20; % ms from start
|
||||||
|
|
||||||
LPC_ORDER = 25;
|
LPC_ORDER = 30;
|
||||||
AC_DISP_SAMPLES = 1000; % autocorrelation display samples
|
AC_DISP_SAMPLES = 1000; % autocorrelation display samples
|
||||||
WINDOW_NUMBER = 10; % number of windows for spectrogram
|
WINDOW_NUMBER = 10; % number of windows for spectrogram
|
||||||
WINDOW_OVERLAP = 10; % ms
|
WINDOW_OVERLAP = 10; % ms
|
||||||
@ -36,15 +39,15 @@ ORIG_LPC_T_COMPARE = false;
|
|||||||
ORIG_SPECTROGRAM = true;
|
ORIG_SPECTROGRAM = true;
|
||||||
SYNTH_SPECTROGRAM = true;
|
SYNTH_SPECTROGRAM = true;
|
||||||
|
|
||||||
SYNTHESISED_SOUND_LENGTH = 1000; % ms
|
SYNTHESISED_SOUND_LENGTH = 100; % ms
|
||||||
|
|
||||||
WRITE = false;
|
WRITE = ~true;
|
||||||
PLAY = false;
|
PLAY = ~false;
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%% READ SIGNAL
|
%% READ SIGNAL
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
[y, Fs] = audioread('samples/head_f.wav');
|
[y, Fs] = audioread(strcat('samples/', NAME, '.wav'));
|
||||||
% take segment of sample for processing
|
% take segment of sample for processing
|
||||||
y = clip_segment(y, Fs, SEGMENT_LENGTH, SEGMENT_OFFSET);
|
y = clip_segment(y, Fs, SEGMENT_LENGTH, SEGMENT_OFFSET);
|
||||||
y_orig = y;
|
y_orig = y;
|
||||||
@ -75,7 +78,7 @@ AC_DISP_SAMPLES = min([AC_DISP_SAMPLES L]);
|
|||||||
figure(1)
|
figure(1)
|
||||||
plot(x, y(end-AC_DISP_SAMPLES+1:end), x, est_y(end-AC_DISP_SAMPLES+1:end), '--')
|
plot(x, y(end-AC_DISP_SAMPLES+1:end), x, est_y(end-AC_DISP_SAMPLES+1:end), '--')
|
||||||
|
|
||||||
grid
|
grid % toggle grid lines on the current axes
|
||||||
xlabel('Sample Number')
|
xlabel('Sample Number')
|
||||||
ylabel('Amplitude')
|
ylabel('Amplitude')
|
||||||
legend('Original signal','LPC estimate')
|
legend('Original signal','LPC estimate')
|
||||||
@ -171,9 +174,9 @@ plot(ceps_t(1:round(L / 2)), c(1:round(L / 2)))
|
|||||||
%% MAXIMA
|
%% MAXIMA
|
||||||
% value threshold
|
% value threshold
|
||||||
c(c < CEPSTRUM_THRESHOLD) = 0;
|
c(c < CEPSTRUM_THRESHOLD) = 0;
|
||||||
cep_maxima_indexes = islocalmax(c);
|
|
||||||
|
|
||||||
cep_maxima_times = ceps_t(1:round(L / 2));
|
% local maxima
|
||||||
|
cep_maxima_indexes = islocalmax(c);
|
||||||
cep_maxima_times = ceps_t(cep_maxima_indexes);
|
cep_maxima_times = ceps_t(cep_maxima_indexes);
|
||||||
c = c(cep_maxima_indexes);
|
c = c(cep_maxima_indexes);
|
||||||
|
|
||||||
@ -218,7 +221,7 @@ if exist('fundamental_freq')
|
|||||||
synth_sound = filter(1, a, excitation);
|
synth_sound = filter(1, a, excitation);
|
||||||
|
|
||||||
if WRITE
|
if WRITE
|
||||||
audiowrite('out.wav', synth_sound, Fs);
|
audiowrite(strcat('synthed/', NAME, '_o', num2str(LPC_ORDER), '_', num2str(SEGMENT_LENGTH), '_', num2str(SEGMENT_OFFSET), 'ms.wav'), synth_sound, Fs);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -10,3 +10,64 @@
|
|||||||
year = {2015}
|
year = {2015}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@misc{etsi-gsm,
|
||||||
|
author = {ETSI},
|
||||||
|
month = may,
|
||||||
|
organization = {European Telecommunications Standards Institute},
|
||||||
|
title = {Digital cellular telecommunications system (Phase 2+); Full rate speech; Transcoding; ETS 300 961},
|
||||||
|
url = {https://www.etsi.org/deliver/etsi_i_ets/300900_300999/300961/02_60/ets_300961e02p.pdf},
|
||||||
|
year = {1998}
|
||||||
|
}
|
||||||
|
|
||||||
|
@online{all-pole-resonance,
|
||||||
|
author = {Kim, Hyung-Suk},
|
||||||
|
organization = {Center for Computer Research in Music and Acoustics, Stanford University},
|
||||||
|
title = {Linear Predictive Coding is All-Pole Resonance Modeling},
|
||||||
|
url = {https://ccrma.stanford.edu/~hskim08/lpc},
|
||||||
|
year = {2014}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{quefrency,
|
||||||
|
author = {Oppenheim, A.V. and Schafer, Ronald},
|
||||||
|
doi = {10.1109/MSP.2004.1328092},
|
||||||
|
journal = {Signal Processing Magazine, IEEE},
|
||||||
|
month = {10},
|
||||||
|
pages = {95--106},
|
||||||
|
title = {From Frequency to Quefrency: A History of the Cepstrum},
|
||||||
|
url = {https://www.researchgate.net/publication/3321562_From_Frequency_to_Quefrency_A_History_of_the_Cepstrum},
|
||||||
|
volume = {21},
|
||||||
|
year = {2004}
|
||||||
|
}
|
||||||
|
|
||||||
|
@online{source-filter-macquaire,
|
||||||
|
author = {Mannell, Robert},
|
||||||
|
month = mar,
|
||||||
|
organization = {Department of Linguistics, Macquarie University},
|
||||||
|
title = {Source-Filter Theory of Speech Production},
|
||||||
|
url = {https://www.mq.edu.au/about/about-the-university/faculties-and-departments/medicine-and-health-sciences/departments-and-centres/department-of-linguistics/our-research/phonetics-and-phonology/speech/acoustics/acoustic-theory-of-speech-production/source-filter-theory},
|
||||||
|
year = {2020}
|
||||||
|
}
|
||||||
|
|
||||||
|
@online{max-min,
|
||||||
|
author = {{Whitman College}},
|
||||||
|
title = {Maxima and Minima},
|
||||||
|
url = {https://www.whitman.edu/mathematics/calculus_online/section05.01.html}
|
||||||
|
}
|
||||||
|
|
||||||
|
@online{islocalmax,
|
||||||
|
author = {{MathWorks}},
|
||||||
|
organization = {MathWorks},
|
||||||
|
subtitle = {Find local maxima},
|
||||||
|
title = {islocalmax},
|
||||||
|
url = {https://www.mathworks.com/help/matlab/ref/islocalmax.html}
|
||||||
|
}
|
||||||
|
|
||||||
|
@online{aalto-fundamental-freq,
|
||||||
|
author = {B{\"a}ckstr{\"o}m, Tom},
|
||||||
|
month = aug,
|
||||||
|
organization = {Aalto University},
|
||||||
|
title = {Fundamental frequency (F0)},
|
||||||
|
url = {https://wiki.aalto.fi/pages/viewpage.action?pageId=149890776},
|
||||||
|
year = {2020}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -181,7 +181,15 @@ University of Surrey
|
|||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Abstract
|
\begin_layout Abstract
|
||||||
Abstract
|
A system implementing the source-filter model of speech is presented and
|
||||||
|
evaluated using vowel segments as subjects.
|
||||||
|
Linear predictive coding is used to estimate the formant frequencies of
|
||||||
|
the samples while the cepstrum is used to identify the fundamental frequency.
|
||||||
|
Comparisons of the LPC filter spectrum with the original audio spectrum
|
||||||
|
are provided.
|
||||||
|
A periodic impulse train of the same pitch period is used to synthesise
|
||||||
|
vowel samples, and a subjective analysis of the segment quality is given.
|
||||||
|
Evaluations of various parameter variations are also presented.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
@ -272,12 +280,19 @@ Introduction
|
|||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
The ability to process and analyse speech signals has facilitated developments
|
Speech analysis and processing is an ever-expanding space with applications
|
||||||
throughout their use in the digital space with applications from data compressi
|
from data compression to speech recognition.
|
||||||
on to speech recognition.
|
The latter is a particularly relevant and popular area, presenting an important
|
||||||
|
domain for AI and machine learning applications.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Section
|
\begin_layout Standard
|
||||||
|
Prior to these, however, the ability to analyse, transform and identify
|
||||||
|
key parameters for a speech signal is an important tool that will be explored
|
||||||
|
herein.
|
||||||
|
\end_layout
|
||||||
|
|
||||||
|
\begin_layout Subsection
|
||||||
Brief
|
Brief
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
@ -289,14 +304,37 @@ s can be used to analyse, model and synthesise speech.
|
|||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
The modelling stage will utilise Linear Predictive Coding and the source-filter
|
The modelling stage will utilise Linear Predictive Coding
|
||||||
model of speech to construct an all-pole filter that acts similarly to
|
\begin_inset CommandInset citation
|
||||||
the vocal tract's effect on sound produced by the vocal chords.
|
LatexCommand cite
|
||||||
|
key "all-pole-resonance"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
and the source-filter model of speech
|
||||||
|
\begin_inset CommandInset citation
|
||||||
|
LatexCommand cite
|
||||||
|
key "source-filter-macquaire"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
to construct an all-pole filter that acts similarly to the vocal tract's
|
||||||
|
effect on sound produced by the vocal cords.
|
||||||
Comparisons of the frequency response for both the estimated filter and
|
Comparisons of the frequency response for both the estimated filter and
|
||||||
the original sound will be presented, the effect of different filter orders
|
the original sound will be presented, the effect of different filter orders
|
||||||
will also be demonstrated.
|
will also be demonstrated.
|
||||||
Relevant parameters of the original vowel speech segment will be presented
|
Relevant parameters of the original vowel speech segment will be presented
|
||||||
including the fundamental frequency and formant frequencies.
|
including the fundamental frequency
|
||||||
|
\begin_inset CommandInset citation
|
||||||
|
LatexCommand cite
|
||||||
|
key "aalto-fundamental-freq"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
and formant frequencies.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
@ -321,12 +359,7 @@ Matlab
|
|||||||
others.
|
others.
|
||||||
Following loading a vowel sample, a segment of given length (100ms was
|
Following loading a vowel sample, a segment of given length (100ms was
|
||||||
typical) was clipped for processing.
|
typical) was clipped for processing.
|
||||||
The clip optionally also underwent pre-emphasis using a high pass filter.
|
The investigations were conducted on two samples,
|
||||||
As speech spectra can tend to have higher energy at lower frequencies,
|
|
||||||
the use of pre-emphasis can balance the magnitude across the spectrum.
|
|
||||||
A first order filter was used and the coefficient varied, over-use could
|
|
||||||
prove excessive for higher frequencies including fricative sounds.
|
|
||||||
The majority of the investigations were conducted on two samples,
|
|
||||||
\begin_inset listings
|
\begin_inset listings
|
||||||
lstparams "language=Matlab,basicstyle={\ttfamily},tabsize=4"
|
lstparams "language=Matlab,basicstyle={\ttfamily},tabsize=4"
|
||||||
inline true
|
inline true
|
||||||
@ -403,12 +436,27 @@ freqz(b, a, n, f)
|
|||||||
of the signal and the vowel formant frequencies can be found at the maxima
|
of the signal and the vowel formant frequencies can be found at the maxima
|
||||||
of the spectrum.
|
of the spectrum.
|
||||||
The smooth profile of the LPC spectrum allowed the formant frequencies
|
The smooth profile of the LPC spectrum allowed the formant frequencies
|
||||||
to be estimated by identifying the local maxima of the function.
|
to be estimated by identifying the local maxima
|
||||||
|
\begin_inset CommandInset citation
|
||||||
|
LatexCommand cite
|
||||||
|
key "max-min,islocalmax"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
of the function.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
In order to find the fundamental frequency of the signal, the cepstrum was
|
In order to find the fundamental frequency of the signal, the cepstrum
|
||||||
used.
|
\begin_inset CommandInset citation
|
||||||
|
LatexCommand cite
|
||||||
|
key "quefrency"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
was used.
|
||||||
Regular periodic frequencies in the time domain present as a peak in the
|
Regular periodic frequencies in the time domain present as a peak in the
|
||||||
quefrency domain, this can also be achieved with an auto-correlation function.
|
quefrency domain, this can also be achieved with an auto-correlation function.
|
||||||
The use of a low-pass filter was investigated in order to smooth the cepstrum
|
The use of a low-pass filter was investigated in order to smooth the cepstrum
|
||||||
@ -434,7 +482,15 @@ islocalmax(x)
|
|||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
function.
|
function
|
||||||
|
\begin_inset CommandInset citation
|
||||||
|
LatexCommand cite
|
||||||
|
key "islocalmax"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
.
|
||||||
A minimum quefrency threshold of 20 was applied to ignore the transient-like
|
A minimum quefrency threshold of 20 was applied to ignore the transient-like
|
||||||
oscillations at small
|
oscillations at small
|
||||||
\begin_inset Formula $x$
|
\begin_inset Formula $x$
|
||||||
@ -446,7 +502,7 @@ islocalmax(x)
|
|||||||
sampled at 24kHz, a frequency higher than that of the fundamental frequency
|
sampled at 24kHz, a frequency higher than that of the fundamental frequency
|
||||||
being investigated.
|
being investigated.
|
||||||
Additionally a minimum cepstrum threshold of 0.075 was used, from here the
|
Additionally a minimum cepstrum threshold of 0.075 was used, from here the
|
||||||
maximum value was used as the pitch period.
|
quefrency candidate with the highest value was used as the pitch period.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Subsection
|
\begin_layout Subsection
|
||||||
@ -472,7 +528,15 @@ noprefix "false"
|
|||||||
.
|
.
|
||||||
In order to produce the final synthesised speech, the generated impulse
|
In order to produce the final synthesised speech, the generated impulse
|
||||||
train must be convolved (in the time domain) with the transfer function
|
train must be convolved (in the time domain) with the transfer function
|
||||||
of the LPC filter representing the vocal tract.
|
of the LPC filter representing the vocal tract
|
||||||
|
\begin_inset CommandInset citation
|
||||||
|
LatexCommand cite
|
||||||
|
key "source-filter-macquaire"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
.
|
||||||
In
|
In
|
||||||
\noun on
|
\noun on
|
||||||
Matlab
|
Matlab
|
||||||
@ -1572,7 +1636,7 @@ noprefix "false"
|
|||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
, where the order of the
|
, the order of the
|
||||||
\begin_inset listings
|
\begin_inset listings
|
||||||
lstparams "basicstyle={\ttfamily}"
|
lstparams "basicstyle={\ttfamily}"
|
||||||
inline true
|
inline true
|
||||||
@ -1648,6 +1712,19 @@ name "fig:Spectrum-Tile"
|
|||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
|
|
||||||
|
\end_layout
|
||||||
|
|
||||||
|
\begin_layout Standard
|
||||||
|
\begin_inset Flex TODO Note (inline)
|
||||||
|
status open
|
||||||
|
|
||||||
|
\begin_layout Plain Layout
|
||||||
|
segment length variation?
|
||||||
|
\end_layout
|
||||||
|
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Subsection
|
\begin_layout Subsection
|
||||||
@ -1659,9 +1736,8 @@ Formant Frequencies
|
|||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
As described previously, the smooth profile of the LPC filter spectra makes
|
As described previously, the smooth profile of the LPC filter spectra allows
|
||||||
the use of the local maxima of this curve reasonable estimations as to
|
the local maxima to be used as reasonable estimations of the peaks.
|
||||||
the peaks.
|
|
||||||
The first three formants for the order 25 filters seen in figure
|
The first three formants for the order 25 filters seen in figure
|
||||||
\begin_inset CommandInset ref
|
\begin_inset CommandInset ref
|
||||||
LatexCommand ref
|
LatexCommand ref
|
||||||
@ -1892,7 +1968,7 @@ hood_m
|
|||||||
\begin_inset Text
|
\begin_inset Text
|
||||||
|
|
||||||
\begin_layout Plain Layout
|
\begin_layout Plain Layout
|
||||||
1,209
|
1,209.0
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
@ -2376,7 +2452,7 @@ noprefix "false"
|
|||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
.
|
, [1 -0.7] were used as coefficients.
|
||||||
When employing smoothing, the peak corresponding to the pitch period has
|
When employing smoothing, the peak corresponding to the pitch period has
|
||||||
been amplified compared to the unsmoothed curve where the pitch period
|
been amplified compared to the unsmoothed curve where the pitch period
|
||||||
does not reach far beyond the noise of the rest of the function.
|
does not reach far beyond the noise of the rest of the function.
|
||||||
@ -2428,7 +2504,8 @@ head_f
|
|||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
with and without low-pass filtering, thresholded local maxima crossed
|
with and without low-pass filtering, thresholded local maxima crossed,
|
||||||
|
smoothing coefficients: [1 -0.7]
|
||||||
\begin_inset CommandInset label
|
\begin_inset CommandInset label
|
||||||
LatexCommand label
|
LatexCommand label
|
||||||
name "fig:smoothed-cepstrum"
|
name "fig:smoothed-cepstrum"
|
||||||
@ -2711,70 +2788,6 @@ name "tab:fund-freq"
|
|||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
|
||||||
|
|
||||||
\begin_layout Subsubsection
|
|
||||||
Pre-emphasis
|
|
||||||
\end_layout
|
|
||||||
|
|
||||||
\begin_layout Standard
|
|
||||||
\begin_inset Float figure
|
|
||||||
wide false
|
|
||||||
sideways false
|
|
||||||
status open
|
|
||||||
|
|
||||||
\begin_layout Plain Layout
|
|
||||||
\noindent
|
|
||||||
\align center
|
|
||||||
\begin_inset Graphics
|
|
||||||
filename ../resources/hood_m_spect_25_premph_0.9.png
|
|
||||||
lyxscale 20
|
|
||||||
width 80col%
|
|
||||||
|
|
||||||
\end_inset
|
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
|
||||||
|
|
||||||
\begin_layout Plain Layout
|
|
||||||
\begin_inset Caption Standard
|
|
||||||
|
|
||||||
\begin_layout Plain Layout
|
|
||||||
LPC spectra for
|
|
||||||
\begin_inset listings
|
|
||||||
lstparams "basicstyle={\ttfamily}"
|
|
||||||
inline true
|
|
||||||
status open
|
|
||||||
|
|
||||||
\begin_layout Plain Layout
|
|
||||||
|
|
||||||
hood_m
|
|
||||||
\end_layout
|
|
||||||
|
|
||||||
\end_inset
|
|
||||||
|
|
||||||
following pre-emphasis using coefficients, [1 -0.9]
|
|
||||||
\begin_inset CommandInset label
|
|
||||||
LatexCommand label
|
|
||||||
name "fig:pre-emph-spectrum"
|
|
||||||
|
|
||||||
\end_inset
|
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
|
||||||
|
|
||||||
\end_inset
|
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
|
||||||
|
|
||||||
\begin_layout Plain Layout
|
|
||||||
|
|
||||||
\end_layout
|
|
||||||
|
|
||||||
\end_inset
|
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Subsection
|
\begin_layout Subsection
|
||||||
@ -2796,8 +2809,8 @@ noprefix "false"
|
|||||||
.
|
.
|
||||||
The circled areas highlight similar portions, the formant frequencies can
|
The circled areas highlight similar portions, the formant frequencies can
|
||||||
be seen in both.
|
be seen in both.
|
||||||
Despite being quasi-stationary, variation can be seen in time for the original
|
Despite being quasi-stationary, some variation in time can be seen for
|
||||||
signal.
|
the original signal.
|
||||||
The stationary synthesised signal, however, has a flat profile in time.
|
The stationary synthesised signal, however, has a flat profile in time.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
@ -2850,27 +2863,88 @@ name "fig:Spectrograms-synth"
|
|||||||
|
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
|
\begin_layout Standard
|
||||||
|
At lower filter orders (< 10), the synthesised speech has a
|
||||||
|
\emph on
|
||||||
|
buzzy
|
||||||
|
\emph default
|
||||||
|
quality resembling a sawtooth wave of the same pitch as the original voice
|
||||||
|
sample.
|
||||||
|
At these orders, the synthesised sound cannot accurately be discerned
|
||||||
|
as being speech.
|
||||||
|
As the filter order increases, the tone of the sound becomes less harsh
|
||||||
|
and by around order 20 the sample could be identified as being of a voice.
|
||||||
|
By order 40, much of the harsh tone has been smoothed and the sample subjective
|
||||||
|
ly sounds as close to human speech as could be achieved.
|
||||||
|
Beyond this order, although the sound does change and smooth, it does not
|
||||||
|
appear to further approach the quality of the original sound.
|
||||||
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Section
|
\begin_layout Section
|
||||||
Discussion
|
Discussion
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
\begin_inset Flex TODO Note (inline)
|
As presented, the order of the LPC filter is a critical parameter for audio
|
||||||
status open
|
quality.
|
||||||
|
An order that is too low will not allow the filter to accurately map to
|
||||||
\begin_layout Plain Layout
|
the desired vowel spectrum leaving a sound that, although at the right
|
||||||
do numbers on compression
|
pitch, does not appreciably sound like the source segment.
|
||||||
|
At the other end, increasing the order beyond a certain complexity can
|
||||||
|
result in diminishing returns.
|
||||||
|
Although the output sounded smoother, beyond around order 40 it did not
|
||||||
|
noticeably further approach the original sound.
|
||||||
|
Subjectively, an order of 30 provided a good approximation of the input
|
||||||
|
sound with acceptable quality for low bandwidth transmission.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
|
\begin_layout Standard
|
||||||
|
The use of low-pass filtering on the cepstrum when identifying the fundamental
|
||||||
|
frequency was effective in accentuating the peak corresponding to the pitch
|
||||||
|
period.
|
||||||
|
With this, a higher
|
||||||
|
\begin_inset Formula $y$
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
threshold could be used that would be further from the noise of the function
|
||||||
|
while still consistently identifying the correct peak.
|
||||||
|
\end_layout
|
||||||
|
|
||||||
|
\begin_layout Standard
|
||||||
|
A 100ms vowel segment sampled at 24kHz totals to 2,400 samples.
|
||||||
|
Assuming that each is represented by a float of 4 bytes, this uncompressed
|
||||||
|
vowel segment would fill 9,600 bytes of storage.
|
||||||
|
Encoding the same 100ms of information via LPC using an order 30 filter
|
||||||
|
could reduce this to 120 bytes, just 1.25% of the previous space.
|
||||||
|
This is particularly important for audio transmission such as in mobile
|
||||||
|
telecoms; the GSM standard uses codecs based on LPC
|
||||||
|
\begin_inset CommandInset citation
|
||||||
|
LatexCommand cite
|
||||||
|
key "etsi-gsm"
|
||||||
|
literal "false"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
|
.
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Section
|
\begin_layout Section
|
||||||
Conclusion
|
Conclusion
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
|
\begin_layout Standard
|
||||||
|
Within this work, a complete source-filter model of speech has been presented,
|
||||||
|
analysing vowel samples and re-synthesising them while compressing the
|
||||||
|
data representation.
|
||||||
|
The effect of changing the complexity of this representation was investigated
|
||||||
|
by varying the order of the LPC filter and describing the effect on the
|
||||||
|
final audio sample.
|
||||||
|
Various statistics about the original samples were calculated including
|
||||||
|
the formant frequencies and the fundamental frequency.
|
||||||
|
With a sufficient filter order, sound samples comparable to the originals
|
||||||
|
were generated.
|
||||||
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
\begin_inset Newpage newpage
|
\begin_inset Newpage newpage
|
||||||
\end_inset
|
\end_inset
|
||||||
@ -2895,6 +2969,10 @@ options "bibtotoc"
|
|||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
|
|
||||||
|
\begin_inset Newpage pagebreak
|
||||||
|
\end_inset
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Section
|
\begin_layout Section
|
||||||
@ -2936,22 +3014,18 @@ Additional helper functions were written to plot and manipulate data.
|
|||||||
\begin_inset CommandInset include
|
\begin_inset CommandInset include
|
||||||
LatexCommand lstinputlisting
|
LatexCommand lstinputlisting
|
||||||
filename "../lpss.m"
|
filename "../lpss.m"
|
||||||
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, mfcc, spectro, fft_, autocorr, clip_segment, islocalmax, ms_to_samples, rceps, cceps, ones, audioplayer, play, get_impulse_train, lpc},caption={Main script including source-filter model and spectral analysis},label={main_script}"
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, mfcc, spectro, fft_, autocorr, clip_segment, islocalmax, ms_to_samples, rceps, cceps, ones, audioplayer, play, get_impulse_train, lpc, strcat, num2str, xlim},caption={Main script including source-filter model and spectral analysis},label={main_script}"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
|
|
||||||
\begin_inset Newpage pagebreak
|
|
||||||
\end_inset
|
|
||||||
|
|
||||||
|
|
||||||
\end_layout
|
\end_layout
|
||||||
|
|
||||||
\begin_layout Standard
|
\begin_layout Standard
|
||||||
\begin_inset CommandInset include
|
\begin_inset CommandInset include
|
||||||
LatexCommand lstinputlisting
|
LatexCommand lstinputlisting
|
||||||
filename "../func/spectro.m"
|
filename "../func/spectro.m"
|
||||||
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples},caption={Spectrogram plotting wrapper function},label={spectrogram_function}"
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples, xlim},caption={Spectrogram plotting wrapper function},label={spectrogram_function}"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
@ -2962,7 +3036,7 @@ lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},comm
|
|||||||
\begin_inset CommandInset include
|
\begin_inset CommandInset include
|
||||||
LatexCommand lstinputlisting
|
LatexCommand lstinputlisting
|
||||||
filename "../func/fft_.m"
|
filename "../func/fft_.m"
|
||||||
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram},caption={Fast Fourier transform wrapper function},label={fft_function}"
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, xlim},caption={Fast Fourier transform wrapper function},label={fft_function}"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
@ -2973,7 +3047,7 @@ lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},comm
|
|||||||
\begin_inset CommandInset include
|
\begin_inset CommandInset include
|
||||||
LatexCommand lstinputlisting
|
LatexCommand lstinputlisting
|
||||||
filename "../func/autocorr.m"
|
filename "../func/autocorr.m"
|
||||||
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram},caption={Autocorrelation plotting wrapper function},label={autocorr_function}"
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, xlim},caption={Autocorrelation plotting wrapper function},label={autocorr_function}"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
@ -2984,7 +3058,7 @@ lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},comm
|
|||||||
\begin_inset CommandInset include
|
\begin_inset CommandInset include
|
||||||
LatexCommand lstinputlisting
|
LatexCommand lstinputlisting
|
||||||
filename "../func/clip_segment.m"
|
filename "../func/clip_segment.m"
|
||||||
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples},caption={Retrieve a segment of the original speech signal},label={clip_segment_function}"
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples, xlim},caption={Retrieve a segment of the original speech signal},label={clip_segment_function}"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
@ -2995,7 +3069,7 @@ lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},comm
|
|||||||
\begin_inset CommandInset include
|
\begin_inset CommandInset include
|
||||||
LatexCommand lstinputlisting
|
LatexCommand lstinputlisting
|
||||||
filename "../func/ms_to_samples.m"
|
filename "../func/ms_to_samples.m"
|
||||||
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram},caption={Transform time in milliseconds into the respective number of samples},label={ms_to_samples_function}"
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, xlim},caption={Transform time in milliseconds into the respective number of samples},label={ms_to_samples_function}"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
@ -3006,7 +3080,7 @@ lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},comm
|
|||||||
\begin_inset CommandInset include
|
\begin_inset CommandInset include
|
||||||
LatexCommand lstinputlisting
|
LatexCommand lstinputlisting
|
||||||
filename "../func/get_impulse_train.m"
|
filename "../func/get_impulse_train.m"
|
||||||
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples, repmat},caption={Generate an impulse rate of given fundamental frequency at a provided sampling frequency for a given length of time},label={get_impulse_train_function}"
|
lstparams "breaklines=true,frame=tb,language=Matlab,basicstyle={\\ttfamily},commentstyle={\\color{commentgreen}\\itshape},keywordstyle={\\color{blue}},emphstyle={\\color{red}},stringstyle={\\color{red}},identifierstyle={\\color{cyan}},morekeywords={audioread, aryule, xcorr, freqz, spectrogram, ms_to_samples, repmat, xlim},caption={Generate an impulse rate of given fundamental frequency at a provided sampling frequency for a given length of time},label={get_impulse_train_function}"
|
||||||
|
|
||||||
\end_inset
|
\end_inset
|
||||||
|
|
||||||
|
BIN
synthed/head_f_o10_100_20ms.wav
Normal file
BIN
synthed/head_f_o10_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o15_100_20ms.wav
Normal file
BIN
synthed/head_f_o15_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o20_100_20ms.wav
Normal file
BIN
synthed/head_f_o20_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o25_100_20ms.wav
Normal file
BIN
synthed/head_f_o25_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o30_100_20ms.wav
Normal file
BIN
synthed/head_f_o30_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o40_100_20ms.wav
Normal file
BIN
synthed/head_f_o40_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o50_100_20ms.wav
Normal file
BIN
synthed/head_f_o50_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o5_100_20ms.wav
Normal file
BIN
synthed/head_f_o5_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o60_100_20ms.wav
Normal file
BIN
synthed/head_f_o60_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o70_100_20ms.wav
Normal file
BIN
synthed/head_f_o70_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/head_f_o80_100_20ms.wav
Normal file
BIN
synthed/head_f_o80_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o10_100_20ms.wav
Normal file
BIN
synthed/hood_m_o10_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o15_100_20ms.wav
Normal file
BIN
synthed/hood_m_o15_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o20_100_20ms.wav
Normal file
BIN
synthed/hood_m_o20_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o25_100_20ms.wav
Normal file
BIN
synthed/hood_m_o25_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o30_100_20ms.wav
Normal file
BIN
synthed/hood_m_o30_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o40_100_20ms.wav
Normal file
BIN
synthed/hood_m_o40_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o50_100_20ms.wav
Normal file
BIN
synthed/hood_m_o50_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o5_100_20ms.wav
Normal file
BIN
synthed/hood_m_o5_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o60_100_20ms.wav
Normal file
BIN
synthed/hood_m_o60_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o70_100_20ms.wav
Normal file
BIN
synthed/hood_m_o70_100_20ms.wav
Normal file
Binary file not shown.
BIN
synthed/hood_m_o80_100_20ms.wav
Normal file
BIN
synthed/hood_m_o80_100_20ms.wav
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user