I'm developing a Speech Recognition engine for recognizing few (10-14) isolated words. I'm using MFCC (Mel Frequency Cepstral Coefficient) method and doing it using MATLAB. I'm referring a research paper and a website and other sources. I have mentioned the links below. Following are the main problem I'm facing:
Knowing whether the sampling frequency of 44100Hz that I have chosen is correct or not?
Whether the lower frequency=300Hz and upper frequency=8000Hz that is chosen to calculate Mel Filter Bank Matrix is correct or not?
Whether the frame-size=256, and no.of filter banks(coefficients)=20 that I have chosen is suitable for my application?
Whether pre-emphasis filter used is good? Or should I use butterworth filter?
Is normalisation of samples required before processing/ after filtering? Which normalisation technique should I use in that case?
CODE STRUCTURE:
RUN the MainRunning.m file. (Used to run code for the TRAINING SET once, and then the test samples)
SevenStep.m contains the code for training
SevenStepTestSample.m contains the code for testing
I have put my code in this link
Research Paper I'm referring is in this link
Please suggest the error that I'm getting and the optimal upper frequency, lower frequency, frame-size, no. of filter banks for my application? How do I find the Hmatrix in my code correctly?(Mel Filter bank). Also do suggest any other errors, if any.
Thanks a lot in advance.
CODES:
MainRunning.m
wordArray= {'begin', 'continue', 'exit', 'move', 'save', 'undo'};
% //NOT USING THESE REMAINING WORDS NOW :, 'a', 'b', 'c', 'd', 'one', 'two', 'three', 'four', 'a1', 'a2', 'a3', 'a4'};
SevenStep();
for j=1:6 %for all words in database
ADDITION=0;
disp(strcat('CHECKING FOR THE WORD: ',upper(wordArray{j})));
for i=1:15 % each word has 15 samples for testing the accuracy of system
[Value1]=SevenStepTestSample(strcat('C:\Users\Ray\Documents\BE Project\Phase-III\FreshAudioSamples\test\',wordArray{j},num2str(i),'.wav'));
Result1=strcmpi(Value1,wordArray{j});
%Result2=strcmpi(Value2,wordArray{j});
ADDITION=ADDITION+((Result1));
end
fprintf('%s\t%d\n',upper(wordArray{j}),ADDITION); %Used to display no. of words correctly identified out of 15
end
SevenStep.m
function SevenStep()
tic
%-----------------------------------------------------------------INITIALISATION---------------------------------------------------------------%
path_start='C:\Users\Ray\Documents\BE Project\Phase-III\FreshAudioSamples\';
path_end='.wav';
alpha=0.97;
fs=44100; %frequency at which I have sampled my recorded samples
pad_length=120064;
numSamplesOfEachWord=30; %in database of samples
numOfWords=6;
frame_length=256; %How to choose an appropriate frame-size?
no_of_frames=pad_length/(int32(frame_length/2)); %since overlap is 50%
low_freq=300; %lower frequency for calculation of mel frequency filter bank (I'm unable to choose a correct one, and find the criteria for choosing it)
high_freq=8000; %upper frequency for calculation of mel frequency filter bank (I'm unable to choose a correct one, and find the criteria for choosing it)
no_of_coeffs=20; % This is no. of Mel-Filter banks to create. how to choose a approriate value for this?
%{
fnm = 'C:\Users\Ray\Documents\BE Project\Phase-III\trifbank.mat';
m1= matfile(fnm);
H=m1.H;
//Method 2 which I used to calculate when my Mel Frequency Filter Bank
(given below) was probably not working correctly
%}
%--------------------------------------------------PRE-PROCESSING FOR MEL FILTER BANK CREATION-----------------------------------------------%
low_linear=2595*log10(1+(low_freq/700));
high_linear=2595*log10(1+(high_freq/700));
band_length=(high_linear-low_linear)/(no_of_coeffs+1);
MelArray(no_of_coeffs+2,1)=zeros(); %to store mel frequencies to calculate mel frequency filter bank
LinearArray(no_of_coeffs+2,1)=zeros(); %to store linear frequencies to calculate mel frequency filter bank
FreqArray(no_of_coeffs+2,1)=zeros(); %to store frequency array to calculate mel frequency filter bank
%{
THIS ARRAY MAY HAVE WRONG VALUES DUE TO SELECTION of WRONG PARAMETERS LIKE low_freq, high_freq, frame_length (frame-size), no_of_coeffs (no. of filter banks). THIS IS MAJOR REASON BEHIND GENERATION OF NaN values in HMatrix
%}
HMatrix(no_of_coeffs,frame_length)=zeros(); %Hmk Matrix/ Filter Bank
%I'M VERY DOUBTFUL OF THE
%VALUES GENERATED BY THIS
%FILTER BANK
MelArray(1)=low_linear;
MelArray(no_of_coeffs+2)=high_linear;
LinearArray(1)=low_freq;
LinearArray(no_of_coeffs+2)=high_freq;
FreqArray(1)=floor((int32(frame_length)+1)*LinearArray(1)/fs);
FreqArray(no_of_coeffs+2)=floor((int32(frame_length)+1)*LinearArray(no_of_coeffs+2)/fs);
for m=1:no_of_coeffs
MelArray(m+1)=MelArray(m)+band_length;
LinearArray(m+1)=700*((power( 10,MelArray(m+1)/2595))-1);
FreqArray(m+1)=floor((int32(frame_length)+1)*LinearArray(m+1)/fs);
end
% THE MOST DOUBTFUL PART i.e. MEL FREQUENCY FILTER BANK MATRIX CREATION
%---------------------------------------------------------PROBABLE ERRONEOUS PART------------------------------------------------------------%
% I'M GETTING NaN values in this matrix probably due to choosing
% incorrect parameters for like upper freq, lower freq, frame-size, no.of filter banks, sampling frequency etc.
for k=1:frame_length
for m=1:no_of_coeffs
if(k<FreqArray(m))
HMatrix(m,k)=0;
elseif (FreqArray(m)<=k && k<=FreqArray(m+1))
HMatrix(m,k)=(k-FreqArray(m))/(FreqArray(m+1)-FreqArray(m));
elseif(FreqArray(m+1)<=k && k<=FreqArray(m+2))
HMatrix(m,k)=(FreqArray(m+2)-k)/(FreqArray(m+2)-FreqArray(m+1));
elseif (k>FreqArray(m+2))
HMatrix(m,k)=0;
end
end
end
%--------------------------------------------------------------------------------------------------------------------------------------------%
SM(no_of_frames,frame_length)=zeros();
FVector(no_of_frames,no_of_coeffs,numOfWords*numSamplesOfEachWord)=zeros();
LogFiltered(no_of_frames,no_of_coeffs)=zeros();
FinalResult(no_of_frames,no_of_coeffs)=zeros();
WindowValues=hamming(frame_length);
wordArray= {'begin\', 'continue\', 'exit\', 'move\', 'save\', 'undo\'};
% //NOT USING THESE REMAINING WORDS NOW : 'a\', 'b\', 'c\', 'd\', 'one\', 'two\', 'three\', 'four\', 'move\a1\', 'move\a2\', 'move\a3\', 'move\a4\'};
%{
//NOT USING THIS NOW
filenaming = 'C:\Users\Ray\Documents\BE Project\Phase-III\Normalising.mat';
m = matfile(filenaming);
Nrm=m.Nrm;
%}
%------------------------------------------------------------------TRAINING--------------------------------------------------------------------%
for i=1:numOfWords
for j=1:numSamplesOfEachWord
%----------------------------------------------------------READING WORD SAMPLE---------------------------------------------------%
SD=wavread(strcat(path_start,wordArray{i},num2str(j),path_end));
%SD=Nrm(j+(i-1)*numSamplesOfEachWord,:); //NOT USING THIS NOW
%SD=NoiseFilter(SD); % //NOT USING THIS NOW
%----------------------------------------------------PRE-EMPHASIS FILTER-------------------------------------------------------%
SD=filter([1 -alpha], 1, SD);
%------------------------------------------------------------PADDING-----------------------------------------------------------%
SD=pad(SD,pad_length); % function given at the end of this file
%-------------------------------------------------------FRAMING AND WINDOWING-------------------------------------------------------%
%----------------------------------------------------AND CALCULATING FFT OF OBATINED SAMPLES--------------------------------------------%
SD=reshape(SD,no_of_frames,frame_length/2);
for k=1:no_of_frames-1
temp=fft(([SD(k,:) SD(k+1,:)]'.*WindowValues)');
SM(k,:)=temp.*(conj(temp));
end
temp=fft(([SD(no_of_frames,:) zeros(1,frame_length/2)]'.*WindowValues)');
SM(no_of_frames,:)=temp.*(conj(temp));
%-------------------------------------------------APPLYING MEL - FILTER BANK AND LOG FILTERING--------------------------------------------%
for h=1:no_of_frames
for l=1:no_of_coeffs
LogFiltered(h,l) = log10(sum(SM(h,:).*HMatrix(l,:)));
end
end
%----------------------------------------------------------APPLYING DCT---------------------------------------------------------%
for indexing=1:no_of_frames
FinalResult(indexing,:)=dct(LogFiltered(indexing,:));
end
%---------------------------------------------------STORING TRAINING DATASET---------------------------------------------------%
FVector(1:no_of_frames,1:no_of_coeffs,j+(i-1)*numSamplesOfEachWord)=FinalResult;
end
end
save('SevenStep'); %Saving all variables to retrieve during running the program for SR
toc
end
function p=pad(x,full_length)
p = vertcat(x, zeros(full_length - length(x),1));
end
%{
NOT USING THIS NOW, BUT IF YOU CAN SUGGEST CHANGES IN THIS IT WOULD
APPRECIABLE
function Y=NoiseFilter(SD)
[B, A]= butter(2,0.01); % IS THIS CORRECT FOR MY SAMPLES?
X= filter(B, A, SD);
%figure; plot(SD);
X(abs(X)< 0.01) = 0;
%figure; plot(X);
%X=X(find(X,1,'first'):find(X,1,'last'));
Y=SD(find(X,1,'first'):find(X,1,'last')); % TAKING FROM ORIGINAL SAMPLE
%figure; plot(Y);
end
%}
SevenStepTestSample.m
function [StrVal1] =SevenStepTestSample(filename)
tic
alpha=0.97;
KVal=13; %The no. of values used for final comparison
pad_length=120064;
numSamplesOfEachWord=10;
numOfWords=6;
frame_length=256; %How to choose an appropriate frame-size?
no_of_frames=pad_length/(frame_length/2); %since overlap is 50%
no_of_coeffs=20; % This is no. of Mel-Filter banks to create. how to choose a approriate value for this?
wordArrayPrint= {'begin', 'continue', 'exit', 'move', 'save', 'undo'};
% NOT USING THE REMAINING WORDS FOR NOW :'a', 'b', 'c', 'd', 'one', 'two', 'three', 'four', 'a1', 'a2', 'a3', 'a4'};
filenm = 'C:\Users\Ray\Documents\BE Project\Phase-III\SevenStep.mat'; % This is the storage space of training set data
m = matfile(filenm);
FVector=m.FVector;
HMatrix=m.HMatrix;
WindowValues=m.WindowValues;
TSM(no_of_frames,frame_length)=zeros();
TLogFiltered(no_of_frames,no_of_coeffs)=zeros();
TFinalResult(no_of_frames,no_of_coeffs)=zeros();
[testSample,t1] = wavread(filename);
%{
WAS USING THIS FOR NORMALISATION OF VOICE SAMPLES, NOT USING IT NOW
tt=max(abs(testSample));
for j=1:length(testSample)
RD(j)=(testSample(j)/tt);
end
testSample=RD;
%}
% NOT USING THIS NOW : testSample=NoiseFilter(testSample);
testSample=filter([1 -alpha],1,testSample');
testSample = vertcat(testSample', zeros(pad_length - length(testSample),1));
testSample=reshape(testSample,no_of_frames,frame_length/2);
for k=1:no_of_frames-1
temp=fft(([testSample(k,:) testSample(k+1,:)]'.*WindowValues)');
TSM(k,:)=temp.*(conj(temp));
end
temp=fft(([testSample(no_of_frames,:) zeros(1,frame_length/2)]'.*WindowValues)');
TSM(no_of_frames,:)=temp.*(conj(temp));
for h=1:no_of_frames
for l=1:no_of_coeffs
TLogFiltered(h,l) = log10(sum(TSM(h,:).*HMatrix(l,:)));
end
end
for indexing=1:no_of_frames
TFinalResult(indexing,:)=dct(TLogFiltered(indexing,:));
end
errorArray = zeros(numOfWords*numSamplesOfEachWord,1);
for j=1:numOfWords*numSamplesOfEachWord
errorArray(j)= mean2(abs(TFinalResult(1:no_of_frames,2:KVal) - FVector(1:no_of_frames,2:KVal,j)));
end
%errorArray'
[err, index1] = min(errorArray);
%{
NOT USING THIS NOW: THIS CODE IS TO CALCULATE SECOND MINIMUM ASSUMING
THE FACT THAT NO TWO SAMPLES WILL GIVE SAME ERROR VALUE
[err,index2] = min(errorArray(errorArray>min(errorArray)));
if(index1<=index2)
index2=index2+1;
end
%}
for j=1:numOfWords
if (index1 >= (1+(j-1)*numSamplesOfEachWord) && index1 <= j*numSamplesOfEachWord)
StrVal1=upper(wordArrayPrint{j});
disp(StrVal1);
end
%{
if (index2 >= (1+(j-1)*numSamplesOfEachWord) && index2 <= j*numSamplesOfEachWord)
StrVal2=upper(wordArrayPrint{j});
end
%}
end
%fprintf('%s\t%s\n',StrVal1,StrVal2);
% save('SevenStepTestSample');
toc
end