<a target="_blank" href="https://www.huoban.com/news/tags-112.html" style="font-weight:bold;">基于</a>CNN和<a target="_blank" href="https://www.huoban.com/news/tags-4752.html" style="font-weight:bold;">MFCC</a>的语音情感识别-伙伴云

基于CNN和MFCC的语音情感识别

网友投稿 1299 2025-03-31

语音情感识别的主要任务是将蕴含在语音中的情感信息提取出来并识别出其类别。目前对于情感的描述主要有两种方法。第一种是基于离散的情感划分，将人类日常生活中广泛使用的基本情感分为愤怒、开心、兴奋、悲伤、厌恶等；另一种是基于连续维度的情感划分，主要通过不同的效价度和激活程度来对不同情感进行区分。

那么作为一个分类任务，特征选择是最关键的一步。本文中使用的语音特征是梅尔倒谱系数，有关梅尔倒谱系数是什么和怎样提取的知识，可参阅文章《Python语音信号处理》。

本文在一定程度上参考了MITESHPUTHRANNEU/Speech-Emotion-Analyzer这个项目，下面开始介绍如何通过卷积神经网络进行语音情感分析。

神经网络结构

使用到的架构其实还是很简单的，如下

数据集

我使用的是CASIA语音情感数据库。CASIA汉语情感语料库由中国科学院自动化所（Institute of Automation, Chinese Academy of Sciences）录制，共包括四个专业发音人，六种情绪：生气（angry）、高兴（happy）、害怕（fear）、悲伤（sad）、惊讶（surprise）和中性（neutral），共9600句不同发音。其中300句是相同文本的，也就是说对相同的文本赋以不同的情感来阅读，这些语料可以用来对比分析不同情感状态下的声学及韵律表现；另外100句是不同文本的，这些文本从字面意思就可以看出其情感归属，便于录音人更准确地表现出情感。

但是完整的CASIA数据集是收费的，因此我只找到了1200句残缺数据集。我把我找到的数据集放在我的网盘上：https://pan.baidu.com/s/1EsRoKaF17Q_3s2t7OMNibQ。

特征提取

我使用librosa模块进行MFCC的提取，提取代码如下。

%matplotlib inline

import librosa

import matplotlib.pyplot as plt

import numpy as np

# Location of the sample WAV file used to demonstrate the feature extraction.
path=r'D:\NLP\dataset\语音情感\test.wav'

# sr=None asks librosa to keep the file's own sampling rate instead of resampling.
y,sr = librosa.load(path,sr=None)

def normalizeVoiceLen(y, normalizedLen):
    """Pad or truncate a 1-D signal so it has exactly ``normalizedLen`` samples.

    Args:
        y: Audio samples as a 1-D sequence/array (an ``(n, 1)`` array is also
            accepted and flattened).
        normalizedLen: Desired number of samples in the result.

    Returns:
        A 1-D NumPy array of length ``normalizedLen``. Shorter inputs are
        right-padded with float32 zeros; longer inputs keep only their first
        ``normalizedLen`` samples.

    Raises:
        ValueError: If ``y`` cannot be flattened to ``len(y)`` samples
            (for example a multi-channel array).
    """
    nframes = len(y)
    # Flatten to a plain vector; this fails loudly for multi-channel input.
    samples = np.reshape(y, [nframes])

    if nframes < normalizedLen:
        # Too short: append silence up to the requested length.
        padding = np.zeros(normalizedLen - nframes, dtype=np.float32)
        return np.concatenate([samples, padding])

    # Long enough: keep only the leading part.
    return samples[:normalizedLen]

def getNearestLen(framelength, sr, candidates=(32, 64, 128, 256, 512, 1024)):
    """Return the candidate FFT size closest to a frame of ``framelength * sr`` samples.

    Args:
        framelength: Frame duration; it is multiplied by ``sr`` to obtain the
            frame size in samples.
        sr: Sampling rate of the signal.
        candidates: Sizes to choose from. Defaults to the powers of two from
            32 to 1024, so any larger frame size is capped at 1024.

    Returns:
        The candidate (as ``int``) whose absolute difference to the frame size
        is smallest. On a tie the candidate listed first wins.

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    framesize = framelength * sr
    # min() returns the first minimal element, so ties resolve by listing order.
    return int(min(candidates, key=lambda size: abs(framesize - size)))

# Number of samples every recording is padded or truncated to.
VOICE_LEN=32000

# Choose the FFT window length: the size returned by getNearestLen for a 0.25 * sr sample frame.
N_FFT=getNearestLen(0.25,sr)

# Bring the signal to exactly VOICE_LEN samples.
# NOTE(review): 32000 samples equals two seconds only at a 16 kHz sampling rate - confirm for this data.
y=normalizeVoiceLen(y,VOICE_LEN)

print(y.shape)

# Extract 13 MFCC coefficients per frame; consecutive frames are a quarter of the FFT window apart.
mfcc_data=librosa.feature.mfcc(y=y, sr=sr,n_mfcc=13,n_fft=N_FFT,hop_length=int(N_FFT/4))

# Visualise the MFCC matrix as an image. The matrix is plotted as returned; no transpose is applied here.
plt.matshow(mfcc_data)

plt.title('MFCC')

上面代码的作用是加载声音，取声音的前两秒进行情感分析。getNearestLen()函数根据声音的采样率确定一个合适的语音帧长用于傅立叶变换。然后通过librosa.feature.mfcc()函数提取mfcc特征，并将其可视化。

下面的代码将数据集中的mfcc特征提取出来，并对每帧的mfcc取平均，将结果保存为文件。

# Extract one MFCC-based feature vector per clip of the corpus and cache the result on disk.

import os

import pickle

counter = 0

fileDirCASIA = r'D:\NLP\dataset\语音情感\CASIA database'

# One list of feature vectors per emotion; the keys are the emotion folder names.
mfccs = {
    emotion: []
    for emotion in ('angry', 'fear', 'happy', 'neutral', 'sad', 'surprise', 'disgust')
}

# Directory layout walked below: <corpus>/<speaker>/<emotion>/<clip>.wav
listdir = os.listdir(fileDirCASIA)
for persondir in listdir:
    # Names containing a dot are treated as files rather than speaker folders.
    if '.' in persondir:
        continue
    emotionDirName = os.path.join(fileDirCASIA, persondir)
    emotiondir = os.listdir(emotionDirName)
    for ed in emotiondir:
        if '.' in ed:
            continue
        filesDirName = os.path.join(emotionDirName, ed)
        files = os.listdir(filesDirName)
        for fileName in files:
            if not fileName.endswith('wav'):
                continue
            counter += 1
            fn = os.path.join(filesDirName, fileName)
            print(str(counter)+fn)
            y, sr = librosa.load(fn, sr=None)
            # Pad or truncate so every clip yields the same number of frames.
            y = normalizeVoiceLen(y, VOICE_LEN)
            mfcc_data = librosa.feature.mfcc(
                y=y, sr=sr, n_mfcc=13, n_fft=N_FFT, hop_length=int(N_FFT/4))
            # Average over axis 0 (the coefficient axis), leaving one value per frame.
            feature = np.mean(mfcc_data, axis=0)
            # setdefault avoids a KeyError when a folder name is not one of the
            # pre-registered emotions above.
            mfccs.setdefault(ed, []).append(feature.tolist())

# Persist the features once, after the whole corpus has been processed.
with open('mfcc_feature_dict.pkl', 'wb') as f:
    pickle.dump(mfccs, f)

数据预处理

代码如下：

%matplotlib inline

import pickle

import os

import librosa

import matplotlib.pyplot as plt

import numpy as np

from keras import layers

from keras import models

from keras import optimizers

from keras.utils import to_categorical

# Load the cached features (a dict mapping emotion name -> list of feature vectors).
# The pickle file is produced locally by the extraction step; do not load
# pickles from untrusted sources.
with open('mfcc_feature_dict.pkl', 'rb') as f:
    mfccs = pickle.load(f)

# Integer class id for each emotion used in training.
emotionDict = {
    'angry': 0,
    'fear': 1,
    'happy': 2,
    'neutral': 3,
    'sad': 4,
    'surprise': 5,
}

# Concatenate the samples emotion by emotion and build the matching label list
# from emotionDict, so labels and mapping cannot drift apart.
data = []
labels = []
for emotion, label in emotionDict.items():
    samples = mfccs[emotion]
    print(len(samples))
    data.extend(samples)
    labels.extend([label] * len(samples))

print(len(data))
print(len(labels))

# Shape the data as (samples, features, 1) and one-hot encode the labels.
data = np.array(data)
data = data.reshape((data.shape[0], data.shape[1], 1))
labels = np.array(labels)
labels = to_categorical(labels)

# Standardise each feature position using statistics computed over all samples.
DATA_MEAN = np.mean(data, axis=0)
DATA_STD = np.std(data, axis=0)
# A position that is constant across all samples has zero deviation; use 1
# there so the division below cannot produce NaN or inf.
DATA_STD[DATA_STD == 0] = 1.0
data -= DATA_MEAN
data /= DATA_STD

接下来保存好参数，模型预测的时候需要用到。

# Bundle the normalisation statistics and the emotion-to-id mapping, then
# write them to disk with pickle.
paraDict = {
    'mean': DATA_MEAN,
    'std': DATA_STD,
    'emotion': emotionDict,
}

with open('mfcc_model_para_dict.pkl', 'wb') as f:
    pickle.dump(paraDict, f)

最后是打乱数据集并划分训练数据和测试数据。

# Fraction of the samples that goes into the training split.
ratioTrain = 0.8
numTrain = int(data.shape[0] * ratioTrain)

# Shuffle samples and labels with the same random order so pairs stay aligned.
permutation = np.random.permutation(data.shape[0])
data, labels = data[permutation], labels[permutation]

# The first numTrain shuffled samples train the model; the rest validate it.
x_train, x_val = data[:numTrain], data[numTrain:]
y_train, y_val = labels[:numTrain], labels[numTrain:]

# Report the shape of each split, one per line.
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape, sep='\n')

定义模型

使用keras定义模型，代码如下：

from keras.utils import plot_model

from keras import regularizers

# Build the network as a linear stack of layers.
model = models.Sequential()

# First 1-D convolution: 256 filters, kernel size 5, ReLU activation.
# NOTE(review): input_shape=(126,1) presumably matches the length of the per-clip feature vectors built earlier - confirm.
model.add(layers.Conv1D(256,5,activation='relu',input_shape=(126,1)))

# Second 1-D convolution: 128 filters, kernel size 5, 'same' padding, ReLU, with an L2 penalty (factor 0.001) on the kernel weights.
model.add(layers.Conv1D(128,5,padding='same',activation='relu',kernel_regularizer=regularizers.l2(0.001)))