# coding: utf-8
import pandas as pd

# Load the "Joy of Life" (庆余年) Weibo comment dataset (220 rows).
# Pass the path directly so pandas opens and closes the file itself
# (the original left an open file handle behind).
weibo_df = pd.read_csv('庆余年220.csv', encoding='utf8')
print(weibo_df.head())
1
2
3
4
5
6
7
输出结果如下图所示:
第二步,导入大连理工大学中文情感词典。
# coding: utf-8
import pandas as pd

# ----------------------------- Load the dataset ------------------------------
weibo_df = pd.read_csv('庆余年220.csv', encoding='utf8')
print(weibo_df.head())

# ------------------------- Load the DUT emotion lexicon -----------------------
# NOTE:
# 1. The anger tag "NA" is parsed by pandas as a missing value (NaN), so every
#    "NA" in the 情感分类 (emotion class) column of the lexicon file was
#    replaced with "NAU" beforehand.
# 2. The lexicon also carries an auxiliary emotion-class column containing
#    "NA"; the main column was fixed first, then substituted back.
df = pd.read_excel('大连理工大学中文情感词汇本体NAU.xlsx')
print(df.head(10))

# Keep only the columns used downstream:
# word, POS, sense count, sense index, emotion class, intensity, polarity.
df = df[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']]
df.head()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
下图展示了我们导入的词典。
第三步,统计七种情绪分布情况。
# coding: utf-8
import pandas as pd

# ----------------------------- Load the dataset ------------------------------
weibo_df = pd.read_csv('庆余年220.csv', encoding='utf8')
print(weibo_df.head())

# ------------------------- Load the DUT emotion lexicon -----------------------
# NOTE: anger's "NA" tag reads back as NaN, so the lexicon file was patched to
# use "NAU" instead (see the preparation steps above).
df = pd.read_excel('大连理工大学中文情感词汇本体NAU.xlsx')
print(df.head(10))
df = df[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']]
df.head()

# -------------- Bucket lexicon words into the seven coarse emotions -----------
Happy = []
Good = []
Surprise = []
Anger = []
Sad = []
Fear = []
Disgust = []

# Fine-grained lexicon tags -> coarse emotion bucket.
tag_groups = [
    (['PA', 'PE'], Happy),
    (['PD', 'PH', 'PG', 'PB', 'PK'], Good),
    (['PC'], Surprise),
    (['NB', 'NJ', 'NH', 'PF'], Sad),
    (['NI', 'NC', 'NG'], Fear),
    (['NE', 'ND', 'NN', 'NK', 'NL'], Disgust),
    (['NAU'], Anger),  # fix: the original "NA" tag yielded no matches
]

# df.iterrows() iterates over every lexicon row.
for _, row in df.iterrows():
    tag = row['情感分类']
    for tags, bucket in tag_groups:
        if tag in tags:
            bucket.append(row['词语'])

# Rough positive/negative split — tune the rule yourself if needed.
Positive = Happy + Good + Surprise
Negative = Anger + Sad + Fear + Disgust
print('情绪词语列表整理完成')
print(Anger)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
比如输出Anger生气的情绪词语,如下图所示。
用Spyder集成环境打开可以看到情感特征词的具体分布情况。
第四步,增加中文分词和自定义停用词典的代码。
# ------------------------------ Chinese word segmentation ---------------------
import jieba
import time

# User-defined dictionary and stop-word list.
jieba.load_userdict("user_dict.txt")  # custom dictionary
stop_list = pd.read_csv('stop_words.txt',
                        engine='python',
                        encoding='utf-8',
                        delimiter="\n",
                        names=['t'])['t'].tolist()
# Use a set: O(1) membership instead of scanning a list per token.
stop_set = set(stop_list)

def txt_cut(juzi):
    """Segment sentence *juzi* with jieba and drop stop words.

    Optionally add `len(w) > 1` to the filter to discard single characters.
    """
    return [w for w in jieba.lcut(juzi) if w not in stop_set]
# ------------------------------ Chinese word segmentation ---------------------
import jieba
import time

# Custom dictionary and stop words
#jieba.load_userdict("user_dict.txt")
stop_list = pd.read_csv('stop_words.txt',
                        engine='python',
                        encoding='utf-8',
                        delimiter="\n",
                        names=['t'])
# Extract the values of the renamed 't' column.
stop_list = stop_list['t'].tolist()
stop_set = set(stop_list)  # O(1) membership per token

def txt_cut(juzi):
    """Segment *juzi* with jieba and drop stop words (optionally len(w) > 1)."""
    return [w for w in jieba.lcut(juzi) if w not in stop_set]

# ------------------------------- Emotion scoring ------------------------------
def emotion_caculate(text):
    """Count how often words from each emotion list occur in *text*.

    Returns a pd.Series with the token count plus positive/negative totals
    and per-emotion frequencies. Relies on the module-level word lists
    (Positive, Negative, Anger, ... built from the DUT lexicon).
    """
    positive = 0
    negative = 0
    anger = 0
    disgust = 0
    fear = 0
    sad = 0
    surprise = 0
    good = 0
    happy = 0
    wordlist = txt_cut(text)
    # wordlist = jieba.lcut(text)
    for word in set(wordlist):
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Sad:
            sad += freq
        if word in Surprise:
            surprise += freq
        if word in Good:
            good += freq
        if word in Happy:
            happy += freq
    emotion_info = {
        'length': len(wordlist),
        'positive': positive,
        'negative': negative,
        'anger': anger,
        'disgust': disgust,
        'fear': fear,
        'good': good,
        'sadness': sad,
        'surprise': surprise,
        'happy': happy,
    }
    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy']
    return pd.Series(emotion_info, index=indexs)

# Smoke test on one review.
text = """
原著的确更吸引编剧读下去,所以跟《诛仙》系列明显感觉到编剧只看过故事大纲比,这个剧的编剧完整阅读过小说。
配乐活泼俏皮,除了强硬穿越的台词轻微尴尬,最应该尴尬的感情戏反而入戏,
故意模糊了陈萍萍的太监身份、太子跟长公主的暧昧关系,
整体观影感受极好,很期待第二季拍大东山之役。玩弄人心的阴谋阳谋都不狗血,架空的设定能摆脱历史背景,
服装道具能有更自由的发挥空间,特别喜欢庆帝的闺房。以后还是少看国产剧,太长了,
还是精短美剧更适合休闲,追这个太累。王启年真是太可爱了。
"""
res = emotion_caculate(text)
print(res)
# coding: utf-8
import pandas as pd
import jieba
import time
import csv

# ----------------------------- Load the dataset ------------------------------
weibo_df = pd.read_csv('庆余年220.csv', encoding='utf8')
print(weibo_df.head())

# ------------------------- Load the DUT emotion lexicon -----------------------
# NOTE: anger's "NA" tag reads back as NaN, so the lexicon file uses "NAU".
df = pd.read_excel('大连理工大学中文情感词汇本体NAU.xlsx')
print(df.head(10))
df = df[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']]
df.head()

# -------------- Bucket lexicon words into the seven coarse emotions -----------
Happy = []
Good = []
Surprise = []
Anger = []
Sad = []
Fear = []
Disgust = []

tag_groups = [
    (['PA', 'PE'], Happy),
    (['PD', 'PH', 'PG', 'PB', 'PK'], Good),
    (['PC'], Surprise),
    (['NB', 'NJ', 'NH', 'PF'], Sad),
    (['NI', 'NC', 'NG'], Fear),
    (['NE', 'ND', 'NN', 'NK', 'NL'], Disgust),
    (['NAU'], Anger),  # fix: the original "NA" tag yielded no matches
]
for _, row in df.iterrows():
    for tags, bucket in tag_groups:
        if row['情感分类'] in tags:
            bucket.append(row['词语'])

# Rough positive/negative split — tune the rule yourself if needed.
Positive = Happy + Good + Surprise
Negative = Anger + Sad + Fear + Disgust
print('情绪词语列表整理完成')
print(Anger)

# ------------------------------ Chinese word segmentation ---------------------
# Custom dictionary and stop words
#jieba.load_userdict("user_dict.txt")
stop_list = pd.read_csv('stop_words.txt',
                        engine='python',
                        encoding='utf-8',
                        delimiter="\n",
                        names=['t'])
stop_list = stop_list['t'].tolist()
stop_set = set(stop_list)  # O(1) membership per token

def txt_cut(juzi):
    """Segment *juzi* with jieba and drop stop words (optionally len(w) > 1)."""
    return [w for w in jieba.lcut(juzi) if w not in stop_set]

# ------------------------------- Feature output -------------------------------
c = open("Emotion_features.csv", "a+", newline='', encoding='gb18030')
writer = csv.writer(c)
writer.writerow(["Emotion", "Word", "Num"])

def emotion_caculate(text):
    """Score *text* against the seven emotion lists and log matched words.

    Every (emotion, word, frequency) hit is appended as ONE row to
    Emotion_features.csv. Returns a pd.Series of counts.

    Fix vs. the original: a single row list was reused across emotion
    categories, so a word belonging to several categories produced one
    malformed, ever-growing CSV row. Each category now writes its own row.
    """
    counts = {'positive': 0, 'negative': 0, 'anger': 0, 'disgust': 0,
              'fear': 0, 'sad': 0, 'surprise': 0, 'good': 0, 'happy': 0}
    emotion_sets = [('anger', Anger), ('disgust', Disgust), ('fear', Fear),
                    ('sad', Sad), ('surprise', Surprise), ('good', Good),
                    ('happy', Happy)]
    wordlist = txt_cut(text)
    # wordlist = jieba.lcut(text)
    for word in set(wordlist):
        freq = wordlist.count(word)
        if word in Positive:
            counts['positive'] += freq
        if word in Negative:
            counts['negative'] += freq
        for name, words in emotion_sets:
            if word in words:
                counts[name] += freq
                writer.writerow([name, word, freq])
    emotion_info = {
        'length': len(wordlist),
        'positive': counts['positive'],
        'negative': counts['negative'],
        'anger': counts['anger'],
        'disgust': counts['disgust'],
        'fear': counts['fear'],
        'good': counts['good'],
        'sadness': counts['sad'],
        'surprise': counts['surprise'],
        'happy': counts['happy'],
    }
    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy']
    return pd.Series(emotion_info, index=indexs)

# ------------------------------- Run & export ---------------------------------
start = time.time()
emotion_df = weibo_df['review'].apply(emotion_caculate)
end = time.time()
print(end - start)
print(emotion_df.head())

output_df = pd.concat([weibo_df, emotion_df], axis=1)
output_df.to_csv('庆余年220_emotion.csv', encoding='utf_8_sig', index=False)
print(output_df.head())
c.close()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
输出结果如下图所示:
3.词云分析
接着通过Pandas获取不同情绪的特征词及数量,代码如下所示。
# coding: utf-8import csv
import pandas as pd
#读取数据
f =open('Emotion_features.csv')
data = pd.read_csv(f)print(data.head())#统计结果
groupnum = data.groupby(['Emotion']).size()print(groupnum)print("")#分组统计for groupname,grouplist in data.groupby('Emotion'):print(groupname)print(grouplist)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
输出结果如下图所示:
Emotion Word Num
0     good   人心    1
1     good   极好    1
2     good   活泼    1
3  disgust   强硬    1
4  disgust   尴尬    2
Emotion
anger 2
disgust 208
fear 9
good 254
happy 39
sad 42
surprise 11
dtype: int64
anger
Emotion Word Num
133  anger  气愤    1
382  anger  报仇    3
disgust
Emotion Word Num
3    disgust   强硬    1
4    disgust   尴尬    2
8    disgust   模糊    1
..       ...    ...  ...
558  disgust   紧张    1
560  disgust   紧张    1
561  disgust   刺激    1

[208 rows x 3 columns]
fear
Emotion Word Num
93   fear   鸿门宴    1
111  fear    吓人    1
148  fear    可怕    1
170  fear  没头苍蝇    1
211  fear    厉害    1
290  fear  刀光剑影    1
292  fear    忌惮    1
342  fear  无时无刻    1
559  fear    紧张    1
good
Emotion Word Num
0 good 人心 11 good 极好 1...........
# coding: utf-8
import csv
import pandas as pd
import operator

# ------------------------------- Statistics -----------------------------------
data = pd.read_csv('Emotion_features.csv')
print(data.head())

# Rows per emotion.
groupnum = data.groupby(['Emotion']).size()
print(groupnum)
print("")

# Per-emotion breakdown.
for groupname, grouplist in data.groupby('Emotion'):
    print(groupname)
    print(grouplist)

# Build word-cloud data: word = [('A',10), ('B',9), ('C',8)] — list of tuples.
# Fix: the original tested `data['Emotion'][i] in "sad"` (substring check),
# although its own comment said 相等 (equality); use == instead. The O(n^2)
# manual search over parallel words/counts lists is replaced by a dict.
sad_counts = {}
i = 0
while i < len(data):
    if data['Emotion'][i] == "sad":
        word = data['Word'][i]
        sad_counts[word] = sad_counts.get(word, 0) + data['Num'][i]
    i = i + 1

# NOTE: counts are multiplied by 5 because the dataset is small (plot scaling).
result = [(word, int(count * 5)) for word, count in sad_counts.items()]
print(result)

# ------------------------------- Word cloud -----------------------------------
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

def wordcloud_base() -> WordCloud:
    """Build the pyecharts word-cloud chart for the aggregated words."""
    c = (
        WordCloud()
        .add("", result, word_size_range=[5, 200], shape=SymbolType.ROUND_RECT)
        .set_global_opts(title_opts=opts.TitleOpts(title='情绪词云图'))
    )
    return c

# Render to an HTML file.
wordcloud_base().render('情绪词云图.html')
# -*- coding: utf-8 -*-
from snownlp import SnowNLP
import codecs
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# SnowNLP sentiment score (0..1, higher = more positive) for every review.
data = pd.read_csv('庆余年220.csv', encoding='utf8')
sentimentslist = []
for review in data['review']:
    s = SnowNLP(review)
    print(s.sentiments)
    sentimentslist.append(s.sentiments)

# Shift scores from [0, 1] to [-0.5, 0.5] so positive/negative straddle zero.
result = [score - 0.5 for score in sentimentslist]

# Fix: the x-axis length was hard-coded to 220 and would crash (shape
# mismatch) on any other dataset size; derive it from the data instead.
plt.plot(np.arange(len(result)), result, 'k-')
plt.xlabel('Number')
plt.ylabel('Sentiment')
plt.title('Analysis of Sentiments')
plt.show()