# Clean up the rating column.
# Strip the leading `['"` and trailing `"']` wrappers from each raw rating string.
df['rating'] = [re.sub(r'\[\'\"|\"\'\]', '', raw) for raw in df['rating']]
# Replace the empty-list placeholder '[]' with the middle label '还行'.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: an in-place method on a chained selection is not
# guaranteed to write through to `df` (it warns in pandas 2.2 and has no
# effect under Copy-on-Write), which would leave '[]' unmapped below.
df['rating'] = df['rating'].replace('[]', '还行')
# Lookup table from rating label to star-count string.
rating_dict = {
    '很差': '1星',
    '较差': '2星',
    '还行': '3星',
    '推荐': '4星',
    '力荐': '5星',
}
# Labels that are not keys of rating_dict become NaN after this mapping.
df['rating'] = df['rating'].map(rating_dict)
# Prepare the comment text for word segmentation.
# Concatenate every comment into one document, using '。' as the separator.
txt = df['comment_info'].str.cat(sep='。')
# Register custom words with jieba's dictionary, in the order listed.
for custom_word in (
    '黄轩',
    '佟丽娅',
    '男主',
    '女主',
    '跳戏',
    '颜值',
    '吐槽',
    '装逼',
    '国产剧',
):
    jieba.add_word(custom_word)
# Load the stop-word list: one entry per line of stop_words.txt, with
# surrounding whitespace (including the newline) stripped.
with open('stop_words.txt', 'r', encoding='utf-8') as f:
    stop_words = [entry.strip() for entry in f]
# Append extra stop words on top of the ones read from the file.
stop_words += [
    '一部', '一拳', '一行', '10', '啊啊啊', '一句',
    'get', '哈哈哈哈', '哈哈哈', '越来越', '一步',
    '一种', '样子', '几个', '第一集', '一点',
    '第一', '没见', '一集', '第一次', '两个',
    '二代', '真的', '2020', '令人',
]
# Extract the top 100 keywords from the merged comment text. With
# withWeight=True each result is a (word, weight) pair; the empty allowPOS
# tuple applies no part-of-speech filter.
word_num = jieba.analyse.extract_tags(
    txt, topK=100, withWeight=True, allowPOS=()
)
# Drop every pair whose word is a stop word. Filtering happens after the
# top-100 cut, so the result may contain fewer than 100 rows.
word_num_selected = [pair for pair in word_num if pair[0] not in stop_words]
# Tabulate the surviving pairs; the 'num' column holds the extracted weight.
key_words = pd.DataFrame(word_num_selected, columns=['words', 'num'])