Skip to content

基于朴素贝叶斯的评论情感分析

基本步骤

text
1. 获取数据
2. 数据处理

   2.1 取出数据列,对数据进行分析
   2.2 判定评价标准
   2.3 选择停用词
   2.4 处理文本内容,转为标准格式
   2.5 统计词的个数
   2.6 准备训练集和测试集
3. 模型训练
4. 模型评估
5. 模型预测

示例代码

python
import jieba
import pandas as pd

def load_df_data_from_path(file_path: str, encoding: str = 'gbk') -> pd.DataFrame:
    """
    Load a CSV file into a pandas DataFrame.

    :param file_path: path of the CSV file to read
    :param encoding: text encoding used to decode the file; defaults to
        ``'gbk'`` so existing callers keep their behavior
    :return: the DataFrame produced by ``pandas.read_csv``
    """
    return pd.read_csv(file_path, encoding=encoding)


def get_stopwords_by_path(file_path: str):
    """
    Load stop words from a text file that holds one word per line.

    Leading and trailing whitespace is stripped from every line, and lines
    that are empty after stripping are skipped so the result never contains
    empty-string entries.

    :param file_path: path of the stop-word file (UTF-8, with or without BOM)
    :return: list of stop words in file order
    """
    # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM,
    # which would otherwise remain attached to the first stop word.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (raw_line.strip() for raw_line in f)
        return [word for word in stripped_lines if word]

def handle_content(content):
    """
    Segment each text with jieba and join its tokens with commas.

    :param content: iterable of texts to segment
    :return: list holding one comma-joined token string per input text,
        in the same order as the input
    """
    return [",".join(jieba.cut(text, cut_all=False)) for text in content]
python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from naive_bayes.bayes_util import load_df_data_from_path, get_stopwords_by_path, handle_content
# 1. Load the review data set
data = load_df_data_from_path('./data/书籍评价.csv')

# 2.1 Inspect the review text column
content = data["内容"]
print(content)

# 2.2 Rating standard: 1 = positive ("好评"), 0 = negative ("差评")
is_positive = data["评价"] == "好评"
is_negative = data["评价"] == "差评"
data.loc[is_positive, "评论标号"] = 1
data.loc[is_negative, "评论标号"] = 0
# The labels used below are the raw rating strings, not the numeric column.
good_or_bad = data['评价'].values
# sample output: ['好评' '好评' '好评' '好评' '差评' '差评' '差评' '差评' '差评' '好评' '差评' '差评' '差评']
print(good_or_bad)

# 2.3 Load the stop words
stopwords = get_stopwords_by_path('./data/stopwords.txt')
print(stopwords)

# 2.4 Segment the review texts
comment_list = handle_content(content)
print(comment_list)

# 2.5 Count words: CountVectorizer turns the texts into a term-count matrix;
# fit_transform counts how often each term occurs in each text
con = CountVectorizer(stop_words=stopwords)
X = con.fit_transform(comment_list)
name = con.get_feature_names_out()
dense_counts = X.toarray()
print(dense_counts)
print(name)

# 2.6 Split: the first 10 rows are the training set, the rest the test set
x_train, x_test = dense_counts[:10, :], dense_counts[10:, :]
y_train, y_test = good_or_bad[:10], good_or_bad[10:]

# 3. Train the model and predict the held-out rows
mb = MultinomialNB(alpha=1)
mb.fit(x_train, y_train)
y_predict = mb.predict(x_test)

# 4. Show predictions next to the true labels, then the score on the test set
print('预测值:', y_predict)
print('真实值:', y_test)
print(mb.score(x_test, y_test))

Released under the MIT License.