Classification Models

The general steps for training a model:

  1. Preprocess the dataset
  2. Split the dataset (training set and test set)
  3. Train the model
  4. Test the model
  5. Compute evaluation metrics
  6. Save the model
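As a compact orientation, here is a minimal sklearn-style skeleton of these six steps. It reuses the train_set.xlsx / label names that appear later in this post and assumes purely numeric features; the full pipeline below handles categorical encoding properly.

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_excel('train_set.xlsx').dropna()        # 1. preprocess the dataset
X, y = df.drop(columns=['label']), df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)            # 2. split into train/test sets
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)                          # 3. train the model
y_pred = model.predict(X_test)                       # 4. test the model
print(accuracy_score(y_test, y_pred))                # 5. compute evaluation metrics
joblib.dump(model, 'model.pkl')                      # 6. save the model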

Dataset Preprocessing

# Check for missing values
print("Missing value counts:")
print(df.isnull().sum())
# Data preprocessing
import pandas as pd
from sklearn.impute import SimpleImputer

# Fill numeric features with the mean
num_imputer = SimpleImputer(strategy='mean')
df_num = df.select_dtypes(include=['float64', 'int64'])
df_num_imputed = num_imputer.fit_transform(df_num)

# Fill categorical features with the mode
cat_imputer = SimpleImputer(strategy='most_frequent')
df_cat = df.select_dtypes(include=['object'])
df_cat_imputed = cat_imputer.fit_transform(df_cat)

# Merge the imputed data back into one DataFrame
df = pd.concat([pd.DataFrame(df_num_imputed, columns=df_num.columns),
                pd.DataFrame(df_cat_imputed, columns=df_cat.columns)], axis=1)
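One caveat worth noting: fit_transform returns plain NumPy arrays, so the rebuilt DataFrame loses the original dtypes (numeric columns come back as object). A minimal sketch for restoring them after the concat:

# Restore the numeric dtypes lost in the NumPy round-trip
df[df_num.columns] = df[df_num.columns].apply(pd.to_numeric)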
# Identify the numeric and categorical features
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),            # fill numeric features with the mean
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_features)  # fill categorical features with the mode
    ]
)

# Apply the preprocessing; ColumnTransformer outputs columns in transformer
# order (numeric first, then categorical), so the column list below matches
df_imputed = pd.DataFrame(preprocessor.fit_transform(df),
                          columns=numerical_features.tolist() + categorical_features.tolist())
# Drop samples that contain missing values
df = df.dropna()
# Convert boolean features to numeric
df_bool = df.select_dtypes(include=['bool'])
df[df_bool.columns] = df_bool.astype(int)

Dataset: anonymized personal information + employment information
Goal: predict the employment status from the personal information

Random Forest

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import joblib
import seaborn as sns

# Configure matplotlib to render Chinese text (the feature names are Chinese)
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font
plt.rcParams['axes.unicode_minus'] = False    # fix rendering of the minus sign

# Load the data
df = pd.read_excel('train_set.xlsx')

# Inspect the data
print("Data preview:")
print(df.head(10))

print("--------------------------")

# Drop samples that contain missing values
df = df.dropna()

# Separate features and target variable
X = df.drop(columns=['label', '行业'])  # features
y = df['label']                         # target variable

# Identify the non-numeric features that need encoding
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical features to encode: {categorical_features}")

# Build the preprocessing + model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # one-hot encode categorical features
    ],
    remainder='passthrough'  # pass numeric features through unchanged
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),                                             # data preprocessing
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))   # random forest model
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)
print("=====================")
print("Prediction preview:")
print(y_pred[:10])
print("=====================")

# Compute evaluation metrics (the default average='binary' assumes a binary label)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 score: {f1:.4f}")


# Extract the trained random forest from the pipeline
rf_model = pipeline.named_steps['classifier']

# Get the feature importances
importances = rf_model.feature_importances_

# Recover the encoded feature names from the OneHotEncoder
onehot_encoder = pipeline.named_steps['preprocessor'].transformers_[0][1]
encoded_feature_names = onehot_encoder.get_feature_names_out(categorical_features)

print("One-hot encoded feature names:", encoded_feature_names)

# Numeric feature names
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Combine all feature names (one-hot encoded features + numeric features)
all_feature_names = list(encoded_feature_names) + numerical_features

# Initialize a dict for aggregating importances back onto the original features
feature_importance_dict = {feature: 0 for feature in X.columns}

# Aggregate the importances of the one-hot encoded features
for i, feature in enumerate(all_feature_names):
    # Recover the original feature name; one-hot names look like 'column_value'
    # (this assumes the original column names contain no '_')
    original_feature = feature.split('_')[0] if '_' in feature else feature
    if original_feature in feature_importance_dict:
        feature_importance_dict[original_feature] += importances[i]

# Extract the aggregated feature names and importances
aggregated_feature_names = list(feature_importance_dict.keys())
aggregated_importances = list(feature_importance_dict.values())

print("================")
print("Aggregated feature names:", aggregated_feature_names)
print("Aggregated feature importances:", aggregated_importances)
print("================")

# Plot a bar chart
plt.figure(figsize=(10, 8))
sns.barplot(x=aggregated_importances, y=aggregated_feature_names)
plt.title('Feature Importance (Aggregated)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig("figure/feature_importance_aggregated.jpg", dpi=500)  # assumes the figure/ directory exists

print("------------------")

# Save the trained model
joblib.dump(rf_model, 'random_forest_model.pkl')
print("Model saved as 'random_forest_model.pkl'")


predict_data = pd.read_excel('predict_set.xlsx')

# Drop the columns that were excluded during training, if present, so the
# pipeline sees exactly the feature columns it was fitted on
predict_X = predict_data.drop(columns=['label', '行业'], errors='ignore')

# Predict through the full pipeline (preprocessing + classifier)
y_pred = pipeline.predict(predict_X)

# Save the results
df_result = predict_data.copy()
df_result['预测结果'] = y_pred
df_result.to_csv('./processed_data/predict_result_t2.csv', index=False, encoding='utf-8-sig')
print("Prediction finished; results saved to ./processed_data/predict_result_t2.csv")

Optimization:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Wrap the preprocessor from above and the forest in one pipeline, so the
# raw categorical columns in X_train get encoded inside every CV fold
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter grid (keys are prefixed with the pipeline step name)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2']
}

# Grid search
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best parameters
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# The pipeline refitted with the best parameters
best_rf = grid_search.best_estimator_
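The cross-validation score can be optimistic, so it is worth checking the tuned model on the held-out split as well. A short follow-up, assuming the X_test/y_test from the earlier split:

from sklearn.metrics import f1_score

# Score the tuned pipeline on data the grid search never saw
y_pred_best = best_rf.predict(X_test)
print("Held-out test F1:", f1_score(y_test, y_pred_best))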

Directions for optimization:

  1. Tune hyperparameters: optimize the random forest's hyperparameters via grid search or random search.
  2. Feature selection: keep the important features, guided by the feature importances.
  3. Handle class imbalance: rebalance the classes via the class_weight parameter.
  4. Cross-validation: evaluate the model with cross-validation to guard against overfitting.
  5. More evaluation metrics: report the confusion matrix and other metrics (a sketch of points 3 and 5 follows this list).
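A minimal sketch of points 3 and 5, reusing the y_test/y_pred variables from the run above:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Point 5: confusion matrix plus per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Point 3: weight classes inversely to their frequency; this classifier
# would replace the 'classifier' step in the pipeline above
rf_balanced = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)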

Recommendation Matching

Recommend the best-matching jobs based on the similarity between resume text and job-posting text, and save the recommendations to a CSV file.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Imported libraries:
pandas: data handling and manipulation
TfidfVectorizer: converts text data into TF-IDF feature vectors
cosine_similarity: computes the cosine similarity between resume texts and job texts

df_resume = df[["教育程度", "专业", "毕业学校"]].copy()

def map_education_level(code):
    education_map = { ... }  # education-code → label mapping table (elided)
    try:
        return education_map.get(int(code), "未知")
    except (TypeError, ValueError):
        return "未知"

# Convert the numeric education code to its text label
df_resume.loc[:, '教育程度_文字'] = df_resume['教育程度'].apply(map_education_level)

# Join all fields into a single resume text string
df_resume.loc[:, '简历文本'] = df_resume.fillna('').astype(str).agg(' '.join, axis=1)
df_resume = df_resume[["教育程度", "专业", "毕业学校", "简历文本"]]
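The matching step itself is not shown above. A minimal sketch, under the assumption that the job postings live in a df_job DataFrame with a 岗位文本 text column (both names hypothetical, as is the output filename):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit one shared vocabulary over resumes and jobs so the vectors are comparable
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(df_resume['简历文本'].tolist() + df_job['岗位文本'].tolist())

n_resumes = len(df_resume)
resume_vecs, job_vecs = tfidf[:n_resumes], tfidf[n_resumes:]

# Cosine similarity between every resume and every job
sim = cosine_similarity(resume_vecs, job_vecs)

# Recommend the most similar job for each resume and save the result
df_result = df_resume.copy()
df_result['推荐岗位'] = df_job['岗位文本'].iloc[sim.argmax(axis=1)].values
df_result.to_csv('recommend_result.csv', index=False, encoding='utf-8-sig')

The default tokenizer splits on whitespace, which works here because the resume fields were joined with spaces; free-form Chinese text would first need word segmentation (e.g. with jieba).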

