Classification Models

The general steps for training a model:

  1. Preprocess the dataset
  2. Split the dataset (training set and test set)
  3. Train the model
  4. Test the model
  5. Compute evaluation metrics
  6. Save the model
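As a compact orientation, here is a minimal sklearn-style skeleton of these six steps. It reuses the train_set.xlsx / label names that appear later in this post and assumes purely numeric features; the full pipeline below handles categorical encoding properly.

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_excel('train_set.xlsx').dropna()        # 1. preprocess the dataset
X, y = df.drop(columns=['label']), df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)            # 2. split into train/test sets
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)                          # 3. train the model
y_pred = model.predict(X_test)                       # 4. test the model
print(accuracy_score(y_test, y_pred))                # 5. compute evaluation metrics
joblib.dump(model, 'model.pkl')                      # 6. save the model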

Dataset Preprocessing

# Check for missing values
print("Missing value counts:")
print(df.isnull().sum())
# Data preprocessing
import pandas as pd
from sklearn.impute import SimpleImputer

# Fill numeric features with the mean
num_imputer = SimpleImputer(strategy='mean')
df_num = df.select_dtypes(include=['float64', 'int64'])
df_num_imputed = num_imputer.fit_transform(df_num)

# Fill categorical features with the mode
cat_imputer = SimpleImputer(strategy='most_frequent')
df_cat = df.select_dtypes(include=['object'])
df_cat_imputed = cat_imputer.fit_transform(df_cat)

# Merge the imputed data back into one DataFrame
df = pd.concat([pd.DataFrame(df_num_imputed, columns=df_num.columns),
                pd.DataFrame(df_cat_imputed, columns=df_cat.columns)], axis=1)
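One caveat worth noting: fit_transform returns plain NumPy arrays, so the rebuilt DataFrame loses the original dtypes (numeric columns come back as object). A minimal sketch for restoring them after the concat:

# Restore the numeric dtypes lost in the NumPy round-trip
df[df_num.columns] = df[df_num.columns].apply(pd.to_numeric)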
# Identify the numeric and categorical features
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),            # fill numeric features with the mean
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_features)  # fill categorical features with the mode
    ]
)

# Apply the preprocessing; ColumnTransformer outputs columns in transformer
# order (numeric first, then categorical), so the column list below matches
df_imputed = pd.DataFrame(preprocessor.fit_transform(df),
                          columns=numerical_features.tolist() + categorical_features.tolist())
# Drop samples that contain missing values
df = df.dropna()
# Convert boolean features to numeric
df_bool = df.select_dtypes(include=['bool'])
df[df_bool.columns] = df_bool.astype(int)

Dataset: anonymized personal information + employment information
Goal: predict the employment status from the personal information

Random Forest

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import joblib
import seaborn as sns

# Configure matplotlib to render Chinese text (the feature names are Chinese)
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font
plt.rcParams['axes.unicode_minus'] = False    # fix rendering of the minus sign

# Load the data
df = pd.read_excel('train_set.xlsx')

# Inspect the data
print("Data preview:")
print(df.head(10))

print("--------------------------")

# Drop samples that contain missing values
df = df.dropna()

# Separate features and target variable
X = df.drop(columns=['label', '行业'])  # features
y = df['label']                         # target variable

# Identify the non-numeric features that need encoding
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical features to encode: {categorical_features}")

# Build the preprocessing + model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # one-hot encode categorical features
    ],
    remainder='passthrough'  # pass numeric features through unchanged
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),                                             # data preprocessing
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))   # random forest model
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)
print("=====================")
print("Prediction preview:")
print(y_pred[:10])
print("=====================")

# Compute evaluation metrics (the default average='binary' assumes a binary label)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 score: {f1:.4f}")


# Extract the trained random forest from the pipeline
rf_model = pipeline.named_steps['classifier']

# Get the feature importances
importances = rf_model.feature_importances_

# Recover the encoded feature names from the OneHotEncoder
onehot_encoder = pipeline.named_steps['preprocessor'].transformers_[0][1]
encoded_feature_names = onehot_encoder.get_feature_names_out(categorical_features)

print("One-hot encoded feature names:", encoded_feature_names)

# Numeric feature names
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Combine all feature names (one-hot encoded features + numeric features)
all_feature_names = list(encoded_feature_names) + numerical_features

# Initialize a dict for aggregating importances back onto the original features
feature_importance_dict = {feature: 0 for feature in X.columns}

# Aggregate the importances of the one-hot encoded features
for i, feature in enumerate(all_feature_names):
    # Recover the original feature name; one-hot names look like 'column_value'
    # (this assumes the original column names contain no '_')
    original_feature = feature.split('_')[0] if '_' in feature else feature
    if original_feature in feature_importance_dict:
        feature_importance_dict[original_feature] += importances[i]

# Extract the aggregated feature names and importances
aggregated_feature_names = list(feature_importance_dict.keys())
aggregated_importances = list(feature_importance_dict.values())

print("================")
print("Aggregated feature names:", aggregated_feature_names)
print("Aggregated feature importances:", aggregated_importances)
print("================")

# Plot a bar chart
plt.figure(figsize=(10, 8))
sns.barplot(x=aggregated_importances, y=aggregated_feature_names)
plt.title('Feature Importance (Aggregated)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig("figure/feature_importance_aggregated.jpg", dpi=500)  # assumes the figure/ directory exists

print("------------------")

# Save the trained model
joblib.dump(rf_model, 'random_forest_model.pkl')
print("Model saved as 'random_forest_model.pkl'")


predict_data = pd.read_excel('predict_set.xlsx')

# Drop the columns that were excluded during training, if present, so the
# pipeline sees exactly the feature columns it was fitted on
predict_X = predict_data.drop(columns=['label', '行业'], errors='ignore')

# Predict through the full pipeline (preprocessing + classifier)
y_pred = pipeline.predict(predict_X)

# Save the results
df_result = predict_data.copy()
df_result['预测结果'] = y_pred
df_result.to_csv('./processed_data/predict_result_t2.csv', index=False, encoding='utf-8-sig')
print("Prediction finished; results saved to ./processed_data/predict_result_t2.csv")

Optimization:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Wrap the preprocessor from above and the forest in one pipeline, so the
# raw categorical columns in X_train get encoded inside every CV fold
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter grid (keys are prefixed with the pipeline step name)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2']
}

# Grid search
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best parameters
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# The pipeline refitted with the best parameters
best_rf = grid_search.best_estimator_
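The cross-validation score can be optimistic, so it is worth checking the tuned model on the held-out split as well. A short follow-up, assuming the X_test/y_test from the earlier split:

from sklearn.metrics import f1_score

# Score the tuned pipeline on data the grid search never saw
y_pred_best = best_rf.predict(X_test)
print("Held-out test F1:", f1_score(y_test, y_pred_best))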

Directions for optimization:

  1. Tune hyperparameters: optimize the random forest's hyperparameters via grid search or random search.
  2. Feature selection: keep the important features, guided by the feature importances.
  3. Handle class imbalance: rebalance the classes via the class_weight parameter.
  4. Cross-validation: evaluate the model with cross-validation to guard against overfitting.
  5. More evaluation metrics: report the confusion matrix and other metrics (a sketch of points 3 and 5 follows this list).
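A minimal sketch of points 3 and 5, reusing the y_test/y_pred variables from the run above:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Point 5: confusion matrix plus per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Point 3: weight classes inversely to their frequency; this classifier
# would replace the 'classifier' step in the pipeline above
rf_balanced = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)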

Recommendation Matching

Recommend the best-matching jobs based on the similarity between resume text and job-posting text, and save the recommendations to a CSV file.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Imported libraries:
pandas: data handling and manipulation
TfidfVectorizer: converts text data into TF-IDF feature vectors
cosine_similarity: computes the cosine similarity between resume texts and job texts

df_resume = df[["教育程度", "专业", "毕业学校"]].copy()

def map_education_level(code):
    education_map = { ... }  # education-code → label mapping table (elided)
    try:
        return education_map.get(int(code), "未知")
    except (TypeError, ValueError):
        return "未知"

# Convert the numeric education code to its text label
df_resume.loc[:, '教育程度_文字'] = df_resume['教育程度'].apply(map_education_level)

# Join all fields into a single resume text string
df_resume.loc[:, '简历文本'] = df_resume.fillna('').astype(str).agg(' '.join, axis=1)
df_resume = df_resume[["教育程度", "专业", "毕业学校", "简历文本"]]
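The matching step itself is not shown above. A minimal sketch, under the assumption that the job postings live in a df_job DataFrame with a 岗位文本 text column (both names hypothetical, as is the output filename):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit one shared vocabulary over resumes and jobs so the vectors are comparable
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(df_resume['简历文本'].tolist() + df_job['岗位文本'].tolist())

n_resumes = len(df_resume)
resume_vecs, job_vecs = tfidf[:n_resumes], tfidf[n_resumes:]

# Cosine similarity between every resume and every job
sim = cosine_similarity(resume_vecs, job_vecs)

# Recommend the most similar job for each resume and save the result
df_result = df_resume.copy()
df_result['推荐岗位'] = df_job['岗位文本'].iloc[sim.argmax(axis=1)].values
df_result.to_csv('recommend_result.csv', index=False, encoding='utf-8-sig')

The default tokenizer splits on whitespace, which works here because the resume fields were joined with spaces; free-form Chinese text would first need word segmentation (e.g. with jieba).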

