1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Matplotlib text setup: use the SimHei font (presumably so the Chinese axis
# labels below render) and draw minus signs with the ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the training sheet and show the first ten rows.
df = pd.read_excel('train_set.xlsx')
print("数据预览:")
print(df.head(10))
print("--------------------------")

# Keep only complete rows; every later step works on this filtered frame.
df = df.dropna()

# Target is the 'label' column; 'label' and '行业' are both excluded from
# the model inputs.
y = df['label']
X = df.drop(columns=['label', '行业'])

# Object-dtype columns are the ones that get one-hot encoded.
categorical_features = list(X.select_dtypes(include=['object']).columns)
print(f"需要编码的分类特征: {categorical_features}")
# One-hot encode the categorical columns; categories unseen during fit are
# ignored at transform time, and every other column is passed through as-is.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_features)],
    remainder='passthrough',
)

# Preprocessing and the 100-tree random forest are chained so both are fit
# together on the training split.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

pipeline.fit(X_train, y_train)
# Predict on the held-out split and preview the first ten predicted labels.
y_pred = pipeline.predict(X_test)
print("=====================")
print("预测结果预览:")
print(y_pred[:10])
print("=====================")

# Compute the four scores in one pass; each metric is called with its
# default settings as (y_true, y_pred).
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"准确率 (Accuracy): {accuracy:.4f}")
print(f"查准率 (Precision): {precision:.4f}")
print(f"召回率 (Recall): {recall:.4f}")
print(f"F1 值: {f1:.4f}")
# Aggregate the forest's per-column importances back onto the original input
# columns, so each one-hot-encoded feature is reported as a single value.
rf_model = pipeline.named_steps['classifier']
importances = rf_model.feature_importances_

# Fitted encoder of the 'cat' transformer (first entry of transformers_).
onehot_encoder = pipeline.named_steps['preprocessor'].transformers_[0][1]
if categorical_features:
    encoded_feature_names = onehot_encoder.get_feature_names_out(categorical_features)
    # Source column of every encoded output column: each categorical feature
    # yields one output column per learned category. This avoids parsing the
    # generated "<feature>_<category>" names, which breaks as soon as a
    # column name itself contains an underscore.
    encoded_sources = [
        feature
        for feature, categories in zip(categorical_features, onehot_encoder.categories_)
        for _ in categories
    ]
else:
    # No categorical columns were selected, so the encoder was never fitted
    # and contributes no output columns.
    encoded_feature_names = []
    encoded_sources = []

print("读热编码后的特征名称:", encoded_feature_names)

# remainder='passthrough' forwards EVERY non-encoded column (not only the
# float64/int64 ones), in their original order, after the encoded block.
numerical_features = [col for col in X.columns if col not in categorical_features]

all_feature_names = list(encoded_feature_names) + numerical_features
source_features = encoded_sources + numerical_features

# Fail loudly instead of silently attributing importances to the wrong column.
if len(source_features) != len(importances):
    raise ValueError(
        f"feature name/importance mismatch: {len(source_features)} names "
        f"for {len(importances)} importances"
    )

feature_importance_dict = {feature: 0 for feature in X.columns}
for source, importance in zip(source_features, importances):
    feature_importance_dict[source] += importance

aggregated_feature_names = list(feature_importance_dict.keys())
aggregated_importances = list(feature_importance_dict.values())

print("================")
print("聚合后的特征名称:", aggregated_feature_names)
print("聚合后的特征重要性:", aggregated_importances)
print("================")
# Horizontal bar chart of the aggregated importances, one bar per input column.
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figure", exist_ok=True)

plt.figure(figsize=(10, 8))
sns.barplot(x=aggregated_importances, y=aggregated_feature_names)
plt.title('Feature Importance (Aggregated)')
plt.xlabel('重要性')
plt.ylabel('特征')
plt.tight_layout()
plt.savefig("figure/feature_importance_aggregated.jpg", dpi=500)

print("------------------")

# NOTE(review): only the classifier step is persisted, not the preprocessing.
# Anything loading this file must feed it already-encoded features.
joblib.dump(rf_model, 'random_forest_model.pkl')
print(" 模型已保存为 'random_forest_model.pkl'")
# Score the unlabeled prediction sheet with the trained model and export it.
predict_data = pd.read_excel('predict_set.xlsx')

# Feed the preprocessor exactly the columns it was fitted on, in the same
# order, so extra or reordered columns in the prediction sheet cannot
# affect the transform. A missing training column raises KeyError here.
# NOTE(review): unlike the training data, rows with missing values are not
# dropped here — confirm the prediction sheet is complete.
feature_columns = list(X.columns)
predict_data_transformed = pipeline.named_steps['preprocessor'].transform(
    predict_data[feature_columns]
)

y_pred = rf_model.predict(predict_data_transformed)

# Append the predictions to a copy of the full input sheet (all original
# columns are kept in the export).
df_result = predict_data.copy()
df_result['预测结果'] = y_pred

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./processed_data', exist_ok=True)
df_result.to_csv('./processed_data/predict_result_t2.csv', index=False, encoding='utf-8-sig')
print("预测完成,结果保存为 ./processed_data/predict_result_t2.csv")
|