1. 安装和导入库
首先,确保安装了scikit-learn库:
pip install scikit-learn
导入必要的库:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
# FIX: SimpleImputer lives in sklearn.impute, not sklearn.preprocessing —
# the original import line raises ImportError.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# FIX: load_boston was removed in scikit-learn 1.2 (ethical concerns) and is
# never used in this tutorial; only load_iris is needed.
from sklearn.datasets import load_iris
2. 数据处理
2.1 加载数据
sklearn提供了许多内置数据集,我们以鸢尾花数据集为例:
# Load the bundled iris dataset: 150 samples, 4 numeric features, 3 classes.
iris = load_iris()
X = iris.data
y = iris.target
2.2 数据分割
将数据分割为训练集和测试集:
# Hold out 30% of the samples as a test set; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
2.3 处理缺失值
使用SimpleImputer
处理缺失值:
# Fill missing values with each column's mean.
# Fit on the training split only, then reuse the learned statistics on the
# test split so no test-set information leaks into preprocessing.
# (Iris has no missing values; this is shown for demonstration.)
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
2.4 分类编码
对分类特征进行编码:
encoder = OneHotEncoder()
# Example: one-hot encode a single categorical feature (say, column 0).
# NOTE(review): the one-hot result has one column per category, so it cannot
# be assigned back into a single column in place; keep it as a new array:
#   X_col0 = encoder.fit_transform(X[:, [0]]).toarray()
2.5 特征缩放
使用StandardScaler
标准化特征:
# Standardize features to zero mean / unit variance.
# Fit on the training split only; apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
3. 模型选择与评估
3.1 模型训练与评估
训练和评估多种模型:
# One candidate classifier per entry, keyed by a human-readable display name.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}
# Fit each classifier on the training split and report test-set metrics.
for clf_name, clf in models.items():
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(f'{clf_name} Accuracy: {accuracy_score(y_test, predictions)}')
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
3.2 交叉验证
使用交叉验证评估模型性能:
# Score every classifier with 5-fold cross-validation on the full dataset
# and report the mean fold accuracy.
for clf_name, clf in models.items():
    fold_scores = cross_val_score(clf, X, y, cv=5)
    print(f'{clf_name} Cross-Validation Accuracy: {np.mean(fold_scores)}')
4. 超参数调优
4.1 网格搜索
使用GridSearchCV
进行超参数调优:
# Tune logistic regression: exhaustively try every (C, solver) combination,
# scoring each by 5-fold cross-validation on the training split.
param_grid = {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')
5. 管道与特征工程
5.1 创建管道
创建一个包含标准化和分类器的管道:
# Chain scaling and classification so both are fit together on the training
# split and applied consistently at prediction time.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Pipeline Accuracy: {accuracy}')
5.2 特征选择
使用SelectKBest
进行特征选择:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep only the k=2 features most associated with the target,
# ranked by the ANOVA F-statistic.
selector = SelectKBest(score_func=f_classif, k=2)
X_new = selector.fit_transform(X, y)
5.3 特征缩放
使用MinMaxScaler
进行特征缩放:
# Rescale every feature into the [0, 1] range (min-max normalization).
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
6. 集成学习
6.1 随机森林
使用随机森林进行分类:
# Bagged ensemble of 100 decision trees; seeded for reproducibility.
# (Kept in the name `model` — the ROC and feature-importance sections reuse it.)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
rf_predictions = model.predict(X_test)
print(f'Random Forest Accuracy: {accuracy_score(y_test, rf_predictions)}')
6.2 投票分类器
使用投票分类器结合多种模型:
# Combine three heterogeneous classifiers by majority ('hard') vote.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC()),
    ],
    voting='hard',
)
voting_clf.fit(X_train, y_train)
vote_predictions = voting_clf.predict(X_test)
print(f'Voting Classifier Accuracy: {accuracy_score(y_test, vote_predictions)}')
6.3 梯度提升
使用梯度提升进行分类:
# Sequential ensemble: each of the 100 trees corrects its predecessors' errors.
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_clf.fit(X_train, y_train)
gb_predictions = gb_clf.predict(X_test)
print(f'Gradient Boosting Accuracy: {accuracy_score(y_test, gb_predictions)}')
6.4 AdaBoost
使用AdaBoost进行分类:
# Adaptive boosting: 100 weak learners, each reweighting misclassified samples.
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_clf.fit(X_train, y_train)
ada_predictions = ada_clf.predict(X_test)
print(f'AdaBoost Accuracy: {accuracy_score(y_test, ada_predictions)}')
7. 模型评估与改进
7.1 ROC曲线与AUC
绘制ROC曲线并计算AUC:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
# FIX: iris has 3 classes, but roc_curve/roc_auc_score called this way accept
# only binary labels (they raise "multiclass format is not supported").
# Demonstrate a one-vs-rest curve: treat class 1 as the positive class,
# matching the [:, 1] probability column selected below.
y_test_bin = (y_test == 1).astype(int)
y_prob = model.predict_proba(X_test)[:, 1]  # P(class == 1) from the random forest
fpr, tpr, thresholds = roc_curve(y_test_bin, y_prob)
auc = roc_auc_score(y_test_bin, y_prob)
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
7.2 特征重要性
查看特征的重要性:
# Rank the iris features by the random forest's impurity-based importances,
# highest first.
importance_table = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': model.feature_importances_,
})
importance_table = importance_table.sort_values(by='Importance', ascending=False)
print(importance_table)