Python实战开发及案例分析（15）—— 支持向量机

支持向量机（Support Vector Machine，SVM）是一种监督学习模型，适用于分类和回归任务。SVM 尤其擅长处理小样本、高维度数据，以及复杂的分类任务。其基本思想是找到最佳的超平面将不同类别分开，并最大化两类之间的间隔（Margin）。

支持向量机的工作原理

超平面：在特征空间中将不同类别分开的决策边界。
支持向量：离超平面最近的训练样本，决定超平面的方向和位置。
核函数：
- 线性核：适用于线性可分问题。
- 多项式核：适用于非线性数据。
- 高斯核（RBF 核）：适用于复杂非线性数据。

Python 实现：SVM 分类

我们可以使用 scikit-learn 库来实现 SVM 分类器。

案例分析：鸢尾花分类

Python 实现：

# Import the required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset: feature matrix, class labels and class names
iris = datasets.load_iris()
X, y = iris.data, iris.target
class_names = iris.target_names

# Hold out 30% of the samples for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build and fit a linear-kernel SVM; fit() returns the fitted estimator
svm = SVC(kernel='linear', C=1, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = svm.predict(X_test)

# Per-class precision / recall / F1 report
print("Classification Report:")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# Confusion matrix rendered as an annotated heatmap
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

案例分析：使用非线性核（RBF 核）进行分类

Python 实现：

# Import the required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# SVM with an RBF kernel
svm_rbf = SVC(kernel='rbf', C=1, gamma=0.1, random_state=42)
svm_rbf.fit(X_train, y_train)

# Predictions for the test split
y_pred_rbf = svm_rbf.predict(X_test)

# Classification report
print("\nClassification Report (RBF Kernel):")
report_rbf = classification_report(
    y_test, y_pred_rbf, target_names=iris.target_names
)
print(report_rbf)

# Confusion matrix heatmap; plotting options are collected once for readability
print("Confusion Matrix (RBF Kernel):")
cm_rbf = confusion_matrix(y_test, y_pred_rbf)
heatmap_options = dict(
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
)
sns.heatmap(cm_rbf, **heatmap_options)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Overall accuracy
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
print(f"Accuracy (RBF Kernel): {accuracy_rbf:.2f}")

Python 实现：SVM 回归

支持向量机还可以用于回归问题，称为支持向量回归（Support Vector Regression，SVR）。

案例分析：波士顿房价预测

Python 实现：

# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Load the Boston housing dataset.
# NOTE: datasets.load_boston() was deprecated in scikit-learn 1.0 and removed
# in 1.2, so the data is read from its original source instead (this is the
# loading recipe given in scikit-learn's own deprecation notice).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Every record spans two physical rows: 11 values on the first row, then
# 2 more features plus the target (median house value) on the second row.
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features: the RBF kernel is distance based, so unscaled
# features with large ranges would dominate. The scaler is fit on the
# training split only to avoid leaking test-set statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the SVR model (RBF kernel)
svr_rbf = SVR(kernel='rbf', C=1, gamma=0.1)
svr_rbf.fit(X_train, y_train)

# Predict on the test set
y_pred_rbf = svr_rbf.predict(X_test)

# Report regression metrics
mse = mean_squared_error(y_test, y_pred_rbf)
mae = mean_absolute_error(y_test, y_pred_rbf)
r2 = r2_score(y_test, y_pred_rbf)
print(f"Mean Squared Error (RBF Kernel): {mse:.2f}")
print(f"Mean Absolute Error (RBF Kernel): {mae:.2f}")
print(f"R-squared Score (RBF Kernel): {r2:.2f}")

# Scatter plot of predicted vs. actual values
plt.scatter(y_test, y_pred_rbf, alpha=0.5)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values (SVR - RBF Kernel)")
plt.show()

超参数调优

SVM 的性能依赖于超参数的设置，如 C、gamma 和核函数。我们可以使用 GridSearchCV 进行超参数调优。

Python 实现：SVM 超参数调优

鸢尾花分类案例：

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
# confusion_matrix is needed for the heatmap at the end of this snippet;
# without this import the snippet raises NameError when run on its own.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Parameter grid to search over
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Tune hyperparameters with 5-fold cross-validated grid search
svm = SVC(random_state=42)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameter combination
print("Best Parameters:", grid_search.best_params_)

# Classify the test set with the best estimator found by the search
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Plot the confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

结论

SVM 分类和回归：
SVM 适用于分类和回归问题，能够处理线性和非线性数据。
核函数选择：
根据数据的分布情况选择适合的核函数（线性、RBF、Polynomial 等）。
超参数调优：
通过调整 C、gamma 和 kernel 参数可以显著提高模型性能。
可视化和解释：
使用混淆矩阵、特征重要性等可视化工具解释模型。

使用不同核函数

在 SVM 中，不同的核函数适用于不同类型的数据。

Python 实现：使用不同核函数进行鸢尾花分类

# Import the required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Kernels to compare, and the test accuracy collected for each of them
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
results = {}

for kernel in kernels:
    # Train one SVM per kernel and score it on the held-out split
    svm = SVC(kernel=kernel, C=1, random_state=42).fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[kernel] = accuracy

    print(f"\nClassification Report ({kernel} Kernel):")
    report = classification_report(y_test, y_pred, target_names=iris.target_names)
    print(report)

    # Confusion matrix heatmap for this kernel
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=iris.target_names,
        yticklabels=iris.target_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix ({kernel} Kernel)")
    plt.show()

# Bar chart comparing the accuracy obtained with each kernel
kernel_names = list(results)
kernel_scores = list(results.values())
plt.bar(kernel_names, kernel_scores)
plt.xlabel("Kernel")
plt.ylabel("Accuracy Score")
plt.title("Accuracy Comparison (Different SVM Kernels)")
plt.show()

数据预处理与标准化

支持向量机对数据的尺度敏感，通常需要对数据进行标准化处理，以获得更好的性能。

Python 实现：标准化处理的鸢尾花分类

# Import the required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features: the scaler learns its statistics from the
# training split only and the same transform is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# RBF-kernel SVM trained on the standardized features
svm = SVC(kernel='rbf', C=1, gamma=0.1, random_state=42)
svm.fit(X_train, y_train)

# Predictions for the test split
y_pred = svm.predict(X_test)

# Classification report
print("Classification Report (Standardized Data):")
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print(report)

# Confusion matrix heatmap
print("Confusion Matrix (Standardized Data):")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (Standardized Data): {accuracy:.2f}")

与其他模型的比较

可以将 SVM 与其他模型进行比较，如随机森林、k 近邻等。

Python 实现：鸢尾花分类中 SVM 与其他模型的比较

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn import datasets
import matplotlib.pyplot as plt

# Load the iris dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Candidate models, keyed by the label used in the report and the chart
models = {}
models["SVM"] = SVC(kernel='rbf', C=1, gamma=0.1, random_state=42)
models["RandomForest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["KNeighbors"] = KNeighborsClassifier(n_neighbors=5)

# Fit every model on the same split and record its test accuracy
accuracy_scores = {}
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[name] = accuracy

    print(f"\n{name} Classification Report:")
    report = classification_report(y_test, y_pred, target_names=iris.target_names)
    print(report)

# Bar chart comparing the accuracy of the models
model_names = list(accuracy_scores)
model_scores = list(accuracy_scores.values())
plt.bar(model_names, model_scores)
plt.xlabel("Model")
plt.ylabel("Accuracy Score")
plt.title("Model Comparison on Iris Dataset")
plt.show()

实际应用案例：文本分类

在文本分类中，SVM 通常与 TF-IDF 特征提取结合使用。

Python 实现：新闻文本分类案例

# Import the required libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load four categories of the 20 Newsgroups dataset
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'talk.politics.guns']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)
y = newsgroups.target

# Split the raw documents BEFORE extracting features, so that the TF-IDF
# vocabulary and IDF weights are learned from the training documents only
# (fitting the vectorizer on all documents would leak test-set information).
texts_train, texts_test, y_train, y_test = train_test_split(
    newsgroups.data, y, test_size=0.3, random_state=42
)

# TF-IDF feature extraction: fit on the training texts, reuse for the test texts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Create the SVM model
svm = SVC(kernel='linear', C=1, random_state=42)
svm.fit(X_train, y_train)

# Predict on the test set
y_pred = svm.predict(X_test)

# Print the classification report and plot the confusion matrix
print("\nClassification Report (Text Classification):")
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))

print("\nConfusion Matrix (Text Classification):")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=newsgroups.target_names, yticklabels=newsgroups.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (Text Classification): {accuracy:.2f}")

总结

核函数选择：
不同核函数适用于不同类型的数据，线性核适用于线性数据，而 RBF 核适用于非线性数据。
数据预处理与标准化：
SVM 对数据的尺度敏感，通常需要标准化。
模型比较：
与随机森林、k 近邻等模型比较，SVM 在小样本和高维数据中通常表现更佳。
实际应用案例：
SVM 在文本分类、图像识别和生物信息学等领域具有广泛的应用。

深入了解 SVM

接下来，我们将进一步探讨支持向量机（SVM）的应用和高级技术，包括：

多类分类问题：SVM 原本是二分类模型，如何应用于多类分类问题。
SVM 优化技巧：包括参数调优和核函数技巧。
实际应用案例：手写数字识别。

多类分类问题

SVM 原本是二分类模型，但可以通过以下两种策略扩展到多类问题：

一对多（One-vs-Rest，OvR）：为每个类别构建一个分类器，当前类别样本标记为正类，其他样本标记为负类。
一对一（One-vs-One，OvO）：为每两个类别之间构建一个分类器，共构建 $k(k-1)/2$ 个分类器（$k$ 为类别数），适用于类别较少的情况。

Python 实现：多类分类问题

鸢尾花分类案例

# Import the required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create the SVM model (One-vs-One strategy).
# SVC always trains its multiclass model one-vs-one internally;
# decision_function_shape only selects the shape of decision_function()'s output.
svm_ovo = SVC(kernel='rbf', C=1, gamma=0.1, decision_function_shape='ovo', random_state=42)
svm_ovo.fit(X_train, y_train)
y_pred_ovo = svm_ovo.predict(X_test)

# Print the classification report and plot the confusion matrix
print("\nClassification Report (One-vs-One):")
print(classification_report(y_test, y_pred_ovo, target_names=iris.target_names))

print("\nConfusion Matrix (One-vs-One):")
cm_ovo = confusion_matrix(y_test, y_pred_ovo)
sns.heatmap(cm_ovo, annot=True, fmt="d", cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (One-vs-One)")
plt.show()

# Create the SVM model (One-vs-Rest strategy).
# Setting decision_function_shape='ovr' on SVC would NOT change the training
# strategy, so the binary SVC is wrapped in OneVsRestClassifier, which fits
# one "this class vs. all others" classifier per class.
svm_ovr = OneVsRestClassifier(SVC(kernel='rbf', C=1, gamma=0.1, random_state=42))
svm_ovr.fit(X_train, y_train)
y_pred_ovr = svm_ovr.predict(X_test)

# Print the classification report and plot the confusion matrix
print("\nClassification Report (One-vs-Rest):")
print(classification_report(y_test, y_pred_ovr, target_names=iris.target_names))

print("\nConfusion Matrix (One-vs-Rest):")
cm_ovr = confusion_matrix(y_test, y_pred_ovr)
sns.heatmap(cm_ovr, annot=True, fmt="d", cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (One-vs-Rest)")
plt.show()

SVM 优化技巧

超参数调优：利用 GridSearchCV 等工具对 C 和 gamma 进行调优。
核函数技巧：使用多项式核、自定义核函数等，或者调整现有核函数的参数。

Python 实现：SVM 超参数调优

鸢尾花分类案例：

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
# confusion_matrix is needed for the heatmap at the end of this snippet;
# without this import the snippet raises NameError when run on its own.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Parameter grid to search over
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Tune hyperparameters with 5-fold cross-validated grid search
svm = SVC(random_state=42)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameter combination
print("Best Parameters:", grid_search.best_params_)

# Classify the test set with the best estimator found by the search
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Plot the confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

实际应用案例：手写数字识别

我们可以将 SVM 应用于手写数字识别问题。下面的示例使用 scikit-learn 自带的手写数字数据集（load_digits），其中包含 0 到 9 的 8×8 像素手写数字图像；它与 MNIST 类似，但规模更小，适合快速实验。

Python 实现：手写数字识别案例

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the handwritten digits dataset bundled with scikit-learn
# (8x8 digit images; this is not the MNIST dataset)
digits = datasets.load_digits()
X = digits.data
y = digits.target

# Split into training and test sets BEFORE scaling, so that no test-set
# statistics leak into the preprocessing step
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features: fit on the training split only, then apply the
# same transform to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Tune hyperparameters with 5-fold cross-validated grid search
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}
svm = SVC(random_state=42)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameter combination
print("Best Parameters:", grid_search.best_params_)

# Classify the test set with the best estimator found by the search
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)

# Display labels for the ten digit classes, shared by the report and the heatmap
digit_labels = [str(i) for i in range(10)]

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=digit_labels))

# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Plot the confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=digit_labels, yticklabels=digit_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# The title names the dataset actually used (load_digits), not MNIST
plt.title("Confusion Matrix (Handwritten Digits Recognition)")
plt.show()