数据挖掘与分析——深度学习算法应用

1. TensorFlow框架的基本使用（5-1）

获取训练数据

构建一个简单的线性模型 y = W·x + b，其中 W、b 为参数，真实值取 W=2，b=1；运用 tf.random.normal() 产生 1000 个随机数作为输入 x，再叠加随机噪声得到输出 y。

用matplotlib库，用蓝色绘制训练数据。

定义模型

通过样本数据的散点图可以判断，数据呈线性规律变化，因此可以建立一个线性模型，即 y = W·x + b。把该线性模型定义为一个简单的类，里面封装了变量和计算，变量用 tf.Variable() 定义。

# Step 2: define the model
class LinearModel(tf.Module):
    """Scalar linear model ``y = W * x + b`` with trainable ``W`` and ``b``."""

    def __init__(self, name=None):
        """Create the two scalar parameters.

        Args:
            name: Optional module name forwarded to ``tf.Module``.
        """
        # tf.Module expects subclasses to call its initialiser; it sets up
        # the module name and name scope.
        super().__init__(name=name)
        # Both parameters start from small random values (stddev 0.1).
        self.W = tf.Variable(tf.random.normal(shape=(), stddev=0.1))
        self.b = tf.Variable(tf.random.normal(shape=(), stddev=0.1))

    def __call__(self, x):
        """Return the prediction ``W * x + b`` for input ``x``."""
        return self.W * x + self.b

定义损失函数

损失函数是衡量给定输入的模型输出与期望输出的匹配程度，采用均方误差（L2范数损失函数）。

# Step 3: define the loss function
def loss(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``."""
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_error)

模型训练

运用数据和模型来训练得到模型的变量（W和b），观察W和b的变化（使用matplotlib绘制W和b的变化情况曲线）。


# Step 4: fit W and b with plain gradient descent
model = LinearModel()
learning_rate = 0.1
epochs = 50
history_W = []
history_b = []

for epoch in range(epochs):
    # Run the forward pass under the tape so the loss can be differentiated.
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        current_loss = loss(outputs, predictions)
    dW, db = tape.gradient(current_loss, [model.W, model.b])
    # Gradient-descent step: parameter -= learning_rate * gradient.
    for parameter, gradient in ((model.W, dW), (model.b, db)):
        parameter.assign_sub(learning_rate * gradient)
    # Record the updated values so their trajectory can be plotted.
    history_W.append(model.W.numpy())
    history_b.append(model.b.numpy())

可视化

# Plot how W and b change over the epochs
for series, series_label in ((history_W, 'W'), (history_b, 'b')):
    plt.plot(series, label=series_label)
plt.xlabel('Epochs')
plt.ylabel('Values')
plt.legend()
plt.show()

完整代码：

import tensorflow as tf
import matplotlib.pyplot as plt

# Step 1: generate the training data
num_samples = 1000
true_W = 2
true_b = 1
sample_shape = (num_samples,)
inputs = tf.random.normal(shape=sample_shape)
noise = tf.random.normal(shape=sample_shape)
# Noisy observations of the line y = true_W * x + true_b.
outputs = inputs * true_W + true_b + noise

# Plot the training data in blue
plt.scatter(inputs, outputs, c='b', label='Training data')
plt.xlabel('Input')
plt.ylabel('Output')
plt.legend()
plt.show()

# Step 2: define the model
class LinearModel(tf.Module):
    """Scalar linear model ``y = W * x + b`` with trainable ``W`` and ``b``."""

    def __init__(self, name=None):
        """Create the two scalar parameters.

        Args:
            name: Optional module name forwarded to ``tf.Module``.
        """
        # tf.Module expects subclasses to call its initialiser; it sets up
        # the module name and name scope.
        super().__init__(name=name)
        # Both parameters start from small random values (stddev 0.1).
        self.W = tf.Variable(tf.random.normal(shape=(), stddev=0.1))
        self.b = tf.Variable(tf.random.normal(shape=(), stddev=0.1))

    def __call__(self, x):
        """Return the prediction ``W * x + b`` for input ``x``."""
        return self.W * x + self.b

# Step 3: define the loss function
def loss(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``."""
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_error)

# Step 4: fit W and b with plain gradient descent
model = LinearModel()
learning_rate = 0.1
epochs = 50
history_W = []
history_b = []

for epoch in range(epochs):
    # Run the forward pass under the tape so the loss can be differentiated.
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        current_loss = loss(outputs, predictions)
    dW, db = tape.gradient(current_loss, [model.W, model.b])
    # Gradient-descent step: parameter -= learning_rate * gradient.
    for parameter, gradient in ((model.W, dW), (model.b, db)):
        parameter.assign_sub(learning_rate * gradient)
    # Record the updated values so their trajectory can be plotted.
    history_W.append(model.W.numpy())
    history_b.append(model.b.numpy())

# Plot how W and b change over the epochs
for series, series_label in ((history_W, 'W'), (history_b, 'b')):
    plt.plot(series, label=series_label)
plt.xlabel('Epochs')
plt.ylabel('Values')
plt.legend()
plt.show()

2. 多层神经网络分类（5-2）

数据获取与预处理

MNIST 数据集来自美国国家标准与技术研究所, National Institute of Standards and Technology (NIST). 训练集 (training set) 由来自 250 个不同人手写的数字构成, 其中 50% 是高中学生, 50% 来自人口普查局 (the Census Bureau) 的工作人员. 测试集(test set) 也是同样比例的手写数字数据。

每张图像的大小都是28x28像素。MNIST数据集有60000张图像用于训练和10000张图像用于测试，其中每张图像都被标记了对应的数字（0-9）。

加载数据集
查看数据集
归一化处理

# Load the MNIST dataset
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Show the first training image together with its label.
# The title must read the label array (y_train), not the image array.
plt.imshow(x_train[0], cmap='gray')
plt.title(f"Label: {y_train[0]}")
plt.axis('off')
plt.show()

# Show the first test image together with its label (from y_test).
plt.imshow(x_test[0], cmap='gray')
plt.title(f"Label: {y_test[0]}")
plt.axis('off')
plt.show()

# Normalise the inputs by dividing the pixel values by 255
x_train = x_train / 255.0
x_test = x_test / 255.0

模型构建

模型定义

编译模型

输出模型参数

# Helper that displays an image
def plot_images(images):
    """Draw ``images`` with the 'binary' colormap and show the figure."""
    plt.imshow(images, cmap='binary')
    plt.show()
# Build the neural-network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28,28)),  # flatten each image into a 1-D vector
    tf.keras.layers.Dense(256, activation='relu'),  # fully connected layer with ReLU activation
    tf.keras.layers.Dropout(0.2),  # dropout layer to reduce overfitting
    tf.keras.layers.Dense(128, activation='relu'),  # fully connected layer with ReLU activation
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')  # output layer: softmax class probabilities
])
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # cross-entropy loss for integer labels
              # Use the 'accuracy' alias so the training history exposes the
              # 'accuracy' / 'val_accuracy' keys that are read after fit().
              metrics=['accuracy'])

# Print the model structure
model.summary()

模型训练

训练

获取训练历史数据中的各指标值

绘制指标在训练过程中的变化图

# Train the model, holding out 20% of the training data for validation
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_split=0.2,
    verbose=1,
)
# Pull the per-epoch metrics out of the training history
metrics_log = history.history
train_loss = metrics_log['loss']
val_loss = metrics_log['val_loss']
train_accuracy = metrics_log['accuracy']
val_accuracy = metrics_log['val_accuracy']

模型评估

使用测试集对模型进行评估

# Draw the loss and accuracy curves side by side in one figure
plt.figure(figsize=(12, 4))
panels = (
    ('Loss', train_loss, val_loss),
    ('Accuracy', train_accuracy, val_accuracy),
)
for position, (metric, train_values, val_values) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(train_values, label=f'Training {metric}')
    plt.plot(val_values, label=f'Validation {metric}')
    plt.title(f'Training and Validation {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend()
plt.show()


# Evaluate the trained model on the test set and report both numbers
test_loss, test_accuracy = model.evaluate(x_test, y_test)
for caption, value in (("Test Loss", test_loss), ("Test Accuracy", test_accuracy)):
    print(f"{caption}: {value}")

完整代码：

import tensorflow as tf

# Load the MNIST dataset and unpack the train / test splits
mnist = tf.keras.datasets.mnist
train_split, test_split = mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

import matplotlib.pyplot as plt

# Show the first training sample and the first test sample with their labels
for sample_image, sample_label in (
    (train_images[0], train_labels[0]),
    (test_images[0], test_labels[0]),
):
    plt.imshow(sample_image, cmap='gray')
    plt.title(f"Label: {sample_label}")
    plt.axis('off')
    plt.show()


# Scale the pixel values into the range 0..1
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

from tensorflow.keras import models, layers

# Define the model layer by layer
model = models.Sequential()
# Flatten each 28x28 image into a 784-dimensional vector
model.add(layers.Flatten(input_shape=(28, 28)))
# Fully connected layer with ReLU activation
model.add(layers.Dense(512, activation='relu'))
# Output layer: softmax probability distribution over the ten digits
model.add(layers.Dense(10, activation='softmax'))


model.compile(
    optimizer='adam',  # Adam optimizer
    loss='sparse_categorical_crossentropy',  # sparse cross-entropy loss
    metrics=['accuracy'],  # track accuracy during training
)

model.summary()

# Train for 5 epochs, holding out 20% of the training data for validation
history = model.fit(
    train_images,
    train_labels,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
)

# Pull the per-epoch metrics out of the training history
metrics_log = history.history
train_loss = metrics_log['loss']
val_loss = metrics_log['val_loss']
train_accuracy = metrics_log['accuracy']
val_accuracy = metrics_log['val_accuracy']

# Draw the loss and accuracy curves side by side in one figure
plt.figure(figsize=(12, 4))

panels = (
    ('Loss', train_loss, val_loss),
    ('Accuracy', train_accuracy, val_accuracy),
)
for position, (metric, train_values, val_values) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(train_values, label=f'Training {metric}')
    plt.plot(val_values, label=f'Validation {metric}')
    plt.title(f'Training and Validation {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend()

plt.show()

# Evaluate the trained model on the test set and report both numbers
test_loss, test_accuracy = model.evaluate(test_images, test_labels)
for caption, value in (("Test Loss", test_loss), ("Test Accuracy", test_accuracy)):
    print(f"{caption}: {value}")

3. 多层神经网络回归（5-3）

数据获取与预处理

Auto MPG 数据集记录了各种汽车的效能指标 MPG（Miles Per Gallon，每加仑燃油行驶的英里数）与气缸数、重量、马力等因素的真实数据。除了产地字段用数字表示类别外，其他字段都是数值类型。对于产地字段，1 表示美国，2 表示欧洲，3 表示日本。

数据集样例：

18.0   8   307.0      130.0      3504.      12.0   70  1    "chevrolet chevelle malibu"
15.0   8   350.0      165.0      3693.      11.5   70  1    "buick skylark 320"
18.0   8   318.0      150.0      3436.      11.0   70  1    "plymouth satellite"
16.0   8   304.0      150.0      3433.      12.0   70  1    "amc rebel sst"
17.0   8   302.0      140.0      3449.      10.5   70  1    "ford torino"
15.0   8   429.0      198.0      4341.      10.0   70  1    "ford galaxie 500"
14.0   8   454.0      220.0      4354.       9.0   70  1    "chevrolet impala"
14.0   8   440.0      215.0      4312.       8.5   70  1    "plymouth fury iii"
14.0   8   455.0      225.0      4425.      10.0   70  1    "pontiac catalina"
15.0   8   390.0      190.0      3850.       8.5   70  1    "amc ambassador dpl"
15.0   8   383.0      170.0      3563.      10.0   70  1    "dodge challenger se"
14.0   8   340.0      160.0      3609.       8.0   70  1    "plymouth 'cuda 340"
15.0   8   400.0      150.0      3761.       9.5   70  1    "chevrolet monte carlo"
14.0   8   455.0      225.0      3086.      10.0   70  1    "buick estate wagon (sw)"
24.0   4   113.0      95.00      2372.      15.0   70  3    "toyota corona mark ii"
22.0   6   198.0      95.00      2833.      15.5   70  1    "plymouth duster"
18.0   6   199.0      97.00      2774.      15.5   70  1    "amc hornet"
21.0   6   200.0      85.00      2587.      16.0   70  1    "ford maverick"
27.0   4   97.00      88.00      2130.      14.5   70  3    "datsun pl510"
26.0   4   97.00      46.00      1835.      20.5   70  2    "volkswagen 1131 deluxe sedan"
25.0   4   110.0      87.00      2672.      17.5   70  2    "peugeot 504"
24.0   4   107.0      90.00      2430.      14.5   70  2    "audi 100 ls"
25.0   4   104.0      95.00      2375.      17.5   70  2    "saab 99e"
26.0   4   121.0      113.0      2234.      12.5   70  2    "bmw 2002"
21.0   6   199.0      90.00      2648.      15.0   70  1    "amc gremlin"
10.0   8   360.0      215.0      4615.      14.0   70  1    "ford f250"
10.0   8   307.0      200.0      4376.      15.0   70  1    "chevy c20"
11.0   8   318.0      210.0      4382.      13.5   70  1    "dodge d200"
9.0    8   304.0      193.0      4732.      18.5   70  1    "hi 1200d"
27.0   4   97.00      88.00      2130.      14.5   71  3    "datsun pl510"
28.0   4   140.0      90.00      2264.      15.5   71  1    "chevrolet vega 2300"
25.0   4   113.0      95.00      2228.      14.0   71  3    "toyota corona"
25.0   4   98.00      ?          2046.      19.0   71  1    "ford pinto"
19.0   6   232.0      100.0      2634.      13.0   71  1    "amc gremlin"
16.0   6   225.0      105.0      3439.      15.5   71  1    "plymouth satellite custom"
17.0   6   250.0      100.0      3329.      15.5   71  1    "chevrolet chevelle malibu"
19.0   6   250.0      88.00      3302.      15.5   71  1    "ford torino 500"
18.0   6   232.0      100.0      3288.      15.5   71  1    "amc matador"
14.0   8   350.0      165.0      4209.      12.0   71  1    "chevrolet impala"
14.0   8   400.0      175.0      4464.      11.5   71  1    "pontiac catalina brougham"
14.0   8   351.0      153.0      4154.      13.5   71  1    "ford galaxie 500"
14.0   8   318.0      150.0      4096.      13.0   71  1    "plymouth fury iii"
12.0   8   383.0      180.0      4955.      11.5   71  1    "dodge monaco (sw)"
13.0   8   400.0      170.0      4746.      12.0   71  1    "ford country squire (sw)"
13.0   8   400.0      175.0      5140.      12.0   71  1    "pontiac safari (sw)"
18.0   6   258.0      110.0      2962.      13.5   71  1    "amc hornet sportabout (sw)"
22.0   4   140.0      72.00      2408.      19.0   71  1    "chevrolet vega (sw)"
19.0   6   250.0      100.0      3282.      15.0   71  1    "pontiac firebird"
18.0   6   250.0      88.00      3139.      14.5   71  1    "ford mustang"
23.0   4   122.0      86.00      2220.      14.0   71  1    "mercury capri 2000"
28.0   4   116.0      90.00      2123.      14.0   71  2    "opel 1900"
30.0   4   79.00      70.00      2074.      19.5   71  2    "peugeot 304"
30.0   4   88.00      76.00      2065.      14.5   71  2    "fiat 124b"
31.0   4   71.00      65.00      1773.      19.0   71  3    "toyota corolla 1200"
35.0   4   72.00      69.00      1613.      18.0   71  3    "datsun 1200"
27.0   4   97.00      60.00      1834.      19.0   71  2    "volkswagen model 111"
26.0   4   91.00      70.00      1955.      20.5   71  1    "plymouth cricket"
24.0   4   113.0      95.00      2278.      15.5   72  3    "toyota corona hardtop"
25.0   4   97.50      80.00      2126.      17.0   72  1    "dodge colt hardtop"
23.0   4   97.00      54.00      2254.      23.5   72  2    "volkswagen type 3"
20.0   4   140.0      90.00      2408.      19.5   72  1    "chevrolet vega"
21.0   4   122.0      86.00      2226.      16.5   72  1    "ford pinto runabout"
13.0   8   350.0      165.0      4274.      12.0   72  1    "chevrolet impala"
14.0   8   400.0      175.0      4385.      12.0   72  1    "pontiac catalina"
15.0   8   318.0      150.0      4135.      13.5   72  1    "plymouth fury iii"
14.0   8   351.0      153.0      4129.      13.0   72  1    "ford galaxie 500"
17.0   8   304.0      150.0      3672.      11.5   72  1    "amc ambassador sst"
11.0   8   429.0      208.0      4633.      11.0   72  1    "mercury marquis"
13.0   8   350.0      155.0      4502.      13.5   72  1    "buick lesabre custom"
12.0   8   350.0      160.0      4456.      13.5   72  1    "oldsmobile delta 88 royale"
13.0   8   400.0      190.0      4422.      12.5   72  1    "chrysler newport royal"
19.0   3   70.00      97.00      2330.      13.5   72  3    "mazda rx2 coupe"
15.0   8   304.0      150.0      3892.      12.5   72  1    "amc matador (sw)"
13.0   8   307.0      130.0      4098.      14.0   72  1    "chevrolet chevelle concours (sw)"
13.0   8   302.0      140.0      4294.      16.0   72  1    "ford gran torino (sw)"
14.0   8   318.0      150.0      4077.      14.0   72  1    "plymouth satellite custom (sw)"
18.0   4   121.0      112.0      2933.      14.5   72  2    "volvo 145e (sw)"
22.0   4   121.0      76.00      2511.      18.0   72  2    "volkswagen 411 (sw)"
21.0   4   120.0      87.00      2979.      19.5   72  2    "peugeot 504 (sw)"
26.0   4   96.00      69.00      2189.      18.0   72  2    "renault 12 (sw)"
22.0   4   122.0      86.00      2395.      16.0   72  1    "ford pinto (sw)"
28.0   4   97.00      92.00      2288.      17.0   72  3    "datsun 510 (sw)"
23.0   4   120.0      97.00      2506.      14.5   72  3    "toyouta corona mark ii (sw)"
28.0   4   98.00      80.00      2164.      15.0   72  1    "dodge colt (sw)"
27.0   4   97.00      88.00      2100.      16.5   72  3    "toyota corolla 1600 (sw)"
13.0   8   350.0      175.0      4100.      13.0   73  1    "buick century 350"
14.0   8   304.0      150.0      3672.      11.5   73  1    "amc matador"
13.0   8   350.0      145.0      3988.      13.0   73  1    "chevrolet malibu"
14.0   8   302.0      137.0      4042.      14.5   73  1    "ford gran torino"
15.0   8   318.0      150.0      3777.      12.5   73  1    "dodge coronet custom"
12.0   8   429.0      198.0      4952.      11.5   73  1    "mercury marquis brougham"
13.0   8   400.0      150.0      4464.      12.0   73  1    "chevrolet caprice classic"
13.0   8   351.0      158.0      4363.      13.0   73  1    "ford ltd"
14.0   8   318.0      150.0      4237.      14.5   73  1    "plymouth fury gran sedan"
13.0   8   440.0      215.0      4735.      11.0   73  1    "chrysler new yorker brougham"
12.0   8   455.0      225.0      4951.      11.0   73  1    "buick electra 225 custom"
13.0   8   360.0      175.0      3821.      11.0   73  1    "amc ambassador brougham"
18.0   6   225.0      105.0      3121.      16.5   73  1    "plymouth valiant"
16.0   6   250.0      100.0      3278.      18.0   73  1    "chevrolet nova custom"
18.0   6   232.0      100.0      2945.      16.0   73  1    "amc hornet"
18.0   6   250.0      88.00      3021.      16.5   73  1    "ford maverick"
23.0   6   198.0      95.00      2904.      16.0   73  1    "plymouth duster"
26.0   4   97.00      46.00      1950.      21.0   73  2    "volkswagen super beetle"
11.0   8   400.0      150.0      4997.      14.0   73  1    "chevrolet impala"
12.0   8   400.0      167.0      4906.      12.5   73  1    "ford country"
13.0   8   360.0      170.0      4654.      13.0   73  1    "plymouth custom suburb"
12.0   8   350.0      180.0      4499.      12.5   73  1    "oldsmobile vista cruiser"
18.0   6   232.0      100.0      2789.      15.0   73  1    "amc gremlin"
20.0   4   97.00      88.00      2279.      19.0   73  3    "toyota carina"
21.0   4   140.0      72.00      2401.      19.5   73  1    "chevrolet vega"
22.0   4   108.0      94.00      2379.      16.5   73  3    "datsun 610"
18.0   3   70.00      90.00      2124.      13.5   73  3    "maxda rx3"
19.0   4   122.0      85.00      2310.      18.5   73  1    "ford pinto"
21.0   6   155.0      107.0      2472.      14.0   73  1    "mercury capri v6"
26.0   4   98.00      90.00      2265.      15.5   73  2    "fiat 124 sport coupe"
15.0   8   350.0      145.0      4082.      13.0   73  1    "chevrolet monte carlo s"
16.0   8   400.0      230.0      4278.      9.50   73  1    "pontiac grand prix"
29.0   4   68.00      49.00      1867.      19.5   73  2    "fiat 128"
24.0   4   116.0      75.00      2158.      15.5   73  2    "opel manta"
20.0   4   114.0      91.00      2582.      14.0   73  2    "audi 100ls"
19.0   4   121.0      112.0      2868.      15.5   73  2    "volvo 144ea"
15.0   8   318.0      150.0      3399.      11.0   73  1    "dodge dart custom"
24.0   4   121.0      110.0      2660.      14.0   73  2    "saab 99le"
20.0   6   156.0      122.0      2807.      13.5   73  3    "toyota mark ii"
11.0   8   350.0      180.0      3664.      11.0   73  1    "oldsmobile omega"
20.0   6   198.0      95.00      3102.      16.5   74  1    "plymouth duster"
21.0   6   200.0      ?          2875.      17.0   74  1    "ford maverick"
19.0   6   232.0      100.0      2901.      16.0   74  1    "amc hornet"
15.0   6   250.0      100.0      3336.      17.0   74  1    "chevrolet nova"
31.0   4   79.00      67.00      1950.      19.0   74  3    "datsun b210"
26.0   4   122.0      80.00      2451.      16.5   74  1    "ford pinto"
32.0   4   71.00      65.00      1836.      21.0   74  3    "toyota corolla 1200"
25.0   4   140.0      75.00      2542.      17.0   74  1    "chevrolet vega"
16.0   6   250.0      100.0      3781.      17.0   74  1    "chevrolet chevelle malibu classic"
16.0   6   258.0      110.0      3632.      18.0   74  1    "amc matador"
18.0   6   225.0      105.0      3613.      16.5   74  1    "plymouth satellite sebring"
16.0   8   302.0      140.0      4141.      14.0   74  1    "ford gran torino"
13.0   8   350.0      150.0      4699.      14.5   74  1    "buick century luxus (sw)"
14.0   8   318.0      150.0      4457.      13.5   74  1    "dodge coronet custom (sw)"
14.0   8   302.0      140.0      4638.      16.0   74  1    "ford gran torino (sw)"
14.0   8   304.0      150.0      4257.      15.5   74  1    "amc matador (sw)"
29.0   4   98.00      83.00      2219.      16.5   74  2    "audi fox"
26.0   4   79.00      67.00      1963.      15.5   74  2    "volkswagen dasher"
26.0   4   97.00      78.00      2300.      14.5   74  2    "opel manta"
31.0   4   76.00      52.00      1649.      16.5   74  3    "toyota corona"
32.0   4   83.00      61.00      2003.      19.0   74  3    "datsun 710"
28.0   4   90.00      75.00      2125.      14.5   74  1    "dodge colt"
24.0   4   90.00      75.00      2108.      15.5   74  2    "fiat 128"
26.0   4   116.0      75.00      2246.      14.0   74  2    "fiat 124 tc"
24.0   4   120.0      97.00      2489.      15.0   74  3    "honda civic"
26.0   4   108.0      93.00      2391.      15.5   74  3    "subaru"
31.0   4   79.00      67.00      2000.      16.0   74  2    "fiat x1.9"
19.0   6   225.0      95.00      3264.      16.0   75  1    "plymouth valiant custom"
18.0   6   250.0      105.0      3459.      16.0   75  1    "chevrolet nova"
15.0   6   250.0      72.00      3432.      21.0   75  1    "mercury monarch"
15.0   6   250.0      72.00      3158.      19.5   75  1    "ford maverick"
16.0   8   400.0      170.0      4668.      11.5   75  1    "pontiac catalina"
15.0   8   350.0      145.0      4440.      14.0   75  1    "chevrolet bel air"
16.0   8   318.0      150.0      4498.      14.5   75  1    "plymouth grand fury"
14.0   8   351.0      148.0      4657.      13.5   75  1    "ford ltd"
17.0   6   231.0      110.0      3907.      21.0   75  1    "buick century"
16.0   6   250.0      105.0      3897.      18.5   75  1    "chevroelt chevelle malibu"
15.0   6   258.0      110.0      3730.      19.0   75  1    "amc matador"
18.0   6   225.0      95.00      3785.      19.0   75  1    "plymouth fury"
21.0   6   231.0      110.0      3039.      15.0   75  1    "buick skyhawk"
20.0   8   262.0      110.0      3221.      13.5   75  1    "chevrolet monza 2+2"
13.0   8   302.0      129.0      3169.      12.0   75  1    "ford mustang ii"
29.0   4   97.00      75.00      2171.      16.0   75  3    "toyota corolla"
23.0   4   140.0      83.00      2639.      17.0   75  1    "ford pinto"
20.0   6   232.0      100.0      2914.      16.0   75  1    "amc gremlin"
23.0   4   140.0      78.00      2592.      18.5   75  1    "pontiac astro"
24.0   4   134.0      96.00      2702.      13.5   75  3    "toyota corona"
25.0   4   90.00      71.00      2223.      16.5   75  2    "volkswagen dasher"
24.0   4   119.0      97.00      2545.      17.0   75  3    "datsun 710"
18.0   6   171.0      97.00      2984.      14.5   75  1    "ford pinto"
29.0   4   90.00      70.00      1937.      14.0   75  2    "volkswagen rabbit"
19.0   6   232.0      90.00      3211.      17.0   75  1    "amc pacer"
23.0   4   115.0      95.00      2694.      15.0   75  2    "audi 100ls"
23.0   4   120.0      88.00      2957.      17.0   75  2    "peugeot 504"
22.0   4   121.0      98.00      2945.      14.5   75  2    "volvo 244dl"
25.0   4   121.0      115.0      2671.      13.5   75  2    "saab 99le"
33.0   4   91.00      53.00      1795.      17.5   75  3    "honda civic cvcc"
28.0   4   107.0      86.00      2464.      15.5   76  2    "fiat 131"
25.0   4   116.0      81.00      2220.      16.9   76  2    "opel 1900"
25.0   4   140.0      92.00      2572.      14.9   76  1    "capri ii"
26.0   4   98.00      79.00      2255.      17.7   76  1    "dodge colt"
27.0   4   101.0      83.00      2202.      15.3   76  2    "renault 12tl"
17.5   8   305.0      140.0      4215.      13.0   76  1    "chevrolet chevelle malibu classic"
16.0   8   318.0      150.0      4190.      13.0   76  1    "dodge coronet brougham"
15.5   8   304.0      120.0      3962.      13.9   76  1    "amc matador"
14.5   8   351.0      152.0      4215.      12.8   76  1    "ford gran torino"
22.0   6   225.0      100.0      3233.      15.4   76  1    "plymouth valiant"
22.0   6   250.0      105.0      3353.      14.5   76  1    "chevrolet nova"
24.0   6   200.0      81.00      3012.      17.6   76  1    "ford maverick"
22.5   6   232.0      90.00      3085.      17.6   76  1    "amc hornet"
29.0   4   85.00      52.00      2035.      22.2   76  1    "chevrolet chevette"
24.5   4   98.00      60.00      2164.      22.1   76  1    "chevrolet woody"
29.0   4   90.00      70.00      1937.      14.2   76  2    "vw rabbit"
33.0   4   91.00      53.00      1795.      17.4   76  3    "honda civic"
20.0   6   225.0      100.0      3651.      17.7   76  1    "dodge aspen se"
18.0   6   250.0      78.00      3574.      21.0   76  1    "ford granada ghia"
18.5   6   250.0      110.0      3645.      16.2   76  1    "pontiac ventura sj"
17.5   6   258.0      95.00      3193.      17.8   76  1    "amc pacer d/l"
29.5   4   97.00      71.00      1825.      12.2   76  2    "volkswagen rabbit"
32.0   4   85.00      70.00      1990.      17.0   76  3    "datsun b-210"
28.0   4   97.00      75.00      2155.      16.4   76  3    "toyota corolla"
26.5   4   140.0      72.00      2565.      13.6   76  1    "ford pinto"
20.0   4   130.0      102.0      3150.      15.7   76  2    "volvo 245"
13.0   8   318.0      150.0      3940.      13.2   76  1    "plymouth volare premier v8"
19.0   4   120.0      88.00      3270.      21.9   76  2    "peugeot 504"
19.0   6   156.0      108.0      2930.      15.5   76  3    "toyota mark ii"
16.5   6   168.0      120.0      3820.      16.7   76  2    "mercedes-benz 280s"
16.5   8   350.0      180.0      4380.      12.1   76  1    "cadillac seville"
13.0   8   350.0      145.0      4055.      12.0   76  1    "chevy c10"
13.0   8   302.0      130.0      3870.      15.0   76  1    "ford f108"
13.0   8   318.0      150.0      3755.      14.0   76  1    "dodge d100"
31.5   4   98.00      68.00      2045.      18.5   77  3    "honda accord cvcc"
30.0   4   111.0      80.00      2155.      14.8   77  1    "buick opel isuzu deluxe"
36.0   4   79.00      58.00      1825.      18.6   77  2    "renault 5 gtl"
25.5   4   122.0      96.00      2300.      15.5   77  1    "plymouth arrow gs"
33.5   4   85.00      70.00      1945.      16.8   77  3    "datsun f-10 hatchback"
17.5   8   305.0      145.0      3880.      12.5   77  1    "chevrolet caprice classic"
17.0   8   260.0      110.0      4060.      19.0   77  1    "oldsmobile cutlass supreme"
15.5   8   318.0      145.0      4140.      13.7   77  1    "dodge monaco brougham"
15.0   8   302.0      130.0      4295.      14.9   77  1    "mercury cougar brougham"
17.5   6   250.0      110.0      3520.      16.4   77  1    "chevrolet concours"
20.5   6   231.0      105.0      3425.      16.9   77  1    "buick skylark"
19.0   6   225.0      100.0      3630.      17.7   77  1    "plymouth volare custom"
18.5   6   250.0      98.00      3525.      19.0   77  1    "ford granada"
16.0   8   400.0      180.0      4220.      11.1   77  1    "pontiac grand prix lj"
15.5   8   350.0      170.0      4165.      11.4   77  1    "chevrolet monte carlo landau"
15.5   8   400.0      190.0      4325.      12.2   77  1    "chrysler cordoba"
16.0   8   351.0      149.0      4335.      14.5   77  1    "ford thunderbird"
29.0   4   97.00      78.00      1940.      14.5   77  2    "volkswagen rabbit custom"
……（此处仅展示数据集的前一部分）

加载数据集

# Load the dataset
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
raw_dataset = pd.read_csv(
    'auto-mpg.data',
    names=column_names,
    sep=" ",
    skipinitialspace=True,
    na_values="?",  # missing values are written as '?' in the data file
    # A tab starts a comment; presumably this drops the trailing car-name
    # column - confirm against the raw file.
    comment='\t',
)
print(raw_dataset)

数据清洗

统计数据集中各列中空值的个数，并删除包含空值的行。

将Origin列转换为one-hot（独热）编码。
数据探索

使用describe方法查看数据的统计指标
使用seaborn库中pairplot方法绘制"MPG", "Cylinders", "Displacement", "Weight"四列的联合分布图

# Data cleaning
# Count the missing values in each column before dropping them.
print(raw_dataset.isna().sum())
# Take an explicit copy so the column assignment below works on an
# independent frame (avoids pandas' SettingWithCopyWarning).
dataset = raw_dataset.dropna().copy()
# Convert the Origin column to one-hot encoding
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
# dtype=float keeps the indicator columns numeric; newer pandas versions
# default to bool, which describe() leaves out, so standardising with those
# statistics would turn the indicator columns into NaN.
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='',
                         dtype=float)
# Data exploration
print(dataset.describe())
sns.pairplot(dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')

数据标准化

# Standardise the features
# Split off the regression target first so it is not standardised.
labels = dataset.pop('MPG')
# Per-column statistics (one row per feature after the transpose).
train_stats = dataset.describe().transpose()


def norm(x):
    """Return ``x`` standardised with the mean and std held in ``train_stats``."""
    centered = x - train_stats['mean']
    return centered / train_stats['std']


normed_dataset = norm(dataset)

划分训练集与测试集

# Split the data into a training set and a test set (20% held out for testing)
X_train, X_test, Y_train, Y_test = train_test_split(
    normed_dataset,
    labels,
    test_size=0.2,
    random_state=0,
)

模型构建

模型定义

# Build the model: three 64-unit ReLU layers with dropout, one linear output
n_features = X_train.shape[1]
keras_layers = tf.keras.layers
model = tf.keras.Sequential([
    keras_layers.Dense(64, activation='relu', input_shape=[n_features]),
    keras_layers.Dropout(0.3),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.2),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dense(1),
])

编译模型

loss='mse' #损失用mse

optimizer='adam'

metrics=['mae', 'mse']

# Compile the model: MSE loss, Adam optimizer, MAE and MSE as metrics
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae', 'mse'],
)

输出模型参数

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

模型训练

训练

epochs=100,

validation_split = 0.2

verbose=1

# Train the model with the settings stated for this exercise:
# epochs=100, validation_split=0.2, verbose=1.
history = model.fit(X_train, Y_train, epochs=100, validation_split=0.2, verbose=1)

获取训练历史数据中的各指标值

# Per-epoch training and validation metrics recorded by fit()
training_log = history.history
mae = training_log['mae']
val_mae = training_log['val_mae']
mse = training_log['mse']
val_mse = training_log['val_mse']

绘制指标在训练过程中的变化图

# One figure per metric (figure 1: MAE, figure 2: MSE), displayed together
for figure_id, metric, train_values, val_values in (
    (1, 'MAE', mae, val_mae),
    (2, 'MSE', mse, val_mse),
):
    plt.figure(figure_id)
    plt.plot(train_values, label=f'Training {metric}')
    plt.plot(val_values, label=f'Validation {metric}')
    plt.title(f'Training and Validation {metric}')
    plt.legend()

plt.show()

模型评估

使用测试集对模型进行评估

# Evaluate the model on the test set once (the earlier duplicate call, whose
# result was discarded, only repeated the same work).
# The returned list follows the compile() settings: loss, then the metrics.
h1 = model.evaluate(X_test, Y_test, verbose=1)
print(h1)

完整代码

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
raw_dataset = pd.read_csv(
    'auto-mpg.data',
    names=column_names,
    sep=" ",
    skipinitialspace=True,
    na_values="?",  # missing values are written as '?' in the data file
    # A tab starts a comment; presumably this drops the trailing car-name
    # column - confirm against the raw file.
    comment='\t',
)
print(raw_dataset)
# Data cleaning
# Count the missing values in each column before dropping them.
print(raw_dataset.isna().sum())
# Take an explicit copy so the column assignment below works on an
# independent frame (avoids pandas' SettingWithCopyWarning).
dataset = raw_dataset.dropna().copy()

# Convert the Origin column to one-hot encoding
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
# dtype=float keeps the indicator columns numeric; newer pandas versions
# default to bool, which describe() leaves out, so standardising with those
# statistics would turn the indicator columns into NaN.
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='',
                         dtype=float)

# Data exploration
print(dataset.describe())
sns.pairplot(dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')

# Standardise the features
# Split off the regression target first so it is not standardised.
labels = dataset.pop('MPG')
# Per-column statistics (one row per feature after the transpose).
train_stats = dataset.describe().transpose()


def norm(x):
    """Return ``x`` standardised with the mean and std held in ``train_stats``."""
    centered = x - train_stats['mean']
    return centered / train_stats['std']


normed_dataset = norm(dataset)

# Split the data into a training set and a test set (20% held out for testing)
X_train, X_test, Y_train, Y_test = train_test_split(
    normed_dataset,
    labels,
    test_size=0.2,
    random_state=0,
)

# Build the model: three 64-unit ReLU layers with dropout, one linear output
n_features = X_train.shape[1]
keras_layers = tf.keras.layers
model = tf.keras.Sequential([
    keras_layers.Dense(64, activation='relu', input_shape=[n_features]),
    keras_layers.Dropout(0.3),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.2),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dense(1),
])

# Compile the model: MSE loss, Adam optimizer, MAE and MSE as metrics
model.compile(loss='mse', optimizer='adam', metrics=['mae', 'mse'])

# Print the model parameters.
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

# Train the model with the settings stated for this exercise:
# epochs=100, validation_split=0.2, verbose=1.
history = model.fit(X_train, Y_train, epochs=100, validation_split=0.2, verbose=1)

# Per-epoch training and validation metrics recorded by fit()
mae = history.history['mae']
val_mae = history.history['val_mae']
mse = history.history['mse']
val_mse = history.history['val_mse']

# Draw and show one figure per metric: first MAE, then MSE
for metric, train_values, val_values in (
    ('MAE', mae, val_mae),
    ('MSE', mse, val_mse),
):
    plt.figure()
    plt.plot(train_values, label=f'Training {metric}')
    plt.plot(val_values, label=f'Validation {metric}')
    plt.title(f'Training and Validation {metric}')
    plt.legend()
    plt.show()

# Evaluate the model on the test set and print the returned values
h1 = model.evaluate(
    X_test,
    Y_test,
    verbose=1,
)
print(h1)

4. 循环神经网络文本分类（5-4）

数据获取与预处理

IMDB 数据集包含 5 万条来自互联网电影数据库（Internet Movie Database）的评论，其中 25000 条用于训练，25000 条用于测试，每个部分中正面评论和负面评论各占 50%。和 MNIST 数据集类似，IMDB 数据集也集成在 Keras 中，并且已经过预处理：每条电影评论被转换成一个整数序列，每个整数代表字典中的一个单词（其数值为该单词按出现频率的排名）。

读取数据

# Load the data. The review text is already converted to integers, where
# each integer identifies a specific word in the dictionary.
train_split, test_split = imdb.load_data(num_words=10000)
x_train, y_train = train_split
x_test, y_test = test_split

预处理

# The recurrent network takes fixed-length input; feeding it reviews of
# different lengths fails at run time. pad_sequences is used to bring
# every review to the same length (maxlen=100).
pad = tf.keras.preprocessing.sequence.pad_sequences
x_train = pad(x_train, maxlen=100)
x_test = pad(x_test, maxlen=100)

模型搭建

模型定义

model = Sequential([
    # Embedding layer
    Embedding(
        input_dim=10000,   # vocabulary size, i.e. rows of the embedding matrix
        output_dim=128,    # dimension of each word vector, i.e. columns of the matrix
        input_length=100,
    ),
    # LSTM hidden layer
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Output layer
    Dense(1, activation='sigmoid'),
])

编译模型

# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

模型训练

# Train the model with the settings stated for this exercise:
# epochs=5, validation_split=0.2, verbose=1.
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=32,
                    validation_split=0.2,
                    verbose=1)

训练

epochs=5,

validation_split = 0.2

verbose=1

获取训练历史数据中的各指标值
绘制指标在训练过程中的变化图

模型评估

使用测试集对模型进行评估

  plt.figure(figsize=(12, 6))

    # 绘制准确率曲线
    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc, 'b', label='Training accuracy')
    plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    # 绘制损失曲线
    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss, 'b', label='Training Loss')
    plt.plot(epochs, val_loss, 'r', label='Validation Loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()

plot_history(history)

# Evaluate the model on the test set and report loss and accuracy
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
for caption, value in (('Test loss', test_loss), ('Test accuracy', test_acc)):
    print(f'{caption}: {value}')

完整代码：

# -*- coding: utf-8 -*-

import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import matplotlib.pyplot as plt

# Load the data, limited to num_words=10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# Preprocessing: bring every review to the same length
max_review_length = 100
x_train = pad_sequences(x_train, maxlen=max_review_length)
x_test = pad_sequences(x_test, maxlen=max_review_length)

for padded in (x_train, x_test):
    print(padded.shape)

# Model definition
model = Sequential([
    # Declare the input shape on the model itself: every review is padded
    # to 100 tokens. This also lets model.summary() report output shapes.
    tf.keras.Input(shape=(100,)),
    Embedding(10000, 128),
    # input_shape is only meaningful on the first layer, so it is no longer
    # passed to the LSTM (where it was ignored).
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])
# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()
# Train the model with the settings stated for this exercise:
# epochs=5, validation_split=0.2, verbose=1.
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=32,
                    validation_split=0.2,
                    verbose=1)

# Plot the metrics recorded during training
def plot_history(history):
    """Plot training and validation accuracy and loss per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    log = history.history
    # Read all four series up front so a missing key fails before drawing.
    curves = {key: log[key]
              for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')}

    # Number the epochs from 1 for the x axis.
    epochs = range(1, len(curves['accuracy']) + 1)

    # (metric key, training label, validation label, title, y-axis label)
    panels = (
        ('accuracy', 'Training accuracy', 'Validation accuracy',
         'Training and validation accuracy', 'Accuracy'),
        ('loss', 'Training Loss', 'Validation Loss',
         'Training and validation loss', 'Loss'),
    )

    plt.figure(figsize=(12, 6))
    for position, (key, train_label, val_label, title, y_label) in enumerate(
            panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, curves[key], 'b', label=train_label)
        plt.plot(epochs, curves[f'val_{key}'], 'r', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

plot_history(history)

# Evaluate the model on the test set and report loss and accuracy
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
for caption, value in (('Test loss', test_loss), ('Test accuracy', test_acc)):
    print(f'{caption}: {value}')

数据挖掘与分析——深度学习算法应用

相关推荐

最近更新

热门阅读