DL练习:Kaggle房价预测

试一试捏。


# 概述

使用多层感知机拟合一系列房价数据并对测试集做出预测。

# 数据来源与预处理

可从 Kaggle 房价预测竞赛(House Prices: Advanced Regression Techniques)页面下载训练集与测试集。

读取数据、删除ID列

## 读取训练数据和测试数据
## Load the training and test data sets
train_data = pandas.read_csv('./house_price/train.csv', header=0)
test_data = pandas.read_csv('./house_price/test.csv', header=0)
## FIX: keep the test-set Id column before dropping it — the Kaggle
## submission built later needs it, but it must not be used as a feature.
test_Id = test_data['Id']
## Drop the Id column from both sets (row identifier, no predictive signal)
train_data.drop(columns=['Id'], inplace=True)
test_data.drop(columns=['Id'], inplace=True)

数据预处理

考虑到训练特征集与测试集在进行独热编码时可能出现差异,应将训练用的特征集与测试集合并在一起做数据预处理。

## Preprocessing plan: 0. concatenate train features with the test set;
## 1. standardise features to zero mean and unit variance;
## 2. impute missing values with the (post-scaling) mean, i.e. zero.
# Separate the training set into features and labels
train_features = train_data.drop(columns=['SalePrice'])
train_labels = train_data['SalePrice']
# The test set carries no labels, so it is used as-is
test_features = test_data
# Stack the two feature sets so that later standardisation and one-hot
# encoding produce identical columns for train and test
all_features = pandas.concat([train_features, test_features], axis=0)

处理连续型变量:

# Locate the continuous columns: everything whose dtype is not object
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardise the non-missing values to zero mean / unit variance;
# mean() and std() skip NaN, so missing entries are untouched here
numeric_block = all_features[numeric_features]
all_features[numeric_features] = (numeric_block - numeric_block.mean()) / numeric_block.std()
# After centring, the column mean is 0, so mean-imputation == fill with 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)

处理离散型变量:

# One-hot encode the categorical columns; dummy_na=True gives missing
# values their own indicator column instead of dropping those rows
all_features = pandas.get_dummies(all_features, dummy_na=True)
# The indicator columns come back as booleans; multiplying by one
# coerces them to numeric 0/1
all_features = all_features.mul(1)
值得注意的是 .get_dummies() 后新增一些独热编码列,内部元素均为True或False,需要使用 all_features * 1 将布尔值转为数值。

将DF转张量,并重新分割得到训练特征集、标签集和测试集

# DataFrame -> NumPy ndarray -> tensor; force float32 (NumPy floats default to float64)
all_features = torch.tensor(all_features.values, dtype=torch.float32)
# Labels become an (n, 1) column tensor to match the network's output shape
train_labels = torch.tensor(train_labels.values.reshape(-1, 1), dtype=torch.float32)
# Undo the earlier concat: the first len(train_data) rows are the training rows
n_train = train_data.shape[0]
train_features = all_features[:n_train]
test_features = all_features[n_train:]

# 训练

训练时涉及损失函数、数据迭代器构造、定义多层感知机、设置训练参数、K折交叉验证等内容:

使用合适的损失函数,并进行适当改造

loss = nn.MSELoss()  # 定义损失函数

# Prices span orders of magnitude, so absolute error would over-weight
# expensive houses; evaluate RMSE on log-prices (a relative-error metric).
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels."""
    # Floor predictions at 1 so torch.log never sees a non-positive value
    safe_preds = net(features).clamp(min=1.0)
    log_err = loss(torch.log(safe_preds), torch.log(labels))
    return torch.sqrt(log_err).item()

数据迭代器

def load_array(data_arrays, batch_size, is_train=True):
    """Wrap in-memory tensors in a PyTorch DataLoader.

    :param data_arrays: tuple of tensors sharing their first dimension
    :param batch_size: number of samples per mini-batch
    :param is_train: reshuffle every epoch when True
    """
    return data.DataLoader(
        data.TensorDataset(*data_arrays),
        batch_size,
        shuffle=is_train,
    )

data_iter = load_array((train_features, train_labels), 64)  # 数据迭代器,虽然使用Adam,但该小批量还是小批量

定义多层感知机

feature_num = train_features.shape[1]  # input width after one-hot encoding
# NOTE(review): despite the original note calling this "an MLP with one hidden
# layer", it is a plain linear regression model — a single Linear layer.
net = nn.Sequential(nn.Linear(feature_num, 1))

选择优化器并设置超参数

# Adam behaves like a smoothed SGD and is fairly insensitive to the learning
# rate, which is why lr can be set as high as 5 here.
optimizer = torch.optim.Adam(net.parameters(), lr=5, weight_decay=0.001)
num_epoch = 100  # training epochs per cross-validation fold

定义获取K折交叉验证所需数据的函数

def get_k_fold_data(k, i, X, y):
    """Split (X, y) into K folds and return fold i as the validation set.

    :param k: number of folds (must be > 1)
    :param i: index of the validation fold, 0 <= i < k
    :param X: feature tensor, shape (n, num_features)
    :param y: label tensor, first dimension n
    :return: (X_train, y_train, X_valid, y_valid)
    """
    assert k > 1
    fold_size = X.shape[0] // k  # samples per fold; remainder rows are dropped
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        # BUG FIX: the original tested `j == 1`, so fold 1 was always the
        # validation fold no matter which fold i was requested.
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

开始训练,并进行K折交叉验证

# K-fold cross-validation.
# FIXES vs the original: (1) each fold now trains on its own training split
# via a per-fold iterator instead of the full-set `data_iter`; (2) a fresh
# model and optimizer are created per fold, otherwise every fold keeps
# training the weights fitted on previous folds; (3) the split tuple is no
# longer named `data`, which shadowed the torch.utils.data module.
train_l_sum, valid_l_sum = 0, 0
k = 5
for i in range(k):
    X_tr, y_tr, X_va, y_va = get_k_fold_data(k, i, train_features, train_labels)
    # Fresh linear model and optimizer so each fold is evaluated independently
    net = nn.Sequential(nn.Linear(feature_num, 1))
    optimizer = torch.optim.Adam(net.parameters(), lr=5, weight_decay=0.001)
    fold_iter = load_array((X_tr, y_tr), 64)  # mini-batches over this fold only
    train_ls, valid_ls = [], []
    for epoch in range(num_epoch):
        for X, y in fold_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        # Track log-RMSE on the fold's own train/validation splits
        train_ls.append(log_rmse(net, X_tr, y_tr))
        valid_ls.append(log_rmse(net, X_va, y_va))
    train_l_sum += train_ls[-1]
    valid_l_sum += valid_ls[-1]
print(f'train loss: {train_l_sum / k}\nvalid loss: {valid_l_sum / k}')

# 输出结果

# Predict test-set prices with the trained model and build the submission.
# FIX: `preds` was never computed in this snippet, and `test_Id` was never
# defined (the Id column was dropped during preprocessing), so this crashed
# with a NameError. Re-read the Ids from the raw CSV — Kaggle requires them.
preds = net(test_features).detach().numpy()
submission = pandas.DataFrame({
    'Id': pandas.read_csv('./house_price/test.csv', usecols=['Id'])['Id'],
    'SalePrice': preds.flatten()
})
submission.to_csv('./house_price/submission.csv', index=False)

# 完整代码

import pandas
import torch
from torch import nn
from torch.utils import data


# House prices differ by orders of magnitude, so an absolute-error loss would
# be dominated by expensive houses; measure RMSE in log space instead.
def log_rmse(net, features, labels):
    """Root-mean-squared error between the logs of predictions and labels."""
    # Clamp predictions into [1, +inf) so the logarithm is always defined
    clipped = net(features).clamp(min=1.0)
    return torch.sqrt(loss(torch.log(clipped), torch.log(labels))).item()


def load_array(data_arrays, batch_size, is_train=True):
    """Yield mini-batches over in-memory tensors via a PyTorch DataLoader.

    When *is_train* is True the samples are reshuffled every epoch.
    """
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)


def get_k_fold_data(k, i, X, y):
    """Split (X, y) into K folds and return fold i as the validation set.

    :param k: number of folds (must be > 1)
    :param i: index of the validation fold, 0 <= i < k
    :param X: feature tensor, shape (n, num_features)
    :param y: label tensor, first dimension n
    :return: (X_train, y_train, X_valid, y_valid)
    """
    assert k > 1
    fold_size = X.shape[0] // k  # samples per fold; remainder rows are dropped
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        # BUG FIX: the original tested `j == 1`, so fold 1 was always the
        # validation fold no matter which fold i was requested.
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid


## Load the training and test data sets
train_data = pandas.read_csv('./house_price/train.csv', header=0)
test_data = pandas.read_csv('./house_price/test.csv', header=0)
## FIX: keep the test-set Ids before dropping the column — the submission
## file built at the end of the script needs them (the original code
## referenced an undefined `test_Id` and crashed with a NameError).
test_Id = test_data['Id']
## Drop the Id column from both sets (row identifier, no predictive signal)
train_data.drop(columns=['Id'], inplace=True)
test_data.drop(columns=['Id'], inplace=True)
## Preprocessing: 0. concatenate train features with the test set;
## 1. standardise features to zero mean / unit variance;
## 2. impute missing values with the mean, i.e. 0 after scaling.
# Separate the training set into features and labels
train_features = train_data.drop(columns=['SalePrice'])
train_labels = train_data['SalePrice']
# The test set has no labels, so it is used as-is
test_features = test_data
# Concatenate so one-hot encoding produces identical columns for both sets
all_features = pandas.concat([train_features, test_features], axis=0)
# Continuous columns are those whose dtype is not object
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardise the non-missing values (mean()/std() skip NaN)
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After centring, the column mean is 0, so mean-imputation == fill with 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# One-hot encode categoricals; dummy_na=True gives NaN its own indicator column
all_features = pandas.get_dummies(all_features, dummy_na=True)
# get_dummies yields boolean columns; * 1 coerces them to numeric 0/1
all_features = all_features * 1
# DataFrame -> NumPy ndarray -> float32 tensor (NumPy defaults to float64)
all_features = torch.tensor(all_features.values, dtype=torch.float32)
train_labels = torch.tensor(train_labels.values.reshape(-1, 1), dtype=torch.float32)
# Split all_features back into the processed train and test sets
train_features = all_features[: train_data.shape[0]]
test_features = all_features[train_data.shape[0]:]
### Training
loss = nn.MSELoss()  # mean-squared-error loss for regression
feature_num = train_features.shape[1]  # input width after one-hot encoding
## K-fold cross-validation.
## FIXES vs the original: (1) each fold now trains on its own training split
## instead of one full-set iterator; (2) a fresh model and optimizer are
## created per fold, otherwise every fold keeps training the weights fitted
## on the previous folds; (3) the split tuple is no longer named `data`,
## which shadowed the torch.utils.data module imported above.
train_l_sum, valid_l_sum = 0, 0
k = 5
num_epoch = 100
for i in range(k):
    X_tr, y_tr, X_va, y_va = get_k_fold_data(k, i, train_features, train_labels)
    # A plain linear model (the original comment called it an MLP, but there
    # is no hidden layer); Adam tolerates the large lr=5 setting
    net = nn.Sequential(nn.Linear(feature_num, 1))
    optimizer = torch.optim.Adam(net.parameters(), lr=5, weight_decay=0.001)
    fold_iter = load_array((X_tr, y_tr), 64)  # mini-batches over this fold only
    train_ls, valid_ls = [], []
    for epoch in range(num_epoch):
        for X, y in fold_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        # Track log-RMSE on the fold's own train/validation splits
        train_ls.append(log_rmse(net, X_tr, y_tr))
        valid_ls.append(log_rmse(net, X_va, y_va))
    train_l_sum += train_ls[-1]
    valid_l_sum += valid_ls[-1]
print(f'train loss: {train_l_sum / k}\nvalid loss: {valid_l_sum / k}')
## Predict and write the submission file
preds = net(test_features).detach().numpy()
submission = pandas.DataFrame({
    # FIX: the original referenced an undefined `test_Id` (the Id column was
    # dropped during preprocessing); re-read the Ids from the raw CSV
    'Id': pandas.read_csv('./house_price/test.csv', usecols=['Id'])['Id'],
    'SalePrice': preds.flatten()
})
submission.to_csv('./house_price/submission.csv', index=False)