试一试吧。
# 序
使用多层感知机拟合一系列房价数据并做预测。
# 数据来源与预处理
可在 Kaggle「House Prices - Advanced Regression Techniques」竞赛页面下载训练集与测试集。
读取数据、删除ID列
## Load the training and test data
train_data = pandas.read_csv('./house_price/train.csv', header=0)
test_data = pandas.read_csv('./house_price/test.csv', header=0)
## Bugfix: save the test-set IDs before dropping the Id column — the
## submission code at the end of the post references `test_Id`, which was
## never defined anywhere. The Id column itself carries no predictive
## information, so it is still dropped from both frames.
test_Id = test_data['Id']
train_data.drop(columns=['Id'], inplace=True)
test_data.drop(columns=['Id'], inplace=True)
考虑到训练特征集与测试集在进行独热编码时可能出现列差异,应将训练用的特征集与测试集合并在一起做数据预处理。
## Preprocessing: 0. concatenate training features with the test set;
## 1. standardize features to zero mean and unit variance; 2. replace
## missing values with the mean (which is 0 after standardization)
# Split the training data into features and labels
feature_col_names = [col_name for col_name in train_data.columns if col_name not in ['SalePrice']]
train_features = train_data[feature_col_names]
train_labels = train_data['SalePrice']
# The test set has no labels, so it is used as-is
test_features = test_data
# Concatenate training and test features so that later one-hot encoding
# produces an identical column set for both
all_features = pandas.concat([train_features, test_features], axis=0)
处理连续型变量:
# Identify and process the continuous (numeric) variables
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize the non-missing values of each numeric column to zero mean and unit variance
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# Replace missing numeric values with the mean; since the columns are now
# centered at 0, this is simply filling with 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
处理离散型变量:
# One-hot encode the categorical (discrete) variables
all_features = pandas.get_dummies(all_features, dummy_na=True) # dummy_na=True also turns NaN into its own category
all_features = all_features * 1 # the dummy columns are boolean; multiplying by 1 converts them to 0/1 integers
值得注意的是, .get_dummies() 后新增一些独热编码列,内部元素均为True或False,需要使用 all_features = all_features * 1 将布尔转为数值。
将DF转张量,并重新分割得到训练特征集、标签集和测试集
# DataFrame -> NumPy ndarray -> Tensor
all_features = torch.tensor(all_features.values, dtype=torch.float32) # default would be float64; use float32 here
train_labels = torch.tensor(train_labels.values.reshape(-1, 1), dtype=torch.float32)
# Split all_features back into the processed training and test feature sets
train_features = all_features[: train_data.shape[0]]
test_features = all_features[train_data.shape[0]:]
# 训练
训练时涉及损失函数、数据迭代器构造、定义多层感知机、设置训练参数、K折交叉验证等内容:
使用合适的损失函数,并进行适当改造
loss = nn.MSELoss()  # squared-error loss, shared by training and evaluation

def log_rmse(net, features, labels):
    """Root-mean-squared error between log-predictions and log-labels.

    House prices span orders of magnitude, so the error is measured on the
    log scale (a relative error). Predictions are clamped to [1, inf)
    beforehand so the logarithm is always well defined.
    """
    preds = net(features)
    preds = torch.clamp(preds, 1, float('inf'))
    log_error = loss(torch.log(preds), torch.log(labels))
    return torch.sqrt(log_error).item()
数据迭代器
def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a PyTorch DataLoader.

    data_arrays: tuple of tensors sharing the same first dimension.
    is_train: shuffle each epoch when True (training); keep order when False.
    """
    wrapped = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(wrapped, batch_size, shuffle=is_train)
    return loader
data_iter = load_array((train_features, train_labels), 64) # minibatch iterator (batch size 64) over the full training set; Adam still optimizes per minibatch
定义模型(此处实际为单个线性层,即线性回归,并非含隐藏层的多层感知机)
feature_num = train_features.shape[1] # number of input features
net = nn.Sequential(nn.Linear(feature_num, 1)) # a single linear layer — plain linear regression, not an MLP with a hidden layer as the surrounding text claims
选择优化器并设置超参数
optimizer = torch.optim.Adam(net.parameters(), lr=5, weight_decay=0.001) # Adam behaves like a smoothed SGD and is fairly insensitive to the learning rate, hence the large lr=5
num_epoch = 100
定义获取K折交叉验证所需数据的函数
def get_k_fold_data(k, i, X, y):
    """Return the training/validation split for the i-th of k folds.

    :param k: total number of folds (must be > 1)
    :param i: 0-based index of the fold used for validation
    :param X: feature tensor
    :param y: label tensor
    :return: (X_train, y_train, X_valid, y_valid)
    """
    assert k > 1
    fold_size = X.shape[0] // k  # samples per fold (integer division; any remainder rows are dropped)
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:  # bugfix: the validation fold is the i-th one (the original tested j == 1, so fold 1 was always the validation set)
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid
开始训练,并进行K折交叉验证
## K-fold cross-validation training loop
train_l_sum, valid_l_sum = 0, 0
k = 5
for i in range(k):
    # Bugfix: train each fold on its own training split. The original code
    # iterated the global `data_iter` built from ALL training samples, which
    # leaks the validation fold into training and makes the K-fold estimate
    # meaningless. It also bound the split tuple to the name `data`,
    # shadowing the torch.utils.data module imported at the top of the file.
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, train_features, train_labels)
    fold_iter = load_array((X_train, y_train), 64)
    train_ls, valid_ls = [], []
    for epoch in range(num_epoch):
        for X, y in fold_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        # Evaluate on this fold's own train/validation split
        train_ls.append(log_rmse(net, X_train, y_train))
        if y_valid is not None:
            valid_ls.append(log_rmse(net, X_valid, y_valid))
    # NOTE(review): `net` and `optimizer` are not re-initialized between
    # folds, so every fold continues from the previous fold's weights and the
    # CV estimate is optimistic — consider rebuilding them per fold.
    train_l_sum += train_ls[-1]
    valid_l_sum += valid_ls[-1]
print(f'train loss: {train_l_sum / k}\nvalid loss: {valid_l_sum / k}')
# Predict on the test set and write the Kaggle submission file
with torch.no_grad():  # bugfix: `preds` was never computed in this section of the post
    preds = net(test_features).numpy()
submission = pandas.DataFrame({
    'Id': test_Id,  # NOTE(review): test_Id must be captured from test.csv before the Id column is dropped
    'SalePrice': preds.flatten()
})
submission.to_csv('./house_price/submission.csv', index=False)
# 完整代码
import pandas
import torch
from torch import nn
from torch.utils import data
# House prices vary over orders of magnitude, so the loss is measured on the
# log scale (a relative error) rather than on raw prices.
def log_rmse(net, features, labels):
    """Root-mean-squared error of log-predictions vs. log-labels.

    Predictions are clamped to [1, inf) first so log() is well defined.
    """
    preds = torch.clamp(net(features), 1, float('inf'))
    log_mse = loss(torch.log(preds), torch.log(labels))
    return torch.sqrt(log_mse).item()
def load_array(data_arrays, batch_size, is_train=True):
    """Build a DataLoader over the given tensors; shuffles each epoch when is_train is True."""
    return data.DataLoader(data.TensorDataset(*data_arrays), batch_size, shuffle=is_train)
def get_k_fold_data(k, i, X, y):
    """Return the training/validation split for the i-th of k folds.

    :param k: total number of folds (must be > 1)
    :param i: 0-based index of the fold used for validation
    :param X: feature tensor
    :param y: label tensor
    :return: (X_train, y_train, X_valid, y_valid)
    """
    assert k > 1
    fold_size = X.shape[0] // k  # samples per fold (integer division; any remainder rows are dropped)
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:  # bugfix: the validation fold is the i-th one (the original tested j == 1, so fold 1 was always the validation set)
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid
## Load the training and test data
train_data = pandas.read_csv('./house_price/train.csv', header=0)
test_data = pandas.read_csv('./house_price/test.csv', header=0)
## Bugfix: save the test-set IDs before dropping the Id column — the
## submission code at the bottom references `test_Id`, which was never
## defined in the original script.
test_Id = test_data['Id']
train_data.drop(columns=['Id'], inplace=True)
test_data.drop(columns=['Id'], inplace=True)
## Preprocessing: 0. concatenate training features with the test set;
## 1. standardize features to zero mean and unit variance; 2. replace
## missing values with the mean (which is 0 after standardization)
# Split the training data into features and labels
feature_col_names = [col_name for col_name in train_data.columns if col_name not in ['SalePrice']]
train_features = train_data[feature_col_names]
train_labels = train_data['SalePrice']
# The test set has no labels, so it is used as-is
test_features = test_data
# Concatenate training and test features so that later one-hot encoding
# produces an identical column set for both
all_features = pandas.concat([train_features, test_features], axis=0)
# Identify and process the continuous (numeric) variables
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize the non-missing values of each numeric column to zero mean and unit variance
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# Replace missing numeric values with the mean; since the columns are now
# centered at 0, this is simply filling with 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# One-hot encode the categorical (discrete) variables
all_features = pandas.get_dummies(all_features, dummy_na=True) # dummy_na=True also turns NaN into its own category
all_features = all_features * 1 # the dummy columns are boolean; multiplying by 1 converts them to 0/1 integers
# DataFrame -> NumPy ndarray -> Tensor
all_features = torch.tensor(all_features.values, dtype=torch.float32) # default would be float64; use float32 here
train_labels = torch.tensor(train_labels.values.reshape(-1, 1), dtype=torch.float32)
# Split all_features back into the processed training and test feature sets
train_features = all_features[: train_data.shape[0]]
test_features = all_features[train_data.shape[0]:]
### Training setup
loss = nn.MSELoss() # squared-error loss, shared by training and evaluation
feature_num = train_features.shape[1] # number of input features
net = nn.Sequential(nn.Linear(feature_num, 1)) # a single linear layer — plain linear regression, not an MLP with a hidden layer as originally claimed
## Train with K-fold cross-validation
optimizer = torch.optim.Adam(net.parameters(), lr=5, weight_decay=0.001) # Adam behaves like a smoothed SGD and is fairly insensitive to the learning rate, hence lr=5
# K-fold (k=5) cross-validation
train_l_sum, valid_l_sum = 0, 0
k = 5
num_epoch = 100
for i in range(k):
    # Bugfix: train each fold on its own training split. The original code
    # iterated a global data_iter built from ALL training samples (leaking
    # the validation fold into training, making the CV meaningless), and it
    # rebound the name `data`, shadowing the torch.utils.data module
    # imported at the top of the file.
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, train_features, train_labels)
    fold_iter = load_array((X_train, y_train), 64)
    train_ls, valid_ls = [], []
    for epoch in range(num_epoch):
        for X, y in fold_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        # Evaluate on this fold's own train/validation split
        train_ls.append(log_rmse(net, X_train, y_train))
        if y_valid is not None:
            valid_ls.append(log_rmse(net, X_valid, y_valid))
    # NOTE(review): `net` and `optimizer` are not re-initialized between
    # folds, so every fold continues from the previous fold's weights and the
    # CV estimate is optimistic — consider rebuilding them per fold.
    train_l_sum += train_ls[-1]
    valid_l_sum += valid_ls[-1]
print(f'train loss: {train_l_sum / k}\nvalid loss: {valid_l_sum / k}')
## Predict on the test set and write the Kaggle submission file
with torch.no_grad():  # inference only — no autograd graph needed (replaces .detach())
    preds = net(test_features).numpy()
submission = pandas.DataFrame({
    'Id': test_Id,  # NOTE(review): requires test_Id to be saved before the Id column is dropped
    'SalePrice': preds.flatten()
})
submission.to_csv('./house_price/submission.csv', index=False)