I have written a piece of code to train a Gaussian Process Regression model to predict age. Here is the code, and it runs well:
我已经编写了一段代码来训练高斯过程回归(Gaussian Process Regression)模型来预测年龄。以下是我编写的代码,并且运行良好:
import numpy as np
import pandas as pd
import h5py
import torch
import gpytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import time
# Integer code for each diagnosis label. The code is the label's position in
# the tuple below, so 'control' is 0 and 'type 2 diabetes' is 9.
disease_mapping = {
    label: code
    for code, label in enumerate((
        'control',
        "Alzheimer's disease",
        "Graves' disease",
        "Huntington's disease",
        "Parkinson's disease",
        'rheumatoid arthritis',
        'schizophrenia',
        "Sjogren's syndrome",
        'stroke',
        'type 2 diabetes',
    ))
}

# Integer code for each sample type.
sample_type_mapping = {
    'control': 0,
    'disease tissue': 1,
}
def load_idmap(idmap_dir):
    '''
    Read the id-map CSV and return the ages and the encoded sample types.
    Parameters:
    ------------
    idmap_dir: path of the comma-separated id-map file; its `age` and
        `sample_type` columns are read.
    Return:
    ------------
    age: the `age` column as a float32 numpy array.
    sample_type: the `sample_type` column (pandas Series) with its values
        replaced according to `sample_type_mapping`.
    '''
    table = pd.read_csv(idmap_dir, sep=',')
    age = table['age'].to_numpy().astype(np.float32)
    sample_type = table['sample_type'].replace(sample_type_mapping)
    return age, sample_type
def load_methylation_h5(prefix):
    '''
    Load methylation data from .h5 file.
    Parameters:
    ------------
    prefix: 'train' or 'test'; the file read is 'encoded_<prefix>data.h5'.
    Return:
    ------------
    The full contents of the file's 'data' dataset, read into memory.
    '''
    # Read the dataset while the file is still open, then let the context
    # manager close the handle. Previously the file was opened twice and the
    # handle that was actually read from was never closed.
    with h5py.File('encoded_' + prefix + 'data.h5', 'r') as h5_file:
        # To experiment on a subset, slice the columns here instead,
        # e.g. h5_file['data'][:, :10000].
        return h5_file['data'][:, :]
def evaluate_ml(y_true, y_pred, sample_type):
    '''
    This function is used to evaluate the performance of the model.
    Parameters:
    ------------
    y_true: true age (array-like)
    y_pred: predicted age (array-like, same length as y_true)
    sample_type: sample type, 0 for control, 1 for case (array-like, same
        length as y_true)
    Return:
    ------------
    mae: mean of mae_control and mae_case.
    mae_control: mean absolute error of control samples.
    mae_case: mean absolute error of case samples, where the error of an
        over-prediction (y_pred >= y_true) counts at half weight.
    A group with no samples yields NaN for its own MAE, and therefore NaN for
    the overall mae.
    We use MAE to evaluate the performance.
    Please refer to evaluation section in the official website for more details.
    '''
    # Work on plain numpy arrays so that lists are accepted and so that
    # pandas objects are matched by position rather than by index label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    sample_type = np.asarray(sample_type)

    is_control = sample_type == 0
    is_case = sample_type == 1

    control_err = np.abs(y_true[is_control] - y_pred[is_control])
    # An empty group has no defined MAE; report NaN explicitly instead of
    # relying on numpy's empty-mean / divide-by-zero warnings.
    mae_control = np.mean(control_err) if control_err.size else np.nan

    case_true = y_true[is_case]
    case_pred = y_pred[is_case]
    case_err = np.abs(case_true - case_pred)
    over = case_pred >= case_true
    under = case_pred < case_true
    ae_above = np.sum(case_err[over]) / 2
    ae_below = np.sum(case_err[under])
    n_case = case_true.size
    mae_case = (ae_above + ae_below) / n_case if n_case else np.nan

    mae = np.mean([mae_control, mae_case])
    return mae, mae_control, mae_case
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the encoded methylation matrices and the training id-map.
methylation = load_methylation_h5('train')
methylation_test = load_methylation_h5('test')
idmap_train_dir = 'trainmap.csv'
idmap_test_dir = 'testmap.csv'
age, sample_type = load_idmap(idmap_train_dir)
print('Load data done')

# Train/validation split and preprocessing. The split is done on sample
# indices so the same selection can be applied to every per-sample array.
indices = np.arange(len(age))
indices_train, indices_valid, age_train, age_valid = train_test_split(
    indices, age, test_size=0.2, shuffle=True)
methylation_train = methylation[indices_train]
methylation_valid = methylation[indices_valid]
sample_type_train = sample_type[indices_train]
sample_type_valid = sample_type[indices_valid]
feature_size = methylation_train.shape[1]
# The full training matrix is not referenced again after the split.
del methylation

# Convert the data to torch tensors on the selected device.
train_x = torch.tensor(methylation_train, dtype=torch.float32, device=device)
train_y = torch.tensor(age_train, dtype=torch.float32, device=device)
test_x = torch.tensor(methylation_valid, dtype=torch.float32, device=device)
test_y = torch.tensor(age_valid, dtype=torch.float32, device=device)
# Mini-batch loading (currently disabled):
#dataset = torch.utils.data.TensorDataset(train_x, train_y)
#data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)
# Define the Gaussian process model.
class GPRegressionModel(gpytorch.models.ExactGP):
    '''
    Exact GP regression model with a constant mean and a scaled RBF kernel.
    '''

    def __init__(self, train_x, train_y, likelihood):
        '''
        Parameters:
        ------------
        train_x: training inputs, forwarded to ExactGP.
        train_y: training targets, forwarded to ExactGP.
        likelihood: likelihood object, forwarded to ExactGP.
        '''
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        '''
        Return the MultivariateNormal built from the mean module and the
        covariance module evaluated at `x`.
        '''
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x))
likelihood = gpytorch.likelihoods.GaussianLikelihood().to(device)
model = GPRegressionModel(train_x, train_y, likelihood).to(device)  # GP prior

# Put the model and the likelihood into training mode.
model.train()
likelihood.train()

# Adam optimizer over the model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # lr: learning rate
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(device)

# Training settings.
num_epochs = 2
target_loss = 0.5

print('Start training...')
for epoch in range(num_epochs):
    start = time.time()
    optimizer.zero_grad()
    # NOTE: an ExactGP in training mode expects the same inputs it was
    # constructed with. Calling it on a subset (e.g. a mini-batch) raises
    # "RuntimeError: You must train on the training inputs!".
    output = model(train_x)
    loss = -mll(output, train_y)
    loss.backward()
    optimizer.step()
    loss_value = loss.item()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss_value:.4f}', f'Training time: {time.time() - start}s')
    # Stop early once the target loss is reached. The epoch is reported
    # one-based so it matches the progress line above.
    if loss_value <= target_loss:
        print(f"Terminating training at iteration {epoch + 1} as target loss {target_loss} is achieved.")
        break

# Switch to evaluation mode.
model.eval()
likelihood.eval()

# Predict on the validation split.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(test_x))
    age_valid_pred = observed_pred.mean
age_valid_pred = age_valid_pred.cpu().numpy()
# evaluate_ml returns (overall, control, case); unpack it so each value is
# printed under its own label instead of dumping the raw tuple.
mae, mae_control, mae_case = evaluate_ml(
    age_valid, age_valid_pred, sample_type_valid)
print(f'Validation MAE: {mae} (control: {mae_control}, case: {mae_case})')

# Predict on the test set.
pred_x = torch.tensor(methylation_test, dtype=torch.float32).to(device)
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(pred_x))
    age_pred = observed_pred.mean
age_pred = age_pred.cpu().numpy()
# Naive post-processing to ensure age >= 0.
age_pred[age_pred < 0] = 0
age_pred = np.around(age_pred, decimals=2)
age_pred = ['%.2f' % i for i in age_pred]
sample_id = pd.read_csv(idmap_test_dir, sep=',').sample_id
# Note: sample_id in submission should be the same as the order in testmap.csv.
# We do not provide the matching procedure for disordered sample_id in evaluation.
#submission = pd.DataFrame({'sample_id': sample_id, 'age': age_pred})
#submission_file = 'submit7.txt'
#submission.to_csv(submission_file, index=False)
But I have noticed that in each epoch the same data is fed to the model, which I think may cause overfitting, so I want to use mini-batches to train the model. I edited my code as follows.
但我注意到,每个 epoch 都输入了相同的数据,我认为这可能会导致过拟合,所以我想使用小批量(mini-batch)来训练模型。我修改后的代码如下所示。
import numpy as np
import pandas as pd
import h5py
import torch
import gpytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import time
# Integer code for each diagnosis label. The code is the label's position in
# the tuple below, so 'control' is 0 and 'type 2 diabetes' is 9.
disease_mapping = {
    label: code
    for code, label in enumerate((
        'control',
        "Alzheimer's disease",
        "Graves' disease",
        "Huntington's disease",
        "Parkinson's disease",
        'rheumatoid arthritis',
        'schizophrenia',
        "Sjogren's syndrome",
        'stroke',
        'type 2 diabetes',
    ))
}

# Integer code for each sample type.
sample_type_mapping = {
    'control': 0,
    'disease tissue': 1,
}
def load_idmap(idmap_dir):
    '''
    Read the id-map CSV and return the ages and the encoded sample types.
    Parameters:
    ------------
    idmap_dir: path of the comma-separated id-map file; its `age` and
        `sample_type` columns are read.
    Return:
    ------------
    age: the `age` column as a float32 numpy array.
    sample_type: the `sample_type` column (pandas Series) with its values
        replaced according to `sample_type_mapping`.
    '''
    table = pd.read_csv(idmap_dir, sep=',')
    age = table['age'].to_numpy().astype(np.float32)
    sample_type = table['sample_type'].replace(sample_type_mapping)
    return age, sample_type
def load_methylation_h5(prefix):
    '''
    Load methylation data from .h5 file.
    Parameters:
    ------------
    prefix: 'train' or 'test'; the file read is 'encoded_<prefix>data.h5'.
    Return:
    ------------
    The full contents of the file's 'data' dataset, read into memory.
    '''
    # Read the dataset while the file is still open, then let the context
    # manager close the handle. Previously the file was opened twice and the
    # handle that was actually read from was never closed.
    with h5py.File('encoded_' + prefix + 'data.h5', 'r') as h5_file:
        # To experiment on a subset, slice the columns here instead,
        # e.g. h5_file['data'][:, :10000].
        return h5_file['data'][:, :]
def evaluate_ml(y_true, y_pred, sample_type):
    '''
    This function is used to evaluate the performance of the model.
    Parameters:
    ------------
    y_true: true age (array-like)
    y_pred: predicted age (array-like, same length as y_true)
    sample_type: sample type, 0 for control, 1 for case (array-like, same
        length as y_true)
    Return:
    ------------
    mae: mean of mae_control and mae_case.
    mae_control: mean absolute error of control samples.
    mae_case: mean absolute error of case samples, where the error of an
        over-prediction (y_pred >= y_true) counts at half weight.
    A group with no samples yields NaN for its own MAE, and therefore NaN for
    the overall mae.
    We use MAE to evaluate the performance.
    Please refer to evaluation section in the official website for more details.
    '''
    # Work on plain numpy arrays so that lists are accepted and so that
    # pandas objects are matched by position rather than by index label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    sample_type = np.asarray(sample_type)

    is_control = sample_type == 0
    is_case = sample_type == 1

    control_err = np.abs(y_true[is_control] - y_pred[is_control])
    # An empty group has no defined MAE; report NaN explicitly instead of
    # relying on numpy's empty-mean / divide-by-zero warnings.
    mae_control = np.mean(control_err) if control_err.size else np.nan

    case_true = y_true[is_case]
    case_pred = y_pred[is_case]
    case_err = np.abs(case_true - case_pred)
    over = case_pred >= case_true
    under = case_pred < case_true
    ae_above = np.sum(case_err[over]) / 2
    ae_below = np.sum(case_err[under])
    n_case = case_true.size
    mae_case = (ae_above + ae_below) / n_case if n_case else np.nan

    mae = np.mean([mae_control, mae_case])
    return mae, mae_control, mae_case
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the encoded methylation matrices and the training id-map.
methylation = load_methylation_h5('train')
methylation_test = load_methylation_h5('test')
idmap_train_dir = 'trainmap.csv'
idmap_test_dir = 'testmap.csv'
age, sample_type = load_idmap(idmap_train_dir)
print('Load data done')

# Train/validation split and preprocessing. The split is done on sample
# indices so the same selection can be applied to every per-sample array.
indices = np.arange(len(age))
indices_train, indices_valid, age_train, age_valid = train_test_split(
    indices, age, test_size=0.2, shuffle=True)
methylation_train = methylation[indices_train]
methylation_valid = methylation[indices_valid]
sample_type_train = sample_type[indices_train]
sample_type_valid = sample_type[indices_valid]
feature_size = methylation_train.shape[1]
# The full training matrix is not referenced again after the split.
del methylation

# Convert the data to torch tensors on the selected device.
train_x = torch.tensor(methylation_train, dtype=torch.float32, device=device)
train_y = torch.tensor(age_train, dtype=torch.float32, device=device)
test_x = torch.tensor(methylation_valid, dtype=torch.float32, device=device)
test_y = torch.tensor(age_valid, dtype=torch.float32, device=device)
# Mini-batch loading (currently disabled):
#dataset = torch.utils.data.TensorDataset(train_x, train_y)
#data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)
# Define the Gaussian process model.
class GPRegressionModel(gpytorch.models.ExactGP):
    '''
    Exact GP regression model with a constant mean and a scaled RBF kernel.
    '''

    def __init__(self, train_x, train_y, likelihood):
        '''
        Parameters:
        ------------
        train_x: training inputs, forwarded to ExactGP.
        train_y: training targets, forwarded to ExactGP.
        likelihood: likelihood object, forwarded to ExactGP.
        '''
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        '''
        Return the MultivariateNormal built from the mean module and the
        covariance module evaluated at `x`.
        '''
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x))
likelihood = gpytorch.likelihoods.GaussianLikelihood().to(device)
model = GPRegressionModel(train_x, train_y, likelihood).to(device)  # GP prior

# Put the model and the likelihood into training mode.
model.train()
likelihood.train()

# Adam optimizer over the model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # lr: learning rate
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(device)

# Training settings.
num_epochs = 2
target_loss = 0.5

print('Start training...')
for epoch in range(num_epochs):
    start = time.time()
    optimizer.zero_grad()
    # NOTE: an ExactGP in training mode expects the same inputs it was
    # constructed with. Calling it on a subset (e.g. a mini-batch) raises
    # "RuntimeError: You must train on the training inputs!".
    output = model(train_x)
    loss = -mll(output, train_y)
    loss.backward()
    optimizer.step()
    loss_value = loss.item()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss_value:.4f}', f'Training time: {time.time() - start}s')
    # Stop early once the target loss is reached. The epoch is reported
    # one-based so it matches the progress line above.
    if loss_value <= target_loss:
        print(f"Terminating training at iteration {epoch + 1} as target loss {target_loss} is achieved.")
        break

# Switch to evaluation mode.
model.eval()
likelihood.eval()

# Predict on the validation split.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(test_x))
    age_valid_pred = observed_pred.mean
age_valid_pred = age_valid_pred.cpu().numpy()
# evaluate_ml returns (overall, control, case); unpack it so each value is
# printed under its own label instead of dumping the raw tuple.
mae, mae_control, mae_case = evaluate_ml(
    age_valid, age_valid_pred, sample_type_valid)
print(f'Validation MAE: {mae} (control: {mae_control}, case: {mae_case})')

# Predict on the test set.
pred_x = torch.tensor(methylation_test, dtype=torch.float32).to(device)
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(pred_x))
    age_pred = observed_pred.mean
age_pred = age_pred.cpu().numpy()
# Naive post-processing to ensure age >= 0.
age_pred[age_pred < 0] = 0
age_pred = np.around(age_pred, decimals=2)
age_pred = ['%.2f' % i for i in age_pred]
sample_id = pd.read_csv(idmap_test_dir, sep=',').sample_id
# Note: sample_id in submission should be the same as the order in testmap.csv.
# We do not provide the matching procedure for disordered sample_id in evaluation.
#submission = pd.DataFrame({'sample_id': sample_id, 'age': age_pred})
#submission_file = 'submit7.txt'
#submission.to_csv(submission_file, index=False)
This time the program returns an error: "RuntimeError: You must train on the training inputs!", traced back to line 147, "output = model(train_x)".
这一次,程序返回一个错误:"RuntimeError: You must train on the training inputs!",追溯到第147行 "output = model(train_x)"。
Is this method unable to use mini-batches, or is something wrong in my code?
是这个方法不能使用小批量(mini-batch)训练,还是我的代码中有什么地方出错了?
更多回答
我是一名优秀的程序员,十分优秀!