阅读 192

第一次与你见面Kaggle:房价预测

机器学习的过程无非就是获取数据、数据预处理、训练模型、交叉验证、模型选择和最终预测

读取数据集

导包

import d2lzh as d2l
from mxnet import autograd,gluon,init,nd
from mxnet.gluon import data as gdata ,loss as gloss,nn
import numpy as np
import pandas as pd
复制代码

我们的数据是从Kaggle上下载的,下载数据时不需要科学上网,但后来上传提交结果时好像就需要了

解压后读取

# Load the Kaggle house-price CSVs from a local path.
# NOTE(review): hard-coded Windows paths — adjust for your environment.
train_data= pd.read_csv('D:/dL/house_Data/train.csv')
test_data =pd.read_csv('D:/dL/house_Data/test.csv')
复制代码

其中的数据量及维度(80+1,80个特征和1个标签)

查看并分析其中的特征,如下,当然除了id其他都应该有用,同时最后的SalePrice标签在测试集中不存在,所以只要将所有数据的79个特征按样本连结就可以了

all_features = pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
复制代码

数据预处理

连续数值的特征标准化:设某特征在整个数据集上的均值为μ,标准差为σ。
我们用每个值减去μ再除以σ,得到标准化后的每个特征值。
对于缺失的特征值,用该特征的均值代替(标准化后均值为0,故可直接填0)

# Select the numeric (non-object) columns and standardize each to zero mean / unit std.
numeric_features =all_features.dtypes[all_features.dtypes !='object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x:(x-x.mean())/(x.std()))
# After standardization every feature's mean becomes 0, so missing values can be filled with 0.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
复制代码

离散数值转成指示特征。例如某特征取值只有"大"或"小"两种,于是改为 特征_大 = 0 或 1,与 特征_小 = 0 或 1 两组特征值

# One-hot encode the categorical columns; dummy_na=True also adds an indicator column for NaN.
all_features = pd.get_dummies(all_features,dummy_na=True)
all_features.shape
复制代码

原来79个特征,变成了331个

最后,通过values属性得到NumPy格式的数据

# Convert to NDArray: the first n_train rows are the training set, the rest the test set.
n_train=train_data.shape[0]
train_features = nd.array(all_features[:n_train].values)
test_features = nd.array(all_features[n_train:].values)
# Labels reshaped to a column vector (n_train, 1).
train_labels = nd.array(train_data.SalePrice.values).reshape((-1,1))
复制代码

训练模型

定义网络

loss = gloss.L2Loss()

def get_net():
    """Build and initialize a single-output linear-regression network."""
    model = nn.Sequential()
    model.add(nn.Dense(1))
    model.initialize()
    return model
复制代码

对数均方根误差的实现

def log_rmse(net,features,labels):
    """Return the RMSE between log-predictions and log-labels (the Kaggle metric)."""
    clipped_preds=nd.clip(net(features),1,float('inf'))# clip predictions below 1 up to 1 (no upper bound) so log() stays stable
    rmse = nd.sqrt(2*loss(clipped_preds.log(),labels.log()).mean())  # factor 2 cancels L2Loss's built-in 1/2
    return rmse.asscalar()
复制代码

numpy.clip(a, a_min, a_max, out=None)[source]

其中a是一个数组,后面两个参数分别表示最小和最大值

import numpy as np
x=np.array([1,2,3,5,6,7,8,9])
np.clip(x,3,8)
Out[88]:
array([3, 3, 3, 5, 6, 7, 8, 8])#1,2 变3,,,9变8
复制代码

用Adam优化算法(它对学习率相对不敏感)

def train(net,train_features,train_labels,test_features,test_labels,num_epochs,learning_rate,weight_decay,batch_size):
          """Train `net` with Adam; return per-epoch log-RMSE lists (train_ls, test_ls).

          When test_labels is None, test_ls stays empty (used for the final fit
          on the whole training set).
          """
          train_ls, test_ls=[] ,[]
          train_iter =gdata.DataLoader(gdata.ArrayDataset(
          train_features,train_labels),batch_size,shuffle=True)
          # Adam is relatively insensitive to the learning rate; wd adds L2 regularization.
          trainer =gluon.Trainer(net.collect_params(),'adam',{'learning_rate':learning_rate,'wd':weight_decay})
          for epoch in range(num_epochs):
            for X,y in train_iter:
                with autograd.record():
                    l = loss(net(X),y)
                l.backward()
                trainer.step(batch_size)
            train_ls.append(log_rmse(net,train_features,train_labels))
            if test_labels is not None:
                test_ls.append(log_rmse(net,test_features,test_labels))
          return train_ls,test_ls
          
复制代码

k折交叉验证

def get_k_fold_data(k,i,X,y):
    """Split (X, y) into k equal folds; fold i is validation, the rest training.

    Returns (X_train, y_train, X_valid, y_valid). Rows beyond k * fold_size
    (when the sample count is not divisible by k) are dropped.
    """
    assert k>1
    fold_size = X.shape[0] // k
    X_valid, y_valid = None, None
    X_chunks, y_chunks = [], []
    for fold in range(k):
        sel = slice(fold * fold_size, (fold + 1) * fold_size)
        if fold == i:
            # This fold is held out for validation.
            X_valid, y_valid = X[sel, :], y[sel]
        else:
            X_chunks.append(X[sel, :])
            y_chunks.append(y[sel])
    # Concatenate the remaining folds into the training set.
    X_train, y_train = X_chunks[0], y_chunks[0]
    for X_part, y_part in zip(X_chunks[1:], y_chunks[1:]):
        X_train = nd.concat(X_train, X_part, dim=0)
        y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid
复制代码

训练k次并返回训练和验证的平均误差

def k_fold(k,X_train,y_train,num_epochs,learning_rate,weight_decay,batch_size):
    """Run k-fold cross-validation; return the mean final train / validation log-RMSE."""
    train_l_sum,valid_l_sum =0,0
    for i in range(k):
        data = get_k_fold_data(k,i,X_train,y_train)
        net =get_net()  # fresh, re-initialized parameters for every fold
        train_ls,valid_ls = train(net,*data,num_epochs,learning_rate,weight_decay,batch_size)
        # Accumulate only the last-epoch error of each fold.
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        # Plot learning curves for the first fold only.
        if i ==0:
            d2l.semilogy(range(1,num_epochs+1),train_ls,'epochs','rmse',
                        range(1,num_epochs+1),valid_ls,
                        ['train','valid'])
        print('fold % d ,train rmse %f,valid rmse %f'%(i,train_ls[-1],valid_ls[-1]))

    return train_l_sum / k,valid_l_sum/k
复制代码

模型选择

# Hyperparameters: 5 folds, 100 epochs, learning rate 5, no weight decay, batch size 64.
k,num_epochs,lr,weight_decay,batch_size = 5,100,5,0,64
train_l,valid_l = k_fold(k,train_features,train_labels,num_epochs,lr,weight_decay,batch_size)
print('%d-fold validation:avg train rmse%f,avg valid rmse %f' % (k,train_l,valid_l))
复制代码

预测

def train_and_pred(train_features,test_features,train_labels,test_data,num_epochs,lr,weight_decay,batch_size):
    """Retrain on the full training set, predict the test set, and write submission.csv."""
    net = get_net()
    # No held-out set here: test_features/test_labels are passed as None, None.
    train_ls,_= train(net,train_features,train_labels,None,None,num_epochs,lr,weight_decay,batch_size)
    d2l.semilogy(range(1,num_epochs+1),train_ls,'epochs','rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).asnumpy()
    # Flatten the (n, 1) prediction column into a Series and attach it to test_data.
    test_data['SalePrice']=pd.Series(preds.reshape(1,-1)[0])
    submission = pd.concat([test_data['Id'],test_data['SalePrice']],axis=1)
    submission.to_csv('submission.csv',index=False)
    
复制代码

train_and_pred(train_features,test_features,train_labels,test_data,num_epochs,lr,weight_decay,batch_size)
复制代码