首页 算法

For a detailed description of the task and the datasets, including the report and the source code, please see the GitHub repository.

Utility Function Definitions

# import useful packages
import time
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.io import loadmat
def generate_k_folders(dataset, k):
    """
    Split the training data into k contiguous folds for cross-validation.

    Fold i uses samples [i*s, (i+1)*s) as the validation set, where
    s = int(n / k) and n is the number of training samples; the last fold
    runs to the end of the data, so it also takes the remainder when n is
    not divisible by k. The test set is passed through untouched.

    Input: dataset (dict with "x_train", "y_train", "x_test", "y_test") and k
    Output: a list of k dictionaries, each with the keys
            "x_train", "y_train", "x_val", "y_val", "x_test", "y_test"
    """
    features, labels = dataset["x_train"], dataset["y_train"]
    held_out = {"x_test": dataset["x_test"], "y_test": dataset["y_test"]}

    folds = []
    for fold_idx in range(k):
        start = fold_idx * int(features.shape[0] / k)

        if fold_idx == k - 1:
            # Last fold: validation reaches the end of the data, so the
            # training part is simply everything in front of it.
            split = {
                "x_train": features[:start],
                "y_train": labels[:start],
                "x_val": features[start:],
                "y_val": labels[start:],
            }
        else:
            stop = start + int(features.shape[0] / k)
            # Training part = samples before the fold + samples after it.
            split = {
                "x_train": torch.cat((features[:start], features[stop:]), dim=0),
                "y_train": torch.cat((labels[:start], labels[stop:])),
                "x_val": features[start:stop],
                "y_val": labels[start:stop],
            }

        folds.append({**split, **held_out})

    return folds
def unzip_dataset(dataset):
    """
    Unpack a fold dictionary into its six components.

    Input: dataset (dict with the keys "x_train", "y_train", "x_val",
           "y_val", "x_test" and "y_test")
    Output: the tuple (x_train, y_train, x_val, y_val, x_test, y_test)
    """
    ordered_keys = ("x_train", "y_train", "x_val", "y_val", "x_test", "y_test")
    return tuple(dataset[key] for key in ordered_keys)
def confusion_mat_evaluate(y_test, y_pred):
    """
    Score predicted labels against the ground truth with scikit-learn.

    Precision, recall and F1 are macro-averaged over the classes.

    Input: the truth label y_test and the predicted label y_pred
    Output: accuracy, precision, recall, f1
    """
    macro_scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision, recall, f1 = (
        scorer(y_test, y_pred, average="macro") for scorer in macro_scorers
    )

    return accuracy, precision, recall, f1
def read_bi_data(dataset, device="cuda"):
    """
    Read binary-class dataset

    Converts the four arrays of a loaded binary-class dataset into float32
    tensors on the requested device.

    Input: dataset - mapping (e.g. the object returned by np.load on one of
                     the .npz files) providing the numpy arrays 'train_X',
                     'train_Y', 'test_X' and 'test_Y'
           device  - where to place the tensors; defaults to "cuda", which
                     matches the previously hard-coded .cuda() call. Pass
                     "cpu" to run without a GPU.
    Output: a dict with the keys 'x_train', 'y_train', 'x_test' and 'y_test'
    """
    def _as_float_tensor(key):
        # One conversion path for all four arrays: numpy -> float32 tensor
        # on the target device.
        return torch.from_numpy(dataset[key]).to(device=device, dtype=torch.float32)

    return {
        'x_train': _as_float_tensor('train_X'),
        'y_train': _as_float_tensor('train_Y'),
        'x_test': _as_float_tensor('test_X'),
        'y_test': _as_float_tensor('test_Y'),
    }
def read_multi_data(data_dir="datasets/multi-class", device="cuda"):
    """
    Read multi-class dataset

    Loads "<data_dir>/train_images.mat", "train_labels.mat",
    "test_images.mat" and "test_labels.mat"; each file is expected to hold
    one variable named after the file. Images become float32 tensors and
    labels become int64 tensors on the requested device.

    Input: data_dir - directory containing the four .mat files; defaults to
                      the previously hard-coded "datasets/multi-class"
           device   - where to place the tensors; defaults to "cuda", which
                      matches the previously hard-coded .cuda() call
    Output: a dict with the keys 'x_train', 'y_train', 'x_test' and 'y_test'
    """
    def _load(name, dtype):
        # The variable stored inside each .mat file shares the file's name.
        array = loadmat("%s/%s.mat" % (data_dir, name))[name]
        return torch.from_numpy(array).to(device=device, dtype=dtype)

    def _flatten_labels(labels):
        # Transpose, then drop a trailing singleton axis, so that a (1, N)
        # label row becomes a 1-D tensor of length N.
        # NOTE(review): assumes labels are stored as a single row - confirm.
        return labels.t().squeeze(dim=-1)

    x_train = _load("train_images", torch.float32)
    y_train = _flatten_labels(_load("train_labels", torch.int64))
    x_test = _load("test_images", torch.float32)
    y_test = _flatten_labels(_load("test_labels", torch.int64))

    return {
        'x_train': x_train,
        'y_train': y_train,
        'x_test': x_test,
        'y_test': y_test,
    }

Train PyTorch NN on Five Classification Data Sets

# Load the five binary-classification datasets from their .npz archives.
# read_bi_data() later reads the 'train_X', 'train_Y', 'test_X' and 'test_Y'
# entries of whichever archive it is given.
breast_cancer_data = np.load("datasets/bi-class/breast-cancer.npz")
diabetes_data = np.load("datasets/bi-class/diabetes.npz")
digit_data = np.load("datasets/bi-class/digit.npz")
iris_data = np.load("datasets/bi-class/iris.npz")
wine_data = np.load("datasets/bi-class/wine.npz")
def train_bi_nn_model(dataset, H_list, device, learning_rate=1e-2, iteration=5000):
    """
    Train a one-hidden-layer binary classifier for every hidden size in H_list.

    For each H a Linear(D_in, H) -> ReLU -> Linear(H, 1) -> Sigmoid network is
    trained with full-batch SGD and binary cross-entropy. Validation accuracy
    is measured at every iteration (before that iteration's update) and the
    weights with the highest validation accuracy are kept; those weights are
    then evaluated on the test set. A loss curve and a validation-accuracy
    curve are plotted for each H.

    Input: dataset       - fold dict as accepted by unzip_dataset
           H_list        - hidden-layer sizes to try
           device        - torch device the model is moved to (the data
                           tensors are assumed to live there already)
           learning_rate - SGD learning rate
           iteration     - number of full-batch training steps
    Output: numpy array of shape (len(H_list), 8); row i holds
            [H, best_accuracy_val, accuracy_test, auc, precision, recall,
             f1, training_time] for H_list[i]
    """
    x_train, y_train, x_val, y_val, x_test, y_test = unzip_dataset(dataset)

    D_in = x_train.shape[1]
    D_out = 1

    # Binary Cross Entropy Loss
    loss_fn = torch.nn.BCELoss()

    # The result table
    # Each row preserves the related result of corresponding H
    # 8 means we have 8 items to save => [H, best_accuracy_val, accuracy_test, auc, precision, recall, f1, training_time]
    res_table = np.zeros((len(H_list), 8))

    for res_table_ind, H in enumerate(H_list):

        model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out),
            torch.nn.Sigmoid()
        ).to(device)

        loss_history = []
        accuracy_val = []
        best_accuracy_val = 0.0
        best_iteration = 0
        # Snapshot (copy) of the best weights. Keeping a reference to `model`
        # would not work: the optimizer keeps updating it in place, so the
        # "best" model would silently become the final model.
        best_state = None

        # SGD optimizer
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        t0 = time.time()
        for t in range(iteration):
            y_train_pred = model(x_train).squeeze(dim=-1) # squeeze 2D of shape(x,1) to 1D of shape(x,)
            loss = loss_fn(y_train_pred, y_train)
            loss_history.append(loss.item())

            # Validation is evaluation only, so no autograd graph is needed.
            # Probabilities >= 0.5 map to label 1.0, everything else to 0.0.
            with torch.no_grad():
                y_val_label = (model(x_val).squeeze(dim=-1) >= 0.5).float()

            # Calculate accuracy on the validation set
            correct = (y_val_label == y_val).sum().item()
            accuracy = correct / y_val.shape[0]
            accuracy_val.append(accuracy)

            # Save the best weights and best accuracy. The first iteration
            # always records a snapshot, so one exists even if the validation
            # accuracy never rises above 0.
            if best_state is None or accuracy > best_accuracy_val:
                best_accuracy_val = accuracy
                best_iteration = t
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }

            if t % 1000 == 0:
                print("iteration: %s/%s" % (t, iteration))

            optimizer.zero_grad()

            # Backward pass
            loss.backward()

            # Update parameters
            optimizer.step()

        # Compute training time (includes the per-iteration validation pass)
        t1 = time.time()
        print("training time = %s(s)" % (t1 - t0))

        # Restore the best weights and predict on the testing dataset
        if best_state is not None:
            model.load_state_dict(best_state)
        with torch.no_grad():
            y_test_pred = model(x_test).squeeze(dim=-1)

        # Transfer to numpy arrays for scikit-learn
        yt = y_test.detach().cpu().numpy()
        yp = y_test_pred.detach().cpu().numpy()

        # Compute AUC (Area Under ROC Curve) from the raw probabilities
        auc = metrics.roc_auc_score(yt, yp)

        # Compute accuracy, precision, recall and f1 from hard labels
        yp[yp >= 0.5] = 1.0 # probability >= 0.5 attached to positive label
        yp[yp < 0.5] = 0.0 # probability < 0.5 attached to negative label
        accuracy_test, precision, recall, f1 = confusion_mat_evaluate(yt, yp)

        # Save all the results in res_table
        res_table[res_table_ind] = np.array([H, best_accuracy_val, accuracy_test, auc, precision, recall, f1, t1-t0])

        # Output the result
        print("The best model: iteration = %s" % (best_iteration))
        print("On validation dataset: accuracy = %s" % (best_accuracy_val))
        print("On testing dataset: accuracy = %s, auc = %s, precision = %s, recall = %s, f1 = %s" \
              % (accuracy_test, auc, precision, recall, f1))

        # Plot the loss curve and validation accuracy curve
        fig = plt.figure(figsize=(15,4))
        ax1 = plt.subplot(1,2,1)
        ax2 = plt.subplot(1,2,2)
        plt.sca(ax1)
        plt.title("H = %s" % H)
        plt.xlabel("iteration")
        plt.ylabel("loss")
        plt.plot(range(iteration), loss_history)
        plt.sca(ax2)
        plt.title("H = %s" % H)
        plt.xlabel("iteration")
        plt.ylabel("accuracy_val")
        plt.plot(range(iteration), accuracy_val)
        plt.show()

    # Return the result tables
    return res_table
def framework_run(data, device, k, H_list, lr, iteration):
    """
    Run k-fold cross-validation of the binary NN on one dataset.

    Input: data      - loaded binary-class dataset, as accepted by read_bi_data
           device    - torch device passed on to train_bi_nn_model
           k         - number of folds
           H_list    - hidden-layer sizes to try
           lr        - SGD learning rate
           iteration - number of training iterations per model
    Output: numpy array of shape (len(H_list), 8) holding the per-H results
            [H, best_accuracy_val, accuracy_test, auc, precision, recall,
             f1, training_time] averaged over the k folds and rounded to
            3 decimals
    """
    # Generate K-folders from the dataset that was passed in. (This used to
    # read the global breast_cancer_data regardless of `data`, so every call
    # evaluated the breast-cancer set.)
    k_folders = generate_k_folders(read_bi_data(data), k)

    # The result table
    # Each row preserves the related result of corresponding H
    # 8 means we have 8 items to save => [H, best_accuracy_val, accuracy_test, auc, precision, recall, f1, training_time]
    res_table = np.zeros((len(H_list), 8))

    # Traverse all folds
    for index, fold in enumerate(k_folders):
        print("K-Folder index = %s" % index)
        # Sum up the result table
        res_table += train_bi_nn_model(fold, H_list, device, lr, iteration)

    # Return the average table
    return np.round(res_table / k, decimals=3)

Testing part

If this part runs successfully, it means that all the functions work well.

# Smoke test: a short 5-fold run (two hidden sizes, 500 iterations) on the
# breast-cancer data to check that the pipeline executes end to end.
# `device` is reused by all the experiment calls below.
device = torch.device('cuda')
framework_run(breast_cancer_data, device, k=5, H_list=[5,6], lr=5e-3, iteration=500)

Training on five binary classification data sets

# Breast-cancer data (10 features per the author's note): 5-fold sweep over
# H = 1..7 with lr = 1e-2; the author reports H* = 3 as the chosen size.
framework_run(breast_cancer_data, device, k=5, H_list=[1,2,3,4,5,6,7], lr=1e-2, iteration=2500)
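| H_breastNN | Val_Accuracy | Test_Accuracy | AUC | Precision | Recall | F1 | Training Time |
|---|---|---|---|---|---|---|---|
| 1 | 0.892 | 0.901 | 0.892 | 0.838 | 0.866 | 0.848 | 9.782(s) |
| 2 | 0.973 | 0.963 | 0.997 | 0.962 | 0.957 | 0.96 | 9.643(s) |
| 3 | 0.973 | 0.969 | 0.997 | 0.967 | 0.966 | 0.966 | 9.4(s) |
| 4 | 0.965 | 0.966 | 0.998 | 0.964 | 0.962 | 0.963 | 10.327(s) |
| 5 | 0.963 | 0.968 | 0.997 | 0.965 | 0.964 | 0.965 | 11.125(s) |
| 6 | 0.965 | 0.968 | 0.997 | 0.965 | 0.964 | 0.965 | 11.011(s) |
| 7 | 0.973 | 0.968 | 0.998 | 0.965 | 0.964 | 0.965 | 10.401(s) |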
# Digit data (64 features per the author's note): 5-fold sweep over
# H = 5..10 with lr = 2e-3; the author reports H* = 10 as the chosen size.
framework_run(digit_data, device, k=5, H_list=[5,6,7,8,9,10], lr=2e-3, iteration=2500)
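| H_digitNN | Val_Accuracy | Test_Accuracy | AUC | Precision | Recall | F1 | Training Time |
|---|---|---|---|---|---|---|---|
| 5 | 0.914 | 0.859 | 0.987 | 0.818 | 0.8 | 0.793 | 9.569(s) |
| 6 | 0.947 | 0.931 | 0.998 | 0.951 | 0.903 | 0.92 | 9.327(s) |
| 7 | 0.927 | 0.935 | 0.997 | 0.956 | 0.908 | 0.924 | 9.11(s) |
| 8 | 0.941 | 0.929 | 0.998 | 0.951 | 0.9 | 0.918 | 11.058(s) |
| 9 | 0.919 | 0.922 | 0.997 | 0.948 | 0.89 | 0.907 | 11.388(s) |
| 10 | 0.956 | 0.94 | 0.997 | 0.958 | 0.915 | 0.93 | 10.615(s) |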
# Diabetes data (8 features per the author's note): 5-fold sweep over
# H = 1..8 with lr = 5e-2; the author reports H* = 4 as the chosen size.
framework_run(diabetes_data, device, k=5, H_list=[1,2,3,4,5,6,7,8], lr=5e-2, iteration=2500)
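| H_diabetesNN | Val_Accuracy | Test_Accuracy | AUC | Precision | Recall | F1 | Training Time |
|---|---|---|---|---|---|---|---|
| 1 | 0.926 | 0.9 | 0.897 | 0.833 | 0.867 | 0.846 | 9.412(s) |
| 2 | 0.973 | 0.965 | 0.997 | 0.961 | 0.961 | 0.961 | 9.183(s) |
| 3 | 0.967 | 0.968 | 0.997 | 0.964 | 0.966 | 0.965 | 9.584(s) |
| 4 | 0.971 | 0.969 | 0.997 | 0.966 | 0.967 | 0.966 | 9.897(s) |
| 5 | 0.973 | 0.968 | 0.997 | 0.965 | 0.965 | 0.965 | 9.909(s) |
| 6 | 0.976 | 0.965 | 0.997 | 0.961 | 0.962 | 0.961 | 10.188(s) |
| 7 | 0.971 | 0.966 | 0.997 | 0.963 | 0.963 | 0.963 | 10.127(s) |
| 8 | 0.971 | 0.968 | 0.997 | 0.963 | 0.966 | 0.965 | 9.489(s) |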
# Iris data (4 features per the author's note): 5-fold sweep over
# H = 1..4 with lr = 1e-2; the author reports H* = 3 as the chosen size.
framework_run(iris_data, device, k=5, H_list=[1,2,3,4], lr=1e-2, iteration=2500)
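| H_irisNN | Val_Accuracy | Test_Accuracy | AUC | Precision | Recall | F1 | Training Time |
|---|---|---|---|---|---|---|---|
| 1 | 0.886 | 0.903 | 0.891 | 0.837 | 0.87 | 0.85 | 9.465(s) |
| 2 | 0.962 | 0.965 | 0.997 | 0.963 | 0.959 | 0.961 | 9.192(s) |
| 3 | 0.963 | 0.966 | 0.997 | 0.964 | 0.962 | 0.963 | 9.107(s) |
| 4 | 0.965 | 0.963 | 0.998 | 0.962 | 0.957 | 0.96 | 9.198(s) |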
# Wine data (13 features per the author's note): 5-fold sweep over
# H = 1..10 with lr = 1e-3; the author reports H* = 6 as the chosen size.
framework_run(wine_data, device, k=5, H_list=list(range(1,11)), lr=1e-3, iteration=2500)
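| H_wineNN | Val_Accuracy | Test_Accuracy | AUC | Precision | Recall | F1 | Training Time |
|---|---|---|---|---|---|---|---|
| 1 | 0.715 | 0.709 | 0.827 | 0.643 | 0.674 | 0.6 | 9.564(s) |
| 2 | 0.822 | 0.781 | 0.979 | 0.678 | 0.691 | 0.66 | 9.17(s) |
| 3 | 0.758 | 0.744 | 0.869 | 0.662 | 0.638 | 0.601 | 9.814(s) |
| 4 | 0.864 | 0.841 | 0.979 | 0.901 | 0.778 | 0.791 | 10.202(s) |
| 5 | 0.734 | 0.751 | 0.929 | 0.668 | 0.648 | 0.604 | 9.864(s) |
| 6 | 0.861 | 0.834 | 0.997 | 0.902 | 0.765 | 0.773 | 9.523(s) |
| 7 | 0.824 | 0.807 | 0.964 | 0.787 | 0.728 | 0.723 | 9.795(s) |
| 8 | 0.87 | 0.815 | 0.996 | 0.892 | 0.737 | 0.748 | 9.733(s) |
| 9 | 0.83 | 0.803 | 0.997 | 0.886 | 0.721 | 0.73 | 9.45(s) |
| 10 | 0.813 | 0.799 | 0.997 | 0.885 | 0.715 | 0.716 | 10.286(s) |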

Train Pytorch NN for Multi-class Data Sets

def train_multi_nn_model(dataset, L1_list, L2_list, device, learning_rate=1e-2, iteration=5000):
    """
    Train a two-hidden-layer 10-class classifier for every (L1, L2) pair.

    For each combination a Linear(D_in, L1) -> ReLU -> Linear(L1, L2) -> ReLU
    -> Linear(L2, 10) network is trained with full-batch SGD and cross-entropy
    loss. Validation accuracy is measured at every iteration (before that
    iteration's update) and the weights with the highest validation accuracy
    are kept; those weights are then evaluated on the test set. A loss curve
    and a validation-accuracy curve are plotted for each combination.

    Input: dataset       - fold dict as accepted by unzip_dataset
           L1_list       - sizes to try for the first hidden layer
           L2_list       - sizes to try for the second hidden layer
           device        - torch device the model is moved to (the data
                           tensors are assumed to live there already)
           learning_rate - SGD learning rate
           iteration     - number of full-batch training steps
    Output: numpy array of shape (len(L1_list) * len(L2_list), 5); each row
            holds [L1, L2, best_accuracy_val, accuracy_test, training_time]
    """
    x_train, y_train, x_val, y_val, x_test, y_test = unzip_dataset(dataset)

    # Input width is taken from the data (784 for the original image set)
    D_in = x_train.shape[-1]

    # Cross Entropy Loss: combines nn.LogSoftmax() and nn.NLLLoss()
    loss_fn = torch.nn.CrossEntropyLoss()

    # The result table
    # Each row preserves the related result of corresponding combination of L1 and L2
    # 5 means we have 5 items to save => [L1, L2, best_accuracy_val, accuracy_test, training_time]
    res_table = np.zeros((len(L1_list)*len(L2_list),5))
    res_table_ind = 0

    for L1 in L1_list:
        for L2 in L2_list:
            t0 = time.time()
            model = torch.nn.Sequential(
                torch.nn.Linear(D_in, L1), # hidden layer1 = L1
                torch.nn.ReLU(),
                torch.nn.Linear(L1, L2), # hidden layer2 = L2
                torch.nn.ReLU(),
                torch.nn.Linear(L2, 10), # output logits for 10 classes
            ).to(device)

            loss_history = []
            accuracy_val = []
            best_accuracy_val = 0.0
            # Snapshot (copy) of the best weights. Keeping a reference to
            # `model` would not work: the optimizer keeps updating it in
            # place, so the "best" model would become the final model.
            best_state = None

            # SGD optimizer
            optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

            for t in range(iteration):
                # Logits of shape (num_samples, 10); CrossEntropyLoss takes
                # them directly together with the integer class labels.
                y_train_pred = model(x_train)
                loss = loss_fn(y_train_pred, y_train)
                loss_history.append(loss.item())

                # Choose the max logit index as the prediction class.
                # Evaluation only, so no autograd graph is needed.
                with torch.no_grad():
                    y_val_pred_label = torch.argmax(model(x_val), dim=1)

                # Calculate accuracy on the validation set
                correct = (y_val_pred_label == y_val).sum().item()
                accuracy = correct / y_val.shape[0]
                accuracy_val.append(accuracy)

                # Save the best weights and best accuracy. The first
                # iteration always records a snapshot, so one exists even if
                # the validation accuracy never rises above 0.
                if best_state is None or accuracy > best_accuracy_val:
                    best_accuracy_val = accuracy
                    best_state = {
                        name: tensor.detach().clone()
                        for name, tensor in model.state_dict().items()
                    }

                if t % 200 == 0:
                    print("iteration: %s/%s" % (t, iteration))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Compute training time (includes model construction and the
            # per-iteration validation pass)
            t1 = time.time()
            print("training time = %s(s)" % (t1 - t0))

            # Restore the best weights and predict on the testing dataset
            if best_state is not None:
                model.load_state_dict(best_state)
            with torch.no_grad():
                y_test_pred_label = torch.argmax(model(x_test), dim=1)

            # Calculate accuracy on the testing dataset; the denominator is
            # the test-set size (it used to be the validation-set size).
            correct = (y_test_pred_label == y_test).sum().item()
            accuracy_test = correct / y_test.shape[0]

            # Save in result table
            res_table[res_table_ind] = np.array([L1, L2, best_accuracy_val, accuracy_test, t1-t0])
            res_table_ind += 1

            print("best_accuracy_val = %s, accuracy_test = %s" % (best_accuracy_val, accuracy_test))

            # Plot the loss curve and validation accuracy curve
            fig = plt.figure(figsize=(15,4))
            ax1 = plt.subplot(1,2,1)
            ax2 = plt.subplot(1,2,2)
            plt.sca(ax1)
            plt.title("L1 = %s, L2 = %s" % (L1, L2))
            plt.xlabel("iteration")
            plt.ylabel("loss")
            plt.plot(range(iteration), loss_history)
            plt.sca(ax2)
            plt.title("L1 = %s, L2 = %s" % (L1, L2))
            plt.xlabel("iteration")
            plt.ylabel("accuracy_val")
            plt.plot(range(iteration), accuracy_val)
            fig.tight_layout(pad=0.4, w_pad=3.0, h_pad=3.0)
            plt.show()

    # Return the result tables
    return res_table
# Build the 5 cross-validation folds from the multi-class dataset
k_folders = generate_k_folders(read_multi_data(), 5)

# Candidate sizes for the first and second hidden layer
L1_list = [50, 75, 100]
L2_list = [10, 15, 20]

# Accumulator for the per-fold result tables: one row per (L1, L2) pair,
# columns = [L1, L2, best_accuracy_val, accuracy_test, training_time]
res_table = np.zeros((len(L1_list) * len(L2_list), 5))

for index, fold in enumerate(k_folders):
    print("K-Folder index = %s" % index)
    # Add this fold's table to the running sum
    res_table += train_multi_nn_model(
        fold, L1_list, L2_list, device, learning_rate=1e-4, iteration=1000
    )

# Average over the 5 folds (last expression, so it is shown as the cell output)
np.round(res_table / 5, decimals=3)
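| L1 | L2 | Val Accuracy | Test Accuracy | Training Time |
|---|---|---|---|---|
| 50 | 10 | 0.454 | 0.221 | 11.726(s) |
| 50 | 15 | 0.601 | 0.298 | 11.976(s) |
| 50 | 20 | 0.704 | 0.352 | 12.125(s) |
| 75 | 10 | 0.413 | 0.208 | 12.466(s) |
| 75 | 15 | 0.615 | 0.304 | 12.493(s) |
| 75 | 20 | 0.691 | 0.346 | 12.867(s) |
| 100 | 10 | 0.529 | 0.256 | 13.829(s) |
| 100 | 15 | 0.585 | 0.282 | 13.956(s) |
| 100 | 20 | 0.651 | 0.32 | 13.946(s) |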



文章评论

captcha