机器学习，兵王问题，支持向量机SVM，交叉验证求C和gamma

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.utils.validation import column_or_1d
import numpy as np
from sklearn.model_selection import GridSearchCV

读取数据

# Load the raw chess-endgame records. The data file has no header row, so
# header=None is required; otherwise pandas consumes the first record as column
# names and that sample is silently lost once the columns are renamed.
original_data = pd.read_csv("krkopt.data", header=None)

增加表头格式化数据

# Name the columns: three (file, rank) pairs followed by the game outcome.
original_data.columns = ["wx", "wy", "wwx", "wwy", "vx", "vy", "outcome"]

# Encode board files a-h as 1-8 with an explicit per-column mapping. This
# replaces a single order-dependent regex dict whose catch-all pattern
# "(?!draw)" also matches inside the string "draw" and only worked because
# earlier substitutions had already turned those cells into integers.
file_to_number = {letter: number for number, letter in enumerate("abcdefgh", start=1)}
for file_column in ("wx", "wwx", "vx"):
    original_data[file_column] = original_data[file_column].map(file_to_number)

# Binary target: 1 for a draw, 0 for every other outcome.
original_data["outcome"] = (original_data["outcome"] == "draw").astype(int)

# head must be called; a bare `original_data.head` only shows the bound method.
original_data.head()

<bound method NDFrame.head of        wx  wy  wwx  wwy  vx  vy  outcome
0       1   1    3    1   3   2        1
1       1   1    3    1   4   1        1
2       1   1    3    1   4   2        1
3       1   1    3    2   3   1        1
4       1   1    3    2   3   3        1
...    ..  ..  ...  ...  ..  ..      ...
28050   2   1    7    7   5   5        0
28051   2   1    7    7   5   6        0
28052   2   1    7    7   5   7        0
28053   2   1    7    7   6   5        0
28054   2   1    7    7   7   5        0

[28055 rows x 7 columns]>

数据归一化

# Standardise the six position features in place, then persist the frame
# (index included) so the following step can reload it from disk.
feature_columns = ['wx', 'wy', 'wwx', 'wwy', 'vx', 'vy']
scaled_features = preprocessing.scale(original_data[feature_columns])
original_data[feature_columns] = scaled_features
original_data.to_csv("krkopt_fill.csv")
original_data.shape

(28055, 7)

切割输入数据和输出数据

# Reload the scaled data and separate model inputs from the target column.
new_original_data = pd.read_csv("krkopt_fill.csv")
input_columns = ['wx', 'wy', 'wwx', 'wwy', 'vx', 'vy']
target_columns = ['outcome']
original_data_x = new_original_data[input_columns]
# Selecting with a list keeps the target as a one-column DataFrame.
original_data_y = new_original_data[target_columns]
original_data_x.head(5)
original_data_y.head(5)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	outcome
0	1
1	1
2	1
3	1
4	1

分割训练数据和测试数据

# Keep 5000 rows for training and hold out the rest for testing; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    original_data_x,
    original_data_y,
    train_size=5000,
    random_state=0,
)
X_train.shape, X_test.shape
y_train.shape

(5000, 1)

将y转化成一维数据

# Flatten the single-column training labels into a 1-D array, the label shape
# the SVM fit call below is given.
y_train = column_or_1d(y_train, warn=False)
y_train.shape

(5000,)

测试训练

# Baseline RBF-kernel SVM with hand-picked hyper-parameters, fitted on the
# training split as a sanity check before any tuning.
baseline_settings = {
    "C": 10,
    "tol": 1e-3,
    "gamma": 0.8,
    "kernel": 'rbf',
    "decision_function_shape": 'ovr',
    "probability": True,
}
clf = svm.SVC(**baseline_settings)
clf.fit(X_train, y_train)

SVC(C=10, gamma=0.8, probability=True)

# Mean accuracy of the baseline model on the held-out test split.
clf.score(X_test, y_test)

0.9888093689004555

首先需要对C和gamma两个参数的取值进行初步搜索：C的取值范围是2^-5～2^15，gamma的取值范围是2^-15～2^3，该范围基于人工经验；对数据进行交叉验证，初步找出识别率最高的C与gamma的组合

# Base-2 exponents of the coarse search grid: C spans 2^-5..2^15 and gamma
# spans 2^-15..2^3, both in steps of 2 in the exponent.
CScale = list(range(-5, 16, 2))
gammaScale = list(range(-15, 4, 2))

# Turn the exponents into the actual candidate values.
C = [2 ** exponent for exponent in CScale]
gamma = [2 ** exponent for exponent in gammaScale]

C, gamma

([0.03125, 0.125, 0.5, 2, 8, 32, 128, 512, 2048, 8192, 32768],
 [3.0517578125e-05,  0.0001220703125,  0.00048828125,  0.001953125,  0.0078125,  0.03125,  0.125,  0.5,  2,  8])

# Coarse search: 5-fold cross-validation over every (C, gamma) pair.
# The configured estimator is passed to GridSearchCV directly instead of being
# assigned to `clf` and then overwritten unused. probability=True is left out
# because the search only reads best_params_ and best_score_.
base_svc = svm.SVC(tol=1e-3, kernel='rbf', decision_function_shape='ovr')

tuned_parameters = {"gamma": gamma, "C": C}
clf = GridSearchCV(base_svc, tuned_parameters, n_jobs=5, cv=5)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print(clf.best_score_)
print()

Best parameters set found on development set:

{'C': 128, 'gamma': 0.125}
0.9942

根据初步找到的{'C': 128, 'gamma': 0.125},进一步精确查找


# Refine around the coarse optimum (C=128, gamma=0.125): each range runs from
# the midpoint with the next-lower coarse value to the midpoint with the
# next-higher one, sampled at 10 evenly spaced points.
c_lower, c_best, c_upper = 32, 128, 512
gamma_lower, gamma_best, gamma_upper = 0.03125, 0.125, 0.5

newC = np.linspace((c_lower + c_best) / 2, (c_best + c_upper) / 2, 10)
newGamma = np.linspace((gamma_lower + gamma_best) / 2, (gamma_best + gamma_upper) / 2, 10)
newC, newGamma

(array([ 80.        , 106.66666667, 133.33333333, 160.        ,
        186.66666667, 213.33333333, 240.        , 266.66666667,
        293.33333333, 320.        ]),
 array([0.078125  , 0.10416667, 0.13020833, 0.15625   , 0.18229167,
        0.20833333, 0.234375  , 0.26041667, 0.28645833, 0.3125    ]))

# Fine search: 5-fold cross-validation over the refined (C, gamma) grid.
# The configured estimator is passed to GridSearchCV directly instead of being
# assigned to `clf` and then overwritten unused. probability=True is left out
# because the search only reads best_params_ and best_score_.
base_svc = svm.SVC(tol=1e-3, kernel='rbf', decision_function_shape='ovr')

tuned_parameters = {"gamma": newGamma, "C": newC}
clf = GridSearchCV(base_svc, tuned_parameters, n_jobs=5, cv=5)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print(clf.best_score_)
print()

Best parameters set found on development set:

{'C': 106.66666666666667, 'gamma': 0.18229166666666669}
0.9945999999999999

此时我们得到了一组相对精确的C和gamma，用这组参数在5000份训练数据上重新训练模型

# Retrain on the training split with the hyper-parameters found by the fine
# grid search.
clf = svm.SVC(
    C=106.66666666666667,
    gamma=0.18229166666666669,
    tol=1e-3,
    kernel='rbf',
    decision_function_shape='ovr',
    probability=True,
)
clf.fit(X_train, y_train)
# score must be called; a bare `clf.score` only shows the bound method.
# Here it reports accuracy on the training split.
clf.score(X_train, y_train)

<bound method ClassifierMixin.score of SVC(C=106.66666666666667, gamma=0.18229166666666669, probability=True)>

由此得到了一个相对较好的模型


# Mean accuracy of the tuned model on the held-out test split.
clf.score(X_test, y_test)

0.9944480589893733