modified the file path in svm-python3.6.py
This commit is contained in:
commit
2cd68ce77b
|
@ -47,22 +47,6 @@
|
|||
|
||||
* 数据集下载地址:<https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data>
|
||||
|
||||
```python
|
||||
# 导入相关数据包
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
%matplotlib inline
|
||||
```
|
||||
|
||||
### 特征说明
|
||||
|
||||
## 一. 数据分析
|
||||
|
||||
### 数据下载和加载
|
||||
|
||||
|
||||
```python
|
||||
# 导入相关数据包
|
||||
import numpy as np
|
||||
|
@ -1308,7 +1292,6 @@ train_corr
|
|||
</div>
|
||||
|
||||
|
||||
|
||||
> 所有特征相关度分析
|
||||
|
||||
|
||||
|
@ -1346,9 +1329,6 @@ plt.show()
|
|||

|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
1. GarageCars 和 GarageArea 相关性很高,就像双胞胎一样,所以我们只需要其中的一个变量,例如:GarageCars。
2. TotalBsmtSF 和 1stFlrSF 与上述情况相同,我们选择 TotalBsmtSF。
3. GarageArea 和 TotRmsAbvGrd 与上述情况相同,我们选择 GarageArea。
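下面给出一个简单的示意代码(假设 train 为已加载的训练集 DataFrame,阈值 0.8 只是示意),用来自动列出相关性过高的特征对:

```python
import numpy as np

# 计算特征间相关系数的绝对值,只保留上三角,避免 (A, B) 和 (B, A) 重复
corr = train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
pairs = upper.stack().sort_values(ascending=False)
# 输出相关系数超过 0.8 的特征对,例如 GarageCars/GarageArea、TotalBsmtSF/1stFlrSF
print(pairs[pairs > 0.8])
```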
|
||||
|
||||
|
||||
|
@ -1367,7 +1347,6 @@ plt.show();
|
|||

|
||||
|
||||
|
||||
|
||||
```python
|
||||
train[['SalePrice', 'OverallQual', 'GrLivArea','GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']].info()
|
||||
```
|
||||
|
@ -1385,9 +1364,13 @@ train[['SalePrice', 'OverallQual', 'GrLivArea','GarageCars', 'TotalBsmtSF', 'Ful
|
|||
dtypes: int64(7)
|
||||
memory usage: 79.9 KB
|
||||
|
||||
|
||||
## 二. 特征工程
|
||||
|
||||
```python
|
||||
test['SalePrice'] = None
|
||||
train_test = pd.concat((train, test)).reset_index(drop=True)
|
||||
```
|
||||
|
||||
### 1. 缺失值分析
|
||||
|
||||
2. 根据业务,常识,以及第二步的数据分析构造特征工程.
|
||||
|
@ -1395,10 +1378,12 @@ train[['SalePrice', 'OverallQual', 'GrLivArea','GarageCars', 'TotalBsmtSF', 'Ful
|
|||
|
||||
|
||||
```python
|
||||
total= train.isnull().sum().sort_values(ascending=False)
|
||||
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
|
||||
total= train_test.isnull().sum().sort_values(ascending=False)
|
||||
percent = (train_test.isnull().sum()/train_test.isnull().count()).sort_values(ascending=False)
|
||||
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Lost Percent'])
|
||||
missing_data.head(20)
|
||||
|
||||
print(missing_data[missing_data.isnull().values==False].sort_values('Total', axis=0, ascending=False).head(20))
|
||||
|
||||
|
||||
'''
|
||||
1. 对于缺失率过高的特征,例如 超过15% 我们应该删掉相关变量且假设该变量并不存在
|
||||
|
@ -1408,23 +1393,19 @@ missing_data.head(20)
|
|||
```
|
||||
|
||||
|
||||
|
||||
|
||||
1. 对于缺失率过高的特征(例如超过 15%),我们应该删掉相关变量,并假设该变量并不存在
2. GarageX 变量群的缺失数据量和概率都相同,可以选择一个就行,例如:GarageCars
3. 对于缺失数据在 5% 左右(缺失率低),可以直接删除/回归预测
|
||||
|
||||
|
||||
|
||||
|
||||
```python
|
||||
train= train.drop((missing_data[missing_data['Total'] > 1]).index, axis=1)
|
||||
train= train.drop(train.loc[train['Electrical'].isnull()].index)
|
||||
train.isnull().sum().max() #justchecking that there's no missing data missing
|
||||
train_test = train_test.drop((missing_data[missing_data['Total'] > 1]).index.drop('SalePrice') , axis=1)
|
||||
# train_test = train_test.drop(train.loc[train['Electrical'].isnull()].index)
|
||||
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
print(tmp.isnull().sum().max()) # just checking that there's no missing data left
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
0
|
||||
1
|
||||
|
||||
|
||||
|
||||
|
@ -1461,13 +1442,9 @@ print("Kurtosis: %f" % train['SalePrice'].kurt())
|
|||
```
|
||||
|
||||
|
||||
|
||||
|
||||
低范围的值都比较相似并且在 0 附近分布。
高范围的值离 0 很远,并且七点几的值远在正常范围之外。
|
||||
|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
@ -1491,8 +1468,6 @@ data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
|
|||
```
|
||||
|
||||
|
||||
|
||||
|
||||
从图中可以看出:

1. 有两个离群的 GrLivArea 值很高的数据,我们可以推测出现这种情况的原因。或许他们代表了农业地区,也就解释了低价。这两个点很明显不能代表典型样例,所以我们将它们定义为异常值并删除。
2. 图中顶部的两个点是七点几的观测值,他们虽然看起来像特殊情况,但是他们依然符合整体趋势,所以我们将其保留下来。
|
||||
|
||||
|
||||
|
@ -1504,9 +1479,11 @@ data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
|
|||
|
||||
```python
|
||||
# 删除点
|
||||
train.sort_values(by = 'GrLivArea',ascending = False)[:2]
|
||||
train = train.drop(train[train['Id'] == 1299].index)
|
||||
train = train.drop(train[train['Id'] == 524].index)
|
||||
print(train.sort_values(by='GrLivArea', ascending = False)[:2])
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
|
||||
train_test = train_test.drop(tmp[tmp['Id'] == 1299].index)
|
||||
train_test = train_test.drop(tmp[tmp['Id'] == 524].index)
|
||||
```
|
||||
|
||||
> 2.TotalBsmtSF 和 SalePrice 双变量分析
|
||||
|
@ -1515,7 +1492,7 @@ train = train.drop(train[train['Id'] == 524].index)
|
|||
```python
|
||||
var = 'TotalBsmtSF'
|
||||
data = pd.concat([train['SalePrice'],train[var]], axis=1)
|
||||
data.plot.scatter(x=var, y='SalePrice',ylim=(0,800000));
|
||||
data.plot.scatter(x=var, y='SalePrice',ylim=(0,800000))
|
||||
```
|
||||
|
||||
|
||||
|
@ -1548,7 +1525,7 @@ data.plot.scatter(x=var, y='SalePrice',ylim=(0,800000));
|
|||
|
||||
|
||||
```python
|
||||
sns.distplot(train['SalePrice'], fit=norm);
|
||||
sns.distplot(train['SalePrice'], fit=norm)
|
||||
fig = plt.figure()
|
||||
res = stats.probplot(train['SalePrice'], plot=plt)
|
||||
|
||||
|
@ -1559,40 +1536,36 @@ res = stats.probplot(train['SalePrice'], plot=plt)
|
|||
```
|
||||
|
||||
|
||||
|
||||
|
||||
可以看出,房价分布不是正态的,显示了峰值、正偏度,但是并不跟随对角线。
可以用对数变换来解决这个问题。
|
||||
|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
```python
|
||||
# 进行对数变换:
|
||||
train['SalePrice']= np.log(train['SalePrice'])
|
||||
# 进行对数变换:
|
||||
train_test['SalePrice'] = [i if i is None else np.log1p(i) for i in train_test['SalePrice']]
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# 绘制变换后的直方图和正态概率图:
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
|
||||
sns.distplot(train['SalePrice'], fit=norm);
|
||||
sns.distplot(tmp[tmp['SalePrice'] !=0]['SalePrice'], fit=norm);
|
||||
fig = plt.figure()
|
||||
res = stats.probplot(train['SalePrice'], plot=plt)
|
||||
res = stats.probplot(tmp['SalePrice'], plot=plt)
|
||||
```
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
@ -1610,26 +1583,25 @@ res = stats.probplot(train['GrLivArea'], plot=plt)
|
|||

|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
```python
|
||||
# 进行对数变换:
|
||||
train['GrLivArea']= np.log(train['GrLivArea'])
|
||||
train_test['GrLivArea'] = [i if i is None else np.log1p(i) for i in train_test['GrLivArea']]
|
||||
|
||||
# 绘制变换后的直方图和正态概率图:
|
||||
sns.distplot(train['GrLivArea'], fit=norm);
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
sns.distplot(tmp['GrLivArea'], fit=norm)
|
||||
fig = plt.figure()
|
||||
res = stats.probplot(train['GrLivArea'], plot=plt)
|
||||
res = stats.probplot(tmp['GrLivArea'], plot=plt)
|
||||
```
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
@ -1651,28 +1623,24 @@ res = stats.probplot(train['TotalBsmtSF'],plot=plt)
|
|||
'''
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
从图中可以看出:
* 显示出了偏度
* 大量为 0(Y 值)的观察值(没有地下室的房屋)
* 含 0(Y 值)的数据无法进行对数变换
|
||||
|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
```python
|
||||
# 去掉为0的分布情况
|
||||
tmp = np.array(train.loc[train['TotalBsmtSF']>0, ['TotalBsmtSF']])[:, 0]
|
||||
sns.distplot(tmp,fit=norm);
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
|
||||
tmp = np.array(tmp.loc[tmp['TotalBsmtSF']>0, ['TotalBsmtSF']])[:, 0]
|
||||
sns.distplot(tmp, fit=norm)
|
||||
fig = plt.figure()
|
||||
res = stats.probplot(tmp,plot=plt)
|
||||
res = stats.probplot(tmp, plot=plt)
|
||||
```
|
||||
|
||||
|
||||
|
@ -1702,60 +1670,45 @@ print(train.loc[train['TotalBsmtSF']==1, ['TotalBsmtSF']].count())
|
|||
|
||||
```python
|
||||
# 进行对数变换:
|
||||
print(train['TotalBsmtSF'].head(20))
|
||||
train['TotalBsmtSF']= np.log(train['TotalBsmtSF'])
|
||||
print(train['TotalBsmtSF'].head(20))
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
|
||||
print(tmp['TotalBsmtSF'].head(10))
|
||||
train_test['TotalBsmtSF']= np.log1p(train_test['TotalBsmtSF'])
|
||||
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
print(tmp['TotalBsmtSF'].head(10))
|
||||
```
|
||||
|
||||
0 856
|
||||
1 1262
|
||||
2 920
|
||||
3 756
|
||||
4 1145
|
||||
5 796
|
||||
6 1686
|
||||
7 1107
|
||||
8 952
|
||||
9 991
|
||||
10 1040
|
||||
11 1175
|
||||
12 912
|
||||
13 1494
|
||||
14 1253
|
||||
15 832
|
||||
16 1004
|
||||
17 1
|
||||
18 1114
|
||||
19 1029
|
||||
Name: TotalBsmtSF, dtype: int64
|
||||
0 6.752270
|
||||
1 7.140453
|
||||
2 6.824374
|
||||
3 6.628041
|
||||
4 7.043160
|
||||
5 6.679599
|
||||
6 7.430114
|
||||
7 7.009409
|
||||
8 6.858565
|
||||
9 6.898715
|
||||
10 6.946976
|
||||
11 7.069023
|
||||
12 6.815640
|
||||
13 7.309212
|
||||
14 7.133296
|
||||
15 6.723832
|
||||
16 6.911747
|
||||
17 0.000000
|
||||
18 7.015712
|
||||
19 6.936343
|
||||
0 856.0
|
||||
1 1262.0
|
||||
2 920.0
|
||||
3 756.0
|
||||
4 1145.0
|
||||
5 796.0
|
||||
6 1686.0
|
||||
7 1107.0
|
||||
8 952.0
|
||||
9 991.0
|
||||
Name: TotalBsmtSF, dtype: float64
|
||||
0 6.753438
|
||||
1 7.141245
|
||||
2 6.825460
|
||||
3 6.629363
|
||||
4 7.044033
|
||||
5 6.680855
|
||||
6 7.430707
|
||||
7 7.010312
|
||||
8 6.859615
|
||||
9 6.899723
|
||||
Name: TotalBsmtSF, dtype: float64
|
||||
|
||||
|
||||
|
||||
```python
|
||||
# 绘制变换后的直方图和正态概率图:
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
|
||||
tmp = np.array(train.loc[train['TotalBsmtSF']>0, ['TotalBsmtSF']])[:, 0]
|
||||
tmp = np.array(tmp.loc[tmp['TotalBsmtSF']>0, ['TotalBsmtSF']])[:, 0]
|
||||
sns.distplot(tmp, fit=norm)
|
||||
fig = plt.figure()
|
||||
res = stats.probplot(tmp, plot=plt)
|
||||
|
@ -1780,17 +1733,15 @@ res = stats.probplot(tmp, plot=plt)
|
|||
|
||||
|
||||
```python
|
||||
plt.scatter(train['GrLivArea'], train['SalePrice'])
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
|
||||
plt.scatter(tmp['GrLivArea'], tmp['SalePrice'])
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
<matplotlib.collections.PathCollection at 0x11a366f60>
|
||||
|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
@ -1800,14 +1751,14 @@ plt.scatter(train['GrLivArea'], train['SalePrice'])
|
|||
|
||||
|
||||
```python
|
||||
plt.scatter(train[train['TotalBsmtSF']>0]['TotalBsmtSF'], train[train['TotalBsmtSF']>0]['SalePrice'])
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
|
||||
plt.scatter(tmp[tmp['TotalBsmtSF']>0]['TotalBsmtSF'], tmp[tmp['TotalBsmtSF']>0]['SalePrice'])
|
||||
|
||||
# 可以看出 SalePrice 在整个 TotalBsmtSF 变量范围内显示出了同等级别的变化。
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
<matplotlib.collections.PathCollection at 0x11d7d96d8>
|
||||
|
||||
|
||||
|
@ -1822,14 +1773,18 @@ plt.scatter(train[train['TotalBsmtSF']>0]['TotalBsmtSF'], train[train['TotalBsmt
|
|||
|
||||
|
||||
```python
|
||||
x_train = train[['OverallQual', 'GrLivArea','GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']]
|
||||
y_train = train[["SalePrice"]].values.ravel()
|
||||
x_test = test[['OverallQual', 'GrLivArea','GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']]
|
||||
tmp = train_test[train_test['SalePrice'].isnull().values==False]
|
||||
tmp_1 = train_test[train_test['SalePrice'].isnull().values==True]
|
||||
|
||||
# from sklearn.preprocessing import RobustScaler
|
||||
# N = RobustScaler()
|
||||
# rs_train = N.fit_transform(train)
|
||||
# rs_test = N.fit_transform(train)
|
||||
x_train = tmp[['OverallQual', 'GrLivArea','GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']]
|
||||
y_train = tmp[["SalePrice"]].values.ravel()
|
||||
x_test = tmp_1[['OverallQual', 'GrLivArea','GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']]
|
||||
|
||||
# 简单测试,用中位数来替代
|
||||
# print(x_test.GarageCars.mean(), x_test.GarageCars.median(), x_test.TotalBsmtSF.mean(), x_test.TotalBsmtSF.median())
|
||||
|
||||
x_test["GarageCars"].fillna(x_test.GarageCars.median(), inplace=True)
|
||||
x_test["TotalBsmtSF"].fillna(x_test.TotalBsmtSF.median(), inplace=True)
|
||||
```
|
||||
|
||||
### 2.开始建模
|
||||
|
@ -1851,10 +1806,11 @@ from sklearn.linear_model import Ridge
|
|||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
|
||||
|
||||
ridge = Ridge(alpha = 15)
|
||||
ridge = Ridge(alpha=0.1)
|
||||
|
||||
# bagging 把很多小的分类器放在一起,每个train随机的一部分数据,然后把它们的最终结果综合起来(多数投票)
|
||||
# bagging 算是一种算法框架
|
||||
params = [1,10,15,20,25,30,40]
|
||||
params = [1, 10, 20, 40, 60]
|
||||
test_scores = []
|
||||
for param in params:
|
||||
clf = BaggingRegressor(base_estimator=ridge, n_estimators=param)
|
||||
|
@ -1863,6 +1819,7 @@ for param in params:
|
|||
test_score = np.sqrt(-cross_val_score(clf, x_train, y_train, cv=10, scoring='neg_mean_squared_error'))
|
||||
test_scores.append(np.mean(test_score))
|
||||
|
||||
print(test_score.mean())
|
||||
plt.plot(params, test_scores)
|
||||
plt.title('n_estimators vs CV Error')
|
||||
plt.show()
|
||||
|
@ -1877,7 +1834,7 @@ plt.show()
|
|||
from sklearn.linear_model import Ridge
|
||||
from sklearn.model_selection import learning_curve
|
||||
|
||||
ridge = Ridge(alpha = 15)
|
||||
ridge = Ridge(alpha=0.1)
|
||||
|
||||
train_sizes, train_loss, test_loss = learning_curve(ridge, x_train, y_train, cv=10,
|
||||
scoring='neg_mean_squared_error',
|
||||
|
@ -1904,77 +1861,26 @@ plt.show()
|
|||
|
||||
|
||||
```python
|
||||
mode_br = BaggingRegressor(base_estimator=ridge, n_estimators=25)
|
||||
mode_br = BaggingRegressor(base_estimator=ridge, n_estimators=10)
|
||||
mode_br.fit(x_train, y_train)
|
||||
# y_test = np.expm1(mode_br.predict(x_test))
|
||||
y_test = mode_br.predict(x_test)
|
||||
y_test = np.expm1(mode_br.predict(x_test))
|
||||
```
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
ValueError Traceback (most recent call last)
|
||||
|
||||
<ipython-input-426-1c40a6d7beeb> in <module>()
|
||||
2 mode_br.fit(x_train, y_train)
|
||||
3 # y_test = np.expm1(mode_br.predict(x_test))
|
||||
----> 4 y_test = mode_br.predict(x_test)
|
||||
|
||||
|
||||
~/.virtualenvs/python3.6/lib/python3.6/site-packages/sklearn/ensemble/bagging.py in predict(self, X)
|
||||
946 check_is_fitted(self, "estimators_features_")
|
||||
947 # Check data
|
||||
--> 948 X = check_array(X, accept_sparse=['csr', 'csc'])
|
||||
949
|
||||
950 # Parallel loop
|
||||
|
||||
|
||||
~/.virtualenvs/python3.6/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
|
||||
451 % (array.ndim, estimator_name))
|
||||
452 if force_all_finite:
|
||||
--> 453 _assert_all_finite(array)
|
||||
454
|
||||
455 shape_repr = _shape_repr(array.shape)
|
||||
|
||||
|
||||
~/.virtualenvs/python3.6/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
|
||||
42 and not np.isfinite(X).all()):
|
||||
43 raise ValueError("Input contains NaN, infinity"
|
||||
---> 44 " or a value too large for %r." % X.dtype)
|
||||
45
|
||||
46
|
||||
|
||||
|
||||
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
|
||||
|
||||
|
||||
|
||||
```python
|
||||
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# 提交结果
|
||||
submission_df = pd.DataFrame(data = {'Id':x_test.index,'SalePrice':y_test})
|
||||
submission_df = pd.DataFrame(data = {'Id':test['Id'],'SalePrice': y_test})
|
||||
print(submission_df.head(10))
|
||||
submission_df.to_csv('/Users/jiangzl/Desktop/submission_br.csv',columns = ['Id','SalePrice'],index = False)
|
||||
```
|
||||
|
||||
Id SalePrice
|
||||
0 0 218022.623974
|
||||
1 1 164144.987442
|
||||
2 2 221398.628262
|
||||
3 3 191061.326748
|
||||
4 4 294855.598373
|
||||
5 5 155670.529343
|
||||
6 6 249098.039164
|
||||
7 7 221706.705606
|
||||
8 8 185981.384326
|
||||
9 9 114422.951956
|
||||
|
||||
|
||||
|
||||
```python
|
||||
|
||||
```
|
||||
Id SalePrice
|
||||
0 1461 110469.586157
|
||||
1 1462 148368.953437
|
||||
2 1463 172697.673678
|
||||
3 1464 189844.587562
|
||||
4 1465 207009.716532
|
||||
5 1466 188820.407208
|
||||
6 1467 163107.556014
|
||||
7 1468 180732.346459
|
||||
8 1469 194841.804925
|
||||
9 1470 110570.281362
|
||||
|
|
|
@ -1,294 +0,0 @@
|
|||
|
||||
# House Prices: Advanced Regression Techniques in Kaggle
|
||||
|
||||
*author: loveSnowBest*
|
||||
|
||||
## 1. A brief introduction to this competition
|
||||
This competition is a getting-started one. As the title shows, what we need for this competition is a regression model. Here is the official description of this competition:
|
||||
> Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
|
||||
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.
|
||||
|
||||
## 2. My solution
|
||||
|
||||
### import what we need
|
||||
|
||||
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.ensemble import GradientBoostingRegressor
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
```
|
||||
|
||||
### load the data
|
||||
|
||||
|
||||
```python
|
||||
rawData=pd.read_csv('train.csv')
|
||||
testData=pd.read_csv('test.csv')
|
||||
```
|
||||
|
||||
And let's have a look at our data use head method:
|
||||
|
||||
|
||||
```Python
|
||||
rawData.head()
|
||||
```
|
||||
|
||||
|
||||
|
||||
<div>
|
||||
<table border="1" class="dataframe">
|
||||
<thead>
|
||||
<tr style="text-align: right;">
|
||||
<th></th>
|
||||
<th>Id</th>
|
||||
<th>MSSubClass</th>
|
||||
<th>MSZoning</th>
|
||||
<th>LotFrontage</th>
|
||||
<th>LotArea</th>
|
||||
<th>Street</th>
|
||||
<th>Alley</th>
|
||||
<th>LotShape</th>
|
||||
<th>LandContour</th>
|
||||
<th>Utilities</th>
|
||||
<th>...</th>
|
||||
<th>PoolArea</th>
|
||||
<th>PoolQC</th>
|
||||
<th>Fence</th>
|
||||
<th>MiscFeature</th>
|
||||
<th>MiscVal</th>
|
||||
<th>MoSold</th>
|
||||
<th>YrSold</th>
|
||||
<th>SaleType</th>
|
||||
<th>SaleCondition</th>
|
||||
<th>SalePrice</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>0</th>
|
||||
<td>1</td>
|
||||
<td>60</td>
|
||||
<td>RL</td>
|
||||
<td>65.0</td>
|
||||
<td>8450</td>
|
||||
<td>Pave</td>
|
||||
<td>NaN</td>
|
||||
<td>Reg</td>
|
||||
<td>Lvl</td>
|
||||
<td>AllPub</td>
|
||||
<td>...</td>
|
||||
<td>0</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>0</td>
|
||||
<td>2</td>
|
||||
<td>2008</td>
|
||||
<td>WD</td>
|
||||
<td>Normal</td>
|
||||
<td>208500</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>1</th>
|
||||
<td>2</td>
|
||||
<td>20</td>
|
||||
<td>RL</td>
|
||||
<td>80.0</td>
|
||||
<td>9600</td>
|
||||
<td>Pave</td>
|
||||
<td>NaN</td>
|
||||
<td>Reg</td>
|
||||
<td>Lvl</td>
|
||||
<td>AllPub</td>
|
||||
<td>...</td>
|
||||
<td>0</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>0</td>
|
||||
<td>5</td>
|
||||
<td>2007</td>
|
||||
<td>WD</td>
|
||||
<td>Normal</td>
|
||||
<td>181500</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>2</th>
|
||||
<td>3</td>
|
||||
<td>60</td>
|
||||
<td>RL</td>
|
||||
<td>68.0</td>
|
||||
<td>11250</td>
|
||||
<td>Pave</td>
|
||||
<td>NaN</td>
|
||||
<td>IR1</td>
|
||||
<td>Lvl</td>
|
||||
<td>AllPub</td>
|
||||
<td>...</td>
|
||||
<td>0</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>0</td>
|
||||
<td>9</td>
|
||||
<td>2008</td>
|
||||
<td>WD</td>
|
||||
<td>Normal</td>
|
||||
<td>223500</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>3</th>
|
||||
<td>4</td>
|
||||
<td>70</td>
|
||||
<td>RL</td>
|
||||
<td>60.0</td>
|
||||
<td>9550</td>
|
||||
<td>Pave</td>
|
||||
<td>NaN</td>
|
||||
<td>IR1</td>
|
||||
<td>Lvl</td>
|
||||
<td>AllPub</td>
|
||||
<td>...</td>
|
||||
<td>0</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>0</td>
|
||||
<td>2</td>
|
||||
<td>2006</td>
|
||||
<td>WD</td>
|
||||
<td>Abnorml</td>
|
||||
<td>140000</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>4</th>
|
||||
<td>5</td>
|
||||
<td>60</td>
|
||||
<td>RL</td>
|
||||
<td>84.0</td>
|
||||
<td>14260</td>
|
||||
<td>Pave</td>
|
||||
<td>NaN</td>
|
||||
<td>IR1</td>
|
||||
<td>Lvl</td>
|
||||
<td>AllPub</td>
|
||||
<td>...</td>
|
||||
<td>0</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>NaN</td>
|
||||
<td>0</td>
|
||||
<td>12</td>
|
||||
<td>2008</td>
|
||||
<td>WD</td>
|
||||
<td>Normal</td>
|
||||
<td>250000</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>5 rows × 81 columns</p>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
### split original data into X,Y
|
||||
First, we use the drop method to split rawData into X and Y. Since we need the Id column for the final prediction file, we save testId before dropping it and put it back at the end.
|
||||
|
||||
|
||||
```python
|
||||
Y_train=rawData['SalePrice']
|
||||
X_train=rawData.drop(['SalePrice','Id'],axis=1)
|
||||
|
||||
testId=testData['Id']
|
||||
X_test=testData.drop(['Id'],axis=1)
|
||||
```
|
||||
|
||||
### deal with categorical data
|
||||
In scikit-learn, we can use DictVectorizer, and in pandas we can just use get_dummies. Here I choose the latter. To apply get_dummies consistently, we should put X_train and X_test together first.
|
||||
|
||||
|
||||
```python
|
||||
# add new keys so we can conveniently split back into train and test later
|
||||
X=pd.concat([X_train,X_test],axis=0,keys={'first','second'},
|
||||
ignore_index=False)
|
||||
X_d=pd.get_dummies(X)
|
||||
```
|
||||
|
||||
DO NOT forget to drop the original categorical data, because pandas won't drop it for you automatically. You need to drop it manually:
|
||||
|
||||
|
||||
```python
|
||||
keep_cols=X_d.select_dtypes(include=['number']).columns
|
||||
X_d=X_d[keep_cols]
|
||||
```
|
||||
|
||||
Finally, we need to get our X_train and X_test back
|
||||
|
||||
|
||||
```python
|
||||
if len(X_d.loc['first'])==1460:
|
||||
X_train=X_d.loc['first']
|
||||
X_test=X_d.loc['second']
|
||||
else:
|
||||
X_train=X_d.loc['second']
|
||||
X_test=X_d.loc['first']
|
||||
```
|
||||
|
||||
### deal with missing data
|
||||
pandas provides a convenient way to fill missing data with the mean or median. Here we fill the NAs with the mean. Note: the median is sometimes preferred to reduce the influence of outliers.
|
||||
|
||||
|
||||
```python
|
||||
X_train=X_train.fillna(X_train.mean())
|
||||
X_test=X_test.fillna(X_test.mean())
|
||||
```
|
||||
|
||||
### Use StandardScaler to make data better for your model
|
||||
There are several ways to scale data in scikit-learn, such as StandardScaler and RobustScaler. Here we choose StandardScaler.
|
||||
|
||||
|
||||
```python
|
||||
ss=StandardScaler()
|
||||
X_scale=ss.fit_transform(X_train)
|
||||
X_test_scale=ss.transform(X_test)
|
||||
```
|
||||
|
||||
### Choose your linear model
|
||||
In scikit-learn, let's see what we have:
|
||||
- LinearRegression
|
||||
- SVM
|
||||
- RandomForestRegressor
|
||||
- LassoCV
|
||||
- RidgeCV
|
||||
- ElasticCV
|
||||
- GradientBoostingRegressor
|
||||
|
||||
Also, you can use XGBoost for this competition. After several attempts with these models, I found that GradientBoostingRegressor has the best performance.
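As a rough sketch of how such a comparison might be run (the candidate list, parameters, and 5-fold RMSE scoring below are illustrative, not the exact experiments), using the X_scale and Y_train prepared above:

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

candidates = {
    'LinearRegression': LinearRegression(),
    'LassoCV': LassoCV(),
    'RidgeCV': RidgeCV(),
    'RandomForest': RandomForestRegressor(n_estimators=300),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                                  max_features='sqrt'),
}
for name, model in candidates.items():
    # 5-fold cross-validated RMSE (lower is better)
    rmse = np.sqrt(-cross_val_score(model, X_scale, Y_train, cv=5,
                                    scoring='neg_mean_squared_error'))
    print('%s: %.0f (+/- %.0f)' % (name, rmse.mean(), rmse.std()))
```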
|
||||
|
||||
|
||||
```python
|
||||
gbr=GradientBoostingRegressor(n_estimators=3000,learning_rate=0.05,
|
||||
max_features='sqrt')
|
||||
gbr.fit(X_scale,Y_train)
|
||||
predict=np.array(gbr.predict(X_test_scale))
|
||||
```
|
||||
|
||||
### Save our prediction
|
||||
|
||||
For lack of Python knowledge, I didn't know how to add the feature names when saving the predictions as CSV, so I added 'Id' and 'SalePrice' to the file manually afterwards.
|
||||
|
||||
|
||||
```python
|
||||
final=np.hstack((testId.reshape(-1,1),predict.reshape(-1,1)))
|
||||
np.savetxt('new.csv',final,delimiter=',',fmt='%d')
|
||||
```
|
||||
|
||||
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel_launcher.py:1: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
|
||||
"""Entry point for launching an IPython kernel.
|
||||
|
||||
|
||||
## 3.Summary
|
||||
This is just a simple baseline for this competition. To get a better score, we need to go deeper into feature engineering and feature selection rather than simply picking a model and training it. I think this is the most important part and deserves the most focus, since it determines whether you can reach the top of the leaderboard.
|
|
@ -0,0 +1,196 @@
|
|||
# Dogs vs. Cats (kaggle 猫狗大战)
|
||||
|
||||
Create an algorithm to distinguish dogs from cats.
|
||||
|
||||
正如上面这句话所说,我们的目的就是创建一个算法,来对混合猫狗图片的数据集中,将猫和狗分别识别出来。
|
||||
|
||||
## 一、简介
|
||||
|
||||
猫狗大战这个项目其实与我们之前做的数字识别类似,只不过是图片复杂了一些。当然,我们这次使用深度学习,来完成我们想要做的事情。
|
||||
|
||||
## 二、安装包/第三方库要求
|
||||
|
||||
- Python 3.x
|
||||
- Pytorch
|
||||
|
||||
## 三、数据预处理
|
||||
|
||||
### 1、本次的数据来自 kaggle 的比赛项目 [dogs-vs-cats](https://www.kaggle.com/c/dogs-vs-cats)
|
||||
|
||||
### 2、查看数据格式
|
||||
|
||||
- 训练数据
|
||||
|
||||

|
||||
|
||||
- 训练数据集 - 说明:训练数据集中的数据,是经过人工标记的数据,类别和数字之间使用的 "." (点)做的分隔。
|
||||
|
||||
- 测试数据
|
||||
|
||||

|
||||
|
||||
- 测试数据集 - 说明:测试数据集中的数据,是没有经过人工标记的数据,没有对应的类别,只有一些相应的数字号码。
|
||||
|
||||
### 3、对数据的预处理
|
||||
|
||||
#### 3.1、提取训练 & 测试数据集的编号
|
||||
|
||||
训练数据集 & 测试数据集 给出的序号和 label 都是在文件名称中。
|
||||
|
||||
```python
|
||||
imgs = [os.path.join(root, img) for img in os.listdir(root)]
|
||||
|
||||
# test1:即测试数据集, D:/dataset/dogs-vs-cats/test1
|
||||
# train: 即训练数据集,D:/dataset/dogs-vs-cats/train
|
||||
if self.test:
|
||||
# 提取 测试数据集的序号,
|
||||
# 如 x = 'd:/path/123.jpg',
|
||||
# x.split('.') 得到 ['d:/path/123', 'jpg']
|
||||
# x.split('.')[-2] 得到 d:/path/123
|
||||
# x.split('.')[-2].split('/') 得到 ['d:', 'path', '123']
|
||||
# x.split('.')[-2].split('/')[-1] 得到 123
|
||||
imgs = sorted(imgs, key=lambda x: int(x.split('.')[-2].split('/')[-1]))
|
||||
else:
|
||||
# 如果不是测试集的话,就是训练集,我们只切分一下,仍然得到序号,123
|
||||
imgs = sorted(imgs, key=lambda x: int(x.split('.')[-2]))
|
||||
```
|
||||
|
||||
#### 3.2、划分训练集 & 验证集
|
||||
|
||||
首先我们知道我们手里的数据现在只有训练集和测试集,并没有验证集。那么为了我们训练得到的模型更好地拟合我们的测试数据,我们人为地将训练数据划分为 训练数据 + 验证数据(比例设置为 7:3)
|
||||
|
||||
```python
|
||||
# 获取图片的数量
|
||||
imgs_num = len(imgs)
|
||||
|
||||
# 划分训练、验证集,验证集:训练集 = 3:7
|
||||
# 判断是否为测试集
|
||||
if self.test:
|
||||
# 如果是 测试集,那么 就直接赋值
|
||||
self.imgs = imgs
|
||||
# 判断是否为 训练集
|
||||
elif train:
|
||||
# 如果是训练集,那么就把数据集的开始位置的数据 到 70% 部分的数据作为训练集
|
||||
self.imgs = imgs[:int(0.7 * imgs_num)]
|
||||
else:
|
||||
# 这种情况就是划分验证集啦,从 70% 部分的数据 到达数据的末尾,全部作为验证集
|
||||
self.imgs = imgs[int(0.7 * imgs_num):]
|
||||
```
|
||||
|
||||
#### 3.3、测试集,验证集和训练集的数据转换
|
||||
|
||||
```python
|
||||
# 数据的转换操作,测试集,验证集,和训练集的数据转换有所区别
|
||||
if transforms is None:
|
||||
# 如果转换操作没有设置,那我们设置一个转换
|
||||
normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
# 测试集 和 验证集 的转换
|
||||
# 判断如果是测试集或者不是训练集(也就是说是验证集),就应用我们下边的转换
|
||||
if self.test or not train:
|
||||
self.transforms = T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor(), normalize])
|
||||
else:
|
||||
# 否则是训练集,使用带随机裁剪和水平翻转的数据增强转换
|
||||
self.transforms = T.Compose([T.Resize(256), T.RandomResizedCrop(224), T.RandomHorizontalFlip(), T.ToTensor(), normalize])
|
||||
```
|
||||
|
||||
#### 3.4、重写子类 / 函数
|
||||
|
||||
这里我们使用了 torch.utils.data 中的一些函数,比如 Dataset
|
||||
|
||||
class torch.utils.data.Dataset 表示 Dataset 的抽象类,所有其他的数据集都应继承该类。所有子类都应该重写 __len__ ,提供数据集大小的方法,和 __getitem__ ,支持从 0 到 len(self) 整数索引的方法。
|
||||
|
||||
```python
|
||||
def __len__(self):
|
||||
return len(self.imgs)
|
||||
|
||||
def __getitem__(self, index):
|
||||
img_path = self.imgs[index]
|
||||
# 判断,如果是测试集的数据的话,那就返回对应的序号,比如 d:path/123.jpg 返回 123
|
||||
if self.test:
|
||||
label = int(self.imgs[index].split('.')[-2].split('/')[-1])
|
||||
else:
|
||||
# 如果不是测试集的数据,那么会有相应的类别(label),也就是对应的dog 和 cat,dog 为 1,cat 为0
|
||||
label = 1 if 'dog' in img_path.split('/')[-1] else 0
|
||||
# 这里使用 Pillow 模块,使用 Image 打开一个图片
|
||||
data = Image.open(img_path)
|
||||
# 使用我们定义的 transforms ,将图片转换,详情参考:https://pytorch.org/docs/stable/torchvision/transforms.html#transforms-on-pil-image
|
||||
# 默认的 transforms 设置的是 none
|
||||
data = self.transforms(data)
|
||||
# 将转换完成的 data 以及对应的 label(如果有的话),返回
|
||||
return data,label
|
||||
```
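下面是把上面几个片段整合在一起的一个最小草稿,假设类名为 GetData(与后面加载数据的代码一致);__init__ 的具体参数属于假设,仅供参考:

```python
import os
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms as T


class GetData(Dataset):
    # 假设的构造函数签名:root 为数据目录,train/test 控制划分
    def __init__(self, root, transforms=None, train=True, test=False):
        self.test = test
        imgs = [os.path.join(root, img) for img in os.listdir(root)]
        # 按文件名中的编号排序(见 3.1 节)
        if self.test:
            imgs = sorted(imgs, key=lambda x: int(x.split('.')[-2].split('/')[-1]))
        else:
            imgs = sorted(imgs, key=lambda x: int(x.split('.')[-2]))
        imgs_num = len(imgs)
        # 训练 / 验证集划分(见 3.2 节)
        if self.test:
            self.imgs = imgs
        elif train:
            self.imgs = imgs[:int(0.7 * imgs_num)]
        else:
            self.imgs = imgs[int(0.7 * imgs_num):]
        # 数据转换(见 3.3 节)
        if transforms is None:
            normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            if self.test or not train:
                self.transforms = T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor(), normalize])
            else:
                self.transforms = T.Compose([T.Resize(256), T.RandomResizedCrop(224),
                                             T.RandomHorizontalFlip(), T.ToTensor(), normalize])
        else:
            self.transforms = transforms

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, index):
        img_path = self.imgs[index]
        if self.test:
            label = int(img_path.split('.')[-2].split('/')[-1])
        else:
            label = 1 if 'dog' in img_path.split('/')[-1] else 0
        data = Image.open(img_path)
        data = self.transforms(data)
        return data, label
```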
|
||||
|
||||
#### 3.5、数据加载
|
||||
|
||||
```python
|
||||
# 训练数据集的路径
|
||||
train_path = 'D:/dataset/dogs-vs-cats/train'
|
||||
# 从训练数据集的存储路径中提取训练数据集
|
||||
train_dataset = GetData(train_path, train=True)
|
||||
# 将训练数据转换成 mini-batch 形式
|
||||
loader_train = data.DataLoader(train_dataset, batch_size=20, shuffle=True, num_workers=1)
|
||||
|
||||
# 测试数据的获取
|
||||
# 首先设置测试数据的路径
|
||||
test_path = 'D:/dataset/dogs-vs-cats/test1'
|
||||
# 从测试数据集的存储路径中提取测试数据集
|
||||
test_dataset = GetData(test_path, test=True)
|
||||
# 将测试数据转换成 mini-batch 形式
|
||||
loader_test = data.DataLoader(test_dataset, batch_size=3, shuffle=True, num_workers=1)
|
||||
```
|
||||
|
||||
## 四、构建 CNN 模型
|
||||
|
||||
```python
|
||||
# 调用我们现成的 AlexNet() 模型
|
||||
cnn = AlexNet()
|
||||
# 将模型打印出来观察一下
|
||||
print(cnn)
|
||||
```
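文中的 AlexNet() 是仓库里现成的实现,这里没有给出定义。下面是一个假设性的替代写法(仅作示意),直接借用 torchvision 自带的 AlexNet 结构,并把最后一层改为猫/狗两类输出:

```python
import torch.nn as nn
from torchvision import models


def AlexNet(num_classes=2):
    # 示意:借用 torchvision 的 AlexNet 结构(非仓库中的原始实现)
    net = models.alexnet(pretrained=False)
    # 原始分类头输出 1000 类,这里改为 2 类(猫 / 狗)
    net.classifier[6] = nn.Linear(4096, num_classes)
    return net
```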
|
||||
|
||||
## 五、设置相应的优化器和损失函数
|
||||
|
||||
我们已经构造完成了 CNN 模型,并将我们所需要的数据进行了相应的预处理。那我们接下来的一步就是定义相应的损失函数和优化函数。
|
||||
|
||||
torch.optim 是一个实现各种优化算法的软件包。
|
||||
|
||||
```python
|
||||
# 设置优化器和损失函数
|
||||
# 这里我们使用 Adam 优化器,使用的损失函数是 交叉熵损失
|
||||
optimizer = torch.optim.Adam(cnn.parameters(), lr=0.005, betas=(0.9, 0.99)) # 优化所有的 cnn 参数
|
||||
loss_func = nn.CrossEntropyLoss() # 目标 label 不是 one-hotted 类型的
|
||||
```
|
||||
|
||||
## 六、训练模型
|
||||
|
||||
数据以及相对应的损失函数和优化器,我们都已经设置好了,那接下来就是紧张刺激的训练模型环节了。
|
||||
|
||||
```python
|
||||
# 训练模型
|
||||
# 设置训练模型的次数,这里我们设置的是 10 次,也就是用我们的训练数据集对我们的模型训练 10 次,为了节省时间,我们可以只训练 1 次
|
||||
EPOCH = 10
|
||||
# 训练和测试
|
||||
for epoch in range(EPOCH):
|
||||
num = 0
|
||||
# 给出 batch 数据,在迭代 train_loader 的时候对 x 进行 normalize
|
||||
for step, (x, y) in enumerate(loader_train):
|
||||
b_x = Variable(x) # batch x
|
||||
b_y = Variable(y) # batch y
|
||||
|
||||
output = cnn(b_x) # cnn 的输出
|
||||
loss = loss_func(output, b_y) # 交叉熵损失
|
||||
optimizer.zero_grad() # 在这一步的训练步骤上,进行梯度清零
|
||||
loss.backward() # 反向传播,并进行计算梯度
|
||||
optimizer.step() # 应用梯度
|
||||
|
||||
# 可以打印一下
|
||||
# print('-'*30, step)
|
||||
if step % 20 == 0:
|
||||
num += 1
|
||||
for _, (x_t, y_test) in enumerate(loader_test):
|
||||
x_test = Variable(x_t) # batch x
|
||||
test_output = cnn(x_test)
|
||||
pred_y = torch.max(test_output, 1)[1].data.squeeze()
|
||||
accuracy = sum(pred_y == y_test) / float(y_test.size(0))
|
||||
print('Epoch: ', epoch, '| Num: ', num, '| Step: ', step, '| train loss: %.4f' % loss.data[0], '| test accuracy: %.4f' % accuracy)
|
||||
```
|
|
@ -2,53 +2,79 @@
|
|||
# coding: utf-8
|
||||
'''
|
||||
Created on 2017-10-26
|
||||
Update on 2017-10-26
|
||||
Author: 片刻
|
||||
Update on 2018-05-16
|
||||
Author: 片刻/ccyf00
|
||||
Github: https://github.com/apachecn/kaggle
|
||||
'''
|
||||
|
||||
import os.path
|
||||
import csv
|
||||
import time
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from numpy import shape, ravel
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
import sys
|
||||
|
||||
cpath = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
|
||||
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
|
||||
|
||||
# 加载数据
|
||||
def opencsv():
|
||||
# 使用 pandas 打开
|
||||
data = pd.read_csv(cpath + 'input/train.csv')
|
||||
data1 = pd.read_csv(cpath + 'input/test.csv')
|
||||
data = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
|
||||
data1 = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
|
||||
|
||||
train_data = data.values[0:, 1:] # 读入全部训练数据, [行,列]
|
||||
train_label = data.values[0:, 0] # 读取列表的第一列
|
||||
train_label = data.values[0:, 0] # 读取列表的第一列
|
||||
test_data = data1.values[0:, 0:] # 测试全部测试个数据
|
||||
return train_data, train_label, test_data
|
||||
|
||||
|
||||
def saveResult(result, csvName):
|
||||
with open(csvName, 'w',newline='') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题)
|
||||
#python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写wirterow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline=''
|
||||
myWriter = csv.writer(myFile) # 对文件执行写入
|
||||
myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名
|
||||
with open(csvName, 'w', newline='') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题)
|
||||
# python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline=''
|
||||
myWriter = csv.writer(myFile) # 对文件执行写入
|
||||
myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名
|
||||
index = 0
|
||||
for i in result:
|
||||
tmp = []
|
||||
index = index + 1
|
||||
tmp.append(index)
|
||||
# tmp.append(i)
|
||||
tmp.append(int(i)) # 测试集的标签值
|
||||
tmp.append(int(i)) # 测试集的标签值
|
||||
myWriter.writerow(tmp)
|
||||
|
||||
|
||||
def knnClassify(trainData, trainLabel):
|
||||
knnClf = KNeighborsClassifier() # default:k = 5,defined by yourself:KNeighborsClassifier(n_neighbors=10)
|
||||
knnClf.fit(trainData, ravel(trainLabel)) # ravel Return a contiguous flattened array.
|
||||
knnClf = KNeighborsClassifier() # default:k = 5,defined by yourself:KNeighborsClassifier(n_neighbors=10)
|
||||
knnClf.fit(trainData, np.ravel(trainLabel)) # ravel Return a contiguous flattened array.
|
||||
return knnClf
|
||||
|
||||
|
||||
# 数据预处理-降维 PCA主成成分分析
|
||||
def dRPCA(x_train, x_test, COMPONENT_NUM):
|
||||
print('dimensionality reduction...')
|
||||
trainData = np.array(x_train)
|
||||
testData = np.array(x_test)
|
||||
'''
|
||||
使用说明:https://www.cnblogs.com/pinard/p/6243025.html
|
||||
n_components>=1
|
||||
n_components=NUM 设置占特征数量比
|
||||
0 < n_components < 1
|
||||
n_components=0.99 设置阈值总方差占比
|
||||
'''
|
||||
pca = PCA(n_components=COMPONENT_NUM, whiten=True)
|
||||
pca.fit(trainData) # Fit the model with X
|
||||
pcaTrainData = pca.transform(trainData) # Fit the model with X and 在X上完成降维.
|
||||
pcaTestData = pca.transform(testData) # Fit the model with X and 在X上完成降维.
|
||||
|
||||
# pca 方差大小、方差占比、特征数量
|
||||
print(pca.explained_variance_, '\n', pca.explained_variance_ratio_, '\n',
|
||||
pca.n_components_)
|
||||
print(sum(pca.explained_variance_ratio_))
|
||||
return pcaTrainData, pcaTestData
|
||||
|
||||
|
||||
def dRecognition_knn():
|
||||
start_time = time.time()
|
||||
|
||||
|
@ -61,17 +87,17 @@ def dRecognition_knn():
|
|||
stop_time_l = time.time()
|
||||
print('load data time used:%f' % (stop_time_l - start_time))
|
||||
|
||||
# 降维处理
|
||||
trainData, testData = dRPCA(trainData, testData, 35)
|
||||
|
||||
# 模型训练
|
||||
knnClf = knnClassify(trainData, trainLabel)
|
||||
|
||||
# 结果预测
|
||||
testLabel = knnClf.predict(testData)
|
||||
|
||||
|
||||
# 结果的输出
|
||||
saveResult(
|
||||
testLabel,
|
||||
cpath + 'output/Result_sklearn_knn.csv'
|
||||
)
|
||||
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv'))
|
||||
print("finish!")
|
||||
stop_time_r = time.time()
|
||||
print('classify time used:%f' % (stop_time_r - start_time))
|
||||
|
|
|
@ -9,6 +9,7 @@ Github: https://github.com/apachecn/kaggle
|
|||
PCA主成成分分析
|
||||
'''
|
||||
|
||||
import os.path
|
||||
import csv
|
||||
import time
|
||||
import numpy as np
|
||||
|
@ -18,13 +19,14 @@ from sklearn.svm import SVC
|
|||
from sklearn.metrics import classification_report
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
|
||||
|
||||
# 加载数据
|
||||
def opencsv():
|
||||
print('Load Data...')
|
||||
# 使用 pandas 打开
|
||||
dataTrain = pd.read_csv('datasets/getting-started/digit-recognizer/input/train.csv')
|
||||
dataPre = pd.read_csv('datasets/getting-started/digit-recognizer/input/test.csv')
|
||||
dataTrain = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
|
||||
dataPre = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
|
||||
trainData = dataTrain.values[:, 1:] # 读入全部训练数据
|
||||
trainLabel = dataTrain.values[:, 0]
|
||||
preData = dataPre.values[:, :] # 测试全部测试个数据
|
||||
|
@ -79,11 +81,11 @@ def saveResult(result, csvName):
|
|||
|
||||
# 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征)
|
||||
def analyse_data(dataMat):
|
||||
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
|
||||
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
|
||||
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
|
||||
#计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计
|
||||
#cov(X,0) = cov(X) 除数是n-1(n为样本个数)
|
||||
#cov(X,1) 除数是n
|
||||
# 计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计
|
||||
# cov(X,0) = cov(X) 除数是n-1(n为样本个数)
|
||||
# cov(X,1) 除数是n
|
||||
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
|
||||
# np.mat 是用来生成一个矩阵的
|
||||
# 保存特征值(eigvals)和对应的特征向量(eigVects)
|
||||
|
@ -188,8 +190,8 @@ def trainDRSVM():
|
|||
# 模型训练 (数据预处理-降维)
|
||||
optimalSVMClf, pcaPreData = getOptimalAccuracy(trainData, trainLabel, preData)
|
||||
|
||||
storeModel(optimalSVMClf, 'datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.model')
|
||||
storeModel(pcaPreData, 'datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.pcaPreData')
|
||||
storeModel(optimalSVMClf, os.path.join(data_dir, '/ouput/Result_sklearn_SVM.model'))
|
||||
storeModel(pcaPreData, os.path.join(data_dir, '/ouput/Result_sklearn_SVM.pcaPreData'))
|
||||
|
||||
print("finish!")
|
||||
stopTime = time.time()
|
||||
|
@ -199,14 +201,14 @@ def trainDRSVM():
|
|||
def preDRSVM():
|
||||
startTime = time.time()
|
||||
# 加载模型和数据
|
||||
optimalSVMClf = getModel('datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.model')
|
||||
pcaPreData = getModel('datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.pcaPreData')
|
||||
optimalSVMClf = getModel(os.path.join(data_dir, '/ouput/Result_sklearn_SVM.model'))
|
||||
pcaPreData = getModel(os.path.join(data_dir, '/ouput/Result_sklearn_SVM.pcaPreData'))
|
||||
|
||||
# 结果预测
|
||||
testLabel = optimalSVMClf.predict(pcaPreData)
|
||||
# print("testLabel = %f" % testscore)
|
||||
# 结果的输出
|
||||
saveResult(testLabel, 'datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.csv')
|
||||
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_SVM.csv'))
|
||||
print("finish!")
|
||||
stopTime = time.time()
|
||||
print('PreModel load time used:%f s' % (stopTime - startTime))
|
||||
|
@ -220,4 +222,4 @@ if __name__ == '__main__':
|
|||
# 分析数据
|
||||
analyse_data(trainData)
|
||||
# 加载预测数据集
|
||||
# preDRSVM()
|
||||
preDRSVM()
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#!/usr/bin/python
|
||||
# coding: utf-8
|
||||
|
||||
'''
|
||||
Created on 2017-12-11
|
||||
Update on 2017-12-11
|
||||
|
@ -9,11 +8,12 @@ Github: https://github.com/apachecn/kaggle
|
|||
'''
|
||||
import time
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.linear_model import Ridge
|
||||
import os.path
|
||||
|
||||
data_dir = '../../../../datasets/getting-started/house-prices'
|
||||
data_dir = '/opt/data/kaggle/getting-started/house-prices'
|
||||
|
||||
|
||||
# 加载数据
|
||||
def opencsv():
|
||||
# 使用 pandas 打开
|
||||
|
@ -24,25 +24,29 @@ def opencsv():
|
|||
|
||||
|
||||
def saveResult(result):
|
||||
result.to_csv(os.path.join(data_dir,"submission.csv" ), sep=',', encoding='utf-8')
|
||||
result.to_csv(
|
||||
os.path.join(data_dir, "submission.csv"), sep=',', encoding='utf-8')
|
||||
|
||||
|
||||
def ridgeRegression(trainData, trainLabel, df_test):
|
||||
ridge = Ridge(alpha=10.0) # default:k = 5,defined by yourself:KNeighborsClassifier(n_neighbors=10)
|
||||
ridge = Ridge(
|
||||
alpha=10.0
|
||||
) # default:k = 5,defined by yourself:KNeighborsClassifier(n_neighbors=10)
|
||||
ridge.fit(trainData, trainLabel)
|
||||
predict = ridge.predict(df_test)
|
||||
pred_df = pd.DataFrame(predict, index=df_test["Id"], columns=["SalePrice"])
|
||||
return pred_df
|
||||
return pred_df
|
||||
|
||||
|
||||
def dataProcess(df_train, df_test):
|
||||
trainLabel = df_train['SalePrice']
|
||||
df = pd.concat((df_train,df_test), axis=0, ignore_index=True)
|
||||
df = pd.concat((df_train, df_test), axis=0, ignore_index=True)
|
||||
df.dropna(axis=1, inplace=True)
|
||||
df = pd.get_dummies(df)
|
||||
trainData = df[:df_train.shape[0]]
|
||||
test = df[df_train.shape[0]:]
|
||||
return trainData, trainLabel, test
|
||||
return trainData, trainLabel, test
|
||||
|
||||
|
||||
def Regression_ridge():
|
||||
start_time = time.time()
|
||||
|
@ -50,11 +54,11 @@ def Regression_ridge():
|
|||
# 加载数据
|
||||
df_train, df_test = opencsv()
|
||||
|
||||
print ("load data finish")
|
||||
print("load data finish")
|
||||
stop_time_l = time.time()
|
||||
print('load data time used:%f' % (stop_time_l - start_time))
|
||||
|
||||
#数据预处理
|
||||
|
||||
# 数据预处理
|
||||
train_data, trainLabel, df_test = dataProcess(df_train, df_test)
|
||||
|
||||
# 模型训练预测
|
||||
|
@ -62,7 +66,7 @@ def Regression_ridge():
|
|||
|
||||
# 结果的输出
|
||||
saveResult(result)
|
||||
print ("finish!")
|
||||
print("finish!")
|
||||
stop_time_r = time.time()
|
||||
print('classify time used:%f' % (stop_time_r - start_time))
|
||||
|
|
@ -0,0 +1,380 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
__author__ = 'liudong'
|
||||
__date__ = '2018/4/23 下午2:28'
|
||||
# import some necessary libraries
|
||||
|
||||
import numpy as np # linear algebra
|
||||
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
|
||||
# %matplotlib inline
|
||||
import matplotlib.pyplot as plt # Matlab-style plotting
|
||||
import seaborn as sns
|
||||
color = sns.color_palette()
|
||||
sns.set_style('darkgrid')
|
||||
import warnings
|
||||
|
||||
|
||||
def ignore_warn(*args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
# ignore annoying warning (from sklearn and seaborn)
|
||||
warnings.warn = ignore_warn
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from scipy import stats
|
||||
from scipy.stats import norm, skew #for some statistics
|
||||
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
|
||||
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
||||
from sklearn.kernel_ridge import KernelRidge
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
|
||||
from sklearn.model_selection import KFold, cross_val_score, train_test_split
|
||||
from sklearn.metrics import mean_squared_error
|
||||
import xgboost as xgb
|
||||
import lightgbm as lgb
|
||||
|
||||
# Limiting floats output to 3 decimal points
|
||||
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
|
||||
|
||||
from subprocess import check_output
|
||||
# check the files available in the directory
|
||||
# print(check_output(["ls", "/Users/liudong/Desktop/house_price/train.csv"]).decode("utf8"))
|
||||
# 加载数据
|
||||
train = pd.read_csv('/opt/data/kaggle/getting-started/house-prices/train.csv')
|
||||
test = pd.read_csv('/opt/data/kaggle/getting-started/house-prices/test.csv')
|
||||
# 查看训练数据的特征
|
||||
print(train.head(5))
|
||||
# 查看测试数据的特征
|
||||
print(test.head(5))
|
||||
|
||||
# 查看数据的数量和特征值的个数
|
||||
print("The train data size before dropping Id feature is : {} ".format(
|
||||
train.shape))
|
||||
print("The test data size before dropping Id feature is : {} ".format(
|
||||
test.shape))
|
||||
|
||||
# Save the 'Id' colum
|
||||
train_ID = train['Id']
|
||||
test_ID = test['Id']
|
||||
|
||||
# Now drop the 'Id' colum since it's unnecessary for the prediction process.
|
||||
train.drop("Id", axis=1, inplace=True)
|
||||
test.drop("Id", axis=1, inplace=True)
|
||||
|
||||
#check again the data size after dropping the 'Id' variable
|
||||
print("\nThe train data size after dropping Id feature is : {} ".format(
|
||||
train.shape))
|
||||
print(
|
||||
"The test data size after dropping Id feature is : {} ".format(test.shape))
|
||||
|
||||
# Deleting outliers 删除那些异常数据值
|
||||
train = train.drop(
|
||||
train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)
|
||||
|
||||
# We use the numpy fuction log1p which applies log(1+x) to all elements of the column
|
||||
train["SalePrice"] = np.log1p(train["SalePrice"])
|
||||
|
||||
# 特征工程
|
||||
# let's first concatenate the train and test data in the same dataframe
|
||||
ntrain = train.shape[0]
|
||||
ntest = test.shape[0]
|
||||
y_train = train.SalePrice.values
|
||||
all_data = pd.concat((train, test)).reset_index(drop=True)
|
||||
all_data.drop(['SalePrice'], axis=1, inplace=True)
|
||||
print("all_data size is : {}".format(all_data.shape))
|
||||
|
||||
# 处理缺失数据
|
||||
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
|
||||
all_data_na = all_data_na.drop(
|
||||
all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
|
||||
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
|
||||
print(missing_data.head(20))
|
||||
|
||||
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
|
||||
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
|
||||
all_data["Alley"] = all_data["Alley"].fillna("None")
|
||||
all_data["Fence"] = all_data["Fence"].fillna("None")
|
||||
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")
|
||||
# Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
|
||||
all_data["LotFrontage"] = all_data.groupby(
|
||||
"Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
|
||||
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
|
||||
all_data[col] = all_data[col].fillna('None')
|
||||
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
|
||||
all_data[col] = all_data[col].fillna(0)
|
||||
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
|
||||
'BsmtFullBath', 'BsmtHalfBath'):
|
||||
all_data[col] = all_data[col].fillna(0)
|
||||
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
|
||||
'BsmtFinType2'):
|
||||
all_data[col] = all_data[col].fillna('None')
|
||||
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
|
||||
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
|
||||
all_data['MSZoning'] = all_data['MSZoning'].fillna(
|
||||
all_data['MSZoning'].mode()[0])
|
||||
all_data = all_data.drop(['Utilities'], axis=1)
|
||||
all_data["Functional"] = all_data["Functional"].fillna("Typ")
|
||||
all_data['Electrical'] = all_data['Electrical'].fillna(
|
||||
all_data['Electrical'].mode()[0])
|
||||
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(
|
||||
all_data['KitchenQual'].mode()[0])
|
||||
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(
|
||||
all_data['Exterior1st'].mode()[0])
|
||||
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(
|
||||
all_data['Exterior2nd'].mode()[0])
|
||||
all_data['SaleType'] = all_data['SaleType'].fillna(
|
||||
all_data['SaleType'].mode()[0])
|
||||
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")
|
||||
#Check remaining missing values if any
|
||||
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
|
||||
all_data_na = all_data_na.drop(
|
||||
all_data_na[all_data_na == 0].index).sort_values(ascending=False)
|
||||
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
|
||||
print(missing_data.head())
|
||||
# 另外的特征工程
|
||||
# Transforming some numerical variables that are really categorical
|
||||
# MSSubClass=The building class
|
||||
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
|
||||
|
||||
# Changing OverallCond into a categorical variable
|
||||
all_data['OverallCond'] = all_data['OverallCond'].astype(str)
|
||||
|
||||
# Year and month sold are transformed into categorical features.
|
||||
all_data['YrSold'] = all_data['YrSold'].astype(str)
|
||||
all_data['MoSold'] = all_data['MoSold'].astype(str)
|
||||
|
||||
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
|
||||
'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
|
||||
'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
|
||||
'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
|
||||
'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold')
|
||||
# process columns, apply LabelEncoder to categorical features
|
||||
for c in cols:
|
||||
lbl = LabelEncoder()
|
||||
lbl.fit(list(all_data[c].values))
|
||||
all_data[c] = lbl.transform(list(all_data[c].values))
|
||||
|
||||
# shape
|
||||
print('Shape all_data: {}'.format(all_data.shape))
|
||||
|
||||
# 增加更多重要的特征
|
||||
# Adding total sqfootage feature
|
||||
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data[
|
||||
'1stFlrSF'] + all_data['2ndFlrSF']
|
||||
# Skewed features
|
||||
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
|
||||
|
||||
# Check the skew of all numerical features
|
||||
skewed_feats = all_data[numeric_feats].apply(
|
||||
lambda x: skew(x.dropna())).sort_values(ascending=False)
|
||||
print("\nSkew in numerical features: \n")
|
||||
skewness = pd.DataFrame({'Skew': skewed_feats})
|
||||
print(skewness.head(10))
|
||||
|
||||
# Box Cox Transformation of (highly) skewed features
|
||||
# We use the scipy function boxcox1p which computes the Box-Cox transformation of 1+x .
|
||||
# Note that setting λ=0 is equivalent to log1p used above for the target variable.
|
||||
skewness = skewness[abs(skewness) > 0.75]
|
||||
print("There are {} skewed numerical features to Box Cox transform".format(
|
||||
skewness.shape[0]))
|
||||
|
||||
from scipy.special import boxcox1p
|
||||
|
||||
skewed_features = skewness.index
|
||||
lam = 0.15
|
||||
for feat in skewed_features:
|
||||
# all_data[feat] += 1
|
||||
all_data[feat] = boxcox1p(all_data[feat], lam)
|
||||
# Getting dummy categorical features
|
||||
all_data = pd.get_dummies(all_data)
|
||||
print(all_data.shape)
|
||||
# Getting the new train and test sets.
|
||||
train = all_data[:ntrain]
|
||||
test = all_data[ntrain:]
|
||||
|
||||
#Validation function
|
||||
n_folds = 5
|
||||
|
||||
|
||||
def rmsle_cv(model):
|
||||
kf = KFold(
|
||||
n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
|
||||
rmse = np.sqrt(-cross_val_score(
|
||||
model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
|
||||
print("rmse", rmse)
|
||||
return (rmse)
|
||||
|
||||
|
||||
# 模型
|
||||
# LASSO Regression :
|
||||
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
|
||||
# Elastic Net Regression
|
||||
ENet = make_pipeline(
|
||||
RobustScaler(), ElasticNet(
|
||||
alpha=0.0005, l1_ratio=.9, random_state=3))
|
||||
# Kernel Ridge Regression
|
||||
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
|
||||
# Gradient Boosting Regression
|
||||
GBoost = GradientBoostingRegressor(
|
||||
n_estimators=3000,
|
||||
learning_rate=0.05,
|
||||
max_depth=4,
|
||||
max_features='sqrt',
|
||||
min_samples_leaf=15,
|
||||
min_samples_split=10,
|
||||
loss='huber',
|
||||
random_state=5)
|
||||
# XGboost
|
||||
model_xgb = xgb.XGBRegressor(
|
||||
colsample_bytree=0.4603,
|
||||
gamma=0.0468,
|
||||
learning_rate=0.05,
|
||||
max_depth=3,
|
||||
min_child_weight=1.7817,
|
||||
n_estimators=2200,
|
||||
reg_alpha=0.4640,
|
||||
reg_lambda=0.8571,
|
||||
subsample=0.5213,
|
||||
silent=1,
|
||||
random_state=7,
|
||||
nthread=-1)
|
||||
# lightGBM
|
||||
model_lgb = lgb.LGBMRegressor(
|
||||
objective='regression',
|
||||
num_leaves=5,
|
||||
learning_rate=0.05,
|
||||
n_estimators=720,
|
||||
max_bin=55,
|
||||
bagging_fraction=0.8,
|
||||
bagging_freq=5,
|
||||
feature_fraction=0.2319,
|
||||
feature_fraction_seed=9,
|
||||
bagging_seed=9,
|
||||
min_data_in_leaf=6,
|
||||
min_sum_hessian_in_leaf=11)
|
||||
# Base models scores
|
||||
score = rmsle_cv(lasso)
|
||||
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
|
||||
score = rmsle_cv(ENet)
|
||||
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
|
||||
score = rmsle_cv(KRR)
|
||||
print(
|
||||
"Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
|
||||
score = rmsle_cv(GBoost)
|
||||
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(),
|
||||
score.std()))
|
||||
score = rmsle_cv(model_xgb)
|
||||
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
|
||||
score = rmsle_cv(model_lgb)
|
||||
print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
|
||||
|
||||
|
||||
# 模型融合
|
||||
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
|
||||
def __init__(self, models):
|
||||
self.models = models
|
||||
|
||||
# we define clones of the original models to fit the data in
|
||||
def fit(self, X, y):
|
||||
self.models_ = [clone(x) for x in self.models]
|
||||
|
||||
# Train cloned base models
|
||||
for model in self.models_:
|
||||
model.fit(X, y)
|
||||
|
||||
return self
|
||||
|
||||
# Now we do the predictions for cloned models and average them
|
||||
def predict(self, X):
|
||||
predictions = np.column_stack(
|
||||
[model.predict(X) for model in self.models_])
|
||||
return np.mean(predictions, axis=1)
|
||||
|
||||
|
||||
# 评价这四个模型的好坏
|
||||
averaged_models = AveragingModels(models=(ENet, GBoost, KRR, lasso))
|
||||
score = rmsle_cv(averaged_models)
|
||||
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(),
|
||||
score.std()))
|
||||
|
||||
|
||||
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
|
||||
def __init__(self, base_models, meta_model, n_folds=5):
|
||||
self.base_models = base_models
|
||||
self.meta_model = meta_model
|
||||
self.n_folds = n_folds
|
||||
|
||||
# We again fit the data on clones of the original models
|
||||
def fit(self, X, y):
|
||||
self.base_models_ = [list() for x in self.base_models]
|
||||
self.meta_model_ = clone(self.meta_model)
|
||||
kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
|
||||
|
||||
# Train cloned base models then create out-of-fold predictions
|
||||
# that are needed to train the cloned meta-model
|
||||
out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
|
||||
for i, model in enumerate(self.base_models):
|
||||
for train_index, holdout_index in kfold.split(X, y):
|
||||
instance = clone(model)
|
||||
self.base_models_[i].append(instance)
|
||||
instance.fit(X[train_index], y[train_index])
|
||||
y_pred = instance.predict(X[holdout_index])
|
||||
out_of_fold_predictions[holdout_index, i] = y_pred
|
||||
|
||||
# Now train the cloned meta-model using the out-of-fold predictions as new feature
|
||||
self.meta_model_.fit(out_of_fold_predictions, y)
|
||||
return self
|
||||
|
||||
# Do the predictions of all base models on the test data and use the averaged predictions as
|
||||
# meta-features for the final prediction which is done by the meta-model
|
||||
def predict(self, X):
|
||||
meta_features = np.column_stack([
|
||||
np.column_stack([model.predict(X) for model in base_models]).mean(
|
||||
axis=1) for base_models in self.base_models_
|
||||
])
|
||||
return self.meta_model_.predict(meta_features)
|
||||
|
||||
|
||||
stacked_averaged_models = StackingAveragedModels(
|
||||
base_models=(ENet, GBoost, KRR), meta_model=lasso)
|
||||
score = rmsle_cv(stacked_averaged_models)
|
||||
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(),
|
||||
score.std()))
|
||||
|
||||
|
||||
# Define an RMSLE evaluation function; the target is already log-transformed,
# so RMSE on it corresponds to the RMSLE of the original prices
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))


# Final Training and Prediction

# StackedRegressor
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train, stacked_train_pred))

# XGBoost
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))

# LightGBM
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))

'''RMSLE on the entire train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train, stacked_train_pred * 0.70 + xgb_train_pred * 0.15 +
            lgb_train_pred * 0.15))

# Predictions from the blended models
ensemble = stacked_pred * 0.70 + xgb_pred * 0.15 + lgb_pred * 0.15

# Save the result
result = pd.DataFrame()
result['Id'] = test_ID
result['SalePrice'] = ensemble
# index=False drops the row index from the output file
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
print('########## Training finished ##########')

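# The 0.70 / 0.15 / 0.15 blend weights above are fixed by hand. A rough sketch for
# sanity-checking a few alternatives on the training predictions; note that
# train-set RMSLE is optimistic, so treat this as a smoke test, not a validation:
for w_stack, w_xgb, w_lgb in [(0.70, 0.15, 0.15), (0.60, 0.20, 0.20), (0.50, 0.25, 0.25)]:
    blend = stacked_train_pred * w_stack + xgb_train_pred * w_xgb + lgb_train_pred * w_lgb
    print(w_stack, w_xgb, w_lgb, rmsle(y_train, blend))
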
@ -1,45 +0,0 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
'''
Created on 2017-12-2
Update on 2017-12-2
Author: loveSnowBest
Github: https://github.com/zehuichen123/kaggle
'''

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

rawData = pd.read_csv('train.csv')
testData = pd.read_csv('test.csv')
testId = testData['Id']
X_test = testData.drop(['Id'], axis=1)

Y_train = rawData['SalePrice']
X_train = rawData.drop(['SalePrice', 'Id'], axis=1)

# Concatenate train and test before encoding so the dummy columns stay aligned;
# keys must be an ordered list (a set would make the 'train'/'test' labels unreliable)
X = pd.concat([X_train, X_test], axis=0, keys=['train', 'test'], ignore_index=False)

X_d = pd.get_dummies(X)

keep_cols = X_d.select_dtypes(include=['number']).columns
X_d = X_d[keep_cols]

X_train = X_d.loc['train']
X_test = X_d.loc['test']

X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

ss = StandardScaler()
X_scale = ss.fit_transform(X_train)
X_test_scale = ss.transform(X_test)

rr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_features='sqrt')

rr.fit(X_scale, Y_train)
predict = np.array(rr.predict(X_test_scale))
# Series.reshape is no longer available in pandas, so go through .values first
final = np.hstack((testId.values.reshape(-1, 1), predict.reshape(-1, 1)))
np.savetxt('new.csv', final, delimiter=',', fmt='%d')

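# Note on the concat-before-get_dummies pattern used in the removed script above:
# encoding train and test separately can produce mismatched dummy columns when a
# category appears in only one of the two sets. A sketch of an alternative that
# encodes them separately and then reconciles the columns with DataFrame.align:
X_train_d = pd.get_dummies(rawData.drop(['SalePrice', 'Id'], axis=1))
X_test_d = pd.get_dummies(testData.drop(['Id'], axis=1))
X_train_d, X_test_d = X_train_d.align(X_test_d, join='left', axis=1, fill_value=0)
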
@ -15,28 +15,64 @@ import os.path
warnings.filterwarnings('ignore')

data_dir = '/opt/data/kaggle/getting-started/house-prices'

# Remap some of the features here, either because certain categories are too rare
# and behave almost the same as another one, or because the values within a feature
# have a clear ordering that is worth encoding directly
mapper = {
    'LandSlope': {
        'Gtl': 'Gtl',
        'Mod': 'unGtl',
        'Sev': 'unGtl'
    },
    'LotShape': {
        'Reg': 'Reg',
        'IR1': 'IR1',
        'IR2': 'other',
        'IR3': 'other'
    },
    'RoofMatl': {
        'ClyTile': 'other',
        'CompShg': 'CompShg',
        'Membran': 'other',
        'Metal': 'other',
        'Roll': 'other',
        'Tar&Grv': 'Tar&Grv',
        'WdShake': 'WdShake',
        'WdShngl': 'WdShngl'
    },
    'Heating': {
        'GasA': 'GasA',
        'GasW': 'GasW',
        'Grav': 'Grav',
        'Floor': 'other',
        'OthW': 'other',
        'Wall': 'Wall'
    },
    'HeatingQC': {
        'Po': 1,
        'Fa': 2,
        'TA': 3,
        'Gd': 4,
        'Ex': 5
    },
    'KitchenQual': {
        'Fa': 1,
        'TA': 2,
        'Gd': 3,
        'Ex': 4
    }
}

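# A quick illustration of what one of the ordinal mappings does; a throwaway sketch
# on a small hand-made Series (assumes pandas is imported as pd at the top of this
# script), not part of the pipeline itself:
_demo = pd.Series(['Po', 'TA', 'Ex', 'Gd'])
print(_demo.map(mapper['HeatingQC']).tolist())   # -> [1, 3, 5, 4]
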
# Features that have very little impact on the target, or that are highly
# correlated with other features, are dropped
to_drop = [
    'Id', 'Street', 'Utilities', 'Condition2', 'PoolArea', 'PoolQC', 'Fence',
    'YrSold', 'MoSold', 'BsmtHalfBath', 'BsmtFinSF2', 'GarageQual', 'MiscVal',
    'EnclosedPorch', '3SsnPorch', 'GarageArea', 'TotRmsAbvGrd', 'GarageYrBlt',
    'BsmtFinType2', 'BsmtUnfSF', 'GarageCond', 'GarageFinish', 'FireplaceQu',
    'BsmtCond', 'BsmtQual', 'Alley'
]

# Hand-rolled features, thrown together without any deep rationale; they only seem
# to have a small effect on the final score
'''
data['house_remod']: difference between the remodel year and the year the house was built
data['livingRate']: LotArea is the lot size; this feature is living area / lot area * overall condition
data['room_area']: number of rooms / living area
data['fu_room']: share of rooms that come with a full bathroom
data['gr_room']: share of rooms that are bedrooms
'''

def create_feature(data):
    # Whether the house has a basement
    hBsmt_index = data.index[data['TotalBsmtSF'] > 0]
    data['HaveBsmt'] = 0
    data.loc[hBsmt_index, 'HaveBsmt'] = 1
    data['house_remod'] = data['YearRemodAdd'] - data['YearBuilt']
    data['livingRate'] = (data['GrLivArea'] / data['LotArea']) * data['OverallCond']
    data['lot_area'] = data['LotFrontage'] / data['GrLivArea']
    data['room_area'] = data['TotRmsAbvGrd'] / data['GrLivArea']
    data['fu_room'] = data['FullBath'] / data['TotRmsAbvGrd']
    data['gr_room'] = data['BedroomAbvGr'] / data['TotRmsAbvGrd']

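# A small sanity-check sketch: run create_feature on a fresh copy of the raw training
# data and eyeball the engineered columns (assumes data_dir points at the csv files;
# this is illustrative only and not part of the pipeline):
_check = pd.read_csv(os.path.join(data_dir, 'train.csv'))
create_feature(_check)
print(_check[['HaveBsmt', 'house_remod', 'livingRate', 'fu_room']].head())

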
def processing(data):
    # Build the new features
    create_feature(data)
    # Drop the unwanted features
    data.drop(to_drop, axis=1, inplace=True)

    # Fill with the string 'None': in the feature description, None is a legitimate
    # value for these features, so their missing entries are filled with 'None'
    fill_none = ['MasVnrType', 'BsmtExposure', 'GarageType', 'MiscFeature']
    for col in fill_none:
        data[col].fillna('None', inplace=True)

    # Fill the remaining missing values: categorical features with the mode,
    # numeric features with the median
    na_col = data.dtypes[data.isnull().any()]
    for col in na_col.index:
        if na_col[col] != 'object':
            med = data[col].median()
            data[col].fillna(med, inplace=True)
        else:
            mode = data[col].mode()[0]
            data[col].fillna(mode, inplace=True)

    # Normalize skewed features. numeric_col holds the numeric features, zero_col the
    # numeric features that contain zeros; transforming columns that contain zeros
    # causes all kinds of small problems, so only the non-zero values are transformed.
    numeric_col = data.skew().index
    zero_col = data.columns[data.isin([0]).any()]
    for col in numeric_col:
        # Skip condition-like features whose values are a handful of small integers
        # (0, 1, 2, 3, ...); transforming them adds little
        if len(pd.value_counts(data[col])) <= 10:
            continue
        # For columns containing zeros, transform only the non-zero values;
        # boxcox picks the transformation parameter from the data itself
        if col in zero_col:
            trans_data = data.loc[data[col] > 0, col]
            before = abs(trans_data.skew())
            cox, _ = boxcox(trans_data)
            log_after = abs(Series(cox).skew())
            if log_after < before:
                data.loc[trans_data.index, col] = cox
        # For columns without zeros, transform every value
        else:
            before = abs(data[col].skew())
            cox, _ = boxcox(data[col])
            log_after = abs(Series(cox).skew())
            if log_after < before:
                data.loc[:, col] = cox
    # Apply the value mappings defined in mapper
    for col, mapp in mapper.items():
        data.loc[:, col] = data[col].map(mapp)


df_train = pd.read_csv(os.path.join(data_dir, "train.csv"))
df_test = pd.read_csv(os.path.join(data_dir, "test.csv"))
test_ID = df_test['Id']

# Remove outliers
GrLivArea_outlier = set(df_train.index[(df_train['SalePrice'] < 200000) &
                                       (df_train['GrLivArea'] > 4000)])
LotFrontage_outlier = set(df_train.index[df_train['LotFrontage'] > 300])
df_train.drop(LotFrontage_outlier | GrLivArea_outlier, inplace=True)

# A few rows were dropped, so the index is no longer contiguous and has to be reset
df_train.reset_index(drop=True, inplace=True)
prices = np.log1p(df_train.loc[:, 'SalePrice'])
df_train.drop(['SalePrice'], axis=1, inplace=True)
# Concatenate the training and test sets, then run feature engineering on both at once
all_data = pd.concat([df_train, df_test])
all_data.reset_index(drop=True, inplace=True)

# Run the feature engineering
processing(all_data)

# One-hot (dummy) encoding for the remaining categorical features
dummy = pd.get_dummies(all_data, drop_first=True)

# Ridge, Lasso, ElasticNet and GBM were all tried; Ridge performed better than the
# others, and alpha=6 is the result of tuning
ridge = Ridge(6)
ridge.fit(dummy.iloc[:prices.shape[0], :], prices)
result = np.expm1(ridge.predict(dummy.iloc[prices.shape[0]:, :]))
pre = DataFrame(result, columns=['SalePrice'])
prediction = pd.concat([test_ID, pre], axis=1)
prediction.to_csv(os.path.join(data_dir, "submission_1.csv"), index=False)

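# The alpha=6 setting above came from manual tuning; a sketch of how it could be
# re-checked with scikit-learn's RidgeCV (the alphas grid here is only an example):
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas=[1.0, 3.0, 6.0, 10.0, 30.0])
ridge_cv.fit(dummy.iloc[:prices.shape[0], :], prices)
print('best alpha found by RidgeCV:', ridge_cv.alpha_)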