数据集说明:京东自2014年5月至2017年12月份公司股票交易数据共898条,通过这些历史数据建模,运用机器学习的方法,预测未来的股票价格。<br/>
程序说明:采用线性回归、SVM线性回归、SVM多项式回归、SVM高斯回归,由Python语言实现股票价格预测。<br/>
算法理论请参照:线性回归算法、支持向量机SVM<br/>
Ipynb演示文件:Ipynb文件<br/>
Python代码:Python代码<br/>
"""Raw data processing and feature engineering."""
import pandas as pd

# Read the historical quotes, then map both the '?' placeholder and any
# missing cells to the sentinel value -99999.
df = pd.read_csv('JDHistoricalQuotes.csv')
df = df.replace('?', -99999).fillna(-99999)

# Notebook display of the first rows.
df.head()
# Multiplying by 1.0 turns an integer volume column into floating point.
df['volume'] = 1.0 * df['volume']

# Parse the date strings, order the rows chronologically, and use the dates
# as the index (the 'date' column itself is kept as well).
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')
df = df.set_index(df['date'])

# Notebook display of the first rows.
df.head()
# Gap between the high and the close, as a percentage of the close.
# NOTE(review): the column name suggests high-low, but the code uses
# high-close -- confirm which one is intended.
df['hl_pct'] = (df['high'] - df['close']).div(df['close']).mul(100.0)

# Open-to-close move, as a percentage of the open.
df['pct_change'] = (df['close'] - df['open']).div(df['open']).mul(100.0)

# Keep only the columns that are used from here on.
feature_columns = ['close', 'hl_pct', 'pct_change', 'volume']
df = df[feature_columns]

# Notebook display of the first rows.
df.head()
import math

# Forecast horizon: 1% of the number of rows, rounded up to a whole row count.
data_set_percent = 0.01
forecast_out = int(math.ceil(len(df) * data_set_percent))

# Each row's label is the close found `forecast_out` rows later; the final
# `forecast_out` rows have no such value and therefore get NaN.
df['label'] = df['close'].shift(periods=-forecast_out)

# Notebook display of the last rows, including the NaN labels.
df.tail(12)
import numpy as np
from sklearn import preprocessing

# Feature matrix: every column except the target. `axis` is passed by keyword
# because the positional form (`df.drop('label', 1)`) was removed in
# pandas 2.0 and raises a TypeError there; the keyword form works on old and
# new pandas alike.
X = np.array(df.drop('label', axis=1))

# Standardize the feature columns.
X = preprocessing.scale(X)
from sklearn.model_selection import train_test_split

# The last `forecast_out` feature rows are the ones to forecast from; the
# rows before them are used for training and testing.
X_validation, X_predict = X[:-forecast_out], X[-forecast_out:]

# Drop the rows that still contain NaN, then take the label column as the
# target array.
df = df.dropna()
y = np.array(df['label'])

# Random 80/20 split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X_validation, y, test_size=0.2)
"""Train and evaluate the models, then make the predictions."""
from sklearn.linear_model import LinearRegression
from sklearn import svm
import pickle

# Maps model name -> array of predictions for X_predict.
forecast_dict = {}

# The regressors to compare, keyed by the name used for the cache file,
# the report line and the forecast entry.
cls_dict = {
    'LineRegression': LinearRegression(n_jobs=10),
    'SvmLinearRegression': svm.SVR(kernel='linear', C=1e3),
    'SvmPolyRegression': svm.SVR(kernel='poly', C=8, degree=3),
    'SvmRbfRegression': svm.SVR(kernel='rbf', C=1e3, gamma=0.1),
}

for name, cls in cls_dict.items():
    model_path = '%s.pickle' % name
    try:
        # Reuse a previously trained model when a cache file exists.
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this script wrote itself.
        with open(model_path, 'rb') as f:
            cls = pickle.load(f)
    except Exception as e:
        # Best-effort cache: any failure to load (missing file, unreadable
        # pickle, ...) falls back to training from scratch, so the broad
        # catch is deliberate. The reason is printed rather than hidden.
        cls.fit(X_train, y_train)
        print(e)

        with open(model_path, 'wb') as f:
            pickle.dump(cls, f)

    # The single-argument print(...) form behaves the same on Python 2 and 3.
    print("%s Algorithm Accuracy: %s" % (name, cls.score(X_test, y_test)))

    forecast_dict.setdefault(name, cls.predict(X_predict))
"""Show the results graphically with matplotlib."""
from datetime import timedelta

import matplotlib.pyplot as plt
from matplotlib import style

# Notebook display of the available matplotlib style names.
style.available

# Index label of the last row currently in df; forecast rows are dated on the
# consecutive calendar days after it.
last_date_object = df.iloc[-1].name

# Line colour per model; any model not listed falls back to 'white'.
forecast_colors = {
    'LineRegression': 'blue',
    'SvmPolyRegression': 'yellow',
    'SvmLinearRegression': 'green',
}

for key, predict in forecast_dict.items():
    column = 'forecast-%s' % key
    df[column] = np.nan

    for day_offset, value in enumerate(predict, start=1):
        # Date arithmetic is done directly on the index label. Going through
        # epoch seconds and datetime.fromtimestamp() converts to the
        # machine's local timezone, which shifts the forecast dates.
        next_date = last_date_object + timedelta(days=day_offset)
        # Set only this model's cell: assigning the whole row would erase the
        # forecasts that earlier models already stored for the same date.
        df.loc[next_date, column] = value

    df[column].plot(label=key, color=forecast_colors.get(key, 'white'))

plt.plot(df['close'], color='red', label='Data')
plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Support Vector Regression')
# Build the legend after every line is drawn so that 'Data' is listed too.
plt.legend()
plt.show()