vnpy/examples/data_analysis/data_analysis.py

360 lines
10 KiB
Python
Raw Normal View History

2019-10-31 03:48:08 +00:00
from datetime import datetime
import warnings
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller as ADF
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import talib
from vnpy.trader.constant import Exchange, Interval
from vnpy.trader.database import database_manager
warnings.filterwarnings("ignore")
class DataAnalysis:
    """Exploratory analysis of historical bar data.

    Bars are loaded into a DataFrame with ``open``/``high``/``low``/
    ``close``/``volume`` columns indexed by bar datetime. The analysis
    methods plot the data, run statistical tests on the close price and
    append derived columns (volatility, growth rate, talib indicators)
    to the frame they are given.
    """

    def __init__(self):
        """Create an analyser with default settings and no data loaded."""
        self.symbol = ""
        self.exchange = None
        self.interval = None
        self.start = None
        self.end = None

        self.rate = 0.0
        self.window_volatility = 20
        self.window_index = 20

        # Attribute name (sic) is part of the public interface; do not
        # rename it without updating external callers.
        self.orignal = pd.DataFrame()

        # talib function names, grouped by call signature
        # (number of input arrays -> number of output arrays).
        self.index_1to1 = []
        self.index_2to2 = []
        self.index_3to1 = []
        self.index_2to1 = []
        self.index_4to1 = []

        self.intervals = []
        self.results = {}

    def load_history(
        self,
        symbol: str,
        exchange: "Exchange",
        interval: "Interval",
        start: datetime,
        end: datetime,
        rate: float = 0.0,
        index_1to1: list = None,
        index_2to2: list = None,
        index_3to1: list = None,
        index_2to1: list = None,
        index_4to1: list = None,
        window_index: int = 20,
        window_volatility: int = 20,
    ):
        """Load bars from the database into ``self.orignal``.

        Args:
            symbol, exchange, interval, start, end: forwarded to
                ``database_manager.load_bar_data``.
            rate: multiplied by the close price to obtain the fixed
                cost used in the relative volatility analysis.
            index_1to1 ... index_4to1: talib function names to evaluate
                in ``calculate_index``; ``None`` means no indicator of
                that group.
            window_index: period passed to the talib indicator calls.
            window_volatility: period passed to ``talib.ATR``.
        """
        output("开始加载历史数据")

        # Remember the query so the instance describes the data it holds.
        self.symbol = symbol
        self.exchange = exchange
        self.interval = interval
        self.start = start
        self.end = end

        self.window_volatility = window_volatility
        self.window_index = window_index
        self.rate = rate
        self.index_1to1 = index_1to1 or []
        self.index_2to2 = index_2to2 or []
        self.index_3to1 = index_3to1 or []
        self.index_2to1 = index_2to1 or []
        self.index_4to1 = index_4to1 or []

        # Load history data from database
        bars = database_manager.load_bar_data(
            symbol=symbol,
            exchange=exchange,
            interval=interval,
            start=start,
            end=end,
        )

        output(f"历史数据加载完成,数据量:{len(bars)}")

        # Build a fresh frame on every load: assigning columns into the
        # previous frame fails when the new history has another length.
        self.orignal = pd.DataFrame(
            {
                "open": [bar.open_price for bar in bars],
                "high": [bar.high_price for bar in bars],
                "low": [bar.low_price for bar in bars],
                "close": [bar.close_price for bar in bars],
                "volume": [bar.volume for bar in bars],
            },
            index=[bar.datetime for bar in bars],
        )

    def base_analysis(self, df: DataFrame = None):
        """Run the full analysis pipeline on ``df``.

        Plots the close price, runs the randomness, stationarity and
        autocorrelation tests, then the volatility, growth and
        indicator steps. ``df`` defaults to the loaded history and is
        modified in place (new columns are added).

        Returns:
            The enriched DataFrame, or ``None`` when there is no data.
        """
        if df is None:
            df = self.orignal

        if df is None or df.empty:
            output("数据为空,请输入数据")
            return None

        close_price = df["close"]

        output("第一步:画出行情图,检查数据断点")

        close_price.plot(figsize=(20, 8), title="close_price")
        plt.show()

        random_test(close_price)
        stability_test(close_price)
        autocorrelation_test(close_price)

        self.relative_volatility_analysis(df)
        self.growth_analysis(df)

        self.calculate_index(df)

        return df

    def relative_volatility_analysis(self, df: DataFrame = None):
        """Analyse ATR volatility net of the fixed trading cost.

        Adds ``volatility`` (``talib.ATR`` over ``window_volatility``),
        ``fixed_cost`` (``close * rate``) and ``relative_vol`` (their
        difference) to ``df``, then plots and summarises
        ``relative_vol``.
        """
        if df is None:
            df = self.orignal

        output("第五步:相对波动率分析")
        df["volatility"] = talib.ATR(
            np.array(df["high"]),
            np.array(df["low"]),
            np.array(df["close"]),
            self.window_volatility
        )

        df["fixed_cost"] = df["close"] * self.rate
        df["relative_vol"] = df["volatility"] - df["fixed_cost"]

        df["relative_vol"].plot(figsize=(20, 6), title="relative volatility")
        plt.show()

        df["relative_vol"].hist(bins=200, figsize=(20, 6), grid=False)
        plt.show()

        statitstic_info(df["relative_vol"])

    def growth_analysis(self, df: DataFrame = None):
        """Analyse the bar-to-bar percentage change of the close price.

        Adds ``pre_close`` and ``g%`` to ``df``, then plots and
        summarises ``g%``.
        """
        if df is None:
            df = self.orignal

        output("第六步:变化率分析")
        previous_close = df["close"].shift(1)
        df["pre_close"] = previous_close.fillna(0)

        # Use the unfilled previous close: the first bar has no
        # predecessor, and treating it as 0 would record a 100% move
        # that distorts the mean, skewness and kurtosis below.
        # NOTE(review): the change is divided by the current close, not
        # the previous one - kept as in the original; confirm intent.
        df["g%"] = 100 * (df["close"] - previous_close) / df["close"]

        df["g%"].plot(figsize=(20, 6), title="growth", ylim=(-5, 5))
        plt.show()

        df["g%"].hist(bins=200, figsize=(20, 6), grid=False)
        plt.show()

        statitstic_info(df["g%"])

    def calculate_index(self, df: DataFrame = None):
        """Evaluate the configured talib indicators and add them to ``df``.

        Each name in the ``index_*`` lists is looked up on ``talib`` and
        called with the inputs of its group:

        * ``index_1to1``: close, window -> column ``<name>``
        * ``index_3to1``: high, low, close, window -> column ``<name>``
        * ``index_2to2``: high, low, window -> ``<name>_DOWN`` and
          ``<name>_UP`` (first and second returned array)
        * ``index_2to1``: high, low, window -> column ``<name>``
        * ``index_4to1``: open, high, low, close -> column ``<name>``

        Returns:
            ``df`` with the indicator columns added.
        """
        if df is None:
            df = self.orignal

        output("第七步计算相关技术指标返回DataFrame\n")

        # Price columns are read only inside the loops so that a frame
        # lacking a column is accepted when no indicator needs it.
        for name in self.index_1to1 or ():
            indicator = getattr(talib, name)
            df[name] = indicator(
                np.array(df["close"]),
                self.window_index
            )

        for name in self.index_3to1 or ():
            indicator = getattr(talib, name)
            df[name] = indicator(
                np.array(df["high"]),
                np.array(df["low"]),
                np.array(df["close"]),
                self.window_index
            )

        for name in self.index_2to2 or ():
            indicator = getattr(talib, name)
            result_down, result_up = indicator(
                np.array(df["high"]),
                np.array(df["low"]),
                self.window_index
            )
            df[name + "_UP"] = result_up
            df[name + "_DOWN"] = result_down

        for name in self.index_2to1 or ():
            indicator = getattr(talib, name)
            df[name] = indicator(
                np.array(df["high"]),
                np.array(df["low"]),
                self.window_index
            )

        for name in self.index_4to1 or ():
            indicator = getattr(talib, name)
            df[name] = indicator(
                np.array(df["open"]),
                np.array(df["high"]),
                np.array(df["low"]),
                np.array(df["close"]),
            )

        return df

    @staticmethod
    def _resample_bars(df, interval):
        """Aggregate OHLCV bars in ``df`` into ``interval``-sized bars."""
        # resample(rule, how=...) is no longer accepted by pandas; call
        # the aggregation on the resampler instead.
        data = pd.DataFrame()
        data["open"] = df["open"].resample(interval).first()
        data["high"] = df["high"].resample(interval).max()
        data["low"] = df["low"].resample(interval).min()
        data["close"] = df["close"].resample(interval).last()
        data["volume"] = df["volume"].resample(interval).sum()
        return data

    def multi_time_frame_analysis(self, intervals: list = None, df: DataFrame = None):
        """Resample the data to each interval and analyse every result.

        Args:
            intervals: pandas resampling rules to aggregate the bars by.
            df: source bars with a datetime index; defaults to the
                loaded history.

        The analysed frame for each interval is stored in
        ``self.results[interval]``.
        """
        if not intervals:
            output("请输入K线合成周期")
            return

        if df is None:
            df = self.orignal

        if df is None or df.empty:
            output("请先加载数据")
            return

        self.intervals = list(intervals)

        for interval in intervals:
            output("------------------------------------------------")
            output(f"合成{interval}周期K先并开始数据分析")

            data = self._resample_bars(df, interval)
            self.results[interval] = self.base_analysis(data)

    @staticmethod
    def _band_crossings(data):
        """Return positions where close breaks out of the band.

        Returns:
            ``(up, down)``: lists of integer row positions at which the
            close moved from below ``boll_up`` to above it, or from
            above ``boll_down`` to below it, relative to the prior row.
        """
        close = data["close"]
        upper = data["boll_up"]
        lower = data["boll_down"]
        prev_close = close.shift(1)

        # Row 0 has no predecessor: the shifted values are NaN, every
        # comparison is False, so it is never reported as a signal.
        crossed_up = (close > upper) & (prev_close < upper.shift(1))
        crossed_down = (
            (close < lower)
            & (prev_close > lower.shift(1))
            & ~crossed_up
        )

        up_signal = np.flatnonzero(crossed_up.values).tolist()
        down_signal = np.flatnonzero(crossed_down.values).tolist()
        return up_signal, down_signal

    def show_chart(self, data, boll_wide):
        """Plot close price with a band and its breakout signals.

        ``data`` must contain ``close``, ``SMA``, ``STDDEV`` and ``ATR``
        columns. ``boll_up``/``boll_down`` columns are added to it as
        ``SMA`` plus/minus ``boll_wide`` times ``STDDEV``. A second
        chart shows the ``ATR`` column.
        """
        data["boll_up"] = data["SMA"] + data["STDDEV"] * boll_wide
        data["boll_down"] = data["SMA"] - data["STDDEV"] * boll_wide

        up_signal, down_signal = self._band_crossings(data)

        plt.figure(figsize=(20, 8))
        close = data["close"]
        plt.plot(close, lw=1)
        plt.plot(close, '^', markersize=5, color='r',
                 label='UP signal', markevery=up_signal)
        plt.plot(close, 'v', markersize=5, color='g',
                 label='DOWN signal', markevery=down_signal)
        plt.plot(data["boll_up"], lw=0.5, color="r")
        plt.plot(data["boll_down"], lw=0.5, color="g")
        plt.legend()
        plt.show()

        data["ATR"].plot(figsize=(20, 3), title="ATR")
        plt.show()
def random_test(close_price):
    """Run a lag-1 Ljung-Box test on ``close_price`` and report it.

    A p-value below 0.05 is reported as "not purely random", otherwise
    as "purely random"; the raw test result is printed afterwards.
    """
    acorr_result = acorr_ljungbox(close_price, lags=1)

    # Depending on the statsmodels version the result is either a
    # DataFrame with an "lb_pvalue" column or a (statistics, p-values)
    # tuple of arrays; integer indexing only works for the tuple form.
    if hasattr(acorr_result, "columns"):
        p_value = float(acorr_result["lb_pvalue"].iloc[0])
    else:
        p_value = float(acorr_result[1][0])

    if p_value < 0.05:
        output("第二步:随机性检验:非纯随机性")
    else:
        output("第二步:随机性检验:纯随机性")
    output(f"白噪声检验结果:{acorr_result}\n")
def stability_test(close_price):
    """Run an augmented Dickey-Fuller test on ``close_price`` and report it.

    The series is reported as non-stationary (unit root present) when
    the test statistic is above the 10% critical value, and as
    stationary otherwise; the raw test result is printed afterwards.
    """
    statitstic = ADF(close_price)

    # Element 0 is the test statistic and element 1 the p-value. The
    # critical values live on the statistic's scale, so the statistic
    # is what must be compared with them; comparing the p-value would
    # always report a unit root because the critical value is negative.
    t_s = statitstic[0]
    t_c = statitstic[4]["10%"]

    if t_s > t_c:
        output("第三步:平稳性检验:存在单位根,时间序列不平稳")
    else:
        output("第三步:平稳性检验:不存在单位根,时间序列平稳")
    output(f"ADF检验结果{statitstic}\n")
def autocorrelation_test(close_price):
    """Plot the ACF and the PACF of ``close_price`` for 60 lags each."""
    output("第四步:画出自相关性图,观察自相关特性")

    max_lag = 60

    # Autocorrelation chart.
    plot_acf(close_price, lags=max_lag)
    plt.show()

    # Partial autocorrelation chart: the object returned by plot_pacf
    # is shown first, then pyplot is asked to show as well.
    pacf_figure = plot_pacf(close_price, lags=max_lag)
    pacf_figure.show()
    plt.show()
def statitstic_info(df):
    """Print mean, median, skewness and kurtosis of a sample.

    ``df`` must provide ``mean``, ``median``, ``skew`` and ``kurt``
    methods; callers in this file pass a single DataFrame column. All
    figures are rounded to four decimals, and the sign of the skewness
    and of the kurtosis selects the label printed next to each value.
    """
    sample_mean = round(df.mean(), 4)
    sample_median = round(df.median(), 4)
    output(f"样本平均数:{sample_mean}, 中位数: {sample_median}")

    skewness = round(df.skew(), 4)
    kurtosis = round(df.kurt(), 4)

    # Zero is tested first and the negative label is the fall-through,
    # so a value that is neither zero nor positive gets the last label.
    skew_label = (
        "对称分布" if skewness == 0
        else "分布偏左" if skewness > 0
        else "分布偏右"
    )
    kurt_label = (
        "正态分布" if kurtosis == 0
        else "分布陡峭" if kurtosis > 0
        else "分布平缓"
    )

    output(f"偏度为:{skewness},属于{skew_label};峰度为:{kurtosis},属于{kurt_label}\n")
def output(msg):
    """Print ``msg`` prefixed with the current local time and a tab."""
    timestamp = datetime.now()
    print(f"{timestamp}\t{msg}")