Based on the stock data sources available to us, we should create our own data reader as an abstract data structure: it hides the data-retrieval process, taking in commands and giving back a standardized data table.
Here are all the dependencies we need:
from enum import Enum from datetime import datetime, timedelta import pandas as pd import time from IPython.display import clear_output import tqdm import requests as re import json
A few things worth mentioning: json is used to parse web request data so it can be formatted into a DataFrame, and tqdm is a package for displaying progress bars.
Our goal is to build a class that gets the data, so let's build the constructor first. Since getting data from the sources involves many arguments, one of the best practices, in my experience, is to initialize those parameters in the constructor.
def __init__(self, tic_list, output="table", **kwargs):
    """Store the tickers, the output mode and the query settings.

    Args:
        tic_list: list of ticker symbols to query.
        output: "table" to get a DataFrame back from the query methods,
            "file" to have the result written as CSV to ``file_name``.
        **kwargs: optional settings. ``freq``, ``start_date``, ``end_date``
            and ``file_name`` are stored as given. ``timeframe`` (alias
            ``day_range``) is a number of days; it moves ``start_date``
            to that many days before now. Any other keyword is ignored.
    """
    # Take one timestamp so the default start/end dates are exactly
    # 256 days apart.
    now = datetime.now()
    self.arg_list = {
        "freq": 'minutes',
        "start_date": now - timedelta(days=256),
        "end_date": now,
        "timeframe": 256,
        "file_name": "",
    }
    self.tic_list = tic_list
    self.output = output
    # Failed downloads are recorded here as [ticker, method_name] pairs.
    self.error = []
    for key, arg in kwargs.items():
        if key in ("freq", "start_date", "end_date", "file_name"):
            self.arg_list[key] = arg
        elif key in ("timeframe", "day_range"):
            self.arg_list["timeframe"] = arg
            self.arg_list["start_date"] = now - timedelta(days=arg)
In the constructor, I take in a list of tickers and a few data-format settings: frequency, start date and end date. I also set a variable to control the output type, which gives me the flexibility to choose between data storage options. Later, I will talk about using a Mongo database to store stock data.
Next comes the most important part: the data query functions.
The first one is the historical data query function, get_ondemand_data()
def get_ondemand_data(self, interval = 1):
    """Download historical data for every ticker and return the standardized output.

    Each ticker is requested up to three times; a ticker that still fails
    is recorded in ``self.error`` as ``[ticker, 'get_ondemand']`` and
    skipped.

    Args:
        interval: value passed through as the ``interval`` query parameter.

    Returns:
        Whatever ``self.data_output()`` returns.
    """
    max_tries = 3
    retry_wait = 10  # seconds between attempts
    # NOTE(review): the API key is hard-coded in source; consider loading
    # it from configuration instead.
    api_key = '95b5894daf3abced33fe48e7f265315e'
    if not hasattr(self, "error"):
        self.error = []
    # This is the required format for datetimes to access the API
    start_date = self.arg_list["start_date"].strftime("%Y%m%d%H%M%S")
    end_date = self.arg_list["end_date"].strftime("%Y%m%d%H%M%S")
    frames = []
    for ticker in tqdm.tqdm(self.tic_list):
        symbol = ticker.upper()
        api_url = 'http://marketdata.websol.barchart.com/getHistory.csv?' + \
            'key={}&symbol={}&type={}&startDate={}&endDate={}&interval={}'\
            .format(api_key, symbol, self.arg_list["freq"], start_date, end_date, interval)
        for attempt in range(1, max_tries + 1):
            try:
                temp = pd.read_csv(api_url, parse_dates=['timestamp'])
                temp.set_index('timestamp', inplace=True)
            except Exception as e:
                print(e)
                print("error occorded in getting data for  {}".format(symbol))
                # No point in waiting after the last attempt.
                if attempt < max_tries:
                    time.sleep(retry_wait)
            else:
                frames.append(temp)
                clear_output()
                print("Finished {}".format(symbol))
                break
        else:
            # The loop ran out of attempts without a successful download.
            self.error.append([symbol, 'get_ondemand'])
    # Concatenate once at the end; DataFrame.append was removed in pandas 2.0.
    self.result = pd.concat(frames) if frames else pd.DataFrame()
    return self.data_output()
I won’t go deep into this, but please note that I added an outer loop that retries the data request in case a connection error occurs.
The second one is the stock quote function, get_quote()
def get_quote(self):
    """Fetch the current quote for every ticker and return the standardized output.

    A ticker whose request or parsing fails is recorded in ``self.error``
    as ``[ticker, 'get_quote']`` and skipped.

    Returns:
        Whatever ``self.data_output()`` returns.
    """
    if not hasattr(self, "error"):
        self.error = []
    frames = []
    for ticker in tqdm.tqdm(self.tic_list):
        symbol = ticker.upper()
        profile = "https://financialmodelingprep.com/api/company/price/{}".format(symbol)
        try:
            # NOTE(review): verify=False turns off TLS certificate checks;
            # confirm this is still required for this endpoint.
            text = re.get(profile, verify=False, timeout=30).text
            # Strip the newlines and "<pre>" wrapper before JSON parsing.
            text = text.replace("\n", "").replace("<pre>", "")
            # Presumably the payload is a JSON object keyed by ticker with a
            # dict of fields per ticker — TODO confirm against the API.
            temp = pd.DataFrame(json.loads(text)).transpose()
            # Name the index so reset_index() yields a 'symbol' column.
            temp.index.name = "symbol"
        except Exception as e:
            print(e)
            self.error.append([symbol, 'get_quote'])
        else:
            frames.append(temp)
    self.result = pd.concat(frames) if frames else pd.DataFrame()
    return self.data_output()
Lastly, we need to create a function to standardize the output.
def data_output(self):
    """Standardize ``self.result`` and deliver it according to ``self.output``.

    The index is moved into a column, known columns are renamed to their
    capitalized form, and, when a ``close`` column is present, ``Close``
    and ``Return`` columns are added.

    Returns:
        The standardized DataFrame when ``self.output == "table"``;
        otherwise None (for ``"file"`` the table is written as CSV to
        ``self.arg_list["file_name"]``).
    """
    table = self.result.reset_index()
    # An empty or quote-only result has no 'close' column; skip the
    # derived columns instead of raising KeyError.
    if "close" in table.columns:
        table["Close"] = table["close"]
    table = table.rename(columns={
        'symbol': 'Ticker', 'timestamp': "TimeStamp", "high": "High",
        "low": "Low", "open": "Open", "volume": "Volume"})
    if "Close" in table.columns:
        if "Ticker" in table.columns:
            # Difference within each ticker so the first row of one ticker
            # is not compared with the last row of the previous one.
            change = table.groupby("Ticker")["Close"].diff(1)
        else:
            change = table["Close"].diff(1)
        # NOTE(review): the change is divided by the current close, not
        # the previous one; kept as in the original formula.
        table["Return"] = change / table["Close"]
    self.result = table
    if self.output == "table":
        return self.result
    if self.output == "file":
        self.result.to_csv(self.arg_list["file_name"])
    return None
Put them all together. You can save them into a .py file so that the next time you need them, you can simply import that file as a module.
Based on the stock data sources available to us, we should create our own data reader as an abstract data structure: it hides the data-retrieval process, taking in commands and giving back a standardized data table.
from enum import Enum
from datetime import datetime, timedelta
import pandas as pd
import time
from IPython.display import clear_output
import tqdm
import requests as re
import json
class get_stock_data(object):
    """Download stock data for a list of tickers and return it in one standard layout.

    The query methods (``get_ondemand_data`` and ``get_quote``) fill
    ``self.result`` and hand it to ``data_output``, which renames the
    columns and either returns the table or writes it to a CSV file.
    Tickers that could not be downloaded are listed in ``self.error`` as
    ``[ticker, method_name]`` pairs.
    """

    # NOTE(review): the API key is committed to source; consider loading it
    # from configuration. It is a class attribute so it can be overridden.
    API_KEY = '95b5894daf3abced33fe48e7f265315e'
    HISTORY_URL = 'http://marketdata.websol.barchart.com/getHistory.csv?'
    QUOTE_URL = "https://financialmodelingprep.com/api/company/price/{}"
    MAX_TRIES = 3    # download attempts per ticker
    RETRY_WAIT = 10  # seconds to wait between attempts

    def __init__(self, tic_list, output="table", **kwargs):
        """Store the tickers, the output mode and the query settings.

        Args:
            tic_list: list of ticker symbols to query.
            output: "table" to get a DataFrame back from the query
                methods, "file" to have the result written as CSV to
                ``file_name``.
            **kwargs: optional settings. ``freq``, ``start_date``,
                ``end_date`` and ``file_name`` are stored as given.
                ``day_range`` (alias ``timeframe``) is a number of days;
                it moves ``start_date`` to that many days before now.
                Any other keyword is ignored.
        """
        # Take one timestamp so the default start/end dates are exactly
        # 256 days apart.
        now = datetime.now()
        self.arg_list = {
            "freq": 'minutes',
            "start_date": now - timedelta(days=256),
            "end_date": now,
            "day_range": 256,
            "file_name": "",
        }
        self.tic_list = tic_list
        self.output = output
        self.error = []
        for key, arg in kwargs.items():
            if key in ("freq", "start_date", "end_date", "file_name"):
                self.arg_list[key] = arg
            elif key in ("timeframe", "day_range"):
                # Keep the value under the key the caller used as well as
                # under the default 'day_range' key.
                self.arg_list[key] = arg
                self.arg_list["day_range"] = arg
                self.arg_list["start_date"] = now - timedelta(days=arg)

    def data_output(self):
        """Standardize ``self.result`` and deliver it according to ``self.output``.

        The index is moved into a column, known columns are renamed to
        their capitalized form, and, when a ``close`` column is present,
        ``Close`` and ``Return`` columns are added.

        Returns:
            The standardized DataFrame when ``self.output == "table"``;
            otherwise None (for ``"file"`` the table is written as CSV to
            ``self.arg_list["file_name"]``).
        """
        table = self.result.reset_index()
        # An empty or quote-only result has no 'close' column; skip the
        # derived columns instead of raising KeyError.
        if "close" in table.columns:
            table["Close"] = table["close"]
        table = table.rename(columns={
            'symbol': 'Ticker', 'timestamp': "TimeStamp", "high": "High",
            "low": "Low", "open": "Open", "volume": "Volume"})
        if "Close" in table.columns:
            if "Ticker" in table.columns:
                # Difference within each ticker so the first row of one
                # ticker is not compared with the last row of the previous.
                change = table.groupby("Ticker")["Close"].diff(1)
            else:
                change = table["Close"].diff(1)
            # NOTE(review): the change is divided by the current close,
            # not the previous one; kept as in the original formula.
            table["Return"] = change / table["Close"]
        self.result = table
        if self.output == "table":
            return self.result
        if self.output == "file":
            self.result.to_csv(self.arg_list["file_name"])
        return None

    def get_ondemand_data(self, interval=1):
        """Download historical data for every ticker and return the standardized output.

        Each ticker is requested up to ``MAX_TRIES`` times; a ticker that
        still fails is recorded in ``self.error`` as
        ``[ticker, 'get_ondemand']`` and skipped.

        Args:
            interval: value passed through as the ``interval`` query
                parameter.

        Returns:
            Whatever ``data_output()`` returns.
        """
        # This is the required format for datetimes to access the API
        start_date = self.arg_list["start_date"].strftime("%Y%m%d%H%M%S")
        end_date = self.arg_list["end_date"].strftime("%Y%m%d%H%M%S")
        frames = []
        for ticker in tqdm.tqdm(self.tic_list):
            symbol = ticker.upper()
            api_url = self.HISTORY_URL + \
                'key={}&symbol={}&type={}&startDate={}&endDate={}&interval={}'\
                .format(self.API_KEY, symbol, self.arg_list["freq"],
                        start_date, end_date, interval)
            for attempt in range(1, self.MAX_TRIES + 1):
                try:
                    temp = pd.read_csv(api_url, parse_dates=['timestamp'])
                    temp.set_index('timestamp', inplace=True)
                except Exception as e:
                    print(e)
                    print("error occorded in getting data for  {}".format(symbol))
                    # No point in waiting after the last attempt.
                    if attempt < self.MAX_TRIES:
                        time.sleep(self.RETRY_WAIT)
                else:
                    frames.append(temp)
                    clear_output()
                    print("Finished {}".format(symbol))
                    break
            else:
                # The loop ran out of attempts without a successful download.
                self.error.append([symbol, 'get_ondemand'])
        # Concatenate once; DataFrame.append was removed in pandas 2.0.
        self.result = pd.concat(frames) if frames else pd.DataFrame()
        return self.data_output()

    @staticmethod
    def _parse_quote(text):
        """Turn the raw quote response text into a one-row-per-ticker DataFrame."""
        cleaned = text.replace("\n", "").replace("<pre>", "")
        # Presumably the payload is a JSON object keyed by ticker with a
        # dict of fields per ticker — TODO confirm against the API.
        frame = pd.DataFrame(json.loads(cleaned)).transpose()
        # Name the index so reset_index() yields a 'symbol' column.
        frame.index.name = "symbol"
        return frame

    def get_quote(self):
        """Fetch the current quote for every ticker and return the standardized output.

        A ticker whose request or parsing fails is recorded in
        ``self.error`` as ``[ticker, 'get_quote']`` and skipped.

        Returns:
            Whatever ``data_output()`` returns.
        """
        frames = []
        for ticker in tqdm.tqdm(self.tic_list):
            symbol = ticker.upper()
            profile = self.QUOTE_URL.format(symbol)
            try:
                # NOTE(review): verify=False turns off TLS certificate
                # checks; confirm this is still required for this endpoint.
                text = re.get(profile, verify=False, timeout=30).text
                frames.append(self._parse_quote(text))
            except Exception as e:
                print(e)
                self.error.append([symbol, 'get_quote'])
        self.result = pd.concat(frames) if frames else pd.DataFrame()
        return self.data_output()
# Example usage: request the last two days of AAPL data and show the first
# rows. The constructor reads the number of days from the 'timeframe'
# keyword; 'day_range' is not one of the keys it checks, so passing it would
# leave the default 256-day window in place.
my = get_stock_data(["AAPL"], timeframe=2)
my.get_ondemand_data().head()