-
Notifications
You must be signed in to change notification settings - Fork 5
/
tick_data_combine_dates_hdf.py
142 lines (115 loc) · 5.18 KB
/
tick_data_combine_dates_hdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
import pandas as pd
import os
import time
import Netfonds_Ticker_List as NTL
import get_lists as getl
def tick_data_combine_dates_single(TCKR, listdir, directory=None):
    """
    Combine all per-date tick-data CSV files for one ticker into one HDF5 store.

    Input: single ticker in format 'TICKER.X', where X is the Netfonds exchange
    letter (N:NYSE, O:NASDAQ, A:AMEX).
    Combines all tickdata files for the ticker in `directory` (default: current
    working directory) into '<TCKR>.combined.h5', skipping any file whose date
    is already present in the store.

    Returns 0 on success, 1 when get_csv_file_list finds no files for the ticker.
    """
    # Local import: the file header only imports pandas/os/time; the old
    # pd.datetime alias is deprecated and removed in modern pandas.
    from datetime import datetime

    start_dir = os.getcwd()  # save start dir so we can revert back at the end
    if directory is None:
        directory = start_dir
    os.chdir(directory)

    # Get list of csv files for ticker = TCKR.
    files = getl.get_csv_file_list(TCKR, listdir, directory)
    if files == 'no tickers':  # sentinel string returned by get_csv_file_list
        return 1

    columns = ['bid', 'bid_depth', 'bid_depth_total',
               'offer', 'offer_depth', 'offer_depth_total',
               'price', 'quantity']
    h5_path = os.path.join(directory, TCKR + '.combined.h5')

    # Case: no H5 file yet. Seed an appendable table with the first non-empty
    # csv so later appends have a schema to match, then close.
    if not os.path.isfile(h5_path):
        store = pd.HDFStore(h5_path)
        remaining = []   # files left over for the append pass below
        seeded = False
        for fl in files:
            if 'combined' in fl:
                continue
            if seeded:
                # BUGFIX: the original called files.remove(fl) while iterating
                # `files`, which skips elements; collect leftovers instead.
                remaining.append(fl)
                continue
            temp = pd.read_csv(os.path.join(directory, fl), header=0, index_col=0)
            if len(temp) == 0:
                continue  # an empty frame cannot define the table schema
            temp = temp[columns]
            temp.index = pd.to_datetime(temp.index)
            temp = temp.sort_index()
            store.append('dataframe', temp, format='table',
                         complib='blosc', complevel=9, expectedrows=len(temp))
            seeded = True
        files = remaining
        store.close()

    # Case: H5 file exists (always true by now).
    store = pd.HDFStore(h5_path)
    # Get list of dates already present in the HDF5 data store.
    # Guard the key: if every seed candidate above was empty, 'dataframe'
    # was never created (the original raised here).
    if '/dataframe' not in store.keys() or len(store.dataframe) == 0:
        olddates = []
    else:
        olddates = list(pd.Series(store.dataframe.index).map(pd.Timestamp.date).unique())

    # Keep only files whose date is not already stored.
    # BUGFIX: build a new list rather than list.remove() inside the loop.
    new_files = []
    for fl in files:
        if 'combined' in fl:
            continue
        file_date = datetime.strptime(
            fl.replace('.csv', '').replace(TCKR + '.', ''), '%Y%m%d').date()
        if file_date not in olddates:
            new_files.append(fl)

    # Read the new files and stack them into a single frame.
    # pd.concat replaces the removed (and O(n^2)) DataFrame.append loop.
    frames = []
    for fl in new_files:
        temp = pd.read_csv(os.path.join(directory, fl), header=0, index_col=0)
        if len(temp) == 0:
            continue
        frames.append(temp[columns])
    df = pd.concat(frames) if frames else pd.DataFrame()

    if len(df) > 0:
        # Convert index to a DatetimeIndex if the csv parse left it as strings.
        if not isinstance(df.index, pd.DatetimeIndex):
            df.index = pd.to_datetime(df.index)
        # Drop rows duplicated on index + all columns (the original copied the
        # index into a temporary column to include it in the duplicate check).
        df = df[~df.reset_index().duplicated().values]
        df = df.sort_index()
        store.append('dataframe', df, format='table',
                     complib='blosc', complevel=9, expectedrows=len(df))
    store.close()
    os.chdir(start_dir)
    return 0
def tick_data_combine_dates_multi(TCKR=None, directories=None):
    """
    Combine files across dates into a single file for each ticker in TCKR.

    TCKR can represent an index (e.g. 'SPX', 'ETF', 'NYSE', 'NASDAQ', 'AMEX');
    multiple indices must be passed as a list.
    If TCKR is not passed, acts on all files found in each directory.
    If directories is not passed, acts within the current directory.
    """
    start_dir = os.getcwd()  # save starting directory so we can revert at end
    os.chdir('D:\\Google Drive\\Python\\FinDataDownload')
    start = time.time()
    if directories is None:
        directories = [start_dir]
    if not isinstance(directories, list):
        directories = [directories]
        print('converted directories input to a list')
    for directory in directories:
        if TCKR is None:  # discover the tickers present in the directory
            TCKR, listdir = getl.get_list_tickers_in_dir(directory)
        else:
            TCKR = NTL.get_netfonds_tickers(TCKR)
            TCKR = TCKR.ticker.values.tolist()
            # NOTE(review): `listdir` is never assigned on this branch, so the
            # call below raises NameError whenever TCKR is passed explicitly.
            # Confirm what the second value of get_list_tickers_in_dir should
            # be here before relying on this path.
        for i, tckr in enumerate(TCKR, start=1):
            tick_data_combine_dates_single(tckr, listdir, directory)
            print('%-8s:dates combined, ' % tckr + str(i) + ' of ' + str(len(TCKR))
                  + ', in time=%7.3f' % ((time.time() - start) / 60) + ' mins')
        os.chdir(start_dir)
        TCKR = None  # reset so the next directory re-derives its own ticker list
    print('Combine dates completed after ' + str((time.time() - start) / 60) + ' mins')
    return
if __name__ == '__main__':
    # Script entry point: combine dated tick files for the ETF directory.
    os.chdir('D:\\Google Drive\\Python\\FinDataDownload')
    target_dirs = ['D:\\Financial Data\\Netfonds\\DailyTickDataPull\\Combined\\ETF']
    tick_data_combine_dates_multi(directories=target_dirs)