From db05f9ad572c4f65bec0e1ed2956bcb308bbb04b Mon Sep 17 00:00:00 2001 From: "Nicholaus E. Halecky" Date: Mon, 28 Jan 2013 16:30:04 -0800 Subject: [PATCH] EHN: Expand Yahoo finance features, idx components --- pandas/io/data.py | 182 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 173 insertions(+), 9 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index e4457d141e92c..964a58739b7e6 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -3,6 +3,7 @@ """ +import warnings import numpy as np import datetime as dt @@ -13,7 +14,7 @@ from zipfile import ZipFile from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str -from pandas import DataFrame, read_csv, concat +from pandas import Panel, DataFrame, Series, read_csv, concat from pandas.io.parsers import TextParser @@ -54,7 +55,8 @@ def DataReader(name, data_source=None, start=None, end=None, start, end = _sanitize_dates(start, end) if(data_source == "yahoo"): - return get_data_yahoo(name=name, start=start, end=end, + return get_data_yahoo(symbols=name, start=start, end=end, + adjust_price=False, chunk=25, retry_count=retry_count, pause=pause) elif(data_source == "fred"): return get_data_fred(name=name, start=start, end=end) @@ -73,14 +75,27 @@ def _sanitize_dates(start, end): return start, end +def _in_chunks(seq, size): + """ + Return sequence in 'chunks' of size defined by size + """ + return (seq[pos:pos + size] for pos in xrange(0, len(seq), size)) + + def get_quote_yahoo(symbols): """ Get current yahoo quote Returns a DataFrame """ - if not isinstance(symbols, list): - raise TypeError("symbols must be a list") + if isinstance(symbols, str): + sym_list = symbols + elif not isinstance(symbols, Series): + symbols = Series(symbols) + sym_list = str.join('+', symbols) + else: + sym_list = str.join('+', symbols) + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', 'time': 't1', 'short_ratio': 's7'} @@ -90,7 +105,7 @@ def get_quote_yahoo(symbols): data = dict(zip(codes.keys(), [[] for i in range(len(codes))])) urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % ( - str.join('+', symbols), request) + sym_list, request) try: lines = urllib2.urlopen(urlStr).readlines() @@ -117,19 +132,20 @@ def get_quote_yahoo(symbols): return DataFrame(data, index=idx) -def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0): +def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3, + pause=0): """ Get historical data for the given name from yahoo. Date format is datetime Returns a DataFrame. """ - start, end = _sanitize_dates(start, end) - if(name is None): - print "Need to provide a name" + warnings.warn("Need to provide a name.") return None + start, end = _sanitize_dates(start, end) + yahoo_URL = 'http://ichart.yahoo.com/table.csv?' url = yahoo_URL + 's=%s' % name + \ @@ -162,6 +178,154 @@ def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0): "return a 200 for url %s" % (pause, url)) +def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']): + """ + Return modifed DataFrame or Panel with adjusted prices based on + 'Adj Close' price. Adds 'Adj_Ratio' column. + """ + adj_ratio = hist_data['Adj Close'] / hist_data['Close'] + + data = hist_data.copy() + for item in price_list: + data[item] = hist_data[item] * adj_ratio + data['Adj_Ratio'] = adj_ratio + del data['Adj Close'] + return data + + +def _calc_return_index(price_df): + """ + Return a returns index from a input price df or series. + """ + + ret_index = price_df.pct_change().add(1).cumprod() + ret_index.ix[0] = 1 + return ret_index + + +def get_components_yahoo(idx_sym='^DJI'): + """ + Returns DataFrame containing list of component information for index + represented in idx_sym from yahoo. Includes component symbol + (ticker), exchange, and name. + + Parameters + ---------- + idx_sym : str + Index symbol, default '^DJI' (Dow Jones Industrial Average) + Examples: + '^NYA' (NYSE Composite) + '^IXIC' (NASDAQ Composite) + + See: http://finance.yahoo.com/indices for other index symbols + + Returns + ------- + idx_df : DataFrame + """ + stats = 'snx' + #URL of form: + #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv + url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \ + '&e=.csv&h={2}' + + idx_mod = idx_sym.replace('^', '@%5E') + urlStr = url.format(idx_mod, stats, 1) + + idx_df = DataFrame() + mask = [True] + comp_idx = 1 + + #LOOP across component index structure, + #break when no new components are found + while (True in mask): + urlStr = url.format(idx_mod, stats, comp_idx) + lines = (urllib.urlopen(urlStr).read().strip(). + strip('"').split('"\r\n"')) + + lines = [line.strip().split('","') for line in lines] + + temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) + temp_df = temp_df.drop_duplicates() + temp_df = temp_df.set_index('ticker') + mask = ~temp_df.index.isin(idx_df.index) + + comp_idx = comp_idx + 50 + idx_df = idx_df.append(temp_df[mask]) + + return idx_df + + +def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False, + ret_index=False, chunk=25, pause=0, **kwargs): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Yahoo! Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, list-like object (list, tupel, Series), DataFrame + Single stock symbol (ticker), list-like object of symbols or + DataFrame with index containing of stock symbols + start : string, (defaults to '1/1/2010') + Starting date, timestamp. Parses many different kind of date + representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') + end : string, (defaults to today) + Ending date, timestamp. Same format as starting date. + adjust_price : bool, default False + Adjust all prices in hist_data ('Open', 'High', 'Low', 'Close') via + 'Adj Close' price. Adds 'Adj_Ratio' column and drops 'Adj Close'. + ret_index: bool, default False + Include a simple return index 'Ret_Index' in hist_data. + chunk : int, default 25 + Number of symbols to download consecutively before intiating pause. + pause : int, default 0 + Time, in seconds, to pause between consecutive chunks. + **kwargs: additional arguments to pass to _get_hist_yahoo + + Returns + ------- + hist_data : DataFrame (str) or Panel (list-like object, DataFrame) + """ + def dl_mult_symbols(symbols): + stocks = {} + for sym_group in _in_chunks(symbols, chunk): + for sym in sym_group: + try: + stocks[sym] = _get_hist_yahoo(name=sym, start=start, + end=end, **kwargs) + except: + warnings.warn('Error with sym: ' + sym + '... skipping.') + + time.sleep(pause) + + return Panel(stocks).swapaxes('items', 'minor') + + #If a scalar (single symbol, e.g. 'GOOG') + if isinstance(symbols, (str, int)): + sym = symbols + hist_data = _get_hist_yahoo(sym, start=start, end=end, **kwargs) + #Multiple symbols + elif isinstance(symbols, DataFrame): + try: + hist_data = dl_mult_symbols(Series(symbols.index)) + except ValueError: + raise + else: #Guess a Series + try: + hist_data = dl_mult_symbols(symbols) + except TypeError: + hist_data = dl_mult_symbols(Series(symbols)) + + if(ret_index): + hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) + if(adjust_price): + hist_data = _adjust_prices(hist_data) + + return hist_data + + def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): """