From 8196db95daa658737af929f68ec7cd45b826aa02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 17:41:24 -0400 Subject: [PATCH 1/6] Use google finance as datasource (test only, still pointing to yahoo finance) --- pandas/io/data.py | 178 +++++++++++++++++++++++++++++++++ pandas/io/tests/test_google.py | 95 ++++++++++++++++++ 2 files changed, 273 insertions(+) create mode 100644 pandas/io/tests/test_google.py diff --git a/pandas/io/data.py b/pandas/io/data.py index 43178fdcfddf1..f2b539fc795a7 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -58,6 +58,10 @@ def DataReader(name, data_source=None, start=None, end=None, return get_data_yahoo(symbols=name, start=start, end=end, adjust_price=False, chunk=25, retry_count=retry_count, pause=pause) + elif(data_source == "google"): + return get_data_google(symbols=name, start=start, end=end, + adjust_price=False, chunk=25, + retry_count=retry_count, pause=pause) elif(data_source == "fred"): return get_data_fred(name=name, start=start, end=end) elif(data_source == "famafrench"): @@ -132,6 +136,56 @@ def get_quote_yahoo(symbols): return DataFrame(data, index=idx) +def get_quote_google(symbols): + """ + Get current yahoo quote + + Returns a DataFrame + """ + if isinstance(symbols, str): + sym_list = symbols + elif not isinstance(symbols, Series): + symbols = Series(symbols) + sym_list = str.join('+', symbols) + else: + sym_list = str.join('+', symbols) + + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm + codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', + 'time': 't1', 'short_ratio': 's7'} + request = str.join('', codes.values()) # code request string + header = codes.keys() + + data = dict(zip(codes.keys(), [[] for i in range(len(codes))])) + + urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % ( + sym_list, request) + + try: + lines = urllib2.urlopen(urlStr).readlines() + except Exception, e: + s = "Failed to download:\n{0}".format(e) + print s + return None + + for line in lines: + fields = line.decode('utf-8').strip().split(',') + for i, field in enumerate(fields): + if field[-2:] == '%"': + data[header[i]].append(float(field.strip('"%'))) + elif field[0] == '"': + data[header[i]].append(field.strip('"')) + else: + try: + data[header[i]].append(float(field)) + except ValueError: + data[header[i]].append(np.nan) + + idx = data.pop('symbol') + + return DataFrame(data, index=idx) + + def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, pause=0, **kwargs): """ @@ -178,6 +232,52 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, "return a 200 for url %s" % (pause, url)) +def _get_hist_google(sym=None, start=None, end=None, retry_count=3, + pause=0, **kwargs): + """ + Get historical data for the given name from yahoo. + Date format is datetime + + Returns a DataFrame. + """ + if(sym is None): + warnings.warn("Need to provide a name.") + return None + + start, end = _sanitize_dates(start, end) + + yahoo_URL = 'http://ichart.yahoo.com/table.csv?' + + url = yahoo_URL + 's=%s' % sym + \ + '&a=%s' % (start.month - 1) + \ + '&b=%s' % start.day + \ + '&c=%s' % start.year + \ + '&d=%s' % (end.month - 1) + \ + '&e=%s' % end.day + \ + '&f=%s' % end.year + \ + '&g=d' + \ + '&ignore=.csv' + + for _ in range(retry_count): + resp = urllib2.urlopen(url) + if resp.code == 200: + lines = resp.read() + rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, + parse_dates=True)[::-1] + + # Yahoo! Finance sometimes does this awesome thing where they + # return 2 rows for the most recent business day + if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover + rs = rs[:-1] + + return rs + + time.sleep(pause) + + raise Exception("after %d tries, Yahoo did not " + "return a 200 for url %s" % (pause, url)) + + def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']): """ Return modifed DataFrame or Panel with adjusted prices based on @@ -347,6 +447,84 @@ def dl_mult_symbols(symbols): return hist_data +def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0, + adjust_price=False, ret_index=False, chunksize=25, + **kwargs): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Yahoo! Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, array-like object (list, tuple, Series), or DataFrame + Single stock symbol (ticker), array-like object of symbols or + DataFrame with index containing stock symbols. + start : string, (defaults to '1/1/2010') + Starting date, timestamp. Parses many different kind of date + representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') + end : string, (defaults to today) + Ending date, timestamp. Same format as starting date. + retry_count : int, default 3 + Number of times to retry query request. + pause : int, default 0 + Time, in seconds, to pause between consecutive queries of chunks. If + single value given for symbol, represents the pause between retries. + adjust_price : bool, default False + If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close') + based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops + 'Adj Close'. + ret_index : bool, default False + If True, includes a simple return index 'Ret_Index' in hist_data. + chunksize : int, default 25 + Number of symbols to download consecutively before intiating pause. + + Returns + ------- + hist_data : DataFrame (str) or Panel (array-like object, DataFrame) + """ + + def dl_mult_symbols(symbols): + stocks = {} + for sym_group in _in_chunks(symbols, chunksize): + for sym in sym_group: + try: + stocks[sym] = _get_hist_google(sym, start=start, + end=end, **kwargs) + except: + warnings.warn('Error with sym: ' + sym + '... skipping.') + + time.sleep(pause) + + return Panel(stocks).swapaxes('items', 'minor') + + if 'name' in kwargs: + warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.", + FutureWarning) + symbols = kwargs['name'] + + #If a single symbol, (e.g., 'GOOG') + if isinstance(symbols, (str, int)): + sym = symbols + hist_data = _get_hist_google(sym, start=start, end=end) + #Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) + elif isinstance(symbols, DataFrame): + try: + hist_data = dl_mult_symbols(Series(symbols.index)) + except ValueError: + raise + else: #Guess a Series + try: + hist_data = dl_mult_symbols(symbols) + except TypeError: + hist_data = dl_mult_symbols(Series(symbols)) + + if(ret_index): + hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) + if(adjust_price): + hist_data = _adjust_prices(hist_data) + + return hist_data def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py new file mode 100644 index 0000000000000..9c3e81485f34d --- /dev/null +++ b/pandas/io/tests/test_google.py @@ -0,0 +1,95 @@ +import unittest +import nose +from datetime import datetime + +import pandas as pd +import pandas.io.data as web +from pandas.util.testing import (network, assert_frame_equal, + assert_series_equal, + assert_almost_equal) +from numpy.testing.decorators import slow + +import urllib2 + + +class TestGoogle(unittest.TestCase): + + @slow + @network + def test_google(self): + # asserts that google is minimally working and that it throws + # an excecption when DataReader can't get a 200 response from + # google + start = datetime(2010, 1, 1) + end = datetime(2013, 01, 27) + + try: + self.assertEquals( + web.DataReader("F", 'google', start, end)['Close'][-1], + 13.68) + + self.assertRaises( + Exception, + lambda: web.DataReader("NON EXISTENT TICKER", 'google', + start, end)) + except urllib2.URLError: + try: + urllib2.urlopen('http://www.google.com') + except urllib2.URLError: + raise nose.SkipTest + else: + raise + + + @slow + @network + def test_get_quote(self): + df = web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])) + assert_series_equal(df.ix[0], df.ix[2]) + + + @slow + @network + def test_get_data(self): + import numpy as np + df = web.get_data_google('GOOG') + assert df.Volume.ix['OCT-08-2010'] == 2859200 + + sl = ['AAPL', 'AMZN', 'GOOG'] + pan = web.get_data_google(sl, '2012') + ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] + assert ts[0].dayofyear == 96 + + pan = web.get_data_google(['GE', 'MSFT', 'INTC'], 'JAN-01-12', 'JAN-31-12') + expected = [19.02, 28.23, 25.39] + result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() + assert result == expected + + # sanity checking + t= np.array(result) + assert np.issubdtype(t.dtype, np.floating) + assert t.shape == (3,) + + expected = [[ 18.99, 28.4 , 25.18], + [ 18.58, 28.31, 25.13], + [ 19.03, 28.16, 25.52], + [ 18.81, 28.82, 25.87]] + result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values + assert (result == expected).all() + + #Check ret_index + pan = web.get_data_google(['GE', 'INTC', 'IBM'], '1977', '1987', + ret_index=True) + tstamp = pan.Ret_Index.INTC.first_valid_index() + result = pan.Ret_Index.ix[tstamp]['INTC'] + expected = 1.0 + assert result == expected + + # sanity checking + t= np.array(pan) + assert np.issubdtype(t.dtype, np.floating) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) From ad89365c3870f192dde01568b584bec8b7ee1086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 19:00:30 -0400 Subject: [PATCH 2/6] Remove unneeded import from test_google --- pandas/io/tests/test_google.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 9c3e81485f34d..5b5fdd59e4b55 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -4,9 +4,7 @@ import pandas as pd import pandas.io.data as web -from pandas.util.testing import (network, assert_frame_equal, - assert_series_equal, - assert_almost_equal) +from pandas.util.testing import (network, assert_series_equal) from numpy.testing.decorators import slow import urllib2 From ee10caaaa30a81fc0e72ff53ca85f0937099b837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 19:13:13 -0400 Subject: [PATCH 3/6] Implement _get_hist_google --- pandas/io/data.py | 39 +++++++--------------------------- pandas/io/tests/test_google.py | 11 ++-------- 2 files changed, 10 insertions(+), 40 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index f2b539fc795a7..d178d0089e6d6 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -246,18 +246,12 @@ def _get_hist_google(sym=None, start=None, end=None, retry_count=3, start, end = _sanitize_dates(start, end) - yahoo_URL = 'http://ichart.yahoo.com/table.csv?' - - url = yahoo_URL + 's=%s' % sym + \ - '&a=%s' % (start.month - 1) + \ - '&b=%s' % start.day + \ - '&c=%s' % start.year + \ - '&d=%s' % (end.month - 1) + \ - '&e=%s' % end.day + \ - '&f=%s' % end.year + \ - '&g=d' + \ - '&ignore=.csv' + google_URL = 'http://www.google.com/finance/historical?' + # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv + url = google_URL + urllib.urlencode({"q": sym, \ + "startdate": start.strftime('%b %d, %Y'), \ + "enddate": end.strftime('%b %d, %Y'), "output": "csv" }) for _ in range(retry_count): resp = urllib2.urlopen(url) if resp.code == 200: @@ -265,16 +259,11 @@ def _get_hist_google(sym=None, start=None, end=None, retry_count=3, rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, parse_dates=True)[::-1] - # Yahoo! Finance sometimes does this awesome thing where they - # return 2 rows for the most recent business day - if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover - rs = rs[:-1] - return rs time.sleep(pause) - raise Exception("after %d tries, Yahoo did not " + raise Exception("after %d tries, Google did not " "return a 200 for url %s" % (pause, url)) @@ -448,11 +437,10 @@ def dl_mult_symbols(symbols): return hist_data def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0, - adjust_price=False, ret_index=False, chunksize=25, - **kwargs): + chunksize=25, **kwargs): """ Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Yahoo! Finance servers, + range, start to end. To avoid being penalized by Google Finance servers, pauses between downloading 'chunks' of symbols can be specified. Parameters @@ -470,12 +458,6 @@ def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0, pause : int, default 0 Time, in seconds, to pause between consecutive queries of chunks. If single value given for symbol, represents the pause between retries. - adjust_price : bool, default False - If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close') - based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops - 'Adj Close'. - ret_index : bool, default False - If True, includes a simple return index 'Ret_Index' in hist_data. chunksize : int, default 25 Number of symbols to download consecutively before intiating pause. @@ -519,11 +501,6 @@ def dl_mult_symbols(symbols): except TypeError: hist_data = dl_mult_symbols(Series(symbols)) - if(ret_index): - hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) - if(adjust_price): - hist_data = _adjust_prices(hist_data) - return hist_data def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 5b5fdd59e4b55..01868a70c3709 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -51,7 +51,8 @@ def test_get_quote(self): def test_get_data(self): import numpy as np df = web.get_data_google('GOOG') - assert df.Volume.ix['OCT-08-2010'] == 2859200 + print(df.Volume.ix['OCT-08-2010']) + assert df.Volume.ix['OCT-08-2010'] == 2863473 sl = ['AAPL', 'AMZN', 'GOOG'] pan = web.get_data_google(sl, '2012') @@ -75,14 +76,6 @@ def test_get_data(self): result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values assert (result == expected).all() - #Check ret_index - pan = web.get_data_google(['GE', 'INTC', 'IBM'], '1977', '1987', - ret_index=True) - tstamp = pan.Ret_Index.INTC.first_valid_index() - result = pan.Ret_Index.ix[tstamp]['INTC'] - expected = 1.0 - assert result == expected - # sanity checking t= np.array(pan) assert np.issubdtype(t.dtype, np.floating) From f43d24540a09cc2855569c6e8811669759cc065a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 19:20:23 -0400 Subject: [PATCH 4/6] No current finance data from Google --- pandas/io/data.py | 49 +--------------------------------- pandas/io/tests/test_google.py | 5 ++-- 2 files changed, 3 insertions(+), 51 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index d178d0089e6d6..13551272edae2 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -137,54 +137,7 @@ def get_quote_yahoo(symbols): def get_quote_google(symbols): - """ - Get current yahoo quote - - Returns a DataFrame - """ - if isinstance(symbols, str): - sym_list = symbols - elif not isinstance(symbols, Series): - symbols = Series(symbols) - sym_list = str.join('+', symbols) - else: - sym_list = str.join('+', symbols) - - # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm - codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', - 'time': 't1', 'short_ratio': 's7'} - request = str.join('', codes.values()) # code request string - header = codes.keys() - - data = dict(zip(codes.keys(), [[] for i in range(len(codes))])) - - urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % ( - sym_list, request) - - try: - lines = urllib2.urlopen(urlStr).readlines() - except Exception, e: - s = "Failed to download:\n{0}".format(e) - print s - return None - - for line in lines: - fields = line.decode('utf-8').strip().split(',') - for i, field in enumerate(fields): - if field[-2:] == '%"': - data[header[i]].append(float(field.strip('"%'))) - elif field[0] == '"': - data[header[i]].append(field.strip('"')) - else: - try: - data[header[i]].append(float(field)) - except ValueError: - data[header[i]].append(np.nan) - - idx = data.pop('symbol') - - return DataFrame(data, index=idx) - + raise NotImplementedError("Google Finance doesn't have this functionality") def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, pause=0, **kwargs): diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 01868a70c3709..9db7964c1acfe 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -42,9 +42,8 @@ def test_google(self): @slow @network def test_get_quote(self): - df = web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])) - assert_series_equal(df.ix[0], df.ix[2]) - + self.assertRaises(NotImplementedError, + lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG']))) @slow @network From c0529576e1bcd4369954539fbafc82a5e8c42502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 20:11:02 -0400 Subject: [PATCH 5/6] Corrected typo in data --- pandas/io/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index 13551272edae2..8bc3df561cadb 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -188,7 +188,7 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, def _get_hist_google(sym=None, start=None, end=None, retry_count=3, pause=0, **kwargs): """ - Get historical data for the given name from yahoo. + Get historical data for the given name from google. Date format is datetime Returns a DataFrame. From 0aadb1195219269b38e551e9044a52c33898e437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 20:16:28 -0400 Subject: [PATCH 6/6] Change google finance tests to @network only --- pandas/io/tests/test_google.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 9db7964c1acfe..7f4ca13c27e58 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -12,7 +12,6 @@ class TestGoogle(unittest.TestCase): - @slow @network def test_google(self): # asserts that google is minimally working and that it throws @@ -39,13 +38,11 @@ def test_google(self): raise - @slow @network def test_get_quote(self): self.assertRaises(NotImplementedError, lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG']))) - @slow @network def test_get_data(self): import numpy as np