diff --git a/pandas/io/data.py b/pandas/io/data.py
index 43178fdcfddf1..8bc3df561cadb 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -58,6 +58,10 @@ def DataReader(name, data_source=None, start=None, end=None,
         return get_data_yahoo(symbols=name, start=start, end=end,
                               adjust_price=False, chunk=25,
                               retry_count=retry_count, pause=pause)
+    elif(data_source == "google"):
+        return get_data_google(symbols=name, start=start, end=end,
+                               adjust_price=False, chunk=25,
+                               retry_count=retry_count, pause=pause)
     elif(data_source == "fred"):
         return get_data_fred(name=name, start=start, end=end)
     elif(data_source == "famafrench"):
@@ -132,6 +136,9 @@ def get_quote_yahoo(symbols):
     return DataFrame(data, index=idx)
 
 
+def get_quote_google(symbols):
+    raise NotImplementedError("Google Finance doesn't have this functionality")
+
 def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
                     pause=0, **kwargs):
     """
@@ -178,6 +185,41 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
                     "return a 200 for url %s" % (pause, url))
 
 
+def _get_hist_google(sym=None, start=None, end=None, retry_count=3,
+                     pause=0, **kwargs):
+    """
+    Get historical data for the given name from google.
+    Date format is datetime
+
+    Returns a DataFrame.
+    """
+    if(sym is None):
+        warnings.warn("Need to provide a name.")
+        return None
+
+    start, end = _sanitize_dates(start, end)
+
+    google_URL = 'http://www.google.com/finance/historical?'
+
+    # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
+    url = google_URL + urllib.urlencode({"q": sym,
+                                         "startdate": start.strftime('%b %d, %Y'),
+                                         "enddate": end.strftime('%b %d, %Y'), "output": "csv"})
+    for _ in range(retry_count):
+        resp = urllib2.urlopen(url)
+        if resp.code == 200:
+            lines = resp.read()
+            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
+                          parse_dates=True)[::-1]
+
+            return rs
+
+        time.sleep(pause)
+
+    raise Exception("after %d tries, Google did not "
+                    "return a 200 for url %s" % (retry_count, url))
+
+
 def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
     """
     Return modifed DataFrame or Panel with adjusted prices based on
@@ -347,6 +389,72 @@ def dl_mult_symbols(symbols):
 
     return hist_data
 
 
+def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0,
+                    chunksize=25, **kwargs):
+    """
+    Returns DataFrame/Panel of historical stock prices from symbols, over date
+    range, start to end. To avoid being penalized by Google Finance servers,
+    pauses between downloading 'chunks' of symbols can be specified.
+
+    Parameters
+    ----------
+    symbols : string, array-like object (list, tuple, Series), or DataFrame
+        Single stock symbol (ticker), array-like object of symbols or
+        DataFrame with index containing stock symbols.
+    start : string, (defaults to '1/1/2010')
+        Starting date, timestamp. Parses many different kinds of date
+        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
+    end : string, (defaults to today)
+        Ending date, timestamp. Same format as starting date.
+    retry_count : int, default 3
+        Number of times to retry query request.
+    pause : int, default 0
+        Time, in seconds, to pause between consecutive queries of chunks. If
+        single value given for symbol, represents the pause between retries.
+    chunksize : int, default 25
+        Number of symbols to download consecutively before initiating pause.
+
+    Returns
+    -------
+    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
+    """
+
+    def dl_mult_symbols(symbols):
+        stocks = {}
+        for sym_group in _in_chunks(symbols, chunksize):
+            for sym in sym_group:
+                try:
+                    stocks[sym] = _get_hist_google(sym, start=start,
+                                                   end=end, **kwargs)
+                except:
+                    warnings.warn('Error with sym: ' + sym + '... skipping.')
+
+            time.sleep(pause)
+
+        return Panel(stocks).swapaxes('items', 'minor')
+
+    if 'name' in kwargs:
+        warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.",
+                      FutureWarning)
+        symbols = kwargs['name']
+
+    # If a single symbol (e.g., 'GOOG')
+    if isinstance(symbols, (str, int)):
+        sym = symbols
+        hist_data = _get_hist_google(sym, start=start, end=end)
+    # Or multiple symbols (e.g., ['GOOG', 'AAPL', 'MSFT'])
+    elif isinstance(symbols, DataFrame):
+        try:
+            hist_data = dl_mult_symbols(Series(symbols.index))
+        except ValueError:
+            raise
+    else:  # Guess a Series
+        try:
+            hist_data = dl_mult_symbols(symbols)
+        except TypeError:
+            hist_data = dl_mult_symbols(Series(symbols))
+
+    return hist_data
 def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
                   end=dt.datetime.today()):
diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py
new file mode 100644
index 0000000000000..7f4ca13c27e58
--- /dev/null
+++ b/pandas/io/tests/test_google.py
@@ -0,0 +1,82 @@
+import unittest
+import nose
+from datetime import datetime
+
+import pandas as pd
+import pandas.io.data as web
+from pandas.util.testing import (network, assert_series_equal)
+from numpy.testing.decorators import slow
+
+import urllib2
+
+
+class TestGoogle(unittest.TestCase):
+
+    @network
+    def test_google(self):
+        # asserts that google is minimally working and that it throws
+        # an exception when DataReader can't get a 200 response from
+        # google
+        start = datetime(2010, 1, 1)
+        end = datetime(2013, 1, 27)
+
+        try:
+            self.assertEquals(
+                web.DataReader("F", 'google', start, end)['Close'][-1],
+                13.68)
+
+            self.assertRaises(
+                Exception,
+                lambda: web.DataReader("NON EXISTENT TICKER", 'google',
+                                       start, end))
+        except urllib2.URLError:
+            try:
+                urllib2.urlopen('http://www.google.com')
+            except urllib2.URLError:
+                raise nose.SkipTest
+            else:
+                raise
+
+
+    @network
+    def test_get_quote(self):
+        self.assertRaises(NotImplementedError,
+                          lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])))
+
+    @network
+    def test_get_data(self):
+        import numpy as np
+        df = web.get_data_google('GOOG')
+        print(df.Volume.ix['OCT-08-2010'])
+        assert df.Volume.ix['OCT-08-2010'] == 2863473
+
+        sl = ['AAPL', 'AMZN', 'GOOG']
+        pan = web.get_data_google(sl, '2012')
+        ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
+        assert ts[0].dayofyear == 96
+
+        pan = web.get_data_google(['GE', 'MSFT', 'INTC'], 'JAN-01-12', 'JAN-31-12')
+        expected = [19.02, 28.23, 25.39]
+        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
+        assert result == expected
+
+        # sanity checking
+        t = np.array(result)
+        assert np.issubdtype(t.dtype, np.floating)
+        assert t.shape == (3,)
+
+        expected = [[18.99, 28.4, 25.18],
+                    [18.58, 28.31, 25.13],
+                    [19.03, 28.16, 25.52],
+                    [18.81, 28.82, 25.87]]
+        result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values
+        assert (result == expected).all()
+
+        # sanity checking
+        t = np.array(pan)
+        assert np.issubdtype(t.dtype, np.floating)
+
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
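
For reviewers, a minimal usage sketch of the interface this patch adds (not part of the diff itself). It assumes the patched pandas.io.data of this era (Python 2, urllib2-based) and that Google Finance still serves the historical CSV endpoint; the tickers, dates, and printed columns below are illustrative only.

    # Usage sketch only; assumes the patched pandas.io.data module is importable
    # and a working connection to Google Finance.
    from datetime import datetime

    import pandas.io.data as web

    start = datetime(2012, 1, 1)
    end = datetime(2012, 6, 30)

    # Single symbol through the generic DataReader entry point -> DataFrame
    # with Open/High/Low/Close/Volume columns indexed by date.
    goog = web.DataReader("GOOG", "google", start, end)
    print(goog[['Open', 'Close', 'Volume']].head())

    # Multiple symbols through get_data_google -> Panel; after the
    # swapaxes('items', 'minor') in dl_mult_symbols, the items are the price
    # fields and the minor axis holds the tickers, mirroring get_data_yahoo.
    panel = web.get_data_google(['AAPL', 'MSFT'], start=start, end=end)
    print(panel['Close'].head())

    # get_quote_google is only a stub in this patch and always raises.
    try:
        web.get_quote_google(['GOOG'])
    except NotImplementedError:
        pass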