Merge pull request #3814 from gliptak/googledata
Implement historical finance data from Google Finance
wesm committed Jun 10, 2013
2 parents 241db0d + 0aadb11 commit 5496613
Showing 2 changed files with 190 additions and 0 deletions.
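In brief, the commit adds a "google" branch to pandas.io.data.DataReader plus get_data_google/_get_hist_google helpers, mirroring the existing Yahoo! readers. A minimal usage sketch follows (the ticker and dates are borrowed from the new test file; Google has since retired this CSV endpoint, so treat it as illustrative only):

import pandas.io.data as web
from datetime import datetime

start = datetime(2010, 1, 1)
end = datetime(2013, 1, 27)

# DataReader dispatches on the data_source string; "google" is the new branch
f = web.DataReader("F", "google", start, end)  # daily OHLCV DataFrame
print(f['Close'][-1])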
pandas/io/data.py: 108 additions, 0 deletions
@@ -58,6 +58,10 @@ def DataReader(name, data_source=None, start=None, end=None,
        return get_data_yahoo(symbols=name, start=start, end=end,
                              adjust_price=False, chunk=25,
                              retry_count=retry_count, pause=pause)
    elif(data_source == "google"):
        return get_data_google(symbols=name, start=start, end=end,
                               adjust_price=False, chunk=25,
                               retry_count=retry_count, pause=pause)
    elif(data_source == "fred"):
        return get_data_fred(name=name, start=start, end=end)
    elif(data_source == "famafrench"):
@@ -132,6 +136,9 @@ def get_quote_yahoo(symbols):
    return DataFrame(data, index=idx)


def get_quote_google(symbols):
    raise NotImplementedError("Google Finance doesn't have this functionality")

def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
                    pause=0, **kwargs):
    """
@@ -178,6 +185,41 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
"return a 200 for url %s" % (pause, url))


def _get_hist_google(sym=None, start=None, end=None, retry_count=3,
                     pause=0, **kwargs):
    """
    Get historical data for the given name from google.
    Date format is datetime

    Returns a DataFrame.
    """
    if(sym is None):
        warnings.warn("Need to provide a name.")
        return None

    start, end = _sanitize_dates(start, end)

    google_URL = 'http://www.google.com/finance/historical?'

    # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
    url = google_URL + urllib.urlencode({"q": sym,
                                         "startdate": start.strftime('%b %d, %Y'),
                                         "enddate": end.strftime('%b %d, %Y'),
                                         "output": "csv"})
    for _ in range(retry_count):
        resp = urllib2.urlopen(url)
        if resp.code == 200:
            lines = resp.read()
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True)[::-1]

            return rs

        time.sleep(pause)

    raise Exception("after %d tries, Google did not "
                    "return a 200 for url %s" % (retry_count, url))

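For reference, a sketch of the request URL the helper above builds, using the same urllib.urlencode call (Python 2); since the parameters come from a dict, the order of the encoded fields may vary:

import urllib

params = {"q": "GOOG",
          "startdate": "Jun 09, 2011",
          "enddate": "Jun 08, 2013",
          "output": "csv"}
# Yields a query string like:
#   q=GOOG&startdate=Jun+09%2C+2011&enddate=Jun+08%2C+2013&output=csv
print('http://www.google.com/finance/historical?' + urllib.urlencode(params))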

def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
    """
    Return modified DataFrame or Panel with adjusted prices based on
@@ -347,6 +389,72 @@ def dl_mult_symbols(symbols):

    return hist_data

def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0,
                    chunksize=25, **kwargs):
    """
    Returns DataFrame/Panel of historical stock prices from symbols, over date
    range, start to end. To avoid being penalized by Google Finance servers,
    pauses between downloading 'chunks' of symbols can be specified.

    Parameters
    ----------
    symbols : string, array-like object (list, tuple, Series), or DataFrame
        Single stock symbol (ticker), array-like object of symbols or
        DataFrame with index containing stock symbols.
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kinds of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, to pause between consecutive queries of chunks. If
        single value given for symbol, represents the pause between retries.
    chunksize : int, default 25
        Number of symbols to download consecutively before initiating pause.

    Returns
    -------
    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
    """

    def dl_mult_symbols(symbols):
        stocks = {}
        for sym_group in _in_chunks(symbols, chunksize):
            for sym in sym_group:
                try:
                    stocks[sym] = _get_hist_google(sym, start=start,
                                                   end=end, **kwargs)
                except:
                    warnings.warn('Error with sym: ' + sym + '... skipping.')

            time.sleep(pause)

        return Panel(stocks).swapaxes('items', 'minor')

    if 'name' in kwargs:
        warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.",
                      FutureWarning)
        symbols = kwargs['name']

    # If a single symbol, (e.g., 'GOOG')
    if isinstance(symbols, (str, int)):
        sym = symbols
        hist_data = _get_hist_google(sym, start=start, end=end)
    # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
    elif isinstance(symbols, DataFrame):
        try:
            hist_data = dl_mult_symbols(Series(symbols.index))
        except ValueError:
            raise
    else:  # Guess a Series
        try:
            hist_data = dl_mult_symbols(symbols)
        except TypeError:
            hist_data = dl_mult_symbols(Series(symbols))

    return hist_data
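A usage sketch of the two return shapes, mirroring the calls exercised in the new test file below (Panel was the standard pandas container for 3-D data at the time of this commit):

import pandas.io.data as web

# Single symbol -> DataFrame of daily Open/High/Low/Close/Volume
df = web.get_data_google('GOOG')

# List of symbols -> Panel: items are the price fields, minor axis the tickers
pan = web.get_data_google(['GE', 'MSFT', 'INTC'], 'JAN-01-12', 'JAN-31-12')
closes = pan['Close']  # DataFrame with one column per ticker
print(closes.ix['01-18-12'])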

def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
                  end=dt.datetime.today()):
pandas/io/tests/test_google.py: 82 additions, 0 deletions
@@ -0,0 +1,82 @@
import unittest
import nose
from datetime import datetime

import pandas as pd
import pandas.io.data as web
from pandas.util.testing import (network, assert_series_equal)
from numpy.testing.decorators import slow

import urllib2


class TestGoogle(unittest.TestCase):

    @network
    def test_google(self):
        # asserts that google is minimally working and that it throws
        # an exception when DataReader can't get a 200 response from
        # google
        start = datetime(2010, 1, 1)
        end = datetime(2013, 1, 27)

        try:
            self.assertEquals(
                web.DataReader("F", 'google', start, end)['Close'][-1],
                13.68)

            self.assertRaises(
                Exception,
                lambda: web.DataReader("NON EXISTENT TICKER", 'google',
                                       start, end))
        except urllib2.URLError:
            try:
                urllib2.urlopen('http://www.google.com')
            except urllib2.URLError:
                raise nose.SkipTest
            else:
                raise


    @network
    def test_get_quote(self):
        self.assertRaises(NotImplementedError,
                          lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])))

    @network
    def test_get_data(self):
        import numpy as np
        df = web.get_data_google('GOOG')
        print(df.Volume.ix['OCT-08-2010'])
        assert df.Volume.ix['OCT-08-2010'] == 2863473

        sl = ['AAPL', 'AMZN', 'GOOG']
        pan = web.get_data_google(sl, '2012')
        ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
        assert ts[0].dayofyear == 96

        pan = web.get_data_google(['GE', 'MSFT', 'INTC'], 'JAN-01-12', 'JAN-31-12')
        expected = [19.02, 28.23, 25.39]
        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
        assert result == expected

        # sanity checking
        t = np.array(result)
        assert np.issubdtype(t.dtype, np.floating)
        assert t.shape == (3,)

        expected = [[18.99, 28.4, 25.18],
                    [18.58, 28.31, 25.13],
                    [19.03, 28.16, 25.52],
                    [18.81, 28.82, 25.87]]
        result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values
        assert (result == expected).all()

        # sanity checking
        t = np.array(pan)
        assert np.issubdtype(t.dtype, np.floating)


if __name__ == '__main__':
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)
