Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement historical finance data from Google Finance #3814

Merged
merged 6 commits into from
Jun 10, 2013
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions pandas/io/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ def DataReader(name, data_source=None, start=None, end=None,
return get_data_yahoo(symbols=name, start=start, end=end,
adjust_price=False, chunk=25,
retry_count=retry_count, pause=pause)
elif(data_source == "google"):
return get_data_google(symbols=name, start=start, end=end,
adjust_price=False, chunk=25,
retry_count=retry_count, pause=pause)
elif(data_source == "fred"):
return get_data_fred(name=name, start=start, end=end)
elif(data_source == "famafrench"):
Expand Down Expand Up @@ -132,6 +136,9 @@ def get_quote_yahoo(symbols):
return DataFrame(data, index=idx)


def get_quote_google(symbols):
    """
    Deliberately unimplemented: Google Finance has no real-time quote
    endpoint equivalent to get_quote_yahoo, so this always raises.

    Parameters
    ----------
    symbols : string or array-like object of symbols (accepted for
        signature parity with get_quote_yahoo; never used).

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Google Finance doesn't have this functionality")

def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
pause=0, **kwargs):
"""
Expand Down Expand Up @@ -178,6 +185,41 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
"return a 200 for url %s" % (pause, url))


def _get_hist_google(sym=None, start=None, end=None, retry_count=3,
                     pause=0, **kwargs):
    """
    Get historical data for the given name from Google Finance.

    Parameters
    ----------
    sym : str
        Single stock symbol (ticker). If None, a warning is issued and
        None is returned instead of raising.
    start, end : datetime-like, optional
        Date range; normalized by _sanitize_dates (defaults applied there).
    retry_count : int, default 3
        Number of times to retry the HTTP request.
    pause : int, default 0
        Seconds to sleep between retries.

    Returns
    -------
    DataFrame of historical prices in ascending date order, or None when
    sym is None.

    Raises
    ------
    Exception
        If no 200 response is received after retry_count attempts.
    """
    if sym is None:
        warnings.warn("Need to provide a name.")
        return None

    start, end = _sanitize_dates(start, end)

    google_URL = 'http://www.google.com/finance/historical?'

    # Example query:
    # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
    url = google_URL + urllib.urlencode({"q": sym,
                                         "startdate": start.strftime('%b %d, %Y'),
                                         "enddate": end.strftime('%b %d, %Y'),
                                         "output": "csv"})
    for _ in range(retry_count):
        resp = urllib2.urlopen(url)
        if resp.code == 200:
            lines = resp.read()
            # Google returns newest-first; reverse to ascending date order.
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True)[::-1]

            return rs

        time.sleep(pause)

    # Report the number of attempts, not the pause length.
    raise Exception("after %d tries, Google did not "
                    "return a 200 for url %s" % (retry_count, url))


def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
"""
Return modifed DataFrame or Panel with adjusted prices based on
Expand Down Expand Up @@ -347,6 +389,72 @@ def dl_mult_symbols(symbols):

return hist_data

def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0,
                    chunksize=25, **kwargs):
    """
    Returns DataFrame/Panel of historical stock prices from symbols, over date
    range, start to end. To avoid being penalized by Google Finance servers,
    pauses between downloading 'chunks' of symbols can be specified.

    Parameters
    ----------
    symbols : string, array-like object (list, tuple, Series), or DataFrame
        Single stock symbol (ticker), array-like object of symbols or
        DataFrame with index containing stock symbols.
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, to pause between consecutive queries of chunks. If
        single value given for symbol, represents the pause between retries.
    chunksize : int, default 25
        Number of symbols to download consecutively before intiating pause.

    Returns
    -------
    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
    """

    def dl_mult_symbols(symbols):
        # Download each symbol individually, sleeping between chunks to be
        # polite to Google's servers; failed symbols are skipped with a
        # warning rather than aborting the whole batch.
        stocks = {}
        for sym_group in _in_chunks(symbols, chunksize):
            for sym in sym_group:
                try:
                    stocks[sym] = _get_hist_google(sym, start=start,
                                                   end=end, **kwargs)
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt /
                    # SystemExit still propagate.
                    warnings.warn('Error with sym: ' + sym + '... skipping.')

            time.sleep(pause)

        return Panel(stocks).swapaxes('items', 'minor')

    if 'name' in kwargs:
        warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.",
                      FutureWarning)
        symbols = kwargs['name']

    # If a single symbol, (e.g., 'GOOG')
    if isinstance(symbols, (str, int)):
        sym = symbols
        hist_data = _get_hist_google(sym, start=start, end=end)
    # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
    elif isinstance(symbols, DataFrame):
        try:
            hist_data = dl_mult_symbols(Series(symbols.index))
        except ValueError:
            raise
    else:  # Guess a Series
        try:
            hist_data = dl_mult_symbols(symbols)
        except TypeError:
            hist_data = dl_mult_symbols(Series(symbols))

    return hist_data

def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
end=dt.datetime.today()):
Expand Down
85 changes: 85 additions & 0 deletions pandas/io/tests/test_google.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import unittest
import nose
from datetime import datetime

import pandas as pd
import pandas.io.data as web
from pandas.util.testing import (network, assert_series_equal)
from numpy.testing.decorators import slow

import urllib2


class TestGoogle(unittest.TestCase):
    """Smoke tests for the Google Finance data reader in pandas.io.data."""

    # NOTE: tests are marked @network only (not @slow as well) -- the test
    # runner excludes one marker or the other, so carrying both would mean
    # the tests never run.
    @network
    def test_google(self):
        # asserts that google is minimally working and that it throws
        # an exception when DataReader can't get a 200 response from
        # google
        start = datetime(2010, 1, 1)
        end = datetime(2013, 1, 27)

        try:
            self.assertEquals(
                web.DataReader("F", 'google', start, end)['Close'][-1],
                13.68)

            self.assertRaises(
                Exception,
                lambda: web.DataReader("NON EXISTENT TICKER", 'google',
                                       start, end))
        except urllib2.URLError:
            # Distinguish "Google unreachable" (skip) from a real failure:
            # if google.com itself can't be reached there is no network.
            try:
                urllib2.urlopen('http://www.google.com')
            except urllib2.URLError:
                raise nose.SkipTest
            else:
                raise

    @network
    def test_get_quote(self):
        # Google has no real-time quote endpoint; the stub must raise.
        self.assertRaises(NotImplementedError,
                          lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])))

    @network
    def test_get_data(self):
        import numpy as np
        # Single symbol returns a DataFrame.
        df = web.get_data_google('GOOG')
        assert df.Volume.ix['OCT-08-2010'] == 2863473

        # Multiple symbols return a Panel.
        sl = ['AAPL', 'AMZN', 'GOOG']
        pan = web.get_data_google(sl, '2012')
        ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
        assert ts[0].dayofyear == 96

        pan = web.get_data_google(['GE', 'MSFT', 'INTC'], 'JAN-01-12', 'JAN-31-12')
        expected = [19.02, 28.23, 25.39]
        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
        assert result == expected

        # sanity checking
        t = np.array(result)
        assert np.issubdtype(t.dtype, np.floating)
        assert t.shape == (3,)

        expected = [[18.99, 28.4, 25.18],
                    [18.58, 28.31, 25.13],
                    [19.03, 28.16, 25.52],
                    [18.81, 28.82, 25.87]]
        result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values
        assert (result == expected).all()

        # sanity checking
        t = np.array(pan)
        assert np.issubdtype(t.dtype, np.floating)


if __name__ == '__main__':
    # Run this module's tests directly under nose: verbose, stop on first
    # failure, drop into pdb on errors and failures.
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)