read_crunchbase.py
# Usage (this is a Python 2 script: it uses urllib2 and print statements):
#   mkdir -p ./crunchbasedata
#   python read_crunchbase.py
import time
import os
import random
import sys
import json
import pandas as pd
import urllib2
# CrunchBase Key
# **** Replace the CB_KEY below with your API key ****
CB_KEY = '<insert key here>'
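# A minimal alternative sketch: keep the key out of the source by reading it
# from an environment variable. CRUNCHBASE_KEY is an assumed variable name,
# not part of the original script; os is already imported above.
# CB_KEY = os.environ.get('CRUNCHBASE_KEY', CB_KEY)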
DEST_DIR = './crunchbasedata'
# input: a url as a string
# output: the page body (the result of urlopen().read()) as a string,
#         or None if the request keeps failing after several retries
def get_page_from_url(url):
    request = urllib2.Request(url)
    cnt = 3  # number of attempts before giving up
    while True:
        try:
            page = urllib2.urlopen(request)
            break
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print 'Failed to reach url'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                if e.code == 404:  # page not found
                    print 'Error: ', e.code
            cnt -= 1
            if cnt == 0:
                return None
            time.sleep(2)  # back off briefly before retrying
    return page.read()
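# A minimal usage sketch (the body comes back as a raw string; None signals
# that every retry failed, so callers should skip the page as
# read_crunchbase() does below):
# body = get_page_from_url(url)
# if body is None:
#     ...  # skip this page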
def read_crunchbase(last):
    co_type = "company"
    # change the page number(s) as necessary:
    start_page = 1
    end_page = 100
    for page_no in range(start_page, end_page):
        if page_no in last:  # already fetched on a previous run
            continue
        companies_df = pd.DataFrame()
        url = "https://api.crunchbase.com/v/3/organizations?" \
              "organization_types=%s&page=%d&user_key=%s" % \
              (co_type, page_no, CB_KEY)
        print "working on URL: ", url
        page = get_page_from_url(url)
        if not page:
            continue
        decoded = page.decode('ASCII', 'ignore')
        print decoded  # echo the raw response (debugging aid)
        # response shape, as inferred from the parsing below:
        # {"data": {"items": [{"uuid": ..., "properties": {...}}, ...]}}
        results = json.loads(decoded, strict=False)['data']['items']
        for co in results:
            dic = co['properties']
            dic['uuid'] = co['uuid']  # carry the uuid along with the properties
            companies_df = companies_df.append(dic, ignore_index=True)
        print_info_to_csv(companies_df, page_no)
# write one page of results to DEST_DIR/<page_no>.csv
def print_info_to_csv(companies_df, page_no):
    fn = os.path.join(DEST_DIR, '%d.csv' % page_no)
    companies_df.to_csv(fn, encoding='utf-8')
    print 'Done, saved to ', fn
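# For example, print_info_to_csv(df, 7) writes ./crunchbasedata/7.csv
# (given the DEST_DIR default above).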
# return the file names in the top level of dest
def get_all_files(dest):
    f = []
    for (dirpath, dirnames, filenames) in os.walk(dest):
        f.extend(filenames)
        break  # stop os.walk after the first (top-level) directory
    return f
# map each already-downloaded page number (taken from the CSV file
# names in dest) to True, so those pages can be skipped on a re-run
def get_last_success(dest):
    f = get_all_files(dest)
    res = {}
    for fn in f:
        curr = int(fn.split('.')[0])
        res[curr] = True
    return res
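# For example, if DEST_DIR already holds 1.csv and 3.csv, this returns
# {1: True, 3: True}, and read_crunchbase() will then skip pages 1 and 3.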
# concatenate every per-page CSV in dest into one file named after the
# lowest and highest page numbers found
def merge_csv(dest):
    f = get_all_files(dest)
    mn = -1
    mx = -1
    companies_df = pd.DataFrame()
    for fn in f:
        companies_df = companies_df.append(pd.read_csv(os.path.join(dest, fn)))
        curr = int(fn.split('.')[0])
        if mx == -1 or curr > mx:
            mx = curr
        if mn == -1 or curr < mn:
            mn = curr
    fn = 'getCompany%d_%d.csv' % (mn, mx)
    companies_df.to_csv(fn, encoding='utf-8')
    print 'Saved to', fn, 'in current dir.'
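# For example, per-page files 1.csv .. 99.csv merge into
# getCompany1_99.csv in the current working directory.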
def main():
    last = get_last_success(DEST_DIR)
    read_crunchbase(last)
    merge_csv(DEST_DIR)

if __name__ == '__main__':
    main()