forked from justmarkham/DAT4
-
Notifications
You must be signed in to change notification settings - Fork 0
/
12_scrape_pandora.py
51 lines (42 loc) · 1.44 KB
/
12_scrape_pandora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
import pandas as pd
import requests
import time
import csv
import matplotlib.pyplot as plt
import numpy as np
# Change this to your Pandora screen name!
screen_name = ''

#####

# Collect the unique song links found on the selected stations of the profile.
set_list = set()
stations = requests.get('http://pandorasongs.oliverzheng.com/username/'+screen_name).json()
# Only the stations at positions 2 and 3 of the returned list are scraped.
for station in stations['stations'][2:4]:
    stationID = station['stationId']
    # Parenthesised so the line parses under both Python 2 and Python 3.
    print("on "+ station['stationName'])
    # Station pages are requested by index, starting at 0, until the
    # response says there are no more.
    i = 0
    while True:
        songs = requests.get('http://pandorasongs.oliverzheng.com/station/'+stationID+'/'+str(i)).json()
        # Record this page's songs BEFORE looking at 'hasMore'; testing the
        # flag first would silently drop every station's final page.
        # NOTE(review): .get() is used because the shape of the final page
        # is not visible from this file -- confirm it carries a 'songs' key.
        for song in songs.get('songs', []):
            set_list.add(song['link'])
        if not songs['hasMore']:
            break
        i += 1

# Freeze the de-duplicated links into a list for the steps that follow.
set_list = list(set_list)
# Bare expression: shows the number of links only when run interactively.
len(set_list)
# get attributes of each song
# An attribute is matched as three tabs, a run of word/whitespace characters,
# then '<br>' and a newline. The raw string holds exactly the same pattern as
# the escaped original, without relying on invalid '\w' / '\s' string escapes,
# and it is compiled once instead of being looked up for every song.
_attribute_pattern = re.compile(r'\t\t\t[\w\s]*<br>\n')

# new_set_list: one {'name': link, 'attributes': [...]} dict per song that had
# at least one attribute; all_atributes: every distinct attribute seen.
new_set_list = []
all_atributes = set()
for song in set_list:
    # Parenthesised so the line parses under both Python 2 and Python 3.
    print(song)
    site_text = requests.get('http://pandora.com'+song).text
    # The fixed parts of a match (three tabs, '<br>', newline) take up 8
    # characters, so a length of at least 9 means some text sits between them.
    attributes = [
        r.strip().replace('<br>', '')
        for r in _attribute_pattern.findall(site_text)
        if len(r) >= 9
    ]
    # Songs whose page yielded no attributes are left out entirely.
    if attributes:
        new_set_list.append({'name': song, 'attributes': attributes})
        all_atributes.update(attributes)
# Build the song-by-attribute table and write it to disk.
# Turning the attribute set into a list fixes one column order that is used
# for both the header and every row.
all_atributes = list(all_atributes)
# One row per song: its link, then one boolean per attribute column that is
# True when the song lists that attribute.
rows = [
    [entry['name']] + [attr in entry['attributes'] for attr in all_atributes]
    for entry in new_set_list
]
header = ['name'] + all_atributes
df = pd.DataFrame(rows, columns=header)
# Save the table as CSV in the current working directory.
df.to_csv('songs.csv')