scrape_mars.py
import pandas as pd
from splinter import Browser
from time import sleep
from bs4 import BeautifulSoup
import pymongo
news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
jpl_base = 'https://www.jpl.nasa.gov'
hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
facts_url = 'https://space-facts.com/mars/'
# Launch a Splinter-driven Chrome browser (chromedriver path is machine-specific)
def init_browser():
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
def scrape():
    mars_data = {}
    browser = init_browser()

    # --- NASA Mars News: latest headline and teaser paragraph ---
    browser.visit(news_url)
    html = browser.html
    news_data = BeautifulSoup(html, "html.parser")
    # Grab the headline
    news_headline = news_data.find_all('div', class_='content_title')[1].text
    # Grab the teaser body
    news_desc = news_data.find_all('div', class_='article_teaser_body')[0].text
    mars_data.update({
        'news_headline': news_headline,
        'news_description': news_desc
    })

    # --- JPL Featured Space Image ---
    sleep(1)
    browser.visit(jpl_url)
    html = browser.html
    JPL_image = BeautifulSoup(html, "html.parser")
    # The image URL is embedded in the inline style of the carousel item
    featured_image = JPL_image.find(class_='carousel_item')['style']
    image_urlend = featured_image.split("'")[1]
    image_url = jpl_base + image_urlend
    mars_data.update({
        "featured_img": image_url
    })

    # --- Mars Facts table ---
    sleep(1)
    Facts_Tables = pd.read_html(facts_url)
    tables_df = Facts_Tables[0]
    tables_df.columns = ['Item', 'Values']
    tables_df.set_index('Item', inplace=True)
    html_table = tables_df.to_html(
        classes='table table-striped table-hover')
    mars_data.update({
        "html_table": html_table
    })

    # --- Mars hemisphere images ---
    sleep(1)
    browser.visit(hemi_url)
    html = browser.html
    hemi_image = BeautifulSoup(html, "html.parser")
    links = hemi_image.find_all('div', class_='item')
    hemi_photos_urls = []
    link_base = "https://astrogeology.usgs.gov"
    for x in links:
        # Follow each hemisphere's detail page to find the full-resolution image
        img_link = x.find("div", class_="description").a["href"]
        title = x.find('h3').text
        hemilink = link_base + img_link
        browser.visit(hemilink)
        hemi_html = browser.html
        hemi_soup = BeautifulSoup(hemi_html, 'html.parser')
        img_url = hemi_soup.find("img", class_="wide-image")["src"]
        hemi_photos_urls.append(
            {'title': title, 'url': link_base + img_url})
    mars_data.update({
        "hemishere_urls": hemi_photos_urls
    })

    browser.quit()
    print(mars_data)
    return mars_data
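

# --- Hedged usage sketch (not in the original file) ---
# The pymongo import above suggests the scraped dictionary is persisted to MongoDB,
# typically by a companion Flask app. The connection string, database name
# ("mars_app"), and collection name ("mars_data") below are assumptions for
# illustration only; a real project may use different names.
if __name__ == "__main__":
    scraped = scrape()
    client = pymongo.MongoClient("mongodb://localhost:27017")  # assumes a local MongoDB instance
    db = client.mars_app                                       # hypothetical database name
    # Upsert so repeated runs replace the previous scrape instead of piling up documents
    db.mars_data.update_one({}, {"$set": scraped}, upsert=True)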