forked from NITDgpOS/UIP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
52 lines (44 loc) · 1.69 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from bs4 import BeautifulSoup
from urllib import request
from urllib.request import urlopen
from urllib.request import urlretrieve
import os
import sys
from constants import PICS_FOLDER,NUMBER_OF_IMAGES_TO_PARSE,CURR_DIR, PICS_FOLDER
def make_soup(url):
    """Fetch *url* and parse its HTML into a BeautifulSoup tree.

    A browser-like User-Agent header is sent because the default
    urllib agent is commonly rejected by sites such as reddit.

    :param url: page URL to fetch
    :return: BeautifulSoup object built with the "html.parser" backend
    :raises urllib.error.URLError: on network/HTTP failure
    """
    req = request.Request(
        url=url,
        headers={'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
    # Use a context manager so the HTTP response/socket is closed
    # promptly instead of leaking until garbage collection.
    with request.urlopen(req) as response:
        html = response.read()
    return BeautifulSoup(html, "html.parser")
def dlProgress(count, blockSize, totalSize):
    """Render an in-place textual progress indicator on stdout.

    Intended as the ``reporthook`` callback of ``urlretrieve``:
    *count* blocks of *blockSize* bytes out of *totalSize* have
    arrived so far.
    """
    done = int(count * blockSize * 100 / totalSize)
    # Carriage return rewrites the same terminal line each call.
    sys.stdout.write(f"\r...{done}%")
    sys.stdout.flush()
def get_images(url):
    """Scrape wallpaper images from a /r/wallpapers listing page.

    Downloads up to NUMBER_OF_IMAGES_TO_PARSE full-sized images into
    CURR_DIR/PICS_FOLDER, skipping files that already exist there.
    Prints a message and returns early when the page has no thumbnails.

    :param url: listing-page URL to scrape
    """
    soup = make_soup(url)
    # Thumbnail anchors on /r/wallpapers carry an href pointing at the
    # original full-sized image.
    thumbnails = soup.find_all("a", class_="thumbnail", href=True)
    if not thumbnails:
        print('No matching image found')
        return
    image_links = []
    for link in thumbnails:
        if link['href'].endswith(('jpg', 'png', 'jpeg')):
            image_links.append(link['href'])
            if len(image_links) == NUMBER_OF_IMAGES_TO_PARSE:
                break
    # Create the destination folder once, not once per image.
    path = os.path.join(CURR_DIR, PICS_FOLDER)
    os.makedirs(path, exist_ok=True)
    for image in image_links:
        filename = image.split('/')[-1]
        target = os.path.join(path, filename)
        if os.path.exists(target):
            continue  # already downloaded on a previous run
        try:
            urlretrieve(image, target, reporthook=dlProgress)
        except OSError as err:
            # URLError/HTTPError subclass OSError; report the failure
            # instead of silently swallowing it, then keep going so one
            # bad link does not abort the remaining downloads.
            print("\nFailed to download %s: %s" % (image, err))