-
Notifications
You must be signed in to change notification settings - Fork 1
/
Task8.py
90 lines (79 loc) · 2.93 KB
/
Task8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
## This task stores the data of the movie whose link you give in the IDs folder (if it does not exist there already).
import requests, os.path, json
from os import path
from bs4 import BeautifulSoup
def scrape_movie_details(user):
    """Scrape the details of one movie from its web page.

    Args:
        user: URL of the movie page to download.

    Returns:
        A dict with the keys ``'Name'``, ``'Director'``, ``'Poster URL'`` and
        ``'Bio'``, plus ``'Country'``, ``'Language'``, ``'Runtime'`` and
        ``'Genres'`` for whichever of those the page lists.

    Raises:
        requests.RequestException: if the page cannot be downloaded within
            the timeout or the server answers with an HTTP error status.
        AttributeError: if one of the mandatory page sections (heading,
            credit block, details, poster, summary, storyline) is missing,
            because the lookup for it then yields ``None``.
    """
    # A request without a timeout can block forever on a stalled connection.
    response = requests.get(user, timeout=30)
    # Fail loudly instead of parsing an error page as if it were a movie page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    main_dic = {}

    # Movie name: the text of the first <h1>, cut at its first space.
    # NOTE(review): a title containing spaces is reduced to its first word
    # here; if the page separates the title from the year with a non-breaking
    # space, that character was presumably the intended delimiter -- confirm
    # against a live page before changing it.
    heading_text = soup.find('h1').text
    main_dic['Name'] = heading_text.partition(" ")[0]

    # Director(s): the text of every link in the first credit summary block.
    credit_block = soup.find('div', class_='credit_summary_item')
    main_dic['Director'] = [link.text for link in credit_block.find_all('a')]

    # Country, language(s) and runtime come from the labelled "txt-block"
    # entries of the details section; each entry is identified by its <h4>.
    details = soup.find('div', attrs={'class': 'article', 'id': 'titleDetails'})
    for block in details.find_all('div', class_="txt-block"):
        heading = block.find('h4')
        if heading is None:
            continue  # entry without a label: nothing to match against
        label = heading.text
        if label == 'Country:':
            main_dic['Country'] = block.find('a').text
        elif label == 'Language:':
            main_dic["Language"] = [link.text for link in block.find_all('a')]
        elif label == 'Runtime:':
            main_dic['Runtime'] = block.find('time').text

    # Poster: the image source inside the first link of the poster block.
    poster = soup.find('div', class_='poster')
    main_dic['Poster URL'] = poster.find('a').img['src']

    # Bio: the summary text with surrounding whitespace removed.
    main_dic['Bio'] = soup.find('div', class_="summary_text").text.strip()

    # Genres: the links of the storyline entry labelled "Genres:".
    storyline = soup.find('div', attrs={'class': 'article', 'id': 'titleStoryLine'})
    for block in storyline.find_all('div', class_='see-more inline canwrap'):
        heading = block.find('h4', class_='inline')
        # Skip entries without a heading instead of crashing on None.
        if heading is not None and heading.text == 'Genres:':
            main_dic['Genres'] = [link.text for link in block.find_all('a')]

    return main_dic
# Folder holding one cached "<movie id>.json" file per scraped movie.
IDS_DIR = "/home/yogi/Documents/IMDB-Movie-Scraper/IDs"


def movie_id_from_url(url):
    """Return the movie ID that follows ``title/`` in *url*.

    The ID ends at the next ``/``, ``?`` or ``#`` (or at the end of the
    string), so a query string or fragment never leaks into the ID.

    Args:
        url: Movie page URL, e.g. ``https://www.imdb.com/title/tt0111161/``.

    Returns:
        The ID segment, e.g. ``"tt0111161"``.

    Raises:
        ValueError: if *url* contains no ``title/<id>`` part.
    """
    _, marker, tail = url.partition("title/")
    for stop in "/?#":
        tail = tail.partition(stop)[0]
    if not marker or not tail:
        raise ValueError("no movie ID found in URL: {!r}".format(url))
    return tail


def main():
    """Ask for a movie URL; print its cached details or scrape and cache them.

    When ``<IDS_DIR>/<movie id>.json`` already exists its content is loaded
    and printed. Otherwise the page is scraped with ``scrape_movie_details``
    and the result is written to that file as indented, key-sorted JSON.
    """
    url = input("Enter the url of the movie: ")
    cache_file = os.path.join(IDS_DIR, movie_id_from_url(url) + ".json")

    if os.path.exists(cache_file):
        with open(cache_file, encoding="utf-8") as cached:
            print(json.load(cached))
        return

    details = scrape_movie_details(url)
    # Make sure the cache folder exists so the scraped result is not lost.
    os.makedirs(IDS_DIR, exist_ok=True)
    with open(cache_file, "w", encoding="utf-8") as out:
        json.dump(details, out, indent=4, sort_keys=True)


if __name__ == "__main__":
    main()