-
Notifications
You must be signed in to change notification settings - Fork 0
/
renamer.py
260 lines (195 loc) · 9.34 KB
/
renamer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import requests
import os
import urllib.parse
import urllib.request
import os.path
import json
import re
import pandas as pd
import gzip
import shutil
import argparse
def query_yn(question, default="yes"):
    '''
    Ask a yes/no question via input() and return the answer as a boolean.

    question: text shown to the user, passed as-is to input().
    default: answer used when the user only presses Enter ("yes", "no", ...),
             or None to force an explicit answer.

    Returns True for "yes"/"y"/"ye" and False for "no"/"n" (case-insensitive).
    The question is asked again until a recognised answer is given.
    '''
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    while True:
        # Surrounding whitespace is ignored so that "y " or " no" are accepted
        choice = input(question).strip().lower()
        if default is not None and choice == "":
            return valid[default]
        if choice in valid:
            return valid[choice]
        print("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
def get_langage(movie:str, apikey:str):
    '''
    Get the first language of a movie from OMDB, given its IMDB id and an API key.

    movie: IMDB title id, sent as the "i" query parameter.
    apikey: OMDB API key.

    Returns the first entry of the comma separated "Language" field of the
    answer. Returns an empty string (after printing a warning) when the
    request fails, the answer is not JSON, or it carries no "Language" field,
    so that a failed lookup never matches a language requested by the user.
    '''
    params = {
        "apikey": apikey,
        "i" : movie,
        "type": "movie",
    }
    try:
        # HTTPS keeps the API key out of clear-text traffic; the timeout keeps
        # a stalled connection from blocking the whole run
        resp = requests.get("https://www.omdbapi.com/", params=params, timeout=10).json()
    except (requests.RequestException, ValueError) as err:
        print(f"[-] OMDB lookup failed for {movie}: {err}")
        return ""
    language = resp.get("Language") if isinstance(resp, dict) else None
    if not language:
        # Error answers (bad key, unknown id, ...) have an "Error" field instead
        reason = resp.get("Error", "no language in answer") if isinstance(resp, dict) else "unexpected answer"
        print(f"[-] OMDB lookup failed for {movie}: {reason}")
        return ""
    # One or more languages separated by commas, getting the first one
    return language.split(",")[0].strip()
def parse(movie_json:dict, file:str, df:pd.DataFrame=None, langage:str=None, apikey:str=None):
    '''
    Take an IMDB suggestion response and parse it to obtain a list of movie names.

    movie_json: decoded suggestion answer; candidates are read from its "d" list.
    file: cleaned name that was searched, used to detect an exact first hit.
    df: optional local title database with "tconst" and "originalTitle"
        columns; ignored when None or empty.
    langage: language for which the original title is kept (needs df and apikey).
    apikey: OMDB API key forwarded to get_langage().

    Returns the list of candidate file names (without extension), formatted as
    "title (year)" when the entry has a year, or the bare title otherwise.
    Only entries whose "q" field is "feature" are kept. The list stops after
    the first entry when that entry already equals the searched name.
    Note that get_langage() is called once per candidate found in df.
    '''
    # Characters that must not end up in a file name
    forbidden = str.maketrans("", "", '<>:"/\\|?*')
    # A searched name ending with a bare year ("Title 2010") is rewritten as
    # "Title (2010)" so that it can be compared with the generated names
    match = re.search(r"[0-9]{4}$", file)
    if match:
        file = f"{file[:-4]}({match.group()})"
    # The original title lookup only makes sense with a database and a language
    use_original = df is not None and not df.empty and bool(langage)
    arr = []
    for idx, movie in enumerate(movie_json.get("d", [])):
        # Only feature movies are of interest
        if movie.get("q") != "feature":
            continue
        name = movie['l']
        # If the movie has a year of release
        if 'y' in movie:
            if use_original:
                # Line of the found title in the local database
                dfo = df.loc[df['tconst'] == movie['id']]
                if not dfo.empty:
                    orignal = dfo['originalTitle'].iloc[0]
                    # If the language is the one expected by the user
                    # (e.g. 'french'), then we keep the original title
                    movie_langage = get_langage(movie['id'], apikey)
                    if isinstance(orignal, str) and movie_langage.lower() == langage.lower():
                        name = orignal
            name = f"{name} ({movie['y']})"
        name = name.translate(forbidden)
        arr.append(name)
        # If the 1st result is already right, we don't look for other results
        if idx == 0 and name.lower() == file.lower():
            break
    return arr
def load_database():
    '''
    Load or download the IMDB "title basics" database, reduced to movies.

    The database is cached as data.csv next to this script. When that file
    exists it is simply read. Otherwise the user is asked for confirmation
    (the program exits on refusal), the IMDB dump is downloaded and extracted,
    only the rows whose titleType is 'movie' are kept (first four columns of
    the dump, minus titleType) and the result is saved as data.csv.

    Returns the database as a pandas DataFrame.
    '''
    base_dir = os.path.dirname(os.path.realpath(__file__))
    tsv_file = os.path.join(base_dir, 'data.tsv')
    csv_file = os.path.join(base_dir, 'data.csv')
    archive = os.path.join(base_dir, 'title.basics.tsv.gz')

    if os.path.isfile(csv_file):
        print("[*] Loading local database...")
        return pd.read_csv(csv_file)

    if not query_yn("No local database found, do you want to download it ? [Y/n]"):
        raise SystemExit(0)
    try:
        print("[*] Downloading https://datasets.imdbws.com/title.basics.tsv.gz")
        urllib.request.urlretrieve(
            "https://datasets.imdbws.com/title.basics.tsv.gz", archive)
        print("Extracting...")
        with gzip.open(archive, 'rb') as f_in, open(tsv_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        print("[*] Sorting...")
        # Filtered chunks are collected and concatenated once at the end:
        # concatenating inside the loop copies the whole table on every chunk
        kept = []
        for chunk in pd.read_table(tsv_file, sep='\t', usecols=[0, 1, 2, 3], chunksize=50000):
            chunk = chunk[chunk.titleType == 'movie']
            kept.append(chunk.drop(columns=['titleType']))
        csv_table = pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()
        print("[*] Saving database...")
        csv_table.to_csv(csv_file, index=False)
    finally:
        # The archive and the extracted dump are temporary files: remove them
        # even when a step above failed
        for leftover in (archive, tsv_file):
            if os.path.exists(leftover):
                os.remove(leftover)
    return csv_table
def _clean_title(stem:str):
    '''
    Turn a file name (without extension) into the text searched on IMDB.

    Bracketed tags, release tags (multi, vostfr, avi, mkv, www, hd, quality
    such as 1080p) and everything following them are removed, dots become
    spaces and the name is cut right after the first 4 digit date.
    Returns the cleaned name without surrounding spaces (possibly empty).
    '''
    # Remove useless strings
    title = re.sub(r'\[.*\]', '', stem)
    # A release tag is only cut when it does not start in the middle of a
    # word, otherwise e.g. "Gravity" would be truncated by the "avi" rule
    not_in_word = r'(?<![^\W\d_])'
    release_tags = (
        r'(M|m)(U|u)(L|l)(T|t)(I|i)',
        r'(V|v)(O|o)(S|s)(T|t)(F|f)(R|r)',
        r'avi',
        r'mkv',
        r'www',
        r'(H|h)(D|d)',
    )
    for tag in release_tags:
        title = re.sub(not_in_word + tag + r'.*', '', title)
    title = re.sub(r'[0-9]{3}([0-9])?p.*', '', title)
    title = title.replace('.', ' ')
    # Search for a date
    date = re.search(r'\(?[0-9]{4}\)?', title)
    if date:
        if title[date.end():date.end() + 1] == 'p':
            # A 'p' right after the date means it is a quality, not a date:
            # drop it together with the separator before it
            title = title[:max(date.start() - 1, 0)]
        else:
            # If there is a date, we remove everything after the date
            title = title[:date.end()]
    # Removed tags can leave spaces around the name
    return title.strip()


def rename_movies(path:str, all:bool=False, df:pd.DataFrame=None, langage:str=None, apikey:str=None):
    '''
    Loop through the files of a directory and interactively rename the movies.

    path: directory containing the movie files (sub directories are ignored).
    all: when True, files already named "movie_name (movie_year)" are
         processed as well.
    df, langage, apikey: forwarded to parse() for the original title lookup.

    For each file the name is cleaned, searched with the IMDB suggestion API
    and the candidates are printed; the user then types the index of the name
    to apply, or -1 (or any out of range number) to keep the file as is.
    A failed lookup or rename is reported and the loop goes on with the next
    file. An existing file is never overwritten.
    '''
    print("[*] Starting to loop throught movies")
    for f in os.listdir(path):
        source = os.path.join(path, f)
        if not os.path.isfile(source):
            continue
        # File name without extension
        ifile, extension = os.path.splitext(f)
        # Skip names already in the expected format, unless option --all is set
        if re.search(r'^.*\([0-9]{4}\)$', ifile) and not all:
            continue
        file = _clean_title(ifile)
        if not file:
            print(f"[-] Nothing left to search for after cleaning '{f}'")
            continue
        # Get the 1st letter of the movie name, to search into IMDB API
        fletter = file[0].lower()
        # Build the URL
        url = f"https://sg.media-imdb.com/suggests/{fletter}/{urllib.parse.quote_plus(file)}.json"
        try:
            req = requests.get(url, timeout=10).text
            # The answer is wrapped in a function call: keep what is between
            # the parentheses to get plain JSON
            req = req[req.find("({") + 1: -1]
            suggestions = json.loads(req)
        except (requests.RequestException, ValueError) as err:
            print(f"[-] IMDB lookup failed for '{file}': {err}")
            continue
        # Parse the json to get the movie name array
        arr = parse(suggestions, file, df, langage, apikey)
        # If the movie is in the array, then the name is already good
        if ifile in arr:
            print(f"[+] {file} - Already right")
            continue
        if not arr:
            print("[-] No results found for : " + file)
            continue
        print(f"[+] Results found for '{f}' (searched as '{file}'):")
        print(f"[+] Enter movie ID or -1 to not update")
        # Print the list of movies found
        for idx, title in enumerate(arr):
            print(str(idx) + " : " + title)
        # Ask for the choice of the user (-1 if no rename)
        inp = input()
        try:
            response = int(inp)
        except ValueError:
            print("[-] Failed to rename the file")
            continue
        if not 0 <= response < len(arr):
            continue
        target = os.path.join(path, arr[response] + extension)
        try:
            # os.rename can silently replace an existing file; a change of
            # letter case only (same file) is still allowed
            if os.path.exists(target) and not os.path.samefile(source, target):
                print(f"[-] Failed to rename the file: '{target}' already exists")
                continue
            os.rename(source, target)
        except OSError:
            print("[-] Failed to rename the file")
def def_args():
    '''
    Declare the command line interface and return the parsed arguments.

    The result has four attributes: dir (positional, directory to process),
    langage and key (optional values, False when not given) and all
    (flag, False when not given).
    '''
    cli = argparse.ArgumentParser(
        description='Loop throught movie directory and call IMDB API to find the exact title')
    cli.add_argument("dir", metavar='dir', help="define the directory to search into")
    # Options carrying a value; both are False when absent from the command line
    valued_options = (
        ("-l", "--langage",
         "define the language for which original title is kept, requires downloading local database and OMDB API key"),
        ("-k", "--key",
         "OMDB API key (get one at https://www.omdbapi.com"),
    )
    for short_flag, long_flag, help_text in valued_options:
        cli.add_argument(short_flag, long_flag, default=False, help=help_text)
    # Simple switch: store_true already defaults to False
    cli.add_argument(
        "-a", "--all", action="store_true",
        help="iterate through every movie, even those already respecting the expected format: movie_name (movie_year)")
    return cli.parse_args()
def main():
    '''
    Command line entry point.

    Parses the arguments, checks that an OMDB key comes with --langage and
    that the target is a directory, loads the local database when a language
    was requested, then runs the interactive renaming. Exits with status 1 on
    invalid arguments.
    '''
    args = def_args()
    if args.langage and not args.key:
        print(f"[-] OMDB API key required with --langage (-l), use --key (-k)")
        raise SystemExit(1)
    movie_path = args.dir
    # The path must be a directory: a plain file would pass a mere existence
    # check and only fail later when its content is listed
    if not os.path.isdir(movie_path):
        print(f"[-] Path: {movie_path} not found !")
        raise SystemExit(1)
    df = None
    # The local database is only needed for the original title lookup
    if args.langage:
        df = load_database()
    rename_movies(movie_path, args.all, df, args.langage, args.key)
    # Keep the console open until the user acknowledges
    input("[+] Execution finished (press enter to continue)")


if __name__ == "__main__":
    main()