Duplicate_Files_Remover.py
import os
import hashlib
from concurrent.futures import ProcessPoolExecutor


def hash_file(filename, algorithm='sha256'):
    # Hash the file in fixed-size blocks so large files never have to
    # be loaded into memory all at once
    BLOCKSIZE = 65536
    hasher = hashlib.new(algorithm)
    with open(filename, 'rb') as file:
        buf = file.read(BLOCKSIZE)
        while buf:
            hasher.update(buf)
            buf = file.read(BLOCKSIZE)
    return hasher.hexdigest()


def scan_directory(path="."):
    # Using os.walk to go through all directories and subdirectories
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            full_path = os.path.join(dirpath, f)
            yield full_path


def main():
    hashMap = {}
    sizeMap = {}
    duplicates = set()
    # Collecting all files from the directory and its subdirectories
    files = list(scan_directory())
    # Filtering by file size first: only files that share a size can be
    # duplicates, so files with a unique size never need to be hashed
    for f in files:
        try:
            fsize = os.path.getsize(f)
        except OSError:
            # Skip files that vanished or cannot be read (e.g. broken symlinks)
            continue
        sizeMap.setdefault(fsize, []).append(f)
    # Using processes to parallelize the hashing, which helps since
    # computing digests is CPU-bound work
    with ProcessPoolExecutor() as executor:
        for size, files_with_same_size in sizeMap.items():
            if len(files_with_same_size) > 1:
                # Map each file in the size group to the hash_file function
                results = list(executor.map(hash_file, files_with_same_size))
                for i, file_hash in enumerate(results):
                    if file_hash in hashMap:
                        duplicates.add(files_with_same_size[i])
                    else:
                        hashMap[file_hash] = files_with_same_size[i]
    if duplicates:
        print('Duplicates Found:')
        for duplicate in duplicates:
            print(duplicate)
        # This just prints the duplicates. You can choose to delete or
        # move them as required.
    else:
        print('No duplicate files found')


if __name__ == "__main__":
    main()
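
The script only reports what it finds. As the comment in main() notes, you can delete or move the duplicates instead; the sketch below shows one conservative way to do that, moving them into a quarantine directory rather than deleting them outright. The helper name quarantine_duplicates and the directory name duplicates_quarantine are illustrative assumptions, not part of the original script; it reuses the script's existing os import.

import shutil

def quarantine_duplicates(duplicates, quarantine_dir='duplicates_quarantine'):
    # A sketch: move each duplicate into quarantine_dir instead of
    # deleting it, so a false positive can be recovered by moving the
    # file back
    os.makedirs(quarantine_dir, exist_ok=True)
    for path in duplicates:
        # Flatten the path into a unique name so files from different
        # directories do not collide inside the quarantine directory
        safe_name = path.replace(os.sep, '_')
        shutil.move(path, os.path.join(quarantine_dir, safe_name))

Calling quarantine_duplicates(duplicates) at the end of main() in place of the print loop would wire this in. Moving rather than calling os.remove() is a deliberately cautious choice: SHA-256 collisions are astronomically unlikely, but a review step before destroying data costs little.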